From c4cc0adb2426642b46cb56d2ec679e6047916a76 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 11 Jul 2014 03:38:49 +0000 Subject: [PATCH] Import pandas_0.14.1.orig.tar.gz [dgit import orig pandas_0.14.1.orig.tar.gz] --- .coveragerc | 26 + .gitattributes | 15 + .gitignore | 90 + .travis.yml | 111 + CONTRIBUTING.md | 98 + LICENSE | 87 + LICENSES/MSGPACK_LICENSE | 13 + LICENSES/MSGPACK_NUMPY_LICENSE | 33 + LICENSES/NUMPY_LICENSE | 30 + LICENSES/OTHER | 80 + LICENSES/PSF_LICENSE | 279 + LICENSES/SCIPY_LICENSE | 31 + LICENSES/SIX | 21 + LICENSES/ULTRAJSON_LICENSE | 34 + MANIFEST.in | 24 + Makefile | 25 + README.md | 224 + RELEASE.md | 6 + bench/alignment.py | 22 + bench/bench_dense_to_sparse.py | 14 + bench/bench_get_put_value.py | 56 + bench/bench_groupby.py | 65 + bench/bench_join_panel.py | 85 + bench/bench_khash_dict.py | 89 + bench/bench_merge.R | 161 + bench/bench_merge.py | 105 + bench/bench_merge_sqlite.py | 87 + bench/bench_pivot.R | 27 + bench/bench_pivot.py | 16 + bench/bench_sparse.py | 93 + bench/bench_take_indexing.py | 55 + bench/bench_unique.py | 278 + bench/bench_with_subset.R | 53 + bench/bench_with_subset.py | 116 + bench/better_unique.py | 80 + bench/duplicated.R | 22 + bench/io_roundtrip.py | 116 + bench/larry.py | 0 bench/serialize.py | 89 + bench/test.py | 70 + bench/zoo_bench.R | 71 + bench/zoo_bench.py | 36 + ci/README.txt | 17 + ci/after_script.sh | 26 + ci/before_install.sh | 13 + ci/build_docs.sh | 49 + ci/cron/go_doc.sh | 99 + ci/install.sh | 150 + ci/ironcache/get.py | 41 + ci/ironcache/put.py | 48 + ci/prep_ccache.sh | 47 + ci/print_skipped.py | 51 + ci/print_versions.py | 28 + ci/requirements-2.6.txt | 16 + ci/requirements-2.7.txt | 25 + ci/requirements-2.7_LOCALE.txt | 18 + ci/requirements-2.7_NUMPY_DEV_1_8_x.txt | 3 + ci/requirements-2.7_NUMPY_DEV_master.txt | 3 + ci/requirements-3.2.txt | 14 + ci/requirements-3.3.txt | 17 + ci/requirements-3.4.txt | 19 + ci/script.sh | 27 + ci/speedpack/Vagrantfile | 22 + ci/speedpack/build.sh | 117 + ci/speedpack/nginx/nginx.conf.template | 48 + ci/submit_ccache.sh | 29 + doc/README.rst | 170 + doc/_templates/autosummary/class.rst | 33 + doc/data/baseball.csv | 101 + doc/data/fx_prices | Bin 0 -> 16177 bytes doc/data/iris.data | 151 + doc/data/mindex_ex.csv | 16 + doc/data/test.xls | Bin 0 -> 30720 bytes doc/data/tips.csv | 245 + doc/make.py | 374 + doc/plots/stats/moment_plots.py | 30 + doc/plots/stats/moments_ewma.py | 15 + doc/plots/stats/moments_ewmvol.py | 23 + doc/plots/stats/moments_expw.py | 35 + doc/plots/stats/moments_rolling.py | 24 + doc/plots/stats/moments_rolling_binary.py | 30 + doc/source/10min.rst | 753 + doc/source/_static/banklist.html | 4885 ++++++ doc/source/_static/df_repr_truncated.png | Bin 0 -> 8040 bytes doc/source/_static/eval-perf-small.png | Bin 0 -> 25314 bytes doc/source/_static/eval-perf.png | Bin 0 -> 18603 bytes doc/source/_static/legacy_0.10.h5 | Bin 0 -> 238321 bytes doc/source/_static/query-perf-small.png | Bin 0 -> 21731 bytes doc/source/_static/query-perf.png | Bin 0 -> 20351 bytes doc/source/_static/stub | 0 doc/source/_static/trunc_after.png | Bin 0 -> 29195 bytes doc/source/_static/trunc_before.png | Bin 0 -> 50913 bytes doc/source/api.rst | 1476 ++ doc/source/basics.rst | 1635 ++ doc/source/comparison_with_r.rst | 479 + doc/source/comparison_with_sql.rst | 380 + doc/source/computation.rst | 555 + doc/source/conf.py | 309 + doc/source/contributing.rst | 16 + doc/source/cookbook.rst | 690 + doc/source/dsintro.rst | 964 ++ doc/source/ecosystem.rst | 91 + 
doc/source/enhancingperf.rst | 670 + doc/source/faq.rst | 301 + doc/source/gotchas.rst | 581 + doc/source/groupby.rst | 997 ++ doc/source/index.rst.template | 151 + doc/source/indexing.rst | 2301 +++ doc/source/install.rst | 239 + doc/source/io.rst | 3662 ++++ doc/source/merging.rst | 713 + doc/source/missing_data.rst | 671 + doc/source/options.rst | 411 + doc/source/overview.rst | 121 + doc/source/r_interface.rst | 110 + doc/source/release.rst | 4245 +++++ doc/source/remote_data.rst | 256 + doc/source/reshaping.rst | 458 + doc/source/rplot.rst | 179 + doc/source/sparse.rst | 137 + .../themes/nature_with_gtoc/layout.html | 69 + .../nature_with_gtoc/static/nature.css_t | 310 + doc/source/themes/nature_with_gtoc/theme.conf | 4 + doc/source/timeseries.rst | 1611 ++ doc/source/tutorials.rst | 126 + doc/source/v0.10.0.txt | 356 + doc/source/v0.10.1.txt | 211 + doc/source/v0.11.0.txt | 332 + doc/source/v0.12.0.txt | 494 + doc/source/v0.13.0.txt | 983 ++ doc/source/v0.13.1.txt | 286 + doc/source/v0.14.0.txt | 1042 ++ doc/source/v0.14.1.txt | 271 + doc/source/v0.15.0.txt | 225 + doc/source/v0.4.x.txt | 63 + doc/source/v0.5.0.txt | 43 + doc/source/v0.6.0.txt | 56 + doc/source/v0.6.1.txt | 50 + doc/source/v0.7.0.txt | 272 + doc/source/v0.7.1.txt | 30 + doc/source/v0.7.2.txt | 27 + doc/source/v0.7.3.txt | 96 + doc/source/v0.8.0.txt | 274 + doc/source/v0.8.1.txt | 36 + doc/source/v0.9.0.txt | 97 + doc/source/v0.9.1.txt | 145 + doc/source/visualization.rst | 1146 ++ doc/source/whatsnew.rst | 58 + doc/sphinxext/README.rst | 17 + doc/sphinxext/ipython_sphinxext/__init__.py | 0 .../ipython_console_highlighting.py | 116 + .../ipython_sphinxext/ipython_directive.py | 1085 ++ doc/sphinxext/numpydoc/LICENSE.txt | 94 + doc/sphinxext/numpydoc/README.rst | 51 + doc/sphinxext/numpydoc/__init__.py | 3 + doc/sphinxext/numpydoc/comment_eater.py | 169 + doc/sphinxext/numpydoc/compiler_unparse.py | 865 + doc/sphinxext/numpydoc/docscrape.py | 527 + doc/sphinxext/numpydoc/docscrape_sphinx.py | 274 + doc/sphinxext/numpydoc/linkcode.py | 83 + doc/sphinxext/numpydoc/numpydoc.py | 187 + doc/sphinxext/numpydoc/phantom_import.py | 167 + doc/sphinxext/numpydoc/plot_directive.py | 642 + .../numpydoc/tests/test_docscrape.py | 767 + doc/sphinxext/numpydoc/tests/test_linkcode.py | 5 + .../numpydoc/tests/test_phantom_import.py | 5 + .../numpydoc/tests/test_plot_directive.py | 5 + .../numpydoc/tests/test_traitsdoc.py | 5 + doc/sphinxext/numpydoc/traitsdoc.py | 142 + examples/data/SOURCES | 0 examples/finance.py | 86 + examples/regressions.py | 51 + ez_setup.py | 264 + fake_pyrex/Pyrex/Distutils/__init__.py | 1 + fake_pyrex/Pyrex/Distutils/build_ext.py | 1 + fake_pyrex/Pyrex/__init__.py | 1 + pandas/__init__.py | 55 + pandas/algos.pyx | 2276 +++ pandas/compat/__init__.py | 754 + pandas/compat/chainmap.py | 26 + pandas/compat/chainmap_impl.py | 136 + pandas/compat/openpyxl_compat.py | 24 + pandas/compat/pickle_compat.py | 113 + pandas/computation/__init__.py | 0 pandas/computation/align.py | 183 + pandas/computation/api.py | 2 + pandas/computation/common.py | 24 + pandas/computation/engines.py | 147 + pandas/computation/eval.py | 242 + pandas/computation/expr.py | 662 + pandas/computation/expressions.py | 258 + pandas/computation/ops.py | 493 + pandas/computation/pytables.py | 604 + pandas/computation/scope.py | 297 + pandas/computation/tests/__init__.py | 0 pandas/computation/tests/test_eval.py | 1631 ++ pandas/core/__init__.py | 0 pandas/core/algorithms.py | 522 + pandas/core/api.py | 35 + pandas/core/array.py | 37 + 
pandas/core/base.py | 494 + pandas/core/categorical.py | 226 + pandas/core/common.py | 2931 ++++ pandas/core/config.py | 805 + pandas/core/config_init.py | 343 + pandas/core/datetools.py | 63 + pandas/core/format.py | 2298 +++ pandas/core/frame.py | 4985 ++++++ pandas/core/generic.py | 3949 +++++ pandas/core/groupby.py | 3566 ++++ pandas/core/index.py | 4093 +++++ pandas/core/indexing.py | 1706 ++ pandas/core/internals.py | 4090 +++++ pandas/core/matrix.py | 1 + pandas/core/nanops.py | 718 + pandas/core/ops.py | 985 ++ pandas/core/panel.py | 1451 ++ pandas/core/panel4d.py | 41 + pandas/core/panelnd.py | 109 + pandas/core/reshape.py | 1113 ++ pandas/core/series.py | 2531 +++ pandas/core/sparse.py | 10 + pandas/core/strings.py | 1039 ++ pandas/hashtable.pxd | 24 + pandas/hashtable.pyx | 1064 ++ pandas/index.pyx | 616 + pandas/info.py | 20 + pandas/io/__init__.py | 0 pandas/io/api.py | 15 + pandas/io/auth.py | 123 + pandas/io/clipboard.py | 97 + pandas/io/common.py | 167 + pandas/io/data.py | 1203 ++ pandas/io/date_converters.py | 62 + pandas/io/excel.py | 864 + pandas/io/ga.py | 456 + pandas/io/gbq.py | 435 + pandas/io/html.py | 851 + pandas/io/json.py | 756 + pandas/io/packers.py | 642 + pandas/io/parsers.py | 2320 +++ pandas/io/pickle.py | 65 + pandas/io/pytables.py | 4375 +++++ pandas/io/sql.py | 1244 ++ pandas/io/stata.py | 1378 ++ pandas/io/tests/__init__.py | 4 + pandas/io/tests/data/banklist.csv | 507 + pandas/io/tests/data/banklist.html | 4885 ++++++ pandas/io/tests/data/computer_sales_page.html | 619 + pandas/io/tests/data/gbq_fake_job.txt | 1 + .../data/html_encoding/chinese_utf16.html | Bin 0 -> 824 bytes .../data/html_encoding/chinese_utf32.html | Bin 0 -> 1648 bytes .../data/html_encoding/chinese_utf8.html | 26 + .../tests/data/html_encoding/letz_latin1.html | 26 + pandas/io/tests/data/iris.csv | 151 + pandas/io/tests/data/legacy_hdf/legacy.h5 | Bin 0 -> 14928 bytes .../io/tests/data/legacy_hdf/legacy_0.10.h5 | Bin 0 -> 238321 bytes .../io/tests/data/legacy_hdf/legacy_table.h5 | Bin 0 -> 211111 bytes .../data/legacy_hdf/legacy_table_0.11.h5 | Bin 0 -> 293877 bytes .../tests/data/legacy_hdf/pytables_native.h5 | Bin 0 -> 74246 bytes .../tests/data/legacy_hdf/pytables_native2.h5 | Bin 0 -> 12336 bytes .../0.10.1/AMD64_windows_2.7.3.pickle | Bin 0 -> 4381 bytes .../0.10.1/x86_64_linux_2.7.3.pickle | Bin 0 -> 4338 bytes .../0.11.0/0.11.0_x86_64_linux_3.3.0.pickle | Bin 0 -> 8978 bytes .../0.11.0/x86_64_linux_2.7.3.pickle | Bin 0 -> 4338 bytes .../0.11.0/x86_64_linux_3.3.0.pickle | Bin 0 -> 5822 bytes .../0.12.0/0.12.0_AMD64_windows_2.7.3.pickle | Bin 0 -> 8692 bytes .../0.12.0/0.12.0_x86_64_linux_2.7.3.pickle | Bin 0 -> 8768 bytes .../0.13.0/0.13.0_AMD64_windows_2.7.3.pickle | Bin 0 -> 7208 bytes .../0.13.0/0.13.0_i686_linux_2.6.5.pickle | Bin 0 -> 7143 bytes .../0.13.0/0.13.0_i686_linux_2.7.3.pickle | Bin 0 -> 7123 bytes .../0.13.0/0.13.0_i686_linux_3.2.3.pickle | Bin 0 -> 10019 bytes .../0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle | Bin 0 -> 7278 bytes .../0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle | Bin 0 -> 7445 bytes .../0.13.0/0.13.0_x86_64_linux_2.7.3.pickle | Bin 0 -> 7278 bytes .../0.13.0/0.13.0_x86_64_linux_3.3.0.pickle | Bin 0 -> 10049 bytes .../0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle | Bin 0 -> 8159 bytes pandas/io/tests/data/macau.html | 3691 ++++ pandas/io/tests/data/nyse_wsj.html | 1207 ++ pandas/io/tests/data/salary.table | 47 + pandas/io/tests/data/spam.html | 797 + pandas/io/tests/data/stata1_114.dta | Bin 0 -> 1130 bytes pandas/io/tests/data/stata1_117.dta | Bin 0 -> 
1569 bytes pandas/io/tests/data/stata1_encoding.dta | Bin 0 -> 3507 bytes pandas/io/tests/data/stata2_113.dta | Bin 0 -> 1490 bytes pandas/io/tests/data/stata2_114.dta | Bin 0 -> 1786 bytes pandas/io/tests/data/stata2_115.dta | Bin 0 -> 1786 bytes ...for testing alternative Stata file formats | Bin 0 -> 1786 bytes pandas/io/tests/data/stata2_117.dta | Bin 0 -> 2228 bytes pandas/io/tests/data/stata3.csv | 204 + pandas/io/tests/data/stata3_113.dta | Bin 0 -> 12737 bytes pandas/io/tests/data/stata3_114.dta | Bin 0 -> 13255 bytes pandas/io/tests/data/stata3_115.dta | Bin 0 -> 13255 bytes ...for testing alternative Stata file formats | Bin 0 -> 13255 bytes pandas/io/tests/data/stata3_117.dta | Bin 0 -> 13703 bytes pandas/io/tests/data/stata4_113.dta | Bin 0 -> 1528 bytes pandas/io/tests/data/stata4_114.dta | Bin 0 -> 1713 bytes pandas/io/tests/data/stata4_115.dta | Bin 0 -> 1713 bytes ...for testing alternative Stata file formats | Bin 0 -> 1713 bytes pandas/io/tests/data/stata4_117.dta | Bin 0 -> 2185 bytes pandas/io/tests/data/stata5.csv | 19 + pandas/io/tests/data/stata5_113.dta | Bin 0 -> 4628 bytes pandas/io/tests/data/stata5_114.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata5_115.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata5_117.dta | Bin 0 -> 5366 bytes pandas/io/tests/data/stata6.csv | 6 + pandas/io/tests/data/stata6_113.dta | Bin 0 -> 2752 bytes pandas/io/tests/data/stata6_114.dta | Bin 0 -> 3048 bytes pandas/io/tests/data/stata6_115.dta | Bin 0 -> 3048 bytes pandas/io/tests/data/stata6_117.dta | Bin 0 -> 3490 bytes pandas/io/tests/data/test.xls | Bin 0 -> 30720 bytes pandas/io/tests/data/test.xlsm | Bin 0 -> 45056 bytes pandas/io/tests/data/test.xlsx | Bin 0 -> 44929 bytes pandas/io/tests/data/test1.csv | 8 + pandas/io/tests/data/test2.csv | 6 + pandas/io/tests/data/test2.xls | Bin 0 -> 5632 bytes pandas/io/tests/data/test2.xlsx | Bin 0 -> 28216 bytes pandas/io/tests/data/test3.xls | Bin 0 -> 23040 bytes pandas/io/tests/data/test_types.xls | Bin 0 -> 26112 bytes pandas/io/tests/data/test_types.xlsx | Bin 0 -> 33769 bytes pandas/io/tests/data/times_1900.xls | Bin 0 -> 16384 bytes pandas/io/tests/data/times_1904.xls | Bin 0 -> 16384 bytes pandas/io/tests/data/tips.csv | 245 + pandas/io/tests/data/unicode_series.csv | 18 + pandas/io/tests/data/utf16_ex.txt | Bin 0 -> 11406 bytes pandas/io/tests/data/valid_markup.html | 62 + pandas/io/tests/data/yahoo_options1.html | 329 + pandas/io/tests/data/yahoo_options2.html | 329 + pandas/io/tests/generate_legacy_pickles.py | 154 + pandas/io/tests/test_clipboard.py | 103 + pandas/io/tests/test_cparser.py | 346 + pandas/io/tests/test_data.py | 512 + pandas/io/tests/test_date_converters.py | 126 + pandas/io/tests/test_excel.py | 1311 ++ pandas/io/tests/test_ga.py | 185 + pandas/io/tests/test_gbq.py | 290 + pandas/io/tests/test_html.py | 723 + pandas/io/tests/test_json/__init__.py | 0 .../test_json/data/tsframe_iso_v012.json | 1 + .../io/tests/test_json/data/tsframe_v012.json | 1 + pandas/io/tests/test_json/test_pandas.py | 631 + pandas/io/tests/test_json/test_ujson.py | 1536 ++ pandas/io/tests/test_json_norm.py | 207 + pandas/io/tests/test_packers.py | 452 + pandas/io/tests/test_parsers.py | 3569 ++++ pandas/io/tests/test_pickle.py | 101 + pandas/io/tests/test_pytables.py | 4364 +++++ pandas/io/tests/test_sql.py | 1911 +++ pandas/io/tests/test_stata.py | 531 + pandas/io/tests/test_wb.py | 55 + pandas/io/wb.py | 192 + pandas/lib.pyx | 1690 ++ pandas/msgpack.pyx | 669 + pandas/parser.pyx | 1953 +++ pandas/rpy/__init__.py | 4 + 
pandas/rpy/base.py | 12 + pandas/rpy/common.py | 357 + pandas/rpy/mass.py | 2 + pandas/rpy/tests/__init__.py | 0 pandas/rpy/tests/test_common.py | 213 + pandas/rpy/vars.py | 20 + pandas/sandbox/__init__.py | 0 pandas/sandbox/qtpandas.py | 135 + pandas/sparse/__init__.py | 0 pandas/sparse/api.py | 7 + pandas/sparse/array.py | 531 + pandas/sparse/frame.py | 832 + pandas/sparse/list.py | 142 + pandas/sparse/panel.py | 555 + pandas/sparse/series.py | 660 + pandas/sparse/tests/__init__.py | 0 pandas/sparse/tests/test_array.py | 182 + pandas/sparse/tests/test_libsparse.py | 398 + pandas/sparse/tests/test_list.py | 107 + pandas/sparse/tests/test_sparse.py | 1778 ++ pandas/src/datetime.pxd | 193 + pandas/src/datetime/np_datetime.c | 1018 ++ pandas/src/datetime/np_datetime.h | 119 + pandas/src/datetime/np_datetime_strings.c | 1463 ++ pandas/src/datetime/np_datetime_strings.h | 86 + pandas/src/datetime_helper.h | 6 + pandas/src/generate_code.py | 2395 +++ pandas/src/generated.pyx | 8756 ++++++++++ pandas/src/headers/math.h | 11 + pandas/src/headers/ms_inttypes.h | 305 + pandas/src/headers/ms_stdint.h | 247 + pandas/src/headers/portable.h | 8 + pandas/src/headers/stdint.h | 10 + pandas/src/helper.h | 16 + pandas/src/inference.pyx | 1124 ++ pandas/src/join.pyx | 241 + pandas/src/khash.pxd | 124 + pandas/src/klib/khash.h | 578 + pandas/src/klib/khash_python.h | 49 + pandas/src/klib/ktypes.h | 6 + pandas/src/klib/kvec.h | 151 + pandas/src/msgpack/pack.h | 108 + pandas/src/msgpack/pack_template.h | 771 + pandas/src/msgpack/sysdep.h | 195 + pandas/src/msgpack/unpack.h | 235 + pandas/src/msgpack/unpack_define.h | 93 + pandas/src/msgpack/unpack_template.h | 492 + pandas/src/numpy.pxd | 984 ++ pandas/src/numpy_helper.h | 185 + pandas/src/offsets.pyx | 367 + pandas/src/parse_helper.h | 246 + pandas/src/parser/.gitignore | 2 + pandas/src/parser/Makefile | 13 + pandas/src/parser/io.c | 281 + pandas/src/parser/io.h | 85 + pandas/src/parser/tokenizer.c | 2232 +++ pandas/src/parser/tokenizer.h | 266 + pandas/src/period.c | 1441 ++ pandas/src/period.h | 169 + pandas/src/properties.pyx | 65 + pandas/src/reduce.pyx | 596 + pandas/src/skiplist.h | 281 + pandas/src/skiplist.pxd | 21 + pandas/src/skiplist.pyx | 153 + pandas/src/sparse.pyx | 1190 ++ pandas/src/testing.pyx | 141 + pandas/src/ujson/lib/ultrajson.h | 313 + pandas/src/ujson/lib/ultrajsondec.c | 929 + pandas/src/ujson/lib/ultrajsonenc.c | 947 ++ pandas/src/ujson/python/JSONtoObj.c | 736 + pandas/src/ujson/python/objToJSON.c | 2103 +++ pandas/src/ujson/python/py_defines.h | 52 + pandas/src/ujson/python/ujson.c | 112 + pandas/src/ujson/python/version.h | 38 + pandas/src/util.pxd | 84 + pandas/stats/__init__.py | 0 pandas/stats/api.py | 9 + pandas/stats/common.py | 41 + pandas/stats/fama_macbeth.py | 226 + pandas/stats/interface.py | 135 + pandas/stats/math.py | 130 + pandas/stats/misc.py | 386 + pandas/stats/moments.py | 991 ++ pandas/stats/ols.py | 1363 ++ pandas/stats/plm.py | 814 + pandas/stats/tests/__init__.py | 0 pandas/stats/tests/common.py | 160 + pandas/stats/tests/test_fama_macbeth.py | 64 + pandas/stats/tests/test_math.py | 67 + pandas/stats/tests/test_moments.py | 1064 ++ pandas/stats/tests/test_ols.py | 890 + pandas/stats/tests/test_var.py | 195 + pandas/stats/var.py | 595 + pandas/tests/__init__.py | 0 pandas/tests/data/iris.csv | 151 + pandas/tests/data/mindex_073.pickle | Bin 0 -> 670 bytes pandas/tests/data/multiindex_v1.pickle | 149 + pandas/tests/data/tips.csv | 245 + pandas/tests/data/unicode_series.csv | 18 + pandas/tests/test_algos.py | 
264 + pandas/tests/test_base.py | 738 + pandas/tests/test_categorical.py | 222 + pandas/tests/test_common.py | 876 + pandas/tests/test_compat.py | 70 + pandas/tests/test_config.py | 426 + pandas/tests/test_expressions.py | 419 + pandas/tests/test_format.py | 2944 ++++ pandas/tests/test_frame.py | 14117 ++++++++++++++++ pandas/tests/test_generic.py | 1219 ++ pandas/tests/test_graphics.py | 2578 +++ pandas/tests/test_groupby.py | 4473 +++++ pandas/tests/test_index.py | 2832 ++++ pandas/tests/test_indexing.py | 3725 ++++ pandas/tests/test_internals.py | 1010 ++ pandas/tests/test_msgpack/__init__.py | 0 pandas/tests/test_msgpack/test_buffer.py | 12 + pandas/tests/test_msgpack/test_case.py | 101 + pandas/tests/test_msgpack/test_except.py | 29 + pandas/tests/test_msgpack/test_format.py | 70 + pandas/tests/test_msgpack/test_obj.py | 71 + pandas/tests/test_msgpack/test_pack.py | 144 + pandas/tests/test_msgpack/test_read_size.py | 65 + pandas/tests/test_msgpack/test_seq.py | 44 + pandas/tests/test_msgpack/test_sequnpack.py | 84 + pandas/tests/test_msgpack/test_subtype.py | 21 + pandas/tests/test_msgpack/test_unpack_raw.py | 28 + pandas/tests/test_multilevel.py | 2150 +++ pandas/tests/test_nanops.py | 756 + pandas/tests/test_panel.py | 2345 +++ pandas/tests/test_panel4d.py | 1055 ++ pandas/tests/test_panelnd.py | 110 + pandas/tests/test_reshape.py | 334 + pandas/tests/test_rplot.py | 298 + pandas/tests/test_series.py | 6050 +++++++ pandas/tests/test_stats.py | 137 + pandas/tests/test_strings.py | 1199 ++ pandas/tests/test_testing.py | 186 + pandas/tests/test_tseries.py | 722 + pandas/tools/__init__.py | 0 pandas/tools/describe.py | 17 + pandas/tools/merge.py | 1093 ++ pandas/tools/pivot.py | 409 + pandas/tools/plotting.py | 3075 ++++ pandas/tools/rplot.py | 885 + pandas/tools/tests/__init__.py | 0 pandas/tools/tests/cut_data.csv | 1 + pandas/tools/tests/test_merge.py | 2158 +++ pandas/tools/tests/test_pivot.py | 644 + pandas/tools/tests/test_tile.py | 241 + pandas/tools/tests/test_tools.py | 23 + pandas/tools/tests/test_util.py | 95 + pandas/tools/tile.py | 266 + pandas/tools/util.py | 49 + pandas/tseries/__init__.py | 0 pandas/tseries/api.py | 12 + pandas/tseries/converter.py | 987 ++ pandas/tseries/frequencies.py | 1048 ++ pandas/tseries/holiday.py | 355 + pandas/tseries/index.py | 2071 +++ pandas/tseries/interval.py | 37 + pandas/tseries/offsets.py | 2244 +++ pandas/tseries/period.py | 1337 ++ pandas/tseries/plotting.py | 251 + pandas/tseries/resample.py | 448 + pandas/tseries/tests/__init__.py | 0 .../tseries/tests/data/daterange_073.pickle | Bin 0 -> 650 bytes pandas/tseries/tests/data/frame.pickle | Bin 0 -> 1182 bytes pandas/tseries/tests/data/series.pickle | Bin 0 -> 646 bytes .../tests/data/series_daterange0.pickle | Bin 0 -> 357 bytes pandas/tseries/tests/test_converter.py | 98 + pandas/tseries/tests/test_daterange.py | 687 + pandas/tseries/tests/test_frequencies.py | 348 + pandas/tseries/tests/test_holiday.py | 168 + pandas/tseries/tests/test_offsets.py | 3083 ++++ pandas/tseries/tests/test_period.py | 2663 +++ pandas/tseries/tests/test_plotting.py | 1045 ++ pandas/tseries/tests/test_resample.py | 1399 ++ pandas/tseries/tests/test_timedeltas.py | 425 + pandas/tseries/tests/test_timeseries.py | 4212 +++++ .../tseries/tests/test_timeseries_legacy.py | 238 + pandas/tseries/tests/test_timezones.py | 1140 ++ pandas/tseries/tests/test_tslib.py | 491 + pandas/tseries/tests/test_util.py | 107 + pandas/tseries/timedeltas.py | 251 + pandas/tseries/tools.py | 593 + pandas/tseries/util.py | 95 + 
pandas/tslib.pxd | 4 + pandas/tslib.pyx | 3769 +++++ pandas/util/__init__.py | 0 pandas/util/clipboard.py | 176 + pandas/util/decorators.py | 232 + pandas/util/misc.py | 10 + pandas/util/print_versions.py | 151 + pandas/util/terminal.py | 121 + pandas/util/testing.py | 1656 ++ scripts/bench_join.R | 50 + scripts/bench_join.py | 211 + scripts/bench_join_multi.py | 32 + scripts/bench_refactor.py | 51 + scripts/boxplot_test.py | 14 + scripts/count_code.sh | 1 + scripts/faster_xs.py | 15 + scripts/file_sizes.py | 208 + scripts/find_commits_touching_func.py | 202 + scripts/find_undoc_args.py | 126 + scripts/gen_release_notes.py | 95 + scripts/git-mrb | 82 + scripts/git_code_churn.py | 34 + scripts/groupby_sample.py | 54 + scripts/groupby_speed.py | 35 + scripts/groupby_test.py | 145 + scripts/hdfstore_panel_perf.py | 17 + scripts/json_manip.py | 423 + scripts/leak.py | 13 + scripts/parser_magic.py | 74 + scripts/preepoch_test.py | 23 + scripts/pypistats.py | 101 + scripts/roll_median_leak.py | 26 + scripts/runtests.py | 5 + scripts/test_py25.bat | 8 + scripts/test_py26.bat | 8 + scripts/test_py27.bat | 6 + scripts/test_py31.bat | 8 + scripts/test_py32.bat | 8 + scripts/testmed.py | 171 + scripts/touchup_gh_issues.py | 44 + scripts/use_build_cache.py | 354 + scripts/winbuild_py25.bat | 2 + scripts/winbuild_py27.bat | 2 + scripts/windows_builder/build_26-32.bat | 21 + scripts/windows_builder/build_26-64.bat | 25 + scripts/windows_builder/build_27-32.bat | 25 + scripts/windows_builder/build_27-64.bat | 25 + scripts/windows_builder/build_33-32.bat | 27 + scripts/windows_builder/build_33-64.bat | 27 + scripts/windows_builder/build_34-32.bat | 27 + scripts/windows_builder/build_34-64.bat | 27 + scripts/windows_builder/check_and_build.bat | 2 + scripts/windows_builder/check_and_build.py | 194 + scripts/windows_builder/readme.txt | 17 + setup.py | 606 + test.sh | 11 + test_fast.sh | 1 + test_multi.sh | 1 + test_perf.sh | 5 + test_rebuild.sh | 12 + tox.ini | 72 + vb_suite/.gitignore | 4 + vb_suite/attrs_caching.py | 20 + vb_suite/binary_ops.py | 129 + vb_suite/ctors.py | 39 + vb_suite/eval.py | 150 + vb_suite/frame_ctor.py | 100 + vb_suite/frame_methods.py | 463 + vb_suite/generate_rst_files.py | 2 + vb_suite/groupby.py | 448 + vb_suite/hdfstore_bench.py | 278 + vb_suite/index_object.py | 119 + vb_suite/indexing.py | 213 + vb_suite/inference.py | 36 + vb_suite/io_bench.py | 133 + vb_suite/join_merge.py | 239 + vb_suite/make.py | 167 + vb_suite/measure_memory_consumption.py | 55 + vb_suite/miscellaneous.py | 34 + vb_suite/packers.py | 123 + vb_suite/pandas_vb_common.py | 25 + vb_suite/panel_ctor.py | 76 + vb_suite/panel_methods.py | 28 + vb_suite/parser_vb.py | 81 + vb_suite/perf_HEAD.py | 243 + vb_suite/plotting.py | 25 + vb_suite/reindex.py | 214 + vb_suite/replace.py | 36 + vb_suite/reshape.py | 65 + vb_suite/run_suite.py | 15 + vb_suite/series_methods.py | 29 + vb_suite/source/_static/stub | 0 vb_suite/source/conf.py | 225 + vb_suite/source/themes/agogo/layout.html | 95 + .../source/themes/agogo/static/agogo.css_t | 476 + .../source/themes/agogo/static/bgfooter.png | Bin 0 -> 434 bytes vb_suite/source/themes/agogo/static/bgtop.png | Bin 0 -> 430 bytes vb_suite/source/themes/agogo/theme.conf | 19 + vb_suite/sparse.py | 39 + vb_suite/stat_ops.py | 126 + vb_suite/strings.py | 58 + vb_suite/suite.py | 162 + vb_suite/test.py | 67 + vb_suite/test_perf.py | 616 + vb_suite/timedelta.py | 32 + vb_suite/timeseries.py | 335 + 647 files changed, 294518 insertions(+) create mode 100644 .coveragerc create mode 
100644 .gitattributes create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 LICENSES/MSGPACK_LICENSE create mode 100644 LICENSES/MSGPACK_NUMPY_LICENSE create mode 100644 LICENSES/NUMPY_LICENSE create mode 100644 LICENSES/OTHER create mode 100644 LICENSES/PSF_LICENSE create mode 100644 LICENSES/SCIPY_LICENSE create mode 100644 LICENSES/SIX create mode 100644 LICENSES/ULTRAJSON_LICENSE create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 README.md create mode 100644 RELEASE.md create mode 100644 bench/alignment.py create mode 100644 bench/bench_dense_to_sparse.py create mode 100644 bench/bench_get_put_value.py create mode 100644 bench/bench_groupby.py create mode 100644 bench/bench_join_panel.py create mode 100644 bench/bench_khash_dict.py create mode 100644 bench/bench_merge.R create mode 100644 bench/bench_merge.py create mode 100644 bench/bench_merge_sqlite.py create mode 100644 bench/bench_pivot.R create mode 100644 bench/bench_pivot.py create mode 100644 bench/bench_sparse.py create mode 100644 bench/bench_take_indexing.py create mode 100644 bench/bench_unique.py create mode 100644 bench/bench_with_subset.R create mode 100644 bench/bench_with_subset.py create mode 100644 bench/better_unique.py create mode 100644 bench/duplicated.R create mode 100644 bench/io_roundtrip.py create mode 100644 bench/larry.py create mode 100644 bench/serialize.py create mode 100644 bench/test.py create mode 100644 bench/zoo_bench.R create mode 100644 bench/zoo_bench.py create mode 100644 ci/README.txt create mode 100755 ci/after_script.sh create mode 100755 ci/before_install.sh create mode 100755 ci/build_docs.sh create mode 100755 ci/cron/go_doc.sh create mode 100755 ci/install.sh create mode 100644 ci/ironcache/get.py create mode 100644 ci/ironcache/put.py create mode 100755 ci/prep_ccache.sh create mode 100755 ci/print_skipped.py create mode 100755 ci/print_versions.py create mode 100644 ci/requirements-2.6.txt create mode 100644 ci/requirements-2.7.txt create mode 100644 ci/requirements-2.7_LOCALE.txt create mode 100644 ci/requirements-2.7_NUMPY_DEV_1_8_x.txt create mode 100644 ci/requirements-2.7_NUMPY_DEV_master.txt create mode 100644 ci/requirements-3.2.txt create mode 100644 ci/requirements-3.3.txt create mode 100644 ci/requirements-3.4.txt create mode 100755 ci/script.sh create mode 100644 ci/speedpack/Vagrantfile create mode 100755 ci/speedpack/build.sh create mode 100644 ci/speedpack/nginx/nginx.conf.template create mode 100755 ci/submit_ccache.sh create mode 100644 doc/README.rst create mode 100644 doc/_templates/autosummary/class.rst create mode 100644 doc/data/baseball.csv create mode 100644 doc/data/fx_prices create mode 100644 doc/data/iris.data create mode 100644 doc/data/mindex_ex.csv create mode 100644 doc/data/test.xls create mode 100644 doc/data/tips.csv create mode 100755 doc/make.py create mode 100644 doc/plots/stats/moment_plots.py create mode 100644 doc/plots/stats/moments_ewma.py create mode 100644 doc/plots/stats/moments_ewmvol.py create mode 100644 doc/plots/stats/moments_expw.py create mode 100644 doc/plots/stats/moments_rolling.py create mode 100644 doc/plots/stats/moments_rolling_binary.py create mode 100644 doc/source/10min.rst create mode 100644 doc/source/_static/banklist.html create mode 100644 doc/source/_static/df_repr_truncated.png create mode 100644 doc/source/_static/eval-perf-small.png create mode 100644 doc/source/_static/eval-perf.png create mode 100644 
doc/source/_static/legacy_0.10.h5 create mode 100644 doc/source/_static/query-perf-small.png create mode 100644 doc/source/_static/query-perf.png create mode 100644 doc/source/_static/stub create mode 100644 doc/source/_static/trunc_after.png create mode 100644 doc/source/_static/trunc_before.png create mode 100644 doc/source/api.rst create mode 100644 doc/source/basics.rst create mode 100644 doc/source/comparison_with_r.rst create mode 100644 doc/source/comparison_with_sql.rst create mode 100644 doc/source/computation.rst create mode 100644 doc/source/conf.py create mode 100644 doc/source/contributing.rst create mode 100644 doc/source/cookbook.rst create mode 100644 doc/source/dsintro.rst create mode 100644 doc/source/ecosystem.rst create mode 100644 doc/source/enhancingperf.rst create mode 100644 doc/source/faq.rst create mode 100644 doc/source/gotchas.rst create mode 100644 doc/source/groupby.rst create mode 100644 doc/source/index.rst.template create mode 100644 doc/source/indexing.rst create mode 100644 doc/source/install.rst create mode 100644 doc/source/io.rst create mode 100644 doc/source/merging.rst create mode 100644 doc/source/missing_data.rst create mode 100644 doc/source/options.rst create mode 100644 doc/source/overview.rst create mode 100644 doc/source/r_interface.rst create mode 100644 doc/source/release.rst create mode 100644 doc/source/remote_data.rst create mode 100644 doc/source/reshaping.rst create mode 100644 doc/source/rplot.rst create mode 100644 doc/source/sparse.rst create mode 100644 doc/source/themes/nature_with_gtoc/layout.html create mode 100644 doc/source/themes/nature_with_gtoc/static/nature.css_t create mode 100644 doc/source/themes/nature_with_gtoc/theme.conf create mode 100644 doc/source/timeseries.rst create mode 100644 doc/source/tutorials.rst create mode 100644 doc/source/v0.10.0.txt create mode 100644 doc/source/v0.10.1.txt create mode 100644 doc/source/v0.11.0.txt create mode 100644 doc/source/v0.12.0.txt create mode 100644 doc/source/v0.13.0.txt create mode 100644 doc/source/v0.13.1.txt create mode 100644 doc/source/v0.14.0.txt create mode 100644 doc/source/v0.14.1.txt create mode 100644 doc/source/v0.15.0.txt create mode 100644 doc/source/v0.4.x.txt create mode 100644 doc/source/v0.5.0.txt create mode 100644 doc/source/v0.6.0.txt create mode 100644 doc/source/v0.6.1.txt create mode 100644 doc/source/v0.7.0.txt create mode 100644 doc/source/v0.7.1.txt create mode 100644 doc/source/v0.7.2.txt create mode 100644 doc/source/v0.7.3.txt create mode 100644 doc/source/v0.8.0.txt create mode 100644 doc/source/v0.8.1.txt create mode 100644 doc/source/v0.9.0.txt create mode 100644 doc/source/v0.9.1.txt create mode 100644 doc/source/visualization.rst create mode 100644 doc/source/whatsnew.rst create mode 100644 doc/sphinxext/README.rst create mode 100644 doc/sphinxext/ipython_sphinxext/__init__.py create mode 100644 doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py create mode 100644 doc/sphinxext/ipython_sphinxext/ipython_directive.py create mode 100755 doc/sphinxext/numpydoc/LICENSE.txt create mode 100755 doc/sphinxext/numpydoc/README.rst create mode 100755 doc/sphinxext/numpydoc/__init__.py create mode 100755 doc/sphinxext/numpydoc/comment_eater.py create mode 100755 doc/sphinxext/numpydoc/compiler_unparse.py create mode 100755 doc/sphinxext/numpydoc/docscrape.py create mode 100755 doc/sphinxext/numpydoc/docscrape_sphinx.py create mode 100644 doc/sphinxext/numpydoc/linkcode.py create mode 100755 doc/sphinxext/numpydoc/numpydoc.py create mode 
100755 doc/sphinxext/numpydoc/phantom_import.py create mode 100755 doc/sphinxext/numpydoc/plot_directive.py create mode 100755 doc/sphinxext/numpydoc/tests/test_docscrape.py create mode 100644 doc/sphinxext/numpydoc/tests/test_linkcode.py create mode 100644 doc/sphinxext/numpydoc/tests/test_phantom_import.py create mode 100644 doc/sphinxext/numpydoc/tests/test_plot_directive.py create mode 100644 doc/sphinxext/numpydoc/tests/test_traitsdoc.py create mode 100755 doc/sphinxext/numpydoc/traitsdoc.py create mode 100644 examples/data/SOURCES create mode 100644 examples/finance.py create mode 100644 examples/regressions.py create mode 100644 ez_setup.py create mode 100644 fake_pyrex/Pyrex/Distutils/__init__.py create mode 100644 fake_pyrex/Pyrex/Distutils/build_ext.py create mode 100644 fake_pyrex/Pyrex/__init__.py create mode 100644 pandas/__init__.py create mode 100644 pandas/algos.pyx create mode 100644 pandas/compat/__init__.py create mode 100644 pandas/compat/chainmap.py create mode 100644 pandas/compat/chainmap_impl.py create mode 100644 pandas/compat/openpyxl_compat.py create mode 100644 pandas/compat/pickle_compat.py create mode 100644 pandas/computation/__init__.py create mode 100644 pandas/computation/align.py create mode 100644 pandas/computation/api.py create mode 100644 pandas/computation/common.py create mode 100644 pandas/computation/engines.py create mode 100644 pandas/computation/eval.py create mode 100644 pandas/computation/expr.py create mode 100644 pandas/computation/expressions.py create mode 100644 pandas/computation/ops.py create mode 100644 pandas/computation/pytables.py create mode 100644 pandas/computation/scope.py create mode 100644 pandas/computation/tests/__init__.py create mode 100644 pandas/computation/tests/test_eval.py create mode 100644 pandas/core/__init__.py create mode 100644 pandas/core/algorithms.py create mode 100644 pandas/core/api.py create mode 100644 pandas/core/array.py create mode 100644 pandas/core/base.py create mode 100644 pandas/core/categorical.py create mode 100644 pandas/core/common.py create mode 100644 pandas/core/config.py create mode 100644 pandas/core/config_init.py create mode 100644 pandas/core/datetools.py create mode 100644 pandas/core/format.py create mode 100644 pandas/core/frame.py create mode 100644 pandas/core/generic.py create mode 100644 pandas/core/groupby.py create mode 100644 pandas/core/index.py create mode 100644 pandas/core/indexing.py create mode 100644 pandas/core/internals.py create mode 100644 pandas/core/matrix.py create mode 100644 pandas/core/nanops.py create mode 100644 pandas/core/ops.py create mode 100644 pandas/core/panel.py create mode 100644 pandas/core/panel4d.py create mode 100644 pandas/core/panelnd.py create mode 100644 pandas/core/reshape.py create mode 100644 pandas/core/series.py create mode 100644 pandas/core/sparse.py create mode 100644 pandas/core/strings.py create mode 100644 pandas/hashtable.pxd create mode 100644 pandas/hashtable.pyx create mode 100644 pandas/index.pyx create mode 100644 pandas/info.py create mode 100644 pandas/io/__init__.py create mode 100644 pandas/io/api.py create mode 100644 pandas/io/auth.py create mode 100644 pandas/io/clipboard.py create mode 100644 pandas/io/common.py create mode 100644 pandas/io/data.py create mode 100644 pandas/io/date_converters.py create mode 100644 pandas/io/excel.py create mode 100644 pandas/io/ga.py create mode 100644 pandas/io/gbq.py create mode 100644 pandas/io/html.py create mode 100644 pandas/io/json.py create mode 100644 pandas/io/packers.py 
create mode 100644 pandas/io/parsers.py create mode 100644 pandas/io/pickle.py create mode 100644 pandas/io/pytables.py create mode 100644 pandas/io/sql.py create mode 100644 pandas/io/stata.py create mode 100644 pandas/io/tests/__init__.py create mode 100644 pandas/io/tests/data/banklist.csv create mode 100644 pandas/io/tests/data/banklist.html create mode 100644 pandas/io/tests/data/computer_sales_page.html create mode 100644 pandas/io/tests/data/gbq_fake_job.txt create mode 100644 pandas/io/tests/data/html_encoding/chinese_utf16.html create mode 100644 pandas/io/tests/data/html_encoding/chinese_utf32.html create mode 100644 pandas/io/tests/data/html_encoding/chinese_utf8.html create mode 100644 pandas/io/tests/data/html_encoding/letz_latin1.html create mode 100644 pandas/io/tests/data/iris.csv create mode 100644 pandas/io/tests/data/legacy_hdf/legacy.h5 create mode 100644 pandas/io/tests/data/legacy_hdf/legacy_0.10.h5 create mode 100644 pandas/io/tests/data/legacy_hdf/legacy_table.h5 create mode 100644 pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 create mode 100644 pandas/io/tests/data/legacy_hdf/pytables_native.h5 create mode 100644 pandas/io/tests/data/legacy_hdf/pytables_native2.h5 create mode 100644 pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_AMD64_windows_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_AMD64_windows_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.6.5.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_i686_linux_3.2.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.5.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_darwin_2.7.6.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_3.3.0.pickle create mode 100644 pandas/io/tests/data/legacy_pickle/0.14.0/0.14.0_x86_64_darwin_2.7.6.pickle create mode 100644 pandas/io/tests/data/macau.html create mode 100644 pandas/io/tests/data/nyse_wsj.html create mode 100644 pandas/io/tests/data/salary.table create mode 100644 pandas/io/tests/data/spam.html create mode 100644 pandas/io/tests/data/stata1_114.dta create mode 100644 pandas/io/tests/data/stata1_117.dta create mode 100644 pandas/io/tests/data/stata1_encoding.dta create mode 100644 pandas/io/tests/data/stata2_113.dta create mode 100644 pandas/io/tests/data/stata2_114.dta create mode 100644 pandas/io/tests/data/stata2_115.dta create mode 100644 pandas/io/tests/data/stata2_115.dta~1dc157c... 
Added additional data files for testing alternative Stata file formats create mode 100644 pandas/io/tests/data/stata2_117.dta create mode 100644 pandas/io/tests/data/stata3.csv create mode 100644 pandas/io/tests/data/stata3_113.dta create mode 100644 pandas/io/tests/data/stata3_114.dta create mode 100644 pandas/io/tests/data/stata3_115.dta create mode 100644 pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats create mode 100644 pandas/io/tests/data/stata3_117.dta create mode 100644 pandas/io/tests/data/stata4_113.dta create mode 100644 pandas/io/tests/data/stata4_114.dta create mode 100644 pandas/io/tests/data/stata4_115.dta create mode 100644 pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats create mode 100644 pandas/io/tests/data/stata4_117.dta create mode 100644 pandas/io/tests/data/stata5.csv create mode 100644 pandas/io/tests/data/stata5_113.dta create mode 100644 pandas/io/tests/data/stata5_114.dta create mode 100644 pandas/io/tests/data/stata5_115.dta create mode 100644 pandas/io/tests/data/stata5_117.dta create mode 100644 pandas/io/tests/data/stata6.csv create mode 100644 pandas/io/tests/data/stata6_113.dta create mode 100644 pandas/io/tests/data/stata6_114.dta create mode 100644 pandas/io/tests/data/stata6_115.dta create mode 100644 pandas/io/tests/data/stata6_117.dta create mode 100644 pandas/io/tests/data/test.xls create mode 100644 pandas/io/tests/data/test.xlsm create mode 100644 pandas/io/tests/data/test.xlsx create mode 100644 pandas/io/tests/data/test1.csv create mode 100644 pandas/io/tests/data/test2.csv create mode 100644 pandas/io/tests/data/test2.xls create mode 100644 pandas/io/tests/data/test2.xlsx create mode 100644 pandas/io/tests/data/test3.xls create mode 100644 pandas/io/tests/data/test_types.xls create mode 100644 pandas/io/tests/data/test_types.xlsx create mode 100644 pandas/io/tests/data/times_1900.xls create mode 100644 pandas/io/tests/data/times_1904.xls create mode 100644 pandas/io/tests/data/tips.csv create mode 100644 pandas/io/tests/data/unicode_series.csv create mode 100644 pandas/io/tests/data/utf16_ex.txt create mode 100644 pandas/io/tests/data/valid_markup.html create mode 100644 pandas/io/tests/data/yahoo_options1.html create mode 100644 pandas/io/tests/data/yahoo_options2.html create mode 100644 pandas/io/tests/generate_legacy_pickles.py create mode 100644 pandas/io/tests/test_clipboard.py create mode 100644 pandas/io/tests/test_cparser.py create mode 100644 pandas/io/tests/test_data.py create mode 100644 pandas/io/tests/test_date_converters.py create mode 100644 pandas/io/tests/test_excel.py create mode 100644 pandas/io/tests/test_ga.py create mode 100644 pandas/io/tests/test_gbq.py create mode 100644 pandas/io/tests/test_html.py create mode 100644 pandas/io/tests/test_json/__init__.py create mode 100644 pandas/io/tests/test_json/data/tsframe_iso_v012.json create mode 100644 pandas/io/tests/test_json/data/tsframe_v012.json create mode 100644 pandas/io/tests/test_json/test_pandas.py create mode 100644 pandas/io/tests/test_json/test_ujson.py create mode 100644 pandas/io/tests/test_json_norm.py create mode 100644 pandas/io/tests/test_packers.py create mode 100644 pandas/io/tests/test_parsers.py create mode 100644 pandas/io/tests/test_pickle.py create mode 100644 pandas/io/tests/test_pytables.py create mode 100644 pandas/io/tests/test_sql.py create mode 100644 pandas/io/tests/test_stata.py create mode 100644 
pandas/io/tests/test_wb.py create mode 100644 pandas/io/wb.py create mode 100644 pandas/lib.pyx create mode 100644 pandas/msgpack.pyx create mode 100644 pandas/parser.pyx create mode 100644 pandas/rpy/__init__.py create mode 100644 pandas/rpy/base.py create mode 100644 pandas/rpy/common.py create mode 100644 pandas/rpy/mass.py create mode 100644 pandas/rpy/tests/__init__.py create mode 100644 pandas/rpy/tests/test_common.py create mode 100644 pandas/rpy/vars.py create mode 100644 pandas/sandbox/__init__.py create mode 100644 pandas/sandbox/qtpandas.py create mode 100644 pandas/sparse/__init__.py create mode 100644 pandas/sparse/api.py create mode 100644 pandas/sparse/array.py create mode 100644 pandas/sparse/frame.py create mode 100644 pandas/sparse/list.py create mode 100644 pandas/sparse/panel.py create mode 100644 pandas/sparse/series.py create mode 100644 pandas/sparse/tests/__init__.py create mode 100644 pandas/sparse/tests/test_array.py create mode 100644 pandas/sparse/tests/test_libsparse.py create mode 100644 pandas/sparse/tests/test_list.py create mode 100644 pandas/sparse/tests/test_sparse.py create mode 100644 pandas/src/datetime.pxd create mode 100644 pandas/src/datetime/np_datetime.c create mode 100644 pandas/src/datetime/np_datetime.h create mode 100644 pandas/src/datetime/np_datetime_strings.c create mode 100644 pandas/src/datetime/np_datetime_strings.h create mode 100644 pandas/src/datetime_helper.h create mode 100644 pandas/src/generate_code.py create mode 100644 pandas/src/generated.pyx create mode 100644 pandas/src/headers/math.h create mode 100644 pandas/src/headers/ms_inttypes.h create mode 100644 pandas/src/headers/ms_stdint.h create mode 100644 pandas/src/headers/portable.h create mode 100644 pandas/src/headers/stdint.h create mode 100644 pandas/src/helper.h create mode 100644 pandas/src/inference.pyx create mode 100644 pandas/src/join.pyx create mode 100644 pandas/src/khash.pxd create mode 100644 pandas/src/klib/khash.h create mode 100644 pandas/src/klib/khash_python.h create mode 100644 pandas/src/klib/ktypes.h create mode 100644 pandas/src/klib/kvec.h create mode 100644 pandas/src/msgpack/pack.h create mode 100644 pandas/src/msgpack/pack_template.h create mode 100644 pandas/src/msgpack/sysdep.h create mode 100644 pandas/src/msgpack/unpack.h create mode 100644 pandas/src/msgpack/unpack_define.h create mode 100644 pandas/src/msgpack/unpack_template.h create mode 100644 pandas/src/numpy.pxd create mode 100644 pandas/src/numpy_helper.h create mode 100644 pandas/src/offsets.pyx create mode 100644 pandas/src/parse_helper.h create mode 100644 pandas/src/parser/.gitignore create mode 100644 pandas/src/parser/Makefile create mode 100644 pandas/src/parser/io.c create mode 100644 pandas/src/parser/io.h create mode 100644 pandas/src/parser/tokenizer.c create mode 100644 pandas/src/parser/tokenizer.h create mode 100644 pandas/src/period.c create mode 100644 pandas/src/period.h create mode 100644 pandas/src/properties.pyx create mode 100644 pandas/src/reduce.pyx create mode 100644 pandas/src/skiplist.h create mode 100644 pandas/src/skiplist.pxd create mode 100644 pandas/src/skiplist.pyx create mode 100644 pandas/src/sparse.pyx create mode 100644 pandas/src/testing.pyx create mode 100644 pandas/src/ujson/lib/ultrajson.h create mode 100644 pandas/src/ujson/lib/ultrajsondec.c create mode 100644 pandas/src/ujson/lib/ultrajsonenc.c create mode 100644 pandas/src/ujson/python/JSONtoObj.c create mode 100644 pandas/src/ujson/python/objToJSON.c create mode 100644 
pandas/src/ujson/python/py_defines.h create mode 100644 pandas/src/ujson/python/ujson.c create mode 100644 pandas/src/ujson/python/version.h create mode 100644 pandas/src/util.pxd create mode 100644 pandas/stats/__init__.py create mode 100644 pandas/stats/api.py create mode 100644 pandas/stats/common.py create mode 100644 pandas/stats/fama_macbeth.py create mode 100644 pandas/stats/interface.py create mode 100644 pandas/stats/math.py create mode 100644 pandas/stats/misc.py create mode 100644 pandas/stats/moments.py create mode 100644 pandas/stats/ols.py create mode 100644 pandas/stats/plm.py create mode 100644 pandas/stats/tests/__init__.py create mode 100644 pandas/stats/tests/common.py create mode 100644 pandas/stats/tests/test_fama_macbeth.py create mode 100644 pandas/stats/tests/test_math.py create mode 100644 pandas/stats/tests/test_moments.py create mode 100644 pandas/stats/tests/test_ols.py create mode 100644 pandas/stats/tests/test_var.py create mode 100644 pandas/stats/var.py create mode 100644 pandas/tests/__init__.py create mode 100644 pandas/tests/data/iris.csv create mode 100644 pandas/tests/data/mindex_073.pickle create mode 100644 pandas/tests/data/multiindex_v1.pickle create mode 100644 pandas/tests/data/tips.csv create mode 100644 pandas/tests/data/unicode_series.csv create mode 100644 pandas/tests/test_algos.py create mode 100644 pandas/tests/test_base.py create mode 100644 pandas/tests/test_categorical.py create mode 100644 pandas/tests/test_common.py create mode 100644 pandas/tests/test_compat.py create mode 100644 pandas/tests/test_config.py create mode 100644 pandas/tests/test_expressions.py create mode 100644 pandas/tests/test_format.py create mode 100644 pandas/tests/test_frame.py create mode 100644 pandas/tests/test_generic.py create mode 100644 pandas/tests/test_graphics.py create mode 100644 pandas/tests/test_groupby.py create mode 100644 pandas/tests/test_index.py create mode 100644 pandas/tests/test_indexing.py create mode 100644 pandas/tests/test_internals.py create mode 100644 pandas/tests/test_msgpack/__init__.py create mode 100644 pandas/tests/test_msgpack/test_buffer.py create mode 100644 pandas/tests/test_msgpack/test_case.py create mode 100644 pandas/tests/test_msgpack/test_except.py create mode 100644 pandas/tests/test_msgpack/test_format.py create mode 100644 pandas/tests/test_msgpack/test_obj.py create mode 100644 pandas/tests/test_msgpack/test_pack.py create mode 100644 pandas/tests/test_msgpack/test_read_size.py create mode 100644 pandas/tests/test_msgpack/test_seq.py create mode 100644 pandas/tests/test_msgpack/test_sequnpack.py create mode 100644 pandas/tests/test_msgpack/test_subtype.py create mode 100644 pandas/tests/test_msgpack/test_unpack_raw.py create mode 100644 pandas/tests/test_multilevel.py create mode 100644 pandas/tests/test_nanops.py create mode 100644 pandas/tests/test_panel.py create mode 100644 pandas/tests/test_panel4d.py create mode 100644 pandas/tests/test_panelnd.py create mode 100644 pandas/tests/test_reshape.py create mode 100644 pandas/tests/test_rplot.py create mode 100644 pandas/tests/test_series.py create mode 100644 pandas/tests/test_stats.py create mode 100644 pandas/tests/test_strings.py create mode 100644 pandas/tests/test_testing.py create mode 100644 pandas/tests/test_tseries.py create mode 100644 pandas/tools/__init__.py create mode 100644 pandas/tools/describe.py create mode 100644 pandas/tools/merge.py create mode 100644 pandas/tools/pivot.py create mode 100644 pandas/tools/plotting.py create mode 100644 
pandas/tools/rplot.py create mode 100644 pandas/tools/tests/__init__.py create mode 100644 pandas/tools/tests/cut_data.csv create mode 100644 pandas/tools/tests/test_merge.py create mode 100644 pandas/tools/tests/test_pivot.py create mode 100644 pandas/tools/tests/test_tile.py create mode 100644 pandas/tools/tests/test_tools.py create mode 100644 pandas/tools/tests/test_util.py create mode 100644 pandas/tools/tile.py create mode 100644 pandas/tools/util.py create mode 100644 pandas/tseries/__init__.py create mode 100644 pandas/tseries/api.py create mode 100644 pandas/tseries/converter.py create mode 100644 pandas/tseries/frequencies.py create mode 100644 pandas/tseries/holiday.py create mode 100644 pandas/tseries/index.py create mode 100644 pandas/tseries/interval.py create mode 100644 pandas/tseries/offsets.py create mode 100644 pandas/tseries/period.py create mode 100644 pandas/tseries/plotting.py create mode 100644 pandas/tseries/resample.py create mode 100644 pandas/tseries/tests/__init__.py create mode 100644 pandas/tseries/tests/data/daterange_073.pickle create mode 100644 pandas/tseries/tests/data/frame.pickle create mode 100644 pandas/tseries/tests/data/series.pickle create mode 100644 pandas/tseries/tests/data/series_daterange0.pickle create mode 100644 pandas/tseries/tests/test_converter.py create mode 100644 pandas/tseries/tests/test_daterange.py create mode 100644 pandas/tseries/tests/test_frequencies.py create mode 100644 pandas/tseries/tests/test_holiday.py create mode 100644 pandas/tseries/tests/test_offsets.py create mode 100644 pandas/tseries/tests/test_period.py create mode 100644 pandas/tseries/tests/test_plotting.py create mode 100644 pandas/tseries/tests/test_resample.py create mode 100644 pandas/tseries/tests/test_timedeltas.py create mode 100644 pandas/tseries/tests/test_timeseries.py create mode 100644 pandas/tseries/tests/test_timeseries_legacy.py create mode 100644 pandas/tseries/tests/test_timezones.py create mode 100644 pandas/tseries/tests/test_tslib.py create mode 100644 pandas/tseries/tests/test_util.py create mode 100644 pandas/tseries/timedeltas.py create mode 100644 pandas/tseries/tools.py create mode 100644 pandas/tseries/util.py create mode 100644 pandas/tslib.pxd create mode 100644 pandas/tslib.pyx create mode 100644 pandas/util/__init__.py create mode 100644 pandas/util/clipboard.py create mode 100644 pandas/util/decorators.py create mode 100644 pandas/util/misc.py create mode 100644 pandas/util/print_versions.py create mode 100644 pandas/util/terminal.py create mode 100644 pandas/util/testing.py create mode 100644 scripts/bench_join.R create mode 100644 scripts/bench_join.py create mode 100644 scripts/bench_join_multi.py create mode 100644 scripts/bench_refactor.py create mode 100644 scripts/boxplot_test.py create mode 100755 scripts/count_code.sh create mode 100644 scripts/faster_xs.py create mode 100644 scripts/file_sizes.py create mode 100755 scripts/find_commits_touching_func.py create mode 100755 scripts/find_undoc_args.py create mode 100644 scripts/gen_release_notes.py create mode 100644 scripts/git-mrb create mode 100644 scripts/git_code_churn.py create mode 100644 scripts/groupby_sample.py create mode 100644 scripts/groupby_speed.py create mode 100644 scripts/groupby_test.py create mode 100644 scripts/hdfstore_panel_perf.py create mode 100644 scripts/json_manip.py create mode 100644 scripts/leak.py create mode 100644 scripts/parser_magic.py create mode 100644 scripts/preepoch_test.py create mode 100644 scripts/pypistats.py create mode 100644 
scripts/roll_median_leak.py create mode 100644 scripts/runtests.py create mode 100644 scripts/test_py25.bat create mode 100644 scripts/test_py26.bat create mode 100644 scripts/test_py27.bat create mode 100644 scripts/test_py31.bat create mode 100644 scripts/test_py32.bat create mode 100644 scripts/testmed.py create mode 100755 scripts/touchup_gh_issues.py create mode 100755 scripts/use_build_cache.py create mode 100644 scripts/winbuild_py25.bat create mode 100644 scripts/winbuild_py27.bat create mode 100644 scripts/windows_builder/build_26-32.bat create mode 100644 scripts/windows_builder/build_26-64.bat create mode 100644 scripts/windows_builder/build_27-32.bat create mode 100644 scripts/windows_builder/build_27-64.bat create mode 100644 scripts/windows_builder/build_33-32.bat create mode 100644 scripts/windows_builder/build_33-64.bat create mode 100644 scripts/windows_builder/build_34-32.bat create mode 100644 scripts/windows_builder/build_34-64.bat create mode 100644 scripts/windows_builder/check_and_build.bat create mode 100644 scripts/windows_builder/check_and_build.py create mode 100644 scripts/windows_builder/readme.txt create mode 100755 setup.py create mode 100755 test.sh create mode 100755 test_fast.sh create mode 100755 test_multi.sh create mode 100755 test_perf.sh create mode 100755 test_rebuild.sh create mode 100644 tox.ini create mode 100644 vb_suite/.gitignore create mode 100644 vb_suite/attrs_caching.py create mode 100644 vb_suite/binary_ops.py create mode 100644 vb_suite/ctors.py create mode 100644 vb_suite/eval.py create mode 100644 vb_suite/frame_ctor.py create mode 100644 vb_suite/frame_methods.py create mode 100644 vb_suite/generate_rst_files.py create mode 100644 vb_suite/groupby.py create mode 100644 vb_suite/hdfstore_bench.py create mode 100644 vb_suite/index_object.py create mode 100644 vb_suite/indexing.py create mode 100644 vb_suite/inference.py create mode 100644 vb_suite/io_bench.py create mode 100644 vb_suite/join_merge.py create mode 100755 vb_suite/make.py create mode 100755 vb_suite/measure_memory_consumption.py create mode 100644 vb_suite/miscellaneous.py create mode 100644 vb_suite/packers.py create mode 100644 vb_suite/pandas_vb_common.py create mode 100644 vb_suite/panel_ctor.py create mode 100644 vb_suite/panel_methods.py create mode 100644 vb_suite/parser_vb.py create mode 100755 vb_suite/perf_HEAD.py create mode 100644 vb_suite/plotting.py create mode 100644 vb_suite/reindex.py create mode 100644 vb_suite/replace.py create mode 100644 vb_suite/reshape.py create mode 100755 vb_suite/run_suite.py create mode 100644 vb_suite/series_methods.py create mode 100644 vb_suite/source/_static/stub create mode 100644 vb_suite/source/conf.py create mode 100644 vb_suite/source/themes/agogo/layout.html create mode 100644 vb_suite/source/themes/agogo/static/agogo.css_t create mode 100644 vb_suite/source/themes/agogo/static/bgfooter.png create mode 100644 vb_suite/source/themes/agogo/static/bgtop.png create mode 100644 vb_suite/source/themes/agogo/theme.conf create mode 100644 vb_suite/sparse.py create mode 100644 vb_suite/stat_ops.py create mode 100644 vb_suite/strings.py create mode 100644 vb_suite/suite.py create mode 100644 vb_suite/test.py create mode 100755 vb_suite/test_perf.py create mode 100644 vb_suite/timedelta.py create mode 100644 vb_suite/timeseries.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..5b264a62 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,26 @@ +# .coveragerc to control coverage.py +[run] +branch = False + 
+[report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + +ignore_errors = False + +[html] +directory = coverage_html_report \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..0ef16e42 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,15 @@ +* text=auto +# enforce text on certain files +*.py text +*.pyx text +*.pyd text +*.c text +*.h text +*.html text +*.csv text +*.json text +*.pickle binary +*.h5 binary +*.dta binary +*.xls binary +*.xlsx binary diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..92a7e4d3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,90 @@ +######################################### +# Editor temporary/working/backup files # +.#* +*\#*\# +[#]*# +*~ +*$ +*.bak +*flymake* +*.kdev4 +*.log +*.swp +*.pdb +.project +.pydevproject +.settings +.idea +.vagrant +.noseids + +# Compiled source # +################### +*.a +*.com +*.class +*.dll +*.exe +*.o +*.py[ocd] +*.so +.build_cache_dir +MANIFEST + +# Python files # +################ +# setup.py working directory +build +# sphinx build directory +doc/_build +# setup.py dist directory +dist +# Egg metadata +*.egg-info +# tox testing tool +.tox +# rope +.ropeproject +# wheel files +*.whl +**/wheelhouse/* +# coverage +.coverage + +# OS generated files # +###################### +.directory +.gdb_history +.DS_Store? +ehthumbs.db +Icon? +Thumbs.db + +# Data files # +############## +*.dta +*.h5 +pandas/io/*.dat +pandas/io/*.json +scikits + +# Generated Sources # +##################### +!skts.c +!np_datetime.c +!np_datetime_strings.c +*.c +*.cpp + +# Things specific to this project # +################################### +pandas/version.py + +# Documentation generated files # +################################# +doc/source/generated +doc/source/_static +doc/source/vbench +doc/source/vbench.rst +doc/source/index.rst +doc/build/html/index.html diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..d1350980 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,111 @@ + +language: python + +env: + global: + # scatterci API key + #- secure: "Bx5umgo6WjuGY+5XFa004xjCiX/vq0CyMZ/ETzcs7EIBI1BE/0fIDXOoWhoxbY9HPfdPGlDnDgB9nGqr5wArO2s+BavyKBWg6osZ3dmkfuJPMOWeyCa92EeP+sfKw8e5HSU5MizW9e319wHWOF/xkzdHR7T67Qd5erhv91x4DnQ=" + # ironcache API key + - secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA=" + - secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw=" + # pandas-docs-bot GH + - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" + +matrix: + fast_finish: true + include: + - python: 2.6 + env: + - NOSE_ARGS="not slow and not network and not disabled" + - CLIPBOARD=xclip + - LOCALE_OVERRIDE="it_IT.UTF-8" + - JOB_NAME: "26_nslow_nnet" + - python: 2.7 + env: + - NOSE_ARGS="slow and not network and not disabled" + - 
LOCALE_OVERRIDE="zh_CN.GB18030" + - FULL_DEPS=true + - JOB_TAG=_LOCALE + - JOB_NAME: "27_slow_nnet_LOCALE" + - python: 2.7 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=gtk2 + - JOB_NAME: "27_nslow" + - DOC_BUILD=true # if rst files were changed, build docs in parallel with tests + - python: 3.3 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - JOB_NAME: "33_nslow" + - python: 3.4 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - JOB_NAME: "34_nslow" + - python: 3.2 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=qt4 + - JOB_NAME: "32_nslow" + - python: 2.7 + env: + - EXPERIMENTAL=true + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_master" + - JOB_TAG=_NUMPY_DEV_master + - NUMPY_BUILD=master + - PANDAS_TESTING_MODE="deprecate" + allow_failures: + - python: 3.2 + env: + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=qt4 + - JOB_NAME: "32_nslow" + - python: 2.7 + env: + - EXPERIMENTAL=true + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_master" + - JOB_TAG=_NUMPY_DEV_master + - NUMPY_BUILD=master + - PANDAS_TESTING_MODE="deprecate" + +before_install: + - echo "before_install" + - echo $VIRTUAL_ENV + - df -h + - date + - pwd + - uname -a + - python -V + - ci/before_install.sh + # Xvfb stuff for clipboard functionality; see the travis-ci documentation + - export DISPLAY=:99.0 + - sh -e /etc/init.d/xvfb start + +install: + - echo "install" + - ci/prep_ccache.sh + - ci/install.sh + - ci/submit_ccache.sh + +before_script: + - mysql -e 'create database pandas_nosetest;' + - psql -c 'create database pandas_nosetest;' -U postgres + +script: + - echo "script" + - ci/script.sh +# nothing here, or failed tests won't fail travis + +after_script: + - if [ -f /tmp/doc.log ]; then cat /tmp/doc.log; fi + - ci/print_versions.py + - ci/print_skipped.py /tmp/nosetests.xml + - ci/after_script.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..e6ae1d0a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,98 @@ +###Guidelines + +All contributions, bug reports, bug fixes, documentation improvements, +enhancements and ideas are welcome. + +The [GitHub "issues" tab](https://github.com/pydata/pandas/issues) +contains some issues labeled "Good as first PR"; Look those up if you're +looking for a quick way to help out. + +#### Bug Reports + + - Please include a short, self-contained Python snippet reproducing the problem. + You can have the code formatted nicely by using [GitHub Flavored Markdown](http://github.github.com/github-flavored-markdown/) : + + ```python + + print("I ♥ pandas!") + + ``` + + - Include the full version string of pandas and it's dependencies. In recent (>0.12) versions + of pandas you can use a built in function: + + ```python + >>> from pandas.util.print_versions import show_versions + >>> show_versions() + ``` + + and in 0.13.1 onwards: + ```python + >>> pd.show_versions() + ``` + + - Explain what the expected behavior was, and what you saw instead. + +#### Pull Requests + + - **Make sure the test suite passes** on your box, Use the provided `test_*.sh` scripts or tox. + - Use [proper commit messages](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html): + - a subject line with `< 80` chars. + - One blank line. + - Optionally, a commit message body. 
+ - Please reference relevant GitHub issues in your commit message using `GH1234` + or `#1234`. Either style is fine, but the '#' style generates noise when you rebase your PR. + - `doc/source/vx.y.z.txt` contains an ongoing + changelog for each release. Add an entry to this file + as needed in your PR: document the fix, enhancement, + or (unavoidable) breaking change. + - Keep style fixes to a separate commit to make your PR more readable. + - An informal commit message format is in effect for the project. Please try + and adhere to it. Check `git log` for examples. Here are some common prefixes + along with general guidelines for when to use them: + - **ENH**: Enhancement, new functionality + - **BUG**: Bug fix + - **DOC**: Additions/updates to documentation + - **TST**: Additions/updates to tests + - **BLD**: Updates to the build process/scripts + - **PERF**: Performance improvement + - **CLN**: Code cleanup + - Maintain backward-compatibility. Pandas has lots of users with lots of existing code. Don't break it. + - If you think breakage is required, clearly state why as part of the PR. + - Be careful when changing method signatures. + - Add deprecation warnings where needed. + - Performance matters. Make sure your PR hasn't introduced perf regressions by using `test_perf.sh`. + - Docstrings follow the [numpydoc](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt) format. + - Write tests. + - When writing tests, use 2.6-compatible `self.assertFoo` methods. Some polyfills such as `assertRaises` + can be found in `pandas.util.testing`. + - Do not attach docstrings to tests. Make the test itself readable and use comments if needed. + - Generally, pandas source files should not contain attributions. You can include a "thanks to..." + in the release changelog. The rest is `git blame`/`git log`. + - When you start working on a PR, start by creating a new branch pointing at the latest + commit on GitHub master. + - **Do not** merge upstream into a branch you're going to submit as a PR. + Use `git rebase` against the current GitHub master. + - For extra brownie points, you can squash and reorder the commits in your PR using `git rebase -i`. + Use your own judgment to decide what history needs to be preserved. If git frightens you, that's OK too. + - Use `raise AssertionError` over `assert` unless you want the assertion stripped by `python -O`. + - The pandas copyright policy is detailed in the pandas [LICENSE](https://github.com/pydata/pandas/blob/master/LICENSE). + - On the subject of [PEP8](http://www.python.org/dev/peps/pep-0008/): yes. + - We've written a tool to check that your commits are PEP8 great, + [`pip install pep8radius`](https://github.com/hayd/pep8radius). Look at PEP8 fixes in your branch + vs master with `pep8radius master --diff` and make these changes with + `pep8radius master --diff --in-place`. + - On the subject of a massive PEP8-storm touching everything: not too often (once per release works). + +### Notes on plotting function conventions + +https://groups.google.com/forum/#!topic/pystatsmodels/biNlCvJPNNY/discussion + +#### More developer docs + +* See the [developers](http://pandas.pydata.org/developers.html) page on the + project website for more details.
+* [`pandas` wiki](https://github.com/pydata/pandas/wiki) +* [Tips and tricks](https://github.com/pydata/pandas/wiki/Tips-&-Tricks) +* [Git tips and tricks](https://github.com/pydata/pandas/wiki/Using-Git) +* [Testing advice and best practices in `pandas`](https://github.com/pydata/pandas/wiki/Testing) diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..c9b8834e --- /dev/null +++ b/LICENSE @@ -0,0 +1,87 @@ +======= +License +======= + +pandas is distributed under a 3-clause ("Simplified" or "New") BSD +license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have +BSD-compatible licenses, are included. Their licenses follow the pandas +license. + +pandas license +============== + +Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2008-2011 AQR Capital Management, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +About the Copyright Holders +=========================== + +AQR Capital Management began pandas development in 2008. Development was +led by Wes McKinney. AQR released the source under this license in 2009. +Wes is now an employee of Lambda Foundry, and remains the pandas project +lead. + +The PyData Development Team is the collection of developers of the PyData +project. This includes all of the PyData sub-projects, including pandas. The +core team that coordinates development on GitHub can be found here: +http://github.com/pydata. + +Full credits for pandas contributors can be found in the documentation. + +Our Copyright Policy +==================== + +PyData uses a shared copyright model. Each contributor maintains copyright +over their contributions to PyData. However, it is important to note that +these contributions are typically only changes to the repositories. Thus, +the PyData source code, in its entirety, is not the copyright of any single +person or institution. Instead, it is the collective copyright of the +entire PyData Development Team. 
If individual contributors want to maintain +a record of what changes/contributions they have specific copyright on, +they should indicate their copyright in the commit message of the change +when they commit the change to one of the PyData repositories. + +With this in mind, the following banner should be used in any source code +file to indicate the copyright and license terms: + +#----------------------------------------------------------------------------- +# Copyright (c) 2012, PyData Development Team +# All rights reserved. +# +# Distributed under the terms of the BSD Simplified License. +# +# The full license is in the LICENSE file, distributed with this software. +#----------------------------------------------------------------------------- + +Other licenses can be found in the LICENSES directory. \ No newline at end of file diff --git a/LICENSES/MSGPACK_LICENSE b/LICENSES/MSGPACK_LICENSE new file mode 100644 index 00000000..ae1b0f2f --- /dev/null +++ b/LICENSES/MSGPACK_LICENSE @@ -0,0 +1,13 @@ +Copyright (C) 2008-2011 INADA Naoki + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/LICENSES/MSGPACK_NUMPY_LICENSE b/LICENSES/MSGPACK_NUMPY_LICENSE new file mode 100644 index 00000000..e570011e --- /dev/null +++ b/LICENSES/MSGPACK_NUMPY_LICENSE @@ -0,0 +1,33 @@ +.. -*- rst -*- + +License +======= + +Copyright (c) 2013, Lev Givon. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. +* Neither the name of Lev Givon nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/LICENSES/NUMPY_LICENSE b/LICENSES/NUMPY_LICENSE new file mode 100644 index 00000000..7e972cff --- /dev/null +++ b/LICENSES/NUMPY_LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2005-2011, NumPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/OTHER b/LICENSES/OTHER new file mode 100644 index 00000000..f0550b4e --- /dev/null +++ b/LICENSES/OTHER @@ -0,0 +1,80 @@ +numpydoc license +---------------- + +The numpydoc license is in pandas/doc/sphinxext/LICENSE.txt + +Bottleneck license +------------------ + +Copyright (c) 2010-2012 Archipel Asset Management AB. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +google-api-python-client license +-------------------------------- + +Copyright (C) 2012 Google Inc. +All rights reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Pyperclip v1.3 license +---------------------- + +Copyright (c) 2010, Albert Sweigart +All rights reserved. + +BSD-style license: + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the pyperclip nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/PSF_LICENSE b/LICENSES/PSF_LICENSE new file mode 100644 index 00000000..5cdb01e8 --- /dev/null +++ b/LICENSES/PSF_LICENSE @@ -0,0 +1,279 @@ +A. HISTORY OF THE SOFTWARE +========================== + +Python was created in the early 1990s by Guido van Rossum at Stichting +Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +as a successor of a language called ABC. Guido remains Python's +principal author, although it includes many contributions from others. + +In 1995, Guido continued his work on Python at the Corporation for +National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +in Reston, Virginia where he released several versions of the +software. + +In May 2000, Guido and the Python core development team moved to +BeOpen.com to form the BeOpen PythonLabs team. In October of the same +year, the PythonLabs team moved to Digital Creations (now Zope +Corporation, see http://www.zope.com). In 2001, the Python Software +Foundation (PSF, see http://www.python.org/psf/) was formed, a +non-profit organization created specifically to own Python-related +Intellectual Property. Zope Corporation is a sponsoring member of +the PSF. + +All Python releases are Open Source (see http://www.opensource.org for +the Open Source Definition). Historically, most, but not all, Python +releases have also been GPL-compatible; the table below summarizes +the various releases. 
+ + Release Derived Year Owner GPL- + from compatible? (1) + + 0.9.0 thru 1.2 1991-1995 CWI yes + 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes + 1.6 1.5.2 2000 CNRI no + 2.0 1.6 2000 BeOpen.com no + 1.6.1 1.6 2001 CNRI yes (2) + 2.1 2.0+1.6.1 2001 PSF no + 2.0.1 2.0+1.6.1 2001 PSF yes + 2.1.1 2.1+2.0.1 2001 PSF yes + 2.2 2.1.1 2001 PSF yes + 2.1.2 2.1.1 2002 PSF yes + 2.1.3 2.1.2 2002 PSF yes + 2.2.1 2.2 2002 PSF yes + 2.2.2 2.2.1 2002 PSF yes + 2.2.3 2.2.2 2003 PSF yes + 2.3 2.2.2 2002-2003 PSF yes + 2.3.1 2.3 2002-2003 PSF yes + 2.3.2 2.3.1 2002-2003 PSF yes + 2.3.3 2.3.2 2002-2003 PSF yes + 2.3.4 2.3.3 2004 PSF yes + 2.3.5 2.3.4 2005 PSF yes + 2.4 2.3 2004 PSF yes + 2.4.1 2.4 2005 PSF yes + 2.4.2 2.4.1 2005 PSF yes + 2.4.3 2.4.2 2006 PSF yes + 2.4.4 2.4.3 2006 PSF yes + 2.5 2.4 2006 PSF yes + 2.5.1 2.5 2007 PSF yes + 2.5.2 2.5.1 2008 PSF yes + 2.5.3 2.5.2 2008 PSF yes + 2.6 2.5 2008 PSF yes + 2.6.1 2.6 2008 PSF yes + 2.6.2 2.6.1 2009 PSF yes + 2.6.3 2.6.2 2009 PSF yes + 2.6.4 2.6.3 2009 PSF yes + 2.6.5 2.6.4 2010 PSF yes + 2.7 2.6 2010 PSF yes + +Footnotes: + +(1) GPL-compatible doesn't mean that we're distributing Python under + the GPL. All Python licenses, unlike the GPL, let you distribute + a modified version without making your changes open source. The + GPL-compatible licenses make it possible to combine Python with + other software that is released under the GPL; the others don't. + +(2) According to Richard Stallman, 1.6.1 is not GPL-compatible, + because its license has a choice of law clause. According to + CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1 + is "not incompatible" with the GPL. + +Thanks to the many outside volunteers who have worked under Guido's +direction to make these releases possible. + + +B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON +=============================================================== + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF hereby +grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, +analyze, test, perform and/or display publicly, prepare derivative works, +distribute, and otherwise use Python alone or in any derivative version, +provided, however, that PSF's License Agreement and PSF's notice of copyright, +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 +Python Software Foundation; All Rights Reserved" are retained in Python alone or +in any derivative version prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. 
PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0 +------------------------------------------- + +BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1 + +1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an +office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the +Individual or Organization ("Licensee") accessing and otherwise using +this software in source or binary form and its associated +documentation ("the Software"). + +2. Subject to the terms and conditions of this BeOpen Python License +Agreement, BeOpen hereby grants Licensee a non-exclusive, +royalty-free, world-wide license to reproduce, analyze, test, perform +and/or display publicly, prepare derivative works, distribute, and +otherwise use the Software alone or in any derivative version, +provided, however, that the BeOpen Python License is retained in the +Software, alone or in any derivative version prepared by Licensee. + +3. BeOpen is making the Software available to Licensee on an "AS IS" +basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE +SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS +AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY +DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +5. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +6. This License Agreement shall be governed by and interpreted in all +respects by the law of the State of California, excluding conflict of +law provisions. Nothing in this License Agreement shall be deemed to +create any relationship of agency, partnership, or joint venture +between BeOpen and Licensee. This License Agreement does not grant +permission to use BeOpen trademarks or trade names in a trademark +sense to endorse or promote products or services of Licensee, or any +third party. As an exception, the "BeOpen Python" logos available at +http://www.pythonlabs.com/logos.html may be used according to the +permissions granted on that web page. + +7. By copying, installing or otherwise using the software, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. + + +CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1 +--------------------------------------- + +1. 
This LICENSE AGREEMENT is between the Corporation for National +Research Initiatives, having an office at 1895 Preston White Drive, +Reston, VA 20191 ("CNRI"), and the Individual or Organization +("Licensee") accessing and otherwise using Python 1.6.1 software in +source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, CNRI +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python 1.6.1 +alone or in any derivative version, provided, however, that CNRI's +License Agreement and CNRI's notice of copyright, i.e., "Copyright (c) +1995-2001 Corporation for National Research Initiatives; All Rights +Reserved" are retained in Python 1.6.1 alone or in any derivative +version prepared by Licensee. Alternately, in lieu of CNRI's License +Agreement, Licensee may substitute the following text (omitting the +quotes): "Python 1.6.1 is made available subject to the terms and +conditions in CNRI's License Agreement. This Agreement together with +Python 1.6.1 may be located on the Internet using the following +unique, persistent identifier (known as a handle): 1895.22/1013. This +Agreement may also be obtained from a proxy server on the Internet +using the following URL: http://hdl.handle.net/1895.22/1013". + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python 1.6.1 or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python 1.6.1. + +4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" +basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. This License Agreement shall be governed by the federal +intellectual property law of the United States, including without +limitation the federal copyright law, and, to the extent such +U.S. federal law does not apply, by the law of the Commonwealth of +Virginia, excluding Virginia's conflict of law provisions. +Notwithstanding the foregoing, with regard to derivative works based +on Python 1.6.1 that incorporate non-separable material that was +previously distributed under the GNU General Public License (GPL), the +law of the Commonwealth of Virginia shall govern this License +Agreement only as to issues arising under or with respect to +Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this +License Agreement shall be deemed to create any relationship of +agency, partnership, or joint venture between CNRI and Licensee. This +License Agreement does not grant permission to use CNRI trademarks or +trade name in a trademark sense to endorse or promote products or +services of Licensee, or any third party. + +8. 
By clicking on the "ACCEPT" button where indicated, or by copying, +installing or otherwise using Python 1.6.1, Licensee agrees to be +bound by the terms and conditions of this License Agreement. + + ACCEPT + + +CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2 +-------------------------------------------------- + +Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, +The Netherlands. All rights reserved. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of Stichting Mathematisch +Centrum or CWI not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE +FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/LICENSES/SCIPY_LICENSE b/LICENSES/SCIPY_LICENSE new file mode 100644 index 00000000..d887ce5f --- /dev/null +++ b/LICENSES/SCIPY_LICENSE @@ -0,0 +1,31 @@ +Copyright (c) 2001, 2002 Enthought, Inc. +All rights reserved. + +Copyright (c) 2003-2012 SciPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of Enthought nor the names of the SciPy Developers + may be used to endorse or promote products derived from this software + without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. 
+ diff --git a/LICENSES/SIX b/LICENSES/SIX new file mode 100644 index 00000000..6fd669af --- /dev/null +++ b/LICENSES/SIX @@ -0,0 +1,21 @@ +six license (substantial portions used in the python 3 compatibility module) +=========================================================================== +Copyright (c) 2010-2013 Benjamin Peterson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +# +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +# +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE new file mode 100644 index 00000000..defca46e --- /dev/null +++ b/LICENSES/ULTRAJSON_LICENSE @@ -0,0 +1,34 @@ +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + * Copyright (c) 1988-1993 The Regents of the University of California. 
+ * Copyright (c) 1994 Sun Microsystems, Inc. \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..5bf02ad8 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,24 @@ +include MANIFEST.in +include LICENSE +include RELEASE.md +include README.rst +include setup.py + +graft doc +prune doc/build + +graft examples +graft pandas + +global-exclude *.so +global-exclude *.pyd +global-exclude *.pyc +global-exclude .git* +global-exclude .DS_Store +global-exclude *.png + +# include examples/data/* +# recursive-include examples *.py +# recursive-include doc/source * +# recursive-include doc/sphinxext * +# recursive-include LICENSES * diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..9a768932 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +tseries: pandas/lib.pyx pandas/tslib.pyx pandas/hashtable.pyx + python setup.py build_ext --inplace + +.PHONY : develop build clean clean_pyc tseries doc + +clean: + -python setup.py clean + +clean_pyc: + -find . -name '*.py[co]' -exec rm {} \; + +sparse: pandas/src/sparse.pyx + python setup.py build_ext --inplace + +build: clean_pyc + python setup.py build_ext --inplace + +develop: build + -python setup.py develop + +doc: + -rm -rf doc/build doc/source/generated + cd doc; \ + python make.py clean; \ + python make.py html diff --git a/README.md b/README.md new file mode 100644 index 00000000..79a84440 --- /dev/null +++ b/README.md @@ -0,0 +1,224 @@ +# pandas: powerful Python data analysis toolkit + +![Travis-CI Build Status](https://travis-ci.org/pydata/pandas.svg) + +[![Scatter-CI Status page](http://scatterci.github.io/scatterci48.jpg)](http://scatterci.github.io/pydata/pandas) + +## What is it + +**pandas** is a Python package providing fast, flexible, and expressive data +structures designed to make working with "relational" or "labeled" data both +easy and intuitive. It aims to be the fundamental high-level building block for +doing practical, **real world** data analysis in Python. Additionally, it has +the broader goal of becoming **the most powerful and flexible open source data +analysis / manipulation tool available in any language**. It is already well on +its way toward this goal. + +## Main Features +Here are just a few of the things that pandas does well: + + - Easy handling of [**missing data**][missing-data] (represented as + `NaN`) in floating point as well as non-floating point data + - Size mutability: columns can be [**inserted and + deleted**][insertion-deletion] from DataFrame and higher dimensional + objects + - Automatic and explicit [**data alignment**][alignment]: objects can + be explicitly aligned to a set of labels, or the user can simply + ignore the labels and let `Series`, `DataFrame`, etc. 
automatically + align the data for you in computations + - Powerful, flexible [**group by**][groupby] functionality to perform + split-apply-combine operations on data sets, for both aggregating + and transforming data + - Make it [**easy to convert**][conversion] ragged, + differently-indexed data in other Python and NumPy data structures + into DataFrame objects + - Intelligent label-based [**slicing**][slicing], [**fancy + indexing**][fancy-indexing], and [**subsetting**][subsetting] of + large data sets + - Intuitive [**merging**][merging] and [**joining**][joining] data + sets + - Flexible [**reshaping**][reshape] and [**pivoting**][pivot-table] of + data sets + - [**Hierarchical**][mi] labeling of axes (possible to have multiple + labels per tick) + - Robust IO tools for loading data from [**flat files**][flat-files] + (CSV and delimited), [**Excel files**][excel], [**databases**][db], + and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] + - [**Time series**][timeseries]-specific functionality: date range + generation and frequency conversion, moving window statistics, + moving window linear regressions, date shifting and lagging, etc. + + + [missing-data]: http://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data + [insertion-deletion]: http://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion + [alignment]: http://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures + [groupby]: http://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine + [conversion]: http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe + [slicing]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges + [fancy-indexing]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix + [subsetting]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing + [merging]: http://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging + [joining]: http://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index + [reshape]: http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables + [pivot-table]: http://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations + [mi]: http://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex + [flat-files]: http://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files + [excel]: http://pandas.pydata.org/pandas-docs/stable/io.html#excel-files + [db]: http://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries + [hdfstore]: http://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables + [timeseries]: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality + +## Where to get it +The source code is currently hosted on GitHub at: +http://github.com/pydata/pandas + +Binary installers for the latest released version are available at the Python +package index + + http://pypi.python.org/pypi/pandas/ + +And via `easy_install`: + +```sh +easy_install pandas +``` + +or `pip`: + +```sh +pip install pandas +``` + +## Dependencies +- [NumPy](http://www.numpy.org): 1.6.1 or higher +- [python-dateutil](http://labix.org/python-dateutil): 1.5 or higher +- [pytz](http://pytz.sourceforge.net) + - Needed for time zone support with ``pandas.date_range`` + +### Highly Recommended Dependencies +- 
[numexpr](http://code.google.com/p/numexpr/) + - Needed to accelerate some expression evaluation operations + - Required by PyTables +- [bottleneck](http://berkeleyanalytics.com/bottleneck) + - Needed to accelerate certain numerical operations + +### Optional dependencies +- [Cython](http://www.cython.org): Only necessary to build development version. Version 0.17.1 or higher. +- [SciPy](http://www.scipy.org): miscellaneous statistical functions +- [PyTables](http://www.pytables.org): necessary for HDF5-based storage +- [SQLAlchemy](http://www.sqlalchemy.org): for SQL database support. Version 0.8.1 or higher recommended. +- [matplotlib](http://matplotlib.sourceforge.net/): for plotting +- [statsmodels](http://statsmodels.sourceforge.net/) + - Needed for parts of `pandas.stats` +- For Excel I/O: + - [xlrd/xlwt](http://www.python-excel.org/) + - Excel reading (xlrd) and writing (xlwt) + - [openpyxl](http://packages.python.org/openpyxl/) + - openpyxl version 1.6.1 or higher, but lower than 2.0.0, for + writing .xlsx files + - xlrd >= 0.9.0 + - [XlsxWriter](https://pypi.python.org/pypi/XlsxWriter) + - Alternative Excel writer. +- [Google bq Command Line Tool](https://developers.google.com/bigquery/bq-command-line-tool/) + - Needed for `pandas.io.gbq` +- [boto](https://pypi.python.org/pypi/boto): necessary for Amazon S3 access. +- One of the following combinations of libraries is needed to use the + top-level [`pandas.read_html`][read-html-docs] function: + - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] (Any + recent version of [html5lib][html5lib] is okay.) + - [BeautifulSoup4][BeautifulSoup4] and [lxml][lxml] + - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] and [lxml][lxml] + - Only [lxml][lxml], although see [HTML reading gotchas][html-gotchas] + for reasons as to why you should probably **not** take this approach. + +#### Notes about HTML parsing libraries +- If you install [BeautifulSoup4][BeautifulSoup4] you must install + either [lxml][lxml] or [html5lib][html5lib] or both. + `pandas.read_html` will **not** work with *only* `BeautifulSoup4` + installed. +- You are strongly encouraged to read [HTML reading + gotchas][html-gotchas]. It explains issues surrounding the + installation and usage of the above three libraries. +- You may need to install an older version of + [BeautifulSoup4][BeautifulSoup4]: + - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and + 32-bit Ubuntu/Debian +- Additionally, if you're using [Anaconda][Anaconda] you should + definitely read [the gotchas about HTML parsing][html-gotchas] + libraries +- If you're on a system with `apt-get` you can do + + ```sh + sudo apt-get build-dep python-lxml + ``` + + to get the necessary dependencies for installation of [lxml][lxml]. + This will prevent further headaches down the line. + + [html5lib]: https://github.com/html5lib/html5lib-python "html5lib" + [BeautifulSoup4]: http://www.crummy.com/software/BeautifulSoup "BeautifulSoup4" + [lxml]: http://lxml.de + [Anaconda]: https://store.continuum.io/cshop/anaconda + [NumPy]: http://numpy.scipy.org/ + [html-gotchas]: http://pandas.pydata.org/pandas-docs/stable/gotchas.html#html-table-parsing + [read-html-docs]: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.html.read_html.html#pandas.io.html.read_html + +## Installation from sources +To install pandas from source you need Cython in addition to the normal +dependencies above. 
Cython can be installed from pypi: + +```sh +pip install cython +``` + +In the `pandas` directory (same one where you found this file after +cloning the git repo), execute: + +```sh +python setup.py install +``` + +or for installing in [development mode](http://www.pip-installer.org/en/latest/usage.html): + +```sh +python setup.py develop +``` + +Alternatively, you can use `pip` if you want all the dependencies pulled +in automatically (the `-e` option is for installing it in [development +mode](http://www.pip-installer.org/en/latest/usage.html)): + +```sh +pip install -e . +``` + +On Windows, you will need to install MinGW and execute: + +```sh +python setup.py build --compiler=mingw32 +python setup.py install +``` + +See http://pandas.pydata.org/ for more information. + +## License +BSD + +## Documentation +The official documentation is hosted on PyData.org: http://pandas.pydata.org/ + +The Sphinx documentation should provide a good starting point for learning how +to use the library. Expect the docs to continue to expand as time goes on. + +## Background +Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and +has been under active development since then. + +## Discussion and Development +Since pandas development is related to a number of other scientific +Python projects, questions are welcome on the scipy-user mailing +list. Specialized discussions or design issues should take place on +the pystatsmodels mailing list / Google group, where +``scikits.statsmodels`` and other libraries will also be discussed: + +http://groups.google.com/group/pystatsmodels diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 00000000..b1e2aadf --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,6 @@ +Release Notes +============= + +The list of changes to pandas between each release can be found +[here](http://pandas.pydata.org/pandas-docs/dev/release.html). For full +details, see the commit logs at http://github.com/pydata/pandas. 
diff --git a/bench/alignment.py b/bench/alignment.py new file mode 100644 index 00000000..bc3134f5 --- /dev/null +++ b/bench/alignment.py @@ -0,0 +1,22 @@ +# Setup +from pandas.compat import range, lrange +import numpy as np +import pandas +import la +N = 1000 +K = 50 +arr1 = np.random.randn(N, K) +arr2 = np.random.randn(N, K) +idx1 = lrange(N) +idx2 = lrange(K) + +# pandas +dma1 = pandas.DataFrame(arr1, idx1, idx2) +dma2 = pandas.DataFrame(arr2, idx1[::-1], idx2[::-1]) + +# larry +lar1 = la.larry(arr1, [idx1, idx2]) +lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]]) + +for i in range(100): + result = lar1 + lar2 diff --git a/bench/bench_dense_to_sparse.py b/bench/bench_dense_to_sparse.py new file mode 100644 index 00000000..e1dcd345 --- /dev/null +++ b/bench/bench_dense_to_sparse.py @@ -0,0 +1,14 @@ +from pandas import * + +K = 100 +N = 100000 +rng = DatetimeIndex('1/1/2000', periods=N, offset=datetools.Minute()) + +rng2 = np.asarray(rng).astype('M8[us]').astype('i8') + +series = {} +for i in range(1, K + 1): + data = np.random.randn(N)[:-i] + this_rng = rng2[:-i] + data[100:] = np.nan + series[i] = SparseSeries(data, index=this_rng) diff --git a/bench/bench_get_put_value.py b/bench/bench_get_put_value.py new file mode 100644 index 00000000..427e0b1b --- /dev/null +++ b/bench/bench_get_put_value.py @@ -0,0 +1,56 @@ +from pandas import * +from pandas.util.testing import rands +from pandas.compat import range + +N = 1000 +K = 50 + + +def _random_index(howmany): + return Index([rands(10) for _ in range(howmany)]) + +df = DataFrame(np.random.randn(N, K), index=_random_index(N), + columns=_random_index(K)) + + +def get1(): + for col in df.columns: + for row in df.index: + _ = df[col][row] + + +def get2(): + for col in df.columns: + for row in df.index: + _ = df.get_value(row, col) + + +def put1(): + for col in df.columns: + for row in df.index: + df[col][row] = 0 + + +def put2(): + for col in df.columns: + for row in df.index: + df.set_value(row, col, 0) + + +def resize1(): + buf = DataFrame() + for col in df.columns: + for row in df.index: + buf = buf.set_value(row, col, 5.) + return buf + + +def resize2(): + from collections import defaultdict + + buf = defaultdict(dict) + for col in df.columns: + for row in df.index: + buf[col][row] = 5. 
+ + return DataFrame(buf) diff --git a/bench/bench_groupby.py b/bench/bench_groupby.py new file mode 100644 index 00000000..a86e8ed6 --- /dev/null +++ b/bench/bench_groupby.py @@ -0,0 +1,65 @@ +from pandas import * +from pandas.util.testing import rands +from pandas.compat import range + +import string +import random + +k = 20000 +n = 10 + +foo = np.tile(np.array([rands(10) for _ in range(k)], dtype='O'), n) +foo2 = list(foo) +random.shuffle(foo) +random.shuffle(foo2) + +df = DataFrame({'A': foo, + 'B': foo2, + 'C': np.random.randn(n * k)}) + +import pandas._sandbox as sbx + + +def f(): + table = sbx.StringHashTable(len(df)) + ret = table.factorize(df['A']) + return ret + + +def g(): + table = sbx.PyObjectHashTable(len(df)) + ret = table.factorize(df['A']) + return ret + +ret = f() + +""" +import pandas._tseries as lib + +f = np.std + + +grouped = df.groupby(['A', 'B']) + +label_list = [ping.labels for ping in grouped.groupings] +shape = [len(ping.ids) for ping in grouped.groupings] + +from pandas.core.groupby import get_group_index + + +group_index = get_group_index(label_list, shape).astype('i4') + +ngroups = np.prod(shape) + +indexer = lib.groupsort_indexer(group_index, ngroups) + +values = df['C'].values.take(indexer) +group_index = group_index.take(indexer) + +f = lambda x: x.std(ddof=1) + +grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups) +result = grouper.get_result() + +expected = grouped.std() +""" diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py new file mode 100644 index 00000000..f3c3f8ba --- /dev/null +++ b/bench/bench_join_panel.py @@ -0,0 +1,85 @@ +# reasonably efficient + + +def create_panels_append(cls, panels): + """ return an append list of panels """ + panels = [a for a in panels if a is not None] + # corner cases + if len(panels) == 0: + return None + elif len(panels) == 1: + return panels[0] + elif len(panels) == 2 and panels[0] == panels[1]: + return panels[0] + # import pdb; pdb.set_trace() + # create a joint index for the axis + + def joint_index_for_axis(panels, axis): + s = set() + for p in panels: + s.update(list(getattr(p, axis))) + return sorted(list(s)) + + def reindex_on_axis(panels, axis, axis_reindex): + new_axis = joint_index_for_axis(panels, axis) + new_panels = [p.reindex(**{axis_reindex: new_axis, + 'copy': False}) for p in panels] + return new_panels, new_axis + # create the joint major index, dont' reindex the sub-panels - we are + # appending + major = joint_index_for_axis(panels, 'major_axis') + # reindex on minor axis + panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor') + # reindex on items + panels, items = reindex_on_axis(panels, 'items', 'items') + # concatenate values + try: + values = np.concatenate([p.values for p in panels], axis=1) + except Exception as detail: + raise Exception("cannot append values that dont' match dimensions! 
-> [%s] %s" + % (','.join(["%s" % p for p in panels]), str(detail))) + # pm('append - create_panel') + p = Panel(values, items=items, major_axis=major, + minor_axis=minor) + # pm('append - done') + return p + + +# does the job but inefficient (better to handle like you read a table in +# pytables...e.g create a LongPanel then convert to Wide) +def create_panels_join(cls, panels): + """ given an array of panels's, create a single panel """ + panels = [a for a in panels if a is not None] + # corner cases + if len(panels) == 0: + return None + elif len(panels) == 1: + return panels[0] + elif len(panels) == 2 and panels[0] == panels[1]: + return panels[0] + d = dict() + minor, major, items = set(), set(), set() + for panel in panels: + items.update(panel.items) + major.update(panel.major_axis) + minor.update(panel.minor_axis) + values = panel.values + for item, item_index in panel.items.indexMap.items(): + for minor_i, minor_index in panel.minor_axis.indexMap.items(): + for major_i, major_index in panel.major_axis.indexMap.items(): + try: + d[(minor_i, major_i, item)] = values[item_index, major_index, minor_index] + except: + pass + # stack the values + minor = sorted(list(minor)) + major = sorted(list(major)) + items = sorted(list(items)) + # create the 3d stack (items x columns x indicies) + data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan) + for item in items]) + for major_i in major]).transpose() + for minor_i in minor]) + # construct the panel + return Panel(data, items, major, minor) +add_class_method(Panel, create_panels_join, 'join_many') diff --git a/bench/bench_khash_dict.py b/bench/bench_khash_dict.py new file mode 100644 index 00000000..054fc361 --- /dev/null +++ b/bench/bench_khash_dict.py @@ -0,0 +1,89 @@ +""" +Some comparisons of khash.h to Python dict +""" +from __future__ import print_function + +import numpy as np +import os + +from vbench.api import Benchmark +from pandas.util.testing import rands +from pandas.compat import range +import pandas._tseries as lib +import pandas._sandbox as sbx +import time + +import psutil + +pid = os.getpid() +proc = psutil.Process(pid) + + +def object_test_data(n): + pass + + +def string_test_data(n): + return np.array([rands(10) for _ in range(n)], dtype='O') + + +def int_test_data(n): + return np.arange(n, dtype='i8') + +N = 1000000 + +#---------------------------------------------------------------------- +# Benchmark 1: map_locations + + +def map_locations_python_object(): + arr = string_test_data(N) + return _timeit(lambda: lib.map_indices_object(arr)) + + +def map_locations_khash_object(): + arr = string_test_data(N) + + def f(): + table = sbx.PyObjectHashTable(len(arr)) + table.map_locations(arr) + return _timeit(f) + + +def _timeit(f, iterations=10): + start = time.time() + for _ in range(iterations): + foo = f() + elapsed = time.time() - start + return elapsed + +#---------------------------------------------------------------------- +# Benchmark 2: lookup_locations + + +def lookup_python(values): + table = lib.map_indices_object(values) + return _timeit(lambda: lib.merge_indexer_object(values, table)) + + +def lookup_khash(values): + table = sbx.PyObjectHashTable(len(values)) + table.map_locations(values) + locs = table.lookup_locations(values) + # elapsed = _timeit(lambda: table.lookup_locations2(values)) + return table + + +def leak(values): + for _ in range(100): + print(proc.get_memory_info()) + table = lookup_khash(values) + # table.destroy() + +arr = string_test_data(N) + 
+#---------------------------------------------------------------------- +# Benchmark 3: unique + +#---------------------------------------------------------------------- +# Benchmark 4: factorize diff --git a/bench/bench_merge.R b/bench/bench_merge.R new file mode 100644 index 00000000..3ed46184 --- /dev/null +++ b/bench/bench_merge.R @@ -0,0 +1,161 @@ +library(plyr) +library(data.table) +N <- 10000 +indices = rep(NA, N) +indices2 = rep(NA, N) +for (i in 1:N) { + indices[i] <- paste(sample(letters, 10), collapse="") + indices2[i] <- paste(sample(letters, 10), collapse="") +} +left <- data.frame(key=rep(indices[1:8000], 10), + key2=rep(indices2[1:8000], 10), + value=rnorm(80000)) +right <- data.frame(key=indices[2001:10000], + key2=indices2[2001:10000], + value2=rnorm(8000)) + +right2 <- data.frame(key=rep(right$key, 2), + key2=rep(right$key2, 2), + value2=rnorm(16000)) + +left.dt <- data.table(left, key=c("key", "key2")) +right.dt <- data.table(right, key=c("key", "key2")) +right2.dt <- data.table(right2, key=c("key", "key2")) + +# left.dt2 <- data.table(left) +# right.dt2 <- data.table(right) + +## left <- data.frame(key=rep(indices[1:1000], 10), +## key2=rep(indices2[1:1000], 10), +## value=rnorm(100000)) +## right <- data.frame(key=indices[1:1000], +## key2=indices2[1:1000], +## value2=rnorm(10000)) + +timeit <- function(func, niter=10) { + timing = rep(NA, niter) + for (i in 1:niter) { + gc() + timing[i] <- system.time(func())[3] + } + mean(timing) +} + +left.join <- function(sort=FALSE) { + result <- base::merge(left, right, all.x=TRUE, sort=sort) +} + +right.join <- function(sort=FALSE) { + result <- base::merge(left, right, all.y=TRUE, sort=sort) +} + +outer.join <- function(sort=FALSE) { + result <- base::merge(left, right, all=TRUE, sort=sort) +} + +inner.join <- function(sort=FALSE) { + result <- base::merge(left, right, all=FALSE, sort=sort) +} + +left.join.dt <- function(sort=FALSE) { + result <- right.dt[left.dt] +} + +right.join.dt <- function(sort=FALSE) { + result <- left.dt[right.dt] +} + +outer.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right.dt, all=TRUE, sort=sort) +} + +inner.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right.dt, all=FALSE, sort=sort) +} + +plyr.join <- function(type) { + result <- plyr::join(left, right, by=c("key", "key2"), + type=type, match="first") +} + +sort.options <- c(FALSE, TRUE) + +# many-to-one + +results <- matrix(nrow=4, ncol=3) +colnames(results) <- c("base::merge", "plyr", "data.table") +rownames(results) <- c("inner", "outer", "left", "right") + +base.functions <- c(inner.join, outer.join, left.join, right.join) +plyr.functions <- c(function() plyr.join("inner"), + function() plyr.join("full"), + function() plyr.join("left"), + function() plyr.join("right")) +dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) +for (i in 1:4) { + base.func <- base.functions[[i]] + plyr.func <- plyr.functions[[i]] + dt.func <- dt.functions[[i]] + results[i, 1] <- timeit(base.func) + results[i, 2] <- timeit(plyr.func) + results[i, 3] <- timeit(dt.func) +} + + +# many-to-many + +left.join <- function(sort=FALSE) { + result <- base::merge(left, right2, all.x=TRUE, sort=sort) +} + +right.join <- function(sort=FALSE) { + result <- base::merge(left, right2, all.y=TRUE, sort=sort) +} + +outer.join <- function(sort=FALSE) { + result <- base::merge(left, right2, all=TRUE, sort=sort) +} + +inner.join <- function(sort=FALSE) { + result <- base::merge(left, right2, all=FALSE, sort=sort) +} + +left.join.dt 
<- function(sort=FALSE) { + result <- right2.dt[left.dt] +} + +right.join.dt <- function(sort=FALSE) { + result <- left.dt[right2.dt] +} + +outer.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right2.dt, all=TRUE, sort=sort) +} + +inner.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right2.dt, all=FALSE, sort=sort) +} + +sort.options <- c(FALSE, TRUE) + +# many-to-one + +results <- matrix(nrow=4, ncol=3) +colnames(results) <- c("base::merge", "plyr", "data.table") +rownames(results) <- c("inner", "outer", "left", "right") + +base.functions <- c(inner.join, outer.join, left.join, right.join) +plyr.functions <- c(function() plyr.join("inner"), + function() plyr.join("full"), + function() plyr.join("left"), + function() plyr.join("right")) +dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) +for (i in 1:4) { + base.func <- base.functions[[i]] + plyr.func <- plyr.functions[[i]] + dt.func <- dt.functions[[i]] + results[i, 1] <- timeit(base.func) + results[i, 2] <- timeit(plyr.func) + results[i, 3] <- timeit(dt.func) +} + diff --git a/bench/bench_merge.py b/bench/bench_merge.py new file mode 100644 index 00000000..330dba7b --- /dev/null +++ b/bench/bench_merge.py @@ -0,0 +1,105 @@ +import random +import gc +import time +from pandas import * +from pandas.compat import range, lrange, StringIO +from pandas.util.testing import rands + +N = 10000 +ngroups = 10 + + +def get_test_data(ngroups=100, n=N): + unique_groups = lrange(ngroups) + arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], + dtype=object) + + random.shuffle(arr) + return arr + +# aggregate multiple columns +# df = DataFrame({'key1' : get_test_data(ngroups=ngroups), +# 'key2' : get_test_data(ngroups=ngroups), +# 'data1' : np.random.randn(N), +# 'data2' : np.random.randn(N)}) + +# df2 = DataFrame({'key1' : get_test_data(ngroups=ngroups, n=N//10), +# 'key2' : get_test_data(ngroups=ngroups//2, n=N//10), +# 'value' : np.random.randn(N // 10)}) +# result = merge.merge(df, df2, on='key2') + +N = 10000 + +indices = np.array([rands(10) for _ in range(N)], dtype='O') +indices2 = np.array([rands(10) for _ in range(N)], dtype='O') +key = np.tile(indices[:8000], 10) +key2 = np.tile(indices2[:8000], 10) + +left = DataFrame({'key': key, 'key2': key2, + 'value': np.random.randn(80000)}) +right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], + 'value2': np.random.randn(8000)}) + +right2 = right.append(right, ignore_index=True) + + +join_methods = ['inner', 'outer', 'left', 'right'] +results = DataFrame(index=join_methods, columns=[False, True]) +niter = 10 +for sort in [False, True]: + for join_method in join_methods: + f = lambda: merge(left, right, how=join_method, sort=sort) + gc.disable() + start = time.time() + for _ in range(niter): + f() + elapsed = (time.time() - start) / niter + gc.enable() + results[sort][join_method] = elapsed +# results.columns = ['pandas'] +results.columns = ['dont_sort', 'sort'] + + +# R results +# many to one +r_results = read_table(StringIO(""" base::merge plyr data.table +inner 0.2475 0.1183 0.1100 +outer 0.4213 0.1916 0.2090 +left 0.2998 0.1188 0.0572 +right 0.3102 0.0536 0.0376 +"""), sep='\s+') + +presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'}) +all_results = presults.join(r_results) + +all_results = all_results.div(all_results['pandas'], axis=0) + +all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', + 
'base::merge']] + +sort_results = DataFrame.from_items([('pandas', results['sort']), + ('R', r_results['base::merge'])]) +sort_results['Ratio'] = sort_results['R'] / sort_results['pandas'] + + +nosort_results = DataFrame.from_items([('pandas', results['dont_sort']), + ('R', r_results['base::merge'])]) +nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas'] + +# many to many + +# many to one +r_results = read_table(StringIO("""base::merge plyr data.table +inner 0.4610 0.1276 0.1269 +outer 0.9195 0.1881 0.2725 +left 0.6559 0.1257 0.0678 +right 0.6425 0.0522 0.0428 +"""), sep='\s+') + +all_results = presults.join(r_results) +all_results = all_results.div(all_results['pandas'], axis=0) +all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', + 'base::merge']] diff --git a/bench/bench_merge_sqlite.py b/bench/bench_merge_sqlite.py new file mode 100644 index 00000000..3ad4b810 --- /dev/null +++ b/bench/bench_merge_sqlite.py @@ -0,0 +1,87 @@ +import numpy as np +from collections import defaultdict +import gc +import time +from pandas import DataFrame +from pandas.util.testing import rands +from pandas.compat import range, zip +import random + +N = 10000 + +indices = np.array([rands(10) for _ in range(N)], dtype='O') +indices2 = np.array([rands(10) for _ in range(N)], dtype='O') +key = np.tile(indices[:8000], 10) +key2 = np.tile(indices2[:8000], 10) + +left = DataFrame({'key': key, 'key2': key2, + 'value': np.random.randn(80000)}) +right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], + 'value2': np.random.randn(8000)}) + +# right2 = right.append(right, ignore_index=True) +# right = right2 + +# random.shuffle(key2) +# indices2 = indices.copy() +# random.shuffle(indices2) + +# Prepare Database +import sqlite3 +create_sql_indexes = True + +conn = sqlite3.connect(':memory:') +conn.execute( + 'create table left( key varchar(10), key2 varchar(10), value int);') +conn.execute( + 'create table right( key varchar(10), key2 varchar(10), value2 int);') +conn.executemany('insert into left values (?, ?, ?)', + zip(key, key2, left['value'])) +conn.executemany('insert into right values (?, ?, ?)', + zip(right['key'], right['key2'], right['value2'])) + +# Create Indices +if create_sql_indexes: + conn.execute('create index left_ix on left(key, key2)') + conn.execute('create index right_ix on right(key, key2)') + + +join_methods = ['inner', 'left outer', 'left'] # others not supported +sql_results = DataFrame(index=join_methods, columns=[False]) +niter = 5 +for sort in [False]: + for join_method in join_methods: + sql = """CREATE TABLE test as select * + from left + %s join right + on left.key=right.key + and left.key2 = right.key2;""" % join_method + sql = """select * + from left + %s join right + on left.key=right.key + and left.key2 = right.key2;""" % join_method + + if sort: + sql = '%s order by key, key2' % sql + f = lambda: list(conn.execute(sql)) # list fetches results + g = lambda: conn.execute(sql) # list fetches results + gc.disable() + start = time.time() + # for _ in range(niter): + g() + elapsed = (time.time() - start) / niter + gc.enable() + + cur = conn.execute("DROP TABLE test") + conn.commit() + + sql_results[sort][join_method] = elapsed + sql_results.columns = ['sqlite3'] # ['dont_sort', 'sort'] + sql_results.index = ['inner', 'outer', 'left'] + + sql = """select * + from left + inner join right + on left.key=right.key + and left.key2 = right.key2;""" diff --git a/bench/bench_pivot.R b/bench/bench_pivot.R new file mode 100644 index 00000000..06dc6a10 --- 
/dev/null +++ b/bench/bench_pivot.R @@ -0,0 +1,27 @@ +library(reshape2) + + +n <- 100000 +a.size <- 5 +b.size <- 5 + +data <- data.frame(a=sample(letters[1:a.size], n, replace=T), + b=sample(letters[1:b.size], n, replace=T), + c=rnorm(n), + d=rnorm(n)) + +timings <- numeric() + +# acast(melt(data, id=c("a", "b")), a ~ b, mean) +# acast(melt(data, id=c("a", "b")), a + b ~ variable, mean) + +for (i in 1:10) { + gc() + tim <- system.time(acast(melt(data, id=c("a", "b")), a ~ b, mean, + subset=.(variable=="c"))) + timings[i] = tim[3] +} + +mean(timings) + +acast(melt(data, id=c("a", "b")), a ~ b, mean, subset=.(variable="c")) diff --git a/bench/bench_pivot.py b/bench/bench_pivot.py new file mode 100644 index 00000000..007bd0aa --- /dev/null +++ b/bench/bench_pivot.py @@ -0,0 +1,16 @@ +from pandas import * +import string + + +n = 100000 +asize = 5 +bsize = 5 + +letters = np.asarray(list(string.letters), dtype=object) + +data = DataFrame(dict(foo=letters[:asize][np.random.randint(0, asize, n)], + bar=letters[:bsize][np.random.randint(0, bsize, n)], + baz=np.random.randn(n), + qux=np.random.randn(n))) + +table = pivot_table(data, xby=['foo', 'bar']) diff --git a/bench/bench_sparse.py b/bench/bench_sparse.py new file mode 100644 index 00000000..7dc2db05 --- /dev/null +++ b/bench/bench_sparse.py @@ -0,0 +1,93 @@ +import sys +import numpy as np + +from pandas import * +import pandas.core.sparse as spm +import pandas.compat as compat +reload(spm) +from pandas.core.sparse import * + +N = 10000. + +arr1 = np.arange(N) +index = Index(np.arange(N)) + +off = N // 10 +arr1[off: 2 * off] = np.NaN +arr1[4 * off: 5 * off] = np.NaN +arr1[8 * off: 9 * off] = np.NaN + +arr2 = np.arange(N) +arr2[3 * off // 2: 2 * off + off // 2] = np.NaN +arr2[8 * off + off // 2: 9 * off + off // 2] = np.NaN + +s1 = SparseSeries(arr1, index=index) +s2 = SparseSeries(arr2, index=index) + +is1 = SparseSeries(arr1, kind='integer', index=index) +is2 = SparseSeries(arr2, kind='integer', index=index) + +s1_dense = s1.to_dense() +s2_dense = s2.to_dense() + +if 'linux' in sys.platform: + pth = '/home/wesm/code/pandas/example' +else: + pth = '/Users/wesm/code/pandas/example' + +dm = DataFrame.load(pth) + +sdf = dm.to_sparse() + + +def new_data_like(sdf): + new_data = {} + for col, series in compat.iteritems(sdf): + new_data[col] = SparseSeries(np.random.randn(len(series.sp_values)), + index=sdf.index, + sparse_index=series.sp_index, + fill_value=series.fill_value) + + return SparseDataFrame(new_data) + +# data = {} +# for col, ser in dm.iteritems(): +# data[col] = SparseSeries(ser) + +dwp = Panel.fromDict({'foo': dm}) +# sdf = SparseDataFrame(data) + + +lp = stack_sparse_frame(sdf) + + +swp = SparsePanel({'A': sdf}) +swp = SparsePanel({'A': sdf, + 'B': sdf, + 'C': sdf, + 'D': sdf}) + +y = sdf +x = SparsePanel({'x1': sdf + new_data_like(sdf) / 10, + 'x2': sdf + new_data_like(sdf) / 10}) + +dense_y = sdf +dense_x = x.to_dense() + +# import hotshot, hotshot.stats +# prof = hotshot.Profile('test.prof') + +# benchtime, stones = prof.runcall(ols, y=y, x=x) + +# prof.close() + +# stats = hotshot.stats.load('test.prof') + +dense_model = ols(y=dense_y, x=dense_x) + +import pandas.stats.plm as plm +import pandas.stats.interface as face +reload(plm) +reload(face) + +# model = face.ols(y=y, x=x) diff --git a/bench/bench_take_indexing.py b/bench/bench_take_indexing.py new file mode 100644 index 00000000..5fb584bc --- /dev/null +++ b/bench/bench_take_indexing.py @@ -0,0 +1,55 @@ +from __future__ import print_function +import numpy as np + +from pandas 
import * +import pandas._tseries as lib + +from pandas import DataFrame +import timeit +from pandas.compat import zip + +setup = """ +from pandas import Series +import pandas._tseries as lib +import random +import numpy as np + +import random +n = %d +k = %d +arr = np.random.randn(n, k) +indexer = np.arange(n, dtype=np.int32) +indexer = indexer[::-1] +""" + +sizes = [100, 1000, 10000, 100000] +iters = [1000, 1000, 100, 1] + +fancy_2d = [] +take_2d = [] +cython_2d = [] + +n = 1000 + + +def _timeit(stmt, size, k=5, iters=1000): + timer = timeit.Timer(stmt=stmt, setup=setup % (sz, k)) + return timer.timeit(n) / n + +for sz, its in zip(sizes, iters): + print(sz) + fancy_2d.append(_timeit('arr[indexer]', sz, iters=its)) + take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its)) + cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its)) + +df = DataFrame({'fancy': fancy_2d, + 'take': take_2d, + 'cython': cython_2d}) + +print(df) + +from pandas.rpy.common import r +r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)') +r('set.seed(12345') +r('indexer <- sample(1:10000)') +r('mat[indexer,]') diff --git a/bench/bench_unique.py b/bench/bench_unique.py new file mode 100644 index 00000000..87bd2f2d --- /dev/null +++ b/bench/bench_unique.py @@ -0,0 +1,278 @@ +from __future__ import print_function +from pandas import * +from pandas.util.testing import rands +from pandas.compat import range, zip +import pandas._tseries as lib +import numpy as np +import matplotlib.pyplot as plt + +N = 50000 +K = 10000 + +groups = np.array([rands(10) for _ in range(K)], dtype='O') +groups2 = np.array([rands(10) for _ in range(K)], dtype='O') + +labels = np.tile(groups, N // K) +labels2 = np.tile(groups2, N // K) +data = np.random.randn(N) + + +def timeit(f, niter): + import gc + import time + gc.disable() + start = time.time() + for _ in range(niter): + f() + elapsed = (time.time() - start) / niter + gc.enable() + return elapsed + + +def algo1(): + unique_labels = np.unique(labels) + result = np.empty(len(unique_labels)) + for i, label in enumerate(unique_labels): + result[i] = data[labels == label].sum() + + +def algo2(): + unique_labels = np.unique(labels) + indices = lib.groupby_indices(labels) + result = np.empty(len(unique_labels)) + + for i, label in enumerate(unique_labels): + result[i] = data.take(indices[label]).sum() + + +def algo3_nosort(): + rizer = lib.DictFactorizer() + labs, counts = rizer.factorize(labels, sort=False) + k = len(rizer.uniques) + out = np.empty(k) + lib.group_add(out, counts, data, labs) + + +def algo3_sort(): + rizer = lib.DictFactorizer() + labs, counts = rizer.factorize(labels, sort=True) + k = len(rizer.uniques) + out = np.empty(k) + lib.group_add(out, counts, data, labs) + +import numpy as np +import random + + +# dict to hold results +counts = {} + +# a hack to generate random key, value pairs. 
+# 5k keys, 100k values +x = np.tile(np.arange(5000, dtype='O'), 20) +random.shuffle(x) +xarr = x +x = [int(y) for y in x] +data = np.random.uniform(0, 1, 100000) + + +def f(): + # groupby sum + for k, v in zip(x, data): + try: + counts[k] += v + except KeyError: + counts[k] = v + + +def f2(): + rizer = lib.DictFactorizer() + labs, counts = rizer.factorize(xarr, sort=False) + k = len(rizer.uniques) + out = np.empty(k) + lib.group_add(out, counts, data, labs) + + +def algo4(): + rizer = lib.DictFactorizer() + labs1, _ = rizer.factorize(labels, sort=False) + k1 = len(rizer.uniques) + + rizer = lib.DictFactorizer() + labs2, _ = rizer.factorize(labels2, sort=False) + k2 = len(rizer.uniques) + + group_id = labs1 * k2 + labs2 + max_group = k1 * k2 + + if max_group > 1e6: + rizer = lib.Int64Factorizer(len(group_id)) + group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True) + max_group = len(rizer.uniques) + + out = np.empty(max_group) + counts = np.zeros(max_group, dtype='i4') + lib.group_add(out, counts, data, group_id) + +# cumtime percall filename:lineno(function) +# 0.592 0.592 :1() + # 0.584 0.006 groupby_ex.py:37(algo3_nosort) + # 0.535 0.005 {method 'factorize' of DictFactorizer' objects} + # 0.047 0.000 {pandas._tseries.group_add} + # 0.002 0.000 numeric.py:65(zeros_like) + # 0.001 0.000 {method 'fill' of 'numpy.ndarray' objects} + # 0.000 0.000 {numpy.core.multiarray.empty_like} + # 0.000 0.000 {numpy.core.multiarray.empty} + +# UNIQUE timings + +# N = 10000000 +# K = 500000 + +# groups = np.array([rands(10) for _ in range(K)], dtype='O') + +# labels = np.tile(groups, N // K) +data = np.random.randn(N) + +data = np.random.randn(N) + +Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000] + +# Ks = [500000, 1000000, 2500000, 5000000, 10000000] + +import psutil +import os +import gc + +pid = os.getpid() +proc = psutil.Process(pid) + + +def dict_unique(values, expected_K, sort=False, memory=False): + if memory: + gc.collect() + before_mem = proc.get_memory_info().rss + + rizer = lib.DictFactorizer() + result = rizer.unique_int64(values) + + if memory: + result = proc.get_memory_info().rss - before_mem + return result + + if sort: + result.sort() + assert(len(result) == expected_K) + return result + + +def khash_unique(values, expected_K, size_hint=False, sort=False, + memory=False): + if memory: + gc.collect() + before_mem = proc.get_memory_info().rss + + if size_hint: + rizer = lib.Factorizer(len(values)) + else: + rizer = lib.Factorizer(100) + + result = [] + result = rizer.unique(values) + + if memory: + result = proc.get_memory_info().rss - before_mem + return result + + if sort: + result.sort() + assert(len(result) == expected_K) + + +def khash_unique_str(values, expected_K, size_hint=False, sort=False, + memory=False): + if memory: + gc.collect() + before_mem = proc.get_memory_info().rss + + if size_hint: + rizer = lib.StringHashTable(len(values)) + else: + rizer = lib.StringHashTable(100) + + result = [] + result = rizer.unique(values) + + if memory: + result = proc.get_memory_info().rss - before_mem + return result + + if sort: + result.sort() + assert(len(result) == expected_K) + + +def khash_unique_int64(values, expected_K, size_hint=False, sort=False): + if size_hint: + rizer = lib.Int64HashTable(len(values)) + else: + rizer = lib.Int64HashTable(100) + + result = [] + result = rizer.unique(values) + + if sort: + result.sort() + assert(len(result) == expected_K) + + +def hash_bench(): + numpy = [] + dict_based = [] + dict_based_sort = [] + khash_hint = [] + khash_nohint = [] 
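+    # For each group cardinality K below, build N int64 labels containing K
+    # unique values and time the dict-based factorizer against the khash
+    # Int64HashTable, both with and without a size hint.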
+ for K in Ks: + print(K) + # groups = np.array([rands(10) for _ in range(K)]) + # labels = np.tile(groups, N // K).astype('O') + + groups = np.random.randint(0, long(100000000000), size=K) + labels = np.tile(groups, N // K) + dict_based.append(timeit(lambda: dict_unique(labels, K), 20)) + khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20)) + khash_hint.append(timeit(lambda: khash_unique_int64(labels, K, + size_hint=True), 20)) + + # memory, hard to get + # dict_based.append(np.mean([dict_unique(labels, K, memory=True) + # for _ in range(10)])) + # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True) + # for _ in range(10)])) + # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True, memory=True) + # for _ in range(10)])) + + # dict_based_sort.append(timeit(lambda: dict_unique(labels, K, + # sort=True), 10)) + # numpy.append(timeit(lambda: np.unique(labels), 10)) + + # unique_timings = DataFrame({'numpy.unique' : numpy, + # 'dict, no sort' : dict_based, + # 'dict, sort' : dict_based_sort}, + # columns=['dict, no sort', + # 'dict, sort', 'numpy.unique'], + # index=Ks) + + unique_timings = DataFrame({'dict': dict_based, + 'khash, preallocate': khash_hint, + 'khash': khash_nohint}, + columns=['khash, preallocate', 'khash', 'dict'], + index=Ks) + + unique_timings.plot(kind='bar', legend=False) + plt.legend(loc='best') + plt.title('Unique on 100,000 values, int64') + plt.xlabel('Number of unique labels') + plt.ylabel('Mean execution time') + + plt.show() diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R new file mode 100644 index 00000000..69d0f7a9 --- /dev/null +++ b/bench/bench_with_subset.R @@ -0,0 +1,53 @@ +library(microbenchmark) +library(data.table) + + +data.frame.subset.bench <- function (n=1e7, times=30) { + df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), + times=times)) +} + + +# data.table allows something very similar to query with an expression +# but we have chained comparisons AND we're faster BOO YAH! 
+data.table.subset.expression.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c], + times=times)) +} + + +# compare against subset with data.table for good measure +data.table.subset.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), + times=times)) +} + + +data.frame.with.bench <- function (n=1e7, times=30) { + df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + + print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), + times=times)) +} + + +data.table.with.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), + times=times)) +} + + +bench <- function () { + data.frame.subset.bench() + data.table.subset.expression.bench() + data.table.subset.bench() + data.frame.with.bench() + data.table.with.bench() +} + + +bench() diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py new file mode 100644 index 00000000..017401df --- /dev/null +++ b/bench/bench_with_subset.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +""" +Microbenchmarks for comparison with R's "with" and "subset" functions +""" + +from __future__ import print_function +import numpy as np +from numpy import array +from timeit import repeat as timeit +from pandas.compat import range, zip +from pandas import DataFrame + + +setup_common = """from pandas import DataFrame +from numpy.random import randn +df = DataFrame(randn(%d, 3), columns=list('abc')) +%s""" + + +setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" + + +def bench_with(n, times=10, repeat=3, engine='numexpr'): + return np.array(timeit('df.eval(s, engine=%r)' % engine, + setup=setup_common % (n, setup_with), + repeat=repeat, number=times)) / times + + +setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" + + +def bench_subset(n, times=10, repeat=3, engine='numexpr'): + return np.array(timeit('df.query(s, engine=%r)' % engine, + setup=setup_common % (n, setup_subset), + repeat=repeat, number=times)) / times + + +def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False): + r = np.logspace(mn, mx, num=num).round().astype(int) + + ev = DataFrame(np.empty((num, len(engines))), columns=engines) + qu = ev.copy(deep=True) + + ev['size'] = qu['size'] = r + + for engine in engines: + for i, n in enumerate(r): + if verbose: + print('engine: %r, i == %d' % (engine, i)) + ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine) + qu.loc[i, engine] = bench_subset(n, times=1, repeat=1, + engine=engine) + + return ev, qu + + +def plot_perf(df, engines, title, filename=None): + from matplotlib.pyplot import figure, rc + + try: + from mpltools import style + except ImportError: + pass + else: + style.use('ggplot') + + rc('text', usetex=True) + + fig = figure(figsize=(4, 3), dpi=100) + ax = fig.add_subplot(111) + + for engine in engines: + ax.plot(df.size, df[engine], label=engine, lw=2) + + ax.set_xlabel('Number of Rows') + ax.set_ylabel('Time (s)') + ax.set_title(title) + ax.legend(loc='best') + ax.tick_params(top=False, right=False) + + fig.tight_layout() + + if filename is not None: + fig.savefig(filename) + + +if __name__ == '__main__': + import os + import pandas as pd + + pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) + 
static_path = os.path.join(pandas_dir, 'doc', 'source', '_static') + + join = lambda p: os.path.join(static_path, p) + + fn = join('eval-query-perf-data.h5') + + engines = 'python', 'numexpr' + + if not os.path.exists(fn): + ev, qu = bench(verbose=True) + ev.to_hdf(fn, 'eval') + qu.to_hdf(fn, 'query') + else: + ev = pd.read_hdf(fn, 'eval') + qu = pd.read_hdf(fn, 'query') + + plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png')) + plot_perf(qu, engines, 'DataFrame.query()', + filename=join('query-perf.png')) + + plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()', + filename=join('eval-perf-small.png')) + plot_perf(qu[qu.size <= 500000], engines, 'DataFrame.query()', + filename=join('query-perf-small.png')) diff --git a/bench/better_unique.py b/bench/better_unique.py new file mode 100644 index 00000000..e03a4f43 --- /dev/null +++ b/bench/better_unique.py @@ -0,0 +1,80 @@ +from __future__ import print_function +from pandas import DataFrame +from pandas.compat import range, zip +import timeit + +setup = """ +from pandas import Series +import pandas._tseries as _tseries +from pandas.compat import range +import random +import numpy as np + +def better_unique(values): + uniques = _tseries.fast_unique(values) + id_map = _tseries.map_indices_buf(uniques) + labels = _tseries.get_unique_labels(values, id_map) + return uniques, labels + +tot = 100000 + +def get_test_data(ngroups=100, n=tot): + unique_groups = range(ngroups) + random.shuffle(unique_groups) + arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], + dtype=object) + + return arr + +arr = get_test_data(ngroups=%d) +""" + +group_sizes = [10, 100, 1000, 10000, + 20000, 30000, 40000, + 50000, 60000, 70000, + 80000, 90000, 100000] + +numbers = [100, 100, 50] + [10] * 10 + +numpy = [] +wes = [] + +for sz, n in zip(group_sizes, numbers): + # wes_timer = timeit.Timer(stmt='better_unique(arr)', + # setup=setup % sz) + wes_timer = timeit.Timer(stmt='_tseries.fast_unique(arr)', + setup=setup % sz) + + numpy_timer = timeit.Timer(stmt='np.unique(arr)', + setup=setup % sz) + + print(n) + numpy_result = numpy_timer.timeit(number=n) / n + wes_result = wes_timer.timeit(number=n) / n + + print('Groups: %d, NumPy: %s, Wes: %s' % (sz, numpy_result, wes_result)) + + wes.append(wes_result) + numpy.append(numpy_result) + +result = DataFrame({'wes': wes, 'numpy': numpy}, index=group_sizes) + + +def make_plot(numpy, wes): + pass + +# def get_test_data(ngroups=100, n=100000): +# unique_groups = range(ngroups) +# random.shuffle(unique_groups) +# arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) + +# if len(arr) < n: +# arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], +# dtype=object) + +# return arr + +# arr = get_test_data(ngroups=1000) diff --git a/bench/duplicated.R b/bench/duplicated.R new file mode 100644 index 00000000..eb2376df --- /dev/null +++ b/bench/duplicated.R @@ -0,0 +1,22 @@ +N <- 100000 + +k1 = rep(NA, N) +k2 = rep(NA, N) +for (i in 1:N){ + k1[i] <- paste(sample(letters, 1), collapse="") + k2[i] <- paste(sample(letters, 1), collapse="") +} +df <- data.frame(a=k1, b=k2, c=rep(1:100, N / 100)) +df2 <- data.frame(a=k1, b=k2) + +timings <- numeric() +timings2 <- numeric() +for (i in 1:50) { + gc() + timings[i] = system.time(deduped <- df[!duplicated(df),])[3] + gc() + timings2[i] = system.time(deduped <- df[!duplicated(df[,c("a", "b")]),])[3] +} + +mean(timings) +mean(timings2) diff --git 
a/bench/io_roundtrip.py b/bench/io_roundtrip.py new file mode 100644 index 00000000..d87da0ec --- /dev/null +++ b/bench/io_roundtrip.py @@ -0,0 +1,116 @@ +from __future__ import print_function +import time +import os +import numpy as np + +import la +import pandas +from pandas.compat import range +from pandas import datetools, DatetimeIndex + + +def timeit(f, iterations): + start = time.clock() + + for i in range(iterations): + f() + + return time.clock() - start + + +def rountrip_archive(N, K=50, iterations=10): + # Create data + arr = np.random.randn(N, K) + # lar = la.larry(arr) + dma = pandas.DataFrame(arr, + DatetimeIndex('1/1/2000', periods=N, + offset=datetools.Minute())) + dma[201] = 'bar' + + # filenames + filename_numpy = '/Users/wesm/tmp/numpy.npz' + filename_larry = '/Users/wesm/tmp/archive.hdf5' + filename_pandas = '/Users/wesm/tmp/pandas_tmp' + + # Delete old files + try: + os.unlink(filename_numpy) + except: + pass + try: + os.unlink(filename_larry) + except: + pass + + try: + os.unlink(filename_pandas) + except: + pass + + # Time a round trip save and load + # numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) + # numpy_time = timeit(numpy_f, iterations) / iterations + + # larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) + # larry_time = timeit(larry_f, iterations) / iterations + + pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) + pandas_time = timeit(pandas_f, iterations) / iterations + print('pandas (HDF5) %7.4f seconds' % pandas_time) + + pickle_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) + pickle_time = timeit(pickle_f, iterations) / iterations + print('pandas (pickle) %7.4f seconds' % pickle_time) + + # print('Numpy (npz) %7.4f seconds' % numpy_time) + # print('larry (HDF5) %7.4f seconds' % larry_time) + + # Delete old files + try: + os.unlink(filename_numpy) + except: + pass + try: + os.unlink(filename_larry) + except: + pass + + try: + os.unlink(filename_pandas) + except: + pass + + +def numpy_roundtrip(filename, arr1, arr2): + np.savez(filename, arr1=arr1, arr2=arr2) + npz = np.load(filename) + arr1 = npz['arr1'] + arr2 = npz['arr2'] + + +def larry_roundtrip(filename, lar1, lar2): + io = la.IO(filename) + io['lar1'] = lar1 + io['lar2'] = lar2 + lar1 = io['lar1'] + lar2 = io['lar2'] + + +def pandas_roundtrip(filename, dma1, dma2): + # What's the best way to code this? 
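+    # Below: write both frames into a single HDFStore file and read them back,
+    # mirroring what the pickle-based variant further down does.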
+ from pandas.io.pytables import HDFStore + store = HDFStore(filename) + store['dma1'] = dma1 + store['dma2'] = dma2 + dma1 = store['dma1'] + dma2 = store['dma2'] + + +def pandas_roundtrip_pickle(filename, dma1, dma2): + dma1.save(filename) + dma1 = pandas.DataFrame.load(filename) + dma2.save(filename) + dma2 = pandas.DataFrame.load(filename) + +if __name__ == '__main__': + rountrip_archive(10000, K=200) diff --git a/bench/larry.py b/bench/larry.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/serialize.py b/bench/serialize.py new file mode 100644 index 00000000..b0edd6a5 --- /dev/null +++ b/bench/serialize.py @@ -0,0 +1,89 @@ +from __future__ import print_function +from pandas.compat import range, lrange +import time +import os +import numpy as np + +import la +import pandas + + +def timeit(f, iterations): + start = time.clock() + + for i in range(iterations): + f() + + return time.clock() - start + + +def roundtrip_archive(N, iterations=10): + + # Create data + arr = np.random.randn(N, N) + lar = la.larry(arr) + dma = pandas.DataFrame(arr, lrange(N), lrange(N)) + + # filenames + filename_numpy = '/Users/wesm/tmp/numpy.npz' + filename_larry = '/Users/wesm/tmp/archive.hdf5' + filename_pandas = '/Users/wesm/tmp/pandas_tmp' + + # Delete old files + try: + os.unlink(filename_numpy) + except: + pass + try: + os.unlink(filename_larry) + except: + pass + try: + os.unlink(filename_pandas) + except: + pass + + # Time a round trip save and load + numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) + numpy_time = timeit(numpy_f, iterations) / iterations + + larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) + larry_time = timeit(larry_f, iterations) / iterations + + pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) + pandas_time = timeit(pandas_f, iterations) / iterations + + print('Numpy (npz) %7.4f seconds' % numpy_time) + print('larry (HDF5) %7.4f seconds' % larry_time) + print('pandas (HDF5) %7.4f seconds' % pandas_time) + + +def numpy_roundtrip(filename, arr1, arr2): + np.savez(filename, arr1=arr1, arr2=arr2) + npz = np.load(filename) + arr1 = npz['arr1'] + arr2 = npz['arr2'] + + +def larry_roundtrip(filename, lar1, lar2): + io = la.IO(filename) + io['lar1'] = lar1 + io['lar2'] = lar2 + lar1 = io['lar1'] + lar2 = io['lar2'] + + +def pandas_roundtrip(filename, dma1, dma2): + from pandas.io.pytables import HDFStore + store = HDFStore(filename) + store['dma1'] = dma1 + store['dma2'] = dma2 + dma1 = store['dma1'] + dma2 = store['dma2'] + + +def pandas_roundtrip_pickle(filename, dma1, dma2): + dma1.save(filename) + dma1 = pandas.DataFrame.load(filename) + dma2.save(filename) + dma2 = pandas.DataFrame.load(filename) diff --git a/bench/test.py b/bench/test.py new file mode 100644 index 00000000..2339deab --- /dev/null +++ b/bench/test.py @@ -0,0 +1,70 @@ +import numpy as np +import itertools +import collections +import scipy.ndimage as ndi +from pandas.compat import zip, range + +N = 10000 + +lat = np.random.randint(0, 360, N) +lon = np.random.randint(0, 360, N) +data = np.random.randn(N) + + +def groupby1(lat, lon, data): + indexer = np.lexsort((lon, lat)) + lat = lat.take(indexer) + lon = lon.take(indexer) + sorted_data = data.take(indexer) + + keys = 1000. 
* lat + lon + unique_keys = np.unique(keys) + bounds = keys.searchsorted(unique_keys) + + result = group_agg(sorted_data, bounds, lambda x: x.mean()) + + decoder = keys.searchsorted(unique_keys) + + return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) + + +def group_mean(lat, lon, data): + indexer = np.lexsort((lon, lat)) + lat = lat.take(indexer) + lon = lon.take(indexer) + sorted_data = data.take(indexer) + + keys = 1000 * lat + lon + unique_keys = np.unique(keys) + + result = ndi.mean(sorted_data, labels=keys, index=unique_keys) + decoder = keys.searchsorted(unique_keys) + + return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) + + +def group_mean_naive(lat, lon, data): + grouped = collections.defaultdict(list) + for lt, ln, da in zip(lat, lon, data): + grouped[(lt, ln)].append(da) + + averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items()) + + return averaged + + +def group_agg(values, bounds, f): + N = len(values) + result = np.empty(len(bounds), dtype=float) + for i, left_bound in enumerate(bounds): + if i == len(bounds) - 1: + right_bound = N + else: + right_bound = bounds[i + 1] + + result[i] = f(values[left_bound: right_bound]) + + return result + +# for i in range(10): +# groupby1(lat, lon, data) diff --git a/bench/zoo_bench.R b/bench/zoo_bench.R new file mode 100644 index 00000000..294d55f5 --- /dev/null +++ b/bench/zoo_bench.R @@ -0,0 +1,71 @@ +library(zoo) +library(xts) +library(fts) +library(tseries) +library(its) +library(xtable) + +## indices = rep(NA, 100000) +## for (i in 1:100000) +## indices[i] <- paste(sample(letters, 10), collapse="") + + + +## x <- zoo(rnorm(100000), indices) +## y <- zoo(rnorm(90000), indices[sample(1:100000, 90000)]) + +## indices <- as.POSIXct(1:100000) + +indices <- as.POSIXct(Sys.Date()) + seq(1, 100000000, 100) + +sz <- 500000 + +## x <- xts(rnorm(sz), sample(indices, sz)) +## y <- xts(rnorm(sz), sample(indices, sz)) + +zoo.bench <- function(){ + x <- zoo(rnorm(sz), sample(indices, sz)) + y <- zoo(rnorm(sz), sample(indices, sz)) + timeit(function() {x + y}) +} + +xts.bench <- function(){ + x <- xts(rnorm(sz), sample(indices, sz)) + y <- xts(rnorm(sz), sample(indices, sz)) + timeit(function() {x + y}) +} + +fts.bench <- function(){ + x <- fts(rnorm(sz), sort(sample(indices, sz))) + y <- fts(rnorm(sz), sort(sample(indices, sz)) + timeit(function() {x + y}) +} + +its.bench <- function(){ + x <- its(rnorm(sz), sort(sample(indices, sz))) + y <- its(rnorm(sz), sort(sample(indices, sz))) + timeit(function() {x + y}) +} + +irts.bench <- function(){ + x <- irts(sort(sample(indices, sz)), rnorm(sz)) + y <- irts(sort(sample(indices, sz)), rnorm(sz)) + timeit(function() {x + y}) +} + +timeit <- function(f){ + timings <- numeric() + for (i in 1:10) { + gc() + timings[i] = system.time(f())[3] + } + mean(timings) +} + +bench <- function(){ + results <- c(xts.bench(), fts.bench(), its.bench(), zoo.bench()) + names <- c("xts", "fts", "its", "zoo") + data.frame(results, names) +} + +result <- bench() diff --git a/bench/zoo_bench.py b/bench/zoo_bench.py new file mode 100644 index 00000000..74cb1952 --- /dev/null +++ b/bench/zoo_bench.py @@ -0,0 +1,36 @@ +from pandas import * +from pandas.util.testing import rands + +n = 1000000 +# indices = Index([rands(10) for _ in xrange(n)]) + + +def sample(values, k): + sampler = np.random.permutation(len(values)) + return values.take(sampler[:k]) +sz = 500000 +rng = np.arange(0, 10000000000000, 10000000) +stamps = np.datetime64(datetime.now()).view('i8') + rng +idx1 = 
np.sort(sample(stamps, sz)) +idx2 = np.sort(sample(stamps, sz)) +ts1 = Series(np.random.randn(sz), idx1) +ts2 = Series(np.random.randn(sz), idx2) + + +# subsample_size = 90000 + +# x = Series(np.random.randn(100000), indices) +# y = Series(np.random.randn(subsample_size), +# index=sample(indices, subsample_size)) + + +# lx = larry(np.random.randn(100000), [list(indices)]) +# ly = larry(np.random.randn(subsample_size), [list(y.index)]) + +# Benchmark 1: Two 1-million length time series (int64-based index) with +# randomly chosen timestamps + +# Benchmark 2: Join two 5-variate time series DataFrames (outer and inner join) + +# df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5)) +# df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10)) diff --git a/ci/README.txt b/ci/README.txt new file mode 100644 index 00000000..f69fc832 --- /dev/null +++ b/ci/README.txt @@ -0,0 +1,17 @@ +Travis is a ci service that's well-integrated with github. +The following ypes of breakage should be detected +by travis builds: + +1) Failing tests on any supported version of python. +2) Pandas should install and the tests should run if no optional deps are installed. +That also means tests which rely on optional deps need to raise SkipTest() +if the dep is missing. +3) unicode related fails when running under exotic locales. + +We tried running the vbench suite for a while, but with varying load +on travis machines, that wasn't useful. + +Travis currently (4/2013) has a 5-job concurrency limit. Exceeding it +basically doubles the total runtime for a commit through travis, and +since dep+pandas installation is already quite long, this should become +a hard limit on concurrent travis runs. diff --git a/ci/after_script.sh b/ci/after_script.sh new file mode 100755 index 00000000..b17d69da --- /dev/null +++ b/ci/after_script.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +#wget https://raw.github.com/y-p/ScatterCI-CLI/master/scatter_cli.py +#chmod u+x scatter_cli.py + +pip install -I requests==2.1.0 +echo "${TRAVIS_PYTHON_VERSION:0:4}" +if [ x"${TRAVIS_PYTHON_VERSION:0:4}" == x"2.6" ]; then + pip install simplejson; +fi + +# ScatterCI accepts a build log, but currently does nothing with it. 
+echo '' > /tmp/build.log + +# nore exposed in the build logs +#export SCATTERCI_ACCESS_KEY= +#export SCATTERCI_HOST= + +# Generate a json file describing system and dep versions +ci/print_versions.py -j /tmp/env.json + +# nose ran using "--with-xunit --xunit-file nosetest.xml" and generated /tmp/nosetest.xml +# Will timeout if server not available, and should not fail the build +#python scatter_cli.py --xunit-file /tmp/nosetests.xml --log-file /tmp/build.log --env-file /tmp/env.json --build-name "$JOB_NAME" --succeed + +true # never fail because bad things happened here diff --git a/ci/before_install.sh b/ci/before_install.sh new file mode 100755 index 00000000..e4376e1b --- /dev/null +++ b/ci/before_install.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# If envars.sh determined we're running in an authorized fork +# and the user opted in to the network cache,and that cached versions +# are available on the cache server, download and deploy the cached +# files to the local filesystem + +echo "inside $0" + +# overview +sudo apt-get update $APT_ARGS # run apt-get update for all versions + +true # never fail because bad things happened here diff --git a/ci/build_docs.sh b/ci/build_docs.sh new file mode 100755 index 00000000..583b3685 --- /dev/null +++ b/ci/build_docs.sh @@ -0,0 +1,49 @@ +#!/bin/bash + + +cd "$TRAVIS_BUILD_DIR" + +git show --pretty="format:" --name-only HEAD~5.. --first-parent | grep -P "rst|txt|doc" + +if [ "$?" != "0" ]; then + echo "Skipping doc build, none were modified" + # nope, skip docs build + exit 0 +fi + + +if [ x"$DOC_BUILD" != x"" ]; then + # we're running network tests, let's build the docs in the meantim + echo "Will build docs" + pip install sphinx==1.1.3 ipython==1.1.0 + + mv "$TRAVIS_BUILD_DIR"/doc /tmp + cd /tmp/doc + + rm /tmp/doc/source/api.rst # no R + rm /tmp/doc/source/r_interface.rst # no R + + echo ############################### > /tmp/doc.log + echo # Log file for the doc build # > /tmp/doc.log + echo ############################### > /tmp/doc.log + echo "" > /tmp/doc.log + echo -e "y\n" | ./make.py --no-api 2>&1 + + cd /tmp/doc/build/html + git config --global user.email "pandas-docs-bot@localhost.foo" + git config --global user.name "pandas-docs-bot" + + git init + touch README + git add README + git commit -m "Initial commit" --allow-empty + git branch gh-pages + git checkout gh-pages + touch .nojekyll + git add --all . + git commit -m "Version" --allow-empty + git remote add origin https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis + git push origin gh-pages -f +fi + +exit 0 diff --git a/ci/cron/go_doc.sh b/ci/cron/go_doc.sh new file mode 100755 index 00000000..89659577 --- /dev/null +++ b/ci/cron/go_doc.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# This is a one-command cron job for setting up +# a virtualenv-based, linux-based, py2-based environment +# for building the Pandas documentation. +# +# The first run will install all required deps from pypi +# into the venv including monsters like scipy. +# You may want to set it up yourself to speed up the +# process. +# +# This is meant to be run as a cron job under a dedicated +# user account whose HOME directory contains this script. +# a CI directory will be created under it and all files +# stored within it. +# +# The hardcoded dep versions will gradually become obsolete +# You may need to tweak them +# +# @y-p, Jan/2014 + +# disto latex is sometimes finicky. 
Optionall use +# a local texlive install +export PATH=/mnt/debian/texlive/2013/bin/x86_64-linux:$PATH + +# Having ccache will speed things up +export PATH=/usr/lib64/ccache/:$PATH + +# limit disk usage +ccache -M 200M + +BASEDIR="$HOME/CI" +REPO_URL="https://github.com/pydata/pandas" +REPO_LOC="$BASEDIR/pandas" + +if [ ! -d $BASEDIR ]; then + mkdir -p $BASEDIR + virtualenv $BASEDIR/venv +fi + +source $BASEDIR/venv/bin/activate + +pip install numpy==1.7.2 +pip install cython==0.20.0 +pip install python-dateutil==2.2 +pip install --pre pytz==2013.9 +pip install sphinx==1.1.3 +pip install numexpr==2.2.2 + +pip install matplotlib==1.3.0 +pip install lxml==3.2.5 +pip install beautifulsoup4==4.3.2 +pip install html5lib==0.99 + +# You'll need R as well +pip install rpy2==2.3.9 + +pip install tables==3.0.0 +pip install bottleneck==0.7.0 +pip install ipython==0.13.2 + +# only if you have too +pip install scipy==0.13.2 + +pip install openpyxl==1.6.2 +pip install xlrd==0.9.2 +pip install xlwt==0.7.5 +pip install xlsxwriter==0.5.1 +pip install sqlalchemy==0.8.3 + +if [ ! -d "$REPO_LOC" ]; then + git clone "$REPO_URL" "$REPO_LOC" +fi + +cd "$REPO_LOC" +git reset --hard +git clean -df +git checkout master +git pull origin +make + +source $BASEDIR/venv/bin/activate +export PATH="/usr/lib64/ccache/:$PATH" +pip uninstall pandas -yq +pip install "$REPO_LOC" + +cd "$REPO_LOC"/doc + +python make.py clean +python make.py html +if [ ! $? == 0 ]; then + exit 1 +fi +python make.py zip_html +# usually requires manual intervention +# python make.py latex + +# If you have access: +# python make.py upload_dev diff --git a/ci/install.sh b/ci/install.sh new file mode 100755 index 00000000..fd680011 --- /dev/null +++ b/ci/install.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# There are 2 distinct pieces that get zipped and cached +# - The venv site-packages dir including the installed dependencies +# - The pandas build artifacts, using the build cache support via +# scripts/use_build_cache.py +# +# if the user opted in to use the cache and we're on a whitelisted fork +# - if the server doesn't hold a cached version of venv/pandas build, +# do things the slow way, and put the results on the cache server +# for the next time. +# - if the cache files are available, instal some necessaries via apt +# (no compiling needed), then directly goto script and collect 200$. 
+# + +function edit_init() +{ + if [ -n "$LOCALE_OVERRIDE" ]; then + echo "Adding locale to the first line of pandas/__init__.py" + rm -f pandas/__init__.pyc + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sed -i "$sedc" pandas/__init__.py + echo "head -4 pandas/__init__.py" + head -4 pandas/__init__.py + echo + fi +} + +edit_init + +python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" +[ "$python_major_version" == "2" ] && python_major_version="" + +home_dir=$(pwd) +echo "home_dir: [$home_dir]" + +# known working +# pip==1.5.1 +# setuptools==2.2 +# wheel==0.22 +# nose==1.3.3 + +pip install -I -U pip +pip install -I -U setuptools +pip install wheel==0.22 +pip install nose==1.3.3 + +# comment this line to disable the fetching of wheel files +base_url=http://pandas.pydata.org/pandas-build/dev/wheels + +wheel_box=${TRAVIS_PYTHON_VERSION}${JOB_TAG} +PIP_ARGS+=" -I --use-wheel --find-links=$base_url/$wheel_box/ --allow-external --allow-insecure" + +if [ -n "$LOCALE_OVERRIDE" ]; then + # make sure the locale is available + # probably useless, since you would need to relogin + time sudo locale-gen "$LOCALE_OVERRIDE" +fi + +# we need these for numpy +time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran + +if [ -n "$NUMPY_BUILD" ]; then + # building numpy + + cd $home_dir + echo "cloning numpy" + + rm -Rf /tmp/numpy + cd /tmp + + # remove the system installed numpy + pip uninstall numpy -y + + # install cython + pip install --find-links http://wheels.astropy.org/ --find-links http://wheels2.astropy.org/ --use-wheel Cython + + # clone & install + git clone --branch $NUMPY_BUILD https://github.com/numpy/numpy.git numpy + cd numpy + time pip install . + pip uninstall cython -y + + cd $home_dir + numpy_version=$(python -c 'import numpy; print(numpy.__version__)') + echo "[$home_dir] numpy current: $numpy_version" +fi + +# Force virtualenv to accept system_site_packages +rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt + +time pip install $PIP_ARGS -r ci/requirements-${wheel_box}.txt + +# Need to enable for locale testing. The location of the locale file(s) is +# distro specific. 
For example, on Arch Linux all of the locales are in a +# commented file--/etc/locale.gen--that must be commented in to be used +# whereas Ubuntu looks in /var/lib/locales/supported.d/* and generates locales +# based on what's in the files in that folder +time echo 'it_CH.UTF-8 UTF-8' | sudo tee -a /var/lib/locales/supported.d/it +time sudo locale-gen + + +# install gui for clipboard testing +if [ -n "$CLIPBOARD_GUI" ]; then + echo "Using CLIPBOARD_GUI: $CLIPBOARD_GUI" + [ -n "$python_major_version" ] && py="py" + python_cb_gui_pkg=python${python_major_version}-${py}${CLIPBOARD_GUI} + time sudo apt-get $APT_ARGS install $python_cb_gui_pkg +fi + + +# install a clipboard if $CLIPBOARD is not empty +if [ -n "$CLIPBOARD" ]; then + echo "Using clipboard: $CLIPBOARD" + time sudo apt-get $APT_ARGS install $CLIPBOARD +fi + + +# Optional Deps +if [ -n "$FULL_DEPS" ]; then + echo "Installing FULL_DEPS" + + # need libhdf5 for PyTables + time sudo apt-get $APT_ARGS install libhdf5-serial-dev +fi + + +# set the compiler cache to work +if [ "$IRON_TOKEN" ]; then + export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH + gcc=$(which gcc) + echo "gcc: $gcc" + ccache=$(which ccache) + echo "ccache: $ccache" + export CC='ccache gcc' +fi + +# build pandas +time python setup.py sdist +pip uninstall cython -y + +# install pandas +time pip install $(find dist | grep gz | head -n 1) + +# restore cython (if not numpy building) +if [ -z "$NUMPY_BUILD" ]; then + time pip install $PIP_ARGS $(cat ci/requirements-${wheel_box}.txt | grep -i cython) +fi + +true diff --git a/ci/ironcache/get.py b/ci/ironcache/get.py new file mode 100644 index 00000000..a4663472 --- /dev/null +++ b/ci/ironcache/get.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import re +import os +import time +import json +import base64 +from hashlib import sha1 +from iron_cache import * +import traceback as tb + +key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), + os.environ.get('JOB_NAME','unk')) +print(key) + +if sys.version_info[0] > 2: + key = bytes(key,encoding='utf8') + +key = sha1(key).hexdigest()[:8]+'.' + +b = b'' +cache = IronCache() +for i in range(20): + print("getting %s" % key+str(i)) + try: + item = cache.get(cache="travis", key=key+str(i)) + v = item.value + if sys.version_info[0] > 2: + v = bytes(v,encoding='utf8') + b += bytes(base64.b64decode(v)) + except Exception as e: + try: + print(tb.format_exc(e)) + except: + print("exception during exception, oh my") + break + +with open(os.path.join(os.environ.get('HOME',''),"ccache.7z"),'wb') as f: + f.write(b) diff --git a/ci/ironcache/put.py b/ci/ironcache/put.py new file mode 100644 index 00000000..f6aef3a3 --- /dev/null +++ b/ci/ironcache/put.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import re +import os +import time +import json +import base64 +from hashlib import sha1 +from iron_cache import * + +key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), + os.environ.get('JOB_NAME','unk')) + +key='KEY.%s.%s' %(os.environ.get('TRAVIS_REPO_SLUG','unk'), + os.environ.get('JOB_NAME','unk')) +print(key) + +if sys.version_info[0] > 2: + key = bytes(key,encoding='utf8') + +key = sha1(key).hexdigest()[:8]+'.' 
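+
+# The ccache archive is expected to be pre-split into numbered chunks
+# (ccache.0, ccache.1, ...) under $HOME; each chunk is base64-encoded and
+# stored under <key><i> so that get.py can fetch and reassemble them in order.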
+ +os.chdir(os.environ.get('HOME')) + +cache = IronCache() + +i=0 + +for i, fname in enumerate(sorted([x for x in os.listdir('.') if re.match("ccache.\d+$",x)])): + print("Putting %s" % key+str(i)) + with open(fname,"rb") as f: + s= f.read() + value=base64.b64encode(s) + if isinstance(value, bytes): + value = value.decode('ascii') + item = cache.put(cache="travis", key=key+str(i), value=value,options=dict(expires_in=24*60*60)) + +# print("foo") +for i in range(i+1,20): + + try: + item = cache.delete(key+str(i),cache='travis') + print("Deleted %s" % key+str(i)) + except: + break + pass diff --git a/ci/prep_ccache.sh b/ci/prep_ccache.sh new file mode 100755 index 00000000..34e1f252 --- /dev/null +++ b/ci/prep_ccache.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +if [ "$IRON_TOKEN" ]; then + + home_dir=$(pwd) + + # install the compiler cache + sudo apt-get $APT_ARGS install ccache p7zip-full + # iron_cache, pending py3 fixes upstream + pip install -I --allow-external --allow-insecure git+https://github.com/iron-io/iron_cache_python.git@8a451c7d7e4d16e0c3bedffd0f280d5d9bd4fe59#egg=iron_cache + + python ci/ironcache/get.py + ccache -C + + clear_cache=0 + if [ -f ~/ccache.7z ]; then + echo "Cache retrieved" + clear_cache=1 + cd $HOME + 7za e $HOME/ccache.7z + # ls -l $HOME + cd / + tar xvf $HOME/ccache + rm -rf $HOME/ccache.7z + rm -rf $HOME/ccache + + fi + + # did the last commit change cython files? + cd $home_dir + + retval=$(git diff HEAD~3 --numstat | grep -P "pyx|pxd"|wc -l) + echo "number of cython files changed: $retval" + + if [ $clear_cache -eq 1 ] && [ $retval -eq 0 ] + then + # nope, reuse cython files + echo "Will reuse cached cython file" + touch "$TRAVIS_BUILD_DIR"/pandas/*.c + touch "$TRAVIS_BUILD_DIR"/pandas/src/*.c + touch "$TRAVIS_BUILD_DIR"/pandas/*.cpp + else + echo "Rebuilding cythonized files" + fi +fi + +exit 0 diff --git a/ci/print_skipped.py b/ci/print_skipped.py new file mode 100755 index 00000000..9fb05df6 --- /dev/null +++ b/ci/print_skipped.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import sys +import math +import xml.etree.ElementTree as et + + +def parse_results(filename): + tree = et.parse(filename) + root = tree.getroot() + skipped = [] + + current_class = old_class = '' + i = 1 + assert i - 1 == len(skipped) + for el in root.findall('testcase'): + cn = el.attrib['classname'] + for sk in el.findall('skipped'): + old_class = current_class + current_class = cn + name = '{classname}.{name}'.format(classname=current_class, + name=el.attrib['name']) + msg = sk.attrib['message'] + out = '' + if old_class != current_class: + ndigits = int(math.log(i, 10) + 1) + out += ('-' * (len(name + msg) + 4 + ndigits) + '\n') # 4 for : + space + # + space + out += '#{i} {name}: {msg}'.format(i=i, name=name, msg=msg) + skipped.append(out) + i += 1 + assert i - 1 == len(skipped) + assert i - 1 == len(skipped) + assert len(skipped) == int(root.attrib['skip']) + return '\n'.join(skipped) + + +def main(args): + print('SKIPPED TESTS:') + print(parse_results(args.filename)) + return 0 + + +def parse_args(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('filename', help='XUnit file to parse') + return parser.parse_args() + + +if __name__ == '__main__': + sys.exit(main(parse_args())) diff --git a/ci/print_versions.py b/ci/print_versions.py new file mode 100755 index 00000000..8be79517 --- /dev/null +++ b/ci/print_versions.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + + +def show_versions(as_json=False): + import imp + import os + fn = __file__ + this_dir = 
os.path.dirname(fn) + pandas_dir = os.path.abspath(os.path.join(this_dir, "..")) + sv_path = os.path.join(pandas_dir, 'pandas', 'util') + mod = imp.load_module( + 'pvmod', *imp.find_module('print_versions', [sv_path])) + return mod.show_versions(as_json) + + +if __name__ == '__main__': + # optparse is 2.6-safe + from optparse import OptionParser + parser = OptionParser() + parser.add_option("-j", "--json", metavar="FILE", nargs=1, + help="Save output as JSON into file, pass in '-' to output to stdout") + + (options, args) = parser.parse_args() + + if options.json == "-": + options.json = True + + show_versions(as_json=options.json) diff --git a/ci/requirements-2.6.txt b/ci/requirements-2.6.txt new file mode 100644 index 00000000..117d1400 --- /dev/null +++ b/ci/requirements-2.6.txt @@ -0,0 +1,16 @@ +numpy==1.6.1 +cython==0.19.1 +python-dateutil==1.5 +pytz==2013b +http://www.crummy.com/software/BeautifulSoup/bs4/download/4.2/beautifulsoup4-4.2.0.tar.gz +html5lib==1.0b2 +numexpr==1.4.2 +sqlalchemy==0.7.1 +pymysql==0.6.0 +psycopg2==2.5 +scipy==0.11.0 +statsmodels==0.4.3 +xlwt==0.7.5 +openpyxl==2.0.3 +xlsxwriter==0.4.6 +xlrd==0.9.2 diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt new file mode 100644 index 00000000..c5addb93 --- /dev/null +++ b/ci/requirements-2.7.txt @@ -0,0 +1,25 @@ +python-dateutil==2.1 +pytz==2013b +xlwt==0.7.5 +numpy==1.8.1 +cython==0.19.1 +bottleneck==0.6.0 +numexpr==2.2.2 +tables==2.3.1 +matplotlib==1.3.1 +openpyxl==1.6.2 +xlsxwriter==0.4.6 +xlrd==0.9.2 +patsy==0.1.0 +sqlalchemy==0.9.6 +pymysql==0.6.1 +psycopg2==2.5.2 +html5lib==1.0b2 +lxml==3.2.1 +scipy==0.13.3 +beautifulsoup4==4.2.1 +statsmodels==0.5.0 +boto==2.26.1 +httplib2==0.8 +python-gflags==2.0 +google-api-python-client==1.2 diff --git a/ci/requirements-2.7_LOCALE.txt b/ci/requirements-2.7_LOCALE.txt new file mode 100644 index 00000000..a4d2b857 --- /dev/null +++ b/ci/requirements-2.7_LOCALE.txt @@ -0,0 +1,18 @@ +python-dateutil +pytz==2013b +xlwt==0.7.5 +openpyxl==1.6.2 +xlsxwriter==0.4.6 +xlrd==0.9.2 +numpy==1.6.1 +cython==0.19.1 +bottleneck==0.6.0 +matplotlib==1.3.0 +patsy==0.1.0 +sqlalchemy==0.8.1 +html5lib==1.0b2 +lxml==3.2.1 +scipy==0.10.0 +beautifulsoup4==4.2.1 +statsmodels==0.4.3 +bigquery==2.0.17 diff --git a/ci/requirements-2.7_NUMPY_DEV_1_8_x.txt b/ci/requirements-2.7_NUMPY_DEV_1_8_x.txt new file mode 100644 index 00000000..90fa8f11 --- /dev/null +++ b/ci/requirements-2.7_NUMPY_DEV_1_8_x.txt @@ -0,0 +1,3 @@ +python-dateutil +pytz==2013b +cython==0.19.1 diff --git a/ci/requirements-2.7_NUMPY_DEV_master.txt b/ci/requirements-2.7_NUMPY_DEV_master.txt new file mode 100644 index 00000000..7d1d11da --- /dev/null +++ b/ci/requirements-2.7_NUMPY_DEV_master.txt @@ -0,0 +1,3 @@ +python-dateutil +pytz +cython==0.19.1 diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt new file mode 100644 index 00000000..40a5310a --- /dev/null +++ b/ci/requirements-3.2.txt @@ -0,0 +1,14 @@ +python-dateutil==2.1 +pytz==2013b +xlsxwriter==0.4.6 +xlrd==0.9.2 +numpy==1.7.1 +cython==0.19.1 +numexpr==2.1 +tables==3.0.0 +matplotlib==1.2.1 +patsy==0.1.0 +lxml==3.2.1 +scipy==0.12.0 +beautifulsoup4==4.2.1 +statsmodels==0.5.0 diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt new file mode 100644 index 00000000..fc8cb043 --- /dev/null +++ b/ci/requirements-3.3.txt @@ -0,0 +1,17 @@ +python-dateutil==2.2 +pytz==2013b +openpyxl==1.6.2 +xlsxwriter==0.4.6 +xlrd==0.9.2 +html5lib==1.0b2 +numpy==1.8.0 +cython==0.19.1 +numexpr==2.3 +tables==3.1.0 +bottleneck==0.8.0 +matplotlib==1.2.1 +patsy==0.1.0 +lxml==3.2.1 
+scipy==0.13.3 +beautifulsoup4==4.2.1 +statsmodels==0.5.0 diff --git a/ci/requirements-3.4.txt b/ci/requirements-3.4.txt new file mode 100644 index 00000000..0747e6f5 --- /dev/null +++ b/ci/requirements-3.4.txt @@ -0,0 +1,19 @@ +python-dateutil +pytz +openpyxl +xlsxwriter +xlrd +html5lib +numpy==1.8.0 +cython==0.20.2 +scipy==0.13.3 +numexpr==2.4 +tables==3.1.0 +bottleneck==0.8.0 +matplotlib==1.3.1 +patsy +lxml==3.3.5 +sqlalchemy==0.9.6 +pymysql==0.6.1 +psycopg2==2.5.2 +beautifulsoup4 diff --git a/ci/script.sh b/ci/script.sh new file mode 100755 index 00000000..152a2f1e --- /dev/null +++ b/ci/script.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +echo "inside $0" + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE"; + echo "Setting LC_ALL to $LOCALE_OVERRIDE" + curdir="$(pwd)" + cd /tmp + pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' + python -c "$pycmd" + cd "$curdir" +fi + +# conditionally build and upload docs to GH/pandas-docs/pandas-docs/travis +"$TRAVIS_BUILD_DIR"/ci/build_docs.sh 2>&1 > /tmp/doc.log & +# doc build log will be shown after tests + +echo nosetests --exe -w /tmp -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml +nosetests --exe -w /tmp -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml + +RET="$?" + +# wait until subprocesses finish (build_docs.sh) +wait + +exit "$RET" diff --git a/ci/speedpack/Vagrantfile b/ci/speedpack/Vagrantfile new file mode 100644 index 00000000..ec939b7c --- /dev/null +++ b/ci/speedpack/Vagrantfile @@ -0,0 +1,22 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : +Vagrant.configure("2") do |config| + config.vm.box = "precise64" + config.vm.box_url = "http://files.vagrantup.com/precise64.box" + +# config.vbguest.auto_update = true +# config.vbguest.no_remote = true + + config.vm.synced_folder File.expand_path("..", Dir.pwd), "/reqf" + config.vm.synced_folder "wheelhouse", "/wheelhouse" + + config.vm.provider :virtualbox do |vb| + vb.customize ["modifyvm", :id, "--cpus", "4"] + vb.customize ["modifyvm", :id, "--memory", "2048"] + vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] + vb.customize ["modifyvm", :id, "--natdnsproxy1", "on"] + end + + config.vm.provision :shell, :path => "build.sh" + +end diff --git a/ci/speedpack/build.sh b/ci/speedpack/build.sh new file mode 100755 index 00000000..330d8984 --- /dev/null +++ b/ci/speedpack/build.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# This script is meant to run on a mint precise64 VM. +# The generated wheel files should be compatible +# with travis-ci as of 07/2013. +# +# Runtime can be up to an hour or more. + +echo "Building wheels..." 
+ +# print a trace for everything; RTFM +set -x + +# install and update some basics +apt-get update +apt-get install python-software-properties git -y +apt-add-repository ppa:fkrull/deadsnakes -y +apt-get update + +# install some deps and virtualenv +apt-get install python-pip libfreetype6-dev libpng12-dev libhdf5-serial-dev \ + g++ libatlas-base-dev gfortran libreadline-dev zlib1g-dev flex bison \ + libxml2-dev libxslt-dev libssl-dev -y +pip install virtualenv +apt-get build-dep python-lxml -y + +# install sql servers +apt-get install postgresql-client libpq-dev -y + +export PYTHONIOENCODING='utf-8' +export VIRTUALENV_DISTRIBUTE=0 + +function create_fake_pandas() { + local site_pkg_dir="$1" + rm -rf $site_pkg_dir/pandas + mkdir $site_pkg_dir/pandas + touch $site_pkg_dir/pandas/__init__.py + echo "version = '0.10.0-phony'" > $site_pkg_dir/pandas/version.py +} + + +function get_site_pkgs_dir() { + python$1 -c 'import distutils; print(distutils.sysconfig.get_python_lib())' +} + + +function create_wheel() { + local pip_args="$1" + local wheelhouse="$2" + local n="$3" + local pyver="$4" + + local site_pkgs_dir="$(get_site_pkgs_dir $pyver)" + + + if [[ "$n" == *statsmodels* ]]; then + create_fake_pandas $site_pkgs_dir && \ + pip wheel $pip_args --wheel-dir=$wheelhouse $n && \ + pip install $pip_args --no-index $n && \ + rm -Rf $site_pkgs_dir + else + pip wheel $pip_args --wheel-dir=$wheelhouse $n + pip install $pip_args --no-index $n + fi +} + + +function generate_wheels() { + # get the requirements file + local reqfile="$1" + + # get the python version + local TAG=$(echo $reqfile | grep -Po "(\d\.?[\d\-](_\w+)?)") + + # base dir for wheel dirs + local WHEELSTREET=/wheelhouse + local WHEELHOUSE="$WHEELSTREET/$TAG" + + local PY_VER="${TAG:0:3}" + local PY_MAJOR="${PY_VER:0:1}" + local PIP_ARGS="--use-wheel --find-links=$WHEELHOUSE --download-cache /tmp" + + # install the python version if not installed + apt-get install python$PY_VER python$PY_VER-dev -y + + # create a new virtualenv + rm -Rf /tmp/venv + virtualenv -p python$PY_VER /tmp/venv + source /tmp/venv/bin/activate + + # install pip setuptools + pip install -I --download-cache /tmp 'git+https://github.com/pypa/pip@42102e9d#egg=pip' + pip install -I -U --download-cache /tmp setuptools + pip install -I --download-cache /tmp wheel + + # make the dir if it doesn't exist + mkdir -p $WHEELHOUSE + + # put the requirements file in the wheelhouse + cp $reqfile $WHEELHOUSE + + # install and build the wheels + cat $reqfile | while read N; do + create_wheel "$PIP_ARGS" "$WHEELHOUSE" "$N" "$PY_VER" + done +} + + +# generate a single wheel version +# generate_wheels "/reqf/requirements-2.7.txt" +# +# if vagrant is already up +# run as vagrant provision + +for reqfile in $(ls -1 /reqf/requirements-*.*); do + generate_wheels "$reqfile" +done diff --git a/ci/speedpack/nginx/nginx.conf.template b/ci/speedpack/nginx/nginx.conf.template new file mode 100644 index 00000000..e2cfeaf0 --- /dev/null +++ b/ci/speedpack/nginx/nginx.conf.template @@ -0,0 +1,48 @@ +#user nobody; +worker_processes 1; + +#error_log logs/error.log; +#error_log logs/error.log notice; +#error_log logs/error.log info; + +#pid logs/nginx.pid; + + +events { + worker_connections 1024; +} + + +http { + include mime.types; + default_type application/octet-stream; + + #log_format main '$remote_addr - $remote_user [$time_local] "$request" ' + # '$status $body_bytes_sent "$http_referer" ' + # '"$http_user_agent" "$http_x_forwarded_for"'; + + #access_log logs/access.log on; + + sendfile on; + 
#tcp_nopush on;
+
+ #keepalive_timeout 0;
+ keepalive_timeout 65;
+
+ #gzip on;
+
+ server {
+ listen $OPENSHIFT_IP:$OPENSHIFT_PORT;
+
+ access_log access.log ;
+ sendfile on;
+
+ location / {
+ root ../../app-root/data/store/;
+ autoindex on;
+ }
+
+
+ }
+
+} diff --git a/ci/submit_ccache.sh b/ci/submit_ccache.sh
new file mode 100755
index 00000000..29d76588
--- /dev/null
+++ b/ci/submit_ccache.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+home_dir=$(pwd)
+ccache -s
+
+MISSES=$(ccache -s | grep "cache miss" | grep -Po "\d+")
+echo "MISSES: $MISSES"
+
+if [ x"$MISSES" == x"0" ]; then
+ echo "No cache misses detected, skipping upload"
+ exit 0
+fi
+
+if [ "$IRON_TOKEN" ]; then
+
+ rm -rf $HOME/ccache.7z
+
+ tar cf - $HOME/.ccache \
+ "$TRAVIS_BUILD_DIR"/pandas/{index,algos,lib,tslib,parser,hashtable}.c \
+ "$TRAVIS_BUILD_DIR"/pandas/src/{sparse,testing}.c \
+ "$TRAVIS_BUILD_DIR"/pandas/msgpack.cpp \
+ | 7za a -si $HOME/ccache.7z
+
+ split -b 500000 -d $HOME/ccache.7z $HOME/ccache.
+
+ python ci/ironcache/put.py
+fi;
+
+exit 0 diff --git a/doc/README.rst b/doc/README.rst
new file mode 100644
index 00000000..1a105a7a
--- /dev/null
+++ b/doc/README.rst
@@ -0,0 +1,170 @@
+.. _contributing.docs:
+
+Contributing to the documentation
+=================================
+
+If you're not the developer type, contributing to the documentation is still
+of huge value. You don't even have to be an expert on
+*pandas* to do so! Something as simple as rewriting small passages for clarity
+as you reference the docs is a simple but effective way to contribute. The
+next person to read that passage will be in your debt!
+
+Actually, there are sections of the docs that are worse off by being written
+by experts. If something in the docs doesn't make sense to you, updating the
+relevant section after you figure it out is a simple way to ensure it will
+help the next person.
+
+.. contents:: Table of contents:
+ :local:
+
+
+About the pandas documentation
+------------------------------
+
+The documentation is written in **reStructuredText**, which is almost like writing
+in plain English, and built using `Sphinx `__. The
+Sphinx Documentation has an excellent `introduction to reST
+`__. Review the Sphinx docs to perform more
+complex changes to the documentation as well.
+
+Some other important things to know about the docs:
+
+- The pandas documentation consists of two parts: the docstrings in the code
+ itself and the docs in this folder ``pandas/doc/``.
+
+ The docstrings provide a clear explanation of the usage of the individual
+ functions, while the documentation in this folder consists of tutorial-like
+ overviews per topic together with some other information (whatsnew,
+ installation, etc).
+
+- The docstrings follow the **Numpy Docstring Standard** which is used widely
+ in the Scientific Python community. This standard specifies the format of
+ the different sections of the docstring. See `this document
+ `_
+ for a detailed explanation, or look at some of the existing functions and
+ follow their docstrings as a model; a minimal example is also sketched just
+ after this list.
+
+- The tutorials make heavy use of the `ipython directive
+ `_ sphinx extension.
+ This directive lets you put code in the documentation which will be run
+ during the doc build. For example:
+
+ ::
+
+ .. ipython:: python
+
+ x = 2
+ x**3
+
+ will be rendered as
+
+ ::
+
+ In [1]: x = 2
+
+ In [2]: x**3
+ Out[2]: 8
+
+ This means that almost all code examples in the docs are always run (and the
+ output saved) during the doc build. This way, they will always be up to date,
+ but it makes the doc building a bit more complex.
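+
+For reference, here is a minimal sketch of a docstring written in that
+standard; the ``add_one`` function and its behaviour are made up purely for
+illustration::
+
+    def add_one(x):
+        """
+        Add one to the input.
+
+        Parameters
+        ----------
+        x : int or float
+            The value to increment.
+
+        Returns
+        -------
+        int or float
+            The input plus one.
+
+        Examples
+        --------
+        >>> add_one(1)
+        2
+        """
+        return x + 1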
+
+
+How to build the pandas documentation
+-------------------------------------
+
+Requirements
+^^^^^^^^^^^^
+
+To build the pandas docs there are some extra requirements: you will need to
+have ``sphinx`` and ``ipython`` installed. `numpydoc
+`_ is used to parse the docstrings that
+follow the Numpy Docstring Standard (see above), but you don't need to install
+this because a local copy of ``numpydoc`` is included in the pandas source
+code.
+
+Furthermore, it is recommended to have all `optional dependencies
+`_
+installed. This is not strictly needed, but be aware that you will see some
+error messages if they are missing: because all the code in the documentation
+is executed during the doc build, the examples that use these optional
+dependencies will generate errors.
+Run ``pd.show_versions()`` to get an overview of the installed versions of all
+dependencies.
+
+.. warning::
+
+ Building the docs with Sphinx version 1.2 is broken. Use the
+ latest stable version (1.2.1) or the older 1.1.3.
+
+Building pandas
+^^^^^^^^^^^^^^^
+
+For a step-by-step overview on how to set up your environment and work with
+the pandas code and git, see `the developer pages
+`_.
+When you start to work on some docs, be sure to update your code to the latest
+development version ('master')::
+
+ git fetch upstream
+ git rebase upstream/master
+
+Often it will be necessary to rebuild the C extension after updating::
+
+ python setup.py build_ext --inplace
+
+Building the documentation
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+So how do you build the docs? Navigate to your local ``pandas/doc/``
+directory in the console and run::
+
+ python make.py html
+
+And then you can find the html output in the folder ``pandas/doc/build/html/``.
+
+The first time it will take quite a while, because it has to run all the code
+examples in the documentation and build all generated docstring pages.
+In subsequent invocations, sphinx will try to only build the pages that have
+been modified.
+
+If you want to do a full clean build, do::
+
+ python make.py clean
+ python make.py build
+
+
+Starting with 0.13.1 you can tell ``make.py`` to compile only a single section
+of the docs, greatly reducing the turn-around time for checking your changes.
+You will be prompted to delete unneeded `.rst` files, since the last committed
+version can always be restored from git.
+
+::
+
+ # omit autosummary and api section
+ python make.py clean
+ python make.py --no-api
+
+ # compile the docs with only a single
+ # section, that which is in indexing.rst
+ python make.py clean
+ python make.py --single indexing
+
+For comparison, a full doc build may take 10 minutes, a ``--no-api`` build
+may take 3 minutes, and a single section may take 15 seconds.
+
+Where to start?
+---------------
+
+There are a number of issues listed under `Docs
+`_
+and `Good as first PR
+`_
+where you could start out.
+
+Or maybe you have an idea of your own: while using pandas and looking for
+something in the documentation you thought 'this can be improved', so let's
+do something about that!
+
+Feel free to ask questions on the `mailing list
+`_ or submit an
+issue on Github. diff --git a/doc/_templates/autosummary/class.rst b/doc/_templates/autosummary/class.rst
new file mode 100644
index 00000000..a9c9bd2b
--- /dev/null
+++ b/doc/_templates/autosummary/class.rst
@@ -0,0 +1,33 @@
+{% extends "!autosummary/class.rst" %}
+
+{% block methods %}
+{% if methods %}
+
+..
+ HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_methods %} + {%- if not item.startswith('_') or item in ['__call__'] %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} + +{% block attributes %} +{% if attributes %} + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_attributes %} + {%- if not item.startswith('_') %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} diff --git a/doc/data/baseball.csv b/doc/data/baseball.csv new file mode 100644 index 00000000..aadbaced --- /dev/null +++ b/doc/data/baseball.csv @@ -0,0 +1,101 @@ +id,player,year,stint,team,lg,g,ab,r,h,X2b,X3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp +88641,womacto01,2006,2,CHN,NL,19,50,6,14,1,0,1,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0 +88643,schilcu01,2006,1,BOS,AL,31,2,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +88645,myersmi01,2006,1,NYA,AL,62,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +88649,helliri01,2006,1,MIL,NL,20,3,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0 +88650,johnsra05,2006,1,NYA,AL,33,6,0,1,0,0,0,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0 +88652,finlest01,2006,1,SFN,NL,139,426,66,105,21,12,6,40.0,7.0,0.0,46,55.0,2.0,2.0,3.0,4.0,6.0 +88653,gonzalu01,2006,1,ARI,NL,153,586,93,159,52,2,15,73.0,0.0,1.0,69,58.0,10.0,7.0,0.0,6.0,14.0 +88662,seleaa01,2006,1,LAN,NL,28,26,2,5,1,0,0,0.0,0.0,0.0,1,7.0,0.0,0.0,6.0,0.0,1.0 +89177,francju01,2007,2,ATL,NL,15,40,1,10,3,0,0,8.0,0.0,0.0,4,10.0,1.0,0.0,0.0,1.0,1.0 +89178,francju01,2007,1,NYN,NL,40,50,7,10,0,0,1,8.0,2.0,1.0,10,13.0,0.0,0.0,0.0,1.0,1.0 +89330,zaungr01,2007,1,TOR,AL,110,331,43,80,24,1,10,52.0,0.0,0.0,51,55.0,8.0,2.0,1.0,6.0,9.0 +89333,witasja01,2007,1,TBA,AL,3,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89334,williwo02,2007,1,HOU,NL,33,59,3,6,0,0,1,2.0,0.0,0.0,0,25.0,0.0,0.0,5.0,0.0,1.0 +89335,wickmbo01,2007,2,ARI,NL,8,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89336,wickmbo01,2007,1,ATL,NL,47,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89337,whitero02,2007,1,MIN,AL,38,109,8,19,4,0,4,20.0,0.0,0.0,6,19.0,0.0,3.0,0.0,1.0,2.0 +89338,whiteri01,2007,1,HOU,NL,20,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89339,wellsda01,2007,2,LAN,NL,7,15,2,4,1,0,0,1.0,0.0,0.0,0,6.0,0.0,0.0,0.0,0.0,0.0 +89340,wellsda01,2007,1,SDN,NL,22,38,1,4,0,0,0,0.0,0.0,0.0,0,12.0,0.0,0.0,4.0,0.0,0.0 +89341,weathda01,2007,1,CIN,NL,67,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89343,walketo04,2007,1,OAK,AL,18,48,5,13,1,0,0,4.0,0.0,0.0,2,4.0,0.0,0.0,0.0,2.0,2.0 +89345,wakefti01,2007,1,BOS,AL,1,2,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0 +89347,vizquom01,2007,1,SFN,NL,145,513,54,126,18,3,4,51.0,14.0,6.0,44,48.0,6.0,1.0,14.0,3.0,14.0 +89348,villoro01,2007,1,NYA,AL,6,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89352,valenjo03,2007,1,NYN,NL,51,166,18,40,11,1,3,18.0,2.0,1.0,15,28.0,4.0,0.0,1.0,1.0,5.0 +89354,trachst01,2007,2,CHN,NL,4,7,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89355,trachst01,2007,1,BAL,AL,3,5,0,0,0,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0 +89359,timlimi01,2007,1,BOS,AL,4,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89360,thomeji01,2007,1,CHA,AL,130,432,79,119,19,0,35,96.0,0.0,1.0,95,134.0,11.0,6.0,0.0,3.0,10.0 
+89361,thomafr04,2007,1,TOR,AL,155,531,63,147,30,0,26,95.0,0.0,0.0,81,94.0,3.0,7.0,0.0,5.0,14.0 +89363,tavarju01,2007,1,BOS,AL,2,4,0,1,0,0,0,0.0,0.0,0.0,1,3.0,0.0,0.0,0.0,0.0,0.0 +89365,sweenma01,2007,2,LAN,NL,30,33,2,9,1,0,0,3.0,0.0,0.0,1,11.0,0.0,0.0,0.0,0.0,0.0 +89366,sweenma01,2007,1,SFN,NL,76,90,18,23,8,0,2,10.0,2.0,0.0,13,18.0,0.0,3.0,1.0,0.0,0.0 +89367,suppaje01,2007,1,MIL,NL,33,61,4,8,0,0,0,2.0,0.0,0.0,3,16.0,0.0,0.0,11.0,0.0,2.0 +89368,stinnke01,2007,1,SLN,NL,26,82,7,13,3,0,1,5.0,0.0,0.0,5,22.0,2.0,0.0,0.0,0.0,2.0 +89370,stantmi02,2007,1,CIN,NL,67,2,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89371,stairma01,2007,1,TOR,AL,125,357,58,103,28,1,21,64.0,2.0,1.0,44,66.0,5.0,2.0,0.0,2.0,7.0 +89372,sprinru01,2007,1,SLN,NL,72,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89374,sosasa01,2007,1,TEX,AL,114,412,53,104,24,1,21,92.0,0.0,0.0,34,112.0,3.0,3.0,0.0,5.0,11.0 +89375,smoltjo01,2007,1,ATL,NL,30,54,1,5,1,0,0,2.0,0.0,0.0,1,19.0,0.0,0.0,13.0,0.0,0.0 +89378,sheffga01,2007,1,DET,AL,133,494,107,131,20,1,25,75.0,22.0,5.0,84,71.0,2.0,9.0,0.0,6.0,10.0 +89381,seleaa01,2007,1,NYN,NL,31,4,0,0,0,0,0,0.0,0.0,0.0,1,1.0,0.0,0.0,1.0,0.0,0.0 +89382,seaneru01,2007,1,LAN,NL,68,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89383,schmija01,2007,1,LAN,NL,6,7,1,1,0,0,1,1.0,0.0,0.0,0,4.0,0.0,0.0,1.0,0.0,0.0 +89384,schilcu01,2007,1,BOS,AL,1,2,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89385,sandere02,2007,1,KCA,AL,24,73,12,23,7,0,2,11.0,0.0,1.0,11,15.0,0.0,1.0,0.0,0.0,2.0 +89388,rogerke01,2007,1,DET,AL,1,2,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89389,rodriiv01,2007,1,DET,AL,129,502,50,141,31,3,11,63.0,2.0,2.0,9,96.0,1.0,1.0,1.0,2.0,16.0 +89396,ramirma02,2007,1,BOS,AL,133,483,84,143,33,1,20,88.0,0.0,0.0,71,92.0,13.0,7.0,0.0,8.0,21.0 +89398,piazzmi01,2007,1,OAK,AL,83,309,33,85,17,1,8,44.0,0.0,0.0,18,61.0,0.0,0.0,0.0,2.0,9.0 +89400,perezne01,2007,1,DET,AL,33,64,5,11,3,0,1,6.0,0.0,0.0,4,8.0,0.0,0.0,3.0,0.0,2.0 +89402,parkch01,2007,1,NYN,NL,1,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89406,oliveda02,2007,1,LAA,AL,5,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89410,myersmi01,2007,1,NYA,AL,6,1,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89411,mussimi01,2007,1,NYA,AL,2,2,0,0,0,0,0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0 +89412,moyerja01,2007,1,PHI,NL,33,73,4,9,2,0,0,2.0,0.0,0.0,2,26.0,0.0,0.0,8.0,0.0,1.0 +89420,mesajo01,2007,1,PHI,NL,38,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89421,martipe02,2007,1,NYN,NL,5,9,1,1,1,0,0,0.0,0.0,0.0,0,6.0,0.0,0.0,2.0,0.0,0.0 +89425,maddugr01,2007,1,SDN,NL,33,62,2,9,2,0,0,0.0,1.0,0.0,1,19.0,0.0,0.0,9.0,0.0,2.0 +89426,mabryjo01,2007,1,COL,NL,28,34,4,4,1,0,1,5.0,0.0,0.0,5,10.0,0.0,0.0,0.0,0.0,1.0 +89429,loftoke01,2007,2,CLE,AL,52,173,24,49,9,3,0,15.0,2.0,3.0,17,23.0,0.0,0.0,4.0,2.0,1.0 +89430,loftoke01,2007,1,TEX,AL,84,317,62,96,16,3,7,23.0,21.0,4.0,39,28.0,1.0,2.0,2.0,3.0,5.0 +89431,loaizes01,2007,1,LAN,NL,5,7,0,1,0,0,0,2.0,0.0,0.0,0,2.0,0.0,0.0,2.0,0.0,1.0 +89438,kleskry01,2007,1,SFN,NL,116,362,51,94,27,3,6,44.0,5.0,1.0,46,68.0,2.0,1.0,1.0,1.0,14.0 +89439,kentje01,2007,1,LAN,NL,136,494,78,149,36,1,20,79.0,1.0,3.0,57,61.0,4.0,5.0,0.0,6.0,17.0 +89442,jonesto02,2007,1,DET,AL,5,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89445,johnsra05,2007,1,ARI,NL,10,15,0,1,0,0,0,0.0,0.0,0.0,1,7.0,0.0,0.0,2.0,0.0,0.0 +89450,hoffmtr01,2007,1,SDN,NL,60,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89451,hernaro01,2007,2,LAN,NL,22,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 
+89452,hernaro01,2007,1,CLE,AL,2,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89460,guarded01,2007,1,CIN,NL,15,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89462,griffke02,2007,1,CIN,NL,144,528,78,146,24,1,30,93.0,6.0,1.0,85,99.0,14.0,1.0,0.0,9.0,14.0 +89463,greensh01,2007,1,NYN,NL,130,446,62,130,30,1,10,46.0,11.0,1.0,37,62.0,4.0,5.0,1.0,1.0,14.0 +89464,graffto01,2007,1,MIL,NL,86,231,34,55,8,0,9,30.0,0.0,1.0,24,44.0,6.0,3.0,0.0,2.0,7.0 +89465,gordoto01,2007,1,PHI,NL,44,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89466,gonzalu01,2007,1,LAN,NL,139,464,70,129,23,2,15,68.0,6.0,2.0,56,56.0,4.0,4.0,0.0,2.0,11.0 +89467,gomezch02,2007,2,CLE,AL,19,53,4,15,2,0,0,5.0,0.0,0.0,0,6.0,0.0,0.0,1.0,1.0,1.0 +89468,gomezch02,2007,1,BAL,AL,73,169,17,51,10,1,1,16.0,1.0,2.0,10,20.0,1.0,0.0,5.0,1.0,5.0 +89469,glavito02,2007,1,NYN,NL,33,56,3,12,1,0,0,4.0,0.0,0.0,6,5.0,0.0,0.0,12.0,1.0,0.0 +89473,floydcl01,2007,1,CHN,NL,108,282,40,80,10,1,9,45.0,0.0,0.0,35,47.0,5.0,5.0,0.0,0.0,6.0 +89474,finlest01,2007,1,COL,NL,43,94,9,17,3,0,1,2.0,0.0,0.0,8,4.0,1.0,0.0,0.0,0.0,2.0 +89480,embreal01,2007,1,OAK,AL,4,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89481,edmonji01,2007,1,SLN,NL,117,365,39,92,15,2,12,53.0,0.0,2.0,41,75.0,2.0,0.0,2.0,3.0,9.0 +89482,easleda01,2007,1,NYN,NL,76,193,24,54,6,0,10,26.0,0.0,1.0,19,35.0,1.0,5.0,0.0,1.0,2.0 +89489,delgaca01,2007,1,NYN,NL,139,538,71,139,30,0,24,87.0,4.0,0.0,52,118.0,8.0,11.0,0.0,6.0,12.0 +89493,cormirh01,2007,1,CIN,NL,6,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89494,coninje01,2007,2,NYN,NL,21,41,2,8,2,0,0,5.0,0.0,0.0,7,8.0,2.0,0.0,1.0,1.0,1.0 +89495,coninje01,2007,1,CIN,NL,80,215,23,57,11,1,6,32.0,4.0,0.0,20,28.0,0.0,0.0,1.0,6.0,4.0 +89497,clemero02,2007,1,NYA,AL,2,2,0,1,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89498,claytro01,2007,2,BOS,AL,8,6,1,0,0,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,2.0 +89499,claytro01,2007,1,TOR,AL,69,189,23,48,14,0,1,12.0,2.0,1.0,14,50.0,0.0,1.0,3.0,3.0,8.0 +89501,cirilje01,2007,2,ARI,NL,28,40,6,8,4,0,0,6.0,0.0,0.0,4,6.0,0.0,0.0,0.0,0.0,1.0 +89502,cirilje01,2007,1,MIN,AL,50,153,18,40,9,2,2,21.0,2.0,0.0,15,13.0,0.0,1.0,3.0,2.0,9.0 +89521,bondsba01,2007,1,SFN,NL,126,340,75,94,14,0,28,66.0,5.0,0.0,132,54.0,43.0,3.0,0.0,2.0,13.0 +89523,biggicr01,2007,1,HOU,NL,141,517,68,130,31,3,10,50.0,4.0,3.0,23,112.0,0.0,3.0,7.0,5.0,5.0 +89525,benitar01,2007,2,FLO,NL,34,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89526,benitar01,2007,1,SFN,NL,19,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,16,3,3,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0 +89533,aloumo01,2007,1,NYN,NL,87,328,51,112,19,1,13,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0 +89534,alomasa02,2007,1,NYN,NL,8,22,1,3,1,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0 diff --git a/doc/data/fx_prices b/doc/data/fx_prices new file mode 100644 index 0000000000000000000000000000000000000000..38cadf26909a37e119c1c1cf93a7de94a9020d26 GIT binary patch literal 16177 zcmeI3hgVZc`}h;Oh>8%BP(`qS1;w(;T4q2673^X`QK^C=5(#$23Tr{6H$e~;v7@ZL zqSzbQ6$^F+k&*yu@O$pe?7rvy{R!XK>^Xim$-Q@;dFJUktkVjJ@Cy(0i*yQDyd=me zc!}SlAj6Sxvl=)hBhNkt_{n*9K|HXVi@}?8;OR15zdgXw z3$#BU0&bOrS^N#OG+dB#3FzDL@#CAon!-oZ%Ye~t4_?0l+IMd+{Rk|++_t$I*my(Q zyAc>WcS%tz@OJR)F9y0io=@wG1t!3%v}ZMzz}~x2m#)A|gN7Y&0`k!QH-mr)&-49U zfp!sjpT`1kOs!lq1vvfZTfQ$aL*_9k7}z6j)cM80u;in)%Ym9+hou{UJ4y?iw*epB zGjU1*`UM%E$N}but@v619H6^wVG*!2Tl(u9Fmyw4>@}d$;zZl~z@c`QuFrt?&xuRl z0XvkeSAGNL`b4j-2Rb*b)=~o_m7fRc>hX9wNgvO*2fAl^)ro-5`;Cq23=FyJq38je z#QdN`Rl(=o(W7e|N6`yXn*39PZ%)h?6{p#z{W|Q 
zf@t92nZ;xN0^Y7Gx}N~_*?MeJ7O*A!*Y|v2y#CKcM}TGRr{$jl`WH;;eg(Mq=!Mz0 zfdf6xy)Osecy~Im0%-8nHuEzuBihF17w~mV?389;Sf|a;`TC&G(;H^B1?KH4X)p!$ z8+c`zHBdU#@klqI?$d70eSsbo@xd;@7t**NZotrX+rqtohXk^GZ{W})b=~Iz*B1Nu zh5$Q^n^7qN=6?1LS_K?=Cq=Un_|!jXeH>7sE=eOM z0|%Dxyg3W_^3OsiuGeD2|!Q)DRFIof2?gCeT$`5@4Y_}%w=NsUJO@{Vgf$yyh z3TlCNT_hC>;K>W&^R(N5KE1;}83AKvKQ$KuMe_3OPQaP!A&(t^KayRh_XDo#J?Qli zU?-Dhkt2b}4@#TH0qcw2^qLObIQI4NdBD<1Bfo?Kr~PnU5DEOc0oAMqI&M8+9|JsR zx4$4BsCMk|CLI`8V&u06Xfw<3^C4izvbT4OfqOzLCR_r3iWs%|CNQeah&E-w)5eE~ zzXFafEG+p5eCb-?{u5|9#&~rj@Ni{29?uZs&bOrl+W@22OV64BjV+c8vjP^k6g}+% z9F=x#rV}tWM;t#0C>|srr-`-R4&w%wqi%s7H2aL2H^Bq`PVR646I4w#%Nexuaobg?!EswXY-g{Ad z;C;oVJtE+o#0%Xz19LOF&h7zp?q~nLKd|JEZQwAVPWhG$cVNnbSnG+v=aSmVGl3z7 zHP8Hkiak|6VZdtf71t=>nxc!PYk`-~bW_CwCr;`Tod8Un5m%fEw6EJbcrUPc%U>6c z02_DK)tv&yI{u2f0=(UK#-ZClpWEJj9s{fVrktz*#=9qZeg@jU+xf5t=>J8b(+q4_ z+w>P7GSa}9d6x`lLmWx1FsnczMKQp72Izh0!%%8Cs6`?akBrdmB3IhXRl2_0q4eXhrE1v{hlpEe-7O-V-*wH{>d|7C}aA4Vs^7+ev{*p&k>wtTg zyL8?P9N2Ntz9is{!U2xizy&*`SN8!`-Ik0x1{~(}dgU45gBveet^$o8xDL1r%nL?m zo&fu`4juXyDAn69`3ltCoiDEidh9pqsRX_lZg@<)J?Qhfe*bpBb{9PEngMgBj+)R3 z_X8WRsGSz^A>8-Nyq(16Hh>4m@ygY3n@TkH_x@EC8+w zdvhidXr%jk=o;Yh?-oxt16?2b_S1ZHwOw0uDP{EzG9I!7HB+EY&jK} zS7)}{7dYyf_n=_lv+ybB7X!shFVrpv7Pdbp-2kjIvp={ExcaEAQwp%1hs}u`;Dqt9 zUkiZmzie7q1hlL9nR^a+GN!7_b>PTO-w)gee$#ezeFj{yt6S+i;Q9P`sR^IiGUjgGom^JOD(S+>;arse9375@T=E_6T^Vp zoYiC9fmU}E4<-QTl=8O?*(pM8Gqpja7}GM{wZMlEq>jv04MIeGy68M(&5(oa-e-*XJG}f z_?A=VXW%HmUN*mgu^}l_nt@_|(sO=Ch&u*qqqe}RT*XcSaJ5U5zy@e*Gw4V+V9}`o z&V7Lm7qf$1fSaae{cr=`@YS*P0#0w_@AU?L+tJ#6F0e;xh;In+yi;(c1gIJKP`C=X zygGl7xR%T z&4G7{TwCmbMrRN9?*+`Bbl~(rpwo|iLl7{cuDxV5kiXSPJ_$G?A*#nL;1kElqk+Jn zl8Aocz!`%_nJ)u=f8>_E4%qX>q4F&zQg5A+u2V-zO#f+-{%KGa2M0$6Nh1x%Ee@B4 zxCPA*4L6hspzAVQBM~}y>(8a%OGE*ItcNk=8kxYLFsYxRL=4Rc&yxpDlexFf!exPx z5{ot7{BTcxl>hkH82=z&i6yi|q;)fdevq_m603i^A8XQzLH`rO^*_P?^glMF0sH^^ zAFkuc?aOsM+^aeIH-GElw;lZUgWqxBcO3W~2mTWWQcoC34sWYRX!@+8%hh$r?P2iK zuM6tXPUmI4J9nx>1LvF@W29AwR@Wb&xG=L81rB*PAug>J^^6t1e9%;jR^D{{Gx2UM zDvjrEs6#JmWn*W?)S@J@&b0QuYthSLD`p%R2>1C`xN9%}g-&wIy{ouYp>6vh! 
zR3W<^Q4-5`)u_9(!C@<(YE)l7&fj>(PvoB5s5dvg2Bn9aY`k*k7b0)Hexbs>_w$5wdlYN&p}&U>yZ5Q^YxMLb?m-6 z#7Q%4O+6Zx<-PmRk$QA{>80Hked>{s7Iy-5XhS5oay|OG-Qj1uOZCW}D54(StGRv6 z?MMTvbd_*CDiT!y0I6q3YplsEt6 zkI27esP)7uupAlMpLHEH-+;!KCCqLa(STNb?*7}#`^c!7=g831kNZF(Ir`g7b+_iW40Yzr;-?HPUateD zEN9or(e|?o+bDA7=(K+{n6ezr2k+?FC`ZFNv%W89=hlckLCeod8XaOjaIj2{$`^BY$k36{x00=3C)`u~qh+Wm>iw>c`($Vg z=iZGV4}AJ^SR*24{6da`h@r{Rp5j5xC4q88ZhI+5B)o(-qAjlT;0T)#1QBrEMnu9# z+a~lpKVq2S^F|~k2iM5_M~>RP?+qH1BQOq#WO76UmmHDX3uNf4HD{=DM9j6b97T}; zC1?I6L)@t>cQnS67G^HnbdUX6V?3U7BdIZ7%Nd4?F(&?> zZj2)y{8j!o*BEnO7aMfM`H6q>$6fD;iGLsy9J;u6^X+64Ow7K*6w^3pim4t2_;UMQ z{rAYsFtN*AA^y{T?5d4@M3~t3PZ18h=j(YZR){;^Ja;B8Mu@3>3-KJ!Tl&LZ3$ZyD z2q%efcs~Dvu89~Itl$E(7`txxa5G&c#w6Z}#rR+y1gwb`SZ{sGh;3n(m=pKbDND9r zEBqO}c=2v4oD%d`fNzBrKFPW7LQ72aF0#bMGr34+iDP^oU-&4n!o(1o^`gdH1OOyb_dPPl>#b5m{c0?ur%+hQ*2Z0Xw>lQ^);77wxj zOXu5Sa9+3JqiylTxU_JSkG9y8yWp8EPUg&ab!YngQ=PFd=O|fr_*x(sR=FL1#f60+ zJ8WhRnL@{&J}1N;Lr~Kx?b8K6gJ^KRdl&o;qEF!mJDkI%hbB8r?CPi;Uf8oS?aj%~ zR4(JrnEFX)O!AGbEhheu-wBiWe9;Dz`25ov6aUMz!uqvBh>(_;#J9&*n8d39D=OzI zE9^ELPWBvp&)NA03!Dn4@%6j~_P%-mGKK~2lGeI&?J5gQ{n!E@>-uS9&V4b~;R5|Q zFgZ3`rX--5e7$oobHDxpY6)94oo> zJ;ef(IRDB5lk~dRocWUlCUwVj3!J;B24oj*#Cfar*Vl8 zb6Ge~S47XbK#2Q-8r*|~kc+Q9?zvBh!P_A6m|?D(2{$prq`uHM!@sz+87jcE?h)XJ z5cT%W63}y5C&1vvQ?EJ*@O8NAZJik==}t$4p@=9nohZUnQakU6au#FGUG;~F@m3Hw z<%9^&=EAm-2={<$DcV4Udx{|q*0=S^{@)b1Xe;wlhF)mFgtE1-Tl1kRr;hZhR)UC4S37X_H)vr__0 z(zK6&#wk|;Ci#D}0F!dEP=KLWgA6Rdr2c9a;9Xq#a{$hd)B!>aUQ?D(0e;rVB}x${ zbwi5~ZyyT*4&pdhbxj!{#-#iOzbAF5wwT6U&@Y#j#`PEB2rm7WiSR2+PD5f$(uS89 z-&h2WuujbEQjEFiaqyuCll*VQ;)9i#?x#_N{kQ;pTZlz%UWP9j1bn#g=6rR&{G3`%va0YbG9aidMYEOEYwCSt$ zFez74b@B57zXp%GsYC15-nw`>*QTj;F_~li(8X=YKu8aVkb*;>)z=2Bk1?RnQ|seR zTzY(JfQcBsZE!Vr919Iu9b|~PZua6;13ZmO?C%U{-q~P)p*k8of22Mp{p(CUnwN_8 zum?%#x_BQq{<)@uNjo6cp>_3oZA|*#0xeb#Yq5S$i}f#j+AqE3)4cZwpZ1Z_JX&8( z<*{?%;R8^PU3cbTG7fssidsf+g8~hk?`TlM&(&}O8a8KBBf6j(k^03#gJ_?qL0q{u z;g|+-ZN)Kd4I<-|hiU{}8C0-J){iUE#mQWOt3=!s>P;6Fn`bE5+)2siKPvPQ(&j8% zHKOxH4T9kc2-1S?aOL$|4I+K0u?EpTNyX$*BPkiAX%MNqvowg*x1TkL^p`(0h>St( zThU`~Y*g6F<_az73QQAwJ!wJEg+c+ADKK(?fjRk69Eip@#Yi1_7BHS&V#-@MN%l+Br$NQD}p4WFy8Vsj1!o5Ly)H?6wn zreJzeAaXn@N<{rf$>x{}#7+6;I4amYtqD1D<4uPqOi1dHK3O1)x zAm@x8(3vRM+*652{F$LdwC`82Igx_Ro0RA;XcN~*st_5ITB#7JOB)m@k{q~_ox6(J zp$c(r{_#o`B7MPi6(ap?I~C&E$$h<4$dUv=CE}*GnFdNk`b2?(y;mR-7H>Br8V{RU zJZ)xrS0FM6Sk=tt$jxjX)rh#ZFUY(RanbBdTqB}$jYiagG=hz6e%*vfT|T>s&HWnL z9JLV@ac!JY6Pr^vp{Lw9ZFv(S^M$=lh|XP_+4(jzy)+{lUz-sLqi34f+_4#vK2g$) ztVmqSXKT9i7{%+1ypb=DQk1>vA=V2Wn;~YQ(hxUDDKS{;XnieI z7{h1%9;{2|qkNXf{_odGG++PT=Wji58u;I{12Qk}x&buyJY&kOA3`QBZYonl0w%pN~MfWrO$6jrSiQ>q5EH%f=QZsmqOQ@mP*$z zN~O>Hl}hLCS!r}WC{JViOUImeD-AQK{O8lDeDgD?Tx~Nc4`$MNPMb`s*TEU|c^Mg0 zkLxq3-d1JOeaz0Ha+GD!=MBlG*B{EJ*I8v#c|T-R`66?ue!uOe9F|MhdzXXn--8!_ zH+^nw4n2<|cAtAT{rn>1XSQx|7JbeQCbv3+>giEBz1}&UKG!#$UiTr5>RXmZ^fXIhpFyJ(=3!{A7Aw{gUZ^dnVI$5|gPMU)g!{Qs_CY zN~X{CNv8Xrl|-NSfZ3aN5n)Q()zsUF^>)9ZQZ)L+}D(e*Z^(RFSyyX}xc zuiwP%TATUtKYltpldcz+N%u3I`LkgrT{nbX|0A9H>-%)7m(X;24rf?ASjGr`L+bF$ z>GXLq8B`xm8Fb&1GpJnzFuyyWNxy%bLG5Zc^TTc#biY;^bbkXg==ljVsGdKhQ~i8p z@#z}d?_X@6ARpPowKqviH&K`4vpQ zuuQ7wm6>#3lQOB@!F$plcx2J%3wBYx4b7tV^@&}-DUfOUzA1dZaaHFgq`0MrZ4SGy00fp-{zTAAN(w;*E@`(cG2f7-9?|dSVW^ucMy}!od@!Z|?KCN7O-HBYf-zW6*3hP`V-@>!G zRL?DK9|ZlI*;t{%lSMiUwj@p2m4#> zzKmSD&(rLF{X8OM$m@TvUi`gpP6PjY=Pvd4^ObZ8(F^&{pPbv$KRFLE agg+tg>=0rU(k`TZ2>X+9zgT~Nr~e1LGyn$x literal 0 HcmV?d00001 diff --git a/doc/data/iris.data b/doc/data/iris.data new file mode 100644 index 00000000..c19b9c36 --- /dev/null +++ b/doc/data/iris.data @@ -0,0 +1,151 @@ 
+SepalLength,SepalWidth,PetalLength,PetalWidth,Name +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica 
+6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/doc/data/mindex_ex.csv b/doc/data/mindex_ex.csv new file mode 100644 index 00000000..935ff936 --- /dev/null +++ b/doc/data/mindex_ex.csv @@ -0,0 +1,16 @@ +year,indiv,zit,xit +1977,"A",1.2,.6 +1977,"B",1.5,.5 +1977,"C",1.7,.8 +1978,"A",.2,.06 +1978,"B",.7,.2 +1978,"C",.8,.3 +1978,"D",.9,.5 +1978,"E",1.4,.9 +1979,"C",.2,.15 +1979,"D",.14,.05 +1979,"E",.5,.15 +1979,"F",1.2,.5 +1979,"G",3.4,1.9 +1979,"H",5.4,2.7 +1979,"I",6.4,1.2 diff --git a/doc/data/test.xls b/doc/data/test.xls new file mode 100644 index 0000000000000000000000000000000000000000..db0f9dec7d5e42c87dc0b0d297b66305f7af4225 GIT binary patch literal 30720 zcmeHQ4RloHnf_)n2>~nwMF`qz3{voigalBALcoA2M2YEup$?c3C&Q3V$Y5q@z`-Wn z)-K)EmTp6pny%Q>?%H&N@w@kafA9Cc-}`&-dmWzn*|Zan{?lbARmxGOveZCgqRMf!yKv84dw}InSwGE1tOrP zpY9scyJbi=s)%Y)VRaX-5nM-mT&&zK%B{g6lljDO zr-fOzTCaAh?f9z)9=p|A)vUH^oY)dmjks?f0q8uePg1Bq#9@S?gI)7*ca}nrBU`P; z3IUBn5G%z-iO&+OZ)neSeZ8=?{}Xee#W~q(w(sbKJmpq7*vyl-KH>Qx&Pu(AvRUn* z?r#i63)Tg0X>4u|?k@OaALSZ{dKYn64i39e??lOiMPVEGj`Jn@{EvFn4`eKQs@3lj zyP~n%sQ>g(Z@E_LaS!NoA2jg# z2V50LN|$^1Sy{u+8g{LNSYcQ+Jm3W}B$6p;c+~QtT~~PC4=IB^F1)&M_Vu%8FQ~ib zL!0NB%I0gX@LUCp58G^-(7(wSY`8OdP$3qB@iv2s79_(#ouKBzX%wc8N`Yn#y%BaD zmb;FqTw7o;v#Oit>4weMe8_4joL>mM>gKJi!zxAU8vJ3ZbKbQD&Pmr6B+Xa>=K(k8 zTudlPUQeUuSaQV*d(j)~+Ak_~I#B?Qnd3+S+sQ-S{#It~ zl)iN1hifO)bmK4Nc9$w3dDbEOoDJUTedteZzh;>B>xXH-Hf=j2rMGDNQ~1azDqVXv z;;MA`Fm}@T)1~R-NWmA~9+lUgmEd(+dyc+d-0soWOWG&s>-6@C`g&=5uD+^vrLQyE zvlD*<8!ncR?&(*#IqJ&qFj70F&c2zg-pKv5Qr~)2?c-_7!CgVlpDq6nN&YJHsO@(=&r=!y*py+HQ&yI zEVw^-^YcyncRky*{~_FaHtsw~)EJJx)pW=H{5^@oYBvgwZ{Pi>O99Sf_r_E|;M@{@ z)P?(lgzdfuV{oM_KGCtj!!^0K2lL9B@7UkIcIUzFJr88N@>JgR+v{id1LpK^T((br z^MfAFWr)OU0D%<`J_0=a2>PF&Sxfi3v;G6Pf@aeczgl8+4so)9Ya6GZXd?tqDcQ^n7{X7>DPOLMee=) z16q^Smbo78huMEl(UCdkX$Fu$9uN* zqyMFpe(rzn!O7VV{j%|D>eXKc?$x+(ZO$iOh&S;gzfW8Jg!xevBQG4g_vf=*Tc%HU z-G_U@9s3Kj#If5ROC087xrFa|N#XlsH_f{ol$o3HO#S(Jd~|2XdlAkJczoymG@OZ7 zt~&dT*MGk5uId$ay8T++ew{x5g>Grr(kC$IXpQ%{K8c}YC?#tN!$FTZu|Dui#v#W(HP(G)B$UcF&;LG`U`*Ka5QjN;;(s|pJWx5Z*RmKGPc zwzd}4b5v0vvYi8?#nmm59l@5^?ppzDJ|+}3#2N~*j6hF92dxc_fmqq3iOa&l-DLn_ z0o&2i*c>aWt`7w(A}!nNal39!q^0qmNOP>d$rp?+E9Q7iv!mOK{m#M+iz-Vso!k_t!EV|VWL557#Qa!i$H*M zQ&~yrf(6Tp^(`Tfh*LCH9}8AA)rSUg!gy?BFcfSl0~B+sQ7kq%E-N;OFEefD=EZF- zjX`iC2Dvhe4HBJp_W8hxG(>U7&9;>!@8DQr#ri;f6NLG~VM7W*wknK4J;3V;NcIJ0 
zkQk7%}sdi9CRjCbUg8=uL0QZ)h<9AbqMw+3UeAfgXD zvM}>g;~mROrgP-BA4>7$69?TkRvh%@j#dlP4ULgdz25QjcejV3!LirP_4Av?z;>rH zHYwt}so`Tgqp@HE8-2k*?0l5`cSax?<gTLg++MvWR68nTL=Jp zl=p_@0NSY|@4XskxNR_t7r0=gQaDsHH23B&OyQoz)#2JpO1%pfAPq2hl%|yl3`%Nu z*v+vZ6WE-!aG-eRJ+JNcP+=d5FmAl%A;+u{xADKdx&DH*;u%pmR9WAJ8NYF1TD!gY z#v9HL6?IAicJIQm+v!CimBgX4AcnhSNfP^v-A>yLULAQLQi8%Q~x*4r|mJ_PU>Q&(rCW22@-sD~oAIS6c%2CW*pf zYFnfofKaLYC%+;lCM-Q?wZplb7=OQ0c=DKyx z2cO?~WNn<;#!6}%Gck4>GhTT*p>4OxTqf)6wuiDgE<34B<}zh!k9$Oqj#(8*V#(SjG_I$$8CfYd`nKVM66D%v4(}sLWAKRsTEX-k+~} z>NO43jS*8-;CStS)x7`f8ipGfO;w+mdUs*@p{J}CLcMfv$4k#F`i~WaN}Ok^b_;5j zhWgoEjp>8BEgTA|NO%5f9db z{2L9Gyqu|ySWvSy)bH(m?d${Bzo7BR)=(c-FD%-$_C*aNTkk;C4NrzD{{4(KhEN~* z)UUU{y65{EDy3njDkuYy!(G=M+AtkWVr)$(iz#=iTM_#JOTTXR0%@A9tp3$8)CXQ^AZ# ztQSMckM|?_%5<+9Z@@t#- z?28)1i!rVg<2lpR%r7;@daxb~FBGcR4U=!QP4h@5HPvqq(q=!4r~T|Ky}7sS^iY?P zlu@%Wv%JQzG&uHp%)V|nuPE?#Z9HcxpB9pBCb$~UnL6bn^~JQU3S(YLZG({JvsYX4 zQz@h@BxlpdbEX&Wi}-ZO;Pb(Vr%xkYNMF8WMBz|neHUhYJZCy4f~%o+JH04mYPi!U zN9U(+9?zND|3k;0B(xr$`{OxN>uJTAiA=*I6YW@9vq67BJf1WCljKa9b75{$a-4Gi z$I6-RdsjJA=3AL^vh%1WJId@AkIeI^94VPo<`SjGj2ZVqYRthgcPBMw%9wMrGo>q& zGNsP^t7J-@`B%x5I`gmiUy}jRnSYf`>6WBSsWbm7nNnx|HET%zRWhZ{{HtV2+mkY- z&it!nN}c&vkA~{Zzj}t`UnNs|Fey{&%)d&e)R}*kOsO;f3NB&?ocUMDlpe95N~YAA zf0azBGyj@1B>yUzQfK~EGNmT}Dw$Ge{#7!i&it!nN}c)F2}AO)ltVlJnmZ)_nmZ)_ znmZ)_nyaDO`PUih6A*%f$iJq^!0OK#R$|q2XJGXpU<8dGl7W@J@$(qi0WydmT~P7) zJ6TjJUdN9=j~y7m=S*3XgO6LGj8?flG$+)n-NGUB|h zG0+l;Mz+QZZr&9LHWlb~w1zCHSX<5yzI1z$ZZDGGSs~o>UXjWf4x%lpS&=h=$ij)P$$o-QRs@cq42i=+fm5huc46NPoPi^et^PXn0X4t$#*a+ z>Bq%T)$%Z0JAb&u=F7VJ0N!OpqT7T+=v#^M!i;+#Jihp-i}&sS+@^nbPUK3d52Fp= z^=U3vuU)m_D*08QN+3SXow!IP2GCF9Sa`69#}y!ZQoE}ipA+JP!Y}hnz)$K9YlJLd zX5wd1AZawHdD)X5JSzinHqhky5fzE|M>|!>f4a5X744L3cvH0V_pROTXlEcl633k@ zvL_tph`_A6eAYSAj0i174f8UnCy!IojFU-P&Cl zD=&?7R_!mF7D3%w!Ce_GFAaDJ(O#o!e`Sm<)*3F~p~#-M94q(^h5Y`0dxg8K75sgR zp9%TPODkhs&#X|F-M4zpsPy*9aEw*6y0)t-ZiU@bJWYE$t61#hJuMI}P!mYbf!Dxoo`G6Ku?foWNSUfrbFbj@Uo|Y7l|eZi@NKgZoBt z1#qd)1UG$ciJGpEw>&T2>+h+GmgfcXtD?1iZ3*7Td&!0JJbzE6FATK<+ttflvG98ABp=q0{ImsVSiP$q9o+4D2c?EpAv$x+RWO%@;u_t_DUal?Cap1iW0)8 z>8kXFyliD3=>mr?ZSAfo@pZH%!hX)GD1nW@m_Rnl2%d2#RP|`bZ@EA#~U%B1Ixal>9w5k?~$a zps3b#Rg@6vnyyHkCJ*)n#6>%aE{U!vA(6muMj*eok2APAT!ZUYl#s-#s9;#0$JHxJ zNG4`vh5dvH(*E1xX>Gs1-{=DOnrIR&Y9NUYd5Jp7l>2yZB+j--ob5s`wSxSWks+`2 z*L0D{ke5jEjwKSOoN=8%zQ0H2&;r!<5njAE4%`470anGZ)mBWp`JS0h#wfVKpw zq4Ws`yeCDxhP-5$*%4}#j3SmIdxlwo{6N0Y5cU(#crPVZ=2O&ax@0Xfk*yF%KxR$) zM8e~})Ee#$@#7wFXKMQ>gv6@0PxhQ6K#j-X3{wBcVt%xfo)Q;^fZa$EjvSO7@y)&=*BJ zc_p_gSJ)s(EjNK%K~<#XA_oYA$R?+DcSH=xoLqwN^iZIjq z(I{}9ST2szM2OhyAvMzjLQWooMYQ-o{N)y^RZ_1d1LA zog72QW84(irCf7w$sfumXR}5+|F10pDG&SUb*ZL=$GN0coSlJ0w*{^t@S>D(4aArd zU}xmKPFei-_xXW`@E_-yB5{;9Vw91jXLIjG(zS+6p&23-wA4a_P;AS&Xf-W2b-*dv zq7SqK)Bx7FFT_mrfxAcj@pTYW1`Z4gs-ohoMDpn!WOt;MxS3GRjo5CzAs7N-xiAJv z1ZcY}Bk)69*d!VqLoON0Ie=K(H&N7twmyUhL8`xTU}6d?N^1MOPm6#vv?Fo!7_`>J zK;mPKT^3u(f&pJgHlIis3@(1WDvJ69;zkoFY9pk|+L~0uTgXOFI4-;uQfX`H7^pd! 
zyDwRWo|$vFAtamH%sW~H_Bn}-cJir$PKt;M_a%5{*c0}14nuvzTei|m$P8k~5uEMs zS^PM+LLTLvXxvWaS4e`Gk0S|nMQ`HY#bM_d&p{Fin?I?IZ}GTj3t@N+z);J0QC(axd;o)SS=B=ZO$F6x_t9<)t+}p`}8oQE;Owf~+VY!IN>^`sAlE+INxD zcrTxh5(}w#;HEJQ~*~xeNp;T(r0n zDpE9Pj`%dlM_VD+q2z|I;Z1lRd(z2Qc^)HSv>yaRgeXTG8fWvoLB5jDbZo?iTEwR>*2F>Zc~IsFp@K2nM)&IE zDY~_IuN%A42h*n?+Gr6p)EqHVq{5BtA4dbq1m}FKO}~?}7z$`yOAAD{ z8Ef}}M}rUqx*mx)PFzLb1J5&n@S5eEquu`F0WaoVF1#W8X?dt`be+^MUggb&m|=OA zpy-j;z{h%nLLH+(F%;vj^W_A^oSuhoN$h73tT&UdA&8Sa%~X_dBqf5O94%I*k8=n~ zUM_GBaUq_3e?w!>IsP6Bervb)X}S#tYqXEvr@1SbgO^X`dAO=qtsu>J2pDT>sh(5< z`Z=MS*Jvl-`iL4(euy3Sw2M zE##nh4QeR$!)UYp-hi4zn=647`{_^UH$)qyMlChe8ttWl;waHc-bqkOU8eUk+DqQb z7zui5A-Ez7y&%^Y7cVtyhu(2$`fE%2{l}~J>$itm9%Q#kCxd)3m6U0bD`HIFM4Kd! zS(F^wHfg8qvX59(dsq`X#PHH;i6J+kDaVp4T!U-V!!axs`^B0q3u5poKior(7wMp` zh}6h<2DIEA>bSAr{{F?UMX3veQqLD4c666c>Zt)qQa?pv9f=?4Lg<_bgAc2)h!rK- z)TiaA1|Pv-pNKzkuIWNd2?*pSSEB^es(7E~8w>lD3bl?B8uHR+@cpZ3r|qXXT3+GE zJ2h>;`TiQ@xhrGgO||__LWzJ)C>cfZk=7%d#d+tOb6@~!|4GY^O^5j|)*HR3G zkuO^69PQTcP%Kj|;vwov!Qp5ME$7+qRj8a2?1`{SC`du2D6x+Ql{FDD#vsn;OHp?T zyu?kLFpNQ#sEN2(!V$woQQ;d+F#{YW^pUk>Fh>$gQ8=M-RF9TL7Q=mqY7T`rf#Ik# z4HWNvNuPP&+AU{lf*Bu}RddMSm-+a}p4xu#nj^y>Rqmy9lE{mscaZi$Fi(jXi2%;w z&M_BPb11abcS!azlQ{uz2${zC3za(T&1{{(tbjNA6@Op!E9YKkRGeFoBZPnmf9sRw zrRU~_nL#oTm&eXM!)#Vveqa_}*{+YjaBd>Z9IUdmwKp(JupD`_Mn$+;fP}+%9|}PG zsR@~K5Sn&~0bj;*z(HpB{aQ+>0 zk-`|>G%BS0{^ONVX2?gwMUIsKeq?ER$3_S)0Wr(yj8Mp5H>>r@Q4pdzk53gc5c{l8 z!elFczVT;&xYR2A`-9u@FZ|txe}nO3quvkxJA4VpkqH0~25rb>V1{EH%l}DudOolJ E|39OKD*ylh literal 0 HcmV?d00001 diff --git a/doc/data/tips.csv b/doc/data/tips.csv new file mode 100644 index 00000000..856a65a6 --- /dev/null +++ b/doc/data/tips.csv @@ -0,0 +1,245 @@ +total_bill,tip,sex,smoker,day,time,size +16.99,1.01,Female,No,Sun,Dinner,2 +10.34,1.66,Male,No,Sun,Dinner,3 +21.01,3.5,Male,No,Sun,Dinner,3 +23.68,3.31,Male,No,Sun,Dinner,2 +24.59,3.61,Female,No,Sun,Dinner,4 +25.29,4.71,Male,No,Sun,Dinner,4 +8.77,2.0,Male,No,Sun,Dinner,2 +26.88,3.12,Male,No,Sun,Dinner,4 +15.04,1.96,Male,No,Sun,Dinner,2 +14.78,3.23,Male,No,Sun,Dinner,2 +10.27,1.71,Male,No,Sun,Dinner,2 +35.26,5.0,Female,No,Sun,Dinner,4 +15.42,1.57,Male,No,Sun,Dinner,2 +18.43,3.0,Male,No,Sun,Dinner,4 +14.83,3.02,Female,No,Sun,Dinner,2 +21.58,3.92,Male,No,Sun,Dinner,2 +10.33,1.67,Female,No,Sun,Dinner,3 +16.29,3.71,Male,No,Sun,Dinner,3 +16.97,3.5,Female,No,Sun,Dinner,3 +20.65,3.35,Male,No,Sat,Dinner,3 +17.92,4.08,Male,No,Sat,Dinner,2 +20.29,2.75,Female,No,Sat,Dinner,2 +15.77,2.23,Female,No,Sat,Dinner,2 +39.42,7.58,Male,No,Sat,Dinner,4 +19.82,3.18,Male,No,Sat,Dinner,2 +17.81,2.34,Male,No,Sat,Dinner,4 +13.37,2.0,Male,No,Sat,Dinner,2 +12.69,2.0,Male,No,Sat,Dinner,2 +21.7,4.3,Male,No,Sat,Dinner,2 +19.65,3.0,Female,No,Sat,Dinner,2 +9.55,1.45,Male,No,Sat,Dinner,2 +18.35,2.5,Male,No,Sat,Dinner,4 +15.06,3.0,Female,No,Sat,Dinner,2 +20.69,2.45,Female,No,Sat,Dinner,4 +17.78,3.27,Male,No,Sat,Dinner,2 +24.06,3.6,Male,No,Sat,Dinner,3 +16.31,2.0,Male,No,Sat,Dinner,3 +16.93,3.07,Female,No,Sat,Dinner,3 +18.69,2.31,Male,No,Sat,Dinner,3 +31.27,5.0,Male,No,Sat,Dinner,3 +16.04,2.24,Male,No,Sat,Dinner,3 +17.46,2.54,Male,No,Sun,Dinner,2 +13.94,3.06,Male,No,Sun,Dinner,2 +9.68,1.32,Male,No,Sun,Dinner,2 +30.4,5.6,Male,No,Sun,Dinner,4 +18.29,3.0,Male,No,Sun,Dinner,2 +22.23,5.0,Male,No,Sun,Dinner,2 +32.4,6.0,Male,No,Sun,Dinner,4 +28.55,2.05,Male,No,Sun,Dinner,3 +18.04,3.0,Male,No,Sun,Dinner,2 +12.54,2.5,Male,No,Sun,Dinner,2 +10.29,2.6,Female,No,Sun,Dinner,2 +34.81,5.2,Female,No,Sun,Dinner,4 +9.94,1.56,Male,No,Sun,Dinner,2 +25.56,4.34,Male,No,Sun,Dinner,4 
+19.49,3.51,Male,No,Sun,Dinner,2 +38.01,3.0,Male,Yes,Sat,Dinner,4 +26.41,1.5,Female,No,Sat,Dinner,2 +11.24,1.76,Male,Yes,Sat,Dinner,2 +48.27,6.73,Male,No,Sat,Dinner,4 +20.29,3.21,Male,Yes,Sat,Dinner,2 +13.81,2.0,Male,Yes,Sat,Dinner,2 +11.02,1.98,Male,Yes,Sat,Dinner,2 +18.29,3.76,Male,Yes,Sat,Dinner,4 +17.59,2.64,Male,No,Sat,Dinner,3 +20.08,3.15,Male,No,Sat,Dinner,3 +16.45,2.47,Female,No,Sat,Dinner,2 +3.07,1.0,Female,Yes,Sat,Dinner,1 +20.23,2.01,Male,No,Sat,Dinner,2 +15.01,2.09,Male,Yes,Sat,Dinner,2 +12.02,1.97,Male,No,Sat,Dinner,2 +17.07,3.0,Female,No,Sat,Dinner,3 +26.86,3.14,Female,Yes,Sat,Dinner,2 +25.28,5.0,Female,Yes,Sat,Dinner,2 +14.73,2.2,Female,No,Sat,Dinner,2 +10.51,1.25,Male,No,Sat,Dinner,2 +17.92,3.08,Male,Yes,Sat,Dinner,2 +27.2,4.0,Male,No,Thur,Lunch,4 +22.76,3.0,Male,No,Thur,Lunch,2 +17.29,2.71,Male,No,Thur,Lunch,2 +19.44,3.0,Male,Yes,Thur,Lunch,2 +16.66,3.4,Male,No,Thur,Lunch,2 +10.07,1.83,Female,No,Thur,Lunch,1 +32.68,5.0,Male,Yes,Thur,Lunch,2 +15.98,2.03,Male,No,Thur,Lunch,2 +34.83,5.17,Female,No,Thur,Lunch,4 +13.03,2.0,Male,No,Thur,Lunch,2 +18.28,4.0,Male,No,Thur,Lunch,2 +24.71,5.85,Male,No,Thur,Lunch,2 +21.16,3.0,Male,No,Thur,Lunch,2 +28.97,3.0,Male,Yes,Fri,Dinner,2 +22.49,3.5,Male,No,Fri,Dinner,2 +5.75,1.0,Female,Yes,Fri,Dinner,2 +16.32,4.3,Female,Yes,Fri,Dinner,2 +22.75,3.25,Female,No,Fri,Dinner,2 +40.17,4.73,Male,Yes,Fri,Dinner,4 +27.28,4.0,Male,Yes,Fri,Dinner,2 +12.03,1.5,Male,Yes,Fri,Dinner,2 +21.01,3.0,Male,Yes,Fri,Dinner,2 +12.46,1.5,Male,No,Fri,Dinner,2 +11.35,2.5,Female,Yes,Fri,Dinner,2 +15.38,3.0,Female,Yes,Fri,Dinner,2 +44.3,2.5,Female,Yes,Sat,Dinner,3 +22.42,3.48,Female,Yes,Sat,Dinner,2 +20.92,4.08,Female,No,Sat,Dinner,2 +15.36,1.64,Male,Yes,Sat,Dinner,2 +20.49,4.06,Male,Yes,Sat,Dinner,2 +25.21,4.29,Male,Yes,Sat,Dinner,2 +18.24,3.76,Male,No,Sat,Dinner,2 +14.31,4.0,Female,Yes,Sat,Dinner,2 +14.0,3.0,Male,No,Sat,Dinner,2 +7.25,1.0,Female,No,Sat,Dinner,1 +38.07,4.0,Male,No,Sun,Dinner,3 +23.95,2.55,Male,No,Sun,Dinner,2 +25.71,4.0,Female,No,Sun,Dinner,3 +17.31,3.5,Female,No,Sun,Dinner,2 +29.93,5.07,Male,No,Sun,Dinner,4 +10.65,1.5,Female,No,Thur,Lunch,2 +12.43,1.8,Female,No,Thur,Lunch,2 +24.08,2.92,Female,No,Thur,Lunch,4 +11.69,2.31,Male,No,Thur,Lunch,2 +13.42,1.68,Female,No,Thur,Lunch,2 +14.26,2.5,Male,No,Thur,Lunch,2 +15.95,2.0,Male,No,Thur,Lunch,2 +12.48,2.52,Female,No,Thur,Lunch,2 +29.8,4.2,Female,No,Thur,Lunch,6 +8.52,1.48,Male,No,Thur,Lunch,2 +14.52,2.0,Female,No,Thur,Lunch,2 +11.38,2.0,Female,No,Thur,Lunch,2 +22.82,2.18,Male,No,Thur,Lunch,3 +19.08,1.5,Male,No,Thur,Lunch,2 +20.27,2.83,Female,No,Thur,Lunch,2 +11.17,1.5,Female,No,Thur,Lunch,2 +12.26,2.0,Female,No,Thur,Lunch,2 +18.26,3.25,Female,No,Thur,Lunch,2 +8.51,1.25,Female,No,Thur,Lunch,2 +10.33,2.0,Female,No,Thur,Lunch,2 +14.15,2.0,Female,No,Thur,Lunch,2 +16.0,2.0,Male,Yes,Thur,Lunch,2 +13.16,2.75,Female,No,Thur,Lunch,2 +17.47,3.5,Female,No,Thur,Lunch,2 +34.3,6.7,Male,No,Thur,Lunch,6 +41.19,5.0,Male,No,Thur,Lunch,5 +27.05,5.0,Female,No,Thur,Lunch,6 +16.43,2.3,Female,No,Thur,Lunch,2 +8.35,1.5,Female,No,Thur,Lunch,2 +18.64,1.36,Female,No,Thur,Lunch,3 +11.87,1.63,Female,No,Thur,Lunch,2 +9.78,1.73,Male,No,Thur,Lunch,2 +7.51,2.0,Male,No,Thur,Lunch,2 +14.07,2.5,Male,No,Sun,Dinner,2 +13.13,2.0,Male,No,Sun,Dinner,2 +17.26,2.74,Male,No,Sun,Dinner,3 +24.55,2.0,Male,No,Sun,Dinner,4 +19.77,2.0,Male,No,Sun,Dinner,4 +29.85,5.14,Female,No,Sun,Dinner,5 +48.17,5.0,Male,No,Sun,Dinner,6 +25.0,3.75,Female,No,Sun,Dinner,4 +13.39,2.61,Female,No,Sun,Dinner,2 +16.49,2.0,Male,No,Sun,Dinner,4 +21.5,3.5,Male,No,Sun,Dinner,4 
+12.66,2.5,Male,No,Sun,Dinner,2 +16.21,2.0,Female,No,Sun,Dinner,3 +13.81,2.0,Male,No,Sun,Dinner,2 +17.51,3.0,Female,Yes,Sun,Dinner,2 +24.52,3.48,Male,No,Sun,Dinner,3 +20.76,2.24,Male,No,Sun,Dinner,2 +31.71,4.5,Male,No,Sun,Dinner,4 +10.59,1.61,Female,Yes,Sat,Dinner,2 +10.63,2.0,Female,Yes,Sat,Dinner,2 +50.81,10.0,Male,Yes,Sat,Dinner,3 +15.81,3.16,Male,Yes,Sat,Dinner,2 +7.25,5.15,Male,Yes,Sun,Dinner,2 +31.85,3.18,Male,Yes,Sun,Dinner,2 +16.82,4.0,Male,Yes,Sun,Dinner,2 +32.9,3.11,Male,Yes,Sun,Dinner,2 +17.89,2.0,Male,Yes,Sun,Dinner,2 +14.48,2.0,Male,Yes,Sun,Dinner,2 +9.6,4.0,Female,Yes,Sun,Dinner,2 +34.63,3.55,Male,Yes,Sun,Dinner,2 +34.65,3.68,Male,Yes,Sun,Dinner,4 +23.33,5.65,Male,Yes,Sun,Dinner,2 +45.35,3.5,Male,Yes,Sun,Dinner,3 +23.17,6.5,Male,Yes,Sun,Dinner,4 +40.55,3.0,Male,Yes,Sun,Dinner,2 +20.69,5.0,Male,No,Sun,Dinner,5 +20.9,3.5,Female,Yes,Sun,Dinner,3 +30.46,2.0,Male,Yes,Sun,Dinner,5 +18.15,3.5,Female,Yes,Sun,Dinner,3 +23.1,4.0,Male,Yes,Sun,Dinner,3 +15.69,1.5,Male,Yes,Sun,Dinner,2 +19.81,4.19,Female,Yes,Thur,Lunch,2 +28.44,2.56,Male,Yes,Thur,Lunch,2 +15.48,2.02,Male,Yes,Thur,Lunch,2 +16.58,4.0,Male,Yes,Thur,Lunch,2 +7.56,1.44,Male,No,Thur,Lunch,2 +10.34,2.0,Male,Yes,Thur,Lunch,2 +43.11,5.0,Female,Yes,Thur,Lunch,4 +13.0,2.0,Female,Yes,Thur,Lunch,2 +13.51,2.0,Male,Yes,Thur,Lunch,2 +18.71,4.0,Male,Yes,Thur,Lunch,3 +12.74,2.01,Female,Yes,Thur,Lunch,2 +13.0,2.0,Female,Yes,Thur,Lunch,2 +16.4,2.5,Female,Yes,Thur,Lunch,2 +20.53,4.0,Male,Yes,Thur,Lunch,4 +16.47,3.23,Female,Yes,Thur,Lunch,3 +26.59,3.41,Male,Yes,Sat,Dinner,3 +38.73,3.0,Male,Yes,Sat,Dinner,4 +24.27,2.03,Male,Yes,Sat,Dinner,2 +12.76,2.23,Female,Yes,Sat,Dinner,2 +30.06,2.0,Male,Yes,Sat,Dinner,3 +25.89,5.16,Male,Yes,Sat,Dinner,4 +48.33,9.0,Male,No,Sat,Dinner,4 +13.27,2.5,Female,Yes,Sat,Dinner,2 +28.17,6.5,Female,Yes,Sat,Dinner,3 +12.9,1.1,Female,Yes,Sat,Dinner,2 +28.15,3.0,Male,Yes,Sat,Dinner,5 +11.59,1.5,Male,Yes,Sat,Dinner,2 +7.74,1.44,Male,Yes,Sat,Dinner,2 +30.14,3.09,Female,Yes,Sat,Dinner,4 +12.16,2.2,Male,Yes,Fri,Lunch,2 +13.42,3.48,Female,Yes,Fri,Lunch,2 +8.58,1.92,Male,Yes,Fri,Lunch,1 +15.98,3.0,Female,No,Fri,Lunch,3 +13.42,1.58,Male,Yes,Fri,Lunch,2 +16.27,2.5,Female,Yes,Fri,Lunch,2 +10.09,2.0,Female,Yes,Fri,Lunch,2 +20.45,3.0,Male,No,Sat,Dinner,4 +13.28,2.72,Male,No,Sat,Dinner,2 +22.12,2.88,Female,Yes,Sat,Dinner,2 +24.01,2.0,Male,Yes,Sat,Dinner,4 +15.69,3.0,Male,Yes,Sat,Dinner,3 +11.61,3.39,Male,No,Sat,Dinner,2 +10.77,1.47,Male,No,Sat,Dinner,2 +15.53,3.0,Male,Yes,Sat,Dinner,2 +10.07,1.25,Male,No,Sat,Dinner,2 +12.6,1.0,Male,Yes,Sat,Dinner,2 +32.83,1.17,Male,Yes,Sat,Dinner,2 +35.83,4.67,Female,No,Sat,Dinner,3 +29.03,5.92,Male,No,Sat,Dinner,3 +27.18,2.0,Female,Yes,Sat,Dinner,2 +22.67,2.0,Male,Yes,Sat,Dinner,2 +17.82,1.75,Male,No,Sat,Dinner,2 +18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/doc/make.py b/doc/make.py new file mode 100755 index 00000000..4367ac91 --- /dev/null +++ b/doc/make.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python + +""" +Python script for building documentation. + +To build the docs you must have all optional dependencies for pandas +installed. See the installation instructions for a list of these. + +Note: currently latex builds do not work because of table formats that are not +supported in the latex generation. 
+ +2014-01-30: Latex has some issues but 'latex_forced' works ok for 0.13.0-400 or so + +Usage +----- +python make.py clean +python make.py html +""" +from __future__ import print_function + +import glob +import os +import shutil +import sys +import sphinx +import argparse +import jinja2 + +os.environ['PYTHONPATH'] = '..' + +SPHINX_BUILD = 'sphinxbuild' + + +def upload_dev(): + 'push a copy to the pydata dev directory' + if os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'): + raise SystemExit('Upload to Pydata Dev failed') + + +def upload_dev_pdf(): + 'push a copy to the pydata dev directory' + if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/'): + raise SystemExit('PDF upload to Pydata Dev failed') + + +def upload_stable(): + 'push a copy to the pydata stable directory' + if os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'): + raise SystemExit('Upload to stable failed') + + +def upload_stable_pdf(): + 'push a copy to the pydata dev directory' + if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/'): + raise SystemExit('PDF upload to stable failed') + + +def upload_prev(ver, doc_root='./'): + 'push a copy of older release to appropriate version directory' + local_dir = doc_root + 'build/html' + remote_dir = '/usr/share/nginx/pandas/pandas-docs/version/%s/' % ver + cmd = 'cd %s; rsync -avz . pandas@pandas.pydata.org:%s -essh' + cmd = cmd % (local_dir, remote_dir) + print(cmd) + if os.system(cmd): + raise SystemExit( + 'Upload to %s from %s failed' % (remote_dir, local_dir)) + + local_dir = doc_root + 'build/latex' + pdf_cmd = 'cd %s; scp pandas.pdf pandas@pandas.pydata.org:%s' + pdf_cmd = pdf_cmd % (local_dir, remote_dir) + if os.system(pdf_cmd): + raise SystemExit('Upload PDF to %s from %s failed' % (ver, doc_root)) + +def build_pandas(): + os.chdir('..') + os.system('python setup.py clean') + os.system('python setup.py build_ext --inplace') + os.chdir('doc') + +def build_prev(ver): + if os.system('git checkout v%s' % ver) != 1: + os.chdir('..') + os.system('python setup.py clean') + os.system('python setup.py build_ext --inplace') + os.chdir('doc') + os.system('python make.py clean') + os.system('python make.py html') + os.system('python make.py latex') + os.system('git checkout master') + + +def clean(): + if os.path.exists('build'): + shutil.rmtree('build') + + if os.path.exists('source/generated'): + shutil.rmtree('source/generated') + + +def html(): + check_build() + if os.system('sphinx-build -P -b html -d build/doctrees ' + 'source build/html'): + raise SystemExit("Building HTML failed.") + try: + # remove stale file + os.system('cd build; rm -f html/pandas.zip;') + except: + pass + +def zip_html(): + try: + print("\nZipping up HTML docs...") + # just in case the wonky build box doesn't have zip + # don't fail this. + os.system('cd build; rm -f html/pandas.zip; zip html/pandas.zip -r -q html/* ') + print("\n") + except: + pass + +def latex(): + check_build() + if sys.platform != 'win32': + # LaTeX format. + if os.system('sphinx-build -b latex -d build/doctrees ' + 'source build/latex'): + raise SystemExit("Building LaTeX failed.") + # Produce pdf. + + os.chdir('build/latex') + + # Call the makefile produced by sphinx... 
+ if os.system('make'): + print("Rendering LaTeX failed.") + print("You may still be able to get a usable PDF file by going into 'build/latex'") + print("and executing 'pdflatex pandas.tex' for the requisite number of passes.") + print("Or using the 'latex_forced' target") + raise SystemExit + + os.chdir('../..') + else: + print('latex build has not been tested on windows') + +def latex_forced(): + check_build() + if sys.platform != 'win32': + # LaTeX format. + if os.system('sphinx-build -b latex -d build/doctrees ' + 'source build/latex'): + raise SystemExit("Building LaTeX failed.") + # Produce pdf. + + os.chdir('build/latex') + + # Manually call pdflatex, 3 passes should ensure latex fixes up + # all the required cross-references and such. + os.system('pdflatex -interaction=nonstopmode pandas.tex') + os.system('pdflatex -interaction=nonstopmode pandas.tex') + os.system('pdflatex -interaction=nonstopmode pandas.tex') + raise SystemExit("You should check the file 'build/latex/pandas.pdf' for problems.") + + os.chdir('../..') + else: + print('latex build has not been tested on windows') + + +def check_build(): + build_dirs = [ + 'build', 'build/doctrees', 'build/html', + 'build/latex', 'build/plots', 'build/_static', + 'build/_templates'] + for d in build_dirs: + try: + os.mkdir(d) + except OSError: + pass + + +def all(): + # clean() + html() + + +def auto_dev_build(debug=False): + msg = '' + try: + step = 'clean' + clean() + step = 'html' + html() + step = 'upload dev' + upload_dev() + if not debug: + sendmail(step) + + step = 'latex' + latex() + step = 'upload pdf' + upload_dev_pdf() + if not debug: + sendmail(step) + except (Exception, SystemExit) as inst: + msg = str(inst) + '\n' + sendmail(step, '[ERROR] ' + msg) + + +def sendmail(step=None, err_msg=None): + from_name, to_name = _get_config() + + if step is None: + step = '' + + if err_msg is None or '[ERROR]' not in err_msg: + msgstr = 'Daily docs %s completed successfully' % step + subject = "DOC: %s successful" % step + else: + msgstr = err_msg + subject = "DOC: %s failed" % step + + import smtplib + from email.MIMEText import MIMEText + msg = MIMEText(msgstr) + msg['Subject'] = subject + msg['From'] = from_name + msg['To'] = to_name + + server_str, port, login, pwd = _get_credentials() + server = smtplib.SMTP(server_str, port) + server.ehlo() + server.starttls() + server.ehlo() + + server.login(login, pwd) + try: + server.sendmail(from_name, to_name, msg.as_string()) + finally: + server.close() + + +def _get_dir(subdir=None): + import getpass + USERNAME = getpass.getuser() + if sys.platform == 'darwin': + HOME = '/Users/%s' % USERNAME + else: + HOME = '/home/%s' % USERNAME + + if subdir is None: + subdir = '/code/scripts/config' + conf_dir = '%s/%s' % (HOME, subdir) + return conf_dir + + +def _get_credentials(): + tmp_dir = _get_dir() + cred = '%s/credentials' % tmp_dir + with open(cred, 'r') as fh: + server, port, un, domain = fh.read().split(',') + port = int(port) + login = un + '@' + domain + '.com' + + import base64 + with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: + pwd = base64.b64decode(fh.read()) + + return server, port, login, pwd + + +def _get_config(): + tmp_dir = _get_dir() + with open('%s/addresses' % tmp_dir, 'r') as fh: + from_name, to_name = fh.read().split(',') + return from_name, to_name + +funcd = { + 'html': html, + 'zip_html': zip_html, + 'upload_dev': upload_dev, + 'upload_stable': upload_stable, + 'upload_dev_pdf': upload_dev_pdf, + 'upload_stable_pdf': upload_stable_pdf, + 'latex': latex, + 
'latex_forced': latex_forced, + 'clean': clean, + 'auto_dev': auto_dev_build, + 'auto_debug': lambda: auto_dev_build(True), + 'build_pandas': build_pandas, + 'all': all, +} + +small_docs = False + +# current_dir = os.getcwd() +# os.chdir(os.path.dirname(os.path.join(current_dir, __file__))) + +import argparse +argparser = argparse.ArgumentParser(description=""" +pandas documentation builder +""".strip()) + +# argparser.add_argument('-arg_name', '--arg_name', +# metavar='label for arg help', +# type=str|etc, +# nargs='N|*|?|+|argparse.REMAINDER', +# required=False, +# #choices='abc', +# help='help string', +# action='store|store_true') + +# args = argparser.parse_args() + +#print args.accumulate(args.integers) + +def generate_index(api=True, single=False, **kwds): + from jinja2 import Template + with open("source/index.rst.template") as f: + t = Template(f.read()) + + with open("source/index.rst","w") as f: + f.write(t.render(api=api,single=single,**kwds)) + +import argparse +argparser = argparse.ArgumentParser(description="pandas documentation builder", + epilog="Targets : %s" % funcd.keys()) + +argparser.add_argument('--no-api', + default=False, + help='Ommit api and autosummary', + action='store_true') +argparser.add_argument('--single', + metavar='FILENAME', + type=str, + default=False, + help='filename of section to compile, e.g. "indexing"') + +def main(): + args, unknown = argparser.parse_known_args() + sys.argv = [sys.argv[0]] + unknown + if args.single: + args.single = os.path.basename(args.single).split(".rst")[0] + + if 'clean' in unknown: + args.single=False + + generate_index(api=not args.no_api and not args.single, single=args.single) + + if len(sys.argv) > 2: + ftype = sys.argv[1] + ver = sys.argv[2] + + if ftype == 'build_previous': + build_prev(ver) + if ftype == 'upload_previous': + upload_prev(ver) + elif len(sys.argv) == 2: + for arg in sys.argv[1:]: + func = funcd.get(arg) + if func is None: + raise SystemExit('Do not know how to handle %s; valid args are %s' % ( + arg, list(funcd.keys()))) + func() + else: + small_docs = False + all() +# os.chdir(current_dir) + +if __name__ == '__main__': + import sys + sys.exit(main()) diff --git a/doc/plots/stats/moment_plots.py b/doc/plots/stats/moment_plots.py new file mode 100644 index 00000000..9e3a9025 --- /dev/null +++ b/doc/plots/stats/moment_plots.py @@ -0,0 +1,30 @@ +import numpy as np + +import matplotlib.pyplot as plt +import pandas.util.testing as t +import pandas.stats.moments as m + + +def test_series(n=1000): + t.N = n + s = t.makeTimeSeries() + return s + + +def plot_timeseries(*args, **kwds): + n = len(args) + + fig, axes = plt.subplots(n, 1, figsize=kwds.get('size', (10, 5)), + sharex=True) + titles = kwds.get('titles', None) + + for k in range(1, n + 1): + ax = axes[k - 1] + ts = args[k - 1] + ax.plot(ts.index, ts.values) + + if titles: + ax.set_title(titles[k - 1]) + + fig.autofmt_xdate() + fig.subplots_adjust(bottom=0.10, top=0.95) diff --git a/doc/plots/stats/moments_ewma.py b/doc/plots/stats/moments_ewma.py new file mode 100644 index 00000000..3e521ed6 --- /dev/null +++ b/doc/plots/stats/moments_ewma.py @@ -0,0 +1,15 @@ +import matplotlib.pyplot as plt +import pandas.util.testing as t +import pandas.stats.moments as m + +t.N = 200 +s = t.makeTimeSeries().cumsum() + +plt.figure(figsize=(10, 5)) +plt.plot(s.index, s.values) +plt.plot(s.index, m.ewma(s, 20, min_periods=1).values) +f = plt.gcf() +f.autofmt_xdate() + +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_ewmvol.py 
b/doc/plots/stats/moments_ewmvol.py new file mode 100644 index 00000000..093f6286 --- /dev/null +++ b/doc/plots/stats/moments_ewmvol.py @@ -0,0 +1,23 @@ +import matplotlib.pyplot as plt +import pandas.util.testing as t +import pandas.stats.moments as m + +t.N = 500 +ts = t.makeTimeSeries() +ts[::100] = 20 + +s = ts.cumsum() + + +plt.figure(figsize=(10, 5)) +plt.plot(s.index, m.ewmvol(s, span=50, min_periods=1).values, color='b') +plt.plot(s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') + +plt.title('Exp-weighted std with shocks') +plt.legend(('Exp-weighted', 'Equal-weighted')) + +f = plt.gcf() +f.autofmt_xdate() + +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_expw.py b/doc/plots/stats/moments_expw.py new file mode 100644 index 00000000..5fff419b --- /dev/null +++ b/doc/plots/stats/moments_expw.py @@ -0,0 +1,35 @@ +from moment_plots import * + +np.random.seed(1) + +ts = test_series(500) * 10 + +# ts[::100] = 20 + +s = ts.cumsum() + +fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True) + +ax0, ax1, ax2 = axes + +ax0.plot(s.index, s.values) +ax0.set_title('time series') + +ax1.plot(s.index, m.ewma(s, span=50, min_periods=1).values, color='b') +ax1.plot(s.index, m.rolling_mean(s, 50, min_periods=1).values, color='r') +ax1.set_title('rolling_mean vs. ewma') + +line1 = ax2.plot( + s.index, m.ewmstd(s, span=50, min_periods=1).values, color='b') +line2 = ax2.plot( + s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') +ax2.set_title('rolling_std vs. ewmstd') + +fig.legend((line1, line2), + ('Exp-weighted', 'Equal-weighted'), + loc='upper right') +fig.autofmt_xdate() +fig.subplots_adjust(bottom=0.10, top=0.95) + +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_rolling.py b/doc/plots/stats/moments_rolling.py new file mode 100644 index 00000000..30a6c5f5 --- /dev/null +++ b/doc/plots/stats/moments_rolling.py @@ -0,0 +1,24 @@ +from moment_plots import * + +ts = test_series() +s = ts.cumsum() + +s[20:50] = np.NaN +s[120:150] = np.NaN +plot_timeseries(s, + m.rolling_count(s, 50), + m.rolling_sum(s, 50, min_periods=10), + m.rolling_mean(s, 50, min_periods=10), + m.rolling_std(s, 50, min_periods=10), + m.rolling_skew(s, 50, min_periods=10), + m.rolling_kurt(s, 50, min_periods=10), + size=(10, 12), + titles=('time series', + 'rolling_count', + 'rolling_sum', + 'rolling_mean', + 'rolling_std', + 'rolling_skew', + 'rolling_kurt')) +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_rolling_binary.py b/doc/plots/stats/moments_rolling_binary.py new file mode 100644 index 00000000..ab6b7b1c --- /dev/null +++ b/doc/plots/stats/moments_rolling_binary.py @@ -0,0 +1,30 @@ +from moment_plots import * + +np.random.seed(1) + +ts = test_series() +s = ts.cumsum() +ts2 = test_series() +s2 = ts2.cumsum() + +s[20:50] = np.NaN +s[120:150] = np.NaN +fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True) + +ax0, ax1, ax2 = axes + +ax0.plot(s.index, s.values) +ax0.plot(s2.index, s2.values) +ax0.set_title('time series') + +ax1.plot(s.index, m.rolling_corr(s, s2, 50, min_periods=1).values) +ax1.set_title('rolling_corr') + +ax2.plot(s.index, m.rolling_cov(s, s2, 50, min_periods=1).values) +ax2.set_title('rolling_cov') + +fig.autofmt_xdate() +fig.subplots_adjust(bottom=0.10, top=0.95) + +plt.show() +plt.close('all') diff --git a/doc/source/10min.rst b/doc/source/10min.rst new file mode 100644 index 00000000..a9a97ee5 --- /dev/null +++ b/doc/source/10min.rst @@ -0,0 +1,753 @@ +.. _10min: + +.. currentmodule:: pandas + +.. 
ipython:: python + :suppress: + + import numpy as np + import random + import os + np.random.seed(123456) + from pandas import options + import pandas as pd + np.set_printoptions(precision=4, suppress=True) + options.display.mpl_style='default' + options.display.max_rows=15 + + #### portions of this were borrowed from the + #### Pandas cheatsheet + #### created during the PyData Workshop-Sprint 2012 + #### Hannah Chen, Henry Chow, Eric Cox, Robert Mauriello + + +******************** +10 Minutes to pandas +******************** + +This is a short introduction to pandas, geared mainly for new users. +You can see more complex recipes in the :ref:`Cookbook` + +Customarily, we import as follows + +.. ipython:: python + + import pandas as pd + import numpy as np + import matplotlib.pyplot as plt + +Object Creation +--------------- + +See the :ref:`Data Structure Intro section ` + +Creating a ``Series`` by passing a list of values, letting pandas create a default +integer index + +.. ipython:: python + + s = pd.Series([1,3,5,np.nan,6,8]) + s + +Creating a ``DataFrame`` by passing a numpy array, with a datetime index and labeled columns. + +.. ipython:: python + + dates = pd.date_range('20130101',periods=6) + dates + df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD')) + df + +Creating a ``DataFrame`` by passing a dict of objects that can be converted to series-like. + +.. ipython:: python + + df2 = pd.DataFrame({ 'A' : 1., + 'B' : pd.Timestamp('20130102'), + 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), + 'D' : np.array([3] * 4,dtype='int32'), + 'E' : 'foo' }) + df2 + +Having specific :ref:`dtypes ` + +.. ipython:: python + + df2.dtypes + +If you're using IPython, tab completion for column names (as well as public +attributes) is automatically enabled. Here's a subset of the attributes that +will be completed: + +.. ipython:: + + @verbatim + In [1]: df2. + df2.A df2.boxplot + df2.abs df2.C + df2.add df2.clip + df2.add_prefix df2.clip_lower + df2.add_suffix df2.clip_upper + df2.align df2.columns + df2.all df2.combine + df2.any df2.combineAdd + df2.append df2.combine_first + df2.apply df2.combineMult + df2.applymap df2.compound + df2.as_blocks df2.consolidate + df2.asfreq df2.convert_objects + df2.as_matrix df2.copy + df2.astype df2.corr + df2.at df2.corrwith + df2.at_time df2.count + df2.axes df2.cov + df2.B df2.cummax + df2.between_time df2.cummin + df2.bfill df2.cumprod + df2.blocks df2.cumsum + df2.bool df2.D + +As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically +tab completed. ``E`` is there as well; the rest of the attributes have been +truncated for brevity. + +Viewing Data +------------ + +See the :ref:`Basics section ` + +See the top & bottom rows of the frame + +.. ipython:: python + + df.head() + df.tail(3) + +Display the index,columns, and the underlying numpy data + +.. ipython:: python + + df.index + df.columns + df.values + +Describe shows a quick statistic summary of your data + +.. ipython:: python + + df.describe() + +Transposing your data + +.. ipython:: python + + df.T + +Sorting by an axis + +.. ipython:: python + + df.sort_index(axis=1, ascending=False) + +Sorting by values + +.. ipython:: python + + df.sort(columns='B') + +Selection +--------- + +.. note:: + + While standard Python / Numpy expressions for selecting and setting are + intuitive and come in handy for interactive work, for production code, we + recommend the optimized pandas data access methods, ``.at``, ``.iat``, + ``.loc``, ``.iloc`` and ``.ix``. 
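+
+As a concrete illustration of the note above, here is a minimal sketch
+(it simply rebuilds a small ``df`` like the one created at the start of this
+guide) that reads one scalar three ways: chained standard indexing,
+label-based ``.at``, and position-based ``.iat``.
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   dates = pd.date_range('20130101', periods=6)
+   df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
+
+   chained = df['A'][dates[0]]        # standard indexing; fine interactively
+   by_label = df.at[dates[0], 'A']    # optimized label-based scalar access
+   by_position = df.iat[0, 0]         # optimized position-based scalar access
+
+   assert chained == by_label == by_position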
+ +See the :ref:`Indexing section ` and below. + +Getting +~~~~~~~ + +Selecting a single column, which yields a ``Series``, +equivalent to ``df.A`` + +.. ipython:: python + + df['A'] + +Selecting via ``[]``, which slices the rows. + +.. ipython:: python + + df[0:3] + df['20130102':'20130104'] + +Selection by Label +~~~~~~~~~~~~~~~~~~ + +See more in :ref:`Selection by Label ` + +For getting a cross section using a label + +.. ipython:: python + + df.loc[dates[0]] + +Selecting on a multi-axis by label + +.. ipython:: python + + df.loc[:,['A','B']] + +Showing label slicing, both endpoints are *included* + +.. ipython:: python + + df.loc['20130102':'20130104',['A','B']] + +Reduction in the dimensions of the returned object + +.. ipython:: python + + df.loc['20130102',['A','B']] + +For getting a scalar value + +.. ipython:: python + + df.loc[dates[0],'A'] + +For getting fast access to a scalar (equiv to the prior method) + +.. ipython:: python + + df.at[dates[0],'A'] + +Selection by Position +~~~~~~~~~~~~~~~~~~~~~ + +See more in :ref:`Selection by Position ` + +Select via the position of the passed integers + +.. ipython:: python + + df.iloc[3] + +By integer slices, acting similar to numpy/python + +.. ipython:: python + + df.iloc[3:5,0:2] + +By lists of integer position locations, similar to the numpy/python style + +.. ipython:: python + + df.iloc[[1,2,4],[0,2]] + +For slicing rows explicitly + +.. ipython:: python + + df.iloc[1:3,:] + +For slicing columns explicitly + +.. ipython:: python + + df.iloc[:,1:3] + +For getting a value explicity + +.. ipython:: python + + df.iloc[1,1] + +For getting fast access to a scalar (equiv to the prior method) + +.. ipython:: python + + df.iat[1,1] + +Boolean Indexing +~~~~~~~~~~~~~~~~ + +Using a single column's values to select data. + +.. ipython:: python + + df[df.A > 0] + +A ``where`` operation for getting. + +.. ipython:: python + + df[df > 0] + +Using the :func:`~Series.isin` method for filtering: + +.. ipython:: python + + df2 = df.copy() + df2['E']=['one', 'one','two','three','four','three'] + df2 + df2[df2['E'].isin(['two','four'])] + +Setting +~~~~~~~ + +Setting a new column automatically aligns the data +by the indexes + +.. ipython:: python + + s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130102',periods=6)) + s1 + df['F'] = s1 + +Setting values by label + +.. ipython:: python + + df.at[dates[0],'A'] = 0 + +Setting values by position + +.. ipython:: python + + df.iat[0,1] = 0 + +Setting by assigning with a numpy array + +.. ipython:: python + + df.loc[:,'D'] = np.array([5] * len(df)) + +The result of the prior setting operations + +.. ipython:: python + + df + +A ``where`` operation with setting. + +.. ipython:: python + + df2 = df.copy() + df2[df2 > 0] = -df2 + df2 + + +Missing Data +------------ + +pandas primarily uses the value ``np.nan`` to represent missing data. It is by +default not included in computations. See the :ref:`Missing Data section +` + +Reindexing allows you to change/add/delete the index on a specified axis. This +returns a copy of the data. + +.. ipython:: python + + df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E']) + df1.loc[dates[0]:dates[1],'E'] = 1 + df1 + +To drop any rows that have missing data. + +.. ipython:: python + + df1.dropna(how='any') + +Filling missing data + +.. ipython:: python + + df1.fillna(value=5) + +To get the boolean mask where values are ``nan`` + +.. 
ipython:: python + + pd.isnull(df1) + + +Operations +---------- + +See the :ref:`Basic section on Binary Ops ` + +Stats +~~~~~ + +Operations in general *exclude* missing data. + +Performing a descriptive statistic + +.. ipython:: python + + df.mean() + +Same operation on the other axis + +.. ipython:: python + + df.mean(1) + +Operating with objects that have different dimensionality and need alignment. +In addition, pandas automatically broadcasts along the specified dimension. + +.. ipython:: python + + s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2) + s + df.sub(s,axis='index') + + +Apply +~~~~~ + +Applying functions to the data + +.. ipython:: python + + df.apply(np.cumsum) + df.apply(lambda x: x.max() - x.min()) + +Histogramming +~~~~~~~~~~~~~ + +See more at :ref:`Histogramming and Discretization ` + +.. ipython:: python + + s = pd.Series(np.random.randint(0,7,size=10)) + s + s.value_counts() + +String Methods +~~~~~~~~~~~~~~ + +See more at :ref:`Vectorized String Methods ` + +.. ipython:: python + + s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s.str.lower() + +Merge +----- + +Concat +~~~~~~ + +pandas provides various facilities for easily combining together Series, +DataFrame, and Panel objects with various kinds of set logic for the indexes +and relational algebra functionality in the case of join / merge-type +operations. + +See the :ref:`Merging section ` + +Concatenating pandas objects together + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(10, 4)) + df + + # break it into pieces + pieces = [df[:3], df[3:7], df[7:]] + + pd.concat(pieces) + +Join +~~~~ + +SQL style merges. See the :ref:`Database style joining ` + +.. ipython:: python + + left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) + right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) + left + right + pd.merge(left, right, on='key') + +Append +~~~~~~ + +Append rows to a dataframe. See the :ref:`Appending ` + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df + s = df.iloc[3] + df.append(s, ignore_index=True) + + +Grouping +-------- + +By "group by" we are referring to a process involving one or more of the +following steps + + - **Splitting** the data into groups based on some criteria + - **Applying** a function to each group independently + - **Combining** the results into a data structure + +See the :ref:`Grouping section ` + +.. ipython:: python + + df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : np.random.randn(8), + 'D' : np.random.randn(8)}) + df + +Grouping and then applying a function ``sum`` to the resulting groups. + +.. ipython:: python + + df.groupby('A').sum() + +Grouping by multiple columns forms a hierarchical index, which we then apply +the function. + +.. ipython:: python + + df.groupby(['A','B']).sum() + +Reshaping +--------- + +See the section on :ref:`Hierarchical Indexing ` and +see the section on :ref:`Reshaping `). + +Stack +~~~~~ + +.. ipython:: python + + tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']])) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) + df2 = df[:4] + df2 + +The ``stack`` function "compresses" a level in the DataFrame's columns. + +.. 
ipython:: python + + stacked = df2.stack() + stacked + +With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the +``index``), the inverse operation of ``stack`` is ``unstack``, which by default +unstacks the **last level**: + +.. ipython:: python + + stacked.unstack() + stacked.unstack(1) + stacked.unstack(0) + +Pivot Tables +~~~~~~~~~~~~ +See the section on :ref:`Pivot Tables `. + +.. ipython:: python + + df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, + 'B' : ['A', 'B', 'C'] * 4, + 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, + 'D' : np.random.randn(12), + 'E' : np.random.randn(12)}) + df + +We can produce pivot tables from this data very easily: + +.. ipython:: python + + pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) + + +Time Series +----------- + +pandas has simple, powerful, and efficient functionality for performing +resampling operations during frequency conversion (e.g., converting secondly +data into 5-minutely data). This is extremely common in, but not limited to, +financial applications. See the :ref:`Time Series section ` + +.. ipython:: python + + rng = pd.date_range('1/1/2012', periods=100, freq='S') + ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + ts.resample('5Min', how='sum') + +Time zone representation + +.. ipython:: python + + rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D') + ts = pd.Series(np.random.randn(len(rng)), rng) + ts + ts_utc = ts.tz_localize('UTC') + ts_utc + +Convert to another time zone + +.. ipython:: python + + ts_utc.tz_convert('US/Eastern') + +Converting between time span representations + +.. ipython:: python + + rng = pd.date_range('1/1/2012', periods=5, freq='M') + ts = pd.Series(np.random.randn(len(rng)), index=rng) + ts + ps = ts.to_period() + ps + ps.to_timestamp() + +Converting between period and timestamp enables some convenient arithmetic +functions to be used. In the following example, we convert a quarterly +frequency with year ending in November to 9am of the end of the month following +the quarter end: + +.. ipython:: python + + prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV') + ts = pd.Series(np.random.randn(len(prng)), prng) + ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + ts.head() + + +Plotting +-------- + +:ref:`Plotting ` docs. + +.. ipython:: python + :suppress: + + import matplotlib.pyplot as plt + plt.close('all') + from pandas import options + options.display.mpl_style='default' + +.. ipython:: python + + ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + @savefig series_plot_basic.png + ts.plot() + +On DataFrame, ``plot`` is a convenience to plot all of the columns with labels: + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + + @savefig frame_plot_basic.png + plt.figure(); df.plot(); plt.legend(loc='best') + +Getting Data In/Out +------------------- + +CSV +~~~ + +:ref:`Writing to a csv file ` + +.. ipython:: python + + df.to_csv('foo.csv') + +:ref:`Reading from a csv file ` + +.. ipython:: python + + pd.read_csv('foo.csv') + +.. ipython:: python + :suppress: + + os.remove('foo.csv') + +HDF5 +~~~~ + +Reading and writing to :ref:`HDFStores ` + +Writing to a HDF5 Store + +.. ipython:: python + + df.to_hdf('foo.h5','df') + +Reading from a HDF5 Store + +.. ipython:: python + + pd.read_hdf('foo.h5','df') + +.. 
ipython:: python + :suppress: + + os.remove('foo.h5') + +Excel +~~~~~ + +Reading and writing to :ref:`MS Excel ` + +Writing to an excel file + +.. ipython:: python + + df.to_excel('foo.xlsx', sheet_name='Sheet1') + +Reading from an excel file + +.. ipython:: python + + pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']) + +.. ipython:: python + :suppress: + + os.remove('foo.xlsx') + +Gotchas +------- + +If you are trying an operation and you see an exception like: + +.. code-block:: python + + >>> if pd.Series([False, True, False]): + print("I was true") + Traceback + ... + ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). + +See :ref:`Comparisons` for an explanation and what to do. + +See :ref:`Gotchas` as well. diff --git a/doc/source/_static/banklist.html b/doc/source/_static/banklist.html new file mode 100644 index 00000000..8ec1561f --- /dev/null +++ b/doc/source/_static/banklist.html @@ -0,0 +1,4885 @@ + + + + +FDIC: Failed Bank List + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Skip Header +
+
+The FDIC is often appointed as receiver for failed banks. This page contains useful information for the customers and vendors of these banks. This includes information on the acquiring bank (if applicable), how your accounts and loans are affected, and how vendors can file claims against the receivership. Failed Financial Institution Contact Search displays point of contact information related to failed banks.
+
+This list includes banks which have failed since October 1, 2000. To search for banks that failed prior to those on this page, visit this link: Failures and Assistance Transactions
+
+Failed Bank List - CSV file (Updated on Mondays. Also opens in Excel - Excel Help)
+
+[Failed Bank List table: Bank Name | City | ST | CERT | Acquiring Institution | Closing Date | Updated Date, one row per failed bank, most recent first, from "Banks of Wisconsin d/b/a Bank of Kenosha | Kenosha | WI | 35386 | North Shore Bank, FSB | May 31, 2013 | May 31, 2013" back through "Southern Colorado National Bank | Pueblo | CO | 57263 | Legacy Bank | October 2, 2009 | September 5, 2012"]
Jennings State BankSpring GroveMN11416Central BankOctober 2, 2009August 21, 2012
Warren BankWarrenMI34824The Huntington National BankOctober 2, 2009August 21, 2012
Georgian BankAtlantaGA57151First Citizens Bank and Trust Company, Inc.September 25, 2009August 21, 2012
Irwin Union Bank, F.S.B.LouisvilleKY57068First Financial Bank, N.A.September 18, 2009September 5, 2012
Irwin Union Bank and Trust CompanyColumbusIN10100First Financial Bank, N.A.September 18, 2009August 21, 2012
Venture BankLaceyWA22868First-Citizens Bank & Trust CompanySeptember 11, 2009August 21, 2012
Brickwell Community BankWoodburyMN57736CorTrust Bank N.A.September 11, 2009January 15, 2013
Corus Bank, N.A.ChicagoIL13693MB Financial Bank, N.A.September 11, 2009August 21, 2012
First State BankFlagstaffAZ34875Sunwest BankSeptember 4, 2009January 15, 2013
Platinum Community BankRolling MeadowsIL35030No AcquirerSeptember 4, 2009August 21, 2012
Vantus BankSioux CityIN27732Great Southern BankSeptember 4, 2009August 21, 2012
InBankOak ForestIL20203MB Financial Bank, N.A.September 4, 2009August 21, 2012
First Bank of Kansas CityKansas CityMO25231Great American BankSeptember 4, 2009August 21, 2012
Affinity BankVenturaCA27197Pacific Western BankAugust 28, 2009August 21, 2012
Mainstreet BankForest LakeMN1909Central BankAugust 28, 2009August 21, 2012
Bradford BankBaltimoreMD28312Manufacturers and Traders Trust Company (M&T Bank)August 28, 2009January 15, 2013
Guaranty BankAustinTX32618BBVA CompassAugust 21, 2009August 21, 2012
CapitalSouth BankBirminghamAL22130IBERIABANKAugust 21, 2009January 15, 2013
First Coweta BankNewnanGA57702United BankAugust 21, 2009January 15, 2013
ebankAtlantaGA34682Stearns Bank, N.A.August 21, 2009August 21, 2012
Community Bank of NevadaLas VegasNV34043No AcquirerAugust 14, 2009August 21, 2012
Community Bank of ArizonaPhoenixAZ57645MidFirst BankAugust 14, 2009August 21, 2012
Union Bank, National AssociationGilbertAZ34485MidFirst BankAugust 14, 2009August 21, 2012
Colonial BankMontgomeryAL9609Branch Banking & Trust Company, (BB&T)August 14, 2009September 5, 2012
Dwelling House Savings and Loan AssociationPittsburghPA31559PNC Bank, N.A.August 14, 2009January 15, 2013
Community First BankPrinevilleOR23268Home Federal BankAugust 7, 2009January 15, 2013
Community National Bank of Sarasota CountyVeniceFL27183Stearns Bank, N.A.August 7, 2009August 20, 2012
First State BankSarasotaFL27364Stearns Bank, N.A.August 7, 2009August 20, 2012
Mutual BankHarveyIL18659United Central BankJuly 31, 2009August 20, 2012
First BankAmericanoElizabethNJ34270Crown BankJuly 31, 2009August 20, 2012
Peoples Community BankWest ChesterOH32288First Financial Bank, N.A.July 31, 2009August 20, 2012
Integrity BankJupiterFL57604Stonegate BankJuly 31, 2009August 20, 2012
First State Bank of AltusAltusOK9873Herring BankJuly 31, 2009August 20, 2012
Security Bank of Jones CountyGrayGA8486State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Houston CountyPerryGA27048State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Bibb CountyMaconGA27367State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North MetroWoodstockGA57105State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North FultonAlpharettaGA57430State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Gwinnett CountySuwaneeGA57346State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Waterford Village BankWilliamsvilleNY58065Evans Bank, N.A.July 24, 2009August 20, 2012
Temecula Valley BankTemeculaCA34341First-Citizens Bank & Trust CompanyJuly 17, 2009August 20, 2012
Vineyard BankRancho CucamongaCA23556California Bank & TrustJuly 17, 2009August 20, 2012
BankFirstSioux FallsSD34103Alerus Financial, N.A.July 17, 2009August 20, 2012
First Piedmont BankWinderGA34594First American Bank and Trust CompanyJuly 17, 2009January 15, 2013
Bank of WyomingThermopolisWY22754Central Bank & TrustJuly 10, 2009August 20, 2012
Founders BankWorthIL18390The PrivateBank and Trust CompanyJuly 2, 2009August 20, 2012
Millennium State Bank of TexasDallasTX57667State Bank of TexasJuly 2, 2009October 26, 2012
First National Bank of DanvilleDanvilleIL3644First Financial Bank, N.A.July 2, 2009August 20, 2012
Elizabeth State BankElizabethIL9262Galena State Bank and Trust CompanyJuly 2, 2009August 20, 2012
Rock River BankOregonIL15302The Harvard State BankJuly 2, 2009August 20, 2012
First State Bank of WinchesterWinchesterIL11710The First National Bank of BeardstownJuly 2, 2009August 20, 2012
John Warner BankClintonIL12093State Bank of LincolnJuly 2, 2009August 20, 2012
Mirae BankLos AngelesCA57332Wilshire State BankJune 26, 2009August 20, 2012
MetroPacific BankIrvineCA57893Sunwest BankJune 26, 2009August 20, 2012
Horizon BankPine CityMN9744Stearns Bank, N.A.June 26, 2009August 20, 2012
Neighborhood Community BankNewnanGA35285CharterBankJune 26, 2009August 20, 2012
Community Bank of West GeorgiaVilla RicaGA57436No AcquirerJune 26, 2009August 17, 2012
First National Bank of AnthonyAnthonyKS4614Bank of KansasJune 19, 2009August 17, 2012
Cooperative BankWilmingtonNC27837First BankJune 19, 2009August 17, 2012
Southern Community BankFayettevilleGA35251United Community BankJune 19, 2009August 17, 2012
Bank of LincolnwoodLincolnwoodIL17309Republic Bank of ChicagoJune 5, 2009August 17, 2012
Citizens National BankMacombIL5757Morton Community BankMay 22, 2009September 4, 2012
Strategic Capital BankChampaignIL35175Midland States BankMay 22, 2009September 4, 2012
BankUnited, FSBCoral GablesFL32247BankUnitedMay 21, 2009August 17, 2012
Westsound BankBremertonWA34843Kitsap BankMay 8, 2009September 4, 2012
America West BankLaytonUT35461Cache Valley BankMay 1, 2009August 17, 2012
Citizens Community BankRidgewoodNJ57563North Jersey Community BankMay 1, 2009September 4, 2012
Silverton Bank, NAAtlantaGA26535No AcquirerMay 1, 2009August 17, 2012
First Bank of IdahoKetchumID34396U.S. Bank, N.A.April 24, 2009August 17, 2012
First Bank of Beverly HillsCalabasasCA32069No AcquirerApril 24, 2009September 4, 2012
Michigan Heritage BankFarmington HillsMI34369Level One BankApril 24, 2009August 17, 2012
American Southern BankKennesawGA57943Bank of North GeorgiaApril 24, 2009August 17, 2012
Great Basin Bank of NevadaElkoNV33824Nevada State BankApril 17, 2009September 4, 2012
American Sterling BankSugar CreekMO8266Metcalf BankApril 17, 2009August 31, 2012
New Frontier BankGreeleyCO34881No AcquirerApril 10, 2009September 4, 2012
Cape Fear BankWilmingtonNC34639First Federal Savings and Loan AssociationApril 10, 2009August 17, 2012
Omni National BankAtlantaGA22238No AcquirerMarch 27, 2009August 17, 2012
TeamBank, NAPaolaKS4754Great Southern BankMarch 20, 2009August 17, 2012
Colorado National BankColorado SpringsCO18896Herring BankMarch 20, 2009August 17, 2012
FirstCity BankStockbridgeGA18243No AcquirerMarch 20, 2009August 17, 2012
Freedom Bank of GeorgiaCommerceGA57558Northeast Georgia BankMarch 6, 2009August 17, 2012
Security Savings BankHendersonNV34820Bank of NevadaFebruary 27, 2009September 7, 2012
Heritage Community BankGlenwoodIL20078MB Financial Bank, N.A.February 27, 2009August 17, 2012
Silver Falls BankSilvertonOR35399Citizens BankFebruary 20, 2009August 17, 2012
Pinnacle Bank of OregonBeavertonOR57342Washington Trust Bank of SpokaneFebruary 13, 2009August 17, 2012
Corn Belt Bank & Trust Co.PittsfieldIL16500The Carlinville National BankFebruary 13, 2009August 17, 2012
Riverside Bank of the Gulf CoastCape CoralFL34563TIB BankFebruary 13, 2009August 17, 2012
Sherman County BankLoup CityNE5431Heritage BankFebruary 13, 2009August 17, 2012
County BankMercedCA22574Westamerica BankFebruary 6, 2009September 4, 2012
Alliance BankCulver CityCA23124California Bank & TrustFebruary 6, 2009August 16, 2012
FirstBank Financial ServicesMcDonoughGA57017Regions BankFebruary 6, 2009August 16, 2012
Ocala National BankOcalaFL26538CenterState Bank of Florida, N.A.January 30, 2009September 4, 2012
Suburban FSBCroftonMD30763Bank of EssexJanuary 30, 2009August 16, 2012
MagnetBankSalt Lake CityUT58001No AcquirerJanuary 30, 2009August 16, 2012
1st Centennial BankRedlandsCA33025First California BankJanuary 23, 2009August 16, 2012
Bank of Clark CountyVancouverWA34959Umpqua BankJanuary 16, 2009August 16, 2012
National Bank of CommerceBerkeleyIL19733Republic Bank of ChicagoJanuary 16, 2009August 16, 2012
Sanderson State BankSandersonTX11568The Pecos County State BankDecember 12, 2008September 4, 2012
Haven Trust BankDuluthGA35379Branch Banking & Trust Company, (BB&T)December 12, 2008August 16, 2012
First Georgia Community BankJacksonGA34301United BankDecember 5, 2008August 16, 2012
PFF Bank & TrustPomonaCA28344U.S. Bank, N.A.November 21, 2008January 4, 2013
Downey Savings & LoanNewport BeachCA30968U.S. Bank, N.A.November 21, 2008January 4, 2013
Community BankLoganvilleGA16490Bank of EssexNovember 21, 2008September 4, 2012
Security Pacific BankLos AngelesCA23595Pacific Western BankNovember 7, 2008August 28, 2012
Franklin Bank, SSBHoustonTX26870Prosperity BankNovember 7, 2008August 16, 2012
Freedom BankBradentonFL57930Fifth Third BankOctober 31, 2008August 16, 2012
Alpha Bank & TrustAlpharettaGA58241Stearns Bank, N.A.October 24, 2008August 16, 2012
Meridian BankEldredIL13789National BankOctober 10, 2008May 31, 2012
Main Street BankNorthvilleMI57654Monroe Bank & TrustOctober 10, 2008August 16, 2012
Washington Mutual Bank (Including its subsidiary Washington Mutual Bank FSB)HendersonNV32633JP Morgan Chase BankSeptember 25, 2008August 16, 2012
AmeribankNorthforkWV6782The Citizens Savings Bank / Pioneer Community Bank, Inc.September 19, 2008August 16, 2012
Silver State BankHendersonNV34194Nevada State BankSeptember 5, 2008August 16, 2012
Integrity BankAlpharettaGA35469Regions BankAugust 29, 2008August 16, 2012
Columbian Bank & TrustTopekaKS22728Citizens Bank & TrustAugust 22, 2008August 16, 2012
First Priority BankBradentonFL57523SunTrust BankAugust 1, 2008August 16, 2012
First Heritage Bank, NANewport BeachCA57961Mutual of Omaha BankJuly 25, 2008August 28, 2012
First National Bank of NevadaRenoNV27011Mutual of Omaha BankJuly 25, 2008August 28, 2012
IndyMac BankPasadenaCA29730OneWest Bank, FSBJuly 11, 2008August 28, 2012
First Integrity Bank, NAStaplesMN12736First International Bank and TrustMay 30, 2008August 28, 2012
ANB Financial, NABentonvilleAR33901Pulaski Bank and Trust CompanyMay 9, 2008August 28, 2012
Hume BankHumeMO1971Security BankMarch 7, 2008August 28, 2012
Douglass National BankKansas CityMO24660Liberty Bank and Trust CompanyJanuary 25, 2008October 26, 2012
Miami Valley BankLakeviewOH16848The Citizens Banking CompanyOctober 4, 2007August 28, 2012
NetBankAlpharettaGA32575ING DIRECTSeptember 28, 2007August 28, 2012
Metropolitan Savings BankPittsburghPA35353Allegheny Valley Bank of PittsburghFebruary 2, 2007October 27, 2010
Bank of EphraimEphraimUT1249Far West BankJune 25, 2004April 9, 2008
Reliance BankWhite PlainsNY26778Union State BankMarch 19, 2004April 9, 2008
Guaranty National Bank of TallahasseeTallahasseeFL26838Hancock Bank of FloridaMarch 12, 2004June 5, 2012
Dollar Savings BankNewarkNJ31330No AcquirerFebruary 14, 2004April 9, 2008
Pulaski Savings BankPhiladelphiaPA27203Earthstar BankNovember 14, 2003July 22, 2005
First National Bank of BlanchardvilleBlanchardvilleWI11639The Park BankMay 9, 2003June 5, 2012
Southern Pacific BankTorranceCA27094Beal BankFebruary 7, 2003October 20, 2008
Farmers Bank of CheneyvilleCheneyvilleLA16445Sabine State Bank & TrustDecember 17, 2002October 20, 2004
Bank of AlamoAlamoTN9961No AcquirerNovember 8, 2002March 18, 2005
AmTrade International BankAtlantaGA33784No AcquirerSeptember 30, 2002September 11, 2006
Universal Federal Savings BankChicagoIL29355Chicago Community BankJune 27, 2002April 9, 2008
Connecticut Bank of CommerceStamfordCT19183Hudson United BankJune 26, 2002February 14, 2012
New Century BankShelby TownshipMI34979No AcquirerMarch 28, 2002March 18, 2005
Net 1st National BankBoca RatonFL26652Bank Leumi USAMarch 1, 2002April 9, 2008
NextBank, NAPhoenixAZ22314No AcquirerFebruary 7, 2002August 27, 2010
Oakwood Deposit Bank Co.OakwoodOH8966The State Bank & Trust CompanyFebruary 1, 2002October 25, 2012
Bank of Sierra BlancaSierra BlancaTX22002The Security State Bank of PecosJanuary 18, 2002November 6, 2003
Hamilton Bank, NAMiamiFL24382Israel Discount Bank of New YorkJanuary 11, 2002June 5, 2012
Sinclair National BankGravetteAR34248Delta Trust & BankSeptember 7, 2001February 10, 2004
Superior Bank, FSBHinsdaleIL32646Superior Federal, FSBJuly 27, 2001June 5, 2012
Malta National BankMaltaOH6629North Valley BankMay 3, 2001November 18, 2002
First Alliance Bank & Trust Co.ManchesterNH34264Southern New Hampshire Bank & TrustFebruary 2, 2001February 18, 2003
National State Bank of MetropolisMetropolisIL3815Banterra Bank of MarionDecember 14, 2000March 17, 2005
Bank of HonoluluHonoluluHI21029Bank of the OrientOctober 13, 2000March 17, 2005
diff --git a/doc/source/_static/df_repr_truncated.png b/doc/source/_static/df_repr_truncated.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f602703587613cea6ab354ad2eabbb103ba72f4
GIT binary patch
literal 8040
[base85-encoded PNG data omitted]

literal 0
HcmV?d00001

diff --git a/doc/source/_static/legacy_0.10.h5 b/doc/source/_static/legacy_0.10.h5
new file mode 100644
index 0000000000000000000000000000000000000000..b1439ef16361abbc0756fbf7d344fd65d8a1a473
GIT binary patch
literal 238321
[base85-encoded HDF5 data omitted]
z*p$mki+#Rv>eQj<%eEp!`jxXkUAToPm{JwQRxG5XSxq7qI=69=V!9~f{ zNr}6k!{p)mZ@q02>FFlQM(sT&$~;UgAsYr+`dmEowidd9a|S=AbFjF!w%y*&q-zM^pY zniFUw=Zi$%WQ4jj!0JOuy2{6|z%WMA=Z!l#aE(ACx|2lS+(AKcXgO-R`&IFL=xgE- z8Sy-Yp65w>R8nxd8Sy0}ALndr0M&jUzs+nc02zyRJ&3Wn!VMbHEwg}cmrPX(7_78e z=I#3e_&6&0w$JinZqSHs-5-I%(u1ntE?0h;nxXkHCTCzlJBo$T_7bAz$ZmMA?XL->R_H;hRdp z;MphI!`D>8QMU&D7;>M3J}1yfuMNHE+4#Vy76zK#+8J=E4qTsS_dWi3EPacuSUr8k zOkA(Y_?7gZGmlR>`P;ITrM_9PD@bZ)o7F>Z(1`BTj!m*28{R|z5cAi2-133Y^^Z|& zPV>ajN-9w{ve!;5nYWfztRUdT|(T!pA8zRSBf2@U~v8PtvBajuMud} z-cfrJY+qEjz?Gv8Td#Rv2@C>5wFppjlH5PI=etECXQeu}*6fo9T8!o`opW+&03wMUjNnT6V$@lDTt z#r&ULaQ((Y`3YT(;LxXjrH1=BXhi~z=)V1O9egCrP;NauK*u({9{hT!oV_=WD>Rbx zyqXz1%-f5=bHn|i%U{)lRR>R9HnVS}=NeHqYHuY*^frEx6sRc#kI6-rfSjr->8_(3 zG}EaoK_?O246HV5iXddvgX@Ct48?;!!Q{2sJu07@veCRmd;S1;*rEc6 zt*{FXSyKoPjnA4I+t@~5_Ku}WTW2%rK496XwDlu6D&q-+HYdZCd(9_Go{gX{n?R%S zdwJ@^>m>$C5M(Q{bVDhZYRW zE|~FIGQ>S5lN&VB=LP9kO|)M%z{B+`NLyd6jysW21k@>*MkfHcFpF?{~Y_JiVgWGEn z*S?JXvZx7MSUBc}ooOMME-hUx#&|_fH-ScUUv_z=Zd{1Pq<*|!si|56vM;_!Y^=M( z4I1_DueBGVO0zzI#)w=KUbI%PTsz_9^!O9ppiz5&7c~*jSIULL4KbHC-f4vA((2Z{ zY2?sDeFR(FnRM%{db>k;X9?Ic>{$Mc_&1<*QTjg1$N}^tFo8yLUd2;%dWCfo7|b}F zV>jjpEGT%gGD@x$(2m~(8nyS!FCR4w?xw;`SyMJ$(5r%jPQ6|lsD^P9uW7^^f1@Ed)GaPqDb>*Ko>^u*i@R9IX4hCR*1eaXD9$vyzR9_zn(q zJeo5`stTl4jW6TT7|80qxQ~EpSJJIGvp}=-m+!U)k9$akR|okAAF%l zcNOKdFzKGIcC96HOfksnT)y$_h;-03h+oOPlR?V$u3_3xjJ z^FHmd?f{bDqP>gP2k6{ayO+MLhvGqg6IsF;*o`W-de8&;p4D6}e0dhCTji@xzxTeR2gL}o?ppjk^U9fQeNT(Q3zI=>y4ORyY9$AYBw_l?{ zqY4-JwN*&q_PKw)c%tj(dbs$G#K(6(zJlrR`vpz7-^>jf-NyFF5aEv&`S5fEcGz}P zCTyKM;nCD{bef4(Hlq9P`Q~fU!#{wahW4(Ln$5tNarn4^$3AY#M(rIq*RB>DQVcGX ztY?IKyaHP#Mfq**=y;CGX2nB?he*^_l*WO<1zo}8_tk^*5l>3}vl%pFC<~4Bn!XVb zk2{tEJBosz-P+O$EW&gKKW(7%fV0p@&JT*2<~w$lfHL*m(lfH<@MFOTk*lV3^D!10 z(QOzOaJIoR6O8-VdDWsd4`f?Uo@7{0HxeN<84tx6VvWYxP4nyFtwp@q(hZeBRN0_Y zK!^j~tazyXy;*bb`!<0OD}?lvPS(McV(*4lQ@W@Ft8CQXDj`p!r@2*uoR7_ei{l!> zz8TZK=Ga8hw>MEXlJgS*5h^JU-hqB_^ZdattD(w}lF{;RbUsZ~HWd$LoOiU$+L&Aj z!^8(*>GHwN5*dG=L`W{PIXw=?Ljj}>ZAJqXp@qS(lN8bZ7I&;em zLpkuAjE6E9eE!0G>p#_jyKOfD8bSn=g-S(!Ovs_r$f#@rjr6&l#<6`$-Zh~AbNL{( zxE82AVgBC3Tj}gI78;FX=Z3dVG%#YsJOlAQ{^3`khgJ3=Uq?Ef%0i>|j(TOeDKzsn zsIOhf(>S0E1|*z0o@d9QY$_g#mF?LrqIx6>I%imJ5kAlfM~LR`b`7VSL8G!+@lcm= ztF2#PE0}-VM1s$_9*&i7&A+{kF51FEqxMeSZ;|AxTLm3Pj1tSamj_}igRB|f=zLr( zG@@H(#^KbcsuD0sP+`^$DptOO){0HL3re4;O#6%XCGDC}nbm`JE}BPGwtt_&VueJ*3k5;{K_zhWhwy{^wb3c5cVLFg$`=V~bfZ5i zn-veO$=@y))%5~Of8SP!4Q>WQkK37+2ei<)7=cE9rn8|V-+Zqx0hU9c-Kp6Pp#93? 
zs}oF{=xa{KLm7-u2U0fa&TIg81I$+@C>4T+*>!h;c@>R>vfx;bpLNVrzz2`Z$Vrm(9C(s%1LwT zn&HDXnVMs3D#7gS)1^hj=w>x6G#bC9_iAm4(D(t&G?&1W-*TYD&tFF>rqZp3VxbY; zHK+T%@PnhESeJx#d`unW4SMYSup@(>ZZaN# zbAv{e*gI_F(Nz#Vo#(93G+d|=o)NgPZ0sUBPZ0tUWg|HcJKR5A$Ug^`y-nC+7r+}c z@YJ`eexh-ZwttiHPzL52{~^7#G!@)dFj}-|QZbxvl^t_L{4;&c$#^KnC_QQS#dAeI ztO=R1;E`@7Fqpsl++7zAy@VAH9pg87-JD2-KI_efXHjP1#7V1*=}T zy1uvwgzO3*`n0PCxJQgjv>hMM4I0fu!$;Jfce+~xC)^!=#ARdy7%um|3PVyu%QY$< ziVZP+$*3Ch5KK*ceS^oh4u(6Vy{J0Op_dS7G=7I%%{scxuNpWWiX6LO@qm~ydLIwX z*iEg_V#A!_!8h#?s@Ivi+E7KM?Pt^ULif*R6G=GtM_?0sHy=>I5}_4 zI=v3KJTE7B$P*`eG%FtZ*dv2)Wa(SzH;n&s(7|_bg}XxBTFo!?TqEP57{h33ou9qJ zYj{vsaJ(xYrd-$fC5lIv!`y%s53P7KP-t{{0jQbscxK4>N~mevIBJp$hgl7QM&q~j zo%p@Gz88ZlMpNaMq9=d!$dlJQUmV?<(1+S!M-KqqX$fP<6DL1g|tUtJ>( zGYbNZ^x9r)HDAW4Pw=bQ=e2>uGC||hhcxRbn!7HEo3~Df!&f9|ICQA1&@cF`JRlPBp3NGd%dR&o%bD~$#^KnI8$8HIjK4w zrk(NAJ-@60@E^RlZ`wsVp9c$#B&9*LgD(uOXy{diicwU zYo6~^{8l{`=xQP z*Qj_XgO@S!s8?YVYy*pL?XBqqhjoi9Z>Foz%Q=BYbf?K*cd(t<4!jF)Khv961%3Co zb!?31FfU=nLnoiuexoAiD;&Ii_w`$^OJLQq&Tu)uTyDxndTrCyM)^ms-{3;GS<_Va zdY>B4)dlXt&CvI|LB{tW4z>eo73e2?+SKrlgN04T zLoo(#G{5nu-K8)#N#l-E zCvW61hLZ763=^9$&dTpg32Znf2WlJ9`w9{w9p<}nu&~K^D8?9ct?b4Q$4dCr?aiqV z7m`3%x#^UETsr?b;u?WQ4FVt*PhG^#m) zMms}iYJHfKnE>y{hOC&KR|mHl4E7qYNcXG*3ytJlpLg<{XXBp0*Xx$=-*WaUFs%I9 zx$6{%`8O*b`uOw>rP{Md8$NKqsdzIJtSDZg7-3sRFXs|OJQRKFNYd+Ja_t*9C&#<^ zd~poi>}Xusu!w_|OQ6ww%=X+(N0G@bz-ejc_LA>tL`uJX=<#w6BQSwR^O8lGX}=a( z6oTWsr}C+fX@cRW^44?~a*!4Rjr6%$xc$M!C>9%L$?tBL*$fo>PaW}a4~O|T6%WPC zqHc-HPG|&5l}ATd6ug3?bj{{We1_4tHyIDbFbH%c6Zu==`O%dlijW^U_xZV$mvtPp z0~rrxU@3yG>lxQ8(VTzs8OfQI@Zf=%pk!gXr!-v>2(Vk4`pEMmu`JdnI{PFGj0e8+QjN&ZJwXiCT<|hoUtHHyVv5JV^yz*PKqfwif`C zfW|ZaOE}C0S@FRyBcTI>#0qm#qhqWvA_BTR3<`S@BS*^n?X7zLtP>!_M!O z-B|?`CrN_lDb@5`Bhbh`zrJNx{7K0hP-l}wpvU-1AZePnS9s2M`t~N!sJ-iE4%wP$ zk_alFCnxLADuI*Eg<0rZaD_&4zM>?|PItj)Fud^m9p%;suxS5~XwOxV^fhP2LqE3i zR}Bv>gwLj|21Vne;O6BCbyh+t^ff2rp%@mv=hbe{rL8a_Z1?1@v<|51BN{r{or6v# z(1`B6yiVJWjcfwNiv$9&qu*ip5_75CIJ&qXk`@Au=xz^Hd|n+_4jlG4ZP`1l1X``w zS5{`iA;Lw)LmB6C7G2=oSP3lVD>CM*wSaA>;m%E*v?41WD)spDvSlxGz~pgyYfUBy z!~{kh`xr0rou21pJQTw^RXZd#zrKgtee?zo8}ec4nz6pM^W z?-zPK9wY|cSDWEp3gh-TYJU!+vs+NvgS2m`8*Js6%QTJ5+i)4?lHX4b^Q5OvvkneKF#XQn4k1qqvD~AFw9d#RWw=Us?5fiG3zy1#FUgWCpbdt{VkI<}m=(q>D9`WLbX z%&me=Efdui%4NYP`hm@j5h?UMC(wv)vw^Qp9n>uYS=U_-Wh$kDNX^+(-X+mJU5bjN z;-QSiI&v?6J~|7H9L5GJrtWpw^C#^u$mePDTNpUVa zH{^qvN&0J6Jg2jI5SomKqGwx_B5Gx{vq3}jx={4;OE_oP)JaDVa*$L4jpk!l8%vM8 z&QAid4+o`&<`saS7w24;4|z||H3E(Lcg&+lsmEO(0{pmq2y#oHt4MnP81 z(R?gLp!Hng=u+4)|4R9qhDH#wRW!_?Kiz6agr?%5C`Ogwx${;g4D%JcI%0qyG(LAI zMEL{NKyI zT=;ynv||+Q>KInd(fB=Wn7c^ysdjKA$zJF7t@DgB<&OA8ebWyt79XRdQ?g1aTQtyJ#J16PF%mlog6 zhO}cS84qP(Q*t^UTGU(0ux^Wx#dxq-F|Rp?@|sKp30ve&kH z6#kIOsK2JhYhqhe_P87I<^U{4VdbR%o~=>GWP34w6d7 zLovp+8NBkPt7G6(TVun=Vuj$=%ZHbeuG9GyQO(JCC!E`e!R?Sg+YYl7}8}Oh5-d!@`plEOk9CbFSAtRYXi?QON7xFKZ9QyDE z4B0A^aCGN87`OSwCW(*N=p~gE4=p#Db}>n=9qjKao+`I05B6_tcV96jlN&UW^E#>5 zvUA)Uq0K10HD*&1fqua1O}<8SaUw)F6%S<$2@y0%I`IJrS!)hPb~+d{-~^A$nJ4r- zXT?K5?X}y!KCu|SXc^ii)E{|>ekCb+iMDWqMs|zDzF6%8AF7~n=!wT&1ubBXRkYv1 z5p*+2RC885bZ?+u>CAKK0Dd>xo357uO^2@vI`ZQtH)W&ods9MQf?r}IxT^WWTWnJy zcrP_idEKHmdNeB@nlrLC=ego95G~oif#S@F(&8Cc7y8RfPkBfyya<0*( zzPRH{BaA#3kgM7l3kOW9nVYuo5Ixlf*U;>TgT=1axw!=%CfcKq_;mLtNz;+RnCjSK-xJJf9 zF|6P4F|$0zHp3|s`R!s9^I^i%{kh|oR?*j-KqI;X)>NL*UiA(>6K=eERQVztZ7^)@ zg&QyEYtD*?3Tf{@ZolU}m@P9fd*VSAo#zPrc;pe#GBb8;qy7Inw z=dd5}dqH}*RSXB)fsBWuH}1%XA5QRm38rcvD+yc17h_h=`)cBLy7dmIY$_hgfblCI z#ecsBi{1oZVECqhSL*WemUHUqxkkl9u@UVpdFf*5Fy`3!le09k;F+V2mN5(He44DX zk$t{skn5V#V-@h0N0|(ErxNgeTX(JXDTnc!iia}ZHJvUiUt0{#@~anb_?!%mCSb2a 
ze{mSUS@F=H!@o4kpf_qAF+!1tT^~WD;kom=j z;42N0JdITa^wL7cLoqDYT(M9ssR6bPGyb@-b{AZ5obRBm);D^dlkrdncH~p$Sd$S= zV9X3b$N44Ya9e43)~yK~xJJc8vD_hrJpL*HT?6T?UX2nBaE8kdG5tRf5ON%xo{w#n;=SS~+rCLId zrsAR4z4=mFk2kf!AcMU7x`)2Q83Jd23StlFxkkl98AJI3olY)BtJhP#c=Ri>LE-gf z{Y~c4tuaCEO`wt8GV48m?w-N*&|fWk)25jZfX3~sLj)&quzJXND1+fKbj%YJyjT8a zZuErzav@NC7Fkh%z6e086^W>#3Wh1%oe=kYcQ%Z+&kR%lUJJ+QB?eh}aOj}~8rdxe z#BYWcZE1m{wKVLW%zq0E=WoCFf9>6AT#Z}UKk&UvnUk5!awtPIDZ{avG^bHWNku9t zB~6ksLuMlLkSVi_8M4Z(NOMZ_sChRh(zBfHoLA?z&#UM2|NNidE3d9~t?PTOweI`2 zlEJ~<`x^FF2jj-5TL*Jj??0Pus=U4nW*=}$CQ#5;yg=(8Gex4acm)8a-9hbO8 z_h%*ezV{LQzTf}chnCb%EkE8rhe-8yjFG)`pMJlp@Uc_-cXyJkWB=TTesKRD6}|O4 zjotdHM#Vpy8mZi@67v!9=JD%3l*hX~v{7a%d-{Lc?z}!@Z5ovu+G~U9```VH$v?T- z_xFQ_j!&CU@HH+gq;0(3+(1{<)J&Vv6!-gn|8pOzxYH%5uLDnG%Pc=>@xu)?cTPYA zpZy&AZ&%B&`%oV5aAJU&Oa3XkPBv`+*;D~Ft-Q5&iQa$8&HA0C7k|;esgABU?o#cp z*hGSQw%ZyV7xC8f>pqmnugg4jCLy|zhDLl;Jj;InT<_6Syxv|f{Qb=Tz7OR;?mvHV z?Y<)Fn#z6@J0XLV-H!5}RQ|i4Ut{LK?n8Mz_wdX$mX{mpyjdx0?h^rBkrY=hyY>2i z$<69ss36b#F}{{gmv2>>EF&N`#og|HnBGEuYyRs#l*c>MwR*$Dr~)b(t2B7v#dsQ@ z6zYD=Uc@^8`#zLs6{4r!*zFUk)HZ8+eDo6aTdkq5Rd(a|GynTOl>N(f@8#4}Kvi!_ z9+ypMqN_)*i;E2v@m2HdK9u!)NQTmCN%ns#`E)t=Mo>!{txw*3(YyM;ENOB}3e^#8SlR@asO5&u_bzZJTc$PizOKO0K zhxSxZ_tB5fq}^U7>h+sgOm~=F36tt8Vu${^4`shetSC<<{7xm+PMi99mu?f?v!J(( zjMDG^T@71nf8U4lznzU48K>Dq%8LJp?ntO1ZQEbgE$S`e&Ewa7D37OLcafiUF z{F3_oJ8cg?&`Rlo+jDyq5w$%xCau_&^k1UcxfZ=n&f)2>5)xY;U@$xV4Rt1OpU5>Q z{(iOmb06wHL~3rWTnzcVLp-J7csZ4hR+$&0nDYDFzwSeMyd6C}GsCh>S8nZF}U;fL?*>}koC&?oQZY?BG zL4VhM_LB;N#@8#xKIc;VfA2T=Vnz^v00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHaf zKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_ z009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz z00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_< z0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZaf&Y5~g+6T+vb{C= z{G%hC@`q(5k$rY2Hg6mwAm;5UL4LW7g#XJ#?AM>po}P{l-aMW#P>au3+i_&<6aRCh z-eZr1*`|7uGeXL_>3$jgeV{gSWsp3lPz z6{k0+&~tNoJX~oVOXrVmmDKDS$eD4P;$9l}6X;z&g)3Jh(Ikq?~e_iqrzY0k_+bc zC9}Tg(8w+iY-~=R=TsGDWPOy*?lmiOODk>D{V>FRS~VH{aJr^T-$D_AI()vivC+`$ zE|p~Jpoxc;ITsS=>;qk#@(MVoI^wb6xIccb)wX%5Wa{&T78qdwM;iGE4lfY8Q6WZ$?dq5r(Cl8GCqilOE63D!)+7aL|+W z{MgosVa5+{@HAFvY)G4JRU;yh_0h3&Xjn?#dve25>B{O;Su|^TcmjQ$AR>^h)kdEu z8#8)+p&|1&d>pm)8*To)DJ$BwNJJppk2~Ax(^#=)`t0naRCA3S!V7C}nxs-nxVF`z z&XY@i*2tUc)24WQ)GK;-{>Otim0hWmQsLmL&!PfZAI-I1N;fTQqz&iBsg86lChoK6 ze*JN&oO7zejI72~KPs6Lxtl7u%rX|F=hDl|+k$%*)rpuYTdRDlhr3?8wv!7wc9&nv z7ZKN-_ZDBaa&N21{famKsxd)5`h$MbO4?t4*L6&!&zIk(~D{1zP%0ZHWHkNl`xR4Rpl`?XLL`ekOlLeDleMzm1qsQ;uc%{ofAhG z$ojY^V)O%3-F&)yzFhyD@2TX-;A9!?@Jvpe)-kWYu^RdONTT~YhSYX#IB9Qg5*EtQ7^sIV1U+>^ixu(y=D{RSujpw;fIblZD$12|RciEbiw6E>V z@11>8sAkAKsiGBWoHYsqb@;p~CdcI5FNtZio4twN7#>ZDgG5p{d+v7V-nB8Dz-r`G zYHf2moR>&uK5XzfdZCsI?DG92!;&~N3IkamJqPz18(WZ1YGWkl41bhO9j`a<^h)K% z5eBkzHRPa}MdpZVI=|jnb>`&;^6*7VrPHN+PE}zbJ9FAQ4;o(F-9(4GuX7shdx9>w zcQW_3lYn!o$}5IUm-~6<@Q29v{d09KH9bDUGvjS3k<5HlyWu?dDJKkM&qu$N^)p7_ zDXc99qk**7QmJg3n|W2{W@Z^@#?5Wlx3U`fd^zodh z`SYIJ?mgQ(iQT$|Hhcbv;>;)vWczW>*YADyc5kMA{}@#0E6dXutb9mYDy3dTAbVfk z=O(Ec$a_U>#&2s(l=whr1xf5!Ggu&Es%))3daat^;vY_z551EvA*iOa#!Swcs#DAv zIC@yd82O)T)lBw@l=Z$6dSsqnl!j9&>Hnx;lX3rY5rJAfo(j**LDj#LZho^j=~zky z705~_ZVcvrA`3IJK8m{xtm*vrhI%bKljpoLg(fTV!uGAo5|NSBxFMaixtHhBOPd<^ z>)K_};6A>e-Oi?Q26|q1Tg__Z^Th(UNp5-EM#!HkP{m*AgH9Y#W`zAEY6co)380Dk39$c35s$BVj97M?y=SY~v235p{`25gD7rG`Jq+ z!i?%C?pKc#}!h zG$Fpv9OeG?oKqDBvi%rztXRqXRxGg#zol!KUP5~qZd%#>k~rt{Q5eW-EOS07e&I?z zIe4vIEnKyPtV$5Cdv~2LB9OfUoZ6^9_4tBjk`SNLxWvwv6u8FFZpY&|r+Tga=~Y%E 
zkGG=v?xTWY@z^1AR@^_eyn&v#iGM8dxQ=qE3Io}GOxNuBZsN>B8g7=?_m=aAs`ZZ2tAH8U=@TTfUBZOFtJ6a@v!eOcy$jP_KW*{k|#;WNWo=U$w{2 z`_+?8)^!VlFK3YPp~;iZUak_6k)1ho8t-%L$ZF!f@K&8pk2-pxRqczTv8buC{ivE{ z)_dLgR60OB?9iV5C3LsgT>I~rpK)e17*M7+>1Q9~x1Q7QWmG{=Y+JN8Ix>N#O!5i# zmN`kd`X~%!XO8jQh`FC+TWPpnp0e!RFc;Z^e z??l7n@Hwpm+%*a_vS&x11rlwB&l^e7YV{jpFG}c;e$wJvK~0=fJ$f9aw^~FXtI;T4 z(O$4TkB$p0eNvxONxltf^&~&ui3ntUe5V|oY~j~L>rX}R`<_%qH@)U5hsdW=u6Beh zQr|rJ=d{5iiN3F8uqS%(%aFgn?|Wda2%;tbeGHJTn`;MyVr* z$cI^X>vy1vbE?8XwjX20yna`=w4AQ#J$do6DXG+T)~)v;L*hjQvi&GHXk%l4tb~l{ zR`G4{cL9;PpxAfpe2xMz@dv1r;-GdDzf2q{-i-kMrg0Z_KE=fP;43rIW7&YbR^HEA8QOZWShP>52 zpO&naNj#I<^Ur&8XA}mq8pmr7%(-HCk?iyQR2OIXie7neCT>z4_h+CmkoEDSUyRhY zfL+vV;K0#Q$Fk^y<#LPa+PN2iFp#ZP@2fE<7az^18)n9w*r=63-CF&(_bSSuTx-?4 zcgrePBaiPL_hs^kh#I=n&Chsw=tp9j!Jm`q%KeQj3}nv^DXV7nS0?2&;a#@b^RNce zf98Ztm6nSlGP36I;G)T)iEraX1hN_xw&?cRKDC-I?|1Hw z)$=cOT-nvRIUBjZJK|1UIm;r3Io}hqj&U6ukFE^MD;9{`!h3{q*S3xUM363=EmB?gER|C+u(}z0k^piD`6n(W1dxB>afLCbgW_bci&`h(4kYB z&$=IY$%!KjWNX#+m~7?BUEip*_Hh}L2m#p^CoMLkphQF<>tofBC)4}SswH_f-#6}U z%%j6gpFHR@G?z2bc&*pmsXzDQXW1vd?zSEDnR9brmFj%5-${D(9)5Suw?|!bd* zhSq`K+llo+_w&6huLELZ1 z!a&x?%eAp)#+TQUH68vu$q2(l(mv z@AbIn_%Bo{!_r`vCil@H%*f81zRT^EV}jx+Q9M>=vG)$;C6A20cRiCcV`cd9qMv=_ zS?#cYGe<9j5y)z+$nV#1Fu#f%?RBH9&$D`BoO0{W=Ml-2 z>m(Ou{Q2w%ReMqKDV@#;te9yYf0ZWJNM@%!$`g^1eOqa(6lXYZ7SP1uW+UzQRg$h< ziSmJH?oZwSdpC8c7y=N000bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_ z009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz z00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_< z0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb z2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$## zAOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;| zfB*y_009U<00RFj0xoKcmRV^n_gU@F&M)@_|P=sW_zt( zqxLTKe;_lx|9%0-5Nn*wHI*C`6<4{kSrjS6Oi5YYO#809`E8_i2lch7H0b&`!7^qb12MPDoA4nfnf|+ljf#%3 zm!*mMGDkLxsB6`_i{I?06S{96#vhYI7Jk2EqT6zVMp~!;IQ*c6V-}7x=KGqs`7JQ9 zVneK#eD}U$S|dGJXI2q$r-q!|E#Y^4{};-CkR@&HU(7Ltg;9X+>KQIdrdq5c51Oat ztqgia)^?k;Yn(w5@$t2CIBF42SKbg$aeLmtQOLAqPMXV1&AjziGmB+qo(HZktfKRe z$Y0rYE|vPlcr8{Nl0q9gqF#R6|B7Q4CSLBA>bh#~?iOqoS7skt7k9da>{;dFqBb#_ zm`~W=`><>Q4VbgC>U4WK#}H-#TGLktdIg#ovmti6kNK!oQBFq9TIq4XtCmcDZnU!f zP!m0$D!=J;)mM%}bOZh8t1A1f)b(UTbc=bS|17PO*qcVlPCd|7!$M$E^Y(55k=i!O zX^lb?#}Lz|&+ze3@iAYc#)b$882Of5Y#@geW;-^iHIf2_`) zGY{}GF`KTbqsN*$|9(G{%Q6kLVCCbldKqP8LC}u`gTZf!gM3+zO=bbdEF4xlu5wvu zK37|r&EnMC$h1!nUlNt0_j7ma6p{`_Tm4(J1@w6POQl(6r5r>mWM;1~K{+7%^*#eFs9JHKuSL+z5Xgl!P7`7Abr0!G@ zfyKMw6SsEI){v`9o}8?s{sp5`jz8ta{`(#=)iXCSSmvN)Y0gIY8d4wB&##JZu`|on z+F3(-j~%dA-zJm}IN`P6?CwJH??VKv)^bvwZth~O$Y$}et$6b3UF{@z!2o}Y_&lmL zB6COD>q7ccO@B9o^hT_Ui50Lc04})6J!&%^Vhv zy59Qc&RWXGR&0p2IeT?|A{*$6TaV7PH>6RO^=`H~ON+_lOKx`sAFDWq(4Xd}s%+%% z@8Q7;8RiijX<%PQO+9+gEE(2G0vEL0nL0L=o~-N8&K#1@F@&D`0v9cp!0FxwY>1aF zHgmU5`a(m7gie@Z`i^vVkN)wtwL*T+=xh!PH7y4(O+DXLW;$$vf40u(Rg!&= z$W$4ag^VvHzN;GEdW}e=A0m!gdWJM}3}Lj;#AemBWe$$ZnMK%Bx%DrGrP27sy+3;k zKGC}hi}Eab))OPgfZn>T>E&xefKB@F`7-E}9w?>qOL zbL*e$qYrNOX05r#9Py5KypwPx1t~0aQgjG{uwF~QRDmG4Sn%%!Dl+)aZv9#p_=4yp z{`xH{_~VUg90p#a*-O83f*>4&hktN|A_W!@L=C-uDfZSqZD-y~X z-ouG2pYl>JB@B)8!$+EMI(CK?6Xf@BrpN~Rlh^&G+>sn}D4$7uGdB)jpQY4J>5@6$ zo0c=Kra8gHQE6_UL^>m((geHHp;HIL#F2yH!HZb9YzVw(EXv>yK0^Zg|9oq+x~hsD zHE`)fQCpk5s;X*qbd+^$Vy7}g=iYju#Ur;^#`9xx^0V6vOk@Oc2?=FYRj#hu$J}kS zI&doQ-ePkC4=WQEf zo(o%L&3mM8FOAI;5)usV?`~O%F(LohSeW4P4-RQX#s2Sc^imD3h*^+0GBp<$f#3D) zBKa7P-9n^Jv#Y`3N^jxUuj2Oh2ma-q#AI>=aL#&)D#0F^fnvxpDHUOZJJXd2soXY!o%$VTyR#^&gf{VfO<)DfC!6Ene*AF0us+?I z2)es|r5yy1>QH2s;eiZQ_}%+9JA)@XIdSgk?G=L|28VI_QI<+k1u#aD9s>u* 
zo97#M*Zcc*3(c;(;wmbioLfsuN`x14O52_%$b7YW_l{(2d>ogGie*h!RyK&5_wJa7 z7!!sF_wL<0gsWU_lQLsW3=@2$~Fu%(dzFFiUm(}FSXS=hV8EqKiW=czdK5kLIk;o+gm!mnR%n-@RI`}nk;98*cRHeGv=Vj>R{ zuWoK`D(qiQHU$R<^Zv?a>?2;RwNj4}!C^mP@Q)SV-VWm9;z|IEQB@__(DrUX5A2Ac z5HL-$n<~=*YsR4zWXNj;UYW*5C)8te^(Le=<9Wh|a^2S7o0)TS+QW?(9o0zT;{}R1 z7b$g+{?Vrgi}gJe0&d**b){NmP*QQ_Vsco)$cVXM#i{6QAKO9?Cp_ZeO75W zt1P%&k<+cs4GRo$otZ%AsqL9D+gpKARS;o{va;J}PV9n(c-3)4dOInw6Q&~>Oi_aH{+~Y+ztC+V?vJLBFfz)~ zDX5B4EAFi{>W|IHhz_Ol)sVr##_qejz5W;<|HocI4=Y!9*f%3a@Ca2QQ#i_~Kl&l` zRD>Z{9|{u_6R4+MYCf6EiV!TGd&%o@nHXGfLz-GK(hnae8iWR!~-lh@Vgdw2aJ_lT;z&z~As{lzSz;YQ|X!4ko6dWt+U>J<64O zw;8ZSA!9I`>0E3u#kR)BF>gjf^eV?gF;&kjn8kB_690jeUuJ}-*HAJ zHN@L^-)&74!#N|zgCw*w-#k@T6nO4X778Aye7Vh3S{gMlC`kG3+dn5x2{I-;nCEJB z{yryvl|x5>>91OPB1@nBP(5u**xm0XvugSoeZp9cYnVR2|;dd zB+)FLWM=r0!i51cAg}vDR=0LUKb7CyvL+1;%~IN|%S*S``hdw|9yT_%&q7|@j!sTL zM@RklLAt{BcyP^mBe#cz{wjYNG94Cm|Cq9%dAYJ~Ie8vyv1W(_)PoHHx4l0gpDuy$ zT=cuY9rF(R^5yYxBQH@;Vq$u_P!{`@0M4Ju>^y|Vd>dlBDT79#JiFgD-)#%4qim+_ zz&6c~xPok>l93}7!%s13bv-j9tOXJn4jS4r@F0+GX0TR;z|!M+I5@`l7J!d+lSj7w zDz48gh_WF*DeeB;tWxw=kb|RomYvw><~@<_-$Q=mzseAekEs_cM&Qx`Jtt?pnTZLX z4>d6+at5fC@|LN<(K-7>I5^@k0`<>3s)QOqnnZak=?5?E>De@X3A=60-@TZ;-e(pP z5J&|Vfr(mYx}-N(dK{ZJkJtpPOosAotjtB$)NoRgk`6rN*c2|S2DRb@I(FBD)Ko^K zkkL`4P5xncX}zspj}Z|?vRT0xt&P4p&FJdz#56Z#B{LBL_7>uen!hlb1?FeZ(4wNE zF3x7{q?(?88yb40rbdWODQM1PmRAh294&YsR6}gqZ=mcrEO#M!-(7BWE9;CQ)MHcg zr(@^kK>e;={c5G9FLk2;7 zcz`@=7RhIG*OX9NRJNUB1f0hCx3iRF$ZT|1lM7;qHnWt8$Oe56cnN*QWY@HteIO`k@d_s)n{cAzn3kycXMq14L0=rz_jO* zZ~$h1^H+kv3wCOJQ2@6w!)CEpPwAc~SoTcOkYb9#a$74~6BzeDI_*`&(a(N!WKw05;)-7Z+)2VF6-_Rt ztDB*dB4FJrgStufKcPKIEXcPnB_+Rn`2r7cY*&rNIIGQXPEZ{YpgNn`kZ<3lLysvv z{@}iNSA{5>_(Duf433jD>TNXE2Dyia$7c~g;m5SJmB!y6HIKkiv#{Xc2wzZx?>L;# zyBX~-w7O=FWQtIEo$vL2q?O9`9v%SCO^yU(XGl&?E-WhQ#9(RhFUasjzr4KEs4_-V zE7L}Ucztgg;_I`s;d*;}FFJ92{o8MM?KJAFNgmVKdA9%pK0~jfQ~%4Zx^DQDi)Zp!Q?0 zfzBt?^h=x=8aB41m{@=_IR!=3zpBwg(FR8A3`D|!pimqtWJvI2R00(f^QFB#>(}C9 zTmphMB>MFlVd0;FL+@Kc9t zNqRCSAejUE<`fm71q1|?-HO53(Ul8%|)(7wy>QOib$>i+npbm+=sHw9P^3TtwOXG9u+jId)G{R=KP7Fj0 z+~%ZX8q#y|5Og~`l;cB^O-syY=+t|;*_5nv#G9HYveMFF5}MA=oM3I0w)6adHa2pA zBPc5?GcYqt+S=O2tg-d6T8!n|ij5kIpMETF!+QasL8yFBbH2iXsU+I16sNLd@+aE*K|ABy!d>9zC?{KyM=kzqH=*`(= z+|rW1;9(EmbhQ}{K$;GJzTsNW)?tl^+zWz-%=L|47@#p`I2fp2^4qn}Hxcnhf5C2w zXvJAG$L()ktJ!SSs2x2pAoKV1N_cI;vQ}jwN!Rl=DIK~qx82DGX-^}7_B8zbqz~vF zgfO_|j-=8Ie-()l_K-S`&T=>e0p|Od{^1Rz( zZz7Xcg;76t!DTaJC`^4N>a?P*3c=hC0jl_w09#++=&B{ve5fE3mf3Hg?~Vb>oOrhW zy7@KLqH~-SA1^J>eA|fZz|6pa3c@vBQj-r4L|aeK%g_@f48oev%mG{>oHiJt;yZ4oh$Zl&5aqkFz3#&dW_lm zazQaE*!1~cJhlCh84eEYf20f4FHTV-l3UDSJ?Dbsku0~KcGgn|Dc#xuU@%YNap;s7*|pBaN&QrY3_P-a4_P|VmlQ9 z0l{sPq>PMAJx*s|pQOf%MBDw|)cc>V1t39!-R#eO9amwu*!Hn(xS~q4w6M6CY}1k# z6${H{(6ezsdXB2RD9~lZb@Q`O6`Z^gJtU!olPnkFu-ezyGS_K83xS+!G!<{mo(#jq zr4gF5xh1l!ZR&4XTU#47=NvNw7!87vkzq7yFRXG&pFcDOZ{CcrZs;W8()7&Q$0=h{ zmX<=0tK0MGE)>ZIweAQ99?E?URaK-YNrYO>DBzO0d3mpK&ePJ!d;0rjB_zBCW>1-N zT3X0)L~dB=6f@?wJ$zKPv}U)-!g&0SY0u_P&v(m!*_K;GY-_u&j`rb(Ub7jf2nO#^ z=(D%eH0KMn1vI!ohxquS(&~Fx*RE6GIqerRYb&6e1FMw)cGRxKd3^(kYkRkTh@Icd z4UNXw{~~!rAP5DI+WO)&76r%c)k8JD3fnI4xGET)cs|`N0yP`;p-!3`1N0pnE4;+S z)2nd29v#3_y%8jIaPUpdGWef&bnS;b?{6O2Uw-+_DvaJ`L^khe+h_mffi`Phdi8!O z)6IHBkPhm?8}W}KNmgy1M?(Ts+9sSx0i1}tC_ka^4Ms)SH{6ns*MNsYrI}xHnd*J@ zs8kJtw3?8u)s9f727La(2=ooTkC{gnp6t1XTU2+b3G-JaIC9{Ne)IqFpXKxb^mmWh z*&jWW5r>o8uALObbp@fCE<2-4;HMG3&O3fy$4MkDmp+?BmCeg@uJ3`SPg`(4g7M ztyIs~R~Y#GKQCJq!rWHN*Wgtb!%z$?QAIpRb%dy`ad%82%h^Huci1?7V zoAn>kdP&I(xeP&)#>U2nxCB+4R!0N?OpqwL!NPLqu>m3WkcMlmrexNPO-;Y#=OeSU z9KdIIZao8K&FK55r>>_0j(=vU#DLupLeo#Cu=@)_S|M(n=uG71wFa}B1sn~wA 
zA$mg|<$D=}!OB^Wb?G7w@BOjb_$X^&DJ`d|>9=j?%b-qw(9&{panZ1{R*b9xqG%~W zT^EkJ%rh-Bb3gVB?84HIWR5Y=JlzX|n$5iSuF3fsoqS3#__4Qd-|7SSGFJ8;6bA`$ zaro!YpC15BGovE{5C;+x(wGA%#{kGE1B@IWk8eNx@E=8y+qb`&liYc9Pnq1sHqgg* zP!t1SHT!zF+KIyTL=mUV+8YtWhZRrP71xq{Lr7#>S5g3%G5t}IIX}xq4Yrn(o11?} zx&hSq2k6n#qW!ze9dI`8C(4!co&YgmyR^9E6q^ht(tu@v8kLl|uLlqpQjBsS4_C+4 zfNPe#_G~nhCbGf87u>Rh>o3rXq&R%Y#?|gsWTcr`3AK zBL$uZv+mt@JZGNA9JOr>uL>Xg;NouQuf5)hTmt-FlCC;sNd7|v9gG`Jr=KHHm* z0Cn<(PE*au`D;P4c5STf`#WWRQ>#0yojL)j3dxM11&|t?&k28W`2b+6QK1j_z*ha> z7@?-4Ll$t|sTf=>N!uH1%D+D;5b166xmE&U2SN%&I5jP;1gUZY2ru07;{)}% z2kca(&$Vk&$$tWeq=CQuQ${r`i5t5OGl<&@yP|{g5G1RMSZAIrJ0R>a6XuCSiq8S7 zq0Pn^jSkLus8Hki^4(VDt#na!js$MEKFwCG+Ij8g{iSMb?%vAJ>0b7Vep^6V-xvAY z6tBmzEF#hrJPPUEm@4Cc2}04M+XXxnjcwrls=nDlEF;a$2}u%S%vK2D&cz@0-zEMF zze2tohlnL@md!Qcyet^cuBpKXe1T^xWl>R4>|i=e_S_rYD9NC&U!OxApeX{03=V)v z=%FtF6eR?K4};L+1XW1zghvYWNi8e*7unv2{%oi*_vqHzJz0aLcDCJxDIW%K}ePdTzbOU$I$@qGMa#`=Ru{Bw*pmnOD#I zt5x-9c+0fzb|NXW+y)l^4dD5%m6nv$XnlPQ@}FnxLK%6ZwnC>-{6iBXFCR5{EfVIn z%EYu%S7VymVF{kVYX5hi%C!Xs#BXzFxZ96xz1%cQ31#xe znB|M!x{`@D4W@qnY|i6RQdRW`%B6cD>dpAmoD<|`H%(!ho@0mM&f_zP%g~9ti(cG) z=iVPJ$P3ePxbi+1J5!JFTt@vH;?4CJhG6XRfk(98|0@UKH8@+Vm{U;!d!Rj{jSLNA z)6yaV%6X`0e0+S&g+eqv^9L~ta4eV7f(KNzWfhF-m`!u_ZDrjeZ3F@XK1lkj9k>ViY%`iU1p6=O z4b7~Jx_uco@)#9fTw>miuv(7J34kUU#dEXED@@Gt``7%~S2IornZiz^Q#S{*+ZV^1 znwz{{4uX5>{G93Qd%X1E0i^-=4iMYUSfpf*&iEU@BTG(FL}X;cy}3q}6u^Z~mTKiG zIGc5Tr+fM|Z}#k{udh#cBOvx?H^5nI-Vs0fae~QgJFdD2vsx*j4oY|8{AV2cq<=!$J&x)u3^?wH9Pi(b*t(e;&C3>dT1Jbo;Wj;>j%wpIRDRu@t^V0ivK zaK>8l6Ir2a%MYa40PWwQdUqI7EIW(ukinLWHWd zhjbK1U&^LI5cAY{!>QZ}u$eh0Y2k$BTQ#-EOic3V=wN%oVqzZp7SA8WcR252hdih^y5@>8z&N)G5EwMb)=P4?DiQ2Zbx{-9_5hdS-g8C(eMb05skLCgT zg%pMCcyRRl3KJd;{e!GBqlTG~opY9z6y+Np5vLaT_C}sZZ4!doQxuD$bNv499~R)l z3XacEa754lV%fHeY!^@kQe6bOFEIwiwjx*`*#tF7%BB)%efWAfgyqn%_Xp-@-7Z4v zI5WDRuon&2W0P6|xRWe*mpoqMdIYVc_npW%4rToW&)kENE>?IV9)~`4Z(w|&MasLP zqE?K>WEmlUCU2Oy%`eacLty@av=ghjtp5IwiL#3r> z{fTM^YS%^@{71F5zbTIEqVMaSpLLIU44=AUxXVVItOs=HqrOY+G!F8Tz1UH~yL0~P z^!@|=hNS-FMW>?Z3GxGx#B3&{jCgldoNlFnh0y#{G!UY3!qfOWEtcT1=gHxyp#5<` zmqQ4%$bly4`FQFBx@^D~ae==XVR>CVk4-|QCrXE4?CbvnN7$$@OXmQt57B?V8Q)q) zeBb0`$?^@O@BIjAOu+71*5OS*SHswo1*x|cRNO3SsL3&A^ZoW&${NA3VyiB- z6?ap9@Iw@`=<0DB*lziSv z$aAG?(Oy~kV+~sz2cR4Q6;P)olMY*%aks)KVY#Pl_+9zD)Du29>K=Q2-^1z2!figK zoe$GlHY(%&4%mOi#*sd<|meWZsr9q3A{{Nu`c%JZYYT=hh9{FvY4lgaQA}ogw(UJ-x%b4MBcuogO4dja6 z=;wP0<<8G!NgD708hO}Bbcg5}1A{1Ne=~=l1~-UoAL4Z2tjYFU-;Q+drEvvvkrErp z3izHYV`1|2Hd4ApV|0vcM1?LH(DqAy8Vos9Q&nBA?5FI`mI#Hyd}smJ+z-Znm7wIt zCnh#+sRYO3c8z%3w#!{@^;17t6mTRUrIu|bm=!4!lh0|eoeUNg5p+^~a^r1oD&g0# zQ&4NSC>$IbYP2?x0QrNdjv?TEczMEzCnUM=ZJD7 zWUN&QNBYdE(!^wB5fY);VnuIPMzbZPrD5t_zqU_#o&PFjhF-en{dc16m#tatdAxeg z&#bRs(INZ08xO$JOLvxvOi>%xESttPybQMq%Kt`;swgP%SBP6ue1?Gb>~1^7`Nr*)$Od3pJ7-@gX{x=vCO8KM>tAOr9WbZy&i$<1#2eW0Y|qJ04M z<#yPLJvnQ_veK{@y+orD{&-`gACyiMXDo3^Jlo2SCN}{kTHj0Y&!k-=et<9X(5t9% z;i){nAkYvnZ585FeIJ@15^bH=;y<_`3Y^kmsRQn>Gk-Hp6JVO9<>YqG>#ww~zgMwnkM~z8s!25) zOKJfbW4h6)AYS*~J2X-4Ka~$gfLYCDv=XnBNRj^VkbxbR%aMDuh0zqd-E_QGY%n!lnQLTJP)4{<3AfYN*4$>%v zYr3mXz>TR{Sz%5a!xW=jD*rJn0z=E1&tF1wybv$c3k-ZNJvKj=YDcWk9RJ**5lJdL zKTlcbzdJm2f&At4-Ca8o89X?)wo;_3J_N0}L_C9EslNaM0T30U#S`KnPO7R@G+!t) zCmG${T!KA7Ff=l{*0zASJ`%a_3?_VuTFV45YohD41N^I*M9k@l%$R1&6( zUKx$^9q4y&8@%gCkp5ts2w+k_W9pxjlx&JNyfOhC6deOVzadRV;72;!NvB9f170Kc}9%r{f+=Jsv9PCM!f|aP0!PdAybX}i{{}aHo%^}(VAm-uK{&__cXO|G*vm9 zXVuz!hx)+ms2qDiqh^lEN?6tyh@R}80C39nZx2?0Mx+ppBa-6&IMIdV74k~yOB2}e1MB%>zH=)(Z^;P>x6 zd^mEO9Xz0-uU`W}jDuaZw6v`6HbkSL4QfwEx!Ytr?3g0ziW>=#Pvb0cVb%J5SgY&Z zT{xPo{LT$es79Jb`&l<eCJx)k 
z@ET44rRF4V?PjW??a=HXuGbLW2bo?)>7zg4qd3Qw3(Q1+kFWod=`mQ zgM`Tk9+ypF*`xSA8X43KFj8Y+V(MLu5s3z~6Yu-$72+enjv0csm4Shw<9Dr61?TUt2v0cYIM}}_ViGSq z%Dq9MmQ#T1#%p-tj}LmThxv%Se66gaoLtXJZv>Eyyo3vmHR!VA4$9<3_AP-Gwwm5& zv!hz~YNAFYHr+IZtaG=Vm%`m&)KP|{?#{2L;_t**EW;U!glI;`LR-1wIB*Zbd(sKc%j3jy%!|P)p13US}m)A{FWAPQGrK~#IHj1N)Sb^ z>ZaR&`VR8k3z&&ZYoVzQ82%!|WNtQXPFU4TisJRI!`9o{o%;NUIF_6s%*!Z{)h3Nx z#&0BfLg8xI1kQNktLk*C7mw@y0uz_f=Ba0VeEbq%2QGS9e3!q<;;y(h^ulWwA@_X^ zwP8l8*9~;oCl}s) zn|Rmr>Tt}X?xSbVUwpvo`;7bBu6y=A!}C^!4$vwOLJi>lKh1+xigu>ZTY}vm-_1y+ zcOpP1>K&)*{-0ucK1#8R=&J~7?Cctfh9|=l^4Hx@vCTpE2NVY}iwHl?uBWLDA8TyM zRQ!3Re^<7>xReIcKL>VZX?GDLUFgTR)%}bIqi+dJqt!*TJVR__o8KvpvXkz%D#5VL=d!^on zzQLo(Ek(?yZ!8H%hb29_1P}|{Ou|K?c;4(z81uXr*=PozfQf{CppnxbpO8?-E84A> z55*|*N-S(@ITYPLANi2lLfE^CJK~mxOLghD9kk1L@YN|=!UD@hbjEDSBr{y>70mzJ zcSF|jba+Azq{7zcXLe~!^&(UKL_1|jD`ZkNR_!G^JX!}ac9`z>EpYNoXX%GVYi%0b zuijs;iWj~~gX|5U${}&};tAJgEcU)!NIZ;Z8QhGjm67t|QbOa{4ua_7;MnRTI`t17 z+3emRWZfdq5h{IcgR^xdX*8~dpU^z8WI8zX8m6EBo}hou45w9t|DvbXOZy>3cAIS< zSh&P>e|qc|iP0hPMpZf*GdM={KKeC%5>o8gm6>WTYaUXNSUc;HTdw7W zY&mDoix97`mjY(REB?3=h+DYwZoVJ93jvcqBB>b4-jhv<6arxk}MDl zf@@$@2@lQx%BsUu;43p`2kZrA9%G1IT2h8sHF!S%K{`5=mV%KQ5I^KCddSMiWB~-P ztPEE@V@;&-`0-==CI1VXs^+DaVOW4I_AN2e;Yp?n-jATzpl2aF1YH6h@}N;Bxo*OT;TIFN*MU`*+agPW6cP@9?q-%iZX;|B0Qs3%8i; zK*fTxva&ihYZ>fg+m_P0ucmxf!k$6a-z_UDmm6&B^rsF7%{SdO?r)3vFD_ZeVMrZSHi*H@NEHuFB67n0H@JS=K5&Pr#n~ z%j|!`Rn7tQPf85<3@j`U^3&Zc>9iI4hyW+jLw#%BX@p`3^5fHCM8Zj(bd6>yOF8IO z#k-5t1kRx^84SSJrb4PmAwH7<*nfckQu4eU* z$(zi;zWF$KhYeoOacYS{6S!cu5Fma_a{0oDk4_wQwtf5gpINlnTC3_*6VPG3RaNCX ziGy`5dQh;=!Ehd}`wul>s68;R3Io;L4lnj1u=;!|J3%&eG-7OPFzrgVGBqzt{yAp? z*EJYb+Auf$!L5x=RKFv>5>CZw%Eky`Y4&hwUv?CgWHrbGpYt0kusomV2?Lni^KJk; z@81UN0)2XRVc|FCA#+ZC8{I9%g|Zq=CZOed?(*dv-~K~Hgu_Y?T5}DUwwLI&Qv!96 z{qmPSTPeCY8ct5<++wy|&!+ym!HGf^{wI_d1F)1!#*5&PmDXxc?^!lD7H+v{sl5*g zls6EJ5c!F#bhCFAf%M8DK5Y>G8_?tZ`T7M4lb6*=%>tS> z5E{8z`rGF~blAGStMG5Glmf^Tfb{I==B$*A3_9o$(h|p~r=a-wc|nL5sLhkvHNf{3i>X3`&rsD!n~<~R_WbYe`G$)fwO#8e;mYbN zo7-3TOI}w_U=IR_$E$Xnr9TK>B z<|R=U-!sbK-~c8q1n5F(7P9fdfE-SL_f~e&s=gARgD;(xlO~!QL*KT#&NBj2OMN4F zeGt+2=|u@+exL9aXuo_blmhsd`NGQ&rOb`PO@3Mtx;s7s*h2U z21CrK)DI_1z+i-NOZ;1yg^&)sxAJ33#NrN0PkczpcPMy_)FCSjbWX(uKPz;^tqXC2 zBR%H~0}d7aVxwoa`a9tNP;|SVs$AiiMN9Zc-hGD|u1;DR;IflnpxAYK)=COC>?%|= zomG%IkIv z!e_Qihnl@EoGv9{&Gss=Jz}YDSF}a~E`yKCp;HrGSBoPE6A3CQH$O9%*1v>85EmAc z`*|9#^%S7iLaWWEnQ4Xr6P|-Ea!*j=RfiytWf^L7hnaN5UCNI=M>aztIj~6A|&mP#ZCrzTRIxGVv&0hi&B-frHy1Ya~HoTgu~=JeMGCYw#vNd>ZfnB%a>$=9JSrvYl@tXulk? 
zL-xj6dFMWaq@i~wWS_HI+BHS=VigV5t$_cJjd|S?m;6S>eE&va=Yu7L9JkMp|5u%x zBXWE_kk|rov^Y?-{V+ZDFBcO;VY$Yiq)6otl2Sqq))U(2bcZ^M^OE$fYvCcnUc}|P z3jgJ=&uJ(hw8v`2wE~HQYlS5x!Mf!@*!|~^afx=LqtX<0Mu1jn1>qgRWUGLGVyfXv zt~=_;nt!jni>E?uHd5&GGB~L3g8fFH4r}&lF3VgUnk*n$ir)UtdzJw7ZJwbMe%EnR z1RJ>LJ$BsPo=Z3Yf~%0Gdf`mtWWpaG*N+VHF_UU4X&Uu}{zkT#-rKS;K#dy*PJPt{ zmi3>K=zm?K+eKY}zL}!ITUkyYe)`<^$uRT&-$C8;%lcBec0`eeERIX zp${BWkq?(SCZ#3iPo!_0pjehMej4>YGiJ7K>(PoUQ1Jb?GOM|zsH>hP6|#U;jiiRn zaD#|=fsRf4#a$M|{U^=A`2wOXx0^4F4R;-DWaX_EOh&jda$G&CVWy8_fQXn?NIUw^ zH_SY?v82rWa8b!{0vUV~2&~9@&~D;!FueKe20ar<5c~X2_eSg&B_+kJWMCrgwJFI zVMPEx(VpB54b4&s`=2l7fLbnq46C>4dfXl46m2Dm(Klccdx?3^c;r%z#7AGYB7ld4eKRZVppWZS1Gb$o>RnZB>{*KceiD*?GuHH{MT`p~G(T$o?I z_dI>itP(`BhVAQt;9*uqtF{)iF;0iIodM7T@uA*R^}MK5MC1~A@`%tJ#ESYmlu4ju z{oNdyf}V+*8S01HTVl-cOcchx!r&iiXrmS_p%+np^ZSbMPew~f=1A)3zGF$+xmB~e ztU$kB+i1>#wVBkVMk7mxf{MXbkz40EVPJioa|R3_{%kZy|Cxn;+lhK2%uVc`RH_mf z9HiiTM2O0g>?3hm$Cv3P;dHw-a+HA+z?e|3;4JmKO$RynN_t%$IMeFuC*8_ zl3CWPxE?H!W@c^EU*|WuR2KKNhAI{V+7hu;nA8pqs-`m*1 zKwKj1-f)C4S1f>3J9xhgAtIXWDwPh5kdb1Q|C*)}WKM+*sR!H%OGm#bG-9$l2Nc!_yxYecjR2Q+v?jI@$FHi+sY0r}*1Z{b3PkZaREX z&1Gq>HwSeBG$mIw(>88R7kj8Q-S6SQF`Rfr;WAO&5*+8iFVC9fpN-8**srIc<#9=J z)@puqT0!?s|Jt*T^Hq7Ovgg}08@#WVCl%eK3u4T>?=as3e5C>u%kqFJUA%iQ9h6_X z2wRqLR{iGXe|b@heIdEjS8=zfhxWj1*}!bi;^23MjUt!LzOO1@+7gtd&K{`RaQGh8 zJJx9R`s0!Ojqsal`B2$p*NbOp8+u@;7<74!5Fpl(>pn+H=&nVkvE1btt)5UNvp6HQ%bbZF^xg6f@0K7BUt+ilTt{( ztxbRRaYvt&@&R>E1n_`sASj)Ez?YWxbQNvm!=3sM#NM$Ho!t)q9n@A@DAccd1bM?Qwz@4({>ept$U}6*#aCBRWuq(|C91h0q(4K zTT~s2ghAH>)HfT0`df!W>-DLEynKZlRoiy{_s3LAR1~;Uif}8I4YK1XdJf`Y>R`-} z4V1c_ii8m;HiQ5HBVPX;X&Ta-nhQZsCxKXLltMHTR=+(P=F1X7@!er{pD8%=a+u@8 zu%1`6z$c)2DGt|zaF{w&LD*uLGr?~sJ|RRZ-QH*^q{u{P38w>xu|#pIXyGC z%xrJI9C{T7V$0Cuw(sNu@Y)xXcZR~k2YRu{gWcN}EZE{<6rPXWseKC}(Da}j%v{V| zm}xw2Lv5w7(M!n_R8M));8;evpH>_yr+;v~i%E<@7e1suXNe?o2Y0dD4v%$uMx=}! 
zFn>%vEhG!WcS}{@=aNSyA^z{+Z6QG29D?v|9)2&tzfQsJ@gA1ew+Q+Ci$IEogTQH} ze-lU^8hdus7rnNyF-`Q|&taH{9{mwX`Y9y*1>Z??Ue37{6EUv9s z-$$z1*inrB0H@Ug512^(s*H0)n&3!Va8e0S)ln1zt`PLYlmm<&DuDRYcVl<6`X8m5 zJp?sjDRh4aBgyL8p)d@xpjfuEcFKhX>}@?=Ns|D1dGhdCx?p!!upj@aeoK3ppp9-iwEjKCy_Xp-MWKSj^ z1B2G_lZ>O3p>eD<968t)T48;k^w3`ZM^N$pUH>E zZeI`Fv#=xNJagux+#0VqO^!7MIrF5fL=@^ z6^)zOP-E{SY$Y9FP;jO&be;YmMEi52MJDeN*oS^c>rMoiz|Z2ZU-h|DL-XEkgBt!I zDGHe?OQsCjr;mJJ-eG_B70pYgT+FU2D=&T3B#2aRu99%5?XKZ_kKQc8Ax%S$5tj}j zKD_JxqXV1;Kqf>-W=;>)z%)12jbkc z#RdAKv|+pxfAo3ue!|L_odo;>1~7c5&-Oa;UhwtXyT}^>+P{Yh?|A&bCcOBf*0+>E z)Hl^K@00)bd8OsV^UZ2-R`UrR9ly`D^y}ARGBO{=o4C2T0cg#-Fr?LGXoDA_3b|d3 zaw2{;F)`)I(KZ=jCy57da?X>ywYh4{7VFVK6u>cX* z&GHY2i-npB+mY{l4oFmPFG|}g&uHh)$EgJ7KZolH;%x8xp|$#<-mw}94G+-k>%OI~ zB9i-iMuui*0cTtV*J{JNx_i$G*Cb`Pw0jQp-m-ic)QgI!-M?RMwD0sKy53yuFLoIs z#S?Oz`JH)1CZ}VbU6hkkUID>u;QhAIA@wSnwqg^G(OQ))ParA!!W|A?me&dL!>$mE zf_8B*Zs$2aj!%S-rdL0?5K#OqE;NLT`cX05cta4C8h2LCHp02C0Fs|oY6Gj`%O}$V zwJIbnWrR0x`vb!Vs8NpzY35=~LxGwJ0SQLb@OvxBsrHPv6_aOn)Vgxz{Jtd^-N&hZ zW5KWX?UkSJA1)21Qhl6zSc4<(q`K>C#+53sq`=G5mUCWk){!2aS|Ycw3hq?`me6{# zY2nM<@YDb{BWZ^ncB$6Y=8pO8h z!>>!Z?Lp?Z2dEe7ND#!WO8$!Og(k8vGyJ4avwx~#98t5|v*{G7@vjZRu=Xs)D}(m0$w7Iht|YeJ&$S1nTjyTyzS0vvV}IULp4wEB0KxJIHbY~pe{caR(RCZ} zB~zyhdVQdxec+r#r%0kH3MRjPGSM74@L?s0bH--HeOk+eI%|4Oz4ogShZSy$q`l(6 zO8U!{T0MhYNt;cydhn@FGWHdsK!?%zdiKJ=wWnL+RRaRoUziqEt#tLRST2_t|Ge?S z(^kGN|F2i;(vGq=qGo{~f7qz{=IO@q7;MqQBo0o_8=a&so^08K>S?m1u&tM#9A3sA z?w8in3{y`SS&pI0i@90eX1coh-4IWPii` zhn$9?$5&qEdTb^`;y$Jepi$_92`0l(BEvH0(b6UOnmwmq_$#3Xar$>9uSA}~-@+Ld zB92zqPG+tc;HcT1ZT8KOHk&;$7EXA;A|edNm3oC;TvvDD`knr8ftYaZ7Z5#D+Z?Hv zgVGL`8e2qn8V*PG$EY(}VzK&aq6U)OT1Y`XnGDZ5!UPHqM`0-}DV|M}RkCD7kO*!L zsSW+pdRQ&zEG{1XKMKiEJCLpCW1$_M1wD$=8b4aIs)%eNgxEER?xmvib+$F1&$ql;!O(i|zkVU#A0N=zJOUf`=Mv-z2QXBUTg!3FHx0<%CRx7n04C_Y z0%eAz^-5Nc$@1y}JY6UKa93~Y!v_|YGiSJxdu~}%+@_q>*tRak*jaB&cneSX)1R~k zqj$HJF(PQja0vK7#JS^;UoWNi-maw8C*g2Ol)8p09PUlC!at5{J6TA-rP1^wvT+p~ zvi}{g5ta0pP-_Rx?N`8I*y_V#hl%>&;WJvgx)e?iJFfrnG%FL7i>s;A$UFI-wRIO@ zKns0hC$$6IVgfM01Tu4aB{)v-QPeDNe9~n#a zcRgGe=1b56OmCp!PB^E=MNp~NQ6F%BFbG`qVo zhpnu*=@t*hD9U3ht7Vllo_rTp3~L+!<2JCQm`F|$%N;N;{(vTns&kWpR0E_)n$;V# zd$Z^E$>DyGXq+x%SObb;{|*OuCncU>4A7`i$VntzAH^JAa z!NKxF)cHzYHp@wE9t|z zbc;WxqD zC5}IqlE|MxfhjvUEOedv#{EJ}xc$4)Ds~8+PET6(fO+ z7v_=)s+`6{UHwGD{7S<#pO+@qxtMazhU#D$V zkjSrt!THgSmi~L|5<5FP3=@96lt%sNdn zHF3@I*FrDP&y|%~pfE&TThn|Td|5f3R4wc3;c+qFgb5M}#R#y#h@NSZVAZ9n_4c*^ zD%xQ-hj{xebZ(nsqd80@kx0fyIddB-?$hrRwO_PDwE1Hgq6m3z6YGa`T(>=_2U{}f zDm*+qz#KYiqWq3IB@+22^Z3g5jF8WtKa2Avbkw^&$ccGa7e6~|H`(mR>Ehx7-YCF& zntdeD>rlPyZ*k|&yP6uN83cb1z@zDu=JhXie?(_Gp6;-AppkRYvmRd_Usm?-jUW5P zNMoI=S)^~@L_v%NSBq|5?rRL~K!&W}C$PeNvCa3eAU{7^(r6DT!Y~~9CfqW87Lz*H z)fKhZ@H;yb9*>vKA_I^pl$r5)UY={9#`3|OB+BPsMrp8Qu=e8(51(KG=S!=}`oqG4 zBuI+&_4Nn^h49Eorkf@vaM9%AKcvNTzbGAHWimQDIv7eyN??ilrqa9!Rn;p$em5p< zZYmxe9HiWo;!4y{{c=AyNfqc^2*4;KBPk}O3Mwi%2gNu;Nk*_Gh(uP1|N0CAxR|+w zS{Y(aepts!uI}nVu)nRScwvE$EAq_=o^E`svyjOULRR<4DKSiaD8K?RGT6YC-y!Xj zlfoPv9P2=8Vs3S+84?DNa0ULKz11HxYhx~D-?Vpk=NNOXw~Lz9_wcARWJMy8%*I1G zpEmbfr~)Fv!NGfgNC**@dMhX(&<mnK_FJ|$&k1r#bCFadFH+dDhj!2hSr1zx?U!X~-Rc0iiS@rhzKF9Q{M=sPks zHPt;Z0Q1Heh{MpY{|qP4aJq2#vbl;dU&pJE+ViZ@;3ff7A6v2hFaz1&$u3!N^;&=anStgBhqe%_SU~`l1-r5+fT3U>SQ_sGiwbY z?`Hl>&-BsmDu#tKc@7#yG=Zmvl{g!|{&Vg)u(bA5>EiC8Rpx~liN7PR%E{GDVuQyA zEGEj9m_LHV6XSXL4bTIqZwt+{MT{b+2dAf#v$HRpZZE;af`I7`TNsEOKrs!NkrcSr z8Vhv1;D!B6D(o`b&Q@aT0a0~zzrQJu_eM=2E49CMLbNge2A*?vKATgbmG&xAB)Ih; z>9A#IcQ{wzUtoGT3@{mYJg?i;JML zrAGG)Nyjsi5Ph{Ch5eVM`e=nSo}YdgoChfyus%W6VvlipHpeWo$#)tJd+4v5WXT{r zbb%utonXamU6~B--O1wS{*q7S41kLh{#ObA-@*>^ 
d|JHoO78AQ)9l>NV2~N`sUr-MO#%z5D(4 z{`P+NfBQM$@R?!ezOPuFP zgO%~oVm^JL_aiAYGd4)JFl1UVAgb3X*75Be&n{cYNLXE55w$2*KU@--E94XE+>6s~ z>wIH_vEpf0rnNXuSY9FD&nbkktOi?BDwhgIA{yGcRU8UQs%EreI^5Y@mz{X;E`fT~ zCbUz_GcJru7Y^HCOs>L|LZf5kei00lL32e%Mg#ArY5xEAg6!kFcNo!f^ie_Y-@mU8 zy!R!=LNO(&U3I&-t9>*5#FQiji&jwZ)s@7vXV0W_HNO-U4Gb32i$~nFKp3lsw%kf) z?acMdX7lp$5Rj3PSy))QNKE}h-b@!Yzx|7diet9*J;nQUOz?P!A+f1Gt9?1(a#>bH z;o}|ohB_o56re0T)ORefWmWM(D7Gzwx}09 zA0Gi2MbfYpXHFQ8z(;z7-PSBd?sDT+Vjmx$=(sq-x%&;0PAL-;>J{Jp3cANo>7_6e zbG*idjsQ^sr={qkq9+cE4US5KdCz&n4CUpK$FqdPmX?fK9Y@l*Xig?bTxl?18v+wz zr&tiP)s6sx{dxlq-EzHOA$!b4Q{l>x7*M*v!o0CSiz>i+DTcIY`$IMJNN}Gqf>wr|ho$>5+ z9?M9Vj`RI_+#B+Wt(1{18GSZZ)@qA>3`S;V%-h$mUrRH7d#qZWnL$cc3J0HS)voO5 z_$=ym5U#}7W9Zzzb<{+kjON^V-%#xYpPsU^jK6=|@&mYSNlD3<7FbX>I$1bZ$5klb zz}5E@w%cXro3lM5Mtt-U^7u%;fhHx!jnkcpo&EW$9esWMES`ZaH(_TxyNxV&69a?q z?(XiwmT^_4ou3aE8z=TGg&Gdw)(xla}FkJQqCdo)8}$1?8t-Me>zRa>!K z9cwRMyy)3^RcSN~?r#$Z|K`!}-&W?P^#+co;@|8X90)reDnFK0RoT3JNdP`T#HK6M zw!QK?=H0t*Y_b}0JRR;k z^x&Tv@ZtS?%E}xbOB9kAxg|4hwEc3yWn9U-N$XW$&~!*Qme>)WOqx0@yPwkX!A7(=EfPd0X^@+a-;qZ1PcZ_f9R zyAW~BR@!_7ZuJfAo(Txxc&oljn3|cfIyFG5o~hs-;IBSq}9fEUTnYyJDM&L&@y>~2EoV2Ps+%M$yZFwYi|Bor1ev| zY!-fRxEWIiUYVP>t3K)N2)O089KdRwz)EG?ayvWsKbul?-5w>Q6!#^}$jImkggE>W zVP$27=mjR+@H+nD2Y;B^vZp_gg~}{}3nJ`xKvgpZ)SC7Lo;_PXosd+Nmyd8cCB=G` zP-D^mPP(z@ldyY{;rP#awTY=?KR!|{mZwicj5xiTvU#itgv9loTfmTUsw`QIo~GsR z8-v4ia&l7bb!b8-g+nO>X*O~q5SYu!0j4c6QlQD=&=YrsgN)`{78!}cJrIsD2^RnN z*4Dan&~2t>t{9lgC^%u0M7ak-3PF_1SKtOXmJ z#@5@<|H=kNh>_)<`EK{~{I*>)%KN0G;JxYM{4ZZ1C}B{^o2kUV7a71VUE%{uOG-$! z+wqSPa_S%Hp@mi3HFMh;0nb+JF38}Pxmsh1Y6`3X?rwn7lnIIka~*FkcQ$F~ce{LJSbEC?EUcCun)_oHOloX=zC! zv72-g_)&|_9l8~1s0>QKuO)GeoM#Xr2C`{qV0$lx3~)>nV`D@j^66E>M|?MRSqM3)dNUyUH5L)Q53upi(=AK4H@5~kX=#JtGseMsjpu(`Ye3M={5SVy zH{f)8>4zW#8(Sy{CT@Lp(ZYSJQw$>hSuH(8`mUUH7#KKdHy-};;# zP)y$nC5RZftx$bQ=^H{es(?ygU-{?<;ky)fd1bmG9W)uVvxmMztmshXFq(3NE;zHAQi z@ewvKG^9&WhC<%Fd9%XGN*6eigTVOoDf-Stj-ri?P3J9G0s^~w^2J8yMtv^)m>X=G z4dX!Lm=61SPAeN51`ZBv@Yn0>>wp780|VA@RSS!}msGQ8WC#BDet906-#>jKcUWq2 z?Yj5+yFjNsW2le%w&3W9CXj-*-OxO`-@=xkeD~Zp>Xw9vh)z_L0`f(h^|)B_et7dW zBO{D^V2O9|yXMTy41796V`Ee^E`Pyc1;IJ{2V=2TerJv%`Re=8$p{zbV*-de`@2g1 z6V5qOaSHj%A~A9sF73Ky5yMuY%7rq@${48}hLIeG4c__b;V}HVI{_&Wa>1fG&XL6{ zC^6!<2Bau6^;y&$Spu%-((_bCp~o$O=nt}lWj!GJ*$5q{r zoU&I{dF}1X|K`^x2r=X}b=7}(pr)C%9aL9y$5DySsMGT+yw0X~zG6@FXWfT6MEHWMh>CGvc^z zmSO^Bca|{7mk!%baUf>L8~xGI(cq)9#>P}n z)U#~8mM&NPCaiywAv}76njrKF^FjaQz_wNOEE~6$G z7)*};2|6){K{$hAf*C-g5cZU9aQDMtL1+6lwEmP~2!w-!L&)!iW9++^k^WjJgi4?+hC z2ICdGSg_Pi_*&iz+NEEF(TBHAokJh4XBa2@>4ycS$cGKn!vX zxN#(Z_+U95`rROTjI`D-px*n~LQY=ZeD8pZlkYj}uf@gNB@pr=m&3Jy1$|)F7}&4s zOo)q%3%pbV2g}Zm)gPT%nf3{0+Uq8*b7m@P(_lv0wS1?0$Xkub;dn}XcfP#uD{*{RjGUNt}Y(p%pJJ@lt zfsT_C2jKW~4_7ogV{b*&tK-hF&Qp=eUrX)M>`=od{5(|Z3vV(IIRMHU`=-YpsgNIH zQWd3D`u2(p17~O0ir1*kyJq#DcoZJk9pW8IA$N@J#s`dDE-z;^Y~;*Vn8Ug<_j4q@ zk{##6Kq^|T(i!V)ZPtH+Qr|~L=#rKG&)_C;$q=3-)%R?*%YTh%ZE0R$|EIBgSE@LU zfWWV^t>Z-q4|%-jJ|lii?>QtXDT(&2#0TX9k z%8dkJrMi@9eBWw>(kClT{`zHCJ8sJ_L!T_P)M+#Ex=yl-94g|VPYdE9&hzuoi+FEUQ&F7{4k!suz05gF;|y0<-2rflWjzO94N&WYfI`Rwl6vKuz2E;P2aQG&>+U8;_T&a&iy zB6-hql{+6CNz)=2M~W^G^mGAnq)Xv?B4ErJ5ef1+@a7F62?-ooSsBg~NgdJ0~g4u+8ozhSW3)g5BY zqmq9wh%%%69)9L16xbQH`3yT!(p0ch8}l^eRmdMM2SL>g=+wzGGlHAXuZkt5?=fOw zGLd{MZ?W%=0*?q3p-u-Y+&s`cqDKsYH(w7i1eFMB0sAC>*MfdWgW4@AgtXK))h^A0 z;;^@AR(pVmnM9oPpHCpS)e^LoBei);PA-YR=IiMWG<4{Q5!TckLNu`$ zzRNmvl0a$FGA42AVrXgQW;nX&VEm7d9_LN7%l9+Z3dDdp};#K z&%>2?co+hd!9tB?RJGH@XL0Y17-ooGsO@x-nqHkPo`AzbaL~hP&O*DNAlyBJXj@QF zV0QjzcA?3Y0Yr4dzI5v7U$vHZ;3oW=fJMM-&i-|b;1 z*T=)5WCr+)jsDm--;Fki+P3{vM!p_?brc*`irQTdg1$_+RO?;}N9aTR9QY{q>6Zw~ 
zifeRUk8E-$ZJ^TNT2?*?V%A-K|CKT478WAo;o*fzsB3F~Cxpnw9XnZI-Gw+F58 z_<7V!V&V-2VFdreXE!$rMnV$R)+VO?>Xqg9k57BbBjAx0fL^fbu775pW}c2%@%~-y zG}TLpkC%6H;{5*ods1>TbX*wxkITvL=jJU&GGY;?6{wO|*!MtkJU)gVc$4=$3_R0bl9$J*fo z_T56OfNY|aFm;~d19G~6pPt&FDs%(-VtwWMTg#?c91n zB|ZoTK`;u6Sa313sXTelThW$CfN1g>8gzwBjI!+i?V*#QIXk0DdLrE|3qoHe^4t|9 zo5j)~w_ZwDVO1SC_8z{w>(Vr3Lk|5c==BQFUXU&j=$yRV8$0l8McjU)fyhPsVYnJV zHvr{&tsfOoEfK)ES?{<#pl}mL-@_El?3TPmx?8jFdKPdQS&e7*$DtaP21;-#E@1j4 zUsumc)Aw?tUbjuOY{#aY=qFgTbRpNgqvE0>6u2oPH-k_QIH&)B#ieEfd3kw(iavo zIVo<}iAb3Z8xN$UvU0#OQ9LpNlLRa$F#wU$AnH^5?&eCHGSjuYe!y#TgSI{7Z5@t~ z1mDDV;fj2@oS32VGT7cuPhZRIxq%=nB~gqTA%O4-w?052!3|bEmU#VmkY@0!PT7sP zy6L`eWe_-@@I91#I8Fq}ITWB+b#EE71+5sQIZmKnSXHthn*DMx9E(}q9UV&=0Vh=(h$H6Ia_37v*YLP zU}MR%>>ALhLsASM&EKNIfu};ykjF2zSPI_9ZH0jwU&lR3&mN-Xx@?)aC5YKY2Jm5< ztq2TDLDpB8{DicOU;Yb@OwISxQuhT-hIC#&xf;%wvaxv-jS5HjdTd;k-C`mhpM%!_ z@8#rz-QUOBzBlj~(%Dxv7uFfnPmS!`^cCXLObUl+gl}UNTS&>Ek-1qQvJux2PU<;EtK`LT2=Y4_uSFkb1zb zkkHIeGYbryAUUWE+HuT77{ekyH6dl-*#Ct4P@UXfX=(;%Wo1eK+s7IgCd7y^Ox&{v z^t$yaZKezk1IR(i3%dDOgd%Y}agvN%J<&JGGE|@Z-r2D{-5$FSEc*5hzgRuH=SQK+ zGBs^NBo^-5Zm~LfZj(l?7cN+mfV(iLn6uVrCzj2hk^*!}skX>n+^`jbC6?CHdl>!( z*VU}=2zx-W1d^}Bd4x*Ww^R^J^VZNb0eEWe_#AFfaiMUGMu+%ijr0K^ zpE!kEbxHh~zIER&-ls2oeOroS<H9WwlF#Dt`?PAB^$&^XDua9QFG1zkdBWI@bYi70?y{gFqm@ z?bKUaTT0KLOUuX{jp8UobDdF8>Z^2Zb5LKrY(-(~#7+tiN6MVn0e6|2nvx0D+gAb{ z!jB(693)zuS0xGx3x%;VzkTD)EY*Zc6>6D}eR>8eH&`u2ndau^y_IiH%qBf>jcx9k z(pPRUD}&B(yOx%}zI$GPW_ricLt}uGoO4ErJ8;-2xXpgNPlj%9XQ!#7gIuVE9R47^!*C4UmKV4{(4agoknhk!5>po^8i!??ii;L_+om%g#m8rK!k~|PvvUL;%H)o9dq3x`m=XrPcN?P zc&=kxh@}A{C&ifebxrp?wAEq=s%2yp7lw)}!V6Ljn`N`o-(!r82B|wq@ zJL@$#WT^sDT&9bveOD*Rk2chjJL)=_H zPw!1432760@^{Rl3JVLts>2zOk56Bs+_8g+9yLB9_&=t+mX<)ttA9j)f^qDa8>*e% ztyOh=d~I#S=ds}U>Ul4>ARYcC^AIstfg-|wz=;X9w2$Ak`!sq;y+&b+hyWvy-GByUqCs7;SDO3Kx} zghY1mF4;mZdPorY-^Lts65!rtO0_XKssLvP(ff3C0BUx|#q{CcXq`@>K9>S(kz0Z$ zsRAT_ACM(DXO9Mi2L)d(x_}YC_88+{smrkwe$IpMPGFTzKoe8);$ix6(UA!+w!epA z25<$m#If1rQSV?IhQPRAnIin*GSKbo3){MzVXjgLl@1tD^PibLYv`B_L@S(H&u!wO zgI_8-o_sG1A7|7N`aV59x;F|}mNjJqEGY0IP_t5PNnlXeW$Mv6n}z*rrf=i5sW?M+ zW7NzT?oS7ZN(`JmH>`{?6W(B7{jtkjzDTCSs_|Xvzi_HYRC~K+85qk zn8?O)LrSMCM%9w&94(M~di@Qn*0CCqqk)OCStpge{ckNmRU1OKy3Mqlpr9ZLfI9rt z#B_n*f9N0en`9%Gc8_SDQEL6Zy0#{Ddi(1i0f8{WDgJBt5?J<`OR#KBHn$5Dc|9hq zd=STzxkEkN4ff&kAby)h(-KK|T{4(NFD{>e)!lxa>Ko74KNMVLO0E?QiZ!&%%!7Ny zn81^P+@%~~Y?fl?a4fu_eXcIyKkPz;h}ci@SuN6u82azglhY+k-f2o!&ASXFAu#gr z;7Q*8oJjig=^eb-wX)p~hs=Pu1^?)^e+nF->q{HWNeM{YA7k7rLlB#Mr zV4Ah<$k4xmBm%?;P^Jev0jLzc&Oq_m7}$pFog3Aasqb52|CvWoOt*2@((5OmoZot` zE{vs8_+5@IFKre%xBtxSmZ{9G|K~d|P^$(*+xKN}6-iU5voW*rCiX9zN01#FS3quRn2WFZEhIBU5YDr$i9 zQpNpp%#=~41TBy(Dxn!Mk?Ut!B}zJr0pBxM-(_D|veM^y!@BTZM}p%+hP$?rz{bC- z%5|0Ye;>L_G$XHvUkr5^ht$09`kPC|zC=w6_8Anexyj?fR@*u{7QXO7Kc;(_E9+{m zf!Kk5Ws_7o9DdT^h%Q^sDke83|g@M64oidci6`iu#U+l5} ztHh;2%+-GOe$3bIvtlTjGhcUG@!)7^?M~?O<(K*KppIh^_~L@d<5=|iHN;x&qs7mH zsipvjrupx(V!e(~3*L;RtgOE-hHh@WD?U3^#RKfRVrlzRQTaGDRFJYJ#I<3C80t=%LMpsx~b)~GF7M$#j|+7XG1VoGdIjfZAi zCw%Zo%~jyXZjg;vQj9*DtZ-v*(@7|Cu{#r9YWi2W`R(?zxDIr_a*K42n%34jx30*=3a-^s)9*RC38*PI{a^%~w1 zQ|=_Y8y|9SE;~~k_lX+ZUG%-qJknH_IdmI$wEus{GQk#!=y6O6wcSD_0yz>n!y;Sqd zGjj=z<0jpUI_I_>PK$YeQfu(bKS@9@nG-L{e7N8&H|!nS?9Z()Emr^z0k~o~Yi`Rw^!+*>oE~$Hyt3C(1kz^E*YSFSg^!c< z1fGX@HwWQ9<2CA79YpM$|~!Y`l^(<(AJa7Nh2OV z3F7_0<6ly$b~g-$yQrT(f6hFH!f@BlN#JB@;2+Y-sr{!u-?5^p{YjfV-Yo{bD06O( zu~FF%r~Zw8l6>_n0(Ebr0rG-1C0)ehYb%Z-8nkO70QpoB2f;=X*|)YfIL}EMk%!9lYc+x1W8NVZwn6D6fM_Eb zaZTfl_{rHpFaZMt{buiYsM&gTEJX!qDtc!tkJ4EvLU#C=>EHK+ovk%Mz^-*C z2Hjc0iO#1_pWbT*hV+ zOsOiuD($IH(32UenfQob> 
zH?S)NMUPEsOr+HEnZyX!a~HlQ$iGDz`zejG-2egG5TebByh8~h-hl`^9W>VTYuL35 zolDV@XYYb-^|ea3wTrfOgg2o(x1l^7P^9YWWQ#|NrakQz)qUlDaCsU6KPDjD41kljq(d!pM|^0D`uH31$m)K~QNt46W!zi?@IL1 zUi6m6GL={-mIjf6;$JJmMSkz-@ghOm(NCtuqjY{BisE9~IxC`uE4@`hCz!|>W>i)ozYW#iz)HF0T6r)v|N{J?oQK~wTb~$sK*I7ll zvQW>`LNWzWB&1HLU>tswf9Hm&x%M?ZJ_$c)1|kpIXF?&l^pU13f>5u7(;Q-RD{oS_ zV1OBb0>|rP1gvq5iYnD)ZKvZ;;3r$2g};1eR&iLUdCbA`8W5FMnshO7+U+_X_XNr8 z%0b0vb4L(?v5ero%|Z!IGO~n7N?IT#92y;ctEi7g#``c!mTvk;v-UkA#0v)s#{^@~lbn4lp=c zgeR&fMDle$tmaSex!jz+0h0qFBj8-3`x7#0bU|7MVkyK0ur|Mc|DG(Xr|qh01V!a8 z-b8Q3MD;J|jWTtAU1=YPt6gq$GBhI)97wgF%s*R}(l_Q)pJ_n*ARR+`q1VyK(MSsEDC^Z&&9t*C2p#hK2SzP5+NQ1*(CP19}`mdodNJ| z;a)4!i=9>OD6Mbo-kF9RY=| zjXv~D51^E?pDUNPvSJK%blCukD4--aCCS$ioCu7ZlY9)~HWlM4w*-9Os|V6`%+#=m zW>3WP;{D1u-0S&=RqG4QG@8?s+b%gXu6XHwBcw}d1zmcj`|e&>(;j{=0ktgnDfbp= zz?hhF>*{8#jqUC2JF5!6e*Fr=!<#zX?II&18yXsNX{u;$&WLosBv@L&I4ujRwoe^_ z7mzQwnm9jf`l`f^3|^jlm~t<5#k3G-l{hM7hcRP+*+17lWRA~N;O7?##Gt`4HEpay z+Vr@+cRBd{@&5@spHIAo$aurE8gcRV3G2?jAmC}XAFnwyji=gqj|HNsjZsrwVh=F{ zk@q z-m<>8=`L1%FKm;#Wclzfr1CpP!?oeudoKl~zY zJ5@uhi@(sIXiVI55`U(<(LQ<9c-YtorjB~f#*XiQs{0ck{k<8U$_Nl0~4mmjvGk}3F_DR){BfRnN#Jwq>TJ1jp!+P-3MMbXjQKun{?JCq6Q zi3k?QgQNugRX2h`J1kp={XSb)+xZ;sIN~UaI3dD~YFJu=A4oq}t{dXGiyap~ z2>n!IZ!u1|-C*G1{b=$sc})r`8r9z=@z2BeZOhu?CPTnL)OTRJJ%{vsR@-=){rX>)R)po7@nu zex;f#0bwm|3tu()P@n)mCVSW-I|^!Z)JH>)uwCZLqJDn)sg#NduzW)E;r2xdA(D_U z6}@r41_Wj$Z;jC_g8qZeqfC%E|*u>qlXwh}e!)SKTuS_C+{G z`vx5=tkbQdM0HyL`WcLk8QpD484k;ef9gAndbC75X*xOFL2`}#&lH3O%} zgm-NRkYXFMjsy=60R)wG3oA?VSn0yj?@&MFb33*nRUGa>el|7`6A8JG-0ff4Zpzy_HZ(?Oe1RV~?9MqZ3On#Wa%sHf{$o0&AfjxG;B zx#U$M|7)if{GBh8ltLK+m>uamTLT{1-i2Qc_;I;t|!1mI?h zuHzw}4jrOc_3x3=-LjTDR*f8cD?hvn-a5Bg+g4mGRN z&${tRf#e1N!Yk4&%gf?dn#<3WaviMD0GdSHQzCx?v|HeeW%Vqo0>T6NF8}hMpu$ks zK?e!@zrc~WP{g)S-`X2;i5USz2w*YA{DQEwwf?M1D*-J&UD&{*K92j8$(KEY%kOv$ zT?bi?=<%WlNSI_x)4hYwPzZG%XxShAZiJQup0wp?z+~>Qk*qEi`aS0P%>R~o|=BakjDUOGTwWC}WaHaP=M?5zkDP_vlxW}skw zGV1|^?6nKt(Q?}Ie#=7#u{d=5xWSc`KF7n zr67}Vq%rN=s`L@tuwtm;v20UR{J|9Now|w=ty@;FPaNA1xD&YOkoBKwHmHq4As2_M zhbbq>2$k0Z_>`tO|J-TZ@0q?l;CuH!bx{Aj9aQA6hhA`||DC`xW4AOBkN;a84B zuPhLpttWQ<;Omy53M$iALeqPT1%6y!MUL!L0zSp^8H#_2W8{jhWY+LiG#=pbuzoyI zX!6IAi4Fo#@A-d|c5`!cbe$cnY;3(ipW`rBs?7>gH4fHFAgSVn9RBXpTYE)I;H3(;>zO<>$r(i!y+dYUg)UdpjkH~(}Cvm z`>*x0nw6qSQy!?e&UZnl;I>Li)jr0r`N{F9s5p?K>l6BZN(~PUNhw5buGVE{W=^df zm~l%<4jN){AKp7&zSC*8ZM~JKtFODb&*j93K)}Yv4wHBVBA!D#9!-w(Kxqr{(rz8L z5sJ#`l1~ip=(}XkG1`TMfq?<(oL-8o_Jj4%N>T6dhzN6_5~3E5T17#Fh~1w}vjD;j zP=N~h-7hu7nH)sl?!rL1Bsn`98&dN1tBjgjd^HCX6Ea{LB%U@l9^d-y-5^pU@#0 zIde~y`|?IzQKdI2^-CAkJI#WUK3LRG-D)@zJw6>13hW>|uedNJd7h5u4j={Bi%3Jo z#FMgD+KA`_t4h$812gDPEfw*V0vb9JAuY80 zvaF12)7oOu96L^M%=7kZf%#B$yp&xX(oUuEU6&@#LT5zS*Cd1c_Vk@G4J6!%6cx%#h^J}t$- zeh6YDL3*kh^|xubI2wFB&JBhfuWr5;X8hH2WjQ&BmoC7Z?n#Ix<=IWu2mK|`jU8~} zO|HqVNSH&o*n%%ImnjPISaL2c34_eh6E*RwUB{8qdct5bS@O^g03cDXM61GrjszyB z%sXYnv0hO>BAbXT(wE;1V-YzOzA;CqGXWQk0!o+k%X{LI>v2z6E5e1dpJvn3uHwfy zf7zR(8*o%!E`n*-tfjKhd`yMmm$OLuhYQ|8W=xR=G>~G+Im!j$MIRTzN$YRrK)MMy z$pvFFsT0sAjV83?ef^tghMavJkBuTEXFMzL`UtMr^Xu?==nYAVYXSX_wxvPic<>|^U_h!jl@P;$FOJQMa=RjngZJ%7ine9(> zx;}toxYh=A(}N!EyCyB=exZ>q)css*QAusV*)I=fPz(QFS2G_H>m3$UAn6Yy;uyrFv+J?k0-X=0~N#g{NZ}gt{@BDFbAyWkp}2H%t{;F zjJ|nF+38O}MZyuH^_53TfEOv_Iq#C!qJw{_bFZX&HVNxcg$yY-vjo11(<|B+>;Dx0zp|1h_?_(1Ul}2a13K`^vl;8E_EpG8o0DM%X z8SIUAxC;ToPt73o*WXK1k9O(F=z0)`U!<24ZUW5+yh=G`4_SV0*?TPIM)<>$p~d=~ z0SHBa-B{nDQe7z?hUDDAYD@XkaW`YaND}6qHQ#6^tZgQR z4aKMV|3vQOBcd$8nbb>|@*HFa;%k9E!#41M0Cf0s0py{)NOymWxk=gAW@>_cMS%1TD_Gnc&lqOGc>Ohlo1#rC6n7qT68U}z$u(pq&A8zeCB5(#JdpU< 
zH`e`ZMV$QxP=f+1qM!fpqS5tYAV+_=iXcDwgOM&NnvljU5&i!74J0^*3WYyh*-VaQ zGJkOZP9=8OP@WS5Er2L!hFW{J+IzRbCv~fK7jWl5Ck9!QDN(&@jE3?Js3hT$#I#x# zz@nz^D8rN1qY$hhZDUA*7yM=p5qtNtEIMO7!!=K@BzScfn58A~pT!H(T6BR@pROu( zyzQr-i7?*L7r|#??g<)~?V*ASOOdcsG$U0X{36Xo^vqN>Ewiz=Om8a$@rda|>9aZt z!jsn4=DkG!-%W$KKC9h!64yL|Tq=#2*pTQL%8*+E`r{+2)S^b#r{b=l?=998bqGix zs{?L5A;H5|GJ$~-0?sR9zXNVx-l-uz803a!nMP4h37F=-pil7Ic&+C=eEF-_+5}O9v-BXmnOWAmk@2 zF@luRDdo5uq+$LVO z|LMy3-{p4I>b)5O|BE@ok3mWlJ3$Mq^~R|7en+89qc@O1wZBh)g=FmIfK#B!g zmJttk={o~TbKryw#(&oVkt=lBW?ZrAoHgM5&&Wva_37^q8XFajDc?|<1>`J7z^-XbAD0IkOlx(;$d2jP_M(acK?^hF#hk#Nw754tvSn-0ye z3Og8+Kxy8Tq?>sc%%s|+vG~v)2o%xL(Up{xAP@j#e>(U9xfE(U7ii17JY9UH2Lj0* z#eY4#BK+=VzVo5J3Ui$cv^1fvydllo$Kiat+};0SeoSb`e{C85j0;21y#tepR)MQI z;T?J40`M0@)dkagnxGbZa^ePhCwi@TvH!S(Cr^Z}q``u!Uyr0hFI~?cd!_DL7;;;$ zeIsQ}UAGMY7pD30B_c$~>wpqy2C_cA>^E)V?bZVl0`+W>h(eV#l>SR{2|G@p7uO3ZDJX8*N#1NAHV+e%)Uvzc>k17AwT(* zuI>rw!uqy)f0xZC*$DcR?|>{S%YEEKgvzwmI(~3u=jphSm$`ZF*vVnwrD$Tx2nBfV z#BjWVWzM4U^&_{Q^eIMJr#@!Xx2jI}Z3ED$3DO5H_Tqv9pe;pR@@VDlUKswsoc#X% z#$KM!$6Cooz}WZbxq)hV(9Lv?n2Cah%SFq`$cSUZImkAh#Dv#PG?AkvM_qOu$Bk6} z?~E&2(<{Akb#A~_M}T$?4U&<~ta~z4_IVTV1}K)d3VR?#)IOSmCmS4{<~&k@`vCMg z0A#T3FI01m`@;QnrC*|M?33XH^#8PvNlYYpFb13Gu|MbY)s}j56v=Y#d!JOr>F#jG zVf}yi(1vQ2%4uog1MT+F4;3z-{1T_FzZ2fp)=&QI@4qlH!GS;@e+$FB)S7mbCdERo zEFxut*8bE30@4DslmOcQ%<5Q-{^`?UCu38BI2~PGnc``{kP`lvWRQi6i-a5>3E!q; z9xR7Njo#HK&{GU(`7Ftx6$*I|Hk(SYtrmwX?eOP$nE3870kd8Q;y{`?j>D|$$F=e= z=rEg(=+Aedewl3ZT3{XEU65F~UCqNo|D_s8VOoN(%N*9D(gFfVHF~cLY2G}`}at=B1^6u|r zvH=|!5YQn)aA+hblg^FD{i}3XiL_Z$svUsvQ>)a$#2f{~rmEFT5A@7N(fvMzTfUJI zR@Vt1+?XBq@A+0%AER>7zIGG-mvokC)F3@2h3PJ7YiDP1)Ma6{BgDFTzG8Z>+PoJf zb3&OB|L%Gs=t90G1~coLY<~Y7{s%NQdDhbdWElu6xtxS_27 zIj_YZi#-RHw|Kj%EpxzG7M z&$+G(cMS)y*RZ!g{EuO#mn{m5m#_k(2nUnhU+fl#7KM!A@QX5^ z-`gR{tOfJeVz9}4ey{Z}?T}V6Q|5fY|J+34WCeG*`u%K$O&LEPKYmDDxD&NPX6l@^ z9`*~*;};=R9|PbtE;gkP&VLpb7M7lt=7we3l#ya$4rht|%rMUt9`Dm+^S!n-7mWo9 zp!CEb^?uEk2C9 zRHM5j)n1svV1Uo%?IJ9pqdUy=Sy2&=8wT+)mdau^HZ`5#AT#Z0d{g`0wwz09S@}!5 z5pw+-aXl&;7n~t5@oq9SvdF5IduA^-Dv>$ot!NrsIHBlJY(ia{ibR{_M8Bxg0F4lg zhJ-SN*53;}El-x$_Ntpj1ebq=ThL8n_ID^o#=>G}{Yw&=FhMg@-Te5Q1t8ZVMt|=O zq>oKZ_!__D*XIbd)_ew3Pe?4TB-URyd5$E~*FcJnPK`vTacjMqIuftkHUGMYdFzX~ zQGgmL4r)wM#>PNiSGwj6F0xGiXd==x&D3S%O4a=y);g-!-4p>~4K9vv?PS6Gy57AL zHKIy^ICS&? 
z9(2%@I04{TBr+L@ln7fm4TYdG`vb{gVF|#*d^%_}QP(zRBcK zjd$I>PR~8RIDXO2yxz^fS^xI5sjNkIUjXXP(=y&=)i_7>_;7~g?2L0=sd|GSFT7B5 zg@X0hlMu)a=nV5*L#I=z)CBMO+|K!eB=r=K-&e;85%|;hE2`p-gi=j=ovmq{I=J}4 zo&5d|tZawf_xnK!;&Bibc})M!%C`5HmzV3s+qawd2eNHzHML0}Cc;%RkEs~v08>wL|J&bS zfGfZYKLZkM1=q8QzGAcXexacJos48}aZ-6*?{Ko^3(zTt!W7d!+m@|w?H$zb(P*@X z6YE!`i=H{x5y6!ZaMRllG8?KZHI)O2eg;r`#nr#>85kJE!u$RRXBe26B&u46xID3^T`y1rE~r{} z4HjeBA*lNHb`$Qk2L{l@sw%Jz$uqBQbbTEmYq85)J^^+lKWmwgN@9l0-A6v^Ge#Lz zWvGgRhycr%x}55)M4?D-M5@PTB((DKzLYouu^g!!n}8Lv9mi2CZyZWND}lQ7TLj}j z9jN_@n4?1s5L5z3sFMtpwYYx165`gnzP&mF{5U0*lxeF*KylE+OGr>k+q|sk|?hrz7cXyYj(MB41{QueKo^$s( z`@Z|$cz29fqefSES5SAv0;A-XQ272Ek z2mp`(WW+_(y)w_%Err#z2mo7m6viNhuh5v(-{EzsX_%Zq3o2MuzB}crcg@u&hm{rX zljT^-$z(J%U(ANlMTEKPwLDyUh>b&j2#ZcNXCJ1xB=~M*`kwoPKHq&3M~D7`8)l44 zg*3B>3g}}&?tAffdi?XYWl7tu|62ls|#io%beUm)Id=Jkgc`D!-B9-Kzc${LJ{((Sq4>V04?XCeOL#t z+ZgfjZy!R`?|gd3>s zqogm(gDg@}dejgA!qW}sd^YC;0zyi)nqpP=bV9pkIk9}cp=O323sxFBL1PV@I!tj6 z7u>u3H8Yba$|)By;6Ayo+Q*=otM8XooA>_6nm5}`hJ;+ypO5G4U4xRaomQiM{LI!F zgKa4DZEnX;uuVazP=#__yNQ}G4{A^AE7fU`@^Glls54Req>!`2Gn^j)howou-+fA) z@H5$9;QeW+RN#wZL$~*(pwyH5t+ACXt&vPm!)#)kS`WKsxsLcTi|=cr|3@xIr;l}EHi1kc4$BvND(v_21Pe9eO#ETYDl@jKjIRXuAN>0jC?lngKyx!a;c zxQiB1?^>e!dk4qfHDYfX{QAPs#;d4s*%PxhU|`BGL*`Df;Xgv7cH8*`=j(vBk*LN9 zj`&R-Ae8NR+JAy$$#FQ5akLmrk-6F4clIK#*>$HmMX`Jv%Uk@2{_Y_4xahs8E%(7Cm zV)8m58wwBY1pT@v{nrFbKctR{tF<=!eU_$Dg2Cz@+e_8Zt$OH{ELg?VHK~rvpXHv5 z0UdY+f`0qWSs4}{j5h-QbR;z|CJAO7F6-($6V;IApn=mwS0p<3RQI#VXH)stJZc3a zA^xQEYXQ4mR6X$ts8t|4uKHOHF~W11>Iqhks_06)Ek;}=Xk%bhIVIxZWyBYS=59P zG`9OOGFW1^tA@|vN=AXC7v~&TvqsI~iSGfz?q|a}VautdhQGz3K3zA<;NJCQyJ2SE6UO2R=Ci$`Ev&9!P3TU=e4fb5$%jw#$}hNw+jg zQ5#05=Z@<5#c6jUnbn{XLsYsYjyjl<71nqo=Q;5vOW%GDW7%e<*Yw&>9V3G4cJd~D zY;yc^owI-Adl0&6F*wUJU4_XIN5gSqlfE{KeMFzZ`mcZ(;bSjQ?}l!W5gl*5nS@qv zi;97zG(bbXI)6&-#yz8|v2e~7FnOJi&5g}{J2Em2KF~D_iLa07YwX&Q`cxXgGx^o5 z=M|)O4N*zL5T#3=b+AAxoXbwU0B93eGXfn&MWmNa?jMZ;3l}yC$+T%b1BvDpa7aVu z7{e<&U$SXEa493Rb1u#Aat`fC0_MMRbSPyC_-D%H#Kuh2JiydEZ+>i^wM0EOu~;x4 z8aw9tyKN%xde}YcKWwACyi>}&nP3N57RW&XNKWt49(Nn3C%UX+9N5su z&|Zuw!^jU}J!=bfd82dKf9HPY*b3JEN|#ll*dYCKeE`wieA(?>wNa?QX}vqN!I5(h zLGO3_Q+KBiK-c`Eug_Z;OeG3$?TPmsrDXxcFwlN{Y)Oly8i6M*bXYN6t6s!1(V$>J z65<_~gd;7SH=Q6RUShPt(l4@F29Z zEnji^R?Dz($Cw~!-JDSc(`5Q;EdF7FxX>LuNw(5@v*MF9Y~XsC`^)TrOQY;F&A zjSiIeGHXzAwMRP74Ah%%_puibvmI@$7?$+K#;O+}=lgFrWMy%=2D!bnZ)nMbr@hj+vLH;l6>Jq@|w>+! 
z;zCN8^JWx1c9VrF{dZpna-%|_d*y_T<8IcC)G?q%&~qlE&Rvj38b*(2(bfaA5|ban zCvlmZJ;dEfy*B$Gx;sP9QnLdf+I(gtGAQQ#jr31b03g8nc5BmUJ*~eJdQX8Z@Wl7QmG0gOK{L?>QM zdC}Jo$8n)0ICg%9hCMWwSCBe1K@Aq9-~hT6p@4&?%Hl~M{H+uV7as=R{l5G7><=C; zUXSOjyDy6u)(9A$W-*Q_U3sB)n%k@FLT<|Tb>MWg8eWsA)irvn>EP74*3(_JxOr|0 zkqskPxTpJHUyiFv6B|RQGse2s!fQ~<0!MVRE<7m)eL}h%Y#kQf_m0RsA{RGn;==;~ z*4no$-zH_+?4SU&$DRai3*%vTWsSTYLrzoLErQr5jAw;U`Frx!HgE*w)2M}JYhA%~ z)YoD^wEN55J}u}8053K?{Tj5-W(osh7T_kBFF+o!xej1r@PaKm(c5C7wO z$b;xtY_95%^^d939pNae55xNx4Izz*+=qkv=Y;GJcaMAmPn|eQGxLPu?skb?fbVPr zo6h2QDqK~*L!=B!N)_S3xg*chLh_YtI&6=FQZmLa z(aLm=_KszAXEZzo;@r_z7)}F+L=5Qbm~d}bh584u(XCCvsnTLnGJm98%)>0>@laLr zs!$s5{#}P$dEj$^np0VPf}hXA&qLHfkmGUHx7Op90K=K+V2q?cLL{B-^)yueS!MmC z0mS>K_`)122d*(t)E<-}p;E?G9qxT(8-lE4RT6-g`=1YaAuWCXMzh@4Eb4-L^* zwm7nv|iHCnTT$$}le;eQy(2G)&NvO&sf2>}4a= z5k#t5c;$pHM_~~vstMU*{I$Y}k@a2Cblhl$pK5BLJwP>Q&qxKwGx%ni-{KyT?&QUm zE#>W^Uyq~33aYF;&kBw_(VxZ|VgWZO?msw61Yt$aiT#rp7TC0Z|6WxzTVL)?5|`Lg zCw$hKq(=xi2z`eMv{sU}Ic?y-fU_^^2*2PM>2Ah8({8m@j1g=vAMw^l%^Tn~8~ zbM5-Z{{&J+=pTG`Jl@HAfoYom!wkRF=5_g_^h&?SNUMq$H(u)ne6iEt0VRY4PfGuM z`RYO3{ldn2wv^2`?btPW#^ZFW5OE*(F|m8C#p|+nXrx_pLiWjje#hwCGO-(CI_oky zdRc@Q=*?v4FBriHX7pI&vSranr!y9^3nFYOA{U-4phfYxI;rg+_@Xw8jPTYhh=3w& zcV2xI@cH*I${#*wPi%4P;Jmd?9xHh;MicpYxLETSmREE3aRa+u*$shEO2b{w#LSw# zIZecCkA&NX-DaKOjkWlLyGAOk(ec&>>$<`IyM1fV7Dgja zx=xnY8%%!p&m}nqreXh2%PbLhFBC12$hb;&)o%hMHpC=LT#Yk6s8`p7JZyZh~NQ1^UEp6K*-^|;lL1avl*dehI&St`Ezr(+k156r8$X^MliNoTdx z`Bw0wYyj3CjhuUr{Rhq^)(7GxNb-_*Y#-KtYSrP-DIl1kzzUx=%4!q2dmmPBc9E+h z((yo%hD*SQtGBpe4-fmexSeiiOm3Pdv3NrXMk#0~v%mnF;#%!JAB~8#TpQvmWm?cT zf7*0Ox;fm?sLLW#U@4e~B+ez$(xs13kkFQ?d0~Bm(k2d0xW}p)JmPMBJWf{m{od3Z zKyp7n@29M(<=_KrJ=fpbTxC@ORgtL?7-qk%?8=2!&f&{Wc43bVS6Rp*me2dJ@nZkl zK@1jr8)V>IY1`%bBgm0#oR0f=5kE_6S@(~fRasbhlQm;sUwb4FAY9;;g{7)`bYpX z5+*Rn2;?b@UTyF8{a%<~RZ~+^wX#+~@iQ`v&53TA?KY0f9p8*&$+dTG8<0ZJoIYt|L*QOxnH7Cb2m)gcaWut#35l*}xlHFS|ilV_> z?oe_@oR#}knvnP5E6Lr0+aU~y`WfW8$=XbXpNP0wwvK}?dVD+R)8Rg}#BPznv@STi zsj3y4QakBg=ot*Ma&PoJTlWjpDy&>SQr=KSh6kXFmL3obqc^R#K)&78WgBDwlf%?+ zkmt8dotttm#Q+pVRq)Dj<>=o506){SVWBK&wE~xZ5(Wc8Bqme8 za@@?J15~G{VF0h3G=t}q;(-hGu|fy0)SuM#DAdOyI}A9U^FD= zbImI$<(OlNZxI2xT+__2jx7USCV-)VUFPoBGmJCh8@9dSr2Rq9%;k?b-Az*CWgp0X z5L8A2th{%*dVHY(%32|T{PQ&r6E2vH9~r;XI2=4B=9}lk%#Sl#W9t|g5iXg}582J7 zuNMHk(gHPt%ge$t;m8UL%PJ79DL;+pxi+u7N7ry1<;l&fA?;S6aHRHb&MlNqw||X6 z^%d5%QBF(*Gt)%+{P^>p1|ChpGBwYAkW?^WF3QOWiNNum1T%TL{fO}yH>`TJ6w;(|Rn>RPz}UZ=_40Z7t=(~=}@ z6>&q8+RUx-G&HILyZii!!7^@>ddiqM&uKHg8}fLG5LY%h$(^WzZ&!s!e8pzP9619|ie42Y#zhNfLcz98tXaQF ziuKI6pij|Hm&s`#$jX!=VtIIJhZ%9MynF_Ik`#L~bT zOX#}LvbC^c@ulo(>{(jwW7g@pGv=AitRp8;1Xg-44G;ULh`k{jlI|p!ei;ybqn)o> zMrMudT9B}loNGh9i(H$1M{XYOipQ);aZio68B?Xa97ClJMc1>d>cm9)>k|0BLJCP1 ztoZOyR;pL{^P#j!kDI&Bz1yH7b{|NXy>wOCqBAvJcARjvfW9dbaZ)*{;&^SMy5zUD zPqEkKpOK@wg6i1E-oUMZ5U~XnFUveGxViio(sUbfUS0FybMC?_)pg<#D@@EBixJ^kLYR zScRPlV(XFt9GWLL3J_xCM(%_<%|{nTK?Jvg2bliy(J{#FcdI6g#4kH2n?1hYQ30i? 
zO;w!2Gr6FI11Ot^-Kb-Q;!P6@TiCIS1_)%_Klk}C_;UhEsXi}bf5X#^p*G5!Yewy5z84hlb7&5T z>taS|`A6m=VjDBAdN-QTsg89C`5bJ7-+GlWAJ^R6j?hOPPJ&vO%@V+T8{T?Ia=WQt z9@))4kftf094C$}6yJU_4G-U`g)4})=Ms>gGoAg>sUmOqu96ecA>AVfM*A$|oJ$WtuuOS)Li z#j`$;BxvCn{k!MC$M6*dt+%8#z|^&t^uc7TuVEb>l=9?#-! zPE2-P^KHWvr+m%YwrTOxs7$onbJn9Kd!MI#Q_kNlaE!c7J_{DD?055$GRg+@36-rzI{Zf1l1rA%*%$zPvXbNKbTekW0SHU+}P5|E)X^(Hp+gGLD5PLpXHy1XzwU=gaU|HHe z>~Zl0wq}&zSHhW|Z&p}nyjw64knoF>d5R_IEX&HGgdUY&QA9aZTYW(Y4zyc4oVs&% z4^dA1zQLp3sN<rDaBX z;aG`T(0*fY0T5_^WC#))2yf_2sWe$EY{5R0&(w+wd1YAe5=7IDgU;(3gH_!%BFu6( z`z%$YpO95*VBhKjQ3RT|}kmQ;+!ReL_sN3^u^Jh2{kk;RSK;*P4qpg(a*giE6U z)(8Ftb8L`+7YGyd`uT^KK8HRL)h&2JR&OuX7UZ>fgXTX&K0MrXDMlq}wgf#$udIe+ z3iOL_%N3ilZVKwPbrfRwUU8oHV_OpZY+goJW3rU?=7@m1xgrZ@=s)Y1o1j%JzKaK% zpEixWzJ#LWO5?*&l2{hrA7AeUbG#zIo~^mavfPA%?e=~NvZ*NN>}RJ2(;c13YJ2qE zFg~E~dDwArs_ZzmFb+FBpd)`*2ZZtxsgOZ}M*-u(aIunUvQ-`W8@AveotCzo#*5#T zFt)b*-7DM49o{f{Oz_E61o@rM%21h;Q_Qw9CJQ}_agn9*IlOm{=!{SIt1V*4FN zRnI_&-^G8`Hgeo<6+~=`U|1cYt0kJQ5^#@6WDlS@N=?`($#^<`9i8ckO2O6FT4A+v z_iKwTQOv2Nq4 zpyA5tXsip_Lhs1qsg<^=I{B_!Z_*EC!BsOE{J>{9V-Rq+ZR08?rlw@$e$HDV&Vh>w zA3kLlhrsJmc4|AHSvf%5a$VMHdqjib&CuOeg;b()EPL+15<)H@f0ja^zXUG3Ss2CU z@j5~Z=BaUmO%SYTNeqlgH~fAKPEA$!Xm&nXK$+U#CP&}XvejAXnrx38kBw>nX-=;$ zZB#Xgvr}>Kf#0J0Cq2Oy-PC1KaWLCG*3YE#qFOJ9n~{SqH7V~wJnLi-h0kb1rrL}C zk$U7@J`-=!qluD>v4>1Ez}uIXlkxkT6)`DD<`WsA_3J7zKGAdvy=1g9jhJqel~Dz^ zL-m#AB@k$G$bXn8PKkbAtHZmBmCue#sXg~Po9fGGD=aYtQD;yQIgtvZauR4?H~NvB zBx9KqzvMV4RYJ+t+7C+EQeP3CarwOphPp4d7mWn|h*blm=LZUF{(1lG4%T3l zB>TXB>CHhl11F;IQmz6czf;xW#k2$<4omweB|M;qH` z+j*yr$S;nI;e}=@ui^}V6R4QyU454dda!M+SMV74r_Lju4pY|G47opb{asDt7`_%| z{!umrLw34mklAD$mXy6wQC!{wA_SpRV@7dv;K8M_5+HoT=%Aa=M6 zfekJ4l~7~nlpc=OpDI;OJfGiil0U*C<=XOvfxE0`_KW_A>;nBr1Z{w{3BWL5G6u}W h`G6zizrUP+f^*VuTSt(*o&f`. + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.add + Series.sub + Series.mul + Series.div + Series.truediv + Series.floordiv + Series.mod + Series.pow + Series.radd + Series.rsub + Series.rmul + Series.rdiv + Series.rtruediv + Series.rfloordiv + Series.rmod + Series.rpow + Series.combine + Series.combine_first + Series.round + Series.lt + Series.gt + Series.le + Series.ge + Series.ne + Series.eq + +Function application, GroupBy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.apply + Series.map + Series.groupby + +.. _api.series.stats: + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.abs + Series.all + Series.any + Series.autocorr + Series.between + Series.clip + Series.clip_lower + Series.clip_upper + Series.corr + Series.count + Series.cov + Series.cummax + Series.cummin + Series.cumprod + Series.cumsum + Series.describe + Series.diff + Series.factorize + Series.kurt + Series.mad + Series.max + Series.mean + Series.median + Series.min + Series.mode + Series.pct_change + Series.prod + Series.quantile + Series.rank + Series.sem + Series.skew + Series.std + Series.sum + Series.var + Series.unique + Series.nunique + Series.value_counts + +Reindexing / Selection / Label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.align + Series.drop + Series.equals + Series.first + Series.head + Series.idxmax + Series.idxmin + Series.isin + Series.last + Series.reindex + Series.reindex_like + Series.rename + Series.reset_index + Series.select + Series.take + Series.tail + Series.truncate + +Missing data handling +~~~~~~~~~~~~~~~~~~~~~ +.. 
Plotting
~~~~~~~~
.. currentmodule:: pandas

.. autosummary::
   :toctree: generated/

   Series.hist
   Series.plot

Serialization / IO / Conversion
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   Series.from_csv
   Series.to_pickle
   Series.to_csv
   Series.to_dict
   Series.to_frame
   Series.to_hdf
   Series.to_sql
   Series.to_msgpack
   Series.to_json
   Series.to_sparse
   Series.to_dense
   Series.to_string
   Series.to_clipboard

.. _api.dataframe:

DataFrame
---------

Constructor
~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame

Attributes and underlying data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Axes**

  * **index**: row labels
  * **columns**: column labels

.. autosummary::
   :toctree: generated/

   DataFrame.as_matrix
   DataFrame.dtypes
   DataFrame.ftypes
   DataFrame.get_dtype_counts
   DataFrame.get_ftype_counts
   DataFrame.select_dtypes
   DataFrame.values
   DataFrame.axes
   DataFrame.ndim
   DataFrame.shape

Conversion
~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.astype
   DataFrame.convert_objects
   DataFrame.copy
   DataFrame.isnull
   DataFrame.notnull

Indexing, iteration
~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.head
   DataFrame.at
   DataFrame.iat
   DataFrame.ix
   DataFrame.loc
   DataFrame.iloc
   DataFrame.insert
   DataFrame.__iter__
   DataFrame.iteritems
   DataFrame.iterrows
   DataFrame.itertuples
   DataFrame.lookup
   DataFrame.pop
   DataFrame.tail
   DataFrame.xs
   DataFrame.isin
   DataFrame.query

For more information on ``.at``, ``.iat``, ``.ix``, ``.loc``, and
``.iloc``, see the :ref:`indexing documentation <indexing>`.

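A rough sketch of the difference between the label-based and position-based accessors listed above (the frame and its labels are invented for illustration):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]},
                     index=['x', 'y', 'z'])

   df.loc['y', 'b']        # label-based scalar access -> 5.0
   df.iloc[1, 1]           # position-based access     -> 5.0
   df.at['y', 'b']         # fast scalar access by label
   df.iat[1, 1]            # fast scalar access by position
   df.loc['x':'y', ['a']]  # label slices include both endpoints
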
Binary operator functions
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.add
   DataFrame.sub
   DataFrame.mul
   DataFrame.div
   DataFrame.truediv
   DataFrame.floordiv
   DataFrame.mod
   DataFrame.pow
   DataFrame.radd
   DataFrame.rsub
   DataFrame.rmul
   DataFrame.rdiv
   DataFrame.rtruediv
   DataFrame.rfloordiv
   DataFrame.rmod
   DataFrame.rpow
   DataFrame.lt
   DataFrame.gt
   DataFrame.le
   DataFrame.ge
   DataFrame.ne
   DataFrame.eq
   DataFrame.combine
   DataFrame.combineAdd
   DataFrame.combine_first
   DataFrame.combineMult

Function application, GroupBy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.apply
   DataFrame.applymap
   DataFrame.groupby

.. _api.dataframe.stats:

Computations / Descriptive Stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.abs
   DataFrame.all
   DataFrame.any
   DataFrame.clip
   DataFrame.clip_lower
   DataFrame.clip_upper
   DataFrame.corr
   DataFrame.corrwith
   DataFrame.count
   DataFrame.cov
   DataFrame.cummax
   DataFrame.cummin
   DataFrame.cumprod
   DataFrame.cumsum
   DataFrame.describe
   DataFrame.diff
   DataFrame.eval
   DataFrame.kurt
   DataFrame.mad
   DataFrame.max
   DataFrame.mean
   DataFrame.median
   DataFrame.min
   DataFrame.mode
   DataFrame.pct_change
   DataFrame.prod
   DataFrame.quantile
   DataFrame.rank
   DataFrame.sem
   DataFrame.skew
   DataFrame.sum
   DataFrame.std
   DataFrame.var

Reindexing / Selection / Label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.add_prefix
   DataFrame.add_suffix
   DataFrame.align
   DataFrame.drop
   DataFrame.drop_duplicates
   DataFrame.duplicated
   DataFrame.equals
   DataFrame.filter
   DataFrame.first
   DataFrame.head
   DataFrame.idxmax
   DataFrame.idxmin
   DataFrame.last
   DataFrame.reindex
   DataFrame.reindex_axis
   DataFrame.reindex_like
   DataFrame.rename
   DataFrame.reset_index
   DataFrame.select
   DataFrame.set_index
   DataFrame.tail
   DataFrame.take
   DataFrame.truncate

.. _api.dataframe.missing:

Missing data handling
~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.dropna
   DataFrame.fillna
   DataFrame.replace

Reshaping, sorting, transposing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
   :toctree: generated/

   DataFrame.delevel
   DataFrame.pivot
   DataFrame.reorder_levels
   DataFrame.sort
   DataFrame.sort_index
   DataFrame.sortlevel
   DataFrame.swaplevel
   DataFrame.stack
   DataFrame.unstack
   DataFrame.T
   DataFrame.to_panel
   DataFrame.transpose

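A minimal sketch of how the main reshaping methods listed above relate to one another (the small frame below is invented for illustration):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'date': ['2014-01', '2014-01', '2014-02', '2014-02'],
                      'var': ['x', 'y', 'x', 'y'],
                      'val': [1, 2, 3, 4]})

   # pivot: long format -> wide format, one column per value of 'var'
   wide = df.pivot(index='date', columns='var', values='val')

   # stack moves the column level into a (date, var) MultiIndex ...
   stacked = wide.stack()

   # ... and unstack reverses it
   round_trip = stacked.unstack()
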
autosummary:: + :toctree: generated/ + + DataFrame.from_csv + DataFrame.from_dict + DataFrame.from_items + DataFrame.from_records + DataFrame.info + DataFrame.to_pickle + DataFrame.to_csv + DataFrame.to_hdf + DataFrame.to_sql + DataFrame.to_dict + DataFrame.to_excel + DataFrame.to_json + DataFrame.to_html + DataFrame.to_latex + DataFrame.to_stata + DataFrame.to_msgpack + DataFrame.to_gbq + DataFrame.to_records + DataFrame.to_sparse + DataFrame.to_dense + DataFrame.to_string + DataFrame.to_clipboard + +.. _api.panel: + +Panel +------ + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + + * **items**: axis 0; each item corresponds to a DataFrame contained inside + * **major_axis**: axis 1; the index (rows) of each of the DataFrames + * **minor_axis**: axis 2; the columns of each of the DataFrames + +.. autosummary:: + :toctree: generated/ + + Panel.values + Panel.axes + Panel.ndim + Panel.shape + Panel.dtypes + Panel.ftypes + Panel.get_dtype_counts + Panel.get_ftype_counts + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.astype + Panel.copy + Panel.isnull + Panel.notnull + +Getting and setting +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.get_value + Panel.set_value + +Indexing, iteration, slicing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.at + Panel.iat + Panel.ix + Panel.loc + Panel.iloc + Panel.__iter__ + Panel.iteritems + Panel.pop + Panel.xs + Panel.major_xs + Panel.minor_xs + +For more information on ``.at``, ``.iat``, ``.ix``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.add + Panel.sub + Panel.mul + Panel.div + Panel.truediv + Panel.floordiv + Panel.mod + Panel.pow + Panel.radd + Panel.rsub + Panel.rmul + Panel.rdiv + Panel.rtruediv + Panel.rfloordiv + Panel.rmod + Panel.rpow + Panel.lt + Panel.gt + Panel.le + Panel.ge + Panel.ne + Panel.eq + +Function application, GroupBy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.apply + Panel.groupby + +.. _api.panel.stats: + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.abs + Panel.clip + Panel.clip_lower + Panel.clip_upper + Panel.count + Panel.cummax + Panel.cummin + Panel.cumprod + Panel.cumsum + Panel.max + Panel.mean + Panel.median + Panel.min + Panel.pct_change + Panel.prod + Panel.sem + Panel.skew + Panel.sum + Panel.std + Panel.var + +Reindexing / Selection / Label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.add_prefix + Panel.add_suffix + Panel.drop + Panel.equals + Panel.filter + Panel.first + Panel.last + Panel.reindex + Panel.reindex_axis + Panel.reindex_like + Panel.rename + Panel.select + Panel.take + Panel.truncate + +Missing data handling +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.dropna + Panel.fillna + +Reshaping, sorting, transposing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.sort_index + Panel.swaplevel + Panel.transpose + Panel.swapaxes + Panel.conform + +Combining / joining / merging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.join + Panel.update + +Time series-related +~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: generated/ + + Panel.asfreq + Panel.shift + Panel.resample + Panel.tz_convert + Panel.tz_localize + +Serialization / IO / Conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel.from_dict + Panel.to_pickle + Panel.to_excel + Panel.to_hdf + Panel.to_json + Panel.to_sparse + Panel.to_frame + Panel.to_clipboard + +.. _api.panel4d: + +Panel4D +------- + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel4D + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + + * **labels**: axis 1; each label corresponds to a Panel contained inside + * **items**: axis 2; each item corresponds to a DataFrame contained inside + * **major_axis**: axis 3; the index (rows) of each of the DataFrames + * **minor_axis**: axis 4; the columns of each of the DataFrames + +.. autosummary:: + :toctree: generated/ + + Panel4D.values + Panel4D.axes + Panel4D.ndim + Panel4D.shape + Panel4D.dtypes + Panel4D.ftypes + Panel4D.get_dtype_counts + Panel4D.get_ftype_counts + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Panel4D.astype + Panel4D.copy + Panel4D.isnull + Panel4D.notnull + +.. _api.index: + +Index +----- + +**Many of these methods or variants thereof are available on the objects +that contain an index (Series/Dataframe) and those should most likely be +used before calling these methods directly.** + +.. autosummary:: + :toctree: generated/ + + Index + +Modifying and Computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.copy + Index.delete + Index.diff + Index.sym_diff + Index.drop + Index.equals + Index.factorize + Index.identical + Index.insert + Index.order + Index.reindex + Index.repeat + Index.set_names + Index.unique + Index.nunique + Index.value_counts + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.astype + Index.tolist + Index.to_datetime + Index.to_series + +Sorting +~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.argsort + Index.order + Index.sort + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.shift + +Combining / joining / merging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.append + Index.intersection + Index.join + Index.union + +Selecting +~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.get_indexer + Index.get_indexer_non_unique + Index.get_level_values + Index.get_loc + Index.get_value + Index.isin + Index.slice_indexer + Index.slice_locs + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Index.is_monotonic + Index.is_numeric + +.. _api.datetimeindex: + +DatetimeIndex +------------- + +.. autosummary:: + :toctree: generated/ + + DatetimeIndex + +Time/Date Components +~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + DatetimeIndex.year + DatetimeIndex.month + DatetimeIndex.day + DatetimeIndex.hour + DatetimeIndex.minute + DatetimeIndex.second + DatetimeIndex.microsecond + DatetimeIndex.nanosecond + DatetimeIndex.date + DatetimeIndex.time + DatetimeIndex.dayofyear + DatetimeIndex.weekofyear + DatetimeIndex.week + DatetimeIndex.dayofweek + DatetimeIndex.weekday + DatetimeIndex.quarter + DatetimeIndex.tz + DatetimeIndex.freq + DatetimeIndex.freqstr + DatetimeIndex.is_month_start + DatetimeIndex.is_month_end + DatetimeIndex.is_quarter_start + DatetimeIndex.is_quarter_end + DatetimeIndex.is_year_start + DatetimeIndex.is_year_end + +Selecting +~~~~~~~~~ +.. 
autosummary:: + :toctree: generated/ + + DatetimeIndex.indexer_at_time + DatetimeIndex.indexer_between_time + + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DatetimeIndex.normalize + DatetimeIndex.snap + DatetimeIndex.tz_convert + DatetimeIndex.tz_localize + + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DatetimeIndex.to_datetime + DatetimeIndex.to_period + DatetimeIndex.to_pydatetime + DatetimeIndex.to_series + +GroupBy +------- +.. currentmodule:: pandas.core.groupby + +GroupBy objects are returned by groupby calls: :func:`pandas.DataFrame.groupby`, :func:`pandas.Series.groupby`, etc. + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + GroupBy.__iter__ + GroupBy.groups + GroupBy.indices + GroupBy.get_group + +.. currentmodule:: pandas + +.. autosummary:: + :toctree: generated/ + + Grouper + +.. currentmodule:: pandas.core.groupby + +Function application +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + GroupBy.apply + GroupBy.aggregate + GroupBy.transform + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + GroupBy.mean + GroupBy.median + GroupBy.sem + GroupBy.std + GroupBy.var + GroupBy.ohlc + +.. currentmodule:: pandas + +General utility functions +------------------------- + +Working with options +~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + describe_option + reset_option + get_option + set_option + option_context + + +.. + HACK - see github issue #4539. To ensure old links remain valid, include + here the autosummaries with previous currentmodules as a comment and add + them to a hidden toctree (to avoid warnings): + +.. toctree:: + :hidden: + + generated/pandas.core.common.isnull + generated/pandas.core.common.notnull + generated/pandas.core.reshape.get_dummies + generated/pandas.io.clipboard.read_clipboard + generated/pandas.io.excel.ExcelFile.parse + generated/pandas.io.excel.read_excel + generated/pandas.io.html.read_html + generated/pandas.io.json.read_json + generated/pandas.io.parsers.read_csv + generated/pandas.io.parsers.read_fwf + generated/pandas.io.parsers.read_table + generated/pandas.io.pickle.read_pickle + generated/pandas.io.pytables.HDFStore.append + generated/pandas.io.pytables.HDFStore.get + generated/pandas.io.pytables.HDFStore.put + generated/pandas.io.pytables.HDFStore.select + generated/pandas.io.pytables.read_hdf + generated/pandas.io.sql.read_sql + generated/pandas.io.sql.read_frame + generated/pandas.io.sql.write_frame + generated/pandas.io.stata.read_stata + generated/pandas.stats.moments.ewma + generated/pandas.stats.moments.ewmcorr + generated/pandas.stats.moments.ewmcov + generated/pandas.stats.moments.ewmstd + generated/pandas.stats.moments.ewmvar + generated/pandas.stats.moments.expanding_apply + generated/pandas.stats.moments.expanding_corr + generated/pandas.stats.moments.expanding_count + generated/pandas.stats.moments.expanding_cov + generated/pandas.stats.moments.expanding_kurt + generated/pandas.stats.moments.expanding_mean + generated/pandas.stats.moments.expanding_median + generated/pandas.stats.moments.expanding_quantile + generated/pandas.stats.moments.expanding_skew + generated/pandas.stats.moments.expanding_std + generated/pandas.stats.moments.expanding_sum + generated/pandas.stats.moments.expanding_var + generated/pandas.stats.moments.rolling_apply + generated/pandas.stats.moments.rolling_corr + 
generated/pandas.stats.moments.rolling_count + generated/pandas.stats.moments.rolling_cov + generated/pandas.stats.moments.rolling_kurt + generated/pandas.stats.moments.rolling_mean + generated/pandas.stats.moments.rolling_median + generated/pandas.stats.moments.rolling_quantile + generated/pandas.stats.moments.rolling_skew + generated/pandas.stats.moments.rolling_std + generated/pandas.stats.moments.rolling_sum + generated/pandas.stats.moments.rolling_var + generated/pandas.tools.merge.concat + generated/pandas.tools.merge.merge + generated/pandas.tools.pivot.pivot_table + generated/pandas.tseries.tools.to_datetime + +.. + .. currentmodule:: pandas.io.pickle + + .. autosummary:: + :toctree: generated/ + + read_pickle + + .. currentmodule:: pandas.io.parsers + + .. autosummary:: + :toctree: generated/ + + read_table + read_csv + read_fwf + + .. currentmodule:: pandas.io.clipboard + + .. autosummary:: + :toctree: generated/ + + read_clipboard + + .. currentmodule:: pandas.io.excel + + .. autosummary:: + :toctree: generated/ + + read_excel + ExcelFile.parse + + .. currentmodule:: pandas.io.json + + .. autosummary:: + :toctree: generated/ + + read_json + + .. currentmodule:: pandas.io.html + + .. autosummary:: + :toctree: generated/ + + read_html + + .. currentmodule:: pandas.io.pytables + + .. autosummary:: + :toctree: generated/ + + read_hdf + HDFStore.put + HDFStore.append + HDFStore.get + HDFStore.select + + .. currentmodule:: pandas.io.sql + + .. autosummary:: + :toctree: generated/ + + read_sql + read_frame + write_frame + + .. currentmodule:: pandas.io.stata + + .. autosummary:: + :toctree: generated/ + + read_stata + StataReader.data + StataReader.data_label + StataReader.value_labels + StataReader.variable_labels + StataWriter.write_file + + .. currentmodule:: pandas.tools.pivot + + .. autosummary:: + :toctree: generated/ + + pivot_table + + .. currentmodule:: pandas.tools.merge + + .. autosummary:: + :toctree: generated/ + + merge + concat + + .. currentmodule:: pandas.core.reshape + + .. autosummary:: + :toctree: generated/ + + get_dummies + + .. currentmodule:: pandas.core.common + + .. autosummary:: + :toctree: generated/ + + isnull + notnull + + .. currentmodule:: pandas.tseries.tools + + .. autosummary:: + :toctree: generated/ + + to_datetime + + + .. currentmodule:: pandas.stats.moments + + .. autosummary:: + :toctree: generated/ + + rolling_count + rolling_sum + rolling_mean + rolling_median + rolling_var + rolling_std + rolling_corr + rolling_cov + rolling_skew + rolling_kurt + rolling_apply + rolling_quantile + + + .. currentmodule:: pandas.stats.moments + + .. autosummary:: + :toctree: generated/ + + expanding_count + expanding_sum + expanding_mean + expanding_median + expanding_var + expanding_std + expanding_corr + expanding_cov + expanding_skew + expanding_kurt + expanding_apply + expanding_quantile + + + .. autosummary:: + :toctree: generated/ + + ewma + ewmstd + ewmvar + ewmcorr + ewmcov diff --git a/doc/source/basics.rst b/doc/source/basics.rst new file mode 100644 index 00000000..ec845608 --- /dev/null +++ b/doc/source/basics.rst @@ -0,0 +1,1635 @@ +.. currentmodule:: pandas +.. _basics: + +.. 
ipython:: python + :suppress: + + import numpy as np + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + from pandas.compat import lrange + options.display.max_rows=15 + +============================== + Essential Basic Functionality +============================== + +Here we discuss a lot of the essential functionality common to the pandas data +structures. Here's how to create some of the objects used in the examples from +the previous section: + +.. ipython:: python + + index = date_range('1/1/2000', periods=8) + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = DataFrame(randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + +.. _basics.head_tail: + +Head and Tail +------------- + +To view a small sample of a Series or DataFrame object, use the ``head`` and +``tail`` methods. The default number of elements to display is five, but you +may pass a custom number. + +.. ipython:: python + + long_series = Series(randn(1000)) + long_series.head() + long_series.tail(3) + +.. _basics.attrs: + +Attributes and the raw ndarray(s) +--------------------------------- + +pandas objects have a number of attributes enabling you to access the metadata + + * **shape**: gives the axis dimensions of the object, consistent with ndarray + * Axis labels + + * **Series**: *index* (only axis) + * **DataFrame**: *index* (rows) and *columns* + * **Panel**: *items*, *major_axis*, and *minor_axis* + +Note, **these attributes can be safely assigned to**! + +.. ipython:: python + + df[:2] + df.columns = [x.lower() for x in df.columns] + df + +To get the actual data inside a data structure, one need only access the +**values** property: + +.. ipython:: python + + s.values + df.values + wp.values + +If a DataFrame or Panel contains homogeneously-typed data, the ndarray can +actually be modified in-place, and the changes will be reflected in the data +structure. For heterogeneous data (e.g. some of the DataFrame's columns are not +all the same dtype), this will not be the case. The values attribute itself, +unlike the axis labels, cannot be assigned to. + +.. note:: + + When working with heterogeneous data, the dtype of the resulting ndarray + will be chosen to accommodate all of the data involved. For example, if + strings are involved, the result will be of object dtype. If there are only + floats and integers, the resulting array will be of float dtype. + +.. _basics.accelerate: + +Accelerated operations +---------------------- + +pandas has support for accelerating certain types of binary numerical and boolean operations using +the ``numexpr`` library (starting in 0.11.0) and the ``bottleneck`` libraries. + +These libraries are especially useful when dealing with large data sets, and provide large +speedups. ``numexpr`` uses smart chunking, caching, and multiple cores. ``bottleneck`` is +a set of specialized cython routines that are especially fast when dealing with arrays that have +``nans``. + +Here is a sample (using 100 column x 100,000 row ``DataFrames``): + +.. csv-table:: + :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" + :widths: 25, 25, 25, 25 + :delim: ; + + ``df1 > df2``; 13.32; 125.35; 0.1063 + ``df1 * df2``; 21.71; 36.63; 0.5928 + ``df1 + df2``; 22.04; 36.50; 0.6039 + +You are highly encouraged to install both libraries. 
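+
+A quick way to confirm that the optional libraries are importable (a
+hypothetical snippet, not part of pandas itself; pandas simply falls back to
+plain NumPy when they are missing):
+
+.. code-block:: python
+
+   # hypothetical environment check; pandas falls back to plain NumPy
+   # automatically when the optional accelerators are not installed
+   try:
+       import numexpr
+       import bottleneck
+       print(numexpr.__version__, bottleneck.__version__)
+   except ImportError as err:
+       print("optional accelerator not installed: %s" % err)
+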
See the section +:ref:`Recommended Dependencies ` for more installation info. + +.. _basics.binop: + +Flexible binary operations +-------------------------- + +With binary operations between pandas data structures, there are two key points +of interest: + + * Broadcasting behavior between higher- (e.g. DataFrame) and + lower-dimensional (e.g. Series) objects. + * Missing data in computations + +We will demonstrate how to manage these issues independently, though they can +be handled simultaneously. + +Matching / broadcasting behavior +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DataFrame has the methods **add, sub, mul, div** and related functions **radd, +rsub, ...** for carrying out binary operations. For broadcasting behavior, +Series input is of primary interest. Using these functions, you can use to +either match on the *index* or *columns* via the **axis** keyword: + +.. ipython:: python + + df = DataFrame({'one' : Series(randn(3), index=['a', 'b', 'c']), + 'two' : Series(randn(4), index=['a', 'b', 'c', 'd']), + 'three' : Series(randn(3), index=['b', 'c', 'd'])}) + df + row = df.ix[1] + column = df['two'] + + df.sub(row, axis='columns') + df.sub(row, axis=1) + + df.sub(column, axis='index') + df.sub(column, axis=0) + +.. ipython:: python + :suppress: + + df_orig = df + +Furthermore you can align a level of a multi-indexed DataFrame with a Series. + +.. ipython:: python + + dfmi = df.copy() + dfmi.index = MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], + names=['first','second']) + dfmi.sub(column, axis=0, level='second') + +With Panel, describing the matching behavior is a bit more difficult, so +the arithmetic methods instead (and perhaps confusingly?) give you the option +to specify the *broadcast axis*. For example, suppose we wished to demean the +data over a particular axis. This can be accomplished by taking the mean over +an axis and broadcasting over the same axis: + +.. ipython:: python + + major_mean = wp.mean(axis='major') + major_mean + wp.sub(major_mean, axis='major') + +And similarly for ``axis="items"`` and ``axis="minor"``. + +.. note:: + + I could be convinced to make the **axis** argument in the DataFrame methods + match the broadcasting behavior of Panel. Though it would require a + transition period so users can change their code... + +Missing data / operations with fill values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In Series and DataFrame (though not yet in Panel), the arithmetic functions +have the option of inputting a *fill_value*, namely a value to substitute when +at most one of the values at a location are missing. For example, when adding +two DataFrame objects, you may wish to treat NaN as 0 unless both DataFrames +are missing that value, in which case the result will be NaN (you can later +replace NaN with some other value using ``fillna`` if you wish). + +.. ipython:: python + :suppress: + + df2 = df.copy() + df2['three']['a'] = 1. + +.. ipython:: python + + df + df2 + df + df2 + df.add(df2, fill_value=0) + +.. _basics.compare: + +Flexible Comparisons +~~~~~~~~~~~~~~~~~~~~ + +Starting in v0.8, pandas introduced binary comparison methods eq, ne, lt, gt, +le, and ge to Series and DataFrame whose behavior is analogous to the binary +arithmetic operations described above: + +.. ipython:: python + + df.gt(df2) + df2.ne(df) + +These operations produce a pandas object the same type as the left-hand-side input +that if of dtype ``bool``. These ``boolean`` objects can be used in indexing operations, +see :ref:`here` + +.. 
_basics.reductions: + +Boolean Reductions +~~~~~~~~~~~~~~~~~~ + +You can apply the reductions: ``empty``, ``any()``, ``all()``, and ``bool()`` to provide a +way to summarize a boolean result. + +.. ipython:: python + + (df>0).all() + (df>0).any() + +You can reduce to a final boolean value. + +.. ipython:: python + + (df>0).any().any() + +You can test if a pandas object is empty, via the ``empty`` property. + +.. ipython:: python + + df.empty + DataFrame(columns=list('ABC')).empty + +To evaluate single-element pandas objects in a boolean context, use the method ``.bool()``: + +.. ipython:: python + + Series([True]).bool() + Series([False]).bool() + DataFrame([[True]]).bool() + DataFrame([[False]]).bool() + +.. warning:: + + You might be tempted to do the following: + + .. code-block:: python + + >>>if df: + ... + + Or + + .. code-block:: python + + >>> df and df2 + + These both will raise as you are trying to compare multiple values. + + .. code-block:: python + + ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). + +See :ref:`gotchas` for a more detailed discussion. + +.. _basics.equals: + +Comparing if objects are equivalent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Often you may find there is more than one way to compute the same +result. As a simple example, consider ``df+df`` and ``df*2``. To test +that these two computations produce the same result, given the tools +shown above, you might imagine using ``(df+df == df*2).all()``. But in +fact, this expression is False: + +.. ipython:: python + + df+df == df*2 + (df+df == df*2).all() + +Notice that the boolean DataFrame ``df+df == df*2`` contains some False values! +That is because NaNs do not compare as equals: + +.. ipython:: python + + np.nan == np.nan + +So, as of v0.13.1, NDFrames (such as Series, DataFrames, and Panels) +have an ``equals`` method for testing equality, with NaNs in corresponding +locations treated as equal. + +.. ipython:: python + + (df+df).equals(df*2) + + +Combining overlapping data sets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A problem occasionally arising is the combination of two similar data sets +where values in one are preferred over the other. An example would be two data +series representing a particular economic indicator where one is considered to +be of "higher quality". However, the lower quality series might extend further +back in history or have more complete data coverage. As such, we would like to +combine two DataFrame objects where missing values in one DataFrame are +conditionally filled with like-labeled values from the other DataFrame. The +function implementing this operation is ``combine_first``, which we illustrate: + +.. ipython:: python + + df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 + df2 + df1.combine_first(df2) + +General DataFrame Combine +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``combine_first`` method above calls the more general DataFrame method +``combine``. This method takes another DataFrame and a combiner function, +aligns the input DataFrame and then passes the combiner function pairs of +Series (ie, columns whose names are the same). + +So, for instance, to reproduce ``combine_first`` as above: + +.. ipython:: python + + combiner = lambda x, y: np.where(isnull(x), y, x) + df1.combine(df2, combiner) + +.. 
_basics.stats: + +Descriptive statistics +---------------------- + +A large number of methods for computing descriptive statistics and other related +operations on :ref:`Series `, :ref:`DataFrame +`, and :ref:`Panel `. Most of these +are aggregations (hence producing a lower-dimensional result) like **sum**, +**mean**, and **quantile**, but some of them, like **cumsum** and **cumprod**, +produce an object of the same size. Generally speaking, these methods take an +**axis** argument, just like *ndarray.{sum, std, ...}*, but the axis can be +specified by name or integer: + + - **Series**: no axis argument needed + - **DataFrame**: "index" (axis=0, default), "columns" (axis=1) + - **Panel**: "items" (axis=0), "major" (axis=1, default), "minor" + (axis=2) + +For example: + +.. ipython:: python + + df + df.mean(0) + df.mean(1) + +All such methods have a ``skipna`` option signaling whether to exclude missing +data (``True`` by default): + +.. ipython:: python + + df.sum(0, skipna=False) + df.sum(axis=1, skipna=True) + +Combined with the broadcasting / arithmetic behavior, one can describe various +statistical procedures, like standardization (rendering data zero mean and +standard deviation 1), very concisely: + +.. ipython:: python + + ts_stand = (df - df.mean()) / df.std() + ts_stand.std() + xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0) + xs_stand.std(1) + +Note that methods like **cumsum** and **cumprod** preserve the location of NA +values: + +.. ipython:: python + + df.cumsum() + +Here is a quick reference summary table of common functions. Each also takes an +optional ``level`` parameter which applies only if the object has a +:ref:`hierarchical index`. + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``count``, Number of non-null observations + ``sum``, Sum of values + ``mean``, Mean of values + ``mad``, Mean absolute deviation + ``median``, Arithmetic median of values + ``min``, Minimum + ``max``, Maximum + ``mode``, Mode + ``abs``, Absolute Value + ``prod``, Product of values + ``std``, Unbiased standard deviation + ``var``, Unbiased variance + ``sem``, Unbiased standard error of the mean + ``skew``, Unbiased skewness (3rd moment) + ``kurt``, Unbiased kurtosis (4th moment) + ``quantile``, Sample quantile (value at %) + ``cumsum``, Cumulative sum + ``cumprod``, Cumulative product + ``cummax``, Cumulative maximum + ``cummin``, Cumulative minimum + +Note that by chance some NumPy methods, like ``mean``, ``std``, and ``sum``, +will exclude NAs on Series input by default: + +.. ipython:: python + + np.mean(df['one']) + np.mean(df['one'].values) + +``Series`` also has a method ``nunique`` which will return the number of unique +non-null values: + +.. ipython:: python + + series = Series(randn(500)) + series[20:500] = np.nan + series[10:20] = 5 + series.nunique() + +.. _basics.describe: + +Summarizing data: describe +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is a convenient ``describe`` function which computes a variety of summary +statistics about a Series or the columns of a DataFrame (excluding NAs of +course): + +.. ipython:: python + + series = Series(randn(1000)) + series[::2] = np.nan + series.describe() + frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame.ix[::2] = np.nan + frame.describe() + +You can select specific percentiles to include in the output: + +.. ipython:: python + + series.describe(percentiles=[.05, .25, .75, .95]) + +By default, the median is always included. 
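+
+For instance, even when ``0.5`` is left out of the requested percentiles, the
+50% row still appears in the output (a small sketch reusing the ``series``
+object created above):
+
+.. code-block:: python
+
+   # the median row is added automatically even though it was not requested
+   series.describe(percentiles=[.05, .95])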
+ +For a non-numerical Series object, `describe` will give a simple summary of the +number of unique values and most frequently occurring values: + + +.. ipython:: python + + s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s.describe() + + +There also is a utility function, ``value_range`` which takes a DataFrame and +returns a series with the minimum/maximum values in the DataFrame. + +.. _basics.idxmin: + +Index of Min/Max Values +~~~~~~~~~~~~~~~~~~~~~~~ + +The ``idxmin`` and ``idxmax`` functions on Series and DataFrame compute the +index labels with the minimum and maximum corresponding values: + +.. ipython:: python + + s1 = Series(randn(5)) + s1 + s1.idxmin(), s1.idxmax() + + df1 = DataFrame(randn(5,3), columns=['A','B','C']) + df1 + df1.idxmin(axis=0) + df1.idxmax(axis=1) + +When there are multiple rows (or columns) matching the minimum or maximum +value, ``idxmin`` and ``idxmax`` return the first matching index: + +.. ipython:: python + + df3 = DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) + df3 + df3['A'].idxmin() + +.. note:: + + ``idxmin`` and ``idxmax`` are called ``argmin`` and ``argmax`` in NumPy. + +.. _basics.discretization: + +Value counts (histogramming) / Mode +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``value_counts`` Series method and top-level function computes a histogram +of a 1D array of values. It can also be used as a function on regular arrays: + +.. ipython:: python + + data = np.random.randint(0, 7, size=50) + data + s = Series(data) + s.value_counts() + value_counts(data) + +Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: + +.. ipython:: python + + s5 = Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) + s5.mode() + df5 = DataFrame({"A": np.random.randint(0, 7, size=50), + "B": np.random.randint(-10, 15, size=50)}) + df5.mode() + + +Discretization and quantiling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Continuous values can be discretized using the ``cut`` (bins based on values) +and ``qcut`` (bins based on sample quantiles) functions: + +.. ipython:: python + + arr = np.random.randn(20) + factor = cut(arr, 4) + factor + + factor = cut(arr, [-5, -1, 0, 1, 5]) + factor + +``qcut`` computes sample quantiles. For example, we could slice up some +normally distributed data into equal-size quartiles like so: + +.. ipython:: python + + arr = np.random.randn(30) + factor = qcut(arr, [0, .25, .5, .75, 1]) + factor + value_counts(factor) + +We can also pass infinite values to define the bins: + +.. ipython:: python + + arr = np.random.randn(20) + factor = cut(arr, [-np.inf, 0, np.inf]) + factor + +.. _basics.apply: + +Function application +-------------------- + +Arbitrary functions can be applied along the axes of a DataFrame or Panel +using the ``apply`` method, which, like the descriptive statistics methods, +take an optional ``axis`` argument: + +.. ipython:: python + + df.apply(np.mean) + df.apply(np.mean, axis=1) + df.apply(lambda x: x.max() - x.min()) + df.apply(np.cumsum) + df.apply(np.exp) + +Depending on the return type of the function passed to ``apply``, the result +will either be of lower dimension or the same dimension. + +``apply`` combined with some cleverness can be used to answer many questions +about a data set. For example, suppose we wanted to extract the date where the +maximum value for each column occurred: + +.. 
ipython:: python + + tsdf = DataFrame(randn(1000, 3), columns=['A', 'B', 'C'], + index=date_range('1/1/2000', periods=1000)) + tsdf.apply(lambda x: x.idxmax()) + +You may also pass additional arguments and keyword arguments to the ``apply`` +method. For instance, consider the following function you would like to apply: + +.. code-block:: python + + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide + +You may then apply this function as follows: + +.. code-block:: python + + df.apply(subtract_and_divide, args=(5,), divide=3) + +Another useful feature is the ability to pass Series methods to carry out some +Series operation on each column or row: + +.. ipython:: python + :suppress: + + tsdf = DataFrame(randn(10, 3), columns=['A', 'B', 'C'], + index=date_range('1/1/2000', periods=10)) + tsdf.values[3:7] = np.nan + +.. ipython:: python + + tsdf + tsdf.apply(Series.interpolate) + +Finally, ``apply`` takes an argument ``raw`` which is False by default, which +converts each row or column into a Series before applying the function. When +set to True, the passed function will instead receive an ndarray object, which +has positive performance implications if you do not need the indexing +functionality. + +.. seealso:: + + The section on :ref:`GroupBy ` demonstrates related, flexible + functionality for grouping by some criterion, applying, and combining the + results into a Series, DataFrame, etc. + +Applying elementwise Python functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since not all functions can be vectorized (accept NumPy arrays and return +another array or value), the methods ``applymap`` on DataFrame and analogously +``map`` on Series accept any Python function taking a single value and +returning a single value. For example: + +.. ipython:: python + :suppress: + + df4 = df_orig.copy() + +.. ipython:: python + + df4 + f = lambda x: len(str(x)) + df4['one'].map(f) + df4.applymap(f) + +``Series.map`` has an additional feature which is that it can be used to easily +"link" or "map" values defined by a secondary series. This is closely related +to :ref:`merging/joining functionality `: + + +.. ipython:: python + + s = Series(['six', 'seven', 'six', 'seven', 'six'], + index=['a', 'b', 'c', 'd', 'e']) + t = Series({'six' : 6., 'seven' : 7.}) + s + s.map(t) + + +.. _basics.apply_panel: + +Applying with a Panel +~~~~~~~~~~~~~~~~~~~~~ + +Applying with a ``Panel`` will pass a ``Series`` to the applied function. If the applied +function returns a ``Series``, the result of the application will be a ``Panel``. If the applied function +reduces to a scalar, the result of the application will be a ``DataFrame``. + +.. note:: + + Prior to 0.13.1 ``apply`` on a ``Panel`` would only work on ``ufuncs`` (e.g. ``np.sum/np.max``). + +.. ipython:: python + + import pandas.util.testing as tm + panel = tm.makePanel(5) + panel + panel['ItemA'] + +A transformational apply. + +.. ipython:: python + + result = panel.apply(lambda x: x*2, axis='items') + result + result['ItemA'] + +A reduction operation. + +.. ipython:: python + + panel.apply(lambda x: x.dtype, axis='items') + +A similar reduction type operation + +.. ipython:: python + + panel.apply(lambda x: x.sum(), axis='major_axis') + +This last reduction is equivalent to + +.. ipython:: python + + panel.sum('major_axis') + +A transformation operation that returns a ``Panel``, but is computing +the z-score across the ``major_axis``. + +.. 
ipython:: python + + result = panel.apply( + lambda x: (x-x.mean())/x.std(), + axis='major_axis') + result + result['ItemA'] + +Apply can also accept multiple axes in the ``axis`` argument. This will pass a +``DataFrame`` of the cross-section to the applied function. + +.. ipython:: python + + f = lambda x: ((x.T-x.mean(1))/x.std(1)).T + + result = panel.apply(f, axis = ['items','major_axis']) + result + result.loc[:,:,'ItemA'] + +This is equivalent to the following + +.. ipython:: python + + result = Panel(dict([ (ax,f(panel.loc[:,:,ax])) + for ax in panel.minor_axis ])) + result + result.loc[:,:,'ItemA'] + +.. _basics.reindexing: + + +Reindexing and altering labels +------------------------------ + +``reindex`` is the fundamental data alignment method in pandas. It is used to +implement nearly all other features relying on label-alignment +functionality. To *reindex* means to conform the data to match a given set of +labels along a particular axis. This accomplishes several things: + + * Reorders the existing data to match a new set of labels + * Inserts missing value (NA) markers in label locations where no data for + that label existed + * If specified, **fill** data for missing labels using logic (highly relevant + to working with time series data) + +Here is a simple example: + +.. ipython:: python + + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s + s.reindex(['e', 'b', 'f', 'd']) + +Here, the ``f`` label was not contained in the Series and hence appears as +``NaN`` in the result. + +With a DataFrame, you can simultaneously reindex the index and columns: + +.. ipython:: python + + df + df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) + +For convenience, you may utilize the ``reindex_axis`` method, which takes the +labels and a keyword ``axis`` parameter. + +Note that the ``Index`` objects containing the actual axis labels can be +**shared** between objects. So if we have a Series and a DataFrame, the +following can be done: + +.. ipython:: python + + rs = s.reindex(df.index) + rs + rs.index is df.index + +This means that the reindexed Series's index is the same Python object as the +DataFrame's index. + + +.. seealso:: + + :ref:`Advanced indexing ` is an even more concise way of + doing reindexing. + +.. note:: + + When writing performance-sensitive code, there is a good reason to spend + some time becoming a reindexing ninja: **many operations are faster on + pre-aligned data**. Adding two unaligned DataFrames internally triggers a + reindexing step. For exploratory analysis you will hardly notice the + difference (because ``reindex`` has been heavily optimized), but when CPU + cycles matter sprinkling a few explicit ``reindex`` calls here and there can + have an impact. + +.. _basics.reindex_like: + +Reindexing to align with another object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may wish to take an object and reindex its axes to be labeled the same as +another object. While the syntax for this is straightforward albeit verbose, it +is a common enough operation that the ``reindex_like`` method is available to +make this simpler: + +.. ipython:: python + :suppress: + + df2 = df.reindex(['a', 'b', 'c'], columns=['one', 'two']) + df3 = df2 - df2.mean() + + +.. ipython:: python + + df2 + df3 + df.reindex_like(df2) + +Reindexing with ``reindex_axis`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
_basics.align: + +Aligning objects with each other with ``align`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``align`` method is the fastest way to simultaneously align two objects. It +supports a ``join`` argument (related to :ref:`joining and merging `): + + - ``join='outer'``: take the union of the indexes + - ``join='left'``: use the calling object's index + - ``join='right'``: use the passed object's index + - ``join='inner'``: intersect the indexes + +It returns a tuple with both of the reindexed Series: + +.. ipython:: python + + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s1 = s[:4] + s2 = s[1:] + s1.align(s2) + s1.align(s2, join='inner') + s1.align(s2, join='left') + +.. _basics.df_join: + +For DataFrames, the join method will be applied to both the index and the +columns by default: + +.. ipython:: python + + df.align(df2, join='inner') + +You can also pass an ``axis`` option to only align on the specified axis: + +.. ipython:: python + + df.align(df2, join='inner', axis=0) + +.. _basics.align.frame.series: + +If you pass a Series to ``DataFrame.align``, you can choose to align both +objects either on the DataFrame's index or columns using the ``axis`` argument: + +.. ipython:: python + + df.align(df2.ix[0], axis=1) + +.. _basics.reindex_fill: + +Filling while reindexing +~~~~~~~~~~~~~~~~~~~~~~~~ + +``reindex`` takes an optional parameter ``method`` which is a filling method +chosen from the following table: + +.. csv-table:: + :header: "Method", "Action" + :widths: 30, 50 + + pad / ffill, Fill values forward + bfill / backfill, Fill values backward + +Other fill methods could be added, of course, but these are the two most +commonly used for time series data. In a way they only make sense for time +series or otherwise ordered data, but you may have an application on non-time +series data where this sort of "interpolation" logic is the correct thing to +do. More sophisticated interpolation of missing values would be an obvious +extension. + +We illustrate these fill methods on a simple TimeSeries: + +.. ipython:: python + + rng = date_range('1/3/2000', periods=8) + ts = Series(randn(8), index=rng) + ts2 = ts[[0, 3, 6]] + ts + ts2 + + ts2.reindex(ts.index) + ts2.reindex(ts.index, method='ffill') + ts2.reindex(ts.index, method='bfill') + +Note these methods require that the indexes are **order increasing**. + +Note the same result could have been achieved using :ref:`fillna +`: + +.. ipython:: python + + ts2.reindex(ts.index).fillna(method='ffill') + +Note that ``reindex`` will raise a ValueError if the index is not +monotonic. ``fillna`` will not make any checks on the order of the index. + +.. _basics.drop: + +Dropping labels from an axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A method closely related to ``reindex`` is the ``drop`` function. It removes a +set of labels from an axis: + +.. ipython:: python + + df + df.drop(['a', 'd'], axis=0) + df.drop(['one'], axis=1) + +Note that the following also works, but is a bit less obvious / clean: + +.. ipython:: python + + df.reindex(df.index - ['a', 'd']) + +.. _basics.rename: + +Renaming / mapping labels +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``rename`` method allows you to relabel an axis based on some mapping (a +dict or Series) or an arbitrary function. + +.. ipython:: python + + s + s.rename(str.upper) + +If you pass a function, it must return a value when called with any of the +labels (and must produce a set of unique values). 
But if you pass a dict or +Series, it need only contain a subset of the labels as keys: + +.. ipython:: python + + df.rename(columns={'one' : 'foo', 'two' : 'bar'}, + index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) + +The ``rename`` method also provides an ``inplace`` named parameter that is by +default ``False`` and copies the underlying data. Pass ``inplace=True`` to +rename the data in place. + +.. _basics.rename_axis: + +The Panel class has a related ``rename_axis`` class which can rename any of +its three axes. + +Iteration +--------- + +Because Series is array-like, basic iteration produces the values. Other data +structures follow the dict-like convention of iterating over the "keys" of the +objects. In short: + + * **Series**: values + * **DataFrame**: column labels + * **Panel**: item labels + +Thus, for example: + +.. ipython:: + + In [0]: for col in df: + ...: print(col) + ...: + +iteritems +~~~~~~~~~ + +Consistent with the dict-like interface, **iteritems** iterates through +key-value pairs: + + * **Series**: (index, scalar value) pairs + * **DataFrame**: (column, Series) pairs + * **Panel**: (item, DataFrame) pairs + +For example: + +.. ipython:: + + In [0]: for item, frame in wp.iteritems(): + ...: print(item) + ...: print(frame) + ...: + + +.. _basics.iterrows: + +iterrows +~~~~~~~~ + +New in v0.7 is the ability to iterate efficiently through rows of a +DataFrame. It returns an iterator yielding each index value along with a Series +containing the data in each row: + +.. ipython:: + + In [0]: for row_index, row in df2.iterrows(): + ...: print('%s\n%s' % (row_index, row)) + ...: + +For instance, a contrived way to transpose the DataFrame would be: + +.. ipython:: python + + df2 = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + print(df2) + print(df2.T) + + df2_t = DataFrame(dict((idx,values) for idx, values in df2.iterrows())) + print(df2_t) + +.. note:: + + ``iterrows`` does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). For example, + + .. ipython:: python + + df_iter = DataFrame([[1, 1.0]], columns=['x', 'y']) + row = next(df_iter.iterrows())[1] + print(row['x'].dtype) + print(df_iter['x'].dtype) + +itertuples +~~~~~~~~~~ + +This method will return an iterator yielding a tuple for each row in the +DataFrame. The first element of the tuple will be the row's corresponding index +value, while the remaining values are the row values proper. + +For instance, + +.. ipython:: python + + for r in df2.itertuples(): + print(r) + +.. _basics.string_methods: + +Vectorized string methods +------------------------- + +Series is equipped (as of pandas 0.8.1) with a set of string processing methods +that make it easy to operate on each element of the array. Perhaps most +importantly, these methods exclude missing/NA values automatically. These are +accessed via the Series's ``str`` attribute and generally have names matching +the equivalent (scalar) build-in string methods: + +Splitting and Replacing Strings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s.str.lower() + s.str.upper() + s.str.len() + +Methods like ``split`` return a Series of lists: + +.. ipython:: python + + s2 = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + s2.str.split('_') + +Elements in the split lists can be accessed using ``get`` or ``[]`` notation: + +.. 
ipython:: python + + s2.str.split('_').str.get(1) + s2.str.split('_').str[1] + +Methods like ``replace`` and ``findall`` take regular expressions, too: + +.. ipython:: python + + s3 = Series(['A', 'B', 'C', 'Aaba', 'Baca', + '', np.nan, 'CABA', 'dog', 'cat']) + s3 + s3.str.replace('^.a|dog', 'XX-XX ', case=False) + +Extracting Substrings +~~~~~~~~~~~~~~~~~~~~~ + +The method ``extract`` (introduced in version 0.13) accepts regular expressions +with match groups. Extracting a regular expression with one group returns +a Series of strings. + +.. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)') + +Elements that do not match return ``NaN``. Extracting a regular expression +with more than one group returns a DataFrame with one column per group. + +.. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') + +Elements that do not match return a row filled with ``NaN``. +Thus, a Series of messy strings can be "converted" into a +like-indexed Series or DataFrame of cleaned-up or more useful strings, +without necessitating ``get()`` to access tuples or ``re.match`` objects. + +The results dtype always is object, even if no match is found and the result +only contains ``NaN``. + +Named groups like + +.. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\d)') + +and optional groups like + +.. ipython:: python + + Series(['a1', 'b2', '3']).str.extract('(?P[ab])?(?P\d)') + +can also be used. + +Testing for Strings that Match or Contain a Pattern +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can check whether elements contain a pattern: + +.. ipython:: python + + pattern = r'[a-z][0-9]' + Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern) + +or match a pattern: + + +.. ipython:: python + + Series(['1', '2', '3a', '3b', '03c']).str.match(pattern, as_indexer=True) + +The distinction between ``match`` and ``contains`` is strictness: ``match`` +relies on strict ``re.match``, while ``contains`` relies on ``re.search``. + +.. warning:: + + In previous versions, ``match`` was for *extracting* groups, + returning a not-so-convenient Series of tuples. The new method ``extract`` + (described in the previous section) is now preferred. + + This old, deprecated behavior of ``match`` is still the default. As + demonstrated above, use the new behavior by setting ``as_indexer=True``. + In this mode, ``match`` is analogous to ``contains``, returning a boolean + Series. The new behavior will become the default behavior in a future + release. + +Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take + an extra ``na`` argument so missing values can be considered True or False: + +.. ipython:: python + + s4 = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s4.str.contains('A', na=False) + +.. 
csv-table:: + :header: "Method", "Description" + :widths: 20, 80 + + ``cat``,Concatenate strings + ``split``,Split strings on delimiter + ``get``,Index into each element (retrieve i-th element) + ``join``,Join strings in each element of the Series with passed separator + ``contains``,Return boolean array if each string contains pattern/regex + ``replace``,Replace occurrences of pattern/regex with some other string + ``repeat``,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) + ``pad``,"Add whitespace to left, right, or both sides of strings" + ``center``,Equivalent to ``pad(side='both')`` + ``wrap``,Split long strings into lines with length less than a given width + ``slice``,Slice each string in the Series + ``slice_replace``,Replace slice in each string with passed value + ``count``,Count occurrences of pattern + ``startswith``,Equivalent to ``str.startswith(pat)`` for each element + ``endswith``,Equivalent to ``str.endswith(pat)`` for each element + ``findall``,Compute list of all occurrences of pattern/regex for each string + ``match``,"Call ``re.match`` on each element, returning matched groups as list" + ``extract``,"Call ``re.match`` on each element, as ``match`` does, but return matched groups as strings for convenience." + ``len``,Compute string lengths + ``strip``,Equivalent to ``str.strip`` + ``rstrip``,Equivalent to ``str.rstrip`` + ``lstrip``,Equivalent to ``str.lstrip`` + ``lower``,Equivalent to ``str.lower`` + ``upper``,Equivalent to ``str.upper`` + + +Getting indicator variables from seperated strings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can extract dummy variables from string columns. +For example if they are seperated by a ``'|'``: + + .. ipython:: python + + s = pd.Series(['a', 'a|b', np.nan, 'a|c']) + s.str.get_dummies(sep='|') + +See also :func:`~pandas.get_dummies`. + +.. _basics.sorting: + +Sorting by index and value +-------------------------- + +There are two obvious kinds of sorting that you may be interested in: sorting +by label and sorting by actual values. The primary method for sorting axis +labels (indexes) across data structures is the ``sort_index`` method. + +.. ipython:: python + + unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], + columns=['three', 'two', 'one']) + unsorted_df.sort_index() + unsorted_df.sort_index(ascending=False) + unsorted_df.sort_index(axis=1) + +``DataFrame.sort_index`` can accept an optional ``by`` argument for ``axis=0`` +which will use an arbitrary vector or a column name of the DataFrame to +determine the sort order: + +.. ipython:: python + + df1 = DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) + df1.sort_index(by='two') + +The ``by`` argument can take a list of column names, e.g.: + +.. ipython:: python + + df1[['one', 'two', 'three']].sort_index(by=['one','two']) + +Series has the method ``order`` (analogous to `R's order function +`__) which +sorts by value, with special treatment of NA values via the ``na_position`` +argument: + +.. ipython:: python + + s[2] = np.nan + s.order() + s.order(na_position='first') + +.. note:: + + ``Series.sort`` sorts a Series by value in-place. This is to provide + compatibility with NumPy methods which expect the ``ndarray.sort`` + behavior. ``Series.order`` returns a copy of the sorted data. + +.. _basics.nsorted: + +smallest / largest values +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +``Series`` has the ``nsmallest`` and ``nlargest`` methods which return the +smallest or largest :math:`n` values. 
For a large ``Series`` this can be much +faster than sorting the entire Series and calling ``head(n)`` on the result. + +.. ipython:: python + + s = Series(np.random.permutation(10)) + s + s.order() + s.nsmallest(3) + s.nlargest(3) + + +.. _basics.multi-index_sorting: + +Sorting by a multi-index column +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You must be explicit about sorting when the column is a multi-index, and fully specify +all levels to ``by``. + +.. ipython:: python + + df1.columns = MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) + df1.sort_index(by=('a','two')) + + +Copying +------- + +The ``copy`` method on pandas objects copies the underlying data (though not +the axis indexes, since they are immutable) and returns a new object. Note that +**it is seldom necessary to copy objects**. For example, there are only a +handful of ways to alter a DataFrame *in-place*: + + * Inserting, deleting, or modifying a column + * Assigning to the ``index`` or ``columns`` attributes + * For homogeneous data, directly modifying the values via the ``values`` + attribute or advanced indexing + +To be clear, no pandas methods have the side effect of modifying your data; +almost all methods return new objects, leaving the original object +untouched. If data is modified, it is because you did so explicitly. + +.. _basics.dtypes: + +dtypes +------ + +The main types stored in pandas objects are ``float``, ``int``, ``bool``, ``datetime64[ns]``, ``timedelta[ns]``, +and ``object``. In addition these dtypes have item sizes, e.g. ``int64`` and ``int32``. A convenient ``dtypes`` +attribute for DataFrames returns a Series with the data type of each column. + +.. ipython:: python + + dft = DataFrame(dict( A = np.random.rand(3), + B = 1, + C = 'foo', + D = Timestamp('20010102'), + E = Series([1.0]*3).astype('float32'), + F = False, + G = Series([1]*3,dtype='int8'))) + dft + dft.dtypes + +On a ``Series`` use the ``dtype`` method. + +.. ipython:: python + + dft['A'].dtype + +If a pandas object contains data multiple dtypes *IN A SINGLE COLUMN*, the dtype of the +column will be chosen to accommodate all of the data types (``object`` is the most +general). + +.. ipython:: python + + # these ints are coerced to floats + Series([1, 2, 3, 4, 5, 6.]) + + # string data forces an ``object`` dtype + Series([1, 2, 3, 6., 'foo']) + +The method ``get_dtype_counts`` will return the number of columns of +each type in a ``DataFrame``: + +.. ipython:: python + + dft.get_dtype_counts() + +Numeric dtypes will propagate and can coexist in DataFrames (starting in v0.11.0). +If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, +or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, +different numeric dtypes will **NOT** be combined. The following example will give you a taste. + +.. ipython:: python + + df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32') + df1 + df1.dtypes + df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), + B = Series(randn(8)), + C = Series(np.array(randn(8),dtype='uint8')) )) + df2 + df2.dtypes + +defaults +~~~~~~~~ + +By default integer types are ``int64`` and float types are ``float64``, +*REGARDLESS* of platform (32-bit or 64-bit). The following will all result in ``int64`` dtypes. + +.. ipython:: python + + DataFrame([1, 2], columns=['a']).dtypes + DataFrame({'a': [1, 2]}).dtypes + DataFrame({'a': 1 }, index=list(range(2))).dtypes + +Numpy, however will choose *platform-dependent* types when creating arrays. 
+The following **WILL** result in ``int32`` on 32-bit platform. + +.. ipython:: python + + frame = DataFrame(np.array([1, 2])) + + +upcasting +~~~~~~~~~ + +Types can potentially be *upcasted* when combined with other types, meaning they are promoted +from the current type (say ``int`` to ``float``) + +.. ipython:: python + + df3 = df1.reindex_like(df2).fillna(value=0.0) + df2 + df3 + df3.dtypes + +The ``values`` attribute on a DataFrame return the *lower-common-denominator* of the dtypes, meaning +the dtype that can accommodate **ALL** of the types in the resulting homogenous dtyped numpy array. This can +force some *upcasting*. + +.. ipython:: python + + df3.values.dtype + +astype +~~~~~~ + +.. _basics.cast: + +You can use the ``astype`` method to explicitly convert dtypes from one to another. These will by default return a copy, +even if the dtype was unchanged (pass ``copy=False`` to change this behavior). In addition, they will raise an +exception if the astype operation is invalid. + +Upcasting is always according to the **numpy** rules. If two different dtypes are involved in an operation, +then the more *general* one will be used as the result of the operation. + +.. ipython:: python + + df3 + df3.dtypes + + # conversion of dtypes + df3.astype('float32').dtypes + +object conversion +~~~~~~~~~~~~~~~~~ + +``convert_objects`` is a method to try to force conversion of types from the ``object`` dtype to other types. +To force conversion of specific types that are *number like*, e.g. could be a string that represents a number, +pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise +they will be set to ``np.nan``. + +.. ipython:: python + + df3['D'] = '1.' + df3['E'] = '1' + df3.convert_objects(convert_numeric=True).dtypes + + # same, but specific dtype conversion + df3['D'] = df3['D'].astype('float16') + df3['E'] = df3['E'].astype('int32') + df3.dtypes + +To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``. +This will convert any datetime-like object to dates, forcing other values to ``NaT``. +This might be useful if you are reading in data which is mostly dates, +but occasionally has non-dates intermixed and you want to represent as missing. + +.. ipython:: python + + s = Series([datetime(2001,1,1,0,0), + 'foo', 1.0, 1, Timestamp('20010104'), + '20010105'],dtype='O') + s + s.convert_objects(convert_dates='coerce') + +In addition, ``convert_objects`` will attempt the *soft* conversion of any *object* dtypes, meaning that if all +the objects in a Series are of the same type, the Series will have that dtype. + +gotchas +~~~~~~~ + +Performing selection operations on ``integer`` type data can easily upcast the data to ``floating``. +The dtype of the input data will be preserved in cases where ``nans`` are not introduced (starting in 0.11.0) +See also :ref:`integer na gotchas ` + +.. ipython:: python + + dfi = df3.astype('int32') + dfi['E'] = 1 + dfi + dfi.dtypes + + casted = dfi[dfi>0] + casted + casted.dtypes + +While float dtypes are unchanged. + +.. ipython:: python + + dfa = df3.copy() + dfa['A'] = dfa['A'].astype('float32') + dfa.dtypes + + casted = dfa[df2>0] + casted + casted.dtypes + +Selecting columns based on ``dtype`` +------------------------------------ + +.. _basics.selectdtypes: + +.. versionadded:: 0.14.1 + +The :meth:`~pandas.DataFrame.select_dtypes` method implements subsetting of columns +based on their ``dtype``. 
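+
+Conceptually this is just a filter over ``DataFrame.dtypes``; a rough,
+illustrative sketch of the idea (not the actual implementation, which also
+understands generic dtype names such as ``'number'`` and validates its
+arguments) could be written as:
+
+.. code-block:: python
+
+   # illustrative sketch only, assuming exact numpy dtypes are passed
+   def select_like(frame, include):
+       cols = [c for c, dt in frame.dtypes.iteritems() if dt in include]
+       return frame[cols]
+
+   # e.g. select_like(df, include=[np.dtype('float64'), np.dtype('bool')])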
+ +First, let's create a :class:`~pandas.DataFrame` with a slew of different +dtypes: + +.. ipython:: python + + df = DataFrame({'string': list('abc'), + 'int64': list(range(1, 4)), + 'uint8': np.arange(3, 6).astype('u1'), + 'float64': np.arange(4.0, 7.0), + 'bool1': [True, False, True], + 'bool2': [False, True, False], + 'dates': pd.date_range('now', periods=3).values}) + df['tdeltas'] = df.dates.diff() + df['uint64'] = np.arange(3, 6).astype('u8') + df['other_dates'] = pd.date_range('20130101', periods=3).values + df + + +``select_dtypes`` has two parameters ``include`` and ``exclude`` that allow you to +say "give me the columns WITH these dtypes" (``include``) and/or "give the +columns WITHOUT these dtypes" (``exclude``). + +For example, to select ``bool`` columns + +.. ipython:: python + + df.select_dtypes(include=[bool]) + +You can also pass the name of a dtype in the `numpy dtype hierarchy +`__: + +.. ipython:: python + + df.select_dtypes(include=['bool']) + +:meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes as well. + +For example, to select all numeric and boolean columns while excluding unsigned +integers + +.. ipython:: python + + df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger']) + +To select string columns you must use the ``object`` dtype: + +.. ipython:: python + + df.select_dtypes(include=['object']) + +To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you +can define a function that returns a tree of child dtypes: + +.. ipython:: python + + def subdtypes(dtype): + subs = dtype.__subclasses__() + if not subs: + return dtype + return [dtype, [subdtypes(dt) for dt in subs]] + +All numpy dtypes are subclasses of ``numpy.generic``: + +.. ipython:: python + + subdtypes(np.generic) + +.. note:: + + The ``include`` and ``exclude`` parameters must be non-string sequences. diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst new file mode 100644 index 00000000..84bba77e --- /dev/null +++ b/doc/source/comparison_with_r.rst @@ -0,0 +1,479 @@ +.. currentmodule:: pandas +.. _compare_with_r: + +.. ipython:: python + :suppress: + + import pandas as pd + import numpy as np + options.display.max_rows=15 + +Comparison with R / R libraries +******************************* + +Since ``pandas`` aims to provide a lot of the data manipulation and analysis +functionality that people use `R `__ for, this page +was started to provide a more detailed look at the `R language +`__ and its many third +party libraries as they relate to ``pandas``. In comparisons with R and CRAN +libraries, we care about the following things: + + - **Functionality / flexibility**: what can/cannot be done with each tool + - **Performance**: how fast are operations. Hard numbers/benchmarks are + preferable + - **Ease-of-use**: Is one tool easier/harder to use (you may have to be + the judge of this, given side-by-side code comparisons) + +This page is also here to offer a bit of a translation guide for users of these +R packages. + +Base R +------ + +Slicing with R's |c|_ +~~~~~~~~~~~~~~~~~~~~~ + +R makes it easy to access ``data.frame`` columns by name + +.. code-block:: r + + df <- data.frame(a=rnorm(5), b=rnorm(5), c=rnorm(5), d=rnorm(5), e=rnorm(5)) + df[, c("a", "c", "e")] + +or by integer location + +.. code-block:: r + + df <- data.frame(matrix(rnorm(1000), ncol=100)) + df[, c(1:10, 25:30, 40, 50:100)] + +Selecting multiple columns by name in ``pandas`` is straightforward + +.. 
ipython:: python + + df = DataFrame(np.random.randn(10, 3), columns=list('abc')) + df[['a', 'c']] + df.loc[:, ['a', 'c']] + +Selecting multiple noncontiguous columns by integer location can be achieved +with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. + +.. ipython:: python + + named = list('abcdefg') + n = 30 + columns = named + np.arange(len(named), n).tolist() + df = DataFrame(np.random.randn(n, n), columns=columns) + + df.iloc[:, np.r_[:10, 24:30]] + +|aggregate|_ +~~~~~~~~~~~~ + +In R you may want to split data into subsets and compute the mean for each. +Using a data.frame called ``df`` and splitting it into groups ``by1`` and +``by2``: + +.. code-block:: r + + df <- data.frame( + v1 = c(1,3,5,7,8,3,5,NA,4,5,7,9), + v2 = c(11,33,55,77,88,33,55,NA,44,55,77,99), + by1 = c("red", "blue", 1, 2, NA, "big", 1, 2, "red", 1, NA, 12), + by2 = c("wet", "dry", 99, 95, NA, "damp", 95, 99, "red", 99, NA, NA)) + aggregate(x=df[, c("v1", "v2")], by=list(df$by1, df$by2), FUN = mean) + +The :meth:`~pandas.DataFrame.groupby` method is similar to the base R ``aggregate`` +function. + +.. ipython:: python + + from pandas import DataFrame + df = DataFrame({ + 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9], + 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99], + 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, + np.nan] + }) + + g = df.groupby(['by1','by2']) + g[['v1','v2']].mean() + +For more details and examples see :ref:`the groupby documentation +`. + +|match|_ +~~~~~~~~~~~~ + +A common way to select data in R is using ``%in%`` which is defined using the +function ``match``. The operator ``%in%`` is used to return a logical vector +indicating if there is a match or not: + +.. code-block:: r + + s <- 0:4 + s %in% c(2,4) + +The :meth:`~pandas.DataFrame.isin` method is similar to the R ``%in%`` operator: + +.. ipython:: python + + s = pd.Series(np.arange(5),dtype=np.float32) + s.isin([2, 4]) + +The ``match`` function returns a vector of the positions of matches +of its first argument in its second: + +.. code-block:: r + + s <- 0:4 + match(s, c(2,4)) + +The :func:`~pandas.match` function can be used to replicate +this: + +.. ipython:: python + + s = pd.Series(np.arange(5),dtype=np.float32) + pd.Series(pd.match(s,[2,4],np.nan)) + +For more details and examples see :ref:`the reshaping documentation +`. + +|tapply|_ +~~~~~~~~~ + +``tapply`` is similar to ``aggregate``, but data can be in a ragged array, +since the subclass sizes are possibly irregular. Using a data.frame called +``baseball``, and retrieving information based on the array ``team``: + +.. code-block:: r + + baseball <- + data.frame(team = gl(5, 5, + labels = paste("Team", LETTERS[1:5])), + player = sample(letters, 25), + batting.average = runif(25, .200, .400)) + + tapply(baseball$batting.average, baseball$team, + max) + +In ``pandas`` we may use the :meth:`~pandas.pivot_table` method to handle this: + +.. ipython:: python + + import random + import string + + baseball = DataFrame({ + 'team': ["team %d" % (x+1) for x in range(5)]*5, + 'player': random.sample(list(string.ascii_lowercase),25), + 'batting avg': np.random.uniform(.200, .400, 25) + }) + baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max) + +For more details and examples see :ref:`the reshaping documentation +`. + +|subset|_ +~~~~~~~~~~ + +.. 
versionadded:: 0.13 + +The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset`` +function. In R you might want to get the rows of a ``data.frame`` where one +column's values are less than another column's values: + +.. code-block:: r + + df <- data.frame(a=rnorm(10), b=rnorm(10)) + subset(df, a <= b) + df[df$a <= df$b,] # note the comma + +In ``pandas``, there are a few ways to perform subsetting. You can use +:meth:`~pandas.DataFrame.query` or pass an expression as if it were an +index/slice as well as standard boolean indexing: + +.. ipython:: python + + df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) + df.query('a <= b') + df[df.a <= df.b] + df.loc[df.a <= df.b] + +For more details and examples see :ref:`the query documentation +`. + + +|with|_ +~~~~~~~~ + +.. versionadded:: 0.13 + +An expression using a data.frame called ``df`` in R with the columns ``a`` and +``b`` would be evaluated using ``with`` like so: + +.. code-block:: r + + df <- data.frame(a=rnorm(10), b=rnorm(10)) + with(df, a + b) + df$a + df$b # same as the previous expression + +In ``pandas`` the equivalent expression, using the +:meth:`~pandas.DataFrame.eval` method, would be: + +.. ipython:: python + + df = DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) + df.eval('a + b') + df.a + df.b # same as the previous expression + +In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than +evaluation in pure Python. For more details and examples see :ref:`the eval +documentation `. + +zoo +--- + +xts +--- + +plyr +---- + +``plyr`` is an R library for the split-apply-combine strategy for data +analysis. The functions revolve around three data structures in R, ``a`` +for ``arrays``, ``l`` for ``lists``, and ``d`` for ``data.frame``. The +table below shows how these data structures could be mapped in Python. + ++------------+-------------------------------+ +| R | Python | ++============+===============================+ +| array | list | ++------------+-------------------------------+ +| lists | dictionary or list of objects | ++------------+-------------------------------+ +| data.frame | dataframe | ++------------+-------------------------------+ + +|ddply|_ +~~~~~~~~ + +An expression using a data.frame called ``df`` in R where you want to +summarize ``x`` by ``month``: + +.. code-block:: r + + require(plyr) + df <- data.frame( + x = runif(120, 1, 168), + y = runif(120, 7, 334), + z = runif(120, 1.7, 20.7), + month = rep(c(5,6,7,8),30), + week = sample(1:4, 120, TRUE) + ) + + ddply(df, .(month, week), summarize, + mean = round(mean(x), 2), + sd = round(sd(x), 2)) + +In ``pandas`` the equivalent expression, using the +:meth:`~pandas.DataFrame.groupby` method, would be: + +.. ipython:: python + + df = DataFrame({ + 'x': np.random.uniform(1., 168., 120), + 'y': np.random.uniform(7., 334., 120), + 'z': np.random.uniform(1.7, 20.7, 120), + 'month': [5,6,7,8]*30, + 'week': np.random.randint(1,4, 120) + }) + + grouped = df.groupby(['month','week']) + print grouped['x'].agg([np.mean, np.std]) + + +For more details and examples see :ref:`the groupby documentation +`. + +reshape / reshape2 +------------------ + +|meltarray|_ +~~~~~~~~~~~~~ + +An expression using a 3 dimensional array called ``a`` in R where you want to +melt it into a data.frame: + +.. code-block:: r + + a <- array(c(1:23, NA), c(2,3,4)) + data.frame(melt(a)) + +In Python, since ``a`` is a list, you can simply use list comprehension. + +.. 
ipython:: python + + a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4) + DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) + +|meltlist|_ +~~~~~~~~~~~~ + +An expression using a list called ``a`` in R where you want to melt it +into a data.frame: + +.. code-block:: r + + a <- as.list(c(1:4, NA)) + data.frame(melt(a)) + +In Python, this list would be a list of tuples, so +:meth:`~pandas.DataFrame` method would convert it to a dataframe as required. + +.. ipython:: python + + a = list(enumerate(list(range(1,5))+[np.NAN])) + DataFrame(a) + +For more details and examples see :ref:`the Into to Data Structures +documentation `. + +|meltdf|_ +~~~~~~~~~~~~~~~~ + +An expression using a data.frame called ``cheese`` in R where you want to +reshape the data.frame: + +.. code-block:: r + + cheese <- data.frame( + first = c('John', 'Mary'), + last = c('Doe', 'Bo'), + height = c(5.5, 6.0), + weight = c(130, 150) + ) + melt(cheese, id=c("first", "last")) + +In Python, the :meth:`~pandas.melt` method is the R equivalent: + +.. ipython:: python + + cheese = DataFrame({'first' : ['John', 'Mary'], + 'last' : ['Doe', 'Bo'], + 'height' : [5.5, 6.0], + 'weight' : [130, 150]}) + pd.melt(cheese, id_vars=['first', 'last']) + cheese.set_index(['first', 'last']).stack() # alternative way + +For more details and examples see :ref:`the reshaping documentation +`. + +|cast|_ +~~~~~~~ + +In R ``acast`` is an expression using a data.frame called ``df`` in R to cast +into a higher dimensional array: + +.. code-block:: r + + df <- data.frame( + x = runif(12, 1, 168), + y = runif(12, 7, 334), + z = runif(12, 1.7, 20.7), + month = rep(c(5,6,7),4), + week = rep(c(1,2), 6) + ) + + mdf <- melt(df, id=c("month", "week")) + acast(mdf, week ~ month ~ variable, mean) + +In Python the best way is to make use of :meth:`~pandas.pivot_table`: + +.. ipython:: python + + df = DataFrame({ + 'x': np.random.uniform(1., 168., 12), + 'y': np.random.uniform(7., 334., 12), + 'z': np.random.uniform(1.7, 20.7, 12), + 'month': [5,6,7]*4, + 'week': [1,2]*6 + }) + mdf = pd.melt(df, id_vars=['month', 'week']) + pd.pivot_table(mdf, values='value', index=['variable','week'], + columns=['month'], aggfunc=np.mean) + +Similarly for ``dcast`` which uses a data.frame called ``df`` in R to +aggregate information based on ``Animal`` and ``FeedType``: + +.. code-block:: r + + df <- data.frame( + Animal = c('Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1', + 'Animal2', 'Animal3'), + FeedType = c('A', 'B', 'A', 'A', 'B', 'B', 'A'), + Amount = c(10, 7, 4, 2, 5, 6, 2) + ) + + dcast(df, Animal ~ FeedType, sum, fill=NaN) + # Alternative method using base R + with(df, tapply(Amount, list(Animal, FeedType), sum)) + +Python can approach this in two different ways. Firstly, similar to above +using :meth:`~pandas.pivot_table`: + +.. ipython:: python + + df = DataFrame({ + 'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1', + 'Animal2', 'Animal3'], + 'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'], + 'Amount': [10, 7, 4, 2, 5, 6, 2], + }) + + df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum') + +The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: + +.. ipython:: python + + df.groupby(['Animal','FeedType'])['Amount'].sum() + +For more details and examples see :ref:`the reshaping documentation +` or :ref:`the groupby documentation`. + +.. |c| replace:: ``c`` +.. _c: http://stat.ethz.ch/R-manual/R-patched/library/base/html/c.html + +.. |aggregate| replace:: ``aggregate`` +.. 
_aggregate: http://finzi.psych.upenn.edu/R/library/stats/html/aggregate.html + +.. |match| replace:: ``match`` / ``%in%`` +.. _match: http://finzi.psych.upenn.edu/R/library/base/html/match.html + +.. |tapply| replace:: ``tapply`` +.. _tapply: http://finzi.psych.upenn.edu/R/library/base/html/tapply.html + +.. |with| replace:: ``with`` +.. _with: http://finzi.psych.upenn.edu/R/library/base/html/with.html + +.. |subset| replace:: ``subset`` +.. _subset: http://finzi.psych.upenn.edu/R/library/base/html/subset.html + +.. |ddply| replace:: ``ddply`` +.. _ddply: http://www.inside-r.org/packages/cran/plyr/docs/ddply + +.. |meltarray| replace:: ``melt.array`` +.. _meltarray: http://www.inside-r.org/packages/cran/reshape2/docs/melt.array + +.. |meltlist| replace:: ``melt.list`` +.. meltlist: http://www.inside-r.org/packages/cran/reshape2/docs/melt.list + +.. |meltdf| replace:: ``melt.data.frame`` +.. meltdf: http://www.inside-r.org/packages/cran/reshape2/docs/melt.data.frame + +.. |cast| replace:: ``cast`` +.. cast: http://www.inside-r.org/packages/cran/reshape2/docs/cast + diff --git a/doc/source/comparison_with_sql.rst b/doc/source/comparison_with_sql.rst new file mode 100644 index 00000000..371875d9 --- /dev/null +++ b/doc/source/comparison_with_sql.rst @@ -0,0 +1,380 @@ +.. currentmodule:: pandas +.. _compare_with_sql: + +Comparison with SQL +******************** +Since many potential pandas users have some familiarity with +`SQL `_, this page is meant to provide some examples of how +various SQL operations would be performed using pandas. + +If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>` +to familiarize yourself with the library. + +As is customary, we import pandas and numpy as follows: + +.. ipython:: python + + import pandas as pd + import numpy as np + +Most of the examples will utilize the ``tips`` dataset found within pandas tests. We'll read +the data into a DataFrame called `tips` and assume we have a database table of the same name and +structure. + +.. ipython:: python + + url = 'https://raw.github.com/pydata/pandas/master/pandas/tests/data/tips.csv' + tips = pd.read_csv(url) + tips.head() + +SELECT +------ +In SQL, selection is done using a comma-separated list of columns you'd like to select (or a ``*`` +to select all columns): + +.. code-block:: sql + + SELECT total_bill, tip, smoker, time + FROM tips + LIMIT 5; + +With pandas, column selection is done by passing a list of column names to your DataFrame: + +.. ipython:: python + + tips[['total_bill', 'tip', 'smoker', 'time']].head(5) + +Calling the DataFrame without the list of column names would display all columns (akin to SQL's +``*``). + +WHERE +----- +Filtering in SQL is done via a WHERE clause. + +.. code-block:: sql + + SELECT * + FROM tips + WHERE time = 'Dinner' + LIMIT 5; + +DataFrames can be filtered in multiple ways; the most intuitive of which is using +`boolean indexing `_. + +.. ipython:: python + + tips[tips['time'] == 'Dinner'].head(5) + +The above statement is simply passing a ``Series`` of True/False objects to the DataFrame, +returning all rows with True. + +.. ipython:: python + + is_dinner = tips['time'] == 'Dinner' + is_dinner.value_counts() + tips[is_dinner].head(5) + +Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and & +(AND). + +.. code-block:: sql + + -- tips of more than $5.00 at Dinner meals + SELECT * + FROM tips + WHERE time = 'Dinner' AND tip > 5.00; + +.. 
ipython:: python + + # tips of more than $5.00 at Dinner meals + tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)] + +.. code-block:: sql + + -- tips by parties of at least 5 diners OR bill total was more than $45 + SELECT * + FROM tips + WHERE size >= 5 OR total_bill > 45; + +.. ipython:: python + + # tips by parties of at least 5 diners OR bill total was more than $45 + tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)] + +NULL checking is done using the :meth:`~pandas.Series.notnull` and :meth:`~pandas.Series.isnull` +methods. + +.. ipython:: python + + frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'], + 'col2': ['F', np.NaN, 'G', 'H', 'I']}) + frame + +Assume we have a table of the same structure as our DataFrame above. We can see only the records +where ``col2`` IS NULL with the following query: + +.. code-block:: sql + + SELECT * + FROM frame + WHERE col2 IS NULL; + +.. ipython:: python + + frame[frame['col2'].isnull()] + +Getting items where ``col1`` IS NOT NULL can be done with :meth:`~pandas.Series.notnull`. + +.. code-block:: sql + + SELECT * + FROM frame + WHERE col1 IS NOT NULL; + +.. ipython:: python + + frame[frame['col1'].notnull()] + + +GROUP BY +-------- +In pandas, SQL's GROUP BY operations are performed using the similarly named +:meth:`~pandas.DataFrame.groupby` method. :meth:`~pandas.DataFrame.groupby` typically refers to a +process where we'd like to split a dataset into groups, apply some function (typically aggregation), +and then combine the groups together. + +A common SQL operation would be getting the count of records in each group throughout a dataset. +For instance, a query getting us the number of tips left by sex: + +.. code-block:: sql + + SELECT sex, count(*) + FROM tips + GROUP BY sex; + /* + Female 87 + Male 157 + */ + + +The pandas equivalent would be: + +.. ipython:: python + + tips.groupby('sex').size() + +Notice that in the pandas code we used :meth:`~pandas.DataFrameGroupBy.size` and not +:meth:`~pandas.DataFrameGroupBy.count`. This is because :meth:`~pandas.DataFrameGroupBy.count` +applies the function to each column, returning the number of ``not null`` records within each. + +.. ipython:: python + + tips.groupby('sex').count() + +Alternatively, we could have applied the :meth:`~pandas.DataFrameGroupBy.count` method to an +individual column: + +.. ipython:: python + + tips.groupby('sex')['total_bill'].count() + +Multiple functions can also be applied at once. For instance, say we'd like to see how tip amount +differs by day of the week - :meth:`~pandas.DataFrameGroupBy.agg` allows you to pass a dictionary +to your grouped DataFrame, indicating which functions to apply to specific columns. + +.. code-block:: sql + + SELECT day, AVG(tip), COUNT(*) + FROM tips + GROUP BY day; + /* + Fri 2.734737 19 + Sat 2.993103 87 + Sun 3.255132 76 + Thur 2.771452 62 + */ + +.. ipython:: python + + tips.groupby('day').agg({'tip': np.mean, 'day': np.size}) + +Grouping by more than one column is done by passing a list of columns to the +:meth:`~pandas.DataFrame.groupby` method. + +.. code-block:: sql + + SELECT smoker, day, COUNT(*), AVG(tip) + FROM tips + GROUP BY smoker, day; + /* + smoker day + No Fri 4 2.812500 + Sat 45 3.102889 + Sun 57 3.167895 + Thur 45 2.673778 + Yes Fri 15 2.714000 + Sat 42 2.875476 + Sun 19 3.516842 + Thur 17 3.030000 + */ + +.. ipython:: python + + tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) + +.. 
_compare_with_sql.join: + +JOIN +---- +JOINs can be performed with :meth:`~pandas.DataFrame.join` or :meth:`~pandas.merge`. By default, +:meth:`~pandas.DataFrame.join` will join the DataFrames on their indices. Each method has +parameters allowing you to specify the type of join to perform (LEFT, RIGHT, INNER, FULL) or the +columns to join on (column names or indices). + +.. ipython:: python + + df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], + 'value': np.random.randn(4)}) + df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], + 'value': np.random.randn(4)}) + +Assume we have two database tables of the same name and structure as our DataFrames. + +Now let's go over the various types of JOINs. + +INNER JOIN +~~~~~~~~~~ +.. code-block:: sql + + SELECT * + FROM df1 + INNER JOIN df2 + ON df1.key = df2.key; + +.. ipython:: python + + # merge performs an INNER JOIN by default + pd.merge(df1, df2, on='key') + +:meth:`~pandas.merge` also offers parameters for cases when you'd like to join one DataFrame's +column with another DataFrame's index. + +.. ipython:: python + + indexed_df2 = df2.set_index('key') + pd.merge(df1, indexed_df2, left_on='key', right_index=True) + +LEFT OUTER JOIN +~~~~~~~~~~~~~~~ +.. code-block:: sql + + -- show all records from df1 + SELECT * + FROM df1 + LEFT OUTER JOIN df2 + ON df1.key = df2.key; + +.. ipython:: python + + # show all records from df1 + pd.merge(df1, df2, on='key', how='left') + +RIGHT JOIN +~~~~~~~~~~ +.. code-block:: sql + + -- show all records from df2 + SELECT * + FROM df1 + RIGHT OUTER JOIN df2 + ON df1.key = df2.key; + +.. ipython:: python + + # show all records from df2 + pd.merge(df1, df2, on='key', how='right') + +FULL JOIN +~~~~~~~~~ +pandas also allows for FULL JOINs, which display both sides of the dataset, whether or not the +joined columns find a match. As of writing, FULL JOINs are not supported in all RDBMS (MySQL). + +.. code-block:: sql + + -- show all records from both tables + SELECT * + FROM df1 + FULL OUTER JOIN df2 + ON df1.key = df2.key; + +.. ipython:: python + + # show all records from both frames + pd.merge(df1, df2, on='key', how='outer') + + +UNION +----- +UNION ALL can be performed using :meth:`~pandas.concat`. + +.. ipython:: python + + df1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'], + 'rank': range(1, 4)}) + df2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'], + 'rank': [1, 4, 5]}) + +.. code-block:: sql + + SELECT city, rank + FROM df1 + UNION ALL + SELECT city, rank + FROM df2; + /* + city rank + Chicago 1 + San Francisco 2 + New York City 3 + Chicago 1 + Boston 4 + Los Angeles 5 + */ + +.. ipython:: python + + pd.concat([df1, df2]) + +SQL's UNION is similar to UNION ALL, however UNION will remove duplicate rows. + +.. code-block:: sql + + SELECT city, rank + FROM df1 + UNION + SELECT city, rank + FROM df2; + -- notice that there is only one Chicago record this time + /* + city rank + Chicago 1 + San Francisco 2 + New York City 3 + Boston 4 + Los Angeles 5 + */ + +In pandas, you can use :meth:`~pandas.concat` in conjunction with +:meth:`~pandas.DataFrame.drop_duplicates`. + +.. ipython:: python + + pd.concat([df1, df2]).drop_duplicates() + + +UPDATE +------ + + +DELETE +------ diff --git a/doc/source/computation.rst b/doc/source/computation.rst new file mode 100644 index 00000000..d5dcacf5 --- /dev/null +++ b/doc/source/computation.rst @@ -0,0 +1,555 @@ +.. currentmodule:: pandas +.. _computation: + +.. 
ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + options.display.mpl_style='default' + options.display.max_rows=15 + +Computational tools +=================== + +Statistical functions +--------------------- + +.. _computation.pct_change: + +Percent Change +~~~~~~~~~~~~~~ + +``Series``, ``DataFrame``, and ``Panel`` all have a method ``pct_change`` to compute the +percent change over a given number of periods (using ``fill_method`` to fill +NA/null values *before* computing the percent change). + +.. ipython:: python + + ser = Series(randn(8)) + + ser.pct_change() + +.. ipython:: python + + df = DataFrame(randn(10, 4)) + + df.pct_change(periods=3) + +.. _computation.covariance: + +Covariance +~~~~~~~~~~ + +The ``Series`` object has a method ``cov`` to compute covariance between series +(excluding NA/null values). + +.. ipython:: python + + s1 = Series(randn(1000)) + s2 = Series(randn(1000)) + s1.cov(s2) + +Analogously, ``DataFrame`` has a method ``cov`` to compute pairwise covariances +among the series in the DataFrame, also excluding NA/null values. + +.. _computation.covariance.caveats: + +.. note:: + + Assuming the missing data are missing at random this results in an estimate + for the covariance matrix which is unbiased. However, for many applications + this estimate may not be acceptable because the estimated covariance matrix + is not guaranteed to be positive semi-definite. This could lead to + estimated correlations having absolute values which are greater than one, + and/or a non-invertible covariance matrix. See `Estimation of covariance + matrices `_ + for more details. + +.. ipython:: python + + frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame.cov() + +``DataFrame.cov`` also supports an optional ``min_periods`` keyword that +specifies the required minimum number of observations for each column pair +in order to have a valid result. + +.. ipython:: python + + frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c']) + frame.ix[:5, 'a'] = np.nan + frame.ix[5:10, 'b'] = np.nan + + frame.cov() + + frame.cov(min_periods=12) + + +.. _computation.correlation: + +Correlation +~~~~~~~~~~~ + +Several methods for computing correlations are provided: + +.. csv-table:: + :header: "Method name", "Description" + :widths: 20, 80 + + ``pearson (default)``, Standard correlation coefficient + ``kendall``, Kendall Tau correlation coefficient + ``spearman``, Spearman rank correlation coefficient + +.. \rho = \cov(x, y) / \sigma_x \sigma_y + +All of these are currently computed using pairwise complete observations. + +.. note:: + + Please see the :ref:`caveats ` associated + with this method of calculating correlation matrices in the + :ref:`covariance section `. + +.. ipython:: python + + frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame.ix[::2] = np.nan + + # Series with Series + frame['a'].corr(frame['b']) + frame['a'].corr(frame['b'], method='spearman') + + # Pairwise correlation of DataFrame columns + frame.corr() + +Note that non-numeric columns will be automatically excluded from the +correlation calculation. + +Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: + +.. 
ipython:: python + + frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c']) + frame.ix[:5, 'a'] = np.nan + frame.ix[5:10, 'b'] = np.nan + + frame.corr() + + frame.corr(min_periods=12) + + +A related method ``corrwith`` is implemented on DataFrame to compute the +correlation between like-labeled Series contained in different DataFrame +objects. + +.. ipython:: python + + index = ['a', 'b', 'c', 'd', 'e'] + columns = ['one', 'two', 'three', 'four'] + df1 = DataFrame(randn(5, 4), index=index, columns=columns) + df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns) + df1.corrwith(df2) + df2.corrwith(df1, axis=1) + +.. _computation.ranking: + +Data ranking +~~~~~~~~~~~~ + +The ``rank`` method produces a data ranking with ties being assigned the mean +of the ranks (by default) for the group: + +.. ipython:: python + + s = Series(np.random.randn(5), index=list('abcde')) + s['d'] = s['b'] # so there's a tie + s.rank() + +``rank`` is also a DataFrame method and can rank either the rows (``axis=0``) +or the columns (``axis=1``). ``NaN`` values are excluded from the ranking. + +.. ipython:: python + + df = DataFrame(np.random.randn(10, 6)) + df[4] = df[2][:5] # some ties + df + df.rank(1) + +``rank`` optionally takes a parameter ``ascending`` which by default is true; +when false, data is reverse-ranked, with larger values assigned a smaller rank. + +``rank`` supports different tie-breaking methods, specified with the ``method`` +parameter: + + - ``average`` : average rank of tied group + - ``min`` : lowest rank in the group + - ``max`` : highest rank in the group + - ``first`` : ranks assigned in the order they appear in the array + + +.. currentmodule:: pandas + +.. currentmodule:: pandas.stats.api + +.. _stats.moments: + +Moving (rolling) statistics / moments +------------------------------------- + +For working with time series data, a number of functions are provided for +computing common *moving* or *rolling* statistics. Among these are count, sum, +mean, median, correlation, variance, covariance, standard deviation, skewness, +and kurtosis. All of these methods are in the :mod:`pandas` namespace, but +otherwise they can be found in :mod:`pandas.stats.moments`. + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``rolling_count``, Number of non-null observations + ``rolling_sum``, Sum of values + ``rolling_mean``, Mean of values + ``rolling_median``, Arithmetic median of values + ``rolling_min``, Minimum + ``rolling_max``, Maximum + ``rolling_std``, Unbiased standard deviation + ``rolling_var``, Unbiased variance + ``rolling_skew``, Unbiased skewness (3rd moment) + ``rolling_kurt``, Unbiased kurtosis (4th moment) + ``rolling_quantile``, Sample quantile (value at %) + ``rolling_apply``, Generic apply + ``rolling_cov``, Unbiased covariance (binary) + ``rolling_corr``, Correlation (binary) + ``rolling_window``, Moving window function + +Generally these methods all have the same interface. The binary operators +(e.g. ``rolling_corr``) take two Series or DataFrames. Otherwise, they all +accept the following arguments: + + - ``window``: size of moving window + - ``min_periods``: threshold of non-null data points to require (otherwise + result is NA) + - ``freq``: optionally specify a :ref:`frequency string ` + or :ref:`DateOffset ` to pre-conform the data to. 
+ Note that prior to pandas v0.8.0, a keyword argument ``time_rule`` was used + instead of ``freq`` that referred to the legacy time rule constants + - ``how``: optionally specify method for down or re-sampling. Default is + is min for ``rolling_min``, max for ``rolling_max``, median for + ``rolling_median``, and mean for all other rolling functions. See + :meth:`DataFrame.resample`'s how argument for more information. + +These functions can be applied to ndarrays or Series objects: + +.. ipython:: python + + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + ts.plot(style='k--') + + @savefig rolling_mean_ex.png + rolling_mean(ts, 60).plot(style='k') + +They can also be applied to DataFrame objects. This is really just syntactic +sugar for applying the moving window operator to all of the DataFrame's columns: + +.. ipython:: python + :suppress: + + plt.close('all') + +.. ipython:: python + + df = DataFrame(randn(1000, 4), index=ts.index, + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + + @savefig rolling_mean_frame.png + rolling_sum(df, 60).plot(subplots=True) + +The ``rolling_apply`` function takes an extra ``func`` argument and performs +generic rolling computations. The ``func`` argument should be a single function +that produces a single value from an ndarray input. Suppose we wanted to +compute the mean absolute deviation on a rolling basis: + +.. ipython:: python + + mad = lambda x: np.fabs(x - x.mean()).mean() + @savefig rolling_apply_ex.png + rolling_apply(ts, 60, mad).plot(style='k') + +The ``rolling_window`` function performs a generic rolling window computation +on the input data. The weights used in the window are specified by the ``win_type`` +keyword. The list of recognized types are: + + - ``boxcar`` + - ``triang`` + - ``blackman`` + - ``hamming`` + - ``bartlett`` + - ``parzen`` + - ``bohman`` + - ``blackmanharris`` + - ``nuttall`` + - ``barthann`` + - ``kaiser`` (needs beta) + - ``gaussian`` (needs std) + - ``general_gaussian`` (needs power, width) + - ``slepian`` (needs width). + +.. ipython:: python + + ser = Series(randn(10), index=date_range('1/1/2000', periods=10)) + + rolling_window(ser, 5, 'triang') + +Note that the ``boxcar`` window is equivalent to ``rolling_mean``: + +.. ipython:: python + + rolling_window(ser, 5, 'boxcar') + + rolling_mean(ser, 5) + +For some windowing functions, additional parameters must be specified: + +.. ipython:: python + + rolling_window(ser, 5, 'gaussian', std=0.1) + +By default the labels are set to the right edge of the window, but a +``center`` keyword is available so the labels can be set at the center. +This keyword is available in other rolling functions as well. + +.. ipython:: python + + rolling_window(ser, 5, 'boxcar') + + rolling_window(ser, 5, 'boxcar', center=True) + + rolling_mean(ser, 5, center=True) + + +.. _stats.moments.binary: + +Binary rolling moments +~~~~~~~~~~~~~~~~~~~~~~ + +``rolling_cov`` and ``rolling_corr`` can compute moving window statistics about +two ``Series`` or any combination of ``DataFrame/Series`` or +``DataFrame/DataFrame``. Here is the behavior in each case: + +- two ``Series``: compute the statistic for the pairing. +- ``DataFrame/Series``: compute the statistics for each column of the DataFrame + with the passed Series, thus returning a DataFrame. +- ``DataFrame/DataFrame``: by default compute the statistic for matching column + names, returning a DataFrame. 
If the keyword argument ``pairwise=True`` is + passed then computes the statistic for each pair of columns, returning a + ``Panel`` whose ``items`` are the dates in question (see :ref:`the next section + `). + +For example: + +.. ipython:: python + + df2 = df[:20] + rolling_corr(df2, df2['B'], window=5) + +.. _stats.moments.corr_pairwise: + +Computing rolling pairwise covariances and correlations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In financial data analysis and other fields it's common to compute covariance +and correlation matrices for a collection of time series. Often one is also +interested in moving-window covariance and correlation matrices. This can be +done by passing the ``pairwise`` keyword argument, which in the case of +``DataFrame`` inputs will yield a ``Panel`` whose ``items`` are the dates in +question. In the case of a single DataFrame argument the ``pairwise`` argument +can even be omitted: + +.. note:: + + Missing values are ignored and each entry is computed using the pairwise + complete observations. Please see the :ref:`covariance section + ` for :ref:`caveats + ` associated with this method of + calculating covariance and correlation matrices. + +.. ipython:: python + + covs = rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) + covs[df.index[-50]] + +.. ipython:: python + + correls = rolling_corr(df, 50) + correls[df.index[-50]] + +.. note:: + + Prior to version 0.14 this was available through ``rolling_corr_pairwise`` + which is now simply syntactic sugar for calling ``rolling_corr(..., + pairwise=True)`` and deprecated. This is likely to be removed in a future + release. + +You can efficiently retrieve the time series of correlations between two +columns using ``ix`` indexing: + +.. ipython:: python + :suppress: + + plt.close('all') + +.. ipython:: python + + @savefig rolling_corr_pairwise_ex.png + correls.ix[:, 'A', 'C'].plot() + +Expanding window moment functions +--------------------------------- +A common alternative to rolling statistics is to use an *expanding* window, +which yields the value of the statistic with all the data available up to that +point in time. As these calculations are a special case of rolling statistics, +they are implemented in pandas such that the following two calls are equivalent: + +.. ipython:: python + + rolling_mean(df, window=len(df), min_periods=1)[:5] + + expanding_mean(df)[:5] + +Like the ``rolling_`` functions, the following methods are included in the +``pandas`` namespace or can be located in ``pandas.stats.moments``. + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``expanding_count``, Number of non-null observations + ``expanding_sum``, Sum of values + ``expanding_mean``, Mean of values + ``expanding_median``, Arithmetic median of values + ``expanding_min``, Minimum + ``expanding_max``, Maximum + ``expanding_std``, Unbiased standard deviation + ``expanding_var``, Unbiased variance + ``expanding_skew``, Unbiased skewness (3rd moment) + ``expanding_kurt``, Unbiased kurtosis (4th moment) + ``expanding_quantile``, Sample quantile (value at %) + ``expanding_apply``, Generic apply + ``expanding_cov``, Unbiased covariance (binary) + ``expanding_corr``, Correlation (binary) + +Aside from not having a ``window`` parameter, these functions have the same +interfaces as their ``rolling_`` counterpart. Like above, the parameters they +all accept are: + + - ``min_periods``: threshold of non-null data points to require. 
Defaults to + minimum needed to compute statistic. No ``NaNs`` will be output once + ``min_periods`` non-null data points have been seen. + - ``freq``: optionally specify a :ref:`frequency string ` + or :ref:`DateOffset ` to pre-conform the data to. + Note that prior to pandas v0.8.0, a keyword argument ``time_rule`` was used + instead of ``freq`` that referred to the legacy time rule constants + +.. note:: + + The output of the ``rolling_`` and ``expanding_`` functions do not return a + ``NaN`` if there are at least ``min_periods`` non-null values in the current + window. This differs from ``cumsum``, ``cumprod``, ``cummax``, and + ``cummin``, which return ``NaN`` in the output wherever a ``NaN`` is + encountered in the input. + +An expanding window statistic will be more stable (and less responsive) than +its rolling window counterpart as the increasing window size decreases the +relative impact of an individual data point. As an example, here is the +``expanding_mean`` output for the previous time series dataset: + +.. ipython:: python + :suppress: + + plt.close('all') + +.. ipython:: python + + ts.plot(style='k--') + + @savefig expanding_mean_frame.png + expanding_mean(ts).plot(style='k') + +Exponentially weighted moment functions +--------------------------------------- + +A related set of functions are exponentially weighted versions of many of the +above statistics. A number of EW (exponentially weighted) functions are +provided using the blending method. For example, where :math:`y_t` is the +result and :math:`x_t` the input, we compute an exponentially weighted moving +average as + +.. math:: + + y_t = (1 - \alpha) y_{t-1} + \alpha x_t + +One must have :math:`0 < \alpha \leq 1`, but rather than pass :math:`\alpha` +directly, it's easier to think about either the **span**, **center of mass +(com)** or **halflife** of an EW moment: + +.. math:: + + \alpha = + \begin{cases} + \frac{2}{s + 1}, s = \text{span}\\ + \frac{1}{1 + c}, c = \text{center of mass}\\ + 1 - \exp^{\frac{\log 0.5}{h}}, h = \text{half life} + \end{cases} + +.. note:: + + the equation above is sometimes written in the form + + .. math:: + + y_t = \alpha' y_{t-1} + (1 - \alpha') x_t + + where :math:`\alpha' = 1 - \alpha`. + +You can pass one of the three to these functions but not more. **Span** +corresponds to what is commonly called a "20-day EW moving average" for +example. **Center of mass** has a more physical interpretation. For example, +**span** = 20 corresponds to **com** = 9.5. **Halflife** is the period of +time for the exponential weight to reduce to one half. Here is the list of +functions available: + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``ewma``, EW moving average + ``ewmvar``, EW moving variance + ``ewmstd``, EW moving standard deviation + ``ewmcorr``, EW moving correlation + ``ewmcov``, EW moving covariance + +Here are an example for a univariate time series: + +.. ipython:: python + + plt.close('all') + ts.plot(style='k--') + + @savefig ewma_ex.png + ewma(ts, span=20).plot(style='k') + +.. note:: + + The EW functions perform a standard adjustment to the initial observations + whereby if there are fewer observations than called for in the span, those + observations are reweighted accordingly. 
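+
+To make the parameterization concrete, here is a minimal sketch (it assumes the
+``ts`` series from the examples above is still in scope) showing that the
+**span** and **center of mass** forms describe the same smoothing:
+
+.. code-block:: python
+
+   # span=20 and com=9.5 imply the same alpha, so the two results should agree
+   (ewma(ts, span=20) - ewma(ts, com=9.5)).abs().max()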
diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 00000000..4f01fe4f --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,309 @@ +# -*- coding: utf-8 -*- +# +# pandas documentation build configuration file, created by +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import re +from pandas.compat import u, PY3 + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.append(os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('../sphinxext')) + +sys.path.extend([ + + # numpy standard doc extensions + os.path.join(os.path.dirname(__file__), + '..', '../..', + 'sphinxext') + +]) + +# -- General configuration ----------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. sphinxext. + +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.extlinks', + 'sphinx.ext.todo', + 'numpydoc', # used to parse numpy-style docstrings for autodoc + 'ipython_sphinxext.ipython_directive', + 'ipython_sphinxext.ipython_console_highlighting', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.pngmath', + 'sphinx.ext.ifconfig', + 'matplotlib.sphinxext.only_directives', + 'matplotlib.sphinxext.plot_directive', + ] + + + +with open("index.rst") as f: + lines = f.readlines() + +# only include the slow autosummary feature if we're building the API section +# of the docs + +# JP: added from sphinxdocs +autosummary_generate = False + +if any([re.match("\s*api\s*",l) for l in lines]): + autosummary_generate = True + +ds = [] +for f in os.listdir(os.path.dirname(__file__)): + if (not f.endswith(('.rst'))) or (f.startswith('.')) or os.path.basename(f) == 'index.rst': + continue + + _f = f.split('.rst')[0] + if not any([re.match("\s*%s\s*$" % _f,l) for l in lines]): + ds.append(f) + +if ds: + print("I'm about to DELETE the following:\n%s\n" % list(sorted(ds))) + sys.stdout.write("WARNING: I'd like to delete those to speed up processing (yes/no)? ") + if PY3: + answer = input() + else: + answer = raw_input() + + if answer.lower().strip() in ('y','yes'): + for f in ds: + f = os.path.join(os.path.join(os.path.dirname(__file__),f)) + f= os.path.abspath(f) + try: + print("Deleting %s" % f) + os.unlink(f) + except: + print("Error deleting %s" % f) + pass + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['../_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u('pandas') +copyright = u('2008-2014, the pandas development team') + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. 
+import pandas + +# version = '%s r%s' % (pandas.__version__, svn_version()) +version = '%s' % (pandas.__version__) + +# The full version, including alpha/beta/rc tags. +release = version + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +# unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'nature_with_gtoc' + +# The style sheet to use for HTML and HTML Help pages. A file of that name +# must exist either in Sphinx' static/ path, or in one of the custom paths +# given in html_static_path. +# html_style = 'statsmodels.css' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['themes'] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. 
+# html_additional_pages = {} + +# If false, no module index is generated. +html_use_modindex = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pandas' + + +# -- Options for LaTeX output -------------------------------------------- + +# The paper size ('letter' or 'a4'). +# latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +# latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'pandas.tex', + u('pandas: powerful Python data analysis toolkit'), + u('Wes McKinney\n\& PyData Development Team'), 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +# latex_preamble = '' + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_use_modindex = True + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + 'statsmodels': ('http://statsmodels.sourceforge.net/devel/', None), + 'matplotlib': ('http://matplotlib.org/', None), + 'python': ('http://docs.python.org/', None) +} +import glob +autosummary_generate = glob.glob("*.rst") + +# extlinks alias +extlinks = {'issue': ('https://github.com/pydata/pandas/issues/%s', + 'GH'), + 'wiki': ('https://github.com/pydata/pandas/wiki/%s', + 'wiki ')} + +ipython_exec_lines = [ + 'import numpy as np', + 'import pandas as pd', + # This ensures correct rendering on system with console encoding != utf8 + # (windows). It forces pandas to encode it's output reprs using utf8 + # whereever the docs are built. The docs' target is the browser, not + # the console, so this is fine. + 'pd.options.display.encoding="utf8"' + ] + +# remove the docstring of the flags attribute (inherited from numpy ndarray) +# because these give doc build errors (see GH issue 5331) +def remove_flags_docstring(app, what, name, obj, options, lines): + if what == "attribute" and name.endswith(".flags"): + del lines[:] + +def setup(app): + app.connect("autodoc-process-docstring", remove_flags_docstring) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst new file mode 100644 index 00000000..6d76c6e4 --- /dev/null +++ b/doc/source/contributing.rst @@ -0,0 +1,16 @@ +.. _contributing: + +********************** +Contributing to pandas +********************** + +See the following links: + +- `The developer pages on the website + `_ +- `Guidelines on bug reports and pull requests + `_ +- `Some extra tips on using git + `_ + +.. 
include:: ../README.rst diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst new file mode 100644 index 00000000..84411231 --- /dev/null +++ b/doc/source/cookbook.rst @@ -0,0 +1,690 @@ +.. _cookbook: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + import os + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + options.display.mpl_style='default' + import pandas as pd + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + +******** +Cookbook +******** + +This is a repository for *short and sweet* examples and links for useful pandas recipes. +We encourage users to add to this documentation. + +This is a great *First Pull Request* (to add interesting links and/or put short code inline +for existing links) + +Idioms +------ + +.. _cookbook.idioms: + +These are some neat pandas ``idioms`` + +`How to do if-then-else? +`__ + +`How to do if-then-else #2 +`__ + +`How to split a frame with a boolean criterion? +`__ + +`How to select from a frame with complex criteria? +`__ + +`Select rows closest to a user-defined number +`__ + +`How to reduce a sequence (e.g. of Series) using a binary operator +`__ + + +.. _cookbook.selection: + +Selection +--------- + +The :ref:`indexing ` docs. + +`Indexing using both row labels and conditionals +`__ + +`Use loc for label-oriented slicing and iloc positional slicing +`__ + +`Extend a panel frame by transposing, adding a new dimension, and transposing back to the original dimensions +`__ + +`Mask a panel by using np.where and then reconstructing the panel with the new masked values +`__ + +`Using ~ to take the complement of a boolean array, see +`__ + +`Efficiently creating columns using applymap +`__ + +`Keep other columns when using min() with groupby +`__ + +.. _cookbook.multi_index: + +MultiIndexing +------------- + +The :ref:`multindexing ` docs. + +`Creating a multi-index from a labeled frame +`__ + +Arithmetic +~~~~~~~~~~ + +`Performing arithmetic with a multi-index that needs broadcasting +`__ + +Slicing +~~~~~~~ + +`Slicing a multi-index with xs +`__ + +`Slicing a multi-index with xs #2 +`__ + +`Setting portions of a multi-index with xs +`__ + +Sorting +~~~~~~~ + +`Multi-index sorting +`__ + +`Partial Selection, the need for sortedness +`__ + +Levels +~~~~~~ + +`Prepending a level to a multiindex +`__ + +`Flatten Hierarchical columns +`__ + +panelnd +~~~~~~~ + +The :ref:`panelnd` docs. + +`Construct a 5D panelnd +`__ + +.. _cookbook.missing_data: + +Missing Data +------------ + +The :ref:`missing data` docs. + +Fill forward a reversed timeseries + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(6,1), index=pd.date_range('2013-08-01', periods=6, freq='B'), columns=list('A')) + df.ix[3,'A'] = np.nan + df + df.reindex(df.index[::-1]).ffill() + +`cumsum reset at NaN values +`__ + +Replace +~~~~~~~ + +`Using replace with backrefs +`__ + +.. _cookbook.grouping: + +Grouping +-------- + +The :ref:`grouping ` docs. + +`Basic grouping with apply +`__ + +`Using get_group +`__ + +`Apply to different items in a group +`__ + +`Expanding Apply +`__ + +`Replacing values with groupby means +`__ + +`Sort by group with aggregation +`__ + +`Create multiple aggregated columns +`__ + +`Create a value counts column and reassign back to the DataFrame +`__ + +`Shift groups of the values in a column based on the index +`__ + +.. 
ipython:: python + + df = pd.DataFrame( + {u'line_race': [10L, 10L, 8L, 10L, 10L, 8L], + u'beyer': [99L, 102L, 103L, 103L, 88L, 100L]}, + index=[u'Last Gunfighter', u'Last Gunfighter', u'Last Gunfighter', + u'Paynter', u'Paynter', u'Paynter']); df + + df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) + df + +Expanding Data +~~~~~~~~~~~~~~ + +`Alignment and to-date +`__ + +`Rolling Computation window based on values instead of counts +`__ + +`Rolling Mean by Time Interval +`__ + +Splitting +~~~~~~~~~ + +`Splitting a frame +`__ + +.. _cookbook.pivot: + +Pivot +~~~~~ +The :ref:`Pivot ` docs. + +`Partial sums and subtotals +`__ + +`Frequency table like plyr in R +`__ + +Apply +~~~~~ + +`Turning embedded lists into a multi-index frame +`__ + +`Rolling apply with a DataFrame returning a Series +`__ + +`Rolling apply with a DataFrame returning a Scalar +`__ + +Timeseries +---------- + +`Between times +`__ + +`Using indexer between time +`__ + +`Constructing a datetime range that excludes weekends and includes only certain times +`__ + +`Vectorized Lookup +`__ + +Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. +`How to rearrange a python pandas DataFrame? +`__ + +`Dealing with duplicates when reindexing a timeseries to a specified frequency +`__ + +Calculate the first day of the month for each entry in a DatetimeIndex + +.. ipython:: python + + dates = pd.date_range('2000-01-01', periods=5) + dates.to_period(freq='M').to_timestamp() + +.. _cookbook.resample: + +Resampling +~~~~~~~~~~ + +The :ref:`Resample ` docs. + +`TimeGrouping of values grouped across time +`__ + +`TimeGrouping #2 +`__ + +`Using TimeGrouper and another grouping to create subgroups, then apply a custom function +`__ + +`Resampling with custom periods +`__ + +`Resample intraday frame without adding new days +`__ + +`Resample minute data +`__ + +`Resample with groupby `__ + +.. _cookbook.merge: + +Merge +----- + +The :ref:`Concat ` docs. The :ref:`Join ` docs. + +`emulate R rbind +`__ + +`Self Join +`__ + +`How to set the index and join +`__ + +`KDB like asof join +`__ + +`Join with a criteria based on the values +`__ + +.. _cookbook.plotting: + +Plotting +-------- + +The :ref:`Plotting ` docs. + +`Make Matplotlib look like R +`__ + +`Setting x-axis major and minor labels +`__ + +`Plotting multiple charts in an ipython notebook +`__ + +`Creating a multi-line plot +`__ + +`Plotting a heatmap +`__ + +`Annotate a time-series plot +`__ + +`Annotate a time-series plot #2 +`__ + +`Generate Embedded plots in excel files using Pandas, Vincent and xlsxwriter +`__ + +`Boxplot for each quartile of a stratifying variable +`__ + +.. ipython:: python + + df = pd.DataFrame( + {u'stratifying_var': np.random.uniform(0, 100, 20), + u'price': np.random.normal(100, 5, 20)} + ) + df[u'quartiles'] = pd.qcut( + df[u'stratifying_var'], + 4, + labels=[u'0-25%', u'25-50%', u'50-75%', u'75-100%'] + ) + + @savefig quartile_boxplot.png + df.boxplot(column=u'price', by=u'quartiles') + + +Data In/Out +----------- + +`Performance comparison of SQL vs HDF5 +`__ + +.. _cookbook.csv: + +CSV +~~~ + +The :ref:`CSV ` docs + +`read_csv in action `__ + +`appending to a csv +`__ + +`Reading a csv chunk-by-chunk +`__ + +`Reading only certain rows of a csv chunk-by-chunk +`__ + +`Reading the first few lines of a frame +`__ + +Reading a file that is compressed but not by ``gzip/bz2`` (the native compressed formats which ``read_csv`` understands). 
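+
+A minimal sketch of the general pattern (the archive name ``data.zip`` is
+hypothetical, and the sketch assumes the archive contains a single CSV file):
+
+.. code-block:: python
+
+   import zipfile
+   import pandas as pd
+
+   # open the archive in a context manager and pass the member's file
+   # handle straight to read_csv
+   with zipfile.ZipFile('data.zip') as zf:
+       df = pd.read_csv(zf.open(zf.namelist()[0]))
+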
+This example shows a ``WinZipped`` file, but is a general application of opening the file within a context manager and +using that handle to read. +`See here +`__ + +`Inferring dtypes from a file +`__ + +`Dealing with bad lines +`__ + +`Dealing with bad lines II +`__ + +`Reading CSV with Unix timestamps and converting to local timezone +`__ + +`Write a multi-row index CSV without writing duplicates +`__ + +Parsing date components in multi-columns is faster with a format + +.. code-block:: python + + In [30]: i = pd.date_range('20000101',periods=10000) + + In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day)) + + In [32]: df.head() + Out[32]: + day month year + 0 1 1 2000 + 1 2 1 2000 + 2 3 1 2000 + 3 4 1 2000 + 4 5 1 2000 + + In [33]: %timeit pd.to_datetime(df.year*10000+df.month*100+df.day,format='%Y%m%d') + 100 loops, best of 3: 7.08 ms per loop + + # simulate combinging into a string, then parsing + In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'],x['month'],x['day']),axis=1) + + In [35]: ds.head() + Out[35]: + 0 20000101 + 1 20000102 + 2 20000103 + 3 20000104 + 4 20000105 + dtype: object + + In [36]: %timeit pd.to_datetime(ds) + 1 loops, best of 3: 488 ms per loop + +.. _cookbook.sql: + +SQL +~~~ + +The :ref:`SQL ` docs + +`Reading from databases with SQL +`__ + +.. _cookbook.excel: + +Excel +~~~~~ + +The :ref:`Excel ` docs + +`Reading from a filelike handle +`__ + +.. _cookbook.html: + +`Reading HTML tables from a server that cannot handle the default request +header `__ + +.. _cookbook.hdf: + +HDFStore +~~~~~~~~ + +The :ref:`HDFStores ` docs + +`Simple Queries with a Timestamp Index +`__ + +`Managing heterogeneous data using a linked multiple table hierarchy +`__ + +`Merging on-disk tables with millions of rows +`__ + +Deduplicating a large store by chunks, essentially a recursive reduction operation. Shows a function for taking in data from +csv file and creating a store by chunks, with date parsing as well. +`See here +`__ + +`Creating a store chunk-by-chunk from a csv file +`__ + +`Appending to a store, while creating a unique index +`__ + +`Large Data work flows +`__ + +`Reading in a sequence of files, then providing a global unique index to a store while appending +`__ + +`Groupby on a HDFStore +`__ + +`Hierarchical queries on a HDFStore +`__ + +`Counting with a HDFStore +`__ + +`Troubleshoot HDFStore exceptions +`__ + +`Setting min_itemsize with strings +`__ + +`Using ptrepack to create a completely-sorted-index on a store +`__ + +Storing Attributes to a group node + +.. ipython:: python + + df = DataFrame(np.random.randn(8,3)) + store = HDFStore('test.h5') + store.put('df',df) + + # you can store an arbitrary python object via pickle + store.get_storer('df').attrs.my_attribute = dict(A = 10) + store.get_storer('df').attrs.my_attribute + +.. ipython:: python + :suppress: + + store.close() + os.remove('test.h5') + + +.. _cookbook.binary: + +Binary Files +~~~~~~~~~~~~ + +pandas readily accepts numpy record arrays, if you need to read in a binary +file consisting of an array of C structs. For example, given this C program +in a file called ``main.c`` compiled with ``gcc main.c -std=gnu99`` on a +64-bit machine, + +.. 
code-block:: c + + #include + #include + + typedef struct _Data + { + int32_t count; + double avg; + float scale; + } Data; + + int main(int argc, const char *argv[]) + { + size_t n = 10; + Data d[n]; + + for (int i = 0; i < n; ++i) + { + d[i].count = i; + d[i].avg = i + 1.0; + d[i].scale = (float) i + 2.0f; + } + + FILE *file = fopen("binary.dat", "wb"); + fwrite(&d, sizeof(Data), n, file); + fclose(file); + + return 0; + } + +the following Python code will read the binary file ``'binary.dat'`` into a +pandas ``DataFrame``, where each element of the struct corresponds to a column +in the frame: + +.. code-block:: python + + import numpy as np + from pandas import DataFrame + + names = 'count', 'avg', 'scale' + + # note that the offsets are larger than the size of the type because of + # struct padding + offsets = 0, 8, 16 + formats = 'i4', 'f8', 'f4' + dt = np.dtype({'names': names, 'offsets': offsets, 'formats': formats}, + align=True) + df = DataFrame(np.fromfile('binary.dat', dt)) + +.. note:: + + The offsets of the structure elements may be different depending on the + architecture of the machine on which the file was created. Using a raw + binary file format like this for general data storage is not recommended, as + it is not cross platform. We recommended either HDF5 or msgpack, both of + which are supported by pandas' IO facilities. + +Computation +----------- + +`Numerical integration (sample-based) of a time series +`__ + +Miscellaneous +------------- + +The :ref:`Timedeltas ` docs. + +`Operating with timedeltas +`__ + +`Create timedeltas with date differences +`__ + +`Adding days to dates in a dataframe +`__ + +Aliasing Axis Names +------------------- + +To globally provide aliases for axis names, one can define these 2 functions: + +.. ipython:: python + + def set_axis_alias(cls, axis, alias): + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES[alias] = axis + +.. ipython:: python + + def clear_axis_alias(cls, axis, alias): + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES.pop(alias,None) + +.. ipython:: python + + set_axis_alias(DataFrame,'columns', 'myaxis2') + df2 = DataFrame(randn(3,2),columns=['c1','c2'],index=['i1','i2','i3']) + df2.sum(axis='myaxis2') + clear_axis_alias(DataFrame,'columns', 'myaxis2') + +Creating Example Data +--------------------- + +To create a dataframe from every combination of some given values, like R's ``expand.grid()`` +function, we can create a dict where the keys are column names and the values are lists +of the data values: + +.. ipython:: python + + import itertools + + def expand_grid(data_dict): + rows = itertools.product(*data_dict.values()) + return pd.DataFrame.from_records(rows, columns=data_dict.keys()) + + df = expand_grid( + {'height': [60, 70], + 'weight': [100, 140, 180], + 'sex': ['Male', 'Female']} + ) + df diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst new file mode 100644 index 00000000..7c43a03e --- /dev/null +++ b/doc/source/dsintro.rst @@ -0,0 +1,964 @@ +.. currentmodule:: pandas +.. _dsintro: + + +.. 
ipython:: python + :suppress: + + import numpy as np + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + set_option('display.precision', 4, 'display.max_columns', 8) + options.display.max_rows=15 + import pandas as pd + + +************************ +Intro to Data Structures +************************ + +We'll start with a quick, non-comprehensive overview of the fundamental data +structures in pandas to get you started. The fundamental behavior about data +types, indexing, and axis labeling / alignment apply across all of the +objects. To get started, import numpy and load pandas into your namespace: + +.. ipython:: python + + import numpy as np + # will use a lot in examples + randn = np.random.randn + from pandas import * + +Here is a basic tenet to keep in mind: **data alignment is intrinsic**. The link +between labels and data will not be broken unless done so explicitly by you. + +We'll give a brief intro to the data structures, then consider all of the broad +categories of functionality and methods in separate sections. + +When using pandas, we recommend the following import convention: + +.. code-block:: python + + import pandas as pd + + +.. _basics.series: + +Series +------ + +.. warning:: + + In 0.13.0 ``Series`` has internaly been refactored to no longer sub-class ``ndarray`` + but instead subclass ``NDFrame``, similarly to the rest of the pandas containers. This should be + a transparent change with only very limited API implications (See the :ref:`Internal Refactoring`) + +:class:`Series` is a one-dimensional labeled array capable of holding any data +type (integers, strings, floating point numbers, Python objects, etc.). The axis +labels are collectively referred to as the **index**. The basic method to create a Series is to call: + +:: + + >>> s = Series(data, index=index) + +Here, ``data`` can be many different things: + + - a Python dict + - an ndarray + - a scalar value (like 5) + +The passed **index** is a list of axis labels. Thus, this separates into a few +cases depending on what **data is**: + +**From ndarray** + +If ``data`` is an ndarray, **index** must be the same length as **data**. If no +index is passed, one will be created having values ``[0, ..., len(data) - 1]``. + +.. ipython:: python + + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s + s.index + + Series(randn(5)) + +.. note:: + + Starting in v0.8.0, pandas supports non-unique index values. If an operation + that does not support duplicate index values is attempted, an exception + will be raised at that time. The reason for being lazy is nearly all performance-based + (there are many instances in computations, like parts of GroupBy, where the index + is not used). + +**From dict** + +If ``data`` is a dict, if **index** is passed the values in data corresponding +to the labels in the index will be pulled out. Otherwise, an index will be +constructed from the sorted keys of the dict, if possible. + +.. ipython:: python + + d = {'a' : 0., 'b' : 1., 'c' : 2.} + Series(d) + Series(d, index=['b', 'c', 'd', 'a']) + +.. note:: + + NaN (not a number) is the standard missing data marker used in pandas + +**From scalar value** If ``data`` is a scalar value, an index must be +provided. The value will be repeated to match the length of **index** + +.. ipython:: python + + Series(5., index=['a', 'b', 'c', 'd', 'e']) + +Series is ndarray-like +~~~~~~~~~~~~~~~~~~~~~~ + +``Series`` acts very similary to a ``ndarray``, and is a valid argument to most NumPy functions. 
+However, things like slicing also slice the index. + +.. ipython :: python + + s[0] + s[:3] + s[s > s.median()] + s[[4, 3, 1]] + np.exp(s) + +We will address array-based indexing in a separate :ref:`section `. + +Series is dict-like +~~~~~~~~~~~~~~~~~~~ + +A Series is like a fixed-size dict in that you can get and set values by index +label: + +.. ipython :: python + + s['a'] + s['e'] = 12. + s + 'e' in s + 'f' in s + +If a label is not contained, an exception is raised: + +.. code-block:: python + + >>> s['f'] + KeyError: 'f' + +Using the ``get`` method, a missing label will return None or specified default: + +.. ipython:: python + + s.get('f') + + s.get('f', np.nan) + +See also the :ref:`section on attribute access`. + +Vectorized operations and label alignment with Series +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When doing data analysis, as with raw NumPy arrays looping through Series +value-by-value is usually not necessary. Series can be also be passed into most +NumPy methods expecting an ndarray. + + +.. ipython:: python + + s + s + s * 2 + np.exp(s) + +A key difference between Series and ndarray is that operations between Series +automatically align the data based on label. Thus, you can write computations +without giving consideration to whether the Series involved have the same +labels. + +.. ipython:: python + + s[1:] + s[:-1] + +The result of an operation between unaligned Series will have the **union** of +the indexes involved. If a label is not found in one Series or the other, the +result will be marked as missing ``NaN``. Being able to write code without doing +any explicit data alignment grants immense freedom and flexibility in +interactive data analysis and research. The integrated data alignment features +of the pandas data structures set pandas apart from the majority of related +tools for working with labeled data. + +.. note:: + + In general, we chose to make the default result of operations between + differently indexed objects yield the **union** of the indexes in order to + avoid loss of information. Having an index label, though the data is + missing, is typically important information as part of a computation. You + of course have the option of dropping labels with missing data via the + **dropna** function. + +Name attribute +~~~~~~~~~~~~~~ + +.. _dsintro.name_attribute: + +Series can also have a ``name`` attribute: + +.. ipython:: python + + s = Series(np.random.randn(5), name='something') + s + s.name + +The Series ``name`` will be assigned automatically in many cases, in particular +when taking 1D slices of DataFrame as you will see below. + +.. _basics.dataframe: + +DataFrame +--------- + +**DataFrame** is a 2-dimensional labeled data structure with columns of +potentially different types. You can think of it like a spreadsheet or SQL +table, or a dict of Series objects. It is generally the most commonly used +pandas object. Like Series, DataFrame accepts many different kinds of input: + + - Dict of 1D ndarrays, lists, dicts, or Series + - 2-D numpy.ndarray + - `Structured or record + `__ ndarray + - A ``Series`` + - Another ``DataFrame`` + +Along with the data, you can optionally pass **index** (row labels) and +**columns** (column labels) arguments. If you pass an index and / or columns, +you are guaranteeing the index and / or columns of the resulting +DataFrame. Thus, a dict of Series plus a specific index will discard all data +not matching up to the passed index. 
+ +If axis labels are not passed, they will be constructed from the input data +based on common sense rules. + +From dict of Series or dicts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The result **index** will be the **union** of the indexes of the various +Series. If there are any nested dicts, these will be first converted to +Series. If no columns are passed, the columns will be the sorted list of dict +keys. + +.. ipython:: python + + d = {'one' : Series([1., 2., 3.], index=['a', 'b', 'c']), + 'two' : Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + df = DataFrame(d) + df + + DataFrame(d, index=['d', 'b', 'a']) + DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three']) + +The row and column labels can be accessed respectively by accessing the +**index** and **columns** attributes: + +.. note:: + + When a particular set of columns is passed along with a dict of data, the + passed columns override the keys in the dict. + +.. ipython:: python + + df.index + df.columns + +From dict of ndarrays / lists +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ndarrays must all be the same length. If an index is passed, it must +clearly also be the same length as the arrays. If no index is passed, the +result will be ``range(n)``, where ``n`` is the array length. + +.. ipython:: python + + d = {'one' : [1., 2., 3., 4.], + 'two' : [4., 3., 2., 1.]} + DataFrame(d) + DataFrame(d, index=['a', 'b', 'c', 'd']) + +From structured or record array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This case is handled identically to a dict of arrays. + +.. ipython:: python + + data = np.zeros((2,),dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')]) + data[:] = [(1,2.,'Hello'),(2,3.,"World")] + + DataFrame(data) + DataFrame(data, index=['first', 'second']) + DataFrame(data, columns=['C', 'A', 'B']) + +.. note:: + + DataFrame is not intended to work exactly like a 2-dimensional NumPy + ndarray. + +.. _basics.dataframe.from_list_of_dicts: + +From a list of dicts +~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}] + DataFrame(data2) + DataFrame(data2, index=['first', 'second']) + DataFrame(data2, columns=['a', 'b']) + +.. _basics.dataframe.from_dict_of_tuples: + +From a dict of tuples +~~~~~~~~~~~~~~~~~~~~~ + +You can automatically create a multi-indexed frame by passing a tuples dictionary + +.. ipython:: python + + DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, + ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, + ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, + ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, + ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + +.. _basics.dataframe.from_series: + +From a Series +~~~~~~~~~~~~~ + +The result will be a DataFrame with the same index as the input Series, and +with one column whose name is the original name of the Series (only if no other +column name provided). + +**Missing Data** + +Much more will be said on this topic in the :ref:`Missing data ` +section. To construct a DataFrame with missing data, use ``np.nan`` for those +values which are missing. Alternatively, you may pass a ``numpy.MaskedArray`` +as the data argument to the DataFrame constructor, and its masked entries will +be considered missing. + +Alternate Constructors +~~~~~~~~~~~~~~~~~~~~~~ + +.. _basics.dataframe.from_dict: + +**DataFrame.from_dict** + +``DataFrame.from_dict`` takes a dict of dicts or a dict of array-like sequences +and returns a DataFrame. 
It operates like the ``DataFrame`` constructor except +for the ``orient`` parameter which is ``'columns'`` by default, but which can be +set to ``'index'`` in order to use the dict keys as row labels. + +.. _basics.dataframe.from_records: + +**DataFrame.from_records** + +``DataFrame.from_records`` takes a list of tuples or an ndarray with structured +dtype. Works analogously to the normal ``DataFrame`` constructor, except that +index maybe be a specific field of the structured dtype to use as the index. +For example: + +.. ipython:: python + + data + DataFrame.from_records(data, index='C') + +.. _basics.dataframe.from_items: + +**DataFrame.from_items** + +``DataFrame.from_items`` works analogously to the form of the ``dict`` +constructor that takes a sequence of ``(key, value)`` pairs, where the keys are +column (or row, in the case of ``orient='index'``) names, and the value are the +column values (or row values). This can be useful for constructing a DataFrame +with the columns in a particular order without having to pass an explicit list +of columns: + +.. ipython:: python + + DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])]) + +If you pass ``orient='index'``, the keys will be the row labels. But in this +case you must also pass the desired column names: + +.. ipython:: python + + DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], + orient='index', columns=['one', 'two', 'three']) + +Column selection, addition, deletion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can treat a DataFrame semantically like a dict of like-indexed Series +objects. Getting, setting, and deleting columns works with the same syntax as +the analogous dict operations: + +.. ipython:: python + + df['one'] + df['three'] = df['one'] * df['two'] + df['flag'] = df['one'] > 2 + df + +Columns can be deleted or popped like with a dict: + +.. ipython:: python + + del df['two'] + three = df.pop('three') + df + +When inserting a scalar value, it will naturally be propagated to fill the +column: + +.. ipython:: python + + df['foo'] = 'bar' + df + +When inserting a Series that does not have the same index as the DataFrame, it +will be conformed to the DataFrame's index: + +.. ipython:: python + + df['one_trunc'] = df['one'][:2] + df + +You can insert raw ndarrays but their length must match the length of the +DataFrame's index. + +By default, columns get inserted at the end. The ``insert`` function is +available to insert at a particular location in the columns: + +.. ipython:: python + + df.insert(1, 'bar', df['one']) + df + +Indexing / Selection +~~~~~~~~~~~~~~~~~~~~ +The basics of indexing are as follows: + +.. csv-table:: + :header: "Operation", "Syntax", "Result" + :widths: 30, 20, 10 + + Select column, ``df[col]``, Series + Select row by label, ``df.loc[label]``, Series + Select row by integer location, ``df.iloc[loc]``, Series + Slice rows, ``df[5:10]``, DataFrame + Select rows by boolean vector, ``df[bool_vec]``, DataFrame + +Row selection, for example, returns a Series whose index is the columns of the +DataFrame: + +.. ipython:: python + + df.loc['b'] + df.iloc[2] + +For a more exhaustive treatment of more sophisticated label-based indexing and +slicing, see the :ref:`section on indexing `. We will address the +fundamentals of reindexing / conforming to new sets of lables in the +:ref:`section on reindexing `. + +Data alignment and arithmetic +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Data alignment between DataFrame objects automatically align on **both the +columns and the index (row labels)**. 
Again, the resulting object will have the +union of the column and row labels. + +.. ipython:: python + + df = DataFrame(randn(10, 4), columns=['A', 'B', 'C', 'D']) + df2 = DataFrame(randn(7, 3), columns=['A', 'B', 'C']) + df + df2 + +When doing an operation between DataFrame and Series, the default behavior is +to align the Series **index** on the DataFrame **columns**, thus `broadcasting +`__ +row-wise. For example: + +.. ipython:: python + + df - df.iloc[0] + +In the special case of working with time series data, if the Series is a +TimeSeries (which it will be automatically if the index contains datetime +objects), and the DataFrame index also contains dates, the broadcasting will be +column-wise: + +.. ipython:: python + :okwarning: + + index = date_range('1/1/2000', periods=8) + df = DataFrame(randn(8, 3), index=index, columns=list('ABC')) + df + type(df['A']) + df - df['A'] + +.. warning:: + + .. code-block:: python + + df - df['A'] + + is now deprecated and will be removed in a future release. The preferred way + to replicate this behavior is + + .. code-block:: python + + df.sub(df['A'], axis=0) + +For explicit control over the matching and broadcasting behavior, see the +section on :ref:`flexible binary operations `. + +Operations with scalars are just as you would expect: + +.. ipython:: python + + df * 5 + 2 + 1 / df + df ** 4 + +.. _dsintro.boolean: + +Boolean operators work as well: + +.. ipython:: python + + df1 = DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool) + df2 = DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool) + df1 & df2 + df1 | df2 + df1 ^ df2 + -df1 + +Transposing +~~~~~~~~~~~ + +To transpose, access the ``T`` attribute (also the ``transpose`` function), +similar to an ndarray: + +.. ipython:: python + + # only show the first 5 rows + df[:5].T + +DataFrame interoperability with NumPy functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _dsintro.numpy_interop: + +Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions +can be used with no issues on DataFrame, assuming the data within are numeric: + +.. ipython:: python + + np.exp(df) + np.asarray(df) + +The dot method on DataFrame implements matrix multiplication: + +.. ipython:: python + + df.T.dot(df) + +Similarly, the dot method on Series implements dot product: + +.. ipython:: python + + s1 = Series(np.arange(5,10)) + s1.dot(s1) + +DataFrame is not intended to be a drop-in replacement for ndarray as its +indexing semantics are quite different in places from a matrix. + +Console display +~~~~~~~~~~~~~~~ + +Very large DataFrames will be truncated to display them in the console. +You can also get a summary using :meth:`~pandas.DataFrame.info`. +(Here I am reading a CSV version of the **baseball** dataset from the **plyr** +R package): + +.. ipython:: python + :suppress: + + # force a summary to be printed + pd.set_option('display.max_rows', 5) + +.. ipython:: python + + baseball = read_csv('data/baseball.csv') + print(baseball) + baseball.info() + +.. ipython:: python + :suppress: + + # restore GlobalPrintConfig + pd.reset_option('^display\.') + +However, using ``to_string`` will return a string representation of the +DataFrame in tabular form, though it won't always fit the console width: + +.. ipython:: python + + print(baseball.iloc[-20:, :12].to_string()) + +New since 0.10.0, wide DataFrames will now be printed across multiple rows by +default: + +.. 
ipython:: python + + DataFrame(randn(3, 12)) + +You can change how much to print on a single row by setting the ``display.width`` +option: + +.. ipython:: python + + set_option('display.width', 40) # default is 80 + + DataFrame(randn(3, 12)) + +.. ipython:: python + :suppress: + + reset_option('display.width') + +You can also disable this feature via the ``expand_frame_repr`` option. +This will print the table in one block. + +DataFrame column attribute access and IPython completion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a DataFrame column label is a valid Python variable name, the column can be +accessed like attributes: + +.. ipython:: python + + df = DataFrame({'foo1' : np.random.randn(5), + 'foo2' : np.random.randn(5)}) + df + df.foo1 + +The columns are also connected to the `IPython `__ +completion mechanism so they can be tab-completed: + +.. code-block:: ipython + + In [5]: df.fo + df.foo1 df.foo2 + +.. _basics.panel: + +Panel +----- + +Panel is a somewhat less-used, but still important container for 3-dimensional +data. The term `panel data `__ is +derived from econometrics and is partially responsible for the name pandas: +pan(el)-da(ta)-s. The names for the 3 axes are intended to give some semantic +meaning to describing operations involving panel data and, in particular, +econometric analysis of panel data. However, for the strict purposes of slicing +and dicing a collection of DataFrame objects, you may find the axis names +slightly arbitrary: + + - **items**: axis 0, each item corresponds to a DataFrame contained inside + - **major_axis**: axis 1, it is the **index** (rows) of each of the + DataFrames + - **minor_axis**: axis 2, it is the **columns** of each of the DataFrames + +Construction of Panels works about like you would expect: + +From 3D ndarray with optional axis labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + wp + + +From dict of DataFrame objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + data = {'Item1' : DataFrame(randn(4, 3)), + 'Item2' : DataFrame(randn(4, 2))} + Panel(data) + +Note that the values in the dict need only be **convertible to +DataFrame**. Thus, they can be any of the other valid inputs to DataFrame as +per above. + +One helpful factory method is ``Panel.from_dict``, which takes a +dictionary of DataFrames as above, and the following named parameters: + +.. csv-table:: + :header: "Parameter", "Default", "Description" + :widths: 10, 10, 40 + + intersect, ``False``, drops elements whose indices do not align + orient, ``items``, use ``minor`` to use DataFrames' columns as panel items + +For example, compare to the construction above: + +.. ipython:: python + + Panel.from_dict(data, orient='minor') + +Orient is especially useful for mixed-type DataFrames. If you pass a dict of +DataFrame objects with mixed-type columns, all of the data will get upcasted to +``dtype=object`` unless you pass ``orient='minor'``: + +.. ipython:: python + + df = DataFrame({'a': ['foo', 'bar', 'baz'], + 'b': np.random.randn(3)}) + df + data = {'item1': df, 'item2': df} + panel = Panel.from_dict(data, orient='minor') + panel['a'] + panel['b'] + panel['b'].dtypes + +.. note:: + + Unfortunately Panel, being less commonly used than Series and DataFrame, + has been slightly neglected feature-wise. 
A number of methods and options + available in DataFrame are not available in Panel. This will get worked + on, of course, in future releases. And faster if you join me in working on + the codebase. + +.. _dsintro.to_panel: + +From DataFrame using ``to_panel`` method +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method was introduced in v0.7 to replace ``LongPanel.to_long``, and converts +a DataFrame with a two-level index to a Panel. + +.. ipython:: python + + midx = MultiIndex(levels=[['one', 'two'], ['x','y']], labels=[[1,1,0,0],[1,0,1,0]]) + df = DataFrame({'A' : [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=midx) + df.to_panel() + +.. _dsintro.panel_item_selection: + +Item selection / addition / deletion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Similar to DataFrame functioning as a dict of Series, Panel is like a dict +of DataFrames: + +.. ipython:: python + + wp['Item1'] + wp['Item3'] = wp['Item1'] / wp['Item2'] + +The API for insertion and deletion is the same as for DataFrame. And as with +DataFrame, if the item is a valid python identifier, you can access it as an +attribute and tab-complete it in IPython. + +Transposing +~~~~~~~~~~~ + +A Panel can be rearranged using its ``transpose`` method (which does not make a +copy by default unless the data are heterogeneous): + +.. ipython:: python + + wp.transpose(2, 0, 1) + +Indexing / Selection +~~~~~~~~~~~~~~~~~~~~ + +.. csv-table:: + :header: "Operation", "Syntax", "Result" + :widths: 30, 20, 10 + + Select item, ``wp[item]``, DataFrame + Get slice at major_axis label, ``wp.major_xs(val)``, DataFrame + Get slice at minor_axis label, ``wp.minor_xs(val)``, DataFrame + +For example, using the earlier example data, we could do: + +.. ipython:: python + + wp['Item1'] + wp.major_xs(wp.major_axis[2]) + wp.minor_axis + wp.minor_xs('C') + +Squeezing +~~~~~~~~~ + +Another way to change the dimensionality of an object is to ``squeeze`` a 1-len object, similar to ``wp['Item1']`` + +.. ipython:: python + + wp.reindex(items=['Item1']).squeeze() + wp.reindex(items=['Item1'],minor=['B']).squeeze() + + +Conversion to DataFrame +~~~~~~~~~~~~~~~~~~~~~~~ + +A Panel can be represented in 2D form as a hierarchically indexed +DataFrame. See the section :ref:`hierarchical indexing ` +for more on this. To convert a Panel to a DataFrame, use the ``to_frame`` +method: + +.. ipython:: python + + panel = Panel(np.random.randn(3, 5, 4), items=['one', 'two', 'three'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['a', 'b', 'c', 'd']) + panel.to_frame() + + +.. _dsintro.panel4d: + +Panel4D (Experimental) +---------------------- + +``Panel4D`` is a 4-Dimensional named container very much like a ``Panel``, but +having 4 named dimensions. It is intended as a test bed for more N-Dimensional named +containers. + + - **labels**: axis 0, each item corresponds to a Panel contained inside + - **items**: axis 1, each item corresponds to a DataFrame contained inside + - **major_axis**: axis 2, it is the **index** (rows) of each of the + DataFrames + - **minor_axis**: axis 3, it is the **columns** of each of the DataFrames + + +``Panel4D`` is a sub-class of ``Panel``, so most methods that work on Panels are +applicable to Panel4D. The following methods are disabled: + + - ``join , to_frame , to_excel , to_sparse , groupby`` + +Construction of Panel4D works in a very similar manner to a ``Panel`` + +From 4D ndarray with optional axis labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
ipython:: python + + p4d = Panel4D(randn(2, 2, 5, 4), + labels=['Label1','Label2'], + items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + p4d + + +From dict of Panel objects +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + data = { 'Label1' : Panel({ 'Item1' : DataFrame(randn(4, 3)) }), + 'Label2' : Panel({ 'Item2' : DataFrame(randn(4, 2)) }) } + Panel4D(data) + +Note that the values in the dict need only be **convertible to Panels**. +Thus, they can be any of the other valid inputs to Panel as per above. + +Slicing +~~~~~~~ + +Slicing works in a similar manner to a Panel. ``[]`` slices the first dimension. +``.ix`` allows you to slice abitrarily and get back lower dimensional objects + +.. ipython:: python + + p4d['Label1'] + +4D -> Panel + +.. ipython:: python + + p4d.ix[:,:,:,'A'] + +4D -> DataFrame + +.. ipython:: python + + p4d.ix[:,:,0,'A'] + +4D -> Series + +.. ipython:: python + + p4d.ix[:,0,0,'A'] + +Transposing +~~~~~~~~~~~ + +A Panel4D can be rearranged using its ``transpose`` method (which does not make a +copy by default unless the data are heterogeneous): + +.. ipython:: python + + p4d.transpose(3, 2, 1, 0) + +.. _dsintro.panelnd: + +PanelND (Experimental) +---------------------- + +PanelND is a module with a set of factory functions to enable a user to construct N-dimensional named +containers like Panel4D, with a custom set of axis labels. Thus a domain-specific container can easily be +created. + +The following creates a Panel5D. A new panel type object must be sliceable into a lower dimensional object. +Here we slice to a Panel4D. + +.. ipython:: python + + from pandas.core import panelnd + Panel5D = panelnd.create_nd_panel_factory( + klass_name = 'Panel5D', + orders = [ 'cool', 'labels','items','major_axis','minor_axis'], + slices = { 'labels' : 'labels', 'items' : 'items', + 'major_axis' : 'major_axis', 'minor_axis' : 'minor_axis' }, + slicer = Panel4D, + aliases = { 'major' : 'major_axis', 'minor' : 'minor_axis' }, + stat_axis = 2) + + p5d = Panel5D(dict(C1 = p4d)) + p5d + + # print a slice of our 5D + p5d.ix['C1',:,:,0:3,:] + + # transpose it + p5d.transpose(1,2,3,4,0) + + # look at the shape & dim + p5d.shape + p5d.ndim diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst new file mode 100644 index 00000000..e5009aeb --- /dev/null +++ b/doc/source/ecosystem.rst @@ -0,0 +1,91 @@ +.. _ecosystem: + +**************** +pandas Ecosystem +**************** + +Increasingly, packages are being built on top of pandas to address specific needs +in data preparation, analysis and visualization. +This is encouraging because it means pandas is not only helping users to handle +their data tasks but also that it provides a better starting point for developers to +build powerful and more focused data tools. +The creation of libraries that complement pandas' functionality also allows pandas +development to remain focused around it's original requirements. + +This is an in-exhaustive list of projects that build on pandas in order to provide +tools in the PyData space. + +We'd like to make it easier for users to find these project, if you know of other +substantial projects that you feel should be on this list, please let us know. + +.. 
_ecosystem.stats: + +Statistics and Machine Learning +------------------------------- + +`Statsmodels `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Statsmodels is the prominent python "statistics and econometrics library" and it has +a long-standing special relationship with pandas. Statsmodels provides powerful statistics, +econometrics, analysis and modeling functionality that is out of pandas' scope. +Statsmodels leverages pandas objects as the underlying data container for computation. + +`sklearn-pandas `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use pandas DataFrames in your scikit-learn ML pipeline. + + + +.. _ecosystem.visualization: + +Visualization +------------- + +`Vincent `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `Vincent `__ project leverages `Vega `__ +(that in turn, leverages `d3 `__) to create plots . It has great support +for pandas data objects. + +`yhat/ggplot `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language. +Based on `"The Grammer of Graphics" `__ it +provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data. +It's really quite incredible. Various implementations to other languages are available, +but a faithful implementation for python users has long been missing. Although still young +(as of Jan-2014), the `yhat/ggplot `__ project has been +progressing quickly in that direction. + +`Seaborn `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Although pandas has quite a bit of "just plot it" functionality built-in, visualization and +in particular statistical graphics is a vast field with a long tradition and lots of ground +to cover. The `Seaborn `__ project builds on top of pandas +and `matplotlib `__ to provide easy plotting of data which extends to +more advanced types of plots then those offered by pandas. + +`Bokeh `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Bokeh is a Python interactive visualization library for large datasets that natively uses +the latest web technologies. Its goal is to provide elegant, concise construction of novel +graphics in the style of Protovis/D3, while delivering high-performance interactivity over +large data to thin clients. + +.. _ecosystem.domain: + +Domain Specific +--------------- + +`Geopandas `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Geopandas extends pandas data objects to include geographic information which support +geometric operations. If your work entails maps and geographical coordinates, and +you love pandas, you should take a close look at Geopandas. diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst new file mode 100644 index 00000000..00c76632 --- /dev/null +++ b/doc/source/enhancingperf.rst @@ -0,0 +1,670 @@ +.. _enhancingperf: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import os + import csv + from pandas import DataFrame + import pandas as pd + pd.options.display.max_rows=15 + + import numpy as np + np.random.seed(123456) + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + + +********************* +Enhancing Performance +********************* + +.. _enhancingperf.cython: + +Cython (Writing C extensions for pandas) +---------------------------------------- + +For many use cases writing pandas in pure python and numpy is sufficient. 
In some +computationally heavy applications however, it can be possible to achieve sizeable +speed-ups by offloading work to `cython `__. + +This tutorial assumes you have refactored as much as possible in python, for example +trying to remove for loops and making use of numpy vectorization, it's always worth +optimising in python first. + +This tutorial walks through a "typical" process of cythonizing a slow computation. +We use an `example from the cython documentation `__ +but in the context of pandas. Our final cythonized solution is around 100 times +faster than the pure python. + +.. _enhancingperf.pure: + +Pure python +~~~~~~~~~~~ + +We have a DataFrame to which we want to apply a function row-wise. + +.. ipython:: python + + df = DataFrame({'a': randn(1000), 'b': randn(1000),'N': randint(100, 1000, (1000)), 'x': 'x'}) + df + +Here's the function in pure python: + +.. ipython:: python + + def f(x): + return x * (x - 1) + + def integrate_f(a, b, N): + s = 0 + dx = (b - a) / N + for i in range(N): + s += f(a + i * dx) + return s * dx + +We achieve our result by by using ``apply`` (row-wise): + +.. ipython:: python + + %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + +But clearly this isn't fast enough for us. Let's take a look and see where the +time is spent during this operation (limited to the most time consuming +four calls) using the `prun ipython magic function `__: + +.. ipython:: python + + %prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + +By far the majority of time is spend inside either ``integrate_f`` or ``f``, +hence we'll concentrate our efforts cythonizing these two functions. + +.. note:: + + In python 2 replacing the ``range`` with its generator counterpart (``xrange``) + would mean the ``range`` line would vanish. In python 3 range is already a generator. + +.. _enhancingperf.plain: + +Plain cython +~~~~~~~~~~~~ + +First we're going to need to import the cython magic function to ipython: + +.. ipython:: python + + %load_ext cythonmagic + + +Now, let's simply copy our functions over to cython as is (the suffix +is here to distinguish between function versions): + +.. ipython:: + + In [2]: %%cython + ...: def f_plain(x): + ...: return x * (x - 1) + ...: def integrate_f_plain(a, b, N): + ...: s = 0 + ...: dx = (b - a) / N + ...: for i in range(N): + ...: s += f_plain(a + i * dx) + ...: return s * dx + ...: + +.. note:: + + If you're having trouble pasting the above into your ipython, you may need + to be using bleeding edge ipython for paste to play well with cell magics. + + +.. ipython:: python + + %timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1) + +Already this has shaved a third off, not too bad for a simple copy and paste. + +.. _enhancingperf.type: + +Adding type +~~~~~~~~~~~ + +We get another huge improvement simply by providing type information: + +.. ipython:: + + In [3]: %%cython + ...: cdef double f_typed(double x) except? -2: + ...: return x * (x - 1) + ...: cpdef double integrate_f_typed(double a, double b, int N): + ...: cdef int i + ...: cdef double s, dx + ...: s = 0 + ...: dx = (b - a) / N + ...: for i in range(N): + ...: s += f_typed(a + i * dx) + ...: return s * dx + ...: + +.. ipython:: python + + %timeit df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + +Now, we're talking! It's now over ten times faster than the original python +implementation, and we haven't *really* modified the code. Let's have another +look at what's eating up time: + +.. 
ipython:: python + + %prun -l 4 df.apply(lambda x: integrate_f_typed(x['a'], x['b'], x['N']), axis=1) + +.. _enhancingperf.ndarray: + +Using ndarray +~~~~~~~~~~~~~ + +It's calling series... a lot! It's creating a Series from each row, and get-ting from both +the index and the series (three times for each row). Function calls are expensive +in python, so maybe we could minimise these by cythonizing the apply part. + +.. note:: + + We are now passing ndarrays into the cython function, fortunately cython plays + very nicely with numpy. + +.. ipython:: + + In [4]: %%cython + ...: cimport numpy as np + ...: import numpy as np + ...: cdef double f_typed(double x) except? -2: + ...: return x * (x - 1) + ...: cpdef double integrate_f_typed(double a, double b, int N): + ...: cdef int i + ...: cdef double s, dx + ...: s = 0 + ...: dx = (b - a) / N + ...: for i in range(N): + ...: s += f_typed(a + i * dx) + ...: return s * dx + ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, np.ndarray col_N): + ...: assert (col_a.dtype == np.float and col_b.dtype == np.float and col_N.dtype == np.int) + ...: cdef Py_ssize_t i, n = len(col_N) + ...: assert (len(col_a) == len(col_b) == n) + ...: cdef np.ndarray[double] res = np.empty(n) + ...: for i in range(len(col_a)): + ...: res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i]) + ...: return res + ...: + + +The implementation is simple, it creates an array of zeros and loops over +the rows, applying our ``integrate_f_typed``, and putting this in the zeros array. + + +.. warning:: + + In 0.13.0 since ``Series`` has internaly been refactored to no longer sub-class ``ndarray`` + but instead subclass ``NDFrame``, you can **not pass** a ``Series`` directly as a ``ndarray`` typed parameter + to a cython function. Instead pass the actual ``ndarray`` using the ``.values`` attribute of the Series. + + Prior to 0.13.0 + + .. code-block:: python + + apply_integrate_f(df['a'], df['b'], df['N']) + + Use ``.values`` to get the underlying ``ndarray`` + + .. code-block:: python + + apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + +.. note:: + + Loops like this would be *extremely* slow in python, but in Cython looping + over numpy arrays is *fast*. + +.. ipython:: python + + %timeit apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + +We've gotten another big improvement. Let's check again where the time is spent: + +.. ipython:: python + + %prun -l 4 apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + +As one might expect, the majority of the time is now spent in ``apply_integrate_f``, +so if we wanted to make anymore efficiencies we must continue to concentrate our +efforts here. + +.. _enhancingperf.boundswrap: + +More advanced techniques +~~~~~~~~~~~~~~~~~~~~~~~~ + +There is still scope for improvement, here's an example of using some more +advanced cython techniques: + +.. ipython:: + + In [5]: %%cython + ...: cimport cython + ...: cimport numpy as np + ...: import numpy as np + ...: cdef double f_typed(double x) except? 
-2: + ...: return x * (x - 1) + ...: cpdef double integrate_f_typed(double a, double b, int N): + ...: cdef int i + ...: cdef double s, dx + ...: s = 0 + ...: dx = (b - a) / N + ...: for i in range(N): + ...: s += f_typed(a + i * dx) + ...: return s * dx + ...: @cython.boundscheck(False) + ...: @cython.wraparound(False) + ...: cpdef np.ndarray[double] apply_integrate_f_wrap(np.ndarray[double] col_a, np.ndarray[double] col_b, np.ndarray[Py_ssize_t] col_N): + ...: cdef Py_ssize_t i, n = len(col_N) + ...: assert len(col_a) == len(col_b) == n + ...: cdef np.ndarray[double] res = np.empty(n) + ...: for i in range(n): + ...: res[i] = integrate_f_typed(col_a[i], col_b[i], col_N[i]) + ...: return res + ...: + +.. ipython:: python + + %timeit apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values) + +Even faster, with the caveat that a bug in our cython code (an off-by-one error, +for example) might cause a segfault because memory access isn't checked. + + +Further topics +~~~~~~~~~~~~~~ + +- Loading C modules into cython. + +Read more in the `cython docs `__. + +.. _enhancingperf.eval: + +Expression Evaluation via :func:`~pandas.eval` (Experimental) +------------------------------------------------------------- + +.. versionadded:: 0.13 + +The top-level function :func:`pandas.eval` implements expression evaluation of +:class:`~pandas.Series` and :class:`~pandas.DataFrame` objects. + +.. note:: + + To benefit from using :func:`~pandas.eval` you need to + install ``numexpr``. See the :ref:`recommended dependencies section + ` for more details. + +The point of using :func:`~pandas.eval` for expression evaluation rather than +plain Python is two-fold: 1) large :class:`~pandas.DataFrame` objects are +evaluated more efficiently and 2) large arithmetic and boolean expressions are +evaluated all at once by the underlying engine (by default ``numexpr`` is used +for evaluation). + +.. note:: + + You should not use :func:`~pandas.eval` for simple + expressions or for expressions involving small DataFrames. In fact, + :func:`~pandas.eval` is many orders of magnitude slower for + smaller expressions/objects than plain ol' Python. A good rule of thumb is + to only use :func:`~pandas.eval` when you have a + :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows. + + +:func:`~pandas.eval` supports all arithmetic expressions supported by the +engine in addition to some extensions available only in pandas. + +.. note:: + + The larger the frame and the larger the expression the more speedup you will + see from using :func:`~pandas.eval`. 
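+
+As a rough orientation before the details below, a call might look like the
+following sketch (``df1`` and ``df2`` stand in for any two numeric DataFrames;
+the keyword values shown are simply the defaults):
+
+.. code-block:: python
+
+   # the expression is passed as a string; the frames are looked up by name
+   # in the calling namespace, and numexpr performs the evaluation by default
+   result = pd.eval('df1 + df2 * 2', engine='numexpr', parser='pandas')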
+ +Supported Syntax +~~~~~~~~~~~~~~~~ + +These operations are supported by :func:`pandas.eval`: + +- Arithmetic operations except for the left shift (``<<``) and right shift + (``>>``) operators, e.g., ``df + 2 * pi / s ** 4 % 42 - the_golden_ratio`` +- Comparison operations, including chained comparisons, e.g., ``2 < df < df2`` +- Boolean operations, e.g., ``df < df2 and df3 < df4 or not df_bool`` +- ``list`` and ``tuple`` literals, e.g., ``[1, 2]`` or ``(1, 2)`` +- Attribute access, e.g., ``df.a`` +- Subscript expressions, e.g., ``df[0]`` +- Simple variable evaluation, e.g., ``pd.eval('df')`` (this is not very useful) + +This Python syntax is **not** allowed: + +* Expressions + + - Function calls + - ``is``/``is not`` operations + - ``if`` expressions + - ``lambda`` expressions + - ``list``/``set``/``dict`` comprehensions + - Literal ``dict`` and ``set`` expressions + - ``yield`` expressions + - Generator expressions + - Boolean expressions consisting of only scalar values + +* Statements + + - Neither `simple `__ + nor `compound `__ + statements are allowed. This includes things like ``for``, ``while``, and + ``if``. + + + +:func:`~pandas.eval` Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`pandas.eval` works well with expressions containing large arrays + +First let's create a few decent-sized arrays to play with: + +.. ipython:: python + + import pandas as pd + from pandas import DataFrame, Series + from numpy.random import randn + import numpy as np + nrows, ncols = 20000, 100 + df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols)) for _ in range(4)] + + +Now let's compare adding them together using plain ol' Python versus +:func:`~pandas.eval`: + + +.. ipython:: python + + %timeit df1 + df2 + df3 + df4 + +.. ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4') + + +Now let's do the same thing but with comparisons: + +.. ipython:: python + + %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0) + +.. ipython:: python + + %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') + + +:func:`~pandas.eval` also works with unaligned pandas objects: + + +.. ipython:: python + + s = Series(randn(50)) + %timeit df1 + df2 + df3 + df4 + s + +.. ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4 + s') + +.. note:: + + Operations such as + + .. code-block:: python + + 1 and 2 # would parse to 1 & 2, but should evaluate to 2 + 3 or 4 # would parse to 3 | 4, but should evaluate to 3 + ~1 # this is okay, but slower when using eval + + should be performed in Python. An exception will be raised if you try to + perform any boolean/bitwise operations with scalar operands that are not + of type ``bool`` or ``np.bool_``. Again, you should perform these kinds of + operations in plain Python. + +The ``DataFrame.eval`` method (Experimental) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.13 + +In addition to the top level :func:`pandas.eval` function you can also +evaluate an expression in the "context" of a :class:`~pandas.DataFrame`. + +.. ipython:: python + :suppress: + + try: + del a + except NameError: + pass + + try: + del b + except NameError: + pass + +.. ipython:: python + + df = DataFrame(randn(5, 2), columns=['a', 'b']) + df.eval('a + b') + +Any expression that is a valid :func:`pandas.eval` expression is also a valid +:meth:`DataFrame.eval` expression, with the added benefit that you don't have to +prefix the name of the :class:`~pandas.DataFrame` to the column(s) you're +interested in evaluating. 
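+
+For instance, the following sketch spells the same computation both ways
+(``df`` here is just a small throwaway frame, mirroring the one constructed
+above, built only for illustration):
+
+.. code-block:: python
+
+   df = DataFrame(randn(5, 2), columns=['a', 'b'])
+
+   # top-level function: the frame must be referred to by name
+   pd.eval('df.a + df.b')
+
+   # method form: column names can be used directly
+   df.eval('a + b')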
+ +In addition, you can perform assignment of columns within an expression. +This allows for *formulaic evaluation*. Only a single assignment is permitted. +The assignment target can be a new column name or an existing column name, and +it must be a valid Python identifier. + +.. ipython:: python + + df = DataFrame(dict(a=range(5), b=range(5, 10))) + df.eval('c = a + b') + df.eval('d = a + b + c') + df.eval('a = 1') + df + +The equivalent in standard Python would be + +.. ipython:: python + + df = DataFrame(dict(a=range(5), b=range(5, 10))) + df['c'] = df.a + df.b + df['d'] = df.a + df.b + df.c + df['a'] = 1 + df + +Local Variables +~~~~~~~~~~~~~~~ + +In pandas version 0.14 the local variable API has changed. In pandas 0.13.x, +you could refer to local variables the same way you would in standard Python. +For example, + +.. code-block:: python + + df = DataFrame(randn(5, 2), columns=['a', 'b']) + newcol = randn(len(df)) + df.eval('b + newcol') + + UndefinedVariableError: name 'newcol' is not defined + +As you can see from the exception generated, this syntax is no longer allowed. +You must *explicitly reference* any local variable that you want to use in an +expression by placing the ``@`` character in front of the name. For example, + +.. ipython:: python + + df = DataFrame(randn(5, 2), columns=list('ab')) + newcol = randn(len(df)) + df.eval('b + @newcol') + df.query('b < @newcol') + +If you don't prefix the local variable with ``@``, pandas will raise an +exception telling you the variable is undefined. + +When using :meth:`DataFrame.eval` and :meth:`DataFrame.query`, this allows you +to have a local variable and a :class:`~pandas.DataFrame` column with the same +name in an expression. + + +.. ipython:: python + + a = randn() + df.query('@a < a') + df.loc[a < df.a] # same as the previous expression + +With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it +isn't defined in that context. ``pandas`` will let you know this if you try to +use ``@`` in a top-level call to :func:`pandas.eval`. For example, + +.. ipython:: python + :okexcept: + + a, b = 1, 2 + pd.eval('@a + b') + +In this case, you should simply refer to the variables like you would in +standard Python. + +.. ipython:: python + + pd.eval('a + b') + + +:func:`pandas.eval` Parsers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are two different parsers and and two different engines you can use as +the backend. + +The default ``'pandas'`` parser allows a more intuitive syntax for expressing +query-like operations (comparisons, conjunctions and disjunctions). In +particular, the precedence of the ``&`` and ``|`` operators is made equal to +the precedence of the corresponding boolean operations ``and`` and ``or``. + +For example, the above conjunction can be written without parentheses. +Alternatively, you can use the ``'python'`` parser to enforce strict Python +semantics. + +.. ipython:: python + + expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' + x = pd.eval(expr, parser='python') + expr_no_parens = 'df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0' + y = pd.eval(expr_no_parens, parser='pandas') + np.all(x == y) + + +The same expression can be "anded" together with the word :keyword:`and` as +well: + +.. 
ipython:: python + + expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' + x = pd.eval(expr, parser='python') + expr_with_ands = 'df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0' + y = pd.eval(expr_with_ands, parser='pandas') + np.all(x == y) + + +The ``and`` and ``or`` operators here have the same precedence that they would +in vanilla Python. + + +:func:`pandas.eval` Backends +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There's also the option to make :func:`~pandas.eval` operate identical to plain +ol' Python. + +.. note:: + + Using the ``'python'`` engine is generally *not* useful, except for testing + other evaluation engines against it. You will acheive **no** performance + benefits using :func:`~pandas.eval` with ``engine='python'`` and in fact may + incur a performance hit. + +You can see this by using :func:`pandas.eval` with the ``'python'`` engine. It +is a bit slower (not by much) than evaluating the same expression in Python + +.. ipython:: python + + %timeit df1 + df2 + df3 + df4 + +.. ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') + + +:func:`pandas.eval` Performance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~pandas.eval` is intended to speed up certain kinds of operations. In +particular, those operations involving complex expressions with large +:class:`~pandas.DataFrame`/:class:`~pandas.Series` objects should see a +significant performance benefit. Here is a plot showing the running time of +:func:`pandas.eval` as function of the size of the frame involved in the +computation. The two lines are two different engines. + + +.. image:: _static/eval-perf.png + + +.. note:: + + Operations with smallish objects (around 15k-20k rows) are faster using + plain Python: + + .. image:: _static/eval-perf-small.png + + +This plot was created using a ``DataFrame`` with 3 columns each containing +floating point values generated using ``numpy.random.randn()``. + +Technical Minutia Regarding Expression Evaluation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Expressions that would result in an object dtype or involve datetime operations +(because of ``NaT``) must be evaluated in Python space. The main reason for +this behavior is to maintain backwards compatbility with versions of numpy < +1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` will +truncate any strings that are more than 60 characters in length. Second, we +can't pass ``object`` arrays to ``numexpr`` thus string comparisons must be +evaluated in Python space. + +The upshot is that this *only* applies to object-dtype'd expressions. So, if +you have an expression--for example + +.. ipython:: python + + df = DataFrame({'strings': np.repeat(list('cba'), 3), + 'nums': np.repeat(range(3), 3)}) + df + df.query('strings == "a" and nums == 1') + +the numeric part of the comparison (``nums == 1``) will be evaluated by +``numexpr``. + +In general, :meth:`DataFrame.query`/:func:`pandas.eval` will +evaluate the subexpressions that *can* be evaluated by ``numexpr`` and those +that must be evaluated in Python space transparently to the user. This is done +by inferring the result type of an expression from its arguments and operators. diff --git a/doc/source/faq.rst b/doc/source/faq.rst new file mode 100644 index 00000000..81bebab4 --- /dev/null +++ b/doc/source/faq.rst @@ -0,0 +1,301 @@ +.. currentmodule:: pandas +.. _faq: + +******************************** +Frequently Asked Questions (FAQ) +******************************** + +.. 
ipython:: python + :suppress: + + from datetime import datetime + import numpy as np + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + from dateutil.relativedelta import relativedelta + from pandas.tseries.api import * + from pandas.tseries.offsets import * + import matplotlib.pyplot as plt + plt.close('all') + options.display.mpl_style='default' + from pandas.compat import lrange + +.. _ref-monkey-patching: + +Adding Features to your pandas Installation +------------------------------------------- + +pandas is a powerful tool and already has a plethora of data manipulation +operations implemented, most of them are very fast as well. +It's very possible however that certain functionality that would make your +life easier is missing. In that case you have several options: + +1) Open an issue on `Github `__ , explain your need and the sort of functionality you would like to see implemented. +2) Fork the repo, Implement the functionality yourself and open a PR + on Github. +3) Write a method that performs the operation you are interested in and + Monkey-patch the pandas class as part of your IPython profile startup + or PYTHONSTARTUP file. + + For example, here is an example of adding an ``just_foo_cols()`` + method to the dataframe class: + +:: + + import pandas as pd + def just_foo_cols(self): + """Get a list of column names containing the string 'foo' + + """ + return [x for x in self.columns if 'foo' in x] + + pd.DataFrame.just_foo_cols = just_foo_cols # monkey-patch the DataFrame class + df = pd.DataFrame([list(range(4))], columns=["A","foo","foozball","bar"]) + df.just_foo_cols() + del pd.DataFrame.just_foo_cols # you can also remove the new method + + +Monkey-patching is usually frowned upon because it makes your code +less portable and can cause subtle bugs in some circumstances. +Monkey-patching existing methods is usually a bad idea in that respect. +When used with proper care, however, it's a very useful tool to have. + + +.. _ref-scikits-migration: + +Migrating from scikits.timeseries to pandas >= 0.8.0 +---------------------------------------------------- + +Starting with pandas 0.8.0, users of scikits.timeseries should have all of the +features that they need to migrate their code to use pandas. Portions of the +scikits.timeseries codebase for implementing calendar logic and timespan +frequency conversions (but **not** resampling, that has all been implemented +from scratch from the ground up) have been ported to the pandas codebase. + +The scikits.timeseries notions of ``Date`` and ``DateArray`` are responsible +for implementing calendar logic: + +:: + + In [16]: dt = ts.Date('Q', '1984Q3') + + # sic + In [17]: dt + Out[17]: + + In [18]: dt.asfreq('D', 'start') + Out[18]: + + In [19]: dt.asfreq('D', 'end') + Out[19]: + + In [20]: dt + 3 + Out[20]: + +``Date`` and ``DateArray`` from scikits.timeseries have been reincarnated in +pandas ``Period`` and ``PeriodIndex``: + +.. ipython:: python + + pnow('D') # scikits.timeseries.now() + Period(year=2007, month=3, day=15, freq='D') + p = Period('1984Q3') + p + p.asfreq('D', 'start') + p.asfreq('D', 'end') + (p + 3).asfreq('T') + 6 * 60 + 30 + rng = period_range('1990', '2010', freq='A') + rng + rng.asfreq('B', 'end') - 3 + +.. 
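note::
+
+   When migrating code, it is often necessary to move between the period-based
+   and timestamp-based representations.  The ``to_timestamp`` and ``to_period``
+   instance methods (mentioned again below) do this; here is a minimal,
+   self-contained sketch:
+
+   .. code-block:: python
+
+      import numpy as np
+      import pandas as pd
+
+      # a quarterly PeriodIndex, roughly analogous to a scikits.timeseries DateArray
+      prng = pd.period_range('1990Q1', '1991Q4', freq='Q-DEC')
+      ps = pd.Series(np.random.randn(len(prng)), index=prng)
+
+      # convert to a DatetimeIndex anchored at the start of each quarter ...
+      ts = ps.to_timestamp(how='start')
+
+      # ... and back to a PeriodIndex
+      ps2 = ts.to_period(freq='Q-DEC')
+
+.. 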
csv-table:: + :header: "scikits.timeseries", "pandas", "Notes" + :widths: 20, 20, 60 + + Date, Period, "A span of time, from yearly through to secondly" + DateArray, PeriodIndex, "An array of timespans" + convert, resample, "Frequency conversion in scikits.timeseries" + convert_to_annual, pivot_annual, "currently supports up to daily frequency, see :issue:`736`" + + +PeriodIndex / DateArray properties and functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The scikits.timeseries ``DateArray`` had a number of information +properties. Here are the pandas equivalents: + +.. csv-table:: + :header: "scikits.timeseries", "pandas", "Notes" + :widths: 20, 60, 20 + + get_steps, ``np.diff(idx.values)``, + has_missing_dates, ``not idx.is_full``, + is_full, ``idx.is_full``, + is_valid, ``idx.is_monotonic and idx.is_unique``, + is_chronological, ``is_monotonic``, + ``arr.sort_chronologically()``, ``idx.order()``, + +Frequency conversion +~~~~~~~~~~~~~~~~~~~~ + +Frequency conversion is implemented using the ``resample`` method on TimeSeries +and DataFrame objects (multiple time series). ``resample`` also works on panels +(3D). Here is some code that resamples daily data to montly: + +.. ipython:: python + + rng = period_range('Jan-2000', periods=50, freq='M') + data = Series(np.random.randn(50), index=rng) + data + data.resample('A', how=np.mean) + +Plotting +~~~~~~~~ + +Much of the plotting functionality of scikits.timeseries has been ported and +adopted to pandas's data structures. For example: + +.. ipython:: python + + rng = period_range('1987Q2', periods=10, freq='Q-DEC') + data = Series(np.random.randn(10), index=rng) + + @savefig skts_ts_plot.png + plt.figure(); data.plot() + +Converting to and from period format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the ``to_timestamp`` and ``to_period`` instance methods. + +Treatment of missing data +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Unlike scikits.timeseries, pandas data structures are not based on NumPy's +``MaskedArray`` object. Missing data is represented as ``NaN`` in numerical +arrays and either as ``None`` or ``NaN`` in non-numerical arrays. Implementing +a version of pandas's data structures that use MaskedArray is possible but +would require the involvement of a dedicated maintainer. Active pandas +developers are not interested in this. + +Resampling with timestamps and periods +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``resample`` has a ``kind`` argument which allows you to resample time series +with a DatetimeIndex to PeriodIndex: + +.. ipython:: python + + rng = date_range('1/1/2000', periods=200, freq='D') + data = Series(np.random.randn(200), index=rng) + data[:10] + data.index + data.resample('M', kind='period') + +Similarly, resampling from periods to timestamps is possible with an optional +interval (``'start'`` or ``'end'``) convention: + +.. ipython:: python + + rng = period_range('Jan-2000', periods=50, freq='M') + data = Series(np.random.randn(50), index=rng) + resampled = data.resample('A', kind='timestamp', convention='end') + resampled.index + + +Byte-Ordering Issues +-------------------- +Occasionally you may have to deal with data that were created on a machine with +a different byte order than the one on which you are running Python. To deal +with this issue you should convert the underlying NumPy array to the native +system byte order *before* passing it to Series/DataFrame/Panel constructors +using something similar to the following: + +.. 
ipython:: python + + x = np.array(list(range(10)), '>i4') # big endian + newx = x.byteswap().newbyteorder() # force native byteorder + s = Series(newx) + +See `the NumPy documentation on byte order +`__ for more +details. + + +Visualizing Data in Qt applications +----------------------------------- + +There is experimental support for visualizing DataFrames in PyQt4 and PySide +applications. At the moment you can display and edit the values of the cells +in the DataFrame. Qt will take care of displaying just the portion of the +DataFrame that is currently visible and the edits will be immediately saved to +the underlying DataFrame + +To demonstrate this we will create a simple PySide application that will switch +between two editable DataFrames. For this will use the ``DataFrameModel`` class +that handles the access to the DataFrame, and the ``DataFrameWidget``, which is +just a thin layer around the ``QTableView``. + +.. code-block:: python + + import numpy as np + import pandas as pd + from pandas.sandbox.qtpandas import DataFrameModel, DataFrameWidget + from PySide import QtGui, QtCore + + # Or if you use PyQt4: + # from PyQt4 import QtGui, QtCore + + class MainWidget(QtGui.QWidget): + def __init__(self, parent=None): + super(MainWidget, self).__init__(parent) + + # Create two DataFrames + self.df1 = pd.DataFrame(np.arange(9).reshape(3, 3), + columns=['foo', 'bar', 'baz']) + self.df2 = pd.DataFrame({ + 'int': [1, 2, 3], + 'float': [1.5, 2.5, 3.5], + 'string': ['a', 'b', 'c'], + 'nan': [np.nan, np.nan, np.nan] + }, index=['AAA', 'BBB', 'CCC'], + columns=['int', 'float', 'string', 'nan']) + + # Create the widget and set the first DataFrame + self.widget = DataFrameWidget(self.df1) + + # Create the buttons for changing DataFrames + self.button_first = QtGui.QPushButton('First') + self.button_first.clicked.connect(self.on_first_click) + self.button_second = QtGui.QPushButton('Second') + self.button_second.clicked.connect(self.on_second_click) + + # Set the layout + vbox = QtGui.QVBoxLayout() + vbox.addWidget(self.widget) + hbox = QtGui.QHBoxLayout() + hbox.addWidget(self.button_first) + hbox.addWidget(self.button_second) + vbox.addLayout(hbox) + self.setLayout(vbox) + + def on_first_click(self): + '''Sets the first DataFrame''' + self.widget.setDataFrame(self.df1) + + def on_second_click(self): + '''Sets the second DataFrame''' + self.widget.setDataFrame(self.df2) + + if __name__ == '__main__': + import sys + + # Initialize the application + app = QtGui.QApplication(sys.argv) + mw = MainWidget() + mw.show() + app.exec_() diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst new file mode 100644 index 00000000..0078ffb5 --- /dev/null +++ b/doc/source/gotchas.rst @@ -0,0 +1,581 @@ +.. currentmodule:: pandas +.. _gotchas: + +.. ipython:: python + :suppress: + + import os + import numpy as np + from pandas import * + options.display.max_rows=15 + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + from pandas.compat import lrange + +******************* +Caveats and Gotchas +******************* + +.. _gotchas.truth: + +Using If/Truth Statements with pandas +------------------------------------- + +pandas follows the numpy convention of raising an error when you try to convert something to a ``bool``. +This happens in a ``if`` or when using the boolean operations, ``and``, ``or``, or ``not``. It is not clear +what the result of + +.. code-block:: python + + >>> if Series([False, True, False]): + ... + +should be. Should it be ``True`` because it's not zero-length? 
``False`` because there are ``False`` values? +It is unclear, so instead, pandas raises a ``ValueError``: + +.. code-block:: python + + >>> if pd.Series([False, True, False]): + print("I was true") + Traceback + ... + ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). + + +If you see that, you need to explicitly choose what you want to do with it +(e.g., use ``any()``, ``all()`` or ``empty``). Alternatively, you might want to +check whether the pandas object is ``None``: + +.. code-block:: python + + >>> if pd.Series([False, True, False]) is not None: + print("I was not None") + I was not None + + +or check whether ``any`` value is ``True``: + +.. code-block:: python + + >>> if pd.Series([False, True, False]).any(): + print("I am any") + I am any + +To evaluate single-element pandas objects in a boolean context, use the method ``.bool()``: + +.. ipython:: python + + Series([True]).bool() + Series([False]).bool() + DataFrame([[True]]).bool() + DataFrame([[False]]).bool() +
+Bitwise boolean +~~~~~~~~~~~~~~~ + +Comparison operators like ``==`` and ``!=`` return a boolean ``Series``, +which is almost always what you want anyway. + +.. code-block:: python + + >>> s = pd.Series(range(5)) + >>> s == 4 + 0 False + 1 False + 2 False + 3 False + 4 True + dtype: bool + +See :ref:`boolean comparisons` for more examples. +
+Using the ``in`` operator +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using the Python ``in`` operator on a Series tests for membership in the +index, not membership among the values. + +.. ipython:: + + s = pd.Series(range(5), index=list('abcde')) + 2 in s + 'b' in s + +If this behavior is surprising, keep in mind that using ``in`` on a Python +dictionary tests keys, not values, and Series are dict-like. +To test for membership in the values, use the method :func:`~pandas.Series.isin`: + +.. ipython:: + + s.isin([2]) + s.isin([2]).any() + +For DataFrames, likewise, ``in`` applies to the column axis, +testing for membership in the list of column names. +
+``NaN``, Integer ``NA`` values and ``NA`` type promotions +---------------------------------------------------------- + +Choice of ``NA`` representation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For lack of ``NA`` (missing) support from the ground up in NumPy and Python in +general, we were given the difficult choice between either + +- A *masked array* solution: an array of data and an array of boolean values + indicating whether a value is present or missing +- Using a special sentinel value, bit pattern, or set of sentinel values to + denote ``NA`` across the dtypes + +For many reasons we chose the latter. After years of production use it has +proven, at least in my opinion, to be the best decision given the state of +affairs in NumPy and Python in general. The special value ``NaN`` +(Not-A-Number) is used everywhere as the ``NA`` value, and there are API +functions ``isnull`` and ``notnull`` which can be used across the dtypes to +detect NA values. + +However, it comes with a couple of trade-offs which I most certainly have +not ignored. + +.. _gotchas.intna: + +Support for integer ``NA`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In the absence of high performance ``NA`` support being built into NumPy from +the ground up, the primary casualty is the ability to represent NAs in integer +arrays. For example: + +.. 
ipython:: python + + s = Series([1, 2, 3, 4, 5], index=list('abcde')) + s + s.dtype + + s2 = s.reindex(['a', 'b', 'c', 'f', 'u']) + s2 + s2.dtype + +This trade-off is made largely for memory and performance reasons, and also so +that the resulting Series continues to be "numeric". One possibility is to use +``dtype=object`` arrays instead. + +``NA`` type promotions +~~~~~~~~~~~~~~~~~~~~~~ + +When introducing NAs into an existing Series or DataFrame via ``reindex`` or +some other means, boolean and integer types will be promoted to a different +dtype in order to store the NAs. These are summarized by this table: + +.. csv-table:: + :header: "Typeclass","Promotion dtype for storing NAs" + :widths: 40,60 + + ``floating``, no change + ``object``, no change + ``integer``, cast to ``float64`` + ``boolean``, cast to ``object`` + +While this may seem like a heavy trade-off, in practice I have found very few +cases where this is an issue in practice. Some explanation for the motivation +here in the next section. + +Why not make NumPy like R? +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many people have suggested that NumPy should simply emulate the ``NA`` support +present in the more domain-specific statistical programming langauge `R +`__. Part of the reason is the NumPy type hierarchy: + +.. csv-table:: + :header: "Typeclass","Dtypes" + :widths: 30,70 + :delim: | + + ``numpy.floating`` | ``float16, float32, float64, float128`` + ``numpy.integer`` | ``int8, int16, int32, int64`` + ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64`` + ``numpy.object_`` | ``object_`` + ``numpy.bool_`` | ``bool_`` + ``numpy.character`` | ``string_, unicode_`` + +The R language, by contrast, only has a handful of built-in data types: +``integer``, ``numeric`` (floating-point), ``character``, and +``boolean``. ``NA`` types are implemented by reserving special bit patterns for +each type to be used as the missing value. While doing this with the full NumPy +type hierarchy would be possible, it would be a more substantial trade-off +(especially for the 8- and 16-bit data types) and implementation undertaking. + +An alternate approach is that of using masked arrays. A masked array is an +array of data with an associated boolean *mask* denoting whether each value +should be considered ``NA`` or not. I am personally not in love with this +approach as I feel that overall it places a fairly heavy burden on the user and +the library implementer. Additionally, it exacts a fairly high performance cost +when working with numerical data compared with the simple approach of using +``NaN``. Thus, I have chosen the Pythonic "practicality beats purity" approach +and traded integer ``NA`` capability for a much simpler approach of using a +special value in float and object arrays to denote ``NA``, and promoting +integer arrays to floating when NAs must be introduced. + +Integer indexing +---------------- + +Label-based indexing with integer axis labels is a thorny topic. It has been +discussed heavily on mailing lists and among various members of the scientific +Python community. In pandas, our general viewpoint is that labels matter more +than integer locations. Therefore, with an integer axis index *only* +label-based indexing is possible with the standard tools like ``.ix``. The +following code will generate exceptions: + +.. 
code-block:: python + + s = Series(range(5)) + s[-1] + df = DataFrame(np.random.randn(5, 4)) + df + df.ix[-2:] + +This deliberate decision was made to prevent ambiguities and subtle bugs (many +users reported finding bugs when the API change was made to stop "falling back" +on position-based indexing). + +
+Label-based slicing conventions +------------------------------- + +Non-monotonic indexes require exact matches +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Endpoints are inclusive +~~~~~~~~~~~~~~~~~~~~~~~ + +Compared with standard Python sequence slicing in which the slice endpoint is +not inclusive, label-based slicing in pandas **is inclusive**. The primary +reason for this is that it is often not possible to easily determine the +"successor" or next element after a particular label in an index. For example, +consider the following Series: + +.. ipython:: python + + s = Series(randn(6), index=list('abcdef')) + s + +Suppose we wished to slice from ``c`` to ``e``; using integers, this would be + +.. ipython:: python + + s[2:5] + +However, if you only had ``c`` and ``e``, determining the next element in the +index can be somewhat complicated. For example, the following does not work: + +:: + + s.ix['c':'e'+1] + +A very common use case is to limit a time series to start and end at two +specific dates. To enable this, we made the design decision to make label-based +slicing include both endpoints: + +.. ipython:: python + + s.ix['c':'e'] + +This is most definitely a "practicality beats purity" sort of thing, but it is +something to watch out for if you expect label-based slicing to behave exactly +in the way that standard Python integer slicing works. +
+Miscellaneous indexing gotchas +------------------------------ + +Reindex versus ix gotchas +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many users will find themselves using the ``ix`` indexing capabilities as a +concise means of selecting data from a pandas object: + +.. ipython:: python + + df = DataFrame(randn(6, 4), columns=['one', 'two', 'three', 'four'], + index=list('abcdef')) + df + df.ix[['b', 'c', 'e']] + +This is, of course, completely equivalent *in this case* to using the +``reindex`` method: + +.. ipython:: python + + df.reindex(['b', 'c', 'e']) + +Some might conclude that ``ix`` and ``reindex`` are 100% equivalent based on +this. This is indeed true **except in the case of integer indexing**. For +example, the above operation could alternately have been expressed as: + +.. ipython:: python + + df.ix[[1, 2, 4]] + +If you pass ``[1, 2, 4]`` to ``reindex`` you will get another thing entirely: + +.. ipython:: python + + df.reindex([1, 2, 4]) + +So it's important to remember that ``reindex`` is **strict label indexing +only**. This can lead to some potentially surprising results in pathological +cases where an index contains, say, both integers and strings: + +.. ipython:: python + + s = Series([1, 2, 3], index=['a', 0, 1]) + s + s.ix[[0, 1]] + s.reindex([0, 1]) + +Because the index in this case does not contain solely integers, ``ix`` falls +back on integer indexing. By contrast, ``reindex`` only looks for the values +passed in the index, thus finding the integers ``0`` and ``1``. While it would +be possible to insert some logic to check whether a passed sequence is all +contained in the index, that logic would exact a very high cost in large data +sets. +
+Reindex potentially changes underlying Series dtype +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The use of ``reindex_like`` can potentially change the dtype of a ``Series``. 
+ +.. code-block:: python + + series = pandas.Series([1, 2, 3]) + x = pandas.Series([True]) + x.dtype + x = pandas.Series([True]).reindex_like(series) + x.dtype + +This is because ``reindex_like`` silently inserts ``NaNs`` and the ``dtype`` +changes accordingly. This can cause some issues when using ``numpy`` ``ufuncs`` +such as ``numpy.logical_and``. + +See the `this old issue `__ for a more +detailed discussion. + +Timestamp limitations +--------------------- + +Minimum and maximum timestamps +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since pandas represents timestamps in nanosecond resolution, the timespan that +can be represented using a 64-bit integer is limited to approximately 584 years: + +.. ipython:: python + + begin = Timestamp.min + begin + + end = Timestamp.max + end + +If you need to represent time series data outside the nanosecond timespan, use +PeriodIndex: + +.. ipython:: python + + span = period_range('1215-01-01', '1381-01-01', freq='D') + span + + +Parsing Dates from Text Files +----------------------------- + +When parsing multiple text file columns into a single date column, the new date +column is prepended to the data and then `index_col` specification is indexed off +of the new set of columns rather than the original ones: + +.. ipython:: python + :suppress: + + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print(open('tmp.csv').read()) + + date_spec = {'nominal': [1, 2], 'actual': [1, 3]} + df = read_csv('tmp.csv', header=None, + parse_dates=date_spec, + keep_date_col=True, + index_col=0) + + # index_col=0 refers to the combined column "nominal" and not the original + # first column of 'KORD' strings + + df + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + + +Differences with NumPy +---------------------- +For Series and DataFrame objects, ``var`` normalizes by ``N-1`` to produce +unbiased estimates of the sample variance, while NumPy's ``var`` normalizes +by N, which measures the variance of the sample. Note that ``cov`` +normalizes by ``N-1`` in both pandas and NumPy. + + +Thread-safety +------------- + +As of pandas 0.11, pandas is not 100% thread safe. The known issues relate to +the ``DataFrame.copy`` method. If you are doing a lot of copying of DataFrame +objects shared among threads, we recommend holding locks inside the threads +where the data copying occurs. + +See `this link `__ +for more information. + +.. _html-gotchas: + +HTML Table Parsing +------------------ +There are some versioning issues surrounding the libraries that are used to +parse HTML tables in the top-level pandas io function ``read_html``. + +**Issues with** |lxml|_ + + * Benefits + + * |lxml|_ is very fast + + * |lxml|_ requires Cython to install correctly. + + * Drawbacks + + * |lxml|_ does *not* make any guarantees about the results of it's parse + *unless* it is given |svm|_. + + * In light of the above, we have chosen to allow you, the user, to use the + |lxml|_ backend, but **this backend will use** |html5lib|_ if |lxml|_ + fails to parse + + * It is therefore *highly recommended* that you install both + |BeautifulSoup4|_ and |html5lib|_, so that you will still get a valid + result (provided everything else is valid) even if |lxml|_ fails. 
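+
+If both kinds of backends are installed, the parser that :func:`~pandas.io.html.read_html`
+uses can be selected explicitly with the ``flavor`` keyword.  A rough sketch (the
+URL below is a hypothetical stand-in for a page that actually contains an HTML table):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   url = 'http://www.example.com/tables.html'  # hypothetical page with a <table>
+
+   # try the fast lxml backend first, falling back to the more lenient
+   # BeautifulSoup4/html5lib backend if lxml cannot parse the page
+   try:
+       tables = pd.read_html(url, flavor='lxml')
+   except Exception:
+       tables = pd.read_html(url, flavor='bs4')
+
+   df = tables[0]  # read_html returns a list of DataFrames, one per table found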
+ +**Issues with** |BeautifulSoup4|_ **using** |lxml|_ **as a backend** + + * The above issues hold here as well since |BeautifulSoup4|_ is essentially + just a wrapper around a parser backend. + +**Issues with** |BeautifulSoup4|_ **using** |html5lib|_ **as a backend** + + * Benefits + + * |html5lib|_ is far more lenient than |lxml|_ and consequently deals + with *real-life markup* in a much saner way rather than just, e.g., + dropping an element without notifying you. + + * |html5lib|_ *generates valid HTML5 markup from invalid markup + automatically*. This is extremely important for parsing HTML tables, + since it guarantees a valid document. However, that does NOT mean that + it is "correct", since the process of fixing markup does not have a + single definition. + + * |html5lib|_ is pure Python and requires no additional build steps beyond + its own installation. + + * Drawbacks + + * The biggest drawback to using |html5lib|_ is that it is slow as + molasses. However consider the fact that many tables on the web are not + big enough for the parsing algorithm runtime to matter. It is more + likely that the bottleneck will be in the process of reading the raw + text from the url over the web, i.e., IO (input-output). For very large + tables, this might not be true. + +**Issues with using** |Anaconda|_ + + * `Anaconda`_ ships with `lxml`_ version 3.2.0; the following workaround for + `Anaconda`_ was successfully used to deal with the versioning issues + surrounding `lxml`_ and `BeautifulSoup4`_. + + .. note:: + + Unless you have *both*: + + * A strong restriction on the upper bound of the runtime of some code + that incorporates :func:`~pandas.io.html.read_html` + * Complete knowledge that the HTML you will be parsing will be 100% + valid at all times + + then you should install `html5lib`_ and things will work swimmingly + without you having to muck around with `conda`. If you want the best of + both worlds then install both `html5lib`_ and `lxml`_. If you do install + `lxml`_ then you need to perform the following commands to ensure that + lxml will work correctly: + + .. code-block:: sh + + # remove the included version + conda remove lxml + + # install the latest version of lxml + pip install 'git+git://github.com/lxml/lxml.git' + + # install the latest version of beautifulsoup4 + pip install 'bzr+lp:beautifulsoup' + + Note that you need `bzr `__ and `git + `__ installed to perform the last two operations. + +.. |svm| replace:: **strictly valid markup** +.. _svm: http://validator.w3.org/docs/help.html#validation_basics + +.. |html5lib| replace:: **html5lib** +.. _html5lib: https://github.com/html5lib/html5lib-python + +.. |BeautifulSoup4| replace:: **BeautifulSoup4** +.. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup + +.. |lxml| replace:: **lxml** +.. _lxml: http://lxml.de + +.. |Anaconda| replace:: **Anaconda** +.. _Anaconda: https://store.continuum.io/cshop/anaconda + + +Byte-Ordering Issues +-------------------- +Occasionally you may have to deal with data that were created on a machine with +a different byte order than the one on which you are running Python. A common symptom of this issue is an error like + +.. code-block:: python + + Traceback + ... + ValueError: Big-endian buffer not supported on little-endian compiler + +To deal +with this issue you should convert the underlying NumPy array to the native +system byte order *before* passing it to Series/DataFrame/Panel constructors +using something similar to the following: + +.. 
ipython:: python + + x = np.array(list(range(10)), '>i4') # big endian + newx = x.byteswap().newbyteorder() # force native byteorder + s = Series(newx) + +See `the NumPy documentation on byte order +`__ for more +details. diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst new file mode 100644 index 00000000..22f1414c --- /dev/null +++ b/doc/source/groupby.rst @@ -0,0 +1,997 @@ +.. currentmodule:: pandas +.. _groupby: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + options.display.mpl_style='default' + from pandas.compat import zip + +***************************** +Group By: split-apply-combine +***************************** + +By "group by" we are referring to a process involving one or more of the following +steps + + - **Splitting** the data into groups based on some criteria + - **Applying** a function to each group independently + - **Combining** the results into a data structure + +Of these, the split step is the most straightforward. In fact, in many +situations you may wish to split the data set into groups and do something with +those groups yourself. In the apply step, we might wish to one of the +following: + + - **Aggregation**: computing a summary statistic (or statistics) about each + group. Some examples: + + - Compute group sums or means + - Compute group sizes / counts + + - **Transformation**: perform some group-specific computations and return a + like-indexed. Some examples: + + - Standardizing data (zscore) within group + - Filling NAs within groups with a value derived from each group + + - **Filtration**: discard some groups, according to a group-wise computation + that evaluates True or False. Some examples: + + - Discarding data that belongs to groups with only a few members + - Filtering out data based on the group sum or mean + + - Some combination of the above: GroupBy will examine the results of the apply + step and try to return a sensibly combined result if it doesn't fit into + either of the above two categories + +Since the set of object instance method on pandas data structures are generally +rich and expressive, we often simply want to invoke, say, a DataFrame function +on each group. The name GroupBy should be quite familiar to those who have used +a SQL-based tool (or ``itertools``), in which you can write code like: + +.. code-block:: sql + + SELECT Column1, Column2, mean(Column3), sum(Column4) + FROM SomeTable + GROUP BY Column1, Column2 + +We aim to make operations like this natural and easy to express using +pandas. We'll address each area of GroupBy functionality then provide some +non-trivial examples / use cases. + +See the :ref:`cookbook` for some advanced strategies + +.. _groupby.split: + +Splitting an object into groups +------------------------------- + +pandas objects can be split on any of their axes. The abstract definition of +grouping is to provide a mapping of labels to group names. To create a GroupBy +object (more on what the GroupBy object is later), you do the following: + +.. 
code-block:: ipython + + # default is axis=0 + >>> grouped = obj.groupby(key) + >>> grouped = obj.groupby(key, axis=1) + >>> grouped = obj.groupby([key1, key2]) + +The mapping can be specified many different ways: + + - A Python function, to be called on each of the axis labels + - A list or NumPy array of the same length as the selected axis + - A dict or Series, providing a ``label -> group name`` mapping + - For DataFrame objects, a string indicating a column to be used to group. Of + course ``df.groupby('A')`` is just syntactic sugar for + ``df.groupby(df['A'])``, but it makes life simpler + - A list of any of the above things + +Collectively we refer to the grouping objects as the **keys**. For example, +consider the following DataFrame: + +.. ipython:: python + + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : randn(8), 'D' : randn(8)}) + df + +We could naturally group by either the ``A`` or ``B`` columns or both: + +.. ipython:: python + + grouped = df.groupby('A') + grouped = df.groupby(['A', 'B']) + +These will split the DataFrame on its index (rows). We could also split by the +columns: + +.. ipython:: + + In [4]: def get_letter_type(letter): + ...: if letter.lower() in 'aeiou': + ...: return 'vowel' + ...: else: + ...: return 'consonant' + ...: + + In [5]: grouped = df.groupby(get_letter_type, axis=1) + +Starting with 0.8, pandas Index objects now supports duplicate values. If a +non-unique index is used as the group key in a groupby operation, all values +for the same index value will be considered to be in one group and thus the +output of aggregation functions will only contain unique index values: + +.. ipython:: python + + lst = [1, 2, 3, 1, 2, 3] + + s = Series([1, 2, 3, 10, 20, 30], lst) + + grouped = s.groupby(level=0) + + grouped.first() + + grouped.last() + + grouped.sum() + +Note that **no splitting occurs** until it's needed. Creating the GroupBy object +only verifies that you've passed a valid mapping. + +.. note:: + + Many kinds of complicated data manipulations can be expressed in terms of + GroupBy operations (though can't be guaranteed to be the most + efficient). You can get quite creative with the label mapping functions. + +.. _groupby.attributes: + +GroupBy object attributes +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``groups`` attribute is a dict whose keys are the computed unique groups +and corresponding values being the axis labels belonging to each group. In the +above example we have: + +.. ipython:: python + + df.groupby('A').groups + df.groupby(get_letter_type, axis=1).groups + +Calling the standard Python ``len`` function on the GroupBy object just returns +the length of the ``groups`` dict, so it is largely just a convenience: + +.. ipython:: python + + grouped = df.groupby(['A', 'B']) + grouped.groups + len(grouped) + +By default the group keys are sorted during the groupby operation. You may +however pass ``sort=False`` for potential speedups: + +.. ipython:: python + + df2 = DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]}) + df2.groupby(['X'], sort=True).sum() + df2.groupby(['X'], sort=False).sum() + +.. _groupby.tabcompletion: + +``GroupBy`` will tab complete column names (and other attributes) + +.. 
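note::
+
+   Column names that are valid Python identifiers can also be reached as
+   attributes of the ``GroupBy`` object, which is why ``gb.height`` and
+   ``gb.weight`` show up in the completion listing below.  A minimal sketch,
+   assuming the ``df`` of heights and weights shown just below:
+
+   .. code-block:: python
+
+      gb = df.groupby('gender')
+
+      # attribute access and item access select the same column
+      gb.height.mean()
+      gb['height'].mean()
+
+.. 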
ipython:: python + :suppress: + + n = 10 + weight = np.random.normal(166, 20, size=n) + height = np.random.normal(60, 10, size=n) + time = date_range('1/1/2000', periods=n) + gender = tm.choice(['male', 'female'], size=n) + df = DataFrame({'height': height, 'weight': weight, + 'gender': gender}, index=time) + +.. ipython:: python + + df + gb = df.groupby('gender') + + +.. ipython:: + + @verbatim + In [1]: gb. + gb.agg gb.boxplot gb.cummin gb.describe gb.filter gb.get_group gb.height gb.last gb.median gb.ngroups gb.plot gb.rank gb.std gb.transform + gb.aggregate gb.count gb.cumprod gb.dtype gb.first gb.groups gb.hist gb.max gb.min gb.nth gb.prod gb.resample gb.sum gb.var + gb.apply gb.cummax gb.cumsum gb.fillna gb.gender gb.head gb.indices gb.mean gb.name gb.ohlc gb.quantile gb.size gb.tail gb.weight + + +.. ipython:: python + :suppress: + + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : randn(8), 'D' : randn(8)}) + +.. _groupby.multiindex: + +GroupBy with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~ + +With :ref:`hierarchically-indexed data `, it's quite +natural to group by one of the levels of the hierarchy. + +.. ipython:: python + :suppress: + + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = list(zip(*arrays)) + tuples + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + s = Series(randn(8), index=index) + +.. ipython:: python + + s + grouped = s.groupby(level=0) + grouped.sum() + +If the MultiIndex has names specified, these can be passed instead of the level +number: + +.. ipython:: python + + s.groupby(level='second').sum() + +The aggregation functions such as ``sum`` will take the level parameter +directly. Additionally, the resulting index will be named according to the +chosen level: + +.. ipython:: python + + s.sum(level='second') + +Also as of v0.6, grouping with multiple levels is supported. + +.. ipython:: python + :suppress: + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = list(zip(*arrays)) + index = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + s = Series(randn(8), index=index) + +.. ipython:: python + + s + s.groupby(level=['first','second']).sum() + +More on the ``sum`` function and aggregation later. + +DataFrame column selection in GroupBy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once you have created the GroupBy object from a DataFrame, for example, you +might want to do something different for each of the columns. Thus, using +``[]`` similar to getting a column from a DataFrame, you can do: + +.. ipython:: python + + grouped = df.groupby(['A']) + grouped_C = grouped['C'] + grouped_D = grouped['D'] + +This is mainly syntactic sugar for the alternative and much more verbose: + +.. ipython:: python + + df['C'].groupby(df['A']) + +Additionally this method avoids recomputing the internal grouping information +derived from the passed key. + +.. _groupby.iterating: + +Iterating through groups +------------------------ + +With the GroupBy object in hand, iterating through the grouped data is very +natural and functions similarly to ``itertools.groupby``: + +.. 
ipython:: + + In [4]: grouped = df.groupby('A') + + In [5]: for name, group in grouped: + ...: print(name) + ...: print(group) + ...: + +In the case of grouping by multiple keys, the group name will be a tuple: + +.. ipython:: + + In [5]: for name, group in df.groupby(['A', 'B']): + ...: print(name) + ...: print(group) + ...: + +It's standard Python-fu but remember you can unpack the tuple in the for loop +statement if you wish: ``for (k1, k2), group in grouped:``. + +.. _groupby.aggregate: + +Aggregation +----------- + +Once the GroupBy object has been created, several methods are available to +perform a computation on the grouped data. + +An obvious one is aggregation via the ``aggregate`` or equivalently ``agg`` method: + +.. ipython:: python + + grouped = df.groupby('A') + grouped.aggregate(np.sum) + + grouped = df.groupby(['A', 'B']) + grouped.aggregate(np.sum) + +As you can see, the result of the aggregation will have the group names as the +new index along the grouped axis. In the case of multiple keys, the result is a +:ref:`MultiIndex ` by default, though this can be +changed by using the ``as_index`` option: + +.. ipython:: python + + grouped = df.groupby(['A', 'B'], as_index=False) + grouped.aggregate(np.sum) + + df.groupby('A', as_index=False).sum() + +Note that you could use the ``reset_index`` DataFrame function to achieve the +same result as the column names are stored in the resulting ``MultiIndex``: + +.. ipython:: python + + df.groupby(['A', 'B']).sum().reset_index() + +Another simple aggregation example is to compute the size of each group. +This is included in GroupBy as the ``size`` method. It returns a Series whose +index are the group names and whose values are the sizes of each group. + +.. ipython:: python + + grouped.size() + +.. ipython:: python + + grouped.describe() + +.. note:: + + Aggregation functions **will not** return the groups that you are aggregating over + if they are named *columns*, when ``as_index=True``, the default. The grouped columns will + be the **indices** of the returned object. + + Passing ``as_index=False`` **will** return the groups that you are aggregating over, if they are + named *columns*. + + Aggregating functions are ones that reduce the dimension of the returned objects, + for example: ``mean, sum, size, count, std, var, sem, describe, first, last, nth, min, max``. This is + what happens when you do for example ``DataFrame.sum()`` and get back a ``Series``. + + ``nth`` can act as a reducer *or* a filter, see :ref:`here ` + +.. _groupby.aggregate.multifunc: + +Applying multiple functions at once +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With grouped Series you can also pass a list or dict of functions to do +aggregation with, outputting a DataFrame: + +.. ipython:: python + + grouped = df.groupby('A') + grouped['C'].agg([np.sum, np.mean, np.std]) + +If a dict is passed, the keys will be used to name the columns. Otherwise the +function's name (stored in the function object) will be used. + +.. ipython:: python + + grouped['D'].agg({'result1' : np.sum, + 'result2' : np.mean}) + +On a grouped DataFrame, you can pass a list of functions to apply to each +column, which produces an aggregated result with a hierarchical index: + +.. ipython:: python + + grouped.agg([np.sum, np.mean, np.std]) + +Passing a dict of functions has different behavior by default, see the next +section. 
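+
+The functions in the list need not be named functions; a lambda works as well,
+though its output column is labelled ``<lambda>``.  A small sketch, reusing the
+``grouped`` object from above and renaming the lambda's column afterwards:
+
+.. code-block:: python
+
+   result = grouped['C'].agg([np.mean, lambda x: x.max() - x.min()])
+   result = result.rename(columns={'<lambda>': 'range'})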
+ +Applying different functions to DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By passing a dict to ``aggregate`` you can apply a different aggregation to the +columns of a DataFrame: + +.. ipython:: python + + grouped.agg({'C' : np.sum, + 'D' : lambda x: np.std(x, ddof=1)}) + +The function names can also be strings. In order for a string to be valid it +must be either implemented on GroupBy or available via :ref:`dispatching +`: + +.. ipython:: python + + grouped.agg({'C' : 'sum', 'D' : 'std'}) + +.. _groupby.aggregate.cython: + +Cython-optimized aggregation functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some common aggregations, currently only ``sum``, ``mean``, ``std``, and ``sem``, have +optimized Cython implementations: + +.. ipython:: python + + df.groupby('A').sum() + df.groupby(['A', 'B']).mean() + +Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above +code would work even without the special versions via dispatching (see below). + +.. _groupby.transform: + +Transformation +-------------- + +The ``transform`` method returns an object that is indexed the same (same size) +as the one being grouped. Thus, the passed transform function should return a +result that is the same size as the group chunk. For example, suppose we wished +to standardize the data within each group: + +.. ipython:: python + + index = date_range('10/1/1999', periods=1100) + ts = Series(np.random.normal(0.5, 2, 1100), index) + ts = rolling_mean(ts, 100, 100).dropna() + + ts.head() + ts.tail() + key = lambda x: x.year + zscore = lambda x: (x - x.mean()) / x.std() + transformed = ts.groupby(key).transform(zscore) + +We would expect the result to now have mean 0 and standard deviation 1 within +each group, which we can easily check: + +.. ipython:: python + + # Original Data + grouped = ts.groupby(key) + grouped.mean() + grouped.std() + + # Transformed Data + grouped_trans = transformed.groupby(key) + grouped_trans.mean() + grouped_trans.std() + +We can also visually compare the original and transformed data sets. + +.. ipython:: python + + compare = DataFrame({'Original': ts, 'Transformed': transformed}) + + @savefig groupby_transform_plot.png + compare.plot() + +Another common data transform is to replace missing data with the group mean. + +.. ipython:: python + :suppress: + + cols = ['A', 'B', 'C'] + values = randn(1000, 3) + values[np.random.randint(0, 1000, 100), 0] = np.nan + values[np.random.randint(0, 1000, 50), 1] = np.nan + values[np.random.randint(0, 1000, 200), 2] = np.nan + data_df = DataFrame(values, columns=cols) + +.. ipython:: python + + data_df + + countries = np.array(['US', 'UK', 'GR', 'JP']) + key = countries[np.random.randint(0, 4, 1000)] + + grouped = data_df.groupby(key) + + # Non-NA count in each group + grouped.count() + + f = lambda x: x.fillna(x.mean()) + + transformed = grouped.transform(f) + +We can verify that the group means have not changed in the transformed data +and that the transformed data contains no NAs. + +.. ipython:: python + + grouped_trans = transformed.groupby(key) + + grouped.mean() # original group means + grouped_trans.mean() # transformation did not change group means + + grouped.count() # original has some missing data points + grouped_trans.count() # counts after transformation + grouped_trans.size() # Verify non-NA count equals group size + +.. note:: + + Some functions when applied to a groupby object will automatically transform the input, returning + an object of the same shape as the original. 
Passing ``as_index=False`` will not affect these transformation methods. + + For example: ``fillna, ffill, bfill, shift``. + + .. ipython:: python + + grouped.ffill() + +.. _groupby.filter: + +Filtration +---------- + +.. versionadded:: 0.12 + +The ``filter`` method returns a subset of the original object. Suppose we +want to take only elements that belong to groups with a group sum greater +than 2. + +.. ipython:: python + + sf = Series([1, 1, 2, 3, 3, 3]) + sf.groupby(sf).filter(lambda x: x.sum() > 2) + +The argument of ``filter`` must be a function that, applied to the group as a +whole, returns ``True`` or ``False``. + +Another useful operation is filtering out elements that belong to groups +with only a couple members. + +.. ipython:: python + + dff = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) + dff.groupby('B').filter(lambda x: len(x) > 2) + +Alternatively, instead of dropping the offending groups, we can return a +like-indexed objects where the groups that do not pass the filter are filled +with NaNs. + +.. ipython:: python + + dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + +For dataframes with multiple columns, filters should explicitly specify a column as the filter criterion. + +.. ipython:: python + + dff['C'] = np.arange(8) + dff.groupby('B').filter(lambda x: len(x['C']) > 2) + +.. note:: + + Some functions when applied to a groupby object will act as a **filter** on the input, returning + a reduced shape of the original (and potentitally eliminating groups), but with the index unchanged. + Passing ``as_index=False`` will not affect these transformation methods. + + For example: ``head, tail``. + + .. ipython:: python + + dff.groupby('B').head(2) + + +.. _groupby.dispatch: + +Dispatching to instance methods +------------------------------- + +When doing an aggregation or transformation, you might just want to call an +instance method on each data group. This is pretty easy to do by passing lambda +functions: + +.. ipython:: python + + grouped = df.groupby('A') + grouped.agg(lambda x: x.std()) + +But, it's rather verbose and can be untidy if you need to pass additional +arguments. Using a bit of metaprogramming cleverness, GroupBy now has the +ability to "dispatch" method calls to the groups: + +.. ipython:: python + + grouped.std() + +What is actually happening here is that a function wrapper is being +generated. When invoked, it takes any passed arguments and invokes the function +with any arguments on each group (in the above example, the ``std`` +function). The results are then combined together much in the style of ``agg`` +and ``transform`` (it actually uses ``apply`` to infer the gluing, documented +next). This enables some operations to be carried out rather succinctly: + +.. ipython:: python + + tsdf = DataFrame(randn(1000, 3), + index=date_range('1/1/2000', periods=1000), + columns=['A', 'B', 'C']) + tsdf.ix[::2] = np.nan + grouped = tsdf.groupby(lambda x: x.year) + grouped.fillna(method='pad') + +In this example, we chopped the collection of time series into yearly chunks +then independently called :ref:`fillna ` on the +groups. + +.. versionadded:: 0.14.1 + +The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys: + +.. ipython:: python + + s = Series([9, 8, 7, 5, 19, 1, 4.2, 3.3]) + g = Series(list('abababab')) + gb = s.groupby(g) + gb.nlargest(3) + gb.nsmallest(3) + +.. 
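note::
+
+   Because the generated wrapper forwards any arguments to the underlying
+   method, a dispatched call with arguments is usually just a terser spelling
+   of ``agg`` with a lambda.  A minimal sketch using the ``df`` and ``grouped``
+   objects from the examples above (``quantile`` appears in the tab-completion
+   listing shown earlier):
+
+   .. code-block:: python
+
+      grouped = df.groupby('A')
+
+      # the two lines below compute the same per-group result
+      grouped['C'].quantile(0.9)
+      grouped['C'].agg(lambda x: x.quantile(0.9))
+
+.. 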
_groupby.apply: + +Flexible ``apply`` +------------------ + +Some operations on the grouped data might not fit into either the aggregate or +transform categories. Or, you may simply want GroupBy to infer how to combine +the results. For these, use the ``apply`` function, which can be substituted +for both ``aggregate`` and ``transform`` in many standard use cases. However, +``apply`` can handle some exceptional use cases, for example: + +.. ipython:: python + + df + grouped = df.groupby('A') + + # could also just call .describe() + grouped['C'].apply(lambda x: x.describe()) + +The dimension of the returned result can also change: + +.. ipython:: + + In [8]: grouped = df.groupby('A')['C'] + + In [10]: def f(group): + ....: return DataFrame({'original' : group, + ....: 'demeaned' : group - group.mean()}) + ....: + + In [11]: grouped.apply(f) + +``apply`` on a Series can operate on a returned value from the applied function, that is itself a series, and possibly upcast the result to a DataFrame + +.. ipython:: python + + def f(x): + return Series([ x, x**2 ], index = ['x', 'x^s']) + s = Series(np.random.rand(5)) + s + s.apply(f) + + +.. note:: + + ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to apply. + So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in + the output as well as set the indices. + +.. warning:: + + In the current implementation apply calls func twice on the + first group to decide whether it can take a fast or slow code + path. This can lead to unexpected behavior if func has + side-effects, as they will take effect twice for the first + group. + + .. ipython:: python + + d = DataFrame({"a":["x", "y"], "b":[1,2]}) + def identity(df): + print df + return df + + d.groupby("a").apply(identity) + + +Other useful features +--------------------- + +Automatic exclusion of "nuisance" columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Again consider the example DataFrame we've been looking at: + +.. ipython:: python + + df + +Supposed we wished to compute the standard deviation grouped by the ``A`` +column. There is a slight problem, namely that we don't care about the data in +column ``B``. We refer to this as a "nuisance" column. If the passed +aggregation function can't be applied to some columns, the troublesome columns +will be (silently) dropped. Thus, this does not pose any problems: + +.. ipython:: python + + df.groupby('A').std() + +NA group handling +~~~~~~~~~~~~~~~~~ + +If there are any NaN values in the grouping key, these will be automatically +excluded. So there will never be an "NA group". This was not the case in older +versions of pandas, but users were generally discarding the NA group anyway +(and supporting it was an implementation headache). + +Grouping with ordered factors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Categorical variables represented as instance of pandas's ``Categorical`` class +can be used as group keys. If so, the order of the levels will be preserved: + +.. ipython:: python + + data = Series(np.random.randn(100)) + + factor = qcut(data, [0, .25, .5, .75, 1.]) + + data.groupby(factor).mean() + +.. _groupby.specify: + +Grouping with a Grouper specification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Your may need to specify a bit more data to properly group. You can +use the ``pd.Grouper`` to provide this local control. + +.. 
ipython:: python + + import datetime as DT + + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [ + DT.datetime(2013,1,1,13,0), + DT.datetime(2013,1,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,12,2,14,0), + ]}) + + df + +Groupby a specific column with the desired frequency. This is like resampling. + +.. ipython:: python + + df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + +You have an ambiguous specification in that you have a named index and a column +that could be potential groupers. + +.. ipython:: python + + df = df.set_index('Date') + df['Date'] = df.index + pd.offsets.MonthEnd(2) + df.groupby([pd.Grouper(freq='6M',key='Date'),'Buyer']).sum() + + df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum() + + +Taking the first rows of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Just like for a DataFrame or Series you can call head and tail on a groupby: + +.. ipython:: python + + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df + + g = df.groupby('A') + g.head(1) + + g.tail(1) + +This shows the first or last n rows from each group. + +.. warning:: + + Before 0.14.0 this was implemented with a fall-through apply, + so the result would incorrectly respect the as_index flag: + + .. code-block:: python + + >>> g.head(1): # was equivalent to g.apply(lambda x: x.head(1)) + A B + A + 1 0 1 2 + 5 2 5 6 + +.. _groupby.nth: + +Taking the nth row of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To select from a DataFrame or Series the nth item, use the nth method. This is a reduction method, and will return a single row (or no row) per group: + +.. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + + g.nth(0) + g.nth(-1) + g.nth(1) + +If you want to select the nth not-null method, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna, for a Series this just needs to be truthy. + +.. ipython:: python + + # nth(0) is the same as g.first() + g.nth(0, dropna='any') + g.first() + + # nth(-1) is the same as g.last() + g.nth(-1, dropna='any') # NaNs denote group exhausted when using dropna + g.last() + + g.B.nth(0, dropna=True) + +As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row. + +.. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A',as_index=False) + + g.nth(0) + g.nth(-1) + +Enumerate group items +~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.13.0 + +To see the order in which each row appears within its group, use the +``cumcount`` method: + +.. ipython:: python + + df = pd.DataFrame(list('aaabba'), columns=['A']) + df + + df.groupby('A').cumcount() + + df.groupby('A').cumcount(ascending=False) # kwarg only + +Plotting +~~~~~~~~ + +Groupby also works with some plotting methods. For example, suppose we +suspect that some features in a DataFrame my differ by group, in this case, +the values in column 1 where the group is "B" are 3 higher on average. + +.. ipython:: python + + np.random.seed(1234) + df = DataFrame(np.random.randn(50, 2)) + df['g'] = np.random.choice(['A', 'B'], size=50) + df.loc[df['g'] == 'B', 1] += 3 + +We can easily visualize this with a boxplot: + +.. 
ipython:: python + :okwarning: + + @savefig groupby_boxplot.png + df.groupby('g').boxplot() + +The result of calling ``boxplot`` is a dictionary whose keys are the values +of our grouping column ``g`` ("A" and "B"). The values of the resulting dictionary +can be controlled by the ``return_type`` keyword of ``boxplot``. +See the :ref:`visualization documentation` for more. + +.. warning:: + + For historical reasons, ``df.groupby("g").boxplot()`` is not equivalent + to ``df.boxplot(by="g")``. See :ref:`here` for + an explanation. + +Examples +-------- + +Regrouping by factor +~~~~~~~~~~~~~~~~~~~~ + +Regroup columns of a DataFrame according to their sum, and sum the aggregated ones. + +.. ipython:: python + + df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]}) + df + df.groupby(df.sum(), axis=1).sum() + + +Returning a Series to propogate names +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Group DataFrame columns, compute a set of metrics and return a named Series. +The Series name is used as the name for the column index. This is especially +useful in conjunction with reshaping operations such as stacking in which the +column index name will be used as the name of the inserted column: + +.. ipython:: python + + df = pd.DataFrame({ + 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + }) + + def compute_metrics(x): + result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} + return pd.Series(result, name='metrics') + + result = df.groupby('a').apply(compute_metrics) + + result + + result.stack() diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template new file mode 100644 index 00000000..f5352bc1 --- /dev/null +++ b/doc/source/index.rst.template @@ -0,0 +1,151 @@ +.. pandas documentation master file, created by + +********************************************* +pandas: powerful Python data analysis toolkit +********************************************* + +`PDF Version `__ + +`Zipped HTML `__ + +.. module:: pandas + +**Date**: |today| **Version**: |version| + +**Binary Installers:** http://pypi.python.org/pypi/pandas + +**Source Repository:** http://github.com/pydata/pandas + +**Issues & Ideas:** https://github.com/pydata/pandas/issues + +**Q&A Support:** http://stackoverflow.com/questions/tagged/pandas + +**Developer Mailing List:** http://groups.google.com/group/pydata + +**pandas** is a `Python `__ package providing fast, +flexible, and expressive data structures designed to make working with +"relational" or "labeled" data both easy and intuitive. It aims to be the +fundamental high-level building block for doing practical, **real world** data +analysis in Python. Additionally, it has the broader goal of becoming **the +most powerful and flexible open source data analysis / manipulation tool +available in any language**. It is already well on its way toward this goal. + +pandas is well suited for many different kinds of data: + + - Tabular data with heterogeneously-typed columns, as in an SQL table or + Excel spreadsheet + - Ordered and unordered (not necessarily fixed-frequency) time series data. + - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and + column labels + - Any other form of observational / statistical data sets. 
The data actually + need not be labeled at all to be placed into a pandas data structure + +The two primary data structures of pandas, :class:`Series` (1-dimensional) +and :class:`DataFrame` (2-dimensional), handle the vast majority of typical use +cases in finance, statistics, social science, and many areas of +engineering. For R users, :class:`DataFrame` provides everything that R's +``data.frame`` provides and much more. pandas is built on top of `NumPy +`__ and is intended to integrate well within a scientific +computing environment with many other 3rd party libraries. + +Here are just a few of the things that pandas does well: + + - Easy handling of **missing data** (represented as NaN) in floating point as + well as non-floating point data + - Size mutability: columns can be **inserted and deleted** from DataFrame and + higher dimensional objects + - Automatic and explicit **data alignment**: objects can be explicitly + aligned to a set of labels, or the user can simply ignore the labels and + let `Series`, `DataFrame`, etc. automatically align the data for you in + computations + - Powerful, flexible **group by** functionality to perform + split-apply-combine operations on data sets, for both aggregating and + transforming data + - Make it **easy to convert** ragged, differently-indexed data in other + Python and NumPy data structures into DataFrame objects + - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting** + of large data sets + - Intuitive **merging** and **joining** data sets + - Flexible **reshaping** and pivoting of data sets + - **Hierarchical** labeling of axes (possible to have multiple labels per + tick) + - Robust IO tools for loading data from **flat files** (CSV and delimited), + Excel files, databases, and saving / loading data from the ultrafast **HDF5 + format** + - **Time series**-specific functionality: date range generation and frequency + conversion, moving window statistics, moving window linear regressions, + date shifting and lagging, etc. + +Many of these principles are here to address the shortcomings frequently +experienced using other languages / scientific research environments. For data +scientists, working with data is typically divided into multiple stages: +munging and cleaning data, analyzing / modeling it, then organizing the results +of the analysis into a form suitable for plotting or tabular display. pandas +is the ideal tool for all of these tasks. + +Some other notes + + - pandas is **fast**. Many of the low-level algorithmic bits have been + extensively tweaked in `Cython `__ code. However, as with + anything else generalization usually sacrifices performance. So if you focus + on one feature for your application you may be able to create a faster + specialized tool. + + - pandas is a dependency of `statsmodels + `__, making it an important part of the + statistical computing ecosystem in Python. + + - pandas has been used extensively in production in financial applications. + +.. note:: + + This documentation assumes general familiarity with NumPy. If you haven't + used NumPy much or at all, do invest some time in `learning about NumPy + `__ first. + +See the package overview for more detail about what's in the library. + + +.. 
toctree:: + :maxdepth: 3 + + {% if single -%} + {{ single }} + {% endif -%} + {%if not single -%} + whatsnew + install + faq + overview + 10min + tutorials + cookbook + dsintro + basics + options + indexing + computation + missing_data + groupby + merging + reshaping + timeseries + visualization + rplot + io + remote_data + enhancingperf + sparse + gotchas + r_interface + ecosystem + comparison_with_r + comparison_with_sql + {% endif -%} + {% if api -%} + api + {% endif -%} + {%if not single -%} + contributing + release + {% endif -%} diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst new file mode 100644 index 00000000..84736d49 --- /dev/null +++ b/doc/source/indexing.rst @@ -0,0 +1,2301 @@ +.. _indexing: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + import pandas as pd + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + from pandas.compat import range, zip + +*************************** +Indexing and Selecting Data +*************************** + +The axis labeling information in pandas objects serves many purposes: + + - Identifies data (i.e. provides *metadata*) using known indicators, + important for analysis, visualization, and interactive console display + - Enables automatic and explicit data alignment + - Allows intuitive getting and setting of subsets of the data set + +In this section, we will focus on the final point: namely, how to slice, dice, +and generally get and set subsets of pandas objects. The primary focus will be +on Series and DataFrame as they have received more development attention in +this area. Expect more work to be invested higher-dimensional data structures +(including ``Panel``) in the future, especially in label-based advanced +indexing. + +.. note:: + + The Python and NumPy indexing operators ``[]`` and attribute operator ``.`` + provide quick and easy access to pandas data structures across a wide range + of use cases. This makes interactive work intuitive, as there's little new + to learn if you already know how to deal with Python dictionaries and NumPy + arrays. However, since the type of the data to be accessed isn't known in + advance, directly using standard operators has some optimization limits. For + production code, we recommended that you take advantage of the optimized + pandas data access methods exposed in this chapter. + +.. warning:: + + Whether a copy or a reference is returned for a setting operation, may + depend on the context. This is sometimes called ``chained assignment`` and + should be avoided. See :ref:`Returning a View versus Copy + ` + +See the :ref:`cookbook` for some advanced strategies + +Different Choices for Indexing (``loc``, ``iloc``, and ``ix``) +-------------------------------------------------------------- + +.. versionadded:: 0.11.0 + +Object selection has had a number of user-requested additions in order to +support more explicit location based indexing. pandas now supports three types +of multi-axis indexing. + +- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are + not found, allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a + *label* of the index. 
This use is **not** an integer position along the + index) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'``, (note that contrary to usual python + slices, **both** the start and the stop are included!) + - A boolean array + + See more at :ref:`Selection by Label ` + +- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of + the axis), will raise ``IndexError`` if an indexer is requested and it + is out-of-bounds, except *slice* indexers which allow out-of-bounds indexing. + (this conforms with python/numpy *slice* semantics). Allowed inputs are: + + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + + See more at :ref:`Selection by Position ` + +- ``.ix`` supports mixed integer and label based access. It is primarily label + based, but will fallback to integer positional access. ``.ix`` is the most + general and will support any of the inputs to ``.loc`` and ``.iloc``, as well + as support for floating point label schemes. ``.ix`` is especially useful + when dealing with mixed positional and label based hierarchial indexes. + As using integer slices with ``.ix`` have different behavior depending on + whether the slice is interpreted as position based or label based, it's + usually better to be explicit and use ``.iloc`` or ``.loc``. + + See more at :ref:`Advanced Indexing `, :ref:`Advanced + Hierarchical ` and :ref:`Fallback Indexing + ` + +Getting values from an object with multi-axes selection uses the following +notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as +well). Any of the axes accessors may be the null slice ``:``. Axes left out of +the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to +``p.loc['a', :, :]``) + +.. csv-table:: + :header: "Object Type", "Indexers" + :widths: 30, 50 + :delim: ; + + Series; ``s.loc[indexer]`` + DataFrame; ``df.loc[row_indexer,column_indexer]`` + Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]`` + +Deprecations +------------ + +Beginning with version 0.11.0, it's recommended that you transition away from +the following methods as they *may* be deprecated in future versions. + + - ``irow`` + - ``icol`` + - ``iget_value`` + +See the section :ref:`Selection by Position ` for substitutes. + +.. _indexing.basics: + +Basics +------ + +As mentioned when introducing the data structures in the :ref:`last section +`, the primary function of indexing with ``[]`` (a.k.a. ``__getitem__`` +for those familiar with implementing class behavior in Python) is selecting out +lower-dimensional slices. Thus, + +.. csv-table:: + :header: "Object Type", "Selection", "Return Value Type" + :widths: 30, 30, 60 + :delim: ; + + Series; ``series[label]``; scalar value + DataFrame; ``frame[colname]``; ``Series`` corresponding to colname + Panel; ``panel[itemname]``; ``DataFrame`` corresponing to the itemname + +Here we construct a simple time series data set to use for illustrating the +indexing functionality: + +.. ipython:: python + + dates = date_range('1/1/2000', periods=8) + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df + panel = Panel({'one' : df, 'two' : df - df.mean()}) + panel + +.. note:: + + None of the indexing functionality is time series specific unless + specifically stated. + +Thus, as per above, we have the most basic indexing using ``[]``: + +.. 
ipython:: python + + s = df['A'] + s[dates[5]] + panel['two'] + +You can pass a list of columns to ``[]`` to select columns in that order. +If a column is not contained in the DataFrame, an exception will be +raised. Multiple columns can also be set in this manner: + +.. ipython:: python + + df + df[['B', 'A']] = df[['A', 'B']] + df + +You may find this useful for applying a transform (in-place) to a subset of the +columns. + +Attribute Access +---------------- + +.. _indexing.columns.multiple: + +.. _indexing.df_cols: + +.. _indexing.attribute_access: + +You may access an index on a ``Series``, column on a ``DataFrame``, and a item on a ``Panel`` directly +as an attribute: + +.. ipython:: python + + sa = Series([1,2,3],index=list('abc')) + dfa = df.copy() + +.. ipython:: python + + sa.b + dfa.A + panel.one + +You can use attribute access to modify an existing element of a Series or column of a DataFrame, but be careful; +if you try to use attribute access to create a new column, it fails silently, creating a new attribute rather than a +new column. + +.. ipython:: python + + sa.a = 5 + sa + dfa.A = list(range(len(dfa.index))) # ok if A already exists + dfa + dfa['A'] = list(range(len(dfa.index))) # use this form to create a new column + dfa + +.. warning:: + + - You can use this access only if the index element is a valid python identifier, e.g. ``s.1`` is not allowed. + see `here for an explanation of valid identifiers + `__. + + - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed. + + - The ``Series/Panel`` accesses are available starting in 0.13.0. + +If you are using the IPython environment, you may also use tab-completion to +see these accessable attributes. + +Slicing ranges +-------------- + +The most robust and consistent way of slicing ranges along arbitrary axes is +described in the :ref:`Selection by Position ` section +detailing the ``.iloc`` method. For now, we explain the semantics of slicing using the ``[]`` operator. + +With Series, the syntax works exactly as with an ndarray, returning a slice of +the values and the corresponding labels: + +.. ipython:: python + + s[:5] + s[::2] + s[::-1] + +Note that setting works as well: + +.. ipython:: python + + s2 = s.copy() + s2[:5] = 0 + s2 + +With DataFrame, slicing inside of ``[]`` **slices the rows**. This is provided +largely as a convenience since it is such a common operation. + +.. ipython:: python + + df[:3] + df[::-1] + +.. _indexing.label: + +Selection By Label +------------------ + +.. warning:: + + Whether a copy or a reference is returned for a setting operation, may depend on the context. + This is sometimes called ``chained assignment`` and should be avoided. + See :ref:`Returning a View versus Copy ` + +pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. +**ALL** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**. + +The ``.loc`` attribute is the primary access method. The following are valid inputs: + +- A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. 
This use is **not** an integer position along the index)
+- A list or array of labels ``['a', 'b', 'c']``
+- A slice object with labels ``'a':'f'`` (note that contrary to usual python slices, **both** the start and the stop are included!)
+- A boolean array
+
+.. ipython:: python
+
+   s1 = Series(np.random.randn(6),index=list('abcdef'))
+   s1
+   s1.loc['c':]
+   s1.loc['b']
+
+Note that setting works as well:
+
+.. ipython:: python
+
+   s1.loc['c':] = 0
+   s1
+
+With a DataFrame
+
+.. ipython:: python
+
+   df1 = DataFrame(np.random.randn(6,4),
+                   index=list('abcdef'),
+                   columns=list('ABCD'))
+   df1
+   df1.loc[['a','b','d'],:]
+
+Accessing via label slices
+
+.. ipython:: python
+
+   df1.loc['d':,'A':'C']
+
+For getting a cross section using a label (equiv to ``df.xs('a')``)
+
+.. ipython:: python
+
+   df1.loc['a']
+
+For getting values with a boolean array
+
+.. ipython:: python
+
+   df1.loc['a']>0
+   df1.loc[:,df1.loc['a']>0]
+
+For getting a value explicitly (equiv to deprecated ``df.get_value('a','A')``)
+
+.. ipython:: python
+
+   # this is also equivalent to ``df1.at['a','A']``
+   df1.loc['a','A']
+
+.. _indexing.integer:
+
+Selection By Position
+---------------------
+
+.. warning::
+
+   Whether a copy or a reference is returned for a setting operation may depend on the context.
+   This is sometimes called ``chained assignment`` and should be avoided.
+   See :ref:`Returning a View versus Copy `
+
+pandas provides a suite of methods in order to get **purely integer based
+indexing**. The semantics closely follow python and numpy slicing. Indexing is
+``0-based``. When slicing, the start bound is *included*, while the upper bound
+is *excluded*. Trying to use a non-integer, even a **valid** label, will raise
+an ``IndexError``.
+
+The ``.iloc`` attribute is the primary access method. The following are valid inputs:
+
+- An integer e.g. ``5``
+- A list or array of integers ``[4, 3, 0]``
+- A slice object with ints ``1:7``
+
+.. ipython:: python
+
+   s1 = Series(np.random.randn(5),index=list(range(0,10,2)))
+   s1
+   s1.iloc[:3]
+   s1.iloc[3]
+
+Note that setting works as well:
+
+.. ipython:: python
+
+   s1.iloc[:3] = 0
+   s1
+
+With a DataFrame
+
+.. ipython:: python
+
+   df1 = DataFrame(np.random.randn(6,4),
+                   index=list(range(0,12,2)),
+                   columns=list(range(0,8,2)))
+   df1
+
+Select via integer slicing
+
+.. ipython:: python
+
+   df1.iloc[:3]
+   df1.iloc[1:5,2:4]
+
+Select via integer list
+
+.. ipython:: python
+
+   df1.iloc[[1,3,5],[1,3]]
+
+For slicing rows explicitly (equiv to deprecated ``df.irow(slice(1,3))``).
+
+.. ipython:: python
+
+   df1.iloc[1:3,:]
+
+For slicing columns explicitly (equiv to deprecated ``df.icol(slice(1,3))``).
+
+.. ipython:: python
+
+   df1.iloc[:,1:3]
+
+For getting a scalar via integer position (equiv to deprecated ``df.get_value(1,1)``)
+
+.. ipython:: python
+
+   # this is also equivalent to ``df1.iat[1,1]``
+   df1.iloc[1,1]
+
+For getting a cross section using an integer position (equiv to ``df.xs(1)``)
+
+.. ipython:: python
+
+   df1.iloc[1]
+
+There is one significant departure from standard python/numpy slicing semantics.
+python/numpy allow slicing past the end of an array without an associated error.
+
+.. ipython:: python
+
+   # these are allowed in python/numpy.
+   x = list('abcdef')
+   x[4:10]
+   x[8:10]
+
+- as of v0.14.0, ``iloc`` will now accept out-of-bounds indexers for slices, e.g. a value that exceeds the length of the object being
+  indexed. These will be excluded. This will make pandas conform more with pandas/numpy indexing of out-of-bounds
+  values.
A single indexer / list of indexers that is out-of-bounds will still raise + ``IndexError`` (:issue:`6296`, :issue:`6299`). This could result in an empty axis (e.g. an empty DataFrame being returned) + +.. ipython:: python + + dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + dfl + dfl.iloc[:,2:3] + dfl.iloc[:,1:3] + dfl.iloc[4:6] + +These are out-of-bounds selections + +.. code-block:: python + + dfl.iloc[[4,5,6]] + IndexError: positional indexers are out-of-bounds + + dfl.iloc[:,4] + IndexError: single positional indexer is out-of-bounds + +.. _indexing.basics.partial_setting: + +Setting With Enlargement +------------------------ + +.. versionadded:: 0.13 + +The ``.loc/.ix/[]`` operations can perform enlargement when setting a non-existant key for that axis. + +In the ``Series`` case this is effectively an appending operation + +.. ipython:: python + + se = Series([1,2,3]) + se + se[5] = 5. + se + +A ``DataFrame`` can be enlarged on either axis via ``.loc`` + +.. ipython:: python + + dfi = DataFrame(np.arange(6).reshape(3,2), + columns=['A','B']) + dfi + dfi.loc[:,'C'] = dfi.loc[:,'A'] + dfi + +This is like an ``append`` operation on the ``DataFrame``. + +.. ipython:: python + + dfi.loc[3] = 5 + dfi + +.. _indexing.basics.get_value: + +Fast scalar value getting and setting +------------------------------------- + +Since indexing with ``[]`` must handle a lot of cases (single-label access, +slicing, boolean indexing, etc.), it has a bit of overhead in order to figure +out what you're asking for. If you only want to access a scalar value, the +fastest way is to use the ``at`` and ``iat`` methods, which are implemented on +all of the data structures. + +Similary to ``loc``, ``at`` provides **label** based scalar lookups, while, ``iat`` provides **integer** based lookups analagously to ``iloc`` + +.. ipython:: python + + s.iat[5] + df.at[dates[5], 'A'] + df.iat[3, 0] + +You can also set using these same indexers. + +.. ipython:: python + + df.at[dates[5], 'E'] = 7 + df.iat[3, 0] = 7 + +``at`` may enlarge the object in-place as above if the indexer is missing. + +.. ipython:: python + + df.at[dates[-1]+1, 0] = 7 + df + +Boolean indexing +---------------- + +.. _indexing.boolean: + +Another common operation is the use of boolean vectors to filter the data. +The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``. These **must** be grouped by using parentheses. + +Using a boolean vector to index a Series works exactly as in a numpy ndarray: + +.. ipython:: python + + s[s > 0] + s[(s < 0) & (s > -0.5)] + s[(s < -1) | (s > 1 )] + s[~(s < 0)] + +You may select rows from a DataFrame using a boolean vector the same length as +the DataFrame's index (for example, something derived from one of the columns +of the DataFrame): + +.. ipython:: python + + df[df['A'] > 0] + +List comprehensions and ``map`` method of Series can also be used to produce +more complex criteria: + +.. 
ipython:: python + + df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : randn(7)}) + + # only want 'two' or 'three' + criterion = df2['a'].map(lambda x: x.startswith('t')) + + df2[criterion] + + # equivalent but slower + df2[[x.startswith('t') for x in df2['a']]] + + # Multiple criteria + df2[criterion & (df2['b'] == 'x')] + +Note, with the choice methods :ref:`Selection by Label `, :ref:`Selection by Position `, +and :ref:`Advanced Indexing ` you may select along more than one axis using boolean vectors combined with other indexing expressions. + +.. ipython:: python + + df2.loc[criterion & (df2['b'] == 'x'),'b':'c'] + +.. _indexing.basics.indexing_isin: + +Indexing with isin +~~~~~~~~~~~~~~~~~~ + +Consider the ``isin`` method of Series, which returns a boolean vector that is +true wherever the Series elements exist in the passed list. This allows you to +select rows where one or more columns have values you want: + +.. ipython:: python + + s = Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64') + + s + + s.isin([2, 4]) + + s[s.isin([2, 4])] + + +DataFrame also has an ``isin`` method. When calling ``isin``, pass a set of +values as either an array or dict. If values is an array, ``isin`` returns +a DataFrame of booleans that is the same shape as the original DataFrame, with True +wherever the element is in the sequence of values. + +.. ipython:: python + + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}) + + values = ['a', 'b', 1, 3] + + df.isin(values) + +Oftentimes you'll want to match certain values with certain columns. +Just make values a ``dict`` where the key is the column, and the value is +a list of items you want to check for. + +.. ipython:: python + + values = {'ids': ['a', 'b'], 'vals': [1, 3]} + + df.isin(values) + +Combine DataFrame's ``isin`` with the ``any()`` and ``all()`` methods to +quickly select subsets of your data that meet a given criteria. +To select a row where each column meets its own criterion: + +.. ipython:: python + + values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]} + + row_mask = df.isin(values).all(1) + + df[row_mask] + +The :meth:`~pandas.DataFrame.where` Method and Masking +------------------------------------------------------ + +Selecting values from a Series with a boolean vector generally returns a +subset of the data. To guarantee that selection output has the same shape as +the original data, you can use the ``where`` method in ``Series`` and ``DataFrame``. + +To return only the selected rows + +.. ipython:: python + + s[s > 0] + +To return a Series of the same shape as the original + +.. ipython:: python + + s.where(s > 0) + +Selecting values from a DataFrame with a boolean critierion now also preserves +input data shape. ``where`` is used under the hood as the implementation. +Equivalent is ``df.where(df < 0)`` + +.. ipython:: python + :suppress: + + dates = date_range('1/1/2000', periods=8) + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + +.. ipython:: python + + df[df < 0] + +In addition, ``where`` takes an optional ``other`` argument for replacement of +values where the condition is False, in the returned copy. + +.. ipython:: python + + df.where(df < 0, -df) + +You may wish to set values based on some boolean criteria. +This can be done intuitively like so: + +.. 
ipython:: python + + s2 = s.copy() + s2[s2 < 0] = 0 + s2 + + df2 = df.copy() + df2[df2 < 0] = 0 + df2 + +By default, ``where`` returns a modified copy of the data. There is an +optional parameter ``inplace`` so that the original data can be modified +without creating a copy: + +.. ipython:: python + + df_orig = df.copy() + df_orig.where(df > 0, -df, inplace=True); + df_orig + +**alignment** + +Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), +such that partial selection with setting is possible. This is analagous to +partial setting via ``.ix`` (but on the contents rather than the axis labels) + +.. ipython:: python + + df2 = df.copy() + df2[ df2[1:4] > 0 ] = 3 + df2 + +.. versionadded:: 0.13 + +Where can also accept ``axis`` and ``level`` parameters to align the input when +performing the ``where``. + +.. ipython:: python + + df2 = df.copy() + df2.where(df2>0,df2['A'],axis='index') + +This is equivalent (but faster than) the following. + +.. ipython:: python + + df2 = df.copy() + df.apply(lambda x, y: x.where(x>0,y), y=df['A']) + +**mask** + +``mask`` is the inverse boolean operation of ``where``. + +.. ipython:: python + + s.mask(s >= 0) + df.mask(df >= 0) + +.. _indexing.query: + +The :meth:`~pandas.DataFrame.query` Method (Experimental) +--------------------------------------------------------- + +.. versionadded:: 0.13 + +:class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` +method that allows selection using an expression. + +You can get the value of the frame where column ``b`` has values +between the values of columns ``a`` and ``c``. For example: + +.. ipython:: python + :suppress: + + from numpy.random import randint, rand + np.random.seed(1234) + +.. ipython:: python + + n = 10 + df = DataFrame(rand(n, 3), columns=list('abc')) + df + + # pure python + df[(df.a < df.b) & (df.b < df.c)] + + # query + df.query('(a < b) & (b < c)') + +Do the same thing but fallback on a named index if there is no column +with the name ``a``. + +.. ipython:: python + + df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) + df.index.name = 'a' + df + df.query('a < b and b < c') + +If instead you don't want to or cannot name your index, you can use the name +``index`` in your query expression: + +.. ipython:: python + :suppress: + + old_index = index + del index + +.. ipython:: python + + df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df + df.query('index < b < c') + +.. ipython:: python + :suppress: + + index = old_index + del old_index + + +.. note:: + + If the name of your index overlaps with a column name, the column name is + given precedence. For example, + + .. ipython:: python + + df = DataFrame({'a': randint(5, size=5)}) + df.index.name = 'a' + df.query('a > 2') # uses the column 'a', not the index + + You can still use the index in a query expression by using the special + identifier 'index': + + .. ipython:: python + + df.query('index > 2') + + If for some reason you have a column named ``index``, then you can refer to + the index as ``ilevel_0`` as well, but at this point you should consider + renaming your columns to something less ambiguous. + + +:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can also use the levels of a ``DataFrame`` with a +:class:`~pandas.MultiIndex` as if they were columns in the frame: + +.. 
ipython:: python + + import pandas.util.testing as tm + + n = 10 + colors = tm.choice(['red', 'green'], size=n) + foods = tm.choice(['eggs', 'ham'], size=n) + colors + foods + + index = MultiIndex.from_arrays([colors, foods], names=['color', 'food']) + df = DataFrame(randn(n, 2), index=index) + df + df.query('color == "red"') + +If the levels of the ``MultiIndex`` are unnamed, you can refer to them using +special names: + + +.. ipython:: python + + df.index.names = [None, None] + df + df.query('ilevel_0 == "red"') + + +The convention is ``ilevel_0``, which means "index level 0" for the 0th level +of the ``index``. + + +:meth:`~pandas.DataFrame.query` Use Cases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A use case for :meth:`~pandas.DataFrame.query` is when you have a collection of +:class:`~pandas.DataFrame` objects that have a subset of column names (or index +levels/names) in common. You can pass the same query to both frames *without* +having to specify which frame you're interested in querying + +.. ipython:: python + + df = DataFrame(rand(n, 3), columns=list('abc')) + df + df2 = DataFrame(rand(n + 2, 3), columns=df.columns) + df2 + expr = '0.0 <= a <= c <= 0.5' + map(lambda frame: frame.query(expr), [df, df2]) + +:meth:`~pandas.DataFrame.query` Python versus pandas Syntax Comparison +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Full numpy-like syntax + +.. ipython:: python + + df = DataFrame(randint(n, size=(n, 3)), columns=list('abc')) + df + df.query('(a < b) & (b < c)') + df[(df.a < df.b) & (df.b < df.c)] + +Slightly nicer by removing the parentheses (by binding making comparison +operators bind tighter than ``&``/``|``) + +.. ipython:: python + + df.query('a < b & b < c') + +Use English instead of symbols + +.. ipython:: python + + df.query('a < b and b < c') + +Pretty close to how you might write it on paper + +.. ipython:: python + + df.query('a < b < c') + +The ``in`` and ``not in`` operators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~pandas.DataFrame.query` also supports special use of Python's ``in`` and +``not in`` comparison operators, providing a succint syntax for calling the +``isin`` method of a ``Series`` or ``DataFrame``. + +.. ipython:: python + :suppress: + + try: + old_d = d + del d + except NameError: + pass + +.. ipython:: python + + # get all rows where columns "a" and "b" have overlapping values + df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'), + 'c': randint(5, size=12), 'd': randint(9, size=12)}) + df + df.query('a in b') + + # How you'd do it in pure Python + df[df.a.isin(df.b)] + + df.query('a not in b') + + # pure Python + df[~df.a.isin(df.b)] + + +You can combine this with other expressions for very succinct queries: + + +.. ipython:: python + + # rows where cols a and b have overlapping values and col c's values are less than col d's + df.query('a in b and c < d') + + # pure Python + df[df.b.isin(df.a) & (df.c < df.d)] + + +.. note:: + + Note that ``in`` and ``not in`` are evaluated in Python, since ``numexpr`` + has no equivalent of this operation. However, **only the** ``in``/``not in`` + **expression itself** is evaluated in vanilla Python. For example, in the + expression + + .. code-block:: python + + df.query('a in b + c + d') + + ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in`` + operation is evaluated in plain Python. In general, any operations that can + be evaluated using ``numexpr`` will be. 
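+
+To make this division of labour concrete, the expression above can be spelled
+out by hand. The following is only a rough sketch on a throwaway frame (it is
+not the ``df`` from the running examples, and it assumes ``numexpr`` is
+installed so that ``query`` can evaluate the arithmetic part):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   df = pd.DataFrame({'a': np.arange(6), 'b': np.arange(6),
+                      'c': np.arange(6), 'd': np.arange(6)})
+
+   # inside query(), numexpr computes 'b + c + d'; mimic it with plain pandas
+   rhs = df.b + df.c + df.d
+
+   # the 'in' test itself is then run in vanilla Python, i.e. an isin() call
+   by_hand = df[df.a.isin(rhs)]
+   via_query = df.query('a in b + c + d')
+
+   by_hand.equals(via_query)   # True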
+ +Special use of the ``==`` operator with ``list`` objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Comparing a ``list`` of values to a column using ``==``/``!=`` works similarly +to ``in``/``not in`` + +.. ipython:: python + + df.query('b == ["a", "b", "c"]') + + # pure Python + df[df.b.isin(["a", "b", "c"])] + + df.query('c == [1, 2]') + + df.query('c != [1, 2]') + + # using in/not in + df.query('[1, 2] in c') + + df.query('[1, 2] not in c') + + # pure Python + df[df.c.isin([1, 2])] + + +Boolean Operators +~~~~~~~~~~~~~~~~~ + +You can negate boolean expressions with the word ``not`` or the ``~`` operator. + +.. ipython:: python + + df = DataFrame(rand(n, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + df.query('~bools') + df.query('not bools') + df.query('not bools') == df[~df.bools] + +Of course, expressions can be arbitrarily complex too + +.. ipython:: python + + # short query syntax + shorter = df.query('a < b < c and (not bools) or bools > 2') + + # equivalent in pure Python + longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + + shorter + longer + + shorter == longer + +.. ipython:: python + :suppress: + + try: + d = old_d + del old_d + except NameError: + pass + + +Performance of :meth:`~pandas.DataFrame.query` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for +large frames + +.. image:: _static/query-perf.png + +.. note:: + + You will only see the performance benefits of using the ``numexpr`` engine + with ``DataFrame.query()`` if your frame has more than approximately 200,000 + rows + + .. image:: _static/query-perf-small.png + +This plot was created using a ``DataFrame`` with 3 columns each containing +floating point values generated using ``numpy.random.randn()``. + +.. ipython:: python + :suppress: + + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df2 = df.copy() + +Take Methods +------------ + +.. _indexing.take: + +Similar to numpy ndarrays, pandas Index, Series, and DataFrame also provides +the ``take`` method that retrieves elements along a given axis at the given +indices. The given indices must be either a list or an ndarray of integer +index positions. ``take`` will also accept negative integers as relative positions to the end of the object. + +.. ipython:: python + + index = Index(randint(0, 1000, 10)) + index + + positions = [0, 9, 3] + + index[positions] + index.take(positions) + + ser = Series(randn(10)) + + ser.ix[positions] + ser.take(positions) + +For DataFrames, the given indices should be a 1d list or ndarray that specifies +row or column positions. + +.. ipython:: python + + frm = DataFrame(randn(5, 3)) + + frm.take([1, 4, 3]) + + frm.take([0, 2], axis=1) + +It is important to note that the ``take`` method on pandas objects are not +intended to work on boolean indices and may return unexpected results. + +.. ipython:: python + + arr = randn(10) + arr.take([False, False, True, True]) + arr[[0, 1]] + + ser = Series(randn(10)) + ser.take([False, False, True, True]) + ser.ix[[0, 1]] + +Finally, as a small note on performance, because the ``take`` method handles +a narrower range of inputs, it can offer performance that is a good deal +faster than fancy indexing. + +.. 
ipython:: + + arr = randn(10000, 5) + indexer = np.arange(10000) + random.shuffle(indexer) + + timeit arr[indexer] + timeit arr.take(indexer, axis=0) + + ser = Series(arr[:, 0]) + timeit ser.ix[indexer] + timeit ser.take(indexer) + +Duplicate Data +-------------- + +.. _indexing.duplicate: + +If you want to identify and remove duplicate rows in a DataFrame, there are +two methods that will help: ``duplicated`` and ``drop_duplicates``. Each +takes as an argument the columns to use to identify duplicated rows. + +- ``duplicated`` returns a boolean vector whose length is the number of rows, and which indicates whether a row is duplicated. +- ``drop_duplicates`` removes duplicate rows. + +By default, the first observed row of a duplicate set is considered unique, but +each method has a ``take_last`` parameter that indicates the last observed row +should be taken instead. + +.. ipython:: python + + df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : np.random.randn(7)}) + df2.duplicated(['a','b']) + df2.drop_duplicates(['a','b']) + df2.drop_duplicates(['a','b'], take_last=True) + +.. _indexing.dictionarylike: + +Dictionary-like :meth:`~pandas.DataFrame.get` method +---------------------------------------------------- + +Each of Series, DataFrame, and Panel have a ``get`` method which can return a +default value. + +.. ipython:: python + + s = Series([1,2,3], index=['a','b','c']) + s.get('a') # equivalent to s['a'] + s.get('x', default=-1) + +.. _indexing.advanced: + +Advanced Indexing with ``.ix`` +------------------------------ + +.. note:: + + The recent addition of ``.loc`` and ``.iloc`` have enabled users to be quite + explicit about indexing choices. ``.ix`` allows a great flexibility to + specify indexing locations by *label* and/or *integer position*. pandas will + attempt to use any passed *integer* as *label* locations first (like what + ``.loc`` would do, then to fall back on *positional* indexing, like what + ``.iloc`` would do). See :ref:`Fallback Indexing ` for + an example. + +The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by +Label `, and ``.iloc`` in :ref:`Selection by Position `. + +The ``.ix`` attribute takes the following inputs: + +- An integer or single label, e.g. ``5`` or ``'a'`` +- A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` +- A slice object with ints ``1:7`` or labels ``'a':'f'`` +- A boolean array + +We'll illustrate all of these methods. First, note that this provides a concise +way of reindexing on multiple axes at once: + +.. ipython:: python + + subindex = dates[[3,4,5]] + df.reindex(index=subindex, columns=['C', 'B']) + df.ix[subindex, ['C', 'B']] + +Assignment / setting values is possible when using ``ix``: + +.. ipython:: python + + df2 = df.copy() + df2.ix[subindex, ['C', 'B']] = 0 + df2 + +Indexing with an array of integers can also be done: + +.. ipython:: python + + df.ix[[4,3,1]] + df.ix[dates[[4,3,1]]] + +**Slicing** has standard Python semantics for integer slices: + +.. ipython:: python + + df.ix[1:7, :2] + +Slicing with labels is semantically slightly different because the slice start +and stop are **inclusive** in the label-based case: + +.. ipython:: python + + date1, date2 = dates[[2, 4]] + print(date1, date2) + df.ix[date1:date2] + df['A'].ix[date1:date2] + +Getting and setting rows in a DataFrame, especially by their location, is much +easier: + +.. 
ipython:: python + + df2 = df[:5].copy() + df2.ix[3] + df2.ix[3] = np.arange(len(df2.columns)) + df2 + +Column or row selection can be combined as you would expect with arrays of +labels or even boolean vectors: + +.. ipython:: python + + df.ix[df['A'] > 0, 'B'] + df.ix[date1:date2, 'B'] + df.ix[date1, 'B'] + +Slicing with labels is closely related to the ``truncate`` method which does +precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons). + +The :meth:`~pandas.DataFrame.select` Method +------------------------------------------- + +Another way to extract slices from an object is with the ``select`` method of +Series, DataFrame, and Panel. This method should be used only when there is no +more direct way. ``select`` takes a function which operates on labels along +``axis`` and returns a boolean. For instance: + +.. ipython:: python + + df.select(lambda x: x == 'A', axis=1) + +The :meth:`~pandas.DataFrame.lookup` Method +------------------------------------------- + +Sometimes you want to extract a set of values given a sequence of row labels +and column labels, and the ``lookup`` method allows for this and returns a +numpy array. For instance, + +.. ipython:: python + + dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) + dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D']) + +.. _indexing.float64index: + +Float64Index +------------ + +.. note:: + + As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype + array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype + array. Using a ``float64`` dtype in the backend speeds up arithmetic + operations by about 30x and boolean indexing operations on the + ``Float64Index`` itself are about 2x as fast. + + +.. versionadded:: 0.13.0 + +By default a ``Float64Index`` will be automatically created when passing floating, or mixed-integer-floating values in index creation. +This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the +same. + +.. ipython:: python + + indexf = Index([1.5, 2, 3, 4.5, 5]) + indexf + sf = Series(range(5),index=indexf) + sf + +Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) + +.. ipython:: python + + sf[3] + sf[3.0] + sf.ix[3] + sf.ix[3.0] + sf.loc[3] + sf.loc[3.0] + +The only positional indexing is via ``iloc`` + +.. ipython:: python + + sf.iloc[3] + +A scalar index that is not found will raise ``KeyError`` + +Slicing is ALWAYS on the values of the index, for ``[],ix,loc`` and ALWAYS positional with ``iloc`` + +.. ipython:: python + + sf[2:4] + sf.ix[2:4] + sf.loc[2:4] + sf.iloc[2:4] + +In float indexes, slicing using floats is allowed + +.. ipython:: python + + sf[2.1:4.6] + sf.loc[2.1:4.6] + +In non-float indexes, slicing using floats will raise a ``TypeError`` + +.. code-block:: python + + In [1]: Series(range(5))[3.5] + TypeError: the label [3.5] is not a proper indexer for this index type (Int64Index) + + In [1]: Series(range(5))[3.5:4.5] + TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index) + +Using a scalar float indexer will be deprecated in a future version, but is allowed for now. + +.. code-block:: python + + In [3]: Series(range(5))[3.0] + Out[3]: 3 + +Here is a typical use-case for using this type of indexing. Imagine that you have a somewhat +irregular timedelta-like indexing scheme, but the data is recorded as floats. 
This could for
+example be millisecond offsets.
+
+.. ipython:: python
+
+   dfir = concat([DataFrame(randn(5,2),
+                            index=np.arange(5) * 250.0,
+                            columns=list('AB')),
+                  DataFrame(randn(6,2),
+                            index=np.arange(4,10) * 250.1,
+                            columns=list('AB'))])
+   dfir
+
+Selection operations then will always work on a value basis, for all selection operators.
+
+.. ipython:: python
+
+   dfir[0:1000.4]
+   dfir.loc[0:1001,'A']
+   dfir.loc[1000.4]
+
+You could then easily pick out the first 1 second (1000 ms) of data.
+
+.. ipython:: python
+
+   dfir[0:1000]
+
+Of course, if you need integer based selection, then use ``iloc``
+
+.. ipython:: python
+
+   dfir.iloc[0:5]
+
+.. _indexing.view_versus_copy:
+
+Returning a view versus a copy
+------------------------------
+
+When setting values in a pandas object, care must be taken to avoid what is called
+``chained indexing``. Here is an example.
+
+.. ipython:: python
+
+   dfmi = DataFrame([list('abcd'),
+                     list('efgh'),
+                     list('ijkl'),
+                     list('mnop')],
+                    columns=MultiIndex.from_product([['one','two'],
+                                                     ['first','second']]))
+   dfmi
+
+Compare these two access methods:
+
+.. ipython:: python
+
+   dfmi['one']['second']
+
+.. ipython:: python
+
+   dfmi.loc[:,('one','second')]
+
+These both yield the same results, so which should you use? It is instructive to understand the order
+of operations on these and why method 2 (``.loc``) is much preferred over method 1 (chained ``[]``).
+
+``dfmi['one']`` selects the first level of the columns and returns a data frame that is singly-indexed.
+Then another python operation, ``dfmi_with_one['second']``, selects the series indexed by ``'second'``.
+The intermediate object is written here as ``dfmi_with_one`` because pandas sees these operations as separate events,
+i.e. separate calls to ``__getitem__``, so it has to treat them as linear operations that happen one after another.
+
+Contrast this to ``df.loc[:,('one','second')]`` which passes a nested tuple of ``(slice(None),('one','second'))`` to a single call to
+``__getitem__``. This allows pandas to deal with this as a single entity. Furthermore this order of operations *can* be significantly
+faster, and allows one to index *both* axes if so desired.
+
+Why does assignment fail when using chained indexing?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+So, why does this show the ``SettingWithCopy`` warning, and possibly not work, when you do chained indexing and assignment:
+
+.. code-block:: python
+
+   dfmi['one']['second'] = value
+
+Since the chained indexing is 2 calls, it is possible that either call may return a **copy** of the data because of the way it is sliced.
+Thus when setting, you are actually setting a **copy**, and not the original frame data. It is impossible for pandas to figure this out because there are 2 separate python operations that are not connected.
+
+The ``SettingWithCopy`` warning is a 'heuristic' to detect this (meaning it tends to catch most cases but is simply a lightweight check). Figuring this out for certain is quite complicated.
+
+The ``.loc`` operation is a single python operation, and thus can select a slice (which still may be a copy), but allows pandas to assign that slice back into the frame after it is modified, thus setting the values as you would think.
+
+The reason for having the ``SettingWithCopy`` warning is this. Sometimes when you slice an array you will simply get a view back, which means you can set it no problem. However, even a single dtyped array can generate a copy if it is sliced in a particular way.
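+
+A quick way to see this view-versus-copy distinction with plain NumPy (a minimal,
+hypothetical sketch, independent of the frames used elsewhere in this section):
+
+.. code-block:: python
+
+   import numpy as np
+
+   arr = np.arange(10)
+
+   view = arr[2:5]          # basic slicing returns a view
+   fancy = arr[[2, 3, 4]]   # "fancy" (list) indexing returns a copy
+
+   view[0] = 99             # modifies arr as well
+   fancy[0] = -1            # leaves arr untouched
+
+   arr                      # arr[2] is now 99; the -1 is nowhere to be seen
+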
A multi-dtyped DataFrame (meaning it has, say, ``float`` and ``object`` data) will
+almost always yield a copy. Whether a view is created is dependent on the memory
+layout of the array.
+
+Evaluation order matters
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Furthermore, in chained expressions, the order may determine whether a copy is returned or not.
+If an expression will set values on a copy of a slice, then a ``SettingWithCopy``
+exception will be raised (this raise/warn behavior is new starting in 0.13.0).
+
+You can control the action of a chained assignment via the option ``mode.chained_assignment``,
+which can take the values ``['raise','warn',None]``, where showing a warning is the default.
+
+.. ipython:: python
+
+   dfb = DataFrame({'a' : ['one', 'one', 'two',
+                           'three', 'two', 'one', 'six'],
+                    'c' : np.arange(7)})
+
+   # passed via reference (will stay)
+   dfb['c'][dfb.a.str.startswith('o')] = 42
+
+This, however, is operating on a copy and will not work.
+
+::
+
+   >>> pd.set_option('mode.chained_assignment','warn')
+   >>> dfb[dfb.a.str.startswith('o')]['c'] = 42
+   Traceback (most recent call last)
+   ...
+   SettingWithCopyWarning:
+        A value is trying to be set on a copy of a slice from a DataFrame.
+        Try using .loc[row_index,col_indexer] = value instead
+
+A chained assignment can also crop up when setting in a mixed dtype frame.
+
+.. note::
+
+   These setting rules apply to all of ``.loc/.iloc/.ix``
+
+This is the correct access method:
+
+.. ipython:: python
+
+   dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]})
+   dfc.loc[0,'A'] = 11
+   dfc
+
+This *can* work at times, but is not guaranteed, and so should be avoided:
+
+.. ipython:: python
+
+   dfc = dfc.copy()
+   dfc['A'][0] = 111
+   dfc
+
+This will **not** work at all, and so should be avoided:
+
+::
+
+   >>> pd.set_option('mode.chained_assignment','raise')
+   >>> dfc.loc[0]['A'] = 1111
+   Traceback (most recent call last)
+   ...
+   SettingWithCopyException:
+        A value is trying to be set on a copy of a slice from a DataFrame.
+        Try using .loc[row_index,col_indexer] = value instead
+
+.. warning::
+
+   The chained assignment warnings / exceptions aim to inform the user of a possibly invalid
+   assignment. There may be false positives; situations where a chained assignment is inadvertently
+   reported.
+
+
+Fallback indexing
+-----------------
+
+.. _indexing.fallback:
+
+Float indexes should be used only with caution. If you have a float indexed
+``DataFrame`` and try to select using an integer, the row that pandas returns
+might not be what you expect. pandas first attempts to use the *integer*
+as a *label* location, but fails to find a match (because the types
+are not equal). pandas then falls back to positional indexing.
+
+.. ipython:: python
+
+   df = pd.DataFrame(np.random.randn(4,4),
+                     columns=list('ABCD'), index=[1.0, 2.0, 3.0, 4.0])
+   df
+   df.ix[1]
+
+To select the row you do expect, instead use a float label or
+use ``iloc``.
+
+.. ipython:: python
+
+   df.ix[1.0]
+   df.iloc[0]
+
+Instead of using a float index, it is often better to
+convert to an integer index:
+
+.. ipython:: python
+
+   df_new = df.reset_index()
+   df_new[df_new['index'] == 1.0]
+   # now you can also do "float selection"
+   df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)]
+
+
+.. _indexing.class:
+
+Index objects
+-------------
+
+The pandas :class:`~pandas.Index` class and its subclasses can be viewed as
+implementing an *ordered multiset*. Duplicates are allowed.
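+
+For instance, an index with repeated labels is perfectly legal, and selecting a
+repeated label returns every matching entry (a small illustrative sketch, not
+part of the running examples):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   s = pd.Series([1, 2, 3], index=pd.Index(['a', 'a', 'b']))
+
+   s['a']     # returns both entries labelled 'a'
+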
However, if you try +to convert an :class:`~pandas.Index` object with duplicate entries into a +``set``, an exception will be raised. + +:class:`~pandas.Index` also provides the infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create an +:class:`~pandas.Index` directly is to pass a ``list`` or other sequence to +:class:`~pandas.Index`: + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index + +You can also pass a ``name`` to be stored in the index: + + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name + +Starting with pandas 0.5, the name, if set, will be shown in the console +display: + +.. ipython:: python + + index = Index(list(range(5)), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] + + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.set_ops: + +The three main operations are ``union (|)``, ``intersection (&)``, and ``diff +(-)``. These can be directly called as instance methods or used via overloaded +operators: + +.. ipython:: python + + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a.union(b) + a | b + a & b + a - b + +Also available is the ``sym_diff (^)`` operation, which returns elements +that appear in either ``idx1`` or ``idx2`` but not both. This is +equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)``, +with duplicates dropped. + +.. ipython:: python + + idx1 = Index([1, 2, 3, 4]) + idx2 = Index([2, 3, 4, 5]) + idx1.sym_diff(idx2) + idx1 ^ idx2 + +The ``isin`` method of Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One additional operation is the ``isin`` method that works analogously to the +``Series.isin`` method found :ref:`here `. + +.. _indexing.hierarchical: + +Hierarchical indexing (MultiIndex) +---------------------------------- + +Hierarchical indexing (also referred to as "multi-level" indexing) is brand new +in the pandas 0.4 release. It is very exciting as it opens the door to some +quite sophisticated data analysis and manipulation, especially for working with +higher dimensional data. In essence, it enables you to store and manipulate +data with an arbitrary number of dimensions in lower dimensional data +structures like Series (1d) and DataFrame (2d). + +In this section, we will show what exactly we mean by "hierarchical" indexing +and how it integrates with the all of the pandas indexing functionality +described above and in prior sections. Later, when discussing :ref:`group by +` and :ref:`pivoting and reshaping data `, we'll show +non-trivial applications to illustrate how it aids in structuring data for +analysis. + +See the :ref:`cookbook` for some advanced strategies + +.. note:: + + Given that hierarchical indexing is so new to the library, it is definitely + "bleeding-edge" functionality but is certainly suitable for production. But, + there may inevitably be some minor API changes as more use cases are + explored and any weaknesses in the design / implementation are identified. + pandas aims to be "eminently usable" so any feedback about new + functionality like this is extremely helpful. + +Creating a MultiIndex (hierarchical index) object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``MultiIndex`` object is the hierarchical analogue of the standard +``Index`` object which typically stores the axis labels in pandas objects. 
You +can think of ``MultiIndex`` an array of tuples where each tuple is unique. A +``MultiIndex`` can be created from a list of arrays (using +``MultiIndex.from_arrays``), an array of tuples (using +``MultiIndex.from_tuples``), or a crossed set of iterables (using +``MultiIndex.from_product``). The ``Index`` constructor will attempt to return +a ``MultiIndex`` when it is passed a list of tuples. The following examples +demo different ways to initialize MultiIndexes. + + +.. ipython:: python + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = list(zip(*arrays)) + tuples + + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + index + + s = Series(randn(8), index=index) + s + +When you want every pairing of the elements in two iterables, it can be easier +to use the ``MultiIndex.from_product`` function: + +.. ipython:: python + + iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] + MultiIndex.from_product(iterables, names=['first', 'second']) + +As a convenience, you can pass a list of arrays directly into Series or +DataFrame to construct a MultiIndex automatically: + +.. ipython:: python + + arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']) + , + np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']) + ] + s = Series(randn(8), index=arrays) + s + df = DataFrame(randn(8, 4), index=arrays) + df + +All of the ``MultiIndex`` constructors accept a ``names`` argument which stores +string names for the levels themselves. If no names are provided, ``None`` will +be assigned: + +.. ipython:: python + + df.index.names + +This index can back any axis of a pandas object, and the number of **levels** +of the index is up to you: + +.. ipython:: python + + df = DataFrame(randn(3, 8), index=['A', 'B', 'C'], columns=index) + df + DataFrame(randn(6, 6), index=index[:6], columns=index[:6]) + +We've "sparsified" the higher levels of the indexes to make the console output a +bit easier on the eyes. + +It's worth keeping in mind that there's nothing preventing you from using +tuples as atomic labels on an axis: + +.. ipython:: python + + Series(randn(8), index=tuples) + +The reason that the ``MultiIndex`` matters is that it can allow you to do +grouping, selection, and reshaping operations as we will describe below and in +subsequent areas of the documentation. As you will see in later sections, you +can find yourself working with hierarchically-indexed data without creating a +``MultiIndex`` explicitly yourself. However, when loading data from a file, you +may wish to generate your own ``MultiIndex`` when preparing the data set. + +Note that how the index is displayed by be controlled using the +``multi_sparse`` option in ``pandas.set_printoptions``: + +.. ipython:: python + + pd.set_option('display.multi_sparse', False) + df + pd.set_option('display.multi_sparse', True) + +Reconstructing the level labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.get_level_values: + +The method ``get_level_values`` will return a vector of the labels for each +location at a particular level: + +.. ipython:: python + + index.get_level_values(0) + index.get_level_values('second') + + +Basic indexing on axis with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the important features of hierarchical indexing is that you can select +data by a "partial" label identifying a subgroup in the data. 
**Partial** +selection "drops" levels of the hierarchical index in the result in a +completely analogous way to selecting a column in a regular DataFrame: + +.. ipython:: python + + df['bar'] + df['bar', 'one'] + df['bar']['one'] + s['qux'] + +See :ref:`Cross-section with hierarchical index ` for how to select +on a deeper level. + + +Data alignment and using ``reindex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Operations between differently-indexed objects having ``MultiIndex`` on the +axes will work as you expect; data alignment will work the same as an Index of +tuples: + +.. ipython:: python + + s + s[:-2] + s + s[::2] + +``reindex`` can be called with another ``MultiIndex`` or even a list or array +of tuples: + +.. ipython:: python + + s.reindex(index[:3]) + s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) + +.. _indexing.advanced_hierarchical: + +Advanced indexing with hierarchical index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc/.ix`` is a +bit challenging, but we've made every effort to do so. for example the +following works as you would expect: + +.. ipython:: python + + df = df.T + df + df.loc['bar'] + df.loc['bar', 'two'] + +"Partial" slicing also works quite nicely. + +.. ipython:: python + + df.loc['baz':'foo'] + +You can slice with a 'range' of values, by providing a slice of tuples. + +.. ipython:: python + + df.loc[('baz', 'two'):('qux', 'one')] + df.loc[('baz', 'two'):'foo'] + +Passing a list of labels or tuples works similar to reindexing: + +.. ipython:: python + + df.ix[[('bar', 'two'), ('qux', 'one')]] + +.. _indexing.mi_slicers: + +Multiindexing using slicers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +In 0.14.0 we added a new way to slice multi-indexed objects. +You can slice a multi-index by providing multiple indexers. + +You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label `, +including slices, lists of labels, labels, and boolean indexers. + +You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the +*deeper* levels, they will be implied as ``slice(None)``. + +As usual, **both sides** of the slicers are included as this is label indexing. + +.. warning:: + + You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and + for the **columns**. Their are some ambiguous cases where the passed indexer could be mis-interpreted + as indexing *both* axes, rather than into say the MuliIndex for the rows. + + You should do this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....),:] + + rather than this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....)] + +.. warning:: + + You will need to make sure that the selection axes are fully lexsorted! + +.. ipython:: python + + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + miindex = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + micolumns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + dfmi = DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))), + index=miindex, + columns=micolumns).sortlevel().sortlevel(axis=1) + dfmi + +Basic multi-index slicing using slices, lists, and labels. + +.. 
ipython:: python + + dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +It is possible to perform quite complicated selections using this method on multiple +axes at the same time. + +.. ipython:: python + + dfmi.loc['A1',(slice(None),'foo')] + dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +Using a boolean indexer you can provide selection related to the *values*. + +.. ipython:: python + + mask = dfmi[('a','foo')]>200 + dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + +You can also specify the ``axis`` argument to ``.loc`` to interpret the passed +slicers on a single axis. + +.. ipython:: python + + dfmi.loc(axis=0)[:,:,['C1','C3']] + +Furthermore you can *set* the values using these methods + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc(axis=0)[:,:,['C1','C3']] = -10 + df2 + +You can use a right-hand-side of an alignable object as well. + +.. ipython:: python + + df2 = dfmi.copy() + df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 + df2 + +.. _indexing.xs: + +Cross-section with hierarchical index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``xs`` method of ``DataFrame`` additionally takes a level argument to make +selecting data at a particular level of a MultiIndex easier. + +.. ipython:: python + + df.xs('one', level='second') + +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[(slice(None),'one'),:] + +You can also select on the columns with :meth:`~pandas.MultiIndex.xs`, by +providing the axis argument + +.. ipython:: python + + df = df.T + df.xs('one', level='second', axis=1) + +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,(slice(None),'one')] + +:meth:`~pandas.MultiIndex.xs` also allows selection with multiple keys + +.. ipython:: python + + df.xs(('one', 'bar'), level=('second', 'first'), axis=1) + +.. ipython:: python + + # using the slicers (new in 0.14.0) + df.loc[:,('bar','one')] + +.. versionadded:: 0.13.0 + +You can pass ``drop_level=False`` to :meth:`~pandas.MultiIndex.xs` to retain +the level that was selected + +.. ipython:: python + + df.xs('one', level='second', axis=1, drop_level=False) + +versus the result with ``drop_level=True`` (the default value) + +.. ipython:: python + + df.xs('one', level='second', axis=1, drop_level=True) + +.. ipython:: python + :suppress: + + df = df.T + +.. _indexing.advanced_reindex: + +Advanced reindexing and alignment with hierarchical index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The parameter ``level`` has been added to the ``reindex`` and ``align`` methods +of pandas objects. This is useful to broadcast values across a level. For +instance: + +.. ipython:: python + + midx = MultiIndex(levels=[['zero', 'one'], ['x','y']], + labels=[[1,1,0,0],[1,0,1,0]]) + df = DataFrame(randn(4,2), index=midx) + print(df) + df2 = df.mean(level=0) + print(df2) + print(df2.reindex(df.index, level=0)) + df_aligned, df2_aligned = df.align(df2, level=0) + print(df_aligned) + print(df2_aligned) + + +The need for sortedness with :class:`~pandas.MultiIndex` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Caveat emptor**: the present implementation of ``MultiIndex`` requires that +the labels be sorted for some of the slicing / indexing routines to work +correctly. 
You can think about breaking the axis into unique groups, where at +the hierarchical level of interest, each distinct group shares a label, but no +two have the same label. However, the ``MultiIndex`` does not enforce this: +**you are responsible for ensuring that things are properly sorted**. There is +an important new method ``sortlevel`` to sort an axis within a ``MultiIndex`` +so that its labels are grouped and sorted by the original ordering of the +associated factor at that level. Note that this does not necessarily mean the +labels will be sorted lexicographically! + +.. ipython:: python + + import random; random.shuffle(tuples) + s = Series(randn(8), index=MultiIndex.from_tuples(tuples)) + s + s.sortlevel(0) + s.sortlevel(1) + +.. _indexing.sortlevel_byname: + +Note, you may also pass a level name to ``sortlevel`` if the MultiIndex levels +are named. + +.. ipython:: python + + s.index.set_names(['L1', 'L2'], inplace=True) + s.sortlevel(level='L1') + s.sortlevel(level='L2') + +Some indexing will work even if the data are not sorted, but will be rather +inefficient and will also return a copy of the data rather than a view: + +.. ipython:: python + + s['qux'] + s.sortlevel(1)['qux'] + +On higher dimensional objects, you can sort any of the other axes by level if +they have a MultiIndex: + +.. ipython:: python + + df.T.sortlevel(1, axis=1) + +The ``MultiIndex`` object has code to **explicity check the sort depth**. Thus, +if you try to index at a depth at which the index is not sorted, it will raise +an exception. Here is a concrete example to illustrate this: + +.. ipython:: python + + tuples = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')] + idx = MultiIndex.from_tuples(tuples) + idx.lexsort_depth + + reordered = idx[[1, 0, 3, 2]] + reordered.lexsort_depth + + s = Series(randn(4), index=reordered) + s.ix['a':'a'] + +However: + +:: + + >>> s.ix[('a', 'b'):('b', 'a')] + Traceback (most recent call last) + ... + KeyError: Key length (3) was greater than MultiIndex lexsort depth (2) + +Swapping levels with :meth:`~pandas.MultiIndex.swaplevel` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``swaplevel`` function can switch the order of two levels: + +.. ipython:: python + + df[:5] + df[:5].swaplevel(0, 1, axis=0) + +.. _indexing.reorderlevels: + +Reordering levels with :meth:`~pandas.MultiIndex.reorder_levels` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``reorder_levels`` function generalizes the ``swaplevel`` function, +allowing you to permute the hierarchical index levels in one step: + +.. ipython:: python + + df[:5].reorder_levels([1,0], axis=0) + + +Some gory internal details +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Internally, the ``MultiIndex`` consists of a few things: the **levels**, the +integer **labels**, and the level **names**: + +.. ipython:: python + + index + index.levels + index.labels + index.names + +You can probably guess that the labels determine which unique element is +identified with that location at each layer of the index. It's important to +note that sortedness is determined **solely** from the integer labels and does +not check (or care) whether the levels themselves are sorted. Fortunately, the +constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but +if you compute the levels and labels yourself, please be careful. + + +Setting index metadata (``name(s)``, ``levels``, ``labels``) +------------------------------------------------------------ + +.. versionadded:: 0.13.0 + +.. 
_indexing.set_metadata: + +Indexes are "mostly immutable", but it is possible to set and change their +metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and +``labels``). + +You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_labels`` +to set these attributes directly. They default to returning a copy; however, +you can specify ``inplace=True`` to have the data change inplace. + +.. ipython:: python + + ind = Index([1, 2, 3]) + ind.rename("apple") + ind + ind.set_names(["apple"], inplace=True) + ind.name = "bob" + ind + +Adding an index to an existing DataFrame +---------------------------------------- + +Occasionally you will load or create a data set into a DataFrame and want to +add an index after you've already done so. There are a couple of different +ways. + +Add an index using DataFrame columns +------------------------------------ + +.. _indexing.set_index: + +DataFrame has a ``set_index`` method which takes a column name (for a regular +``Index``) or a list of column names (for a ``MultiIndex``), to create a new, +indexed DataFrame: + +.. ipython:: python + :suppress: + + data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'], + 'b' : ['one', 'two', 'one', 'two'], + 'c' : ['z', 'y', 'x', 'w'], + 'd' : [1., 2., 3, 4]}) + +.. ipython:: python + + data + indexed1 = data.set_index('c') + indexed1 + indexed2 = data.set_index(['a', 'b']) + indexed2 + +The ``append`` keyword option allow you to keep the existing index and append +the given columns to a MultiIndex: + +.. ipython:: python + + frame = data.set_index('c', drop=False) + frame = frame.set_index(['a', 'b'], append=True) + frame + +Other options in ``set_index`` allow you not drop the index columns or to add +the index in-place (without creating a new object): + +.. ipython:: python + + data.set_index('c', drop=False) + data.set_index(['a', 'b'], inplace=True) + data + +Remove / reset the index, ``reset_index`` +------------------------------------------ + +As a convenience, there is a new function on DataFrame called ``reset_index`` +which transfers the index values into the DataFrame's columns and sets a simple +integer index. This is the inverse operation to ``set_index`` + +.. ipython:: python + + data + data.reset_index() + +The output is more similar to a SQL table or a record array. The names for the +columns derived from the index are the ones stored in the ``names`` attribute. + +You can use the ``level`` keyword to remove only a portion of the index: + +.. ipython:: python + + frame + frame.reset_index(level=1) + + +``reset_index`` takes an optional parameter ``drop`` which if true simply +discards the index, instead of putting index values in the DataFrame's columns. + +.. note:: + + The ``reset_index`` method used to be called ``delevel`` which is now + deprecated. + +Adding an ad hoc index +---------------------- + +If you create an index yourself, you can just assign it to the ``index`` field: + +.. code-block:: python + + data.index = index + +Indexing internal details +------------------------- + +.. note:: + + The following is largely relevant for those actually working on the pandas + codebase. The source code is still the best place to look at the specifics + of how things are implemented. + +In pandas there are a few objects implemented which can serve as valid +containers for the axis labels: + + - ``Index``: the generic "ordered set" object, an ndarray of object dtype + assuming nothing about its contents. The labels must be hashable (and + likely immutable) and unique. 
Populates a dict of label to location in + Cython to do :math:`O(1)` lookups. + - ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer + data, such as time stamps + - ``MultiIndex``: the standard hierarchical index object + - ``PeriodIndex``: An Index object with Period elements + - ``DatetimeIndex``: An Index object with Timestamp elements + - ``date_range``: fixed frequency date range generated from a time rule or + DateOffset. An ndarray of Python datetime objects + +The motivation for having an ``Index`` class in the first place was to enable +different implementations of indexing. This means that it's possible for you, +the user, to implement a custom ``Index`` subclass that may be better suited to +a particular application than the ones provided in pandas. + +From an internal implementation point of view, the relevant methods that an +``Index`` must define are one or more of the following (depending on how +incompatible the new object internals are with the ``Index`` functions): + + - ``get_loc``: returns an "indexer" (an integer, or in some cases a + slice object) for a label + - ``slice_locs``: returns the "range" to slice between two labels + - ``get_indexer``: Computes the indexing vector for reindexing / data + alignment purposes. See the source / docstrings for more on this + - ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data + alignment purposes when the index is non-unique. See the source / docstrings + for more on this + - ``reindex``: Does any pre-conversion of the input index then calls + ``get_indexer`` + - ``union``, ``intersection``: computes the union or intersection of two + Index objects + - ``insert``: Inserts a new label into an Index, yielding a new object + - ``delete``: Delete a label, yielding a new object + - ``drop``: Deletes a set of labels + - ``take``: Analogous to ndarray.take diff --git a/doc/source/install.rst b/doc/source/install.rst new file mode 100644 index 00000000..fe56b53d --- /dev/null +++ b/doc/source/install.rst @@ -0,0 +1,239 @@ +.. _install: + +.. currentmodule:: pandas + +************ +Installation +************ + +You have the option to install an `official release +`__ or to build the `development version +`__. If you choose to install from source and +are running Windows, you will have to ensure that you have a compatible C +compiler (MinGW or Visual Studio) installed. `How-to install MinGW on Windows +`__ + +Python version support +~~~~~~~~~~~~~~~~~~~~~~ + +Officially Python 2.6, 2.7, 3.2, 3.3, and 3.4. + + +Binary installers +~~~~~~~~~~~~~~~~~ + +.. _all-platforms: + +All platforms +_____________ + +Stable installers available on `PyPI `__ + +Preliminary builds and installers on the `pandas download page `__ . + +Overview +___________ + + + +.. 
csv-table:: + :header: "Platform", "Distribution", "Status", "Download / Repository Link", "Install method" + :widths: 10, 10, 10, 20, 50 + + + Windows, all, stable, :ref:`all-platforms`, ``pip install pandas`` + Mac, all, stable, :ref:`all-platforms`, ``pip install pandas`` + Linux, Debian, stable, `official Debian repository `__ , ``sudo apt-get install python-pandas`` + Linux, Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python-pandas`` + Linux, Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python-pandas`` + Linux, Ubuntu, unstable (daily builds), `PythonXY PPA `__; activate by: ``sudo add-apt-repository ppa:pythonxy/pythonxy-devel && sudo apt-get update``, ``sudo apt-get install python-pandas`` + Linux, OpenSuse & Fedora, stable, `OpenSuse Repository `__ , ``zypper in python-pandas`` + + + + + + + + + + +Dependencies +~~~~~~~~~~~~ + + * `NumPy `__: 1.6.1 or higher + * `python-dateutil `__ 1.5 + * `pytz `__ + * Needed for time zone support + +.. _install.recommended_dependencies: + +Recommended Dependencies +~~~~~~~~~~~~~~~~~~~~~~~~ + + * `numexpr `__: for accelerating certain numerical operations. + ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. + + * `bottleneck `__: for accelerating certain types of ``nan`` + evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. + +.. note:: + + You are highly encouraged to install these libraries, as they provide large speedups, especially + if working with large data sets. + + +.. _install.optional_dependencies: + +Optional Dependencies +~~~~~~~~~~~~~~~~~~~~~ + + * `Cython `__: Only necessary to build development + version. Version 0.17.1 or higher. + * `SciPy `__: miscellaneous statistical functions + * `PyTables `__: necessary for HDF5-based storage + * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. + * `matplotlib `__: for plotting + * `statsmodels `__ + * Needed for parts of :mod:`pandas.stats` + * `openpyxl `__, `xlrd/xlwt `__ + * openpyxl version 1.6.1 or higher, but lower than 2.0.0 + * Needed for Excel I/O + * `XlsxWriter `__ + * Alternative Excel writer. + * `boto `__: necessary for Amazon S3 + access. + * One of `PyQt4 + `__, `PySide + `__, `pygtk + `__, `xsel + `__, or `xclip + `__: necessary to use + :func:`~pandas.io.clipboard.read_clipboard`. Most package managers on Linux + distributions will have xclip and/or xsel immediately available for + installation. + * Google's `python-gflags` and `google-api-python-client` + * Needed for :mod:`~pandas.io.gbq` + * `httplib2` + * Needed for :mod:`~pandas.io.gbq` + * One of the following combinations of libraries is needed to use the + top-level :func:`~pandas.io.html.read_html` function: + + * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is + okay.) + * `BeautifulSoup4`_ and `lxml`_ + * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ + * Only `lxml`_, although see :ref:`HTML reading gotchas ` + for reasons as to why you should probably **not** take this approach. + + .. warning:: + + * if you install `BeautifulSoup4`_ you must install either + `lxml`_ or `html5lib`_ or both. + :func:`~pandas.io.html.read_html` will **not** work with *only* + `BeautifulSoup4`_ installed. + * You are highly encouraged to read :ref:`HTML reading gotchas + `. 
It explains issues surrounding the installation and + usage of the above three libraries + * You may need to install an older version of `BeautifulSoup4`_: + - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and + 32-bit Ubuntu/Debian + * Additionally, if you're using `Anaconda`_ you should definitely + read :ref:`the gotchas about HTML parsing libraries ` + + .. note:: + + * if you're on a system with ``apt-get`` you can do + + .. code-block:: sh + + sudo apt-get build-dep python-lxml + + to get the necessary dependencies for installation of `lxml`_. This + will prevent further headaches down the line. + + +.. _html5lib: https://github.com/html5lib/html5lib-python +.. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup +.. _lxml: http://lxml.de +.. _Anaconda: https://store.continuum.io/cshop/anaconda + +.. note:: + + Without the optional dependencies, many useful features will not + work. Hence, it is highly recommended that you install these. A packaged + distribution like `Enthought Canopy + `__ may be worth considering. + +Installing from source +~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + + Installing from the git repository requires a recent installation of `Cython + `__ as the cythonized C sources are no longer checked + into source control. Released source distributions will contain the built C + files. I recommend installing the latest Cython via ``easy_install -U + Cython`` + +The source code is hosted at http://github.com/pydata/pandas, it can be checked +out using git and compiled / installed like so: + +:: + + git clone git://github.com/pydata/pandas.git + cd pandas + python setup.py install + +Make sure you have Cython installed when installing from the repository, +rather then a tarball or pypi. + +On Windows, I suggest installing the MinGW compiler suite following the +directions linked to above. Once configured property, run the following on the +command line: + +:: + + python setup.py build --compiler=mingw32 + python setup.py install + +Note that you will not be able to import pandas if you open an interpreter in +the source directory unless you build the C extensions in place: + +:: + + python setup.py build_ext --inplace + +The most recent version of MinGW (any installer dated after 2011-08-03) +has removed the '-mno-cygwin' option but Distutils has not yet been updated to +reflect that. Thus, you may run into an error like "unrecognized command line +option '-mno-cygwin'". Until the bug is fixed in Distutils, you may need to +install a slightly older version of MinGW (2011-08-02 installer). + +Running the test suite +~~~~~~~~~~~~~~~~~~~~~~ + +pandas is equipped with an exhaustive set of unit tests covering about 97% of +the codebase as of this writing. To run it on your machine to verify that +everything is working (and you have all of the dependencies, soft and hard, +installed), make sure you have `nose +`__ and run: + +:: + + $ nosetests pandas + .......................................................................... + .......................S.................................................. + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... 
+ .......................................................................... + .......................................................................... + .................S........................................................ + .... + ---------------------------------------------------------------------- + Ran 818 tests in 21.631s + + OK (SKIP=2) diff --git a/doc/source/io.rst b/doc/source/io.rst new file mode 100644 index 00000000..cfa97ca0 --- /dev/null +++ b/doc/source/io.rst @@ -0,0 +1,3662 @@ +.. _io: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import os + import csv + from pandas.compat import StringIO, BytesIO + import pandas as pd + ExcelWriter = pd.ExcelWriter + + import numpy as np + np.random.seed(123456) + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + + import matplotlib.pyplot as plt + plt.close('all') + + from pandas import * + options.display.max_rows=15 + import pandas.util.testing as tm + clipdf = DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, + index=['x','y','z']) + +******************************* +IO Tools (Text, CSV, HDF5, ...) +******************************* + +The pandas I/O api is a set of top level ``reader`` functions accessed like ``pd.read_csv()`` that generally return a ``pandas`` +object. + + * :ref:`read_csv` + * :ref:`read_excel` + * :ref:`read_hdf` + * :ref:`read_sql` + * :ref:`read_json` + * :ref:`read_msgpack` (experimental) + * :ref:`read_html` + * :ref:`read_gbq` (experimental) + * :ref:`read_stata` + * :ref:`read_clipboard` + * :ref:`read_pickle` + +The corresponding ``writer`` functions are object methods that are accessed like ``df.to_csv()`` + + * :ref:`to_csv` + * :ref:`to_excel` + * :ref:`to_hdf` + * :ref:`to_sql` + * :ref:`to_json` + * :ref:`to_msgpack` (experimental) + * :ref:`to_html` + * :ref:`to_gbq` (experimental) + * :ref:`to_stata` + * :ref:`to_clipboard` + * :ref:`to_pickle` + +:ref:`Here ` is an informal performance comparison for some of these IO methods. + +.. note:: + For examples that use the ``StringIO`` class, make sure you import it + according to your Python version, i.e. ``from StringIO import StringIO`` for + Python 2 and ``from io import StringIO`` for Python 3. + +.. _io.read_csv_table: + +CSV & Text files +---------------- + +The two workhorse functions for reading text files (a.k.a. flat files) are +:func:`~pandas.io.parsers.read_csv` and :func:`~pandas.io.parsers.read_table`. +They both use the same parsing code to intelligently convert tabular +data into a DataFrame object. See the :ref:`cookbook` +for some advanced strategies + +They can take a number of arguments: + + - ``filepath_or_buffer``: Either a string path to a file, url + (including http, ftp, and s3 locations), or any object with a ``read`` + method (such as an open file or ``StringIO``). + - ``sep`` or ``delimiter``: A delimiter / separator to split fields + on. `read_csv` is capable of inferring the delimiter automatically in some + cases by "sniffing." The separator may be specified as a regular + expression; for instance you may use '\|\\s*' to indicate a pipe plus + arbitrary whitespace. + - ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file + (much faster than using a regular expression) + - ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly. + - ``dialect``: string or :class:`python:csv.Dialect` instance to expose more + ways to specify the file format + - ``dtype``: A data type name or a dict of column name to data type. 
If not + specified, data types will be inferred. (Unsupported with + ``engine='python'``) + - ``header``: row number(s) to use as the column names, and the start of the + data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly + pass ``header=0`` to be able to replace existing names. The header can be + a list of integers that specify row locations for a multi-index on the columns + E.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example are skipped). Note that this parameter + ignores commented lines, so header=0 denotes the first line of + data rather than the first line of the file. + - ``skiprows``: A collection of numbers for rows in the file to skip. Can + also be an integer to skip the first ``n`` rows + - ``index_col``: column number, column name, or list of column numbers/names, + to use as the ``index`` (row labels) of the resulting DataFrame. By default, + it will number the rows without using any column, unless there is one more + data column than there are headers, in which case the first column is taken + as the index. + - ``names``: List of column names to use as column names. To replace header + existing in file, explicitly pass ``header=0``. + - ``na_values``: optional list of strings to recognize as NaN (missing + values), either in addition to or in lieu of the default set. + - ``true_values``: list of strings to recognize as ``True`` + - ``false_values``: list of strings to recognize as ``False`` + - ``keep_default_na``: whether to include the default set of missing values + in addition to the ones specified in ``na_values`` + - ``parse_dates``: if True then index will be parsed as dates + (False by default). You can specify more complicated options to parse + a subset of columns or a combination of columns into a single date column + (list of ints or names, list of lists, or dict) + [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column + [[1, 3]] -> combine columns 1 and 3 and parse as a single date column + {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' + - ``keep_date_col``: if True, then date component columns passed into + ``parse_dates`` will be retained in the output (False by default). + - ``date_parser``: function to use to parse strings into datetime + objects. If ``parse_dates`` is True, it defaults to the very robust + ``dateutil.parser``. Specifying this implicitly sets ``parse_dates`` as True. + You can also use functions from community supported date converters from + date_converters.py + - ``dayfirst``: if True then uses the DD/MM international/European date format + (This is False by default) + - ``thousands``: specifies the thousands separator. If not None, this character will + be stripped from numeric dtypes. However, if it is the first character in a field, + that column will be imported as a string. In the PythonParser, if not None, + then parser will try to look for it in the output and parse relevant data to numeric + dtypes. Because it has to essentially scan through the data again, this causes a + significant performance hit so only use if necessary. + - ``lineterminator`` : string (length 1), default ``None``, Character to break file into lines. Only valid with C parser + - ``quotechar`` : string, The character to used to denote the start and end of a quoted item. + Quoted items can include the delimiter and it will be ignored. + - ``quoting`` : int, + Controls whether quotes should be recognized. Values are taken from `csv.QUOTE_*` values. 
+ Acceptable values are 0, 1, 2, and 3 for QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONE, and QUOTE_NONNUMERIC, respectively. + - ``skipinitialspace`` : boolean, default ``False``, Skip spaces after delimiter + - ``escapechar`` : string, to specify how to escape quoted data + - ``comment``: Indicates remainder of line should not be parsed. If found at the + beginning of a line, the line will be ignored altogether. This parameter + must be a single character. Also, fully commented lines + are ignored by the parameter `header` but not by `skiprows`. For example, + if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will + result in '1,2,3' being treated as the header. + - ``nrows``: Number of rows to read out of the file. Useful to only read a + small portion of a large file + - ``iterator``: If True, return a ``TextFileReader`` to enable reading a file + into memory piece by piece + - ``chunksize``: An number of rows to be used to "chunk" a file into + pieces. Will cause an ``TextFileReader`` object to be returned. More on this + below in the section on :ref:`iterating and chunking ` + - ``skip_footer``: number of lines to skip at bottom of file (default 0) + (Unsupported with ``engine='c'``) + - ``converters``: a dictionary of functions for converting values in certain + columns, where keys are either integers or column labels + - ``encoding``: a string representing the encoding to use for decoding + unicode data, e.g. ``'utf-8``` or ``'latin-1'``. + - ``verbose``: show number of NA values inserted in non-numeric columns + - ``squeeze``: if True then output with only one column is turned into Series + - ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines ` + - ``usecols``: a subset of columns to return, results in much faster parsing + time and lower memory usage. + - ``mangle_dupe_cols``: boolean, default True, then duplicate columns will be specified + as 'X.0'...'X.N', rather than 'X'...'X' + - ``tupleize_cols``: boolean, default False, if False, convert a list of tuples + to a multi-index of columns, otherwise, leave the column index as a list of tuples + +.. ipython:: python + :suppress: + + f = open('foo.csv','w') + f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + +Consider a typical CSV file containing, in this case, some time series data: + +.. ipython:: python + + print(open('foo.csv').read()) + +The default for `read_csv` is to create a DataFrame with simple numbered rows: + +.. ipython:: python + + pd.read_csv('foo.csv') + +In the case of indexed data, you can pass the column number or column name you +wish to use as the index: + +.. ipython:: python + + pd.read_csv('foo.csv', index_col=0) + +.. ipython:: python + + pd.read_csv('foo.csv', index_col='date') + +You can also use a list of columns to create a hierarchical index: + +.. ipython:: python + + pd.read_csv('foo.csv', index_col=[0, 'A']) + +.. _io.dialect: + +The ``dialect`` keyword gives greater flexibility in specifying the file format. +By default it uses the Excel dialect but you can specify either the dialect name +or a :class:`python:csv.Dialect` instance. + +.. ipython:: python + :suppress: + + data = ('label1,label2,label3\n' + 'index1,"a,c,e\n' + 'index2,b,d,f') + +Suppose you had data with unenclosed quotes: + +.. 
ipython:: python + + print(data) + +By default, ``read_csv`` uses the Excel dialect and treats the double quote as +the quote character, which causes it to fail when it finds a newline before it +finds the closing double quote. + +We can get around this using ``dialect`` + +.. ipython:: python + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + pd.read_csv(StringIO(data), dialect=dia) + +All of the dialect options can be specified separately by keyword arguments: + +.. ipython:: python + + data = 'a,b,c~1,2,3~4,5,6' + pd.read_csv(StringIO(data), lineterminator='~') + +Another common dialect option is ``skipinitialspace``, to skip any whitespace +after a delimiter: + +.. ipython:: python + + data = 'a, b, c\n1, 2, 3\n4, 5, 6' + print(data) + pd.read_csv(StringIO(data), skipinitialspace=True) + +Moreover, ``read_csv`` ignores any completely commented lines: + +.. ipython:: python + + data = 'a,b,c\n# commented line\n1,2,3\n#another comment\n4,5,6' + print(data) + pd.read_csv(StringIO(data), comment='#') + +.. note:: + + The presence of ignored lines might create ambiguities involving line numbers; + the parameter ``header`` uses row numbers (ignoring commented + lines), while ``skiprows`` uses line numbers (including commented lines): + + .. ipython:: python + + data = '#comment\na,b,c\nA,B,C\n1,2,3' + pd.read_csv(StringIO(data), comment='#', header=1) + data = 'A,B,C\n#comment\na,b,c\n1,2,3' + pd.read_csv(StringIO(data), comment='#', skiprows=2) + +The parsers make every attempt to "do the right thing" and not be very +fragile. Type inference is a pretty big deal. So if a column can be coerced to +integer dtype without altering the contents, it will do so. Any non-numeric +columns will come through as object dtype as with the rest of pandas objects. + +.. _io.dtypes: + +Specifying column data types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Starting with v0.10, you can indicate the data type for the whole DataFrame or +individual columns: + +.. ipython:: python + + data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + print(data) + + df = pd.read_csv(StringIO(data), dtype=object) + df + df['a'][0] + df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64}) + df.dtypes + +.. note:: + The ``dtype`` option is currently only supported by the C engine. + Specifying ``dtype`` with ``engine`` other than 'c' raises a + ``ValueError``. + +.. _io.headers: + +Handling column names +~~~~~~~~~~~~~~~~~~~~~ + +A file may or may not have a header row. pandas assumes the first row should be +used as the column names: + +.. ipython:: python + + data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + print(data) + pd.read_csv(StringIO(data)) + +By specifying the ``names`` argument in conjunction with ``header`` you can +indicate other names to use and whether or not to throw away the header row (if +any): + +.. ipython:: python + + print(data) + pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0) + pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None) + +If the header is in a row other than the first, pass the row number to +``header``. This will skip the preceding rows: + +.. ipython:: python + + data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' + pd.read_csv(StringIO(data), header=1) + +.. _io.usecols: + +Filtering columns (``usecols``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``usecols`` argument allows you to select any subset of the columns in a +file, either using the column names or position numbers: + +.. 
ipython:: python + + data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz' + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), usecols=['b', 'd']) + pd.read_csv(StringIO(data), usecols=[0, 2, 3]) + +.. _io.unicode: + +Dealing with Unicode Data +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``encoding`` argument should be used for encoded unicode data, which will +result in byte strings being decoded to unicode in the result: + +.. ipython:: python + + data = b'word,length\nTr\xc3\xa4umen,7\nGr\xc3\xbc\xc3\x9fe,5'.decode('utf8').encode('latin-1') + df = pd.read_csv(BytesIO(data), encoding='latin-1') + df + df['word'][1] + +Some formats which encode all characters as multiple bytes, like UTF-16, won't +parse correctly at all without specifying the encoding. + +.. _io.index_col: + +Index columns and trailing delimiters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a file has one more column of data than the number of column names, the +first column will be used as the DataFrame's row names: + +.. ipython:: python + + data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + pd.read_csv(StringIO(data)) + +.. ipython:: python + + data = 'index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + pd.read_csv(StringIO(data), index_col=0) + +Ordinarily, you can achieve this behavior using the ``index_col`` option. + +There are some exception cases when a file has been prepared with delimiters at +the end of each data line, confusing the parser. To explicitly disable the +index column inference and discard the last column, pass ``index_col=False``: + +.. ipython:: python + + data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + print(data) + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), index_col=False) + +.. _io.parse_dates: + +Specifying Date Columns +~~~~~~~~~~~~~~~~~~~~~~~ + +To better facilitate working with datetime data, +:func:`~pandas.io.parsers.read_csv` and :func:`~pandas.io.parsers.read_table` +uses the keyword arguments ``parse_dates`` and ``date_parser`` to allow users +to specify a variety of columns and date/time formats to turn the input text +data into ``datetime`` objects. + +The simplest case is to just pass in ``parse_dates=True``: + +.. ipython:: python + + # Use a column as an index, and parse it as dates. + df = pd.read_csv('foo.csv', index_col=0, parse_dates=True) + df + + # These are python datetime objects + df.index + +It is often the case that we may want to store date and time data separately, +or store various date fields separately. the ``parse_dates`` keyword can be +used to specify a combination of columns to parse the dates and/or times from. + +You can specify a list of column lists to ``parse_dates``, the resulting date +columns will be prepended to the output (so as to not affect the existing column +order) and the new column names will be the concatenation of the component +column names: + +.. ipython:: python + :suppress: + + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print(open('tmp.csv').read()) + df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]]) + df + +By default the parser removes the component date columns, but you can choose +to retain them via the ``keep_date_col`` keyword: + +.. 
ipython:: python + + df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]], + keep_date_col=True) + df + +Note that if you wish to combine multiple columns into a single date column, a +nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that +the second and third columns should each be parsed as separate date columns +while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a +single column. + +You can also use a dict to specify custom name columns: + +.. ipython:: python + + date_spec = {'nominal': [1, 2], 'actual': [1, 3]} + df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec) + df + +It is important to remember that if multiple text columns are to be parsed into +a single date column, then a new column is prepended to the data. The `index_col` +specification is based off of this new set of columns rather than the original +data columns: + + +.. ipython:: python + + date_spec = {'nominal': [1, 2], 'actual': [1, 3]} + df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, + index_col=0) #index is the nominal column + df + +.. note:: + read_csv has a fast_path for parsing datetime strings in iso8601 format, + e.g "2000-01-01T00:01:02+00:00" and similar variations. If you can arrange + for your data to store datetimes in this format, load times will be + significantly faster, ~20x has been observed. + + +.. note:: + + When passing a dict as the `parse_dates` argument, the order of + the columns prepended is not guaranteed, because `dict` objects do not impose + an ordering on their keys. On Python 2.7+ you may use `collections.OrderedDict` + instead of a regular `dict` if this matters to you. Because of this, when using a + dict for 'parse_dates' in conjunction with the `index_col` argument, it's best to + specify `index_col` as a column label rather then as an index on the resulting frame. + + +Date Parsing Functions +~~~~~~~~~~~~~~~~~~~~~~ +Finally, the parser allows you can specify a custom ``date_parser`` function to +take full advantage of the flexiblity of the date parsing API: + +.. ipython:: python + + import pandas.io.date_converters as conv + df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, + date_parser=conv.parse_date_time) + df + +You can explore the date parsing functionality in ``date_converters.py`` and +add your own. We would love to turn this module into a community supported set +of date/time parsers. To get you started, ``date_converters.py`` contains +functions to parse dual date and time columns, year/month/day columns, +and year/month/day/hour/minute/second columns. It also contains a +``generic_parser`` function so you can curry it with a function that deals with +a single date rather than the entire array. + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +.. _io.dayfirst: + + +Inferring Datetime Format +~~~~~~~~~~~~~~~~~~~~~~~~~ +If you have ``parse_dates`` enabled for some or all of your columns, and your +datetime strings are all formatted the same way, you may get a large speed +up by setting ``infer_datetime_format=True``. If set, pandas will attempt +to guess the format of your datetime strings, and then use a faster means +of parsing the strings. 5-10x parsing speeds have been observed. pandas +will fallback to the usual parsing if either the format cannot be guessed +or the format that was guessed cannot properly parse the entire column +of strings. So in general, ``infer_datetime_format`` should not have any +negative consequences if enabled. 
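+
+As a minimal, non-executed sketch (the file ``big_dates.csv`` and its
+``timestamp`` column are hypothetical, not files created in this document),
+the option is just an extra keyword passed alongside ``parse_dates``:
+
+.. code-block:: python
+
+   # hypothetical large file whose 'timestamp' column is uniformly formatted;
+   # pandas attempts to guess the format and then uses a faster parsing path
+   df = pd.read_csv('big_dates.csv', parse_dates=['timestamp'],
+                    infer_datetime_format=True)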
+ +Here are some examples of datetime strings that can be guessed (All +representing December 30th, 2011 at 00:00:00) + +- "20111230" +- "2011/12/30" +- "20111230 00:00:00" +- "12/30/2011 00:00:00" +- "30/Dec/2011 00:00:00" +- "30/December/2011 00:00:00" + +``infer_datetime_format`` is sensitive to ``dayfirst``. With +``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With +``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. + +.. ipython:: python + + # Try to infer the format for the index column + df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, + infer_datetime_format=True) + df + +.. ipython:: python + :suppress: + + os.remove('foo.csv') + +International Date Formats +~~~~~~~~~~~~~~~~~~~~~~~~~~ +While US date formats tend to be MM/DD/YYYY, many international formats use +DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: + +.. ipython:: python + :suppress: + + data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print(open('tmp.csv').read()) + + pd.read_csv('tmp.csv', parse_dates=[0]) + pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) + +.. _io.thousands: + +Thousand Separators +~~~~~~~~~~~~~~~~~~~ +For large numbers that have been written with a thousands separator, you can +set the ``thousands`` keyword to a string of length 1 so that integers will be parsed +correctly: + +.. ipython:: python + :suppress: + + data = ("ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +By default, numbers with a thousands separator will be parsed as strings + +.. ipython:: python + + print(open('tmp.csv').read()) + df = pd.read_csv('tmp.csv', sep='|') + df + + df.level.dtype + +The ``thousands`` keyword allows integers to be parsed correctly + +.. ipython:: python + + print(open('tmp.csv').read()) + df = pd.read_csv('tmp.csv', sep='|', thousands=',') + df + + df.level.dtype + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +.. _io.na_values: + +NA Values +~~~~~~~~~ + +To control which values are parsed as missing values (which are signified by ``NaN``), specifiy a +list of strings in ``na_values``. If you specify a number (a ``float``, like ``5.0`` or an ``integer`` like ``5``), +the corresponding equivalent values will also imply a missing value (in this case effectively +``[5.0,5]`` are recognized as ``NaN``. + +To completely override the default values that are recognized as missing, specify ``keep_default_na=False``. +The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A','N/A', 'NA', +'#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan']``. + +.. code-block:: python + + read_csv(path, na_values=[5]) + +the default values, in addition to ``5`` , ``5.0`` when interpreted as numbers are recognized as ``NaN`` + +.. code-block:: python + + read_csv(path, keep_default_na=False, na_values=[""]) + +only an empty field will be ``NaN`` + +.. code-block:: python + + read_csv(path, keep_default_na=False, na_values=["NA", "0"]) + +only ``NA`` and ``0`` as strings are ``NaN`` + +.. code-block:: python + + read_csv(path, na_values=["Nope"]) + +the default values, in addition to the string ``"Nope"`` are recognized as ``NaN`` + +.. _io.infinity: + +Infinity +~~~~~~~~ + +``inf`` like values will be parsed as ``np.inf`` (positive infinity), and ``-inf`` as ``-np.inf`` (negative infinity). 
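+
+For instance, a minimal sketch with hypothetical inline data:
+
+.. code-block:: python
+
+   from pandas.compat import StringIO
+
+   data = 'a,b\n1.0,inf\n2.0,-inf'
+   # column 'b' is parsed as [np.inf, -np.inf] rather than as strings
+   pd.read_csv(StringIO(data))
+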
+These will ignore the case of the value, meaning ``Inf``, will also be parsed as ``np.inf``. + + +.. _io.comments: + +Comments +~~~~~~~~ +Sometimes comments or meta data may be included in a file: + +.. ipython:: python + :suppress: + + data = ("ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print(open('tmp.csv').read()) + +By default, the parse includes the comments in the output: + +.. ipython:: python + + df = pd.read_csv('tmp.csv') + df + +We can suppress the comments using the ``comment`` keyword: + +.. ipython:: python + + df = pd.read_csv('tmp.csv', comment='#') + df + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +Returning Series +~~~~~~~~~~~~~~~~ + +Using the ``squeeze`` keyword, the parser will return output with a single column +as a ``Series``: + +.. ipython:: python + :suppress: + + data = ("level\n" + "Patient1,123000\n" + "Patient2,23000\n" + "Patient3,1234018") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print(open('tmp.csv').read()) + + output = pd.read_csv('tmp.csv', squeeze=True) + output + + type(output) + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +.. _io.boolean: + +Boolean values +~~~~~~~~~~~~~~ + +The common values ``True``, ``False``, ``TRUE``, and ``FALSE`` are all +recognized as boolean. Sometime you would want to recognize some other values +as being boolean. To do this use the ``true_values`` and ``false_values`` +options: + +.. ipython:: python + + data= 'a,b,c\n1,Yes,2\n3,No,4' + print(data) + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) + +.. _io.bad_lines: + +Handling "bad" lines +~~~~~~~~~~~~~~~~~~~~ + +Some files may have malformed lines with too few fields or too many. Lines with +too few fields will have NA values filled in the trailing fields. Lines with +too many will cause an error by default: + +.. ipython:: python + :suppress: + + data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + +.. code-block:: ipython + + In [27]: data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + + In [28]: pd.read_csv(StringIO(data)) + --------------------------------------------------------------------------- + CParserError Traceback (most recent call last) + CParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4 + +You can elect to skip bad lines: + +.. code-block:: ipython + + In [29]: pd.read_csv(StringIO(data), error_bad_lines=False) + Skipping line 3: expected 3 fields, saw 4 + + Out[29]: + a b c + 0 1 2 3 + 1 8 9 10 + +.. _io.quoting: + +Quoting and Escape Characters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Quotes (and other escape characters) in embedded fields can be handled in any +number of ways. One way is to use backslashes; to properly parse this data, you +should pass the ``escapechar`` option: + +.. ipython:: python + + data = 'a,b\n"hello, \\"Bob\\", nice to see you",5' + print(data) + pd.read_csv(StringIO(data), escapechar='\\') + +.. _io.fwf: + +Files with Fixed Width Columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +While ``read_csv`` reads delimited data, the :func:`~pandas.io.parsers.read_fwf` +function works with data files that have known and fixed column widths. 
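+
+As a quick preview (the file name ``fixed_width.txt`` is hypothetical; a full
+worked example follows below), a call using the ``widths`` shortcut described
+next might look like:
+
+.. code-block:: python
+
+   # widths give the size of each contiguous fixed-width field
+   df = pd.read_fwf('fixed_width.txt', widths=[6, 14, 13, 10], header=None)
+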
+The function parameters to ``read_fwf`` are largely the same as `read_csv` with +two extra parameters: + + - ``colspecs``: A list of pairs (tuples) giving the extents of the + fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try detecting + the column specifications from the first 100 rows of the data. Default + behaviour, if not specified, is to infer. + - ``widths``: A list of field widths which can be used instead of 'colspecs' + if the intervals are contiguous. + +.. ipython:: python + :suppress: + + f = open('bar.csv', 'w') + data1 = ("id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3") + f.write(data1) + f.close() + +Consider a typical fixed-width data file: + +.. ipython:: python + + print(open('bar.csv').read()) + +In order to parse this file into a DataFrame, we simply need to supply the +column specifications to the `read_fwf` function along with the file name: + +.. ipython:: python + + #Column specifications are a list of half-intervals + colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] + df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0) + df + +Note how the parser automatically picks column names X. when +``header=None`` argument is specified. Alternatively, you can supply just the +column widths for contiguous columns: + +.. ipython:: python + + #Widths are a list of integers + widths = [6, 14, 13, 10] + df = pd.read_fwf('bar.csv', widths=widths, header=None) + df + +The parser will take care of extra white spaces around the columns +so it's ok to have extra separation between the columns in the file. + +.. versionadded:: 0.13.0 + +By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the +first 100 rows of the file. It can do it only in cases when the columns are +aligned and correctly separated by the provided ``delimiter`` (default delimiter +is whitespace). + +.. ipython:: python + + df = pd.read_fwf('bar.csv', header=None, index_col=0) + df + +.. ipython:: python + :suppress: + + os.remove('bar.csv') + +Files with an "implicit" index column +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + :suppress: + + f = open('foo.csv', 'w') + f.write('A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + +Consider a file with one less entry in the header than the number of data +column: + +.. ipython:: python + + print(open('foo.csv').read()) + +In this special case, ``read_csv`` assumes that the first column is to be used +as the index of the DataFrame: + +.. ipython:: python + + pd.read_csv('foo.csv') + +Note that the dates weren't automatically parsed. In that case you would need +to do as before: + +.. ipython:: python + + df = pd.read_csv('foo.csv', parse_dates=True) + df.index + +.. ipython:: python + :suppress: + + os.remove('foo.csv') + + +Reading an index with a ``MultiIndex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _io.csv_multiindex: + +Suppose you have data indexed by two columns: + +.. ipython:: python + + print(open('data/mindex_ex.csv').read()) + +The ``index_col`` argument to ``read_csv`` and ``read_table`` can take a list of +column numbers to turn multiple columns into a ``MultiIndex`` for the index of the +returned object: + +.. ipython:: python + + df = pd.read_csv("data/mindex_ex.csv", index_col=[0,1]) + df + df.ix[1978] + +.. 
_io.multi_index_columns: + +Reading columns with a ``MultiIndex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By specifying list of row locations for the ``header`` argument, you +can read in a ``MultiIndex`` for the columns. Specifying non-consecutive +rows will skip the interveaning rows. In order to have the pre-0.13 behavior +of tupleizing columns, specify ``tupleize_cols=True``. + +.. ipython:: python + + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv('mi.csv') + print(open('mi.csv').read()) + pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1]) + +Starting in 0.13.0, ``read_csv`` will be able to interpret a more common format +of multi-columns indices. + +.. ipython:: python + :suppress: + + data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12" + fh = open('mi2.csv','w') + fh.write(data) + fh.close() + +.. ipython:: python + + print(open('mi2.csv').read()) + pd.read_csv('mi2.csv',header=[0,1],index_col=0) + +Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it +with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will be *lost*. + +.. ipython:: python + :suppress: + + import os + os.remove('mi.csv') + os.remove('mi2.csv') + +.. _io.sniff: + +Automatically "sniffing" the delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``read_csv`` is capable of inferring delimited (not necessarily +comma-separated) files. YMMV, as pandas uses the :class:`python:csv.Sniffer` +class of the csv module. + +.. ipython:: python + :suppress: + + df = DataFrame(np.random.randn(10, 4)) + df.to_csv('tmp.sv', sep='|') + df.to_csv('tmp2.sv', sep=':') + +.. ipython:: python + + print(open('tmp2.sv').read()) + pd.read_csv('tmp2.sv') + +.. _io.chunking: + +Iterating through files chunk by chunk +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose you wish to iterate through a (potentially very large) file lazily +rather than reading the entire file into memory, such as the following: + + +.. ipython:: python + + print(open('tmp.sv').read()) + table = pd.read_table('tmp.sv', sep='|') + table + + +By specifiying a ``chunksize`` to ``read_csv`` or ``read_table``, the return +value will be an iterable object of type ``TextFileReader``: + +.. ipython:: python + + reader = pd.read_table('tmp.sv', sep='|', chunksize=4) + reader + + for chunk in reader: + print(chunk) + + +Specifying ``iterator=True`` will also return the ``TextFileReader`` object: + +.. ipython:: python + + reader = pd.read_table('tmp.sv', sep='|', iterator=True) + reader.get_chunk(5) + +.. ipython:: python + :suppress: + + os.remove('tmp.sv') + os.remove('tmp2.sv') + +Specifying the parser engine +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Under the hood pandas uses a fast and efficient parser implemented in C as well +as a python implementation which is currently more feature-complete. Where +possible pandas uses the C parser (specified as ``engine='c'``), but may fall +back to python if C-unsupported options are specified. Currently, C-unsupported +options include: + +- ``sep`` other than a single character (e.g. regex separators) +- ``skip_footer`` +- ``sep=None`` with ``delim_whitespace=False`` + +Specifying any of the above options will produce a ``ParserWarning`` unless the +python engine is selected explicitly using ``engine='python'``. + +.. 
_io.store_in_csv: + +Writing to CSV format +~~~~~~~~~~~~~~~~~~~~~ + +The Series and DataFrame objects have an instance method ``to_csv`` which +allows storing the contents of the object as a comma-separated-values file. The +function takes a number of arguments. Only the first is required. + + - ``path_or_buf``: A string path to the file to write or a StringIO + - ``sep`` : Field delimiter for the output file (default ",") + - ``na_rep``: A string representation of a missing value (default '') + - ``float_format``: Format string for floating point numbers + - ``cols``: Columns to write (default None) + - ``header``: Whether to write out the column names (default True) + - ``index``: whether to write row (index) names (default True) + - ``index_label``: Column label(s) for index column(s) if desired. If None + (default), and `header` and `index` are True, then the index names are + used. (A sequence should be given if the DataFrame uses MultiIndex). + - ``mode`` : Python write mode, default 'w' + - ``encoding``: a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + - ``line_terminator``: Character sequence denoting line end (default '\\n') + - ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL) + - ``quotechar``: Character used to quote fields (default '"') + - ``doublequote``: Control quoting of ``quotechar`` in fields (default True) + - ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when + appropriate (default None) + - ``chunksize``: Number of rows to write at a time + - ``tupleize_cols``: If False (default), write as a list of tuples, otherwise + write in an expanded line format suitable for ``read_csv`` + - ``date_format``: Format string for datetime objects + +Writing a formatted string +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _io.formatting: + +The DataFrame object has an instance method ``to_string`` which allows control +over the string representation of the object. All arguments are optional: + + - ``buf`` default None, for example a StringIO object + - ``columns`` default None, which columns to write + - ``col_space`` default None, minimum width of each column. + - ``na_rep`` default ``NaN``, representation of NA value + - ``formatters`` default None, a dictionary (by column) of functions each of + which takes a single argument and returns a formatted string + - ``float_format`` default None, a function which takes a single (float) + argument and returns a formatted string; to be applied to floats in the + DataFrame. + - ``sparsify`` default True, set to False for a DataFrame with a hierarchical + index to print every multiindex key at each row. + - ``index_names`` default True, will print the names of the indices + - ``index`` default True, will print the index (ie, row labels) + - ``header`` default True, will print the column labels + - ``justify`` default ``left``, will print column headers left- or + right-justified + +The Series object also has a ``to_string`` method, but with only the ``buf``, +``na_rep``, ``float_format`` arguments. There is also a ``length`` argument +which, if set to ``True``, will additionally output the length of the Series. + +.. _io.json: + +JSON +---- + +Read and write ``JSON`` format files and strings. + +.. _io.json_writer: + +Writing JSON +~~~~~~~~~~~~ + +A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. 
Use ``to_json`` +with optional parameters: + +- ``path_or_buf`` : the pathname or buffer to write the output + This can be ``None`` in which case a JSON string is returned +- ``orient`` : + + Series : + - default is ``index`` + - allowed values are {``split``, ``records``, ``index``} + + DataFrame + - default is ``columns`` + - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``} + + The format of the JSON string + + .. csv-table:: + :widths: 20, 150 + :delim: ; + + ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} + ``records``; list like [{column -> value}, ... , {column -> value}] + ``index``; dict like {index -> {column -> value}} + ``columns``; dict like {column -> {index -> value}} + ``values``; just the values array + +- ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. +- ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. +- ``force_ascii`` : force encoded string to be ASCII, default True. +- ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. +- ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serialisable object. + +Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters. + +.. ipython:: python + + dfj = DataFrame(randn(5, 2), columns=list('AB')) + json = dfj.to_json() + json + +Orient Options +++++++++++++++ + +There are a number of different options for the format of the resulting JSON +file / string. Consider the following DataFrame and Series: + +.. ipython:: python + + dfjo = DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), + columns=list('ABC'), index=list('xyz')) + dfjo + sjo = Series(dict(x=15, y=16, z=17), name='D') + sjo + +**Column oriented** (the default for ``DataFrame``) serialises the data as +nested JSON objects with column labels acting as the primary index: + +.. ipython:: python + + dfjo.to_json(orient="columns") + # Not available for Series + +**Index oriented** (the default for ``Series``) similar to column oriented +but the index labels are now primary: + +.. ipython:: python + + dfjo.to_json(orient="index") + sjo.to_json(orient="index") + +**Record oriented** serialises the data to a JSON array of column -> value records, +index labels are not included. This is useful for passing DataFrame data to plotting +libraries, for example the JavaScript library d3.js: + +.. ipython:: python + + dfjo.to_json(orient="records") + sjo.to_json(orient="records") + +**Value oriented** is a bare-bones option which serialises to nested JSON arrays of +values only, column and index labels are not included: + +.. ipython:: python + + dfjo.to_json(orient="values") + # Not available for Series + +**Split oriented** serialises to a JSON object containing separate entries for +values, index and columns. Name is also included for ``Series``: + +.. ipython:: python + + dfjo.to_json(orient="split") + sjo.to_json(orient="split") + +.. note:: + + Any orient option that encodes to a JSON object will not preserve the ordering of + index and column labels during round-trip serialisation. 
If you wish to preserve + label ordering use the `split` option as it uses ordered containers. + +Date Handling ++++++++++++++ + +Writing in iso date format + +.. ipython:: python + + dfd = DataFrame(randn(5, 2), columns=list('AB')) + dfd['date'] = Timestamp('20130101') + dfd = dfd.sort_index(1, ascending=False) + json = dfd.to_json(date_format='iso') + json + +Writing in iso date format, with microseconds + +.. ipython:: python + + json = dfd.to_json(date_format='iso', date_unit='us') + json + +Epoch timestamps, in seconds + +.. ipython:: python + + json = dfd.to_json(date_format='epoch', date_unit='s') + json + +Writing to a file, with a date index and a date column + +.. ipython:: python + + dfj2 = dfj.copy() + dfj2['date'] = Timestamp('20130101') + dfj2['ints'] = list(range(5)) + dfj2['bools'] = True + dfj2.index = date_range('20130101', periods=5) + dfj2.to_json('test.json') + open('test.json').read() + +Fallback Behavior ++++++++++++++++++ + +If the JSON serialiser cannot handle the container contents directly it will fallback in the following manner: + +- if a ``toDict`` method is defined by the unrecognised object then that + will be called and its returned ``dict`` will be JSON serialised. +- if a ``default_handler`` has been passed to ``to_json`` that will + be called to convert the object. +- otherwise an attempt is made to convert the object to a ``dict`` by + parsing its contents. However if the object is complex this will often fail + with an ``OverflowError``. + +Your best bet when encountering ``OverflowError`` during serialisation +is to specify a ``default_handler``. For example ``timedelta`` can cause +problems: + +.. ipython:: python + :suppress: + + from datetime import timedelta + dftd = DataFrame([timedelta(23), timedelta(seconds=5), 42]) + +.. code-block:: ipython + + In [141]: from datetime import timedelta + + In [142]: dftd = DataFrame([timedelta(23), timedelta(seconds=5), 42]) + + In [143]: dftd.to_json() + + --------------------------------------------------------------------------- + OverflowError Traceback (most recent call last) + OverflowError: Maximum recursion level reached + +which can be dealt with by specifying a simple ``default_handler``: + +.. ipython:: python + + dftd.to_json(default_handler=str) + + def my_handler(obj): + return obj.total_seconds() + dftd.to_json(default_handler=my_handler) + +.. _io.json_reader: + +Reading JSON +~~~~~~~~~~~~ + +Reading a JSON string to pandas object can take a number of parameters. +The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or +is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` + +- ``filepath_or_buffer`` : a **VALID** JSON string or file handle / StringIO. The string could be + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host + is expected. For instance, a local file could be + file ://localhost/path/to/table.json +- ``typ`` : type of object to recover (series or frame), default 'frame' +- ``orient`` : + + Series : + - default is ``index`` + - allowed values are {``split``, ``records``, ``index``} + + DataFrame + - default is ``columns`` + - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``} + + The format of the JSON string + + .. csv-table:: + :widths: 20, 150 + :delim: ; + + ``split``; dict like {index -> [index], columns -> [columns], data -> [values]} + ``records``; list like [{column -> value}, ... 
, {column -> value}] + ``index``; dict like {index -> {column -> value}} + ``columns``; dict like {column -> {index -> value}} + ``values``; just the values array + +- ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data +- ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is True +- ``convert_dates`` : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True +- ``keep_default_dates`` : boolean, default True. If parsing dates, then parse the default datelike columns +- ``numpy`` : direct decoding to numpy arrays. default is False; + Supports numeric data only, although labels may be non-numeric. Also note that the JSON ordering **MUST** be the same for each term if ``numpy=True`` +- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality +- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default + None. By default the timestamp precision will be detected, if this is not desired + then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to + seconds, milliseconds, microseconds or nanoseconds respectively. + +The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. + +If a non-default ``orient`` was used when encoding to JSON be sure to pass the same +option here so that decoding produces sensible results, see `Orient Options`_ for an +overview. + +Data Conversion ++++++++++++++++ + +The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True`` will try to parse the axes, and all of the data +into appropriate types, including dates. If you need to override specific dtypes, pass a dict to ``dtype``. ``convert_axes`` should only +be set to ``False`` if you need to preserve string-like numbers (e.g. '1', '2') in an axes. + +.. note:: + + Large integer values may be converted to dates if ``convert_dates=True`` and the data and / or column labels appear 'date-like'. The exact threshold depends on the ``date_unit`` specified. + +.. warning:: + + When reading JSON data, automatic coercing into dtypes has some quirks: + + * an index can be reconstructed in a different order from serialization, that is, the returned order is not guaranteed to be the same as before serialization + * a column that was ``float`` data will be converted to ``integer`` if it can be done safely, e.g. a column of ``1.`` + * bool columns will be converted to ``integer`` on reconstruction + + Thus there are times where you may want to specify specific dtypes via the ``dtype`` keyword argument. + +Reading from a JSON string: + +.. ipython:: python + + pd.read_json(json) + +Reading from a file: + +.. ipython:: python + + pd.read_json('test.json') + +Don't convert any data (but still convert axes and dates): + +.. ipython:: python + + pd.read_json('test.json', dtype=object).dtypes + +Specify dtypes for conversion: + +.. ipython:: python + + pd.read_json('test.json', dtype={'A' : 'float32', 'bools' : 'int8'}).dtypes + +Preserve string indicies: + +.. 
ipython:: python + + si = DataFrame(np.zeros((4, 4)), + columns=list(range(4)), + index=[str(i) for i in range(4)]) + si + si.index + si.columns + json = si.to_json() + + sij = pd.read_json(json, convert_axes=False) + sij + sij.index + sij.columns + +Dates written in nanoseconds need to be read back in nanoseconds: + +.. ipython:: python + + json = dfj2.to_json(date_unit='ns') + + # Try to parse timestamps as millseconds -> Won't Work + dfju = pd.read_json(json, date_unit='ms') + dfju + + # Let pandas detect the correct precision + dfju = pd.read_json(json) + dfju + + # Or specify that all timestamps are in nanoseconds + dfju = pd.read_json(json, date_unit='ns') + dfju + +The Numpy Parameter ++++++++++++++++++++ + +.. note:: + This supports numeric data only. Index and columns labels may be non-numeric, e.g. strings, dates etc. + +If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff +an appropriate dtype during deserialisation and to subsequently decode directly +to numpy arrays, bypassing the need for intermediate Python objects. + +This can provide speedups if you are deserialising a large amount of numeric +data: + +.. ipython:: python + + randfloats = np.random.uniform(-100, 1000, 10000) + randfloats.shape = (1000, 10) + dffloats = DataFrame(randfloats, columns=list('ABCDEFGHIJ')) + + jsonfloats = dffloats.to_json() + +.. ipython:: python + + timeit read_json(jsonfloats) + +.. ipython:: python + + timeit read_json(jsonfloats, numpy=True) + +The speedup is less noticable for smaller datasets: + +.. ipython:: python + + jsonfloats = dffloats.head(100).to_json() + +.. ipython:: python + + timeit read_json(jsonfloats) + +.. ipython:: python + + timeit read_json(jsonfloats, numpy=True) + +.. warning:: + + Direct numpy decoding makes a number of assumptions and may fail or produce + unexpected output if these assumptions are not satisfied: + + - data is numeric. + + - data is uniform. The dtype is sniffed from the first value decoded. + A ``ValueError`` may be raised, or incorrect output may be produced + if this condition is not satisfied. + + - labels are ordered. Labels are only read from the first container, it is assumed + that each subsequent row / column has been encoded in the same order. This should be satisfied if the + data was encoded using ``to_json`` but may not be the case if the JSON + is from another source. + +.. ipython:: python + :suppress: + + import os + os.remove('test.json') + +.. _io.json_normalize: + +Normalization +~~~~~~~~~~~~~ + +.. versionadded:: 0.13.0 + +pandas provides a utility function to take a dict or list of dicts and *normalize* this semi-structured data +into a flat table. + +.. ipython:: python + + from pandas.io.json import json_normalize + data = [{'state': 'Florida', + 'shortname': 'FL', + 'info': { + 'governor': 'Rick Scott' + }, + 'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}]}, + {'state': 'Ohio', + 'shortname': 'OH', + 'info': { + 'governor': 'John Kasich' + }, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] + + json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) + +HTML +---- + +.. _io.read_html: + +Reading HTML Content +~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + We **highly encourage** you to read the :ref:`HTML parsing gotchas + ` regarding the issues surrounding the + BeautifulSoup4/html5lib/lxml parsers. + +.. 
versionadded:: 0.12.0 + +The top-level :func:`~pandas.io.html.read_html` function can accept an HTML +string/file/url and will parse HTML tables into list of pandas DataFrames. +Let's look at a few examples. + +.. note:: + + ``read_html`` returns a ``list`` of ``DataFrame`` objects, even if there is + only a single table contained in the HTML content + +Read a URL with no options + +.. ipython:: python + + url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' + dfs = read_html(url) + dfs + +.. note:: + + The data from the above URL changes every Monday so the resulting data above + and the data below may be slightly different. + +Read in the content of the file from the above URL and pass it to ``read_html`` +as a string + +.. ipython:: python + :suppress: + + import os + file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) + +.. ipython:: python + + with open(file_path, 'r') as f: + dfs = read_html(f.read()) + dfs + +You can even pass in an instance of ``StringIO`` if you so desire + +.. ipython:: python + + with open(file_path, 'r') as f: + sio = StringIO(f.read()) + + dfs = read_html(sio) + dfs + +.. note:: + + The following examples are not run by the IPython evaluator due to the fact + that having so many network-accessing functions slows down the documentation + build. If you spot an error or an example that doesn't run, please do not + hesitate to report it over on `pandas GitHub issues page + `__. + + +Read a URL and match a table that contains specific text + +.. code-block:: python + + match = 'Metcalf Bank' + df_list = read_html(url, match=match) + +Specify a header row (by default ```` elements are used to form the column +index); if specified, the header row is taken from the data minus the parsed +header elements (```` elements). + +.. code-block:: python + + dfs = read_html(url, header=0) + +Specify an index column + +.. code-block:: python + + dfs = read_html(url, index_col=0) + +Specify a number of rows to skip + +.. code-block:: python + + dfs = read_html(url, skiprows=0) + +Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works +as well) + +.. code-block:: python + + dfs = read_html(url, skiprows=range(2)) + +Don't infer numeric and date types + +.. code-block:: python + + dfs = read_html(url, infer_types=False) + +Specify an HTML attribute + +.. code-block:: python + + dfs1 = read_html(url, attrs={'id': 'table'}) + dfs2 = read_html(url, attrs={'class': 'sortable'}) + print(np.array_equal(dfs1[0], dfs2[0])) # Should be True + +Use some combination of the above + +.. code-block:: python + + dfs = read_html(url, match='Metcalf Bank', index_col=0) + +Read in pandas ``to_html`` output (with some loss of floating point precision) + +.. code-block:: python + + df = DataFrame(randn(2, 2)) + s = df.to_html(float_format='{0:.40g}'.format) + dfin = read_html(s, index_col=0) + +The ``lxml`` backend will raise an error on a failed parse if that is the only +parser you provide (if you only have a single parser you can provide just a +string, but it is considered good practice to pass a list with one string if, +for example, the function expects a sequence of strings) + +.. code-block:: python + + dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) + +or + +.. code-block:: python + + dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') + +However, if you have bs4 and html5lib installed and pass ``None`` or ``['lxml', +'bs4']`` then the parse will most likely succeed. 
Note that *as soon as a parse +succeeds, the function will return*. + +.. code-block:: python + + dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) + + +.. _io.html: + +Writing to HTML files +~~~~~~~~~~~~~~~~~~~~~~ + +``DataFrame`` objects have an instance method ``to_html`` which renders the +contents of the ``DataFrame`` as an HTML table. The function arguments are as +in the method ``to_string`` described above. + +.. note:: + + Not all of the possible options for ``DataFrame.to_html`` are shown here for + brevity's sake. See :func:`~pandas.core.frame.DataFrame.to_html` for the + full set of options. + +.. ipython:: python + :suppress: + + def write_html(df, filename, *args, **kwargs): + static = os.path.abspath(os.path.join('source', '_static')) + with open(os.path.join(static, filename + '.html'), 'w') as f: + df.to_html(f, *args, **kwargs) + +.. ipython:: python + + df = DataFrame(randn(2, 2)) + df + print(df.to_html()) # raw html + +.. ipython:: python + :suppress: + + write_html(df, 'basic') + +HTML: + +.. raw:: html + :file: _static/basic.html + +The ``columns`` argument will limit the columns shown + +.. ipython:: python + + print(df.to_html(columns=[0])) + +.. ipython:: python + :suppress: + + write_html(df, 'columns', columns=[0]) + +HTML: + +.. raw:: html + :file: _static/columns.html + +``float_format`` takes a Python callable to control the precision of floating +point values + +.. ipython:: python + + print(df.to_html(float_format='{0:.10f}'.format)) + +.. ipython:: python + :suppress: + + write_html(df, 'float_format', float_format='{0:.10f}'.format) + +HTML: + +.. raw:: html + :file: _static/float_format.html + +``bold_rows`` will make the row labels bold by default, but you can turn that +off + +.. ipython:: python + + print(df.to_html(bold_rows=False)) + +.. ipython:: python + :suppress: + + write_html(df, 'nobold', bold_rows=False) + +.. raw:: html + :file: _static/nobold.html + +The ``classes`` argument provides the ability to give the resulting HTML +table CSS classes. Note that these classes are *appended* to the existing +``'dataframe'`` class. + +.. ipython:: python + + print(df.to_html(classes=['awesome_table_class', 'even_more_awesome_class'])) + +Finally, the ``escape`` argument allows you to control whether the +"<", ">" and "&" characters escaped in the resulting HTML (by default it is +``True``). So to get the HTML without escaped characters pass ``escape=False`` + +.. ipython:: python + + df = DataFrame({'a': list('&<>'), 'b': randn(3)}) + + +.. ipython:: python + :suppress: + + write_html(df, 'escape') + write_html(df, 'noescape', escape=False) + +Escaped: + +.. ipython:: python + + print(df.to_html()) + +.. raw:: html + :file: _static/escape.html + +Not escaped: + +.. ipython:: python + + print(df.to_html(escape=False)) + +.. raw:: html + :file: _static/noescape.html + +.. note:: + + Some browsers may not show a difference in the rendering of the previous two + HTML tables. + +.. _io.excel: + +Excel files +----------- + +The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and +Excel 2007 (``.xlsx``) files using the ``xlrd`` Python +module and use the same parsing code as the above to convert tabular data into +a DataFrame. See the :ref:`cookbook` for some +advanced strategies + +Besides ``read_excel`` you can also read Excel files using the ``ExcelFile`` +class. The following two commands are equivalent: + +.. 
code-block:: python + + # using the ExcelFile class + xls = pd.ExcelFile('path_to_file.xls') + xls.parse('Sheet1', index_col=None, na_values=['NA']) + + # using the read_excel function + read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + +The class based approach can be used to read multiple sheets or to introspect +the sheet names using the ``sheet_names`` attribute. + +.. note:: + + The prior method of accessing ``ExcelFile`` has been moved from + ``pandas.io.parsers`` to the top level namespace starting from pandas + 0.12.0. + +.. versionadded:: 0.13 + +There are now two ways to read in sheets from an Excel file. You can provide +either the index of a sheet or its name to by passing different values for +``sheet_name``. + +- Pass a string to refer to the name of a particular sheet in the workbook. +- Pass an integer to refer to the index of a sheet. Indices follow Python + convention, beginning at 0. +- The default value is ``sheet_name=0``. This reads the first sheet. + +Using the sheet name: + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + +Using the sheet index: + +.. code-block:: python + + read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + +Using all default values: + +.. code-block:: python + + read_excel('path_to_file.xls') + +It is often the case that users will insert columns to do temporary computations +in Excel and you may not want to read in those columns. `read_excel` takes +a `parse_cols` keyword to allow you to specify a subset of columns to parse. + +If `parse_cols` is an integer, then it is assumed to indicate the last column +to be parsed. + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', parse_cols=2) + +If `parse_cols` is a list of integers, then it is assumed to be the file column +indices to be parsed. + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) + +To write a DataFrame object to a sheet of an Excel file, you can use the +``to_excel`` instance method. The arguments are largely the same as ``to_csv`` +described above, the first argument being the name of the excel file, and the +optional second argument the name of the sheet to which the DataFrame should be +written. For example: + +.. code-block:: python + + df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + +Files with a ``.xls`` extension will be written using ``xlwt`` and those with a +``.xlsx`` extension will be written using ``xlsxwriter`` (if available) or +``openpyxl``. + +The DataFrame will be written in a way that tries to mimic the REPL output. One +difference from 0.12.0 is that the ``index_label`` will be placed in the second +row instead of the first. You can get the previous behaviour by setting the +``merge_cells`` option in ``to_excel()`` to ``False``: + +.. code-block:: python + + df.to_excel('path_to_file.xlsx', index_label='label', merge_cells=False) + +The Panel class also has a ``to_excel`` instance method, +which writes each DataFrame in the Panel to a separate sheet. + +In order to write separate DataFrames to separate sheets in a single Excel file, +one can pass an :class:`~pandas.io.excel.ExcelWriter`. + +.. code-block:: python + + with ExcelWriter('path_to_file.xlsx') as writer: + df1.to_excel(writer, sheet_name='Sheet1') + df2.to_excel(writer, sheet_name='Sheet2') + +.. note:: Wringing a little more performance out of ``read_excel`` + Internally, Excel stores all numeric data as floats. 
Because this can + produce unexpected behavior when reading in data, pandas defaults to trying + to convert integers to floats if it doesn't lose information (``1.0 --> + 1``). You can pass ``convert_float=False`` to disable this behavior, which + may give a slight performance improvement. + +.. _io.excel.writers: + +Excel writer engines +~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.13 + +``pandas`` chooses an Excel writer via two methods: + +1. the ``engine`` keyword argument +2. the filename extension (via the default specified in config options) + +By default, ``pandas`` uses the `XlsxWriter`_ for ``.xlsx`` and `openpyxl`_ +for ``.xlsm`` files and `xlwt`_ for ``.xls`` files. If you have multiple +engines installed, you can set the default engine through :ref:`setting the +config options ` ``io.excel.xlsx.writer`` and +``io.excel.xls.writer``. pandas will fall back on `openpyxl`_ for ``.xlsx`` +files if `Xlsxwriter`_ is not available. + +.. _XlsxWriter: http://xlsxwriter.readthedocs.org +.. _openpyxl: http://packages.python.org/openpyxl/ +.. _xlwt: http://www.python-excel.org + +To specify which writer you want to use, you can pass an engine keyword +argument to ``to_excel`` and to ``ExcelWriter``. + +.. code-block:: python + + # By setting the 'engine' in the DataFrame and Panel 'to_excel()' methods. + df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') + + # By setting the 'engine' in the ExcelWriter constructor. + writer = ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + + # Or via pandas configuration. + from pandas import options + options.io.excel.xlsx.writer = 'xlsxwriter' + + df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + +.. _io.clipboard: + +Clipboard +--------- + +A handy way to grab data is to use the ``read_clipboard`` method, which takes +the contents of the clipboard buffer and passes them to the ``read_table`` +method. For instance, you can copy the following +text to the clipboard (CTRL-C on many operating systems): + +.. code-block:: python + + A B C + x 1 4 p + y 2 5 q + z 3 6 r + +And then import the data directly to a DataFrame by calling: + +.. code-block:: python + + clipdf = pd.read_clipboard() + +.. ipython:: python + + clipdf + +The ``to_clipboard`` method can be used to write the contents of a DataFrame to +the clipboard. Following which you can paste the clipboard contents into other +applications (CTRL-V on many operating systems). Here we illustrate writing a +DataFrame into clipboard and reading it back. + +.. ipython:: python + + df=pd.DataFrame(randn(5,3)) + df + df.to_clipboard() + pd.read_clipboard() + +We can see that we got the same content back, which we had earlier written to the clipboard. + +.. note:: + + You may need to install xclip or xsel (with gtk or PyQt4 modules) on Linux to use these methods. + +.. _io.pickle: + +Pickling +-------- + +All pandas objects are equipped with ``to_pickle`` methods which use Python's +``cPickle`` module to save data structures to disk using the pickle format. + +.. ipython:: python + + df + df.to_pickle('foo.pkl') + +The ``read_pickle`` function in the ``pandas`` namespace can be used to load +any pickled pandas object (or any other pickled object) from file: + + +.. ipython:: python + + read_pickle('foo.pkl') + +.. ipython:: python + :suppress: + + import os + os.remove('foo.pkl') + +.. warning:: + + Loading pickled data received from untrusted sources can be unsafe. + + See: http://docs.python.org/2.7/library/pickle.html + +.. 
warning:: + + In 0.13, pickle preserves compatibility with pickles created prior to 0.13. These must + be read with ``pd.read_pickle``, rather than the default python ``pickle.load``. + See `this question `__ + for a detailed explanation. + +.. note:: + + These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated. + +.. _io.msgpack: + +msgpack (experimental) +---------------------- + +.. versionadded:: 0.13.0 + +Starting in 0.13.0, pandas is supporting the ``msgpack`` format for +object serialization. This is a lightweight portable binary format, similar +to binary JSON, that is highly space efficient, and provides good performance +both on the writing (serialization), and reading (deserialization). + +.. warning:: + + This is a very new feature of pandas. We intend to provide certain + optimizations in the io of the ``msgpack`` data. Since this is marked + as an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release. + +.. ipython:: python + + df = DataFrame(np.random.rand(5,2),columns=list('AB')) + df.to_msgpack('foo.msg') + pd.read_msgpack('foo.msg') + s = Series(np.random.rand(5),index=date_range('20130101',periods=5)) + +You can pass a list of objects and you will receive them back on deserialization. + +.. ipython:: python + + pd.to_msgpack('foo.msg', df, 'foo', np.array([1,2,3]), s) + pd.read_msgpack('foo.msg') + +You can pass ``iterator=True`` to iterate over the unpacked results + +.. ipython:: python + + for o in pd.read_msgpack('foo.msg',iterator=True): + print o + +You can pass ``append=True`` to the writer to append to an existing pack + +.. ipython:: python + + df.to_msgpack('foo.msg',append=True) + pd.read_msgpack('foo.msg') + +Unlike other io methods, ``to_msgpack`` is available on both a per-object basis, +``df.to_msgpack()`` and using the top-level ``pd.to_msgpack(...)`` where you +can pack arbitrary collections of python lists, dicts, scalars, while intermixing +pandas objects. + +.. ipython:: python + + pd.to_msgpack('foo2.msg', { 'dict' : [ { 'df' : df }, { 'string' : 'foo' }, { 'scalar' : 1. }, { 's' : s } ] }) + pd.read_msgpack('foo2.msg') + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('foo.msg') + os.remove('foo2.msg') + +Read/Write API +~~~~~~~~~~~~~~ + +Msgpacks can also be read from and written to strings. + +.. ipython:: python + + df.to_msgpack() + +Furthermore you can concatenate the strings to produce a list of the original objects. + +.. ipython:: python + + pd.read_msgpack(df.to_msgpack() + s.to_msgpack()) + +.. _io.hdf5: + +HDF5 (PyTables) +--------------- + +``HDFStore`` is a dict-like object which reads and writes pandas using +the high performance HDF5 format using the excellent `PyTables +`__ library. See the :ref:`cookbook ` +for some advanced strategies + +.. note:: + + ``PyTables`` 3.0.0 was recently released to enable support for Python 3. + pandas should be fully compatible (and previously written stores should be + backwards compatible) with all ``PyTables`` >= 2.3. For ``python >= 3.2``, + ``pandas >= 0.12.0`` is required for compatibility. + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +.. ipython:: python + + store = HDFStore('store.h5') + print(store) + +Objects can be written to the file just like adding key-value pairs to a +dict: + +.. 
ipython:: python + + np.random.seed(1234) + index = date_range('1/1/2000', periods=8) + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = DataFrame(randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + + # store.put('s', s) is an equivalent method + store['s'] = s + + store['df'] = df + + store['wp'] = wp + + # the type of stored data + store.root.wp._v_attrs.pandas_type + + store + +In a current or later Python session, you can retrieve stored objects: + +.. ipython:: python + + # store.get('df') is an equivalent method + store['df'] + + # dotted (attribute) access provides get as well + store.df + +Deletion of the object specified by the key + +.. ipython:: python + + # store.remove('wp') is an equivalent method + del store['wp'] + + store + +Closing a Store, Context Manager + +.. ipython:: python + + store.close() + store + store.is_open + + # Working with, and automatically closing the store with the context + # manager + with get_store('store.h5') as store: + store.keys() + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') + +Read/Write API +~~~~~~~~~~~~~~ + +``HDFStore`` supports an top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing, +similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0) + +.. ipython:: python + + df_tl = DataFrame(dict(A=list(range(5)), B=list(range(5)))) + df_tl.to_hdf('store_tl.h5','table',append=True) + read_hdf('store_tl.h5', 'table', where = ['index>2']) + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store_tl.h5') + +.. _io.hdf5-fixed: + +Fixed Format +~~~~~~~~~~~~ + +.. note:: + + This was prior to 0.13.0 the ``Storer`` format. + +The examples above show storing using ``put``, which write the HDF5 to ``PyTables`` in a fixed array format, called +the ``fixed`` format. These types of stores are are **not** appendable once written (though you can simply +remove them and rewrite). Nor are they **queryable**; they must be +retrieved in their entirety. These offer very fast writing and slightly faster reading than ``table`` stores. +This format is specified by default when using ``put`` or ``to_hdf`` or by ``format='fixed'`` or ``format='f'`` + +.. warning:: + + A ``fixed`` format will raise a ``TypeError`` if you try to retrieve using a ``where`` . + + .. code-block:: python + + DataFrame(randn(10,2)).to_hdf('test_fixed.h5','df') + + pd.read_hdf('test_fixed.h5','df',where='index>5') + TypeError: cannot pass a where specification when reading a fixed format. + this store must be selected in its entirety + + +.. _io.hdf5-table: + +Table Format +~~~~~~~~~~~~ + +``HDFStore`` supports another ``PyTables`` format on disk, the ``table`` +format. Conceptually a ``table`` is shaped very much like a DataFrame, +with rows and columns. A ``table`` may be appended to in the same or +other sessions. In addition, delete & query type operations are +supported. This format is specified by ``format='table'`` or ``format='t'`` +to ``append`` or ``put`` or ``to_hdf`` + +.. versionadded:: 0.13 + +This format can be set as an option as well ``pd.set_option('io.hdf.default_format','table')`` to +enable ``put/append/to_hdf`` to by default store in the ``table`` format. + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +.. 
ipython:: python + + store = HDFStore('store.h5') + df1 = df[0:4] + df2 = df[4:] + + # append data (creates a table automatically) + store.append('df', df1) + store.append('df', df2) + store + + # select the entire object + store.select('df') + + # the type of stored data + store.root.df._v_attrs.pandas_type + +.. note:: + + You can also create a ``table`` by passing ``format='table'`` or ``format='t'`` to a ``put`` operation. + +.. _io.hdf5-keys: + +Hierarchical Keys +~~~~~~~~~~~~~~~~~ + +Keys to a store can be specified as a string. These can be in a +hierarchical path-name like format (e.g. ``foo/bar/bah``), which will +generate a hierarchy of sub-stores (or ``Groups`` in PyTables +parlance). Keys can be specified with out the leading '/' and are ALWAYS +absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove +everying in the sub-store and BELOW, so be *careful*. + +.. ipython:: python + + store.put('foo/bar/bah', df) + store.append('food/orange', df) + store.append('food/apple', df) + store + + # a list of keys are returned + store.keys() + + # remove all nodes under this level + store.remove('food') + store + +.. _io.hdf5-types: + +Storing Mixed Types in a Table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Storing mixed-dtype data is supported. Strings are stored as a +fixed-width using the maximum size of the appended column. Subsequent +appends will truncate strings at this length. + +Passing ``min_itemsize={`values`: size}`` as a parameter to append +will set a larger minimum for the string columns. Storing ``floats, +strings, ints, bools, datetime64`` are currently supported. For string +columns, passing ``nan_rep = 'nan'`` to append will change the default +nan representation on disk (which converts to/from `np.nan`), this +defaults to `nan`. + +.. ipython:: python + + df_mixed = DataFrame({ 'A' : randn(8), + 'B' : randn(8), + 'C' : np.array(randn(8),dtype='float32'), + 'string' :'string', + 'int' : 1, + 'bool' : True, + 'datetime64' : Timestamp('20010102')}, + index=list(range(8))) + df_mixed.ix[3:5,['A', 'B', 'string', 'datetime64']] = np.nan + + store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) + df_mixed1 = store.select('df_mixed') + df_mixed1 + df_mixed1.get_dtype_counts() + + # we have provided a minimum string column size + store.root.df_mixed.table + +Storing Multi-Index DataFrames +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Storing multi-index dataframes as tables is very similar to +storing/selecting from homogeneous index DataFrames. + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df_mi = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + df_mi + + store.append('df_mi',df_mi) + store.select('df_mi') + + # the levels are automatically included as data columns + store.select('df_mi', 'foo=bar') + + +.. _io.hdf5-query: + +Querying a Table +~~~~~~~~~~~~~~~~ + +.. warning:: + + This query capabilities have changed substantially starting in ``0.13.0``. + Queries from prior version are accepted (with a ``DeprecationWarning``) printed + if its not string-like. + +``select`` and ``delete`` operations have an optional criterion that can +be specified to select/delete only a subset of the data. This allows one +to have a very large on-disk table and retrieve only a portion of the +data. + +A query is specified using the ``Term`` class under the hood, as a boolean expression. 
+ + - ``index`` and ``columns`` are supported indexers of a DataFrame + - ``major_axis``, ``minor_axis``, and ``items`` are supported indexers of + the Panel + - if ``data_columns`` are specified, these can be used as additional indexers + +Valid comparison operators are: + + - ``=, ==, !=, >, >=, <, <=`` + +Valid boolean expressions are combined with: + + - ``|`` : or + - ``&`` : and + - ``(`` and ``)`` : for grouping + +These rules are similar to how boolean expressions are used in pandas for indexing. + +.. note:: + + - ``=`` will be automatically expanded to the comparison operator ``==`` + - ``~`` is the not operator, but can only be used in very limited + circumstances + - If a list/tuple of expressions is passed they will be combined via ``&`` + +The following are valid expressions: + + - ``'index>=date'`` + - ``"columns=['A', 'D']"`` + - ``"columns in ['A', 'D']"`` + - ``'columns=A'`` + - ``'columns==A'`` + - ``"~(columns=['A','B'])"`` + - ``'index>df.index[3] & string="bar"'`` + - ``'(index>df.index[3] & index<=df.index[6]) | string="bar"'`` + - ``"ts>=Timestamp('2012-02-01')"`` + - ``"major_axis>=20130101"`` + +The ``indexers`` are on the left-hand side of the sub-expression: + + - ``columns``, ``major_axis``, ``ts`` + +The right-hand side of the sub-expression (after a comparsion operator) can be: + + - functions that will be evaluated, e.g. ``Timestamp('2012-02-01')`` + - strings, e.g. ``"bar"`` + - date-like, e.g. ``20130101``, or ``"20130101"`` + - lists, e.g. ``"['A','B']"`` + - variables that are defined in the local names space, e.g. ``date`` + +.. note:: + + Passing a string to a query by interpolating it into the query + expression is not recommended. Simply assign the string of interest to a + variable and use that variable in an expression. For example, do this + + .. code-block:: python + + string = "HolyMoly'" + store.select('df', 'index == string') + + instead of this + + .. code-block:: python + + string = "HolyMoly'" + store.select('df', 'index == %s' % string) + + The latter will **not** work and will raise a ``SyntaxError``.Note that + there's a single quote followed by a double quote in the ``string`` + variable. + + If you *must* interpolate, use the ``'%r'`` format specifier + + .. code-block:: python + + store.select('df', 'index == %r' % string) + + which will quote ``string``. + + +Here are some examples: + +.. ipython:: python + + dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + store.append('dfq',dfq,format='table',data_columns=True) + +Use boolean expressions, with in-line function evaluation. + +.. ipython:: python + + store.select('dfq',"index>Timestamp('20130104') & columns=['A', 'B']") + +Use and inline column reference + +.. ipython:: python + + store.select('dfq',where="A>0 or C>0") + +Works with a Panel as well. + +.. ipython:: python + + store.append('wp',wp) + store + store.select('wp', "major_axis>Timestamp('20000102') & minor_axis=['A', 'B']") + +The ``columns`` keyword can be supplied to select a list of columns to be +returned, this is equivalent to passing a +``'columns=list_of_columns_to_filter'``: + +.. ipython:: python + + store.select('df', "columns=['A', 'B']") + +``start`` and ``stop`` parameters can be specified to limit the total search +space. These are in terms of the total number of rows in a table. + +.. 
ipython:: python + + # this is effectively what the storage of a Panel looks like + wp.to_frame() + + # limiting the search + store.select('wp',"major_axis>20000102 & minor_axis=['A','B']", + start=0, stop=10) + +.. note:: + + ``select`` will raise a ``ValueError`` if the query expression has an unknown + variable reference. Usually this means that you are trying to select on a column + that is **not** a data_column. + + ``select`` will raise a ``SyntaxError`` if the query expression is not valid. + + +.. _io.hdf5-timedelta: + +**Using timedelta64[ns]** + +.. versionadded:: 0.13 + +Beginning in 0.13.0, you can store and query using the ``timedelta64[ns]`` type. Terms can be +specified in the format: ``()``, where float may be signed (and fractional), and unit can be +``D,s,ms,us,ns`` for the timedelta. Here's an example: + +.. warning:: + + This requires ``numpy >= 1.7`` + +.. ipython:: python + + from datetime import timedelta + dftd = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) + dftd['C'] = dftd['A']-dftd['B'] + dftd + store.append('dftd',dftd,data_columns=True) + store.select('dftd',"C<'-3.5D'") + +Indexing +~~~~~~~~ + +You can create/modify an index for a table with ``create_table_index`` +after data is already in the table (after and ``append/put`` +operation). Creating a table index is **highly** encouraged. This will +speed your queries a great deal when you use a ``select`` with the +indexed dimension as the ``where``. + +.. note:: + + Indexes are automagically created (starting ``0.10.1``) on the indexables + and any data columns you specify. This behavior can be turned off by passing + ``index=False`` to ``append``. + +.. ipython:: python + + # we have automagically already created an index (in the first section) + i = store.root.df.table.cols.index.index + i.optlevel, i.kind + + # change an index by passing new parameters + store.create_table_index('df', optlevel=9, kind='full') + i = store.root.df.table.cols.index.index + i.optlevel, i.kind + +See `here `__ for how to create a completely-sorted-index (CSI) on an existing store. + +Query via Data Columns +~~~~~~~~~~~~~~~~~~~~~~ + +You can designate (and index) certain columns that you want to be able +to perform queries (other than the `indexable` columns, which you can +always query). For instance say you want to perform this common +operation, on-disk, and return just the frame that matches this +query. You can specify ``data_columns = True`` to force all columns to +be data_columns + +.. ipython:: python + + df_dc = df.copy() + df_dc['string'] = 'foo' + df_dc.ix[4:6,'string'] = np.nan + df_dc.ix[7:9,'string'] = 'bar' + df_dc['string2'] = 'cool' + df_dc.ix[1:3,['B','C']] = 1.0 + df_dc + + # on-disk operations + store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) + store.select('df_dc', [ Term('B>0') ]) + + # getting creative + store.select('df_dc', 'B > 0 & C > 0 & string == foo') + + # this is in-memory version of this type of selection + df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + + # we have automagically created this index and the B/C/string/string2 + # columns are stored separately as ``PyTables`` columns + store.root.df_dc.table + +There is some performance degredation by making lots of columns into +`data columns`, so it is up to the user to designate these. 
In addition, +you cannot change data columns (nor indexables) after the first +append/put operation (of course, you can simply read in the data and +create a new table!). + +Iterator +~~~~~~~~ + +Starting in ``0.11.0``, you can pass ``iterator=True`` or ``chunksize=number_in_a_chunk`` +to ``select`` and ``select_as_multiple`` to return an iterator on the results. +The default is 50,000 rows returned in a chunk. + +.. ipython:: python + + for df in store.select('df', chunksize=3): + print(df) + +.. note:: + + .. versionadded:: 0.12.0 + + You can also use the iterator with ``read_hdf`` which will open, then + automatically close the store when finished iterating. + + .. code-block:: python + + for df in read_hdf('store.h5','df', chunksize=3): + print(df) + +Note that the ``chunksize`` keyword applies to the **source** rows. So if you +are doing a query, then the chunksize will subdivide the total rows in the table +with the query applied, returning an iterator of potentially unequal sized chunks. + +Here is a recipe for generating a query and using it to create equal sized return +chunks. + +.. ipython:: python + + dfeq = DataFrame({'number': np.arange(1,11)}) + dfeq + + store.append('dfeq', dfeq, data_columns=['number']) + + def chunks(l, n): + return [l[i:i+n] for i in range(0, len(l), n)] + + evens = [2,4,6,8,10] + coordinates = store.select_as_coordinates('dfeq','number=evens') + for c in chunks(coordinates, 2): + print(store.select('dfeq', where=c)) + +Advanced Queries +~~~~~~~~~~~~~~~~ + +**Select a Single Column** + +To retrieve a single indexable or data column, use the +method ``select_column``. This will, for example, enable you to get the index +very quickly. These return a ``Series`` of the result, indexed by the row number. +These do not currently accept the ``where`` selector. + +.. ipython:: python + + store.select_column('df_dc', 'index') + store.select_column('df_dc', 'string') + +.. _io.hdf5-selecting_coordinates: + +**Selecting coordinates** + +Sometimes you want to get the coordinates (a.k.a. the index locations) of your query. This returns an +``Int64Index`` of the resulting locations. These coordinates can also be passed to subsequent +``where`` operations. + +.. ipython:: python + + df_coord = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) + store.append('df_coord',df_coord) + c = store.select_as_coordinates('df_coord','index>20020101') + c.summary() + store.select('df_coord',where=c) + +.. _io.hdf5-where_mask: + +**Selecting using a where mask** + +Sometimes your query can involve creating a list of rows to select. Usually this ``mask`` would +be a resulting ``index`` from an indexing operation. This example selects the months of +a ``DatetimeIndex`` which are 5. + +.. ipython:: python + + df_mask = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) + store.append('df_mask',df_mask) + c = store.select_column('df_mask','index') + where = c[DatetimeIndex(c).month==5].index + store.select('df_mask',where=where) + +**Storer Object** + +If you want to inspect the stored object, retrieve it via +``get_storer``. You could use this programmatically to, say, get the number +of rows in an object. + +.. ipython:: python + + store.get_storer('df_dc').nrows + + +Multiple Table Queries +~~~~~~~~~~~~~~~~~~~~~~ + +New in 0.10.1 are the methods ``append_to_multiple`` and +``select_as_multiple``, which can perform appending/selecting from +multiple tables at once.
The idea is to have one table (call it the +selector table) that you index most/all of the columns, and perform your +queries. The other table(s) are data tables with an index matching the +selector table's index. You can then perform a very fast query +on the selector table, yet get lots of data back. This method is similar to +having a very wide table, but enables more efficient queries. + +The ``append_to_multiple`` method splits a given single DataFrame +into multiple tables according to ``d``, a dictionary that maps the +table names to a list of 'columns' you want in that table. If `None` +is used in place of a list, that table will have the remaining +unspecified columns of the given DataFrame. The argument ``selector`` +defines which table is the selector table (which you can make queries from). +The argument ``dropna`` will drop rows from the input DataFrame to ensure +tables are synchronized. This means that if a row for one of the tables +being written to is entirely ``np.NaN``, that row will be dropped from all tables. + +If ``dropna`` is False, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. +Remember that entirely ``np.Nan`` rows are not written to the HDFStore, so if +you choose to call ``dropna=False``, some tables may have more rows than others, +and therefore ``select_as_multiple`` may not work or it may return unexpected +results. + +.. ipython:: python + + df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt['foo'] = 'bar' + df_mt.ix[1, ('A', 'B')] = np.nan + + # you can also create the tables individually + store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None }, + df_mt, selector='df1_mt') + store + + # individual tables were created + store.select('df1_mt') + store.select('df2_mt') + + # as a multiple + store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], + selector = 'df1_mt') + + +Delete from a Table +~~~~~~~~~~~~~~~~~~~ + +You can delete from a table selectively by specifying a ``where``. In +deleting rows, it is important to understand the ``PyTables`` deletes +rows by erasing the rows, then **moving** the following data. Thus +deleting can potentially be a very expensive operation depending on the +orientation of your data. This is especially true in higher dimensional +objects (``Panel`` and ``Panel4D``). To get optimal performance, it's +worthwhile to have the dimension you are deleting be the first of the +``indexables``. + +Data is ordered (on the disk) in terms of the ``indexables``. Here's a +simple use case. You store panel-type data, with dates in the +``major_axis`` and ids in the ``minor_axis``. The data is then +interleaved like this: + + - date_1 + - id_1 + - id_2 + - . + - id_n + - date_2 + - id_1 + - . + - id_n + +It should be clear that a delete operation on the ``major_axis`` will be +fairly quick, as one chunk is removed, then the following data moved. On +the other hand a delete operation on the ``minor_axis`` will be very +expensive. In this case it would almost certainly be faster to rewrite +the table using a ``where`` that selects all but the missing data. + +.. ipython:: python + + # returns the number of rows deleted + store.remove('wp', 'major_axis>20000102' ) + store.select('wp') + +Please note that HDF5 **DOES NOT RECLAIM SPACE** in the h5 files +automatically. Thus, repeatedly deleting (or removing nodes) and adding +again **WILL TEND TO INCREASE THE FILE SIZE**. To *clean* the file, use +``ptrepack`` (see below). 
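+
+The effect is easy to check directly; the following sketch (using a
+hypothetical ``big.h5`` store containing a table ``df``) simply compares the
+file size before and after a delete:
+
+.. code-block:: python
+
+   import os
+
+   st = HDFStore('big.h5')            # hypothetical store with a table 'df'
+   before = os.path.getsize('big.h5')
+   st.remove('df', 'index > 1000')    # delete some rows
+   st.flush(fsync=True)
+   after = os.path.getsize('big.h5')
+
+   # 'after' will generally not be smaller than 'before'; repack the file
+   # with ptrepack (see the next section) to actually reclaim the space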
+ +Compression +~~~~~~~~~~~ + +``PyTables`` allows the stored data to be compressed. This applies to +all kinds of stores, not just tables. + + - Pass ``complevel=int`` for a compression level (1-9, with 0 being no + compression, and the default) + - Pass ``complib=lib`` where lib is any of ``zlib, bzip2, lzo, blosc`` for + whichever compression library you prefer. + +``HDFStore`` will use the file based compression scheme if no overriding +``complib`` or ``complevel`` options are provided. ``blosc`` offers very +fast compression, and is my most used. Note that ``lzo`` and ``bzip2`` +may not be installed (by Python) by default. + +Compression for all objects within the file + + - ``store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc')`` + +Or on-the-fly compression (this only applies to tables). You can turn +off file compression for a specific table by passing ``complevel=0`` + + - ``store.append('df', df, complib='zlib', complevel=5)`` + +**ptrepack** + +``PyTables`` offers better write performance when tables are compressed after +they are written, as opposed to turning on compression at the very +beginning. You can use the supplied ``PyTables`` utility +``ptrepack``. In addition, ``ptrepack`` can change compression levels +after the fact. + + - ``ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc in.h5 out.h5`` + +Furthermore ``ptrepack in.h5 out.h5`` will *repack* the file to allow +you to reuse previously deleted space. Aalternatively, one can simply +remove the file and write again, or use the ``copy`` method. + +.. _io.hdf5-notes: + +Notes & Caveats +~~~~~~~~~~~~~~~ + + - Once a ``table`` is created its items (Panel) / columns (DataFrame) + are fixed; only exactly the same columns can be appended + - If a row has ``np.nan`` for **EVERY COLUMN** (having a ``nan`` + in a string, or a ``NaT`` in a datetime-like column counts as having + a value), then those rows **WILL BE DROPPED IMPLICITLY**. This limitation + *may* be addressed in the future. + - ``HDFStore`` is **not-threadsafe for writing**. The underlying + ``PyTables`` only supports concurrent reads (via threading or + processes). If you need reading and writing *at the same time*, you + need to serialize these operations in a single thread in a single + process. You will corrupt your data otherwise. See the issue + (:`2397`) for more information. + - If you use locks to manage write access between multiple processes, you + may want to use :py:func:`~os.fsync` before releasing write locks. For + convenience you can use ``store.flush(fsync=True)`` to do this for you. + - ``PyTables`` only supports fixed-width string columns in + ``tables``. The sizes of a string based indexing column + (e.g. *columns* or *minor_axis*) are determined as the maximum size + of the elements in that axis or by passing the parameter + - Be aware that timezones (e.g., ``pytz.timezone('US/Eastern')``) + are not necessarily equal across timezone versions. So if data is + localized to a specific timezone in the HDFStore using one version + of a timezone library and that data is updated with another version, the data + will be converted to UTC since these timezones are not considered + equal. Either use the same version of timezone library or use ``tz_convert`` with + the updated timezone definition. + +.. warning:: + + ``PyTables`` will show a ``NaturalNameWarning`` if a column name + cannot be used as an attribute selector. 
Generally identifiers that + have spaces, start with numbers, or ``_``, or have ``-`` embedded are not considered + *natural*. These types of identifiers cannot be used in a ``where`` clause + and are generally a bad idea. + +DataTypes +~~~~~~~~~ + +``HDFStore`` will map an object dtype to the ``PyTables`` underlying +dtype. This means the following types are known to work: + + - floating : ``float64, float32, float16`` *(using* ``np.nan`` *to + represent invalid values)* + - integer : ``int64, int32, int8, uint64, uint32, uint8`` + - bool + - datetime64[ns] *(using* ``NaT`` *to represent invalid values)* + - object : ``strings`` *(using* ``np.nan`` *to represent invalid + values)* + +Currently, ``unicode`` and ``datetime`` columns (represented with a +dtype of ``object``), **WILL FAIL**. In addition, even though a column +may look like a ``datetime64[ns]``, if it contains ``np.nan``, this +**WILL FAIL**. You can try to convert datetimelike columns to proper +``datetime64[ns]`` columns, that possibily contain ``NaT`` to represent +invalid values. (Some of these issues have been addressed and these +conversion may not be necessary in future versions of pandas) + + .. ipython:: python + + import datetime + df = DataFrame(dict(datelike=Series([datetime.datetime(2001, 1, 1), + datetime.datetime(2001, 1, 2), np.nan]))) + df + df.dtypes + + # to convert + df['datelike'] = Series(df['datelike'].values, dtype='M8[ns]') + df + df.dtypes + +String Columns +~~~~~~~~~~~~~~ + +**min_itemsize** + +The underlying implementation of ``HDFStore`` uses a fixed column width (itemsize) for string columns. +A string column itemsize is calculated as the maximum of the +length of data (for that column) that is passed to the ``HDFStore``, **in the first append**. Subsequent appends, +may introduce a string for a column **larger** than the column can hold, an Exception will be raised (otherwise you +could have a silent truncation of these columns, leading to loss of information). In the future we may relax this and +allow a user-specified truncation to occur. + +Pass ``min_itemsize`` on the first table creation to a-priori specifiy the minimum length of a particular string column. +``min_itemsize`` can be an integer, or a dict mapping a column name to an integer. You can pass ``values`` as a key to +allow all *indexables* or *data_columns* to have this min_itemsize. + +Starting in 0.11.0, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically. + +.. note:: + + If you are not passing any *data_columns*, then the min_itemsize will be the maximum of the length of any string passed + +.. ipython:: python + + dfs = DataFrame(dict(A = 'foo', B = 'bar'),index=list(range(5))) + dfs + + # A and B have a size of 30 + store.append('dfs', dfs, min_itemsize = 30) + store.get_storer('dfs').table + + # A is created as a data_column with a size of 30 + # B is size is calculated + store.append('dfs2', dfs, min_itemsize = { 'A' : 30 }) + store.get_storer('dfs2').table + +**nan_rep** + +String columns will serialize a ``np.nan`` (a missing value) with the ``nan_rep`` string representation. This defaults to the string value ``nan``. +You could inadvertently turn an actual ``nan`` value into a missing value. + +.. 
ipython:: python + + dfss = DataFrame(dict(A = ['foo','bar','nan'])) + dfss + + store.append('dfss', dfss) + store.select('dfss') + + # here you need to specify a different nan rep + store.append('dfss2', dfss, nan_rep='_nan_') + store.select('dfss2') + +External Compatibility +~~~~~~~~~~~~~~~~~~~~~~ + +``HDFStore`` write ``table`` format objects in specific formats suitable for +producing loss-less roundtrips to pandas objects. For external +compatibility, ``HDFStore`` can read native ``PyTables`` format +tables. It is possible to write an ``HDFStore`` object that can easily +be imported into ``R`` using the ``rhdf5`` library. Create a table +format store like this: + + .. ipython:: python + + store_export = HDFStore('export.h5') + store_export.append('df_dc', df_dc, data_columns=df_dc.columns) + store_export + + .. ipython:: python + :suppress: + + store_export.close() + import os + os.remove('export.h5') + +Backwards Compatibility +~~~~~~~~~~~~~~~~~~~~~~~ + +0.10.1 of ``HDFStore`` can read tables created in a prior version of pandas, +however query terms using the +prior (undocumented) methodology are unsupported. ``HDFStore`` will +issue a warning if you try to use a legacy-format file. You must +read in the entire file and write it out using the new format, using the +method ``copy`` to take advantage of the updates. The group attribute +``pandas_version`` contains the version information. ``copy`` takes a +number of options, please see the docstring. + + + .. ipython:: python + :suppress: + + import os + legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5') + + .. ipython:: python + + # a legacy store + legacy_store = HDFStore(legacy_file_path,'r') + legacy_store + + # copy (and return the new handle) + new_store = legacy_store.copy('store_new.h5') + new_store + new_store.close() + + .. ipython:: python + :suppress: + + legacy_store.close() + import os + os.remove('store_new.h5') + + +Performance +~~~~~~~~~~~ + + - ``Tables`` come with a writing performance penalty as compared to + regular stores. The benefit is the ability to append/delete and + query (potentially very large amounts of data). Write times are + generally longer as compared with regular stores. Query times can + be quite fast, especially on an indexed axis. + - You can pass ``chunksize=`` to ``append``, specifying the + write chunksize (default is 50000). This will signficantly lower + your memory usage on writing. + - You can pass ``expectedrows=`` to the first ``append``, + to set the TOTAL number of expected rows that ``PyTables`` will + expected. This will optimize read/write performance. + - Duplicate rows can be written to tables, but are filtered out in + selection (with the last items being selected; thus a table is + unique on major, minor pairs) + - A ``PerformanceWarning`` will be raised if you are attempting to + store types that will be pickled by PyTables (rather than stored as + endemic types). See + `Here `__ + for more information and some solutions. + +Experimental +~~~~~~~~~~~~ + +HDFStore supports ``Panel4D`` storage. + +.. ipython:: python + + p4d = Panel4D({ 'l1' : wp }) + p4d + store.append('p4d', p4d) + store + +These, by default, index the three axes ``items, major_axis, +minor_axis``. On an ``AppendableTable`` it is possible to setup with the +first append a different indexing scheme, depending on how you want to +store your data. Pass the ``axes`` keyword with a list of dimensions +(currently must by exactly 1 less than the total dimensions of the +object). 
This cannot be changed after table creation. + +.. ipython:: python + + store.append('p4d2', p4d, axes=['labels', 'major_axis', 'minor_axis']) + store + store.select('p4d2', [ Term('labels=l1'), Term('items=Item1'), Term('minor_axis=A_big_strings') ]) + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') + + +.. _io.sql: + +SQL Queries +----------- + +The :mod:`pandas.io.sql` module provides a collection of query wrappers to both +facilitate data retrieval and to reduce dependency on DB-specific API. Database abstraction +is provided by SQLAlchemy if installed, in addition you will need a driver library for +your database. + +.. versionadded:: 0.14.0 + +If SQLAlchemy is not installed, a fallback is only provided for sqlite (and +for mysql for backwards compatibility, but this is deprecated and will be +removed in a future version). +This mode requires a Python database adapter which respect the `Python +DB-API `__. + +See also some :ref:`cookbook examples ` for some advanced strategies. + +The key functions are: + +.. autosummary:: + :toctree: generated/ + + read_sql_table + read_sql_query + read_sql + DataFrame.to_sql + +.. note:: + + The function :func:`~pandas.read_sql` is a convenience wrapper around + :func:`~pandas.read_sql_table` and :func:`~pandas.read_sql_query` (and for + backward compatibility) and will delegate to specific function depending on + the provided input (database table name or sql query). + +In the following example, we use the `SQlite `__ SQL database +engine. You can use a temporary SQLite database where data are stored in +"memory". + +To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine +object from database URI. You only need to create the engine once per database you are +connecting to. +For more information on :func:`create_engine` and the URI formatting, see the examples +below and the SQLAlchemy `documentation `__ + +.. ipython:: python + + from sqlalchemy import create_engine + # Create your connection. + engine = create_engine('sqlite:///:memory:') + +Writing DataFrames +~~~~~~~~~~~~~~~~~~ + +Assuming the following data is in a DataFrame ``data``, we can insert it into +the database using :func:`~pandas.DataFrame.to_sql`. + ++-----+------------+-------+-------+-------+ +| id | Date | Col_1 | Col_2 | Col_3 | ++=====+============+=======+=======+=======+ +| 26 | 2012-10-18 | X | 25.7 | True | ++-----+------------+-------+-------+-------+ +| 42 | 2012-10-19 | Y | -12.4 | False | ++-----+------------+-------+-------+-------+ +| 63 | 2012-10-20 | Z | 5.73 | True | ++-----+------------+-------+-------+-------+ + + +.. ipython:: python + :suppress: + + import datetime + c = ['id', 'Date', 'Col_1', 'Col_2', 'Col_3'] + d = [(26, datetime.datetime(2010,10,18), 'X', 27.5, True), + (42, datetime.datetime(2010,10,19), 'Y', -12.5, False), + (63, datetime.datetime(2010,10,20), 'Z', 5.73, True)] + + data = DataFrame(d, columns=c) + +.. ipython:: python + + data.to_sql('data', engine) + +.. note:: + + Due to the limited support for timedelta's in the different database + flavors, columns with type ``timedelta64`` will be written as integer + values as nanoseconds to the database and a warning will be raised. + + +Reading Tables +~~~~~~~~~~~~~~ + +:func:`~pandas.read_sql_table` will read a database table given the +table name and optionally a subset of columns to read. + +.. note:: + + In order to use :func:`~pandas.read_sql_table`, you **must** have the + SQLAlchemy optional dependency installed. 
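+
+If you want to check up front whether the optional dependency is present, a
+minimal sketch (this is ordinary Python, not a pandas API) is to attempt the
+import yourself:
+
+.. code-block:: python
+
+   try:
+       import sqlalchemy
+       has_sqlalchemy = True
+   except ImportError:
+       # fall back to the sqlite-only mode described below
+       has_sqlalchemy = False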
+
+.. ipython:: python
+
+   pd.read_sql_table('data', engine)
+
+You can also specify the name of the column as the DataFrame index,
+and specify a subset of columns to be read.
+
+.. ipython:: python
+
+   pd.read_sql_table('data', engine, index_col='id')
+   pd.read_sql_table('data', engine, columns=['Col_1', 'Col_2'])
+
+And you can explicitly force columns to be parsed as dates:
+
+.. ipython:: python
+
+   pd.read_sql_table('data', engine, parse_dates=['Date'])
+
+If needed you can explicitly specify a format string, or a dict of arguments
+to pass to :func:`pandas.to_datetime`:
+
+.. code-block:: python
+
+   pd.read_sql_table('data', engine, parse_dates={'Date': '%Y-%m-%d'})
+   pd.read_sql_table('data', engine, parse_dates={'Date': {'format': '%Y-%m-%d %H:%M:%S'}})
+
+
+You can check if a table exists using :func:`~pandas.io.sql.has_table`.
+
+
+Querying
+~~~~~~~~
+
+You can query using raw SQL in the :func:`~pandas.read_sql_query` function.
+In this case you must use the SQL variant appropriate for your database.
+When using SQLAlchemy, you can also pass SQLAlchemy Expression language constructs,
+which are database-agnostic.
+
+.. ipython:: python
+
+   pd.read_sql_query('SELECT * FROM data', engine)
+
+Of course, you can specify a more "complex" query.
+
+.. ipython:: python
+
+   pd.read_sql_query("SELECT id, Col_1, Col_2 FROM data WHERE id = 42;", engine)
+
+
+You can also run a plain query without creating a DataFrame with
+:func:`~pandas.io.sql.execute`. This is useful for queries that don't return values,
+such as INSERT. This is functionally equivalent to calling ``execute`` on the
+SQLAlchemy engine or db connection object. Again, you must use the SQL syntax
+variant appropriate for your database.
+
+.. code-block:: python
+
+   from pandas.io import sql
+   sql.execute('SELECT * FROM table_name', engine)
+   sql.execute('INSERT INTO table_name VALUES(?, ?, ?, ?)', engine, params=[('id', 1, 12.2, True)])
+
+
+Engine connection examples
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine
+object from a database URI. You only need to create the engine once per database you are
+connecting to.
+
+.. code-block:: python
+
+   from sqlalchemy import create_engine
+
+   engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
+
+   engine = create_engine('mysql+mysqldb://scott:tiger@localhost/foo')
+
+   engine = create_engine('oracle://scott:tiger@127.0.0.1:1521/sidname')
+
+   engine = create_engine('mssql+pyodbc://mydsn')
+
+   # sqlite://<path>
+   # where <path> is relative:
+   engine = create_engine('sqlite:///foo.db')
+
+   # or absolute, starting with a slash:
+   engine = create_engine('sqlite:////absolute/path/to/foo.db')
+
+For more information see the examples in the SQLAlchemy `documentation `__
+
+
+Sqlite fallback
+~~~~~~~~~~~~~~~
+
+The use of sqlite is supported without using SQLAlchemy.
+This mode requires a Python database adapter which respects the `Python
+DB-API `__.
+
+You can create connections like so:
+
+.. code-block:: python
+
+   import sqlite3
+   con = sqlite3.connect(':memory:')
+
+And then issue the following queries:
+
+.. code-block:: python
+
+   data.to_sql('data', con)
+   pd.read_sql_query("SELECT * FROM data", con)
+
+
+.. _io.bigquery:
+
+Google BigQuery (Experimental)
+------------------------------
+
+.. versionadded:: 0.13.0
+
+The :mod:`pandas.io.gbq` module provides a wrapper for Google's BigQuery
+analytics web service to simplify retrieving results from BigQuery tables
+using SQL-like queries. 
Result sets are parsed into a pandas
+DataFrame with a shape and data types derived from the source table.
+Additionally, DataFrames can be appended to existing BigQuery tables if
+the destination table is the same shape as the DataFrame.
+
+For specifics on the service itself, see `here `__
+
+As an example, suppose you want to load all data from an existing BigQuery
+table `test_dataset.test_table` into a DataFrame using the :func:`~pandas.io.read_gbq`
+function.
+
+.. code-block:: python
+
+   # Insert your BigQuery Project ID Here
+   # Can be found in the Google web console
+   projectid = "xxxxxxxx"
+
+   data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table', project_id = projectid)
+
+You will then be authenticated to the specified BigQuery account
+via Google's OAuth2 mechanism. In general, this is as simple as following the
+prompts in a browser window which will be opened for you. Should the browser not
+be available, or fail to launch, a code will be provided to complete the process
+manually. Additional information on the authentication mechanism can be found
+`here `__
+
+You can define which column from BigQuery to use as an index in the
+destination DataFrame as well as a preferred column order as follows:
+
+.. code-block:: python
+
+   data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table',
+                            index_col='index_column_name',
+                            col_order=['col1', 'col2', 'col3'], project_id = projectid)
+
+Finally, you can append data to a BigQuery table from a pandas DataFrame
+using the :func:`~pandas.io.to_gbq` function. This function uses the
+Google streaming API which requires that your destination table exists in
+BigQuery. Given the BigQuery table already exists, your DataFrame should
+match the destination table in column order, structure, and data types.
+DataFrame indexes are not supported. By default, rows are streamed to
+BigQuery in chunks of 10,000 rows, but you can pass other chunk sizes
+via the ``chunksize`` argument. You can also see the progress of your
+post via the ``verbose`` flag which defaults to ``True``. The HTTP
+response code of Google BigQuery can be successful (200) even if the
+append failed. For this reason, if there is a failure to append to the
+table, the complete error response from BigQuery is returned, which
+can be quite long given it provides a status for each row. You may want
+to start with smaller chunks to test that the size and types of your
+DataFrame match your destination table to make debugging simpler.
+
+.. code-block:: python
+
+   df = pandas.DataFrame({'string_col_name' : ['hello'],
+                          'integer_col_name' : [1],
+                          'boolean_col_name' : [True]})
+   df.to_gbq('my_dataset.my_table', project_id = projectid)
+
+The BigQuery SQL query language has some oddities, see `here `__
+
+While BigQuery uses SQL-like syntax, it has some important differences
+from traditional databases in functionality, API limitations (size and
+quantity of queries or uploads), and how Google charges for use of the service.
+You should refer to Google documentation often as the service seems to
+be changing and evolving. BigQuery is best for analyzing large sets of
+data quickly, but it is not a direct replacement for a transactional database.
+
+You can access the Google management console to determine project id's.
+
+
+.. warning::
+
+   To use this module, you will need a valid BigQuery account. See
+   Google's BigQuery documentation for details on the service.
+
+.. _io.stata:
+
+STATA Format
+------------
+
+.. versionadded:: 0.12.0
+
+.. 
_io.stata_writer: + +Writing to STATA format +~~~~~~~~~~~~~~~~~~~~~~~ + +The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame +into a .dta file. The format version of this file is always 115 (Stata 12). + +.. ipython:: python + + df = DataFrame(randn(10, 2), columns=list('AB')) + df.to_stata('stata.dta') + +.. _io.stata_reader: + +Reading from STATA format +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The top-level function ``read_stata`` will read a dta format file +and return a DataFrame: +The class :class:`~pandas.io.stata.StataReader` will read the header of the +given dta file at initialization. Its method +:func:`~pandas.io.stata.StataReader.data` will read the observations, +converting them to a DataFrame which is returned: + +.. ipython:: python + + pd.read_stata('stata.dta') + +Currently the ``index`` is retrieved as a column on read back. + +The parameter ``convert_categoricals`` indicates wheter value labels should be +read and used to create a ``Categorical`` variable from them. Value labels can +also be retrieved by the function ``variable_labels``, which requires data to be +called before (see ``pandas.io.stata.StataReader``). + +The StataReader supports .dta Formats 104, 105, 108, 113-115 and 117. +Alternatively, the function :func:`~pandas.io.stata.read_stata` can be used + +.. ipython:: python + :suppress: + + import os + os.remove('stata.dta') + +.. _io.perf: + +Performance Considerations +-------------------------- + +This is an informal comparison of various IO methods, using pandas 0.13.1. + +.. code-block:: python + + In [3]: df = DataFrame(randn(1000000,2),columns=list('AB')) + + Int64Index: 1000000 entries, 0 to 999999 + Data columns (total 2 columns): + A 1000000 non-null values + B 1000000 non-null values + dtypes: float64(2) + + +Writing + +.. code-block:: python + + In [14]: %timeit test_sql_write(df) + 1 loops, best of 3: 6.24 s per loop + + In [15]: %timeit test_hdf_fixed_write(df) + 1 loops, best of 3: 237 ms per loop + + In [26]: %timeit test_hdf_fixed_write_compress(df) + 1 loops, best of 3: 245 ms per loop + + In [16]: %timeit test_hdf_table_write(df) + 1 loops, best of 3: 901 ms per loop + + In [27]: %timeit test_hdf_table_write_compress(df) + 1 loops, best of 3: 952 ms per loop + + In [17]: %timeit test_csv_write(df) + 1 loops, best of 3: 3.44 s per loop + +Reading + +.. code-block:: python + + In [18]: %timeit test_sql_read() + 1 loops, best of 3: 766 ms per loop + + In [19]: %timeit test_hdf_fixed_read() + 10 loops, best of 3: 19.1 ms per loop + + In [28]: %timeit test_hdf_fixed_read_compress() + 10 loops, best of 3: 36.3 ms per loop + + In [20]: %timeit test_hdf_table_read() + 10 loops, best of 3: 39 ms per loop + + In [29]: %timeit test_hdf_table_read_compress() + 10 loops, best of 3: 60.6 ms per loop + + In [22]: %timeit test_csv_read() + 1 loops, best of 3: 620 ms per loop + +Space on disk (in bytes) + +.. code-block:: python + + 25843712 Apr 8 14:11 test.sql + 24007368 Apr 8 14:11 test_fixed.hdf + 15580682 Apr 8 14:11 test_fixed_compress.hdf + 24458444 Apr 8 14:11 test_table.hdf + 16797283 Apr 8 14:11 test_table_compress.hdf + 46152810 Apr 8 14:11 test.csv + +And here's the code + +.. 
code-block:: python + + import sqlite3 + import os + from pandas.io import sql + + df = DataFrame(randn(1000000,2),columns=list('AB')) + + def test_sql_write(df): + if os.path.exists('test.sql'): + os.remove('test.sql') + sql_db = sqlite3.connect('test.sql') + sql.write_frame(df, name='test_table', con=sql_db) + sql_db.close() + + def test_sql_read(): + sql_db = sqlite3.connect('test.sql') + sql.read_frame("select * from test_table", sql_db) + sql_db.close() + + def test_hdf_fixed_write(df): + df.to_hdf('test_fixed.hdf','test',mode='w') + + def test_hdf_fixed_read(): + pd.read_hdf('test_fixed.hdf','test') + + def test_hdf_fixed_write_compress(df): + df.to_hdf('test_fixed_compress.hdf','test',mode='w',complib='blosc') + + def test_hdf_fixed_read_compress(): + pd.read_hdf('test_fixed_compress.hdf','test') + + def test_hdf_table_write(df): + df.to_hdf('test_table.hdf','test',mode='w',format='table') + + def test_hdf_table_read(): + pd.read_hdf('test_table.hdf','test') + + def test_hdf_table_write_compress(df): + df.to_hdf('test_table_compress.hdf','test',mode='w',complib='blosc',format='table') + + def test_hdf_table_read_compress(): + pd.read_hdf('test_table_compress.hdf','test') + + def test_csv_write(df): + df.to_csv('test.csv',mode='w') + + def test_csv_read(): + pd.read_csv('test.csv',index_col=0) diff --git a/doc/source/merging.rst b/doc/source/merging.rst new file mode 100644 index 00000000..04fb0b06 --- /dev/null +++ b/doc/source/merging.rst @@ -0,0 +1,713 @@ +.. currentmodule:: pandas +.. _merging: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from numpy import nan + from pandas import * + options.display.max_rows=15 + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + +**************************** +Merge, join, and concatenate +**************************** + +pandas provides various facilities for easily combining together Series, +DataFrame, and Panel objects with various kinds of set logic for the indexes +and relational algebra functionality in the case of join / merge-type +operations. + +.. _merging.concat: + +Concatenating objects +--------------------- + +The ``concat`` function (in the main pandas namespace) does all of the heavy +lifting of performing concatenation operations along an axis while performing +optional set logic (union or intersection) of the indexes (if any) on the other +axes. Note that I say "if any" because there is only a single possible axis of +concatenation for Series. + +Before diving into all of the details of ``concat`` and what it can do, here is +a simple example: + +.. ipython:: python + + df = DataFrame(np.random.randn(10, 4)) + df + + # break it into pieces + pieces = [df[:3], df[3:7], df[7:]] + + concatenated = concat(pieces) + concatenated + +Like its sibling function on ndarrays, ``numpy.concatenate``, ``pandas.concat`` +takes a list or dict of homogeneously-typed objects and concatenates them with +some configurable handling of "what to do with the other axes": + +:: + + concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, + keys=None, levels=None, names=None, verify_integrity=False) + +- ``objs``: list or dict of Series, DataFrame, or Panel objects. If a dict is + passed, the sorted keys will be used as the `keys` argument, unless it is + passed, in which case the values will be selected (see below) +- ``axis``: {0, 1, ...}, default 0. The axis to concatenate along +- ``join``: {'inner', 'outer'}, default 'outer'. How to handle indexes on + other axis(es). 
Outer for union and inner for intersection +- ``join_axes``: list of Index objects. Specific indexes to use for the other + n - 1 axes instead of performing inner/outer set logic +- ``keys``: sequence, default None. Construct hierarchical index using the + passed keys as the outermost level If multiple levels passed, should + contain tuples. +- ``levels`` : list of sequences, default None. If keys passed, specific + levels to use for the resulting MultiIndex. Otherwise they will be inferred + from the keys +- ``names``: list, default None. Names for the levels in the resulting + hierarchical index +- ``verify_integrity``: boolean, default False. Check whether the new + concatenated axis contains duplicates. This can be very expensive relative + to the actual data concatenation +- ``ignore_index`` : boolean, default False. If True, do not use the index + values on the concatenation axis. The resulting axis will be labeled 0, ..., + n - 1. This is useful if you are concatenating objects where the + concatenation axis does not have meaningful indexing information. + +Without a little bit of context and example many of these arguments don't make +much sense. Let's take the above example. Suppose we wanted to associate +specific keys with each of the pieces of the chopped up DataFrame. We can do +this using the ``keys`` argument: + +.. ipython:: python + + concatenated = concat(pieces, keys=['first', 'second', 'third']) + concatenated + +As you can see (if you've read the rest of the documentation), the resulting +object's index has a :ref:`hierarchical index `. This +means that we can now do stuff like select out each chunk by key: + +.. ipython:: python + + concatenated.ix['second'] + +It's not a stretch to see how this can be very useful. More detail on this +functionality below. + +Set logic on the other axes +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When gluing together multiple DataFrames (or Panels or...), for example, you +have a choice of how to handle the other axes (other than the one being +concatenated). This can be done in three ways: + +- Take the (sorted) union of them all, ``join='outer'``. This is the default + option as it results in zero information loss. +- Take the intersection, ``join='inner'``. +- Use a specific index (in the case of DataFrame) or indexes (in the case of + Panel or future higher dimensional objects), i.e. the ``join_axes`` argument + +Here is a example of each of these methods. First, the default ``join='outer'`` +behavior: + +.. ipython:: python + + from pandas.util.testing import rands + df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'], + index=[rands(5) for _ in range(10)]) + df + + concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], + df.ix[-7:, ['d']]], axis=1) + +Note that the row indexes have been unioned and sorted. Here is the same thing +with ``join='inner'``: + +.. ipython:: python + + concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], + df.ix[-7:, ['d']]], axis=1, join='inner') + +Lastly, suppose we just wanted to reuse the *exact index* from the original +DataFrame: + +.. ipython:: python + + concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], + df.ix[-7:, ['d']]], axis=1, join_axes=[df.index]) + +.. _merging.concatenation: + +Concatenating using ``append`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A useful shortcut to ``concat`` are the ``append`` instance methods on Series +and DataFrame. These methods actually predated ``concat``. They concatenate +along ``axis=0``, namely the index: + +.. 
ipython:: python + + s = Series(randn(10), index=np.arange(10)) + s1 = s[:5] # note we're slicing with labels here, so 5 is included + s2 = s[6:] + s1.append(s2) + +In the case of DataFrame, the indexes must be disjoint but the columns do not +need to be: + +.. ipython:: python + + df = DataFrame(randn(6, 4), index=date_range('1/1/2000', periods=6), + columns=['A', 'B', 'C', 'D']) + df1 = df.ix[:3] + df2 = df.ix[3:, :3] + df1 + df2 + df1.append(df2) + +``append`` may take multiple objects to concatenate: + +.. ipython:: python + + df1 = df.ix[:2] + df2 = df.ix[2:4] + df3 = df.ix[4:] + df1.append([df2,df3]) + +.. note:: + + Unlike `list.append` method, which appends to the original list and + returns nothing, ``append`` here **does not** modify ``df1`` and + returns its copy with ``df2`` appended. + +.. _merging.ignore_index: + +Ignoring indexes on the concatenation axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For DataFrames which don't have a meaningful index, you may wish to append them +and ignore the fact that they may have overlapping indexes: + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + df2 = DataFrame(randn(3, 4), columns=['A', 'B', 'C', 'D']) + + df1 + df2 + +To do this, use the ``ignore_index`` argument: + +.. ipython:: python + + concat([df1, df2], ignore_index=True) + +This is also a valid argument to ``DataFrame.append``: + +.. ipython:: python + + df1.append(df2, ignore_index=True) + +.. _merging.mixed_ndims: + +Concatenating with mixed ndims +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can concatenate a mix of Series and DataFrames. The +Series will be transformed to DataFrames with the column name as +the name of the Series. + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + s1 = Series(randn(6), name='foo') + concat([df1, s1],axis=1) + +If unnamed Series are passed they will be numbered consecutively. + +.. ipython:: python + + s2 = Series(randn(6)) + concat([df1, s2, s2, s2],axis=1) + +Passing ``ignore_index=True`` will drop all name references. + +.. ipython:: python + + concat([df1, s1],axis=1,ignore_index=True) + +More concatenating with group keys +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's consider a variation on the first example presented: + +.. ipython:: python + + df = DataFrame(np.random.randn(10, 4)) + df + + # break it into pieces + pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] + + result = concat(pieces, axis=1, keys=['one', 'two', 'three']) + result + +You can also pass a dict to ``concat`` in which case the dict keys will be used +for the ``keys`` argument (unless other keys are specified): + +.. ipython:: python + + pieces = {'one': df.ix[:, [0, 1]], + 'two': df.ix[:, [2]], + 'three': df.ix[:, [3]]} + concat(pieces, axis=1) + concat(pieces, keys=['three', 'two']) + +The MultiIndex created has levels that are constructed from the passed keys and +the columns of the DataFrame pieces: + +.. ipython:: python + + result.columns.levels + +If you wish to specify other levels (as will occasionally be the case), you can +do so using the ``levels`` argument: + +.. ipython:: python + + result = concat(pieces, axis=1, keys=['one', 'two', 'three'], + levels=[['three', 'two', 'one', 'zero']], + names=['group_key']) + result + result.columns.levels + +Yes, this is fairly esoteric, but is actually necessary for implementing things +like GroupBy where the order of a categorical variable is meaningful. + +.. 
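+
+As a small, self-contained sketch of the point above (the frame, keys and the
+``priority`` name are purely illustrative), an explicit ``levels`` list
+preserves a non-alphabetical ordering in the resulting level:
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   df = pd.DataFrame(np.random.randn(4, 3), columns=list('abc'))
+   pieces = [df[['a']], df[['b']], df[['c']]]
+
+   # without ``levels`` the keys would be stored in sorted order; with an
+   # explicit list the 'low' < 'mid' < 'high' ordering is kept in the level
+   result = pd.concat(pieces, axis=1, keys=['low', 'mid', 'high'],
+                      levels=[['low', 'mid', 'high']], names=['priority'])
+   result.columns.levels[0]
+
+.. 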
_merging.append.row: + +Appending rows to a DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While not especially efficient (since a new object must be created), you can +append a single row to a DataFrame by passing a Series or dict to ``append``, +which returns a new DataFrame as above. + +.. ipython:: python + + df = DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df + s = df.xs(3) + df.append(s, ignore_index=True) + +You should use ``ignore_index`` with this method to instruct DataFrame to +discard its index. If you wish to preserve the index, you should construct an +appropriately-indexed DataFrame and append or concatenate those objects. + +You can also pass a list of dicts or Series: + +.. ipython:: python + + df = DataFrame(np.random.randn(5, 4), + columns=['foo', 'bar', 'baz', 'qux']) + dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, + {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] + result = df.append(dicts, ignore_index=True) + result + +.. _merging.join: + +Database-style DataFrame joining/merging +---------------------------------------- + +pandas has full-featured, **high performance** in-memory join operations +idiomatically very similar to relational databases like SQL. These methods +perform significantly better (in some cases well over an order of magnitude +better) than other open source implementations (like ``base::merge.data.frame`` +in R). The reason for this is careful algorithmic design and internal layout of +the data in DataFrame. + +See the :ref:`cookbook` for some advanced strategies. + +Users who are familiar with SQL but new to pandas might be interested in a +:ref:`comparison with SQL`. + +pandas provides a single function, ``merge``, as the entry point for all +standard database join operations between DataFrame objects: + +:: + + merge(left, right, how='left', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True) + +Here's a description of what each argument is for: + + - ``left``: A DataFrame object + - ``right``: Another DataFrame object + - ``on``: Columns (names) to join on. Must be found in both the left and + right DataFrame objects. If not passed and ``left_index`` and + ``right_index`` are ``False``, the intersection of the columns in the + DataFrames will be inferred to be the join keys + - ``left_on``: Columns from the left DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame + - ``right_on``: Columns from the right DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame + - ``left_index``: If ``True``, use the index (row labels) from the left + DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex + (hierarchical), the number of levels must match the number of join keys + from the right DataFrame + - ``right_index``: Same usage as ``left_index`` for the right DataFrame + - ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults + to ``inner``. See below for more detailed description of each method + - ``sort``: Sort the result DataFrame by the join keys in lexicographical + order. Defaults to ``True``, setting to ``False`` will improve performance + substantially in many cases + - ``suffixes``: A tuple of string suffixes to apply to overlapping + columns. Defaults to ``('_x', '_y')``. 
+ - ``copy``: Always copy data (default ``True``) from the passed DataFrame + objects, even when reindexing is not necessary. Cannot be avoided in many + cases but may improve performance / memory usage. The cases where copying + can be avoided are somewhat pathological but this option is provided + nonetheless. + +``merge`` is a function in the pandas namespace, and it is also available as a +DataFrame instance method, with the calling DataFrame being implicitly +considered the left object in the join. + +The related ``DataFrame.join`` method, uses ``merge`` internally for the +index-on-index and index-on-column(s) joins, but *joins on indexes* by default +rather than trying to join on common columns (the default behavior for +``merge``). If you are joining on index, you may wish to use ``DataFrame.join`` +to save yourself some typing. + +Brief primer on merge methods (relational algebra) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Experienced users of relational databases like SQL will be familiar with the +terminology used to describe join operations between two SQL-table like +structures (DataFrame objects). There are several cases to consider which are +very important to understand: + + - **one-to-one** joins: for example when joining two DataFrame objects on + their indexes (which must contain unique values) + - **many-to-one** joins: for example when joining an index (unique) to one or + more columns in a DataFrame + - **many-to-many** joins: joining columns on columns. + +.. note:: + + When joining columns on columns (potentially a many-to-many join), any + indexes on the passed DataFrame objects **will be discarded**. + + +It is worth spending some time understanding the result of the **many-to-many** +join case. In SQL / standard relational algebra, if a key combination appears +more than once in both tables, the resulting table will have the **Cartesian +product** of the associated data. Here is a very basic example with one unique +key combination: + +.. ipython:: python + + left = DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) + right = DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) + left + right + merge(left, right, on='key') + +Here is a more complicated example with multiple join keys: + +.. ipython:: python + + left = DataFrame({'key1': ['foo', 'foo', 'bar'], + 'key2': ['one', 'two', 'one'], + 'lval': [1, 2, 3]}) + right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'], + 'key2': ['one', 'one', 'one', 'two'], + 'rval': [4, 5, 6, 7]}) + merge(left, right, how='outer') + merge(left, right, how='inner') + +The ``how`` argument to ``merge`` specifies how to determine which keys are to +be included in the resulting table. If a key combination **does not appear** in +either the left or right tables, the values in the joined table will be +``NA``. Here is a summary of the ``how`` options and their SQL equivalent names: + +.. csv-table:: + :header: "Merge method", "SQL Join Name", "Description" + :widths: 20, 20, 60 + + ``left``, ``LEFT OUTER JOIN``, Use keys from left frame only + ``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only + ``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames + ``inner``, ``INNER JOIN``, Use intersection of keys from both frames + +.. _merging.join.index: + +Joining on index +~~~~~~~~~~~~~~~~ + +``DataFrame.join`` is a convenient method for combining the columns of two +potentially differently-indexed DataFrames into a single result DataFrame. Here +is a very basic example: + +.. 
ipython:: python + + df = DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df1 = df.ix[1:, ['A', 'B']] + df2 = df.ix[:5, ['C', 'D']] + df1 + df2 + df1.join(df2) + df1.join(df2, how='outer') + df1.join(df2, how='inner') + +The data alignment here is on the indexes (row labels). This same behavior can +be achieved using ``merge`` plus additional arguments instructing it to use the +indexes: + +.. ipython:: python + + merge(df1, df2, left_index=True, right_index=True, how='outer') + +Joining key columns on an index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``join`` takes an optional ``on`` argument which may be a column or multiple +column names, which specifies that the passed DataFrame is to be aligned on +that column in the DataFrame. These two function calls are completely +equivalent: + +:: + + left.join(right, on=key_or_keys) + merge(left, right, left_on=key_or_keys, right_index=True, + how='left', sort=False) + +Obviously you can choose whichever form you find more convenient. For +many-to-one joins (where one of the DataFrame's is already indexed by the join +key), using ``join`` may be more convenient. Here is a simple example: + +.. ipython:: python + + df['key'] = ['foo', 'bar'] * 4 + to_join = DataFrame(randn(2, 2), index=['bar', 'foo'], + columns=['j1', 'j2']) + df + to_join + df.join(to_join, on='key') + merge(df, to_join, left_on='key', right_index=True, + how='left', sort=False) + +.. _merging.multikey_join: + +To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + data + to_join + +Now this can be joined by passing the two key column names: + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2']) + +.. _merging.df_inner_join: + +The default for ``DataFrame.join`` is to perform a left join (essentially a +"VLOOKUP" operation, for Excel users), which uses only the keys found in the +calling DataFrame. Other join types, for example inner join, can be just as +easily performed: + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2'], how='inner') + +As you can see, this drops any rows where there was no match. + +Overlapping value columns +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The merge ``suffixes`` argument takes a tuple of list of strings to append to +overlapping column names in the input DataFrames to disambiguate the result +columns: + +.. ipython:: python + + left = DataFrame({'key': ['foo', 'foo'], 'value': [1, 2]}) + right = DataFrame({'key': ['foo', 'foo'], 'value': [4, 5]}) + merge(left, right, on='key', suffixes=['_left', '_right']) + +``DataFrame.join`` has ``lsuffix`` and ``rsuffix`` arguments which behave +similarly. + +.. _merging.ordered_merge: + +Merging Ordered Data +~~~~~~~~~~~~~~~~~~~~ + +New in v0.8.0 is the ordered_merge function for combining time series and other +ordered data. In particular it has an optional ``fill_method`` keyword to +fill/interpolate missing data: + +.. 
ipython:: python + :suppress: + + A = DataFrame({'key' : ['a', 'c', 'e'] * 2, + 'lvalue' : [1, 2, 3] * 2, + 'group' : ['a', 'a', 'a', 'b', 'b', 'b']}) + B = DataFrame({'key' : ['b', 'c', 'd'], + 'rvalue' : [1, 2, 3]}) + +.. ipython:: python + + A + + B + + ordered_merge(A, B, fill_method='ffill', left_by='group') + +.. _merging.multiple_join: + +Joining multiple DataFrame or Panel objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A list or tuple of DataFrames can also be passed to ``DataFrame.join`` to join +them together on their indexes. The same is true for ``Panel.join``. + +.. ipython:: python + + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + df1 + df1.join([df2, df3]) + +.. _merging.combine_first.update: + +Merging together values within Series or DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Another fairly common situation is to have two like-indexed (or similarly +indexed) Series or DataFrame objects and wanting to "patch" values in one +object from values for matching indices in the other. Here is an example: + +.. ipython:: python + + df1 = DataFrame([[nan, 3., 5.], [-4.6, np.nan, nan], + [nan, 7., nan]]) + df2 = DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]], + index=[1, 2]) + +For this, use the ``combine_first`` method: + +.. ipython:: python + + df1.combine_first(df2) + +Note that this method only takes values from the right DataFrame if they are +missing in the left DataFrame. A related method, ``update``, alters non-NA +values inplace: + +.. ipython:: python + + df1.update(df2) + df1 + +.. _merging.on_mi: + +Merging with Multi-indexes +-------------------------- + +.. _merging.join_on_mi: + +Joining a single Index to a Multi-index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14.0 + +You can join a singly-indexed DataFrame with a level of a multi-indexed DataFrame. +The level will match on the name of the index of the singly-indexed frame against +a level name of the multi-indexed frame. + +.. ipython:: python + + household = DataFrame(dict(household_id = [1,2,3], + male = [0,1,0], + wealth = [196087.3,316478.7,294750]), + columns = ['household_id','male','wealth'] + ).set_index('household_id') + household + portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29", + "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell", + "AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','name','share'] + ).set_index(['household_id','asset_id']) + portfolio + + household.join(portfolio, how='inner') + +This is equivalent but less verbose and more memory efficient / faster than this. + +.. code-block:: python + + merge(household.reset_index(), + portfolio.reset_index(), + on=['household_id'], + how='inner' + ).set_index(['household_id','asset_id']) + +Joining with two multi-indexes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is not Implemented via ``join`` at-the-moment, however it can be done using the following. + +.. 
ipython:: python + + household = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29", + "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','share'] + ).set_index(['household_id','asset_id']) + household + + log_return = DataFrame(dict(asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", + "lu0197800237", "lu0197800237"], + t = [233, 234, 235, 180, 181], + log_return = [.09604978, -.06524096, .03532373, .03025441, .036997]), + ).set_index(["asset_id","t"]) + log_return + + merge(household.reset_index(), + log_return.reset_index(), + on=['asset_id'], + how='inner' + ).set_index(['household_id','asset_id','t']) diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst new file mode 100644 index 00000000..9263eb2c --- /dev/null +++ b/doc/source/missing_data.rst @@ -0,0 +1,671 @@ +.. currentmodule:: pandas +.. _missing_data: + +.. ipython:: python + :suppress: + + from pandas import * + options.display.max_rows=15 + +************************* +Working with missing data +************************* + +In this section, we will discuss missing (also referred to as NA) values in +pandas. + +.. ipython:: python + :suppress: + + import numpy as np; randn = np.random.randn; randint =np.random.randint + from pandas import * + import matplotlib.pyplot as plt + from pandas.compat import lrange + +.. note:: + + The choice of using ``NaN`` internally to denote missing data was largely + for simplicity and performance reasons. It differs from the MaskedArray + approach of, for example, :mod:`scikits.timeseries`. We are hopeful that + NumPy will soon be able to provide a native NA type solution (similar to R) + performant enough to be used in pandas. + +See the :ref:`cookbook` for some advanced strategies + +Missing data basics +------------------- + +When / why does data become missing? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some might quibble over our usage of *missing*. By "missing" we simply mean +**null** or "not present for whatever reason". Many data sets simply arrive with +missing data, either because it exists and was not collected or it never +existed. For example, in a collection of financial time series, some of the time +series might start on different dates. Thus, values prior to the start date +would generally be marked as missing. + +In pandas, one of the most common ways that missing data is **introduced** into +a data set is by reindexing. For example + +.. ipython:: python + + df = DataFrame(randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], + columns=['one', 'two', 'three']) + df['four'] = 'bar' + df['five'] = df['one'] > 0 + df + df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']) + df2 + +Values considered "missing" +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As data comes in many shapes and forms, pandas aims to be flexible with regard +to handling missing data. While ``NaN`` is the default missing value marker for +reasons of computational speed and convenience, we need to be able to easily +detect this value with data of different types: floating point, integer, +boolean, and general object. In many cases, however, the Python ``None`` will +arise and we wish to also consider that "missing" or "null". + +Until recently, for legacy reasons ``inf`` and ``-inf`` were also +considered to be "null" in computations. This is no longer the case by +default; use the ``mode.use_inf_as_null`` option to recover it. + +.. 
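+
+A minimal sketch of opting back in to that legacy behaviour, using the
+``isnull`` function introduced just below (the data is illustrative only):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   s = pd.Series([1.0, np.nan, np.inf])
+
+   # by default only NaN counts as missing
+   pd.isnull(s)
+
+   # temporarily treat inf / -inf as missing as well
+   with pd.option_context('mode.use_inf_as_null', True):
+       print(pd.isnull(s))
+
+.. 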
_missing.isnull: + +To make detecting missing values easier (and across different array dtypes), +pandas provides the :func:`~pandas.core.common.isnull` and +:func:`~pandas.core.common.notnull` functions, which are also methods on +``Series`` objects: + +.. ipython:: python + + df2['one'] + isnull(df2['one']) + df2['four'].notnull() + +**Summary:** ``NaN`` and ``None`` (in object arrays) are considered +missing by the ``isnull`` and ``notnull`` functions. ``inf`` and +``-inf`` are no longer considered missing by default. + +Datetimes +--------- + +For datetime64[ns] types, ``NaT`` represents missing values. This is a pseudo-native +sentinel value that can be represented by numpy in a singular dtype (datetime64[ns]). +pandas objects provide intercompatibility between ``NaT`` and ``NaN``. + +.. ipython:: python + + df2 = df.copy() + df2['timestamp'] = Timestamp('20120101') + df2 + df2.ix[['a','c','h'],['one','timestamp']] = np.nan + df2 + df2.get_dtype_counts() + + +Calculations with missing data +------------------------------ + +Missing values propagate naturally through arithmetic operations between pandas +objects. + +.. ipython:: python + :suppress: + + df = df2.ix[:, ['one', 'two', 'three']] + a = df2.ix[:5, ['one', 'two']].fillna(method='pad') + b = df2.ix[:5, ['one', 'two', 'three']] + +.. ipython:: python + + a + b + a + b + +The descriptive statistics and computational methods discussed in the +:ref:`data structure overview ` (and listed :ref:`here +` and :ref:`here `) are all written to +account for missing data. For example: + + * When summing data, NA (missing) values will be treated as zero + * If the data are all NA, the result will be NA + * Methods like **cumsum** and **cumprod** ignore NA values, but preserve them + in the resulting arrays + +.. ipython:: python + + df + df['one'].sum() + df.mean(1) + df.cumsum() + +NA values in GroupBy +~~~~~~~~~~~~~~~~~~~~ + +NA groups in GroupBy are automatically excluded. This behavior is consistent +with R, for example. + + + +Cleaning / filling missing data +-------------------------------- + +pandas objects are equipped with various data manipulation methods for dealing +with missing data. + +.. _missing_data.fillna: + +Filling missing values: fillna +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The **fillna** function can "fill in" NA values with non-null data in a couple +of ways, which we illustrate: + +**Replace NA with a scalar value** + +.. ipython:: python + + df2 + df2.fillna(0) + df2['four'].fillna('missing') + +**Fill gaps forward or backward** + +Using the same filling arguments as :ref:`reindexing `, we +can propagate non-null values forward or backward: + +.. ipython:: python + + df + df.fillna(method='pad') + +.. _missing_data.fillna.limit: + +**Limit the amount of filling** + +If we only want consecutive gaps filled up to a certain number of data points, +we can use the `limit` keyword: + +.. ipython:: python + :suppress: + + df.ix[2:4, :] = np.nan + +.. ipython:: python + + df + df.fillna(method='pad', limit=1) + +To remind you, these are the available filling methods: + +.. csv-table:: + :header: "Method", "Action" + :widths: 30, 50 + + pad / ffill, Fill values forward + bfill / backfill, Fill values backward + +With time series data, using pad/ffill is extremely common so that the "last +known value" is available at every time point. + +The ``ffill()`` function is equivalent to ``fillna(method='ffill')`` +and ``bfill()`` is equivalent to ``fillna(method='bfill')`` + +.. 
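+
+A short sketch of that equivalence (the data is illustrative only):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   s = pd.Series([1.0, np.nan, np.nan, 4.0])
+
+   # forward fill: both spellings give the same result
+   s.ffill()
+   s.fillna(method='ffill')
+
+   # backward fill
+   s.bfill()
+   s.fillna(method='bfill')
+
+.. 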
_missing_data.PandasObject: + +Filling with a PandasObject +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.12 + +You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series +must match the columns of the frame you wish to fill. The +use case of this is to fill a DataFrame with the mean of that column. + +.. ipython:: python + + dff = DataFrame(np.random.randn(10,3),columns=list('ABC')) + dff.iloc[3:5,0] = np.nan + dff.iloc[4:6,1] = np.nan + dff.iloc[5:8,2] = np.nan + dff + + dff.fillna(dff.mean()) + dff.fillna(dff.mean()['B':'C']) + +.. versionadded:: 0.13 + +Same result as above, but is aligning the 'fill' value which is +a Series in this case. + +.. ipython:: python + + dff.where(notnull(dff),dff.mean(),axis='columns') + + +.. _missing_data.dropna: + +Dropping axis labels with missing data: dropna +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may wish to simply exclude labels from a data set which refer to missing +data. To do this, use the **dropna** method: + +.. ipython:: python + :suppress: + + df['two'] = df['two'].fillna(0) + df['three'] = df['three'].fillna(0) + +.. ipython:: python + + df + df.dropna(axis=0) + df.dropna(axis=1) + df['one'].dropna() + +**dropna** is presently only implemented for Series and DataFrame, but will be +eventually added to Panel. Series.dropna is a simpler method as it only has one +axis to consider. DataFrame.dropna has considerably more options, which can be +examined :ref:`in the API `. + +.. _missing_data.interpolate: + +Interpolation +~~~~~~~~~~~~~ + +.. versionadded:: 0.13.0 + + :meth:`~pandas.DataFrame.interpolate`, and :meth:`~pandas.Series.interpolate` have + revamped interpolation methods and functionaility. + +Both Series and Dataframe objects have an ``interpolate`` method that, by default, +performs linear interpolation at missing datapoints. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + idx = date_range('1/1/2000', periods=100, freq='BM') + ts = Series(randn(100), index=idx) + ts[1:20] = np.nan + ts[60:80] = np.nan + ts = ts.cumsum() + +.. ipython:: python + + ts + ts.count() + ts.interpolate().count() + + plt.figure() + @savefig series_interpolate.png + ts.interpolate().plot() + +Index aware interpolation is available via the ``method`` keyword: + +.. ipython:: python + :suppress: + + ts2 = ts[[0, 1, 30, 60, 99]] + +.. ipython:: python + + ts2 + ts2.interpolate() + ts2.interpolate(method='time') + +For a floating-point index, use ``method='values'``: + +.. ipython:: python + :suppress: + + idx = [0., 1., 10.] + ser = Series([0., np.nan, 10.], idx) + +.. ipython:: python + + ser + ser.interpolate() + ser.interpolate(method='values') + +You can also interpolate with a DataFrame: + +.. ipython:: python + + df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], + 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df + df.interpolate() + +The ``method`` argument gives access to fancier interpolation methods. +If you have scipy_ installed, you can set pass the name of a 1-d interpolation routine to ``method``. +You'll want to consult the full scipy interpolation documentation_ and reference guide_ for details. +The appropriate interpolation method will depend on the type of data you are working with. +For example, if you are dealing with a time series that is growing at an increasing rate, +``method='quadratic'`` may be appropriate. If you have values approximating a cumulative +distribution function, then ``method='pchip'`` should work well. + +.. 
warning:: + + These methods require ``scipy``. + +.. ipython:: python + + df.interpolate(method='barycentric') + + df.interpolate(method='pchip') + +When interpolating via a polynomial or spline approximation, you must also specify +the degree or order of the approximation: + +.. ipython:: python + + df.interpolate(method='spline', order=2) + + df.interpolate(method='polynomial', order=2) + +Compare several methods: + +.. ipython:: python + + np.random.seed(2) + + ser = Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37)) + bad = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29]) + ser[bad] = np.nan + methods = ['linear', 'quadratic', 'cubic'] + + df = DataFrame({m: ser.interpolate(method=m) for m in methods}) + plt.figure() + @savefig compare_interpolations.png + df.plot() + +Another use case is interpolation at *new* values. +Suppose you have 100 observations from some distribution. And let's suppose +that you're particularly interested in what's happening around the middle. +You can mix pandas' ``reindex`` and ``interpolate`` methods to interpolate +at the new values. + +.. ipython:: python + + ser = Series(np.sort(np.random.uniform(size=100))) + + # interpolate at new_index + new_index = ser.index + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + interp_s = ser.reindex(new_index).interpolate(method='pchip') + interp_s[49:51] + +.. _scipy: http://www.scipy.org +.. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation +.. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html + + +Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword +argument. Use this to limit the number of consecutive interpolations, keeping +``NaN`` values for interpolations that are too far from the last valid +observation: + +.. ipython:: python + + ser = Series([1, 3, np.nan, np.nan, np.nan, 11]) + ser.interpolate(limit=2) + +.. _missing_data.replace: + +Replacing Generic Values +~~~~~~~~~~~~~~~~~~~~~~~~ +Often times we want to replace arbitrary values with other values. New in v0.8 +is the ``replace`` method in Series/DataFrame that provides an efficient yet +flexible way to perform such replacements. + +For a Series, you can replace a single value or a list of values by another +value: + +.. ipython:: python + + ser = Series([0., 1., 2., 3., 4.]) + + ser.replace(0, 5) + +You can replace a list of values by a list of other values: + +.. ipython:: python + + ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) + +You can also specify a mapping dict: + +.. ipython:: python + + ser.replace({0: 10, 1: 100}) + +For a DataFrame, you can specify individual values by column: + +.. ipython:: python + + df = DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) + + df.replace({'a': 0, 'b': 5}, 100) + +Instead of replacing with specified values, you can treat all given values as +missing and interpolate over them: + +.. ipython:: python + + ser.replace([1, 2, 3], method='pad') + +.. _missing_data.replace_expression: + +String/Regular Expression Replacement +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + Python strings prefixed with the ``r`` character such as ``r'hello world'`` + are so-called "raw" strings. They have different semantics regarding + backslashes than strings without this prefix. Backslashes in raw strings + will be interpreted as an escaped backslash, e.g., ``r'\' == '\\'``. You + should `read about them + `__ + if this is unclear. + +Replace the '.' with ``nan`` (str -> str) + +.. 
ipython:: python + :suppress: + + from numpy.random import rand, randn + from numpy import nan + from pandas import DataFrame + +.. ipython:: python + + d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(d) + df.replace('.', nan) + +Now do it with a regular expression that removes surrounding whitespace +(regex -> regex) + +.. ipython:: python + + df.replace(r'\s*\.\s*', nan, regex=True) + +Replace a few different values (list -> list) + +.. ipython:: python + + df.replace(['a', '.'], ['b', nan]) + +list of regex -> list of regex + +.. ipython:: python + + df.replace([r'\.', r'(a)'], ['dot', '\1stuff'], regex=True) + +Only search in column ``'b'`` (dict -> dict) + +.. ipython:: python + + df.replace({'b': '.'}, {'b': nan}) + +Same as the previous example, but use a regular expression for +searching instead (dict of regex -> dict) + +.. ipython:: python + + df.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True) + +You can pass nested dictionaries of regular expressions that use ``regex=True`` + +.. ipython:: python + + df.replace({'b': {'b': r''}}, regex=True) + +or you can pass the nested dictionary like so + +.. ipython:: python + + df.replace(regex={'b': {r'\s*\.\s*': nan}}) + +You can also use the group of a regular expression match when replacing (dict +of regex -> dict of regex), this works for lists as well + +.. ipython:: python + + df.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + +You can pass a list of regular expressions, of which those that match +will be replaced with a scalar (list of regex -> regex) + +.. ipython:: python + + df.replace([r'\s*\.\s*', r'a|b'], nan, regex=True) + +All of the regular expression examples can also be passed with the +``to_replace`` argument as the ``regex`` argument. In this case the ``value`` +argument must be passed explicity by name or ``regex`` must be a nested +dictionary. The previous example, in this case, would then be + +.. ipython:: python + + df.replace(regex=[r'\s*\.\s*', r'a|b'], value=nan) + +This can be convenient if you do not want to pass ``regex=True`` every time you +want to use a regular expression. + +.. note:: + + Anywhere in the above ``replace`` examples that you see a regular expression + a compiled regular expression is valid as well. + +Numeric Replacement +~~~~~~~~~~~~~~~~~~~ + +Similiar to ``DataFrame.fillna`` + +.. ipython:: python + :suppress: + + from numpy.random import rand, randn + from numpy import nan + from pandas import DataFrame + from pandas.util.testing import assert_frame_equal + +.. ipython:: python + + df = DataFrame(randn(10, 2)) + df[rand(df.shape[0]) > 0.5] = 1.5 + df.replace(1.5, nan) + +Replacing more than one value via lists works as well + +.. ipython:: python + + df00 = df.values[0, 0] + df.replace([1.5, df00], [nan, 'a']) + df[1].dtype + +You can also operate on the DataFrame in place + +.. ipython:: python + + df.replace(1.5, nan, inplace=True) + +.. warning:: + + When replacing multiple ``bool`` or ``datetime64`` objects, the first + argument to ``replace`` (``to_replace``) must match the type of the value + being replaced type. For example, + + .. code-block:: python + + s = Series([True, False, True]) + s.replace({'a string': 'new value', True: False}) # raises + + TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' + + will raise a ``TypeError`` because one of the ``dict`` keys is not of the + correct type for replacement. + + However, when replacing a *single* object such as, + + .. 
ipython:: python + + s = Series([True, False, True]) + s.replace('a string', 'another string') + + the original ``NDFrame`` object will be returned untouched. We're working on + unifying this API, but for backwards compatibility reasons we cannot break + the latter behavior. See :issue:`6354` for more details. + +Missing data casting rules and indexing +--------------------------------------- + +While pandas supports storing arrays of integer and boolean type, these types +are not capable of storing missing data. Until we can switch to using a native +NA type in NumPy, we've established some "casting rules" when reindexing will +cause missing data to be introduced into, say, a Series or DataFrame. Here they +are: + +.. csv-table:: + :header: "data type", "Cast to" + :widths: 40, 40 + + integer, float + boolean, object + float, no cast + object, no cast + +For example: + +.. ipython:: python + + s = Series(randn(5), index=[0, 2, 4, 6, 7]) + s > 0 + (s > 0).dtype + crit = (s > 0).reindex(list(range(8))) + crit + crit.dtype + +Ordinarily NumPy will complain if you try to use an object array (even if it +contains boolean values) instead of a boolean array to get or set values from +an ndarray (e.g. selecting values based on some criteria). If a boolean vector +contains NAs, an exception will be generated: + +.. ipython:: python + :okexcept: + + reindexed = s.reindex(list(range(8))).fillna(0) + reindexed[crit] + +However, these can be filled in using **fillna** and it will work fine: + +.. ipython:: python + + reindexed[crit.fillna(False)] + reindexed[crit.fillna(True)] diff --git a/doc/source/options.rst b/doc/source/options.rst new file mode 100644 index 00000000..961797ac --- /dev/null +++ b/doc/source/options.rst @@ -0,0 +1,411 @@ +.. _options: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import pandas as pd + import numpy as np + np.random.seed(123456) + +******************** +Options and Settings +******************** + +Overview +-------- +pandas has an options system that lets you customize some aspects of it's behaviour, +display-related options being those the user is most likely to adjust. + +Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``), +You can get/set options directly as attributes of the top-level ``options`` attribute: + +.. ipython:: python + + import pandas as pd + pd.options.display.max_rows + pd.options.display.max_rows = 999 + pd.options.display.max_rows + +There is also an API composed of 5 relevant functions, available directly from the ``pandas`` +namespace, and they are: + +- :func:`~pandas.get_option` / :func:`~pandas.set_option` - get/set the value of a single option. +- :func:`~pandas.reset_option` - reset one or more options to their default value. +- :func:`~pandas.describe_option` - print the descriptions of one or more options. +- :func:`~pandas.option_context` - execute a codeblock with a set of options + that revert to prior settings after execution. + +**Note:** developers can check out pandas/core/config.py for more info. + +All of the functions above accept a regexp pattern (``re.search`` style) as an argument, +and so passing in a substring will work - as long as it is unambiguous : + +.. ipython:: python + + pd.get_option("display.max_rows") + pd.set_option("display.max_rows",101) + pd.get_option("display.max_rows") + pd.set_option("max_r",102) + pd.get_option("display.max_rows") + + +The following will **not work** because it matches multiple option names, e.g. 
+``display.max_colwidth``, ``display.max_rows``, ``display.max_columns``:
+
+.. ipython:: python
+   :okexcept:
+
+   try:
+       pd.get_option("column")
+   except KeyError as e:
+       print(e)
+
+
+**Note:** Using this form of shorthand may cause your code to break if new options with similar names are added in future versions.
+
+
+You can get a list of available options and their descriptions with ``describe_option``. When called
+with no argument, ``describe_option`` will print out the descriptions for all available options.
+
+.. ipython:: python
+   :suppress:
+
+   pd.reset_option("all")
+
+Getting and Setting Options
+---------------------------
+
+As described above, ``get_option()`` and ``set_option()`` are available from the
+pandas namespace. To change an option, call ``set_option('option regex', new_value)``:
+
+.. ipython:: python
+
+   pd.get_option('mode.sim_interactive')
+   pd.set_option('mode.sim_interactive', True)
+   pd.get_option('mode.sim_interactive')
+
+All options also have a default value, and you can use ``reset_option`` to revert to it:
+
+.. ipython:: python
+   :suppress:
+
+   pd.reset_option("display.max_rows")
+
+.. ipython:: python
+
+   pd.get_option("display.max_rows")
+   pd.set_option("display.max_rows",999)
+   pd.get_option("display.max_rows")
+   pd.reset_option("display.max_rows")
+   pd.get_option("display.max_rows")
+
+
+It's also possible to reset multiple options at once (using a regex):
+
+.. ipython:: python
+
+   pd.reset_option("^display")
+
+
+The ``option_context`` context manager is exposed through
+the top-level API, allowing you to execute code with given option values. Option values
+are restored automatically when you exit the `with` block:
+
+.. ipython:: python
+
+   with pd.option_context("display.max_rows",10,"display.max_columns", 5):
+        print(pd.get_option("display.max_rows"))
+        print(pd.get_option("display.max_columns"))
+   print(pd.get_option("display.max_rows"))
+   print(pd.get_option("display.max_columns"))
+
+
+Frequently Used Options
+-----------------------
+The following is a walkthrough of the more frequently used display options.
+
+``display.max_rows`` and ``display.max_columns`` set the maximum number
+of rows and columns displayed when a frame is pretty-printed. Truncated
+lines are replaced by an ellipsis.
+
+.. ipython:: python
+
+   df=pd.DataFrame(np.random.randn(7,2))
+   pd.set_option('max_rows', 7)
+   df
+   pd.set_option('max_rows', 5)
+   df
+   pd.reset_option('max_rows')
+
+``display.expand_frame_repr`` allows the representation of wide DataFrames
+to stretch across pages, wrapped over the full set of columns rather than row-wise.
+
+.. ipython:: python
+
+   df=pd.DataFrame(np.random.randn(5,10))
+   pd.set_option('expand_frame_repr', True)
+   df
+   pd.set_option('expand_frame_repr', False)
+   df
+   pd.reset_option('expand_frame_repr')
+
+``display.large_repr`` lets you select whether to display DataFrames that exceed
+``max_columns`` or ``max_rows`` as a truncated frame, or as a summary.
+
+.. ipython:: python
+
+   df=pd.DataFrame(np.random.randn(10,10))
+   pd.set_option('max_rows', 5)
+   pd.set_option('large_repr', 'truncate')
+   df
+   pd.set_option('large_repr', 'info')
+   df
+   pd.reset_option('large_repr')
+   pd.reset_option('max_rows')
+
+``display.max_colwidth`` sets the maximum width of columns. Cells
+of this length or longer will be truncated with an ellipsis.
+
+..
ipython:: python + + df=pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'], + ['horse', 'cow', 'banana', 'apple']])) + pd.set_option('max_colwidth',40) + df + pd.set_option('max_colwidth', 6) + df + pd.reset_option('max_colwidth') + +``display.max_info_columns`` sets a threshold for when by-column info +will be given. + +.. ipython:: python + + df=pd.DataFrame(np.random.randn(10,10)) + pd.set_option('max_info_columns', 11) + df.info() + pd.set_option('max_info_columns', 5) + df.info() + pd.reset_option('max_info_columns') + +``display.max_info_rows``: ``df.info()`` will usually show null-counts for each column. +For large frames this can be quite slow. ``max_info_rows`` and ``max_info_cols`` +limit this null check only to frames with smaller dimensions then specified. + +.. ipython:: python + + df=pd.DataFrame(np.random.choice([0,1,np.nan],size=(10,10))) + df + pd.set_option('max_info_rows', 11) + df.info() + pd.set_option('max_info_rows', 5) + df.info() + pd.reset_option('max_info_rows') + +``display.precision`` sets the output display precision. This is only a +suggestion. + +.. ipython:: python + + df=pd.DataFrame(np.random.randn(5,5)) + pd.set_option('precision',7) + df + pd.set_option('precision',4) + df + +``display.chop_threshold`` sets at what level pandas rounds to zero when +it displays a Series of DataFrame. Note, this does not effect the +precision at which the number is stored. + +.. ipython:: python + + df=pd.DataFrame(np.random.randn(6,6)) + pd.set_option('chop_threshold', 0) + df + pd.set_option('chop_threshold', .5) + df + pd.reset_option('chop_threshold') + +``display.colheader_justify`` controls the justification of the headers. +Options are 'right', and 'left'. + +.. ipython:: python + + df=pd.DataFrame(np.array([np.random.randn(6), np.random.randint(1,9,6)*.1, np.zeros(6)]).T, columns=['A', 'B', 'C'], dtype='float') + pd.set_option('colheader_justify', 'right') + df + pd.set_option('colheader_justify', 'left') + df + pd.reset_option('colheader_justify') + + + +List of Options +--------------- + +========================== ============ ================================== +Option Default Function +========================== ============ ================================== +display.chop_threshold None If set to a float value, all float + values smaller then the given + threshold will be displayed as + exactly 0 by repr and friends. +display.colheader_justify right Controls the justification of + column headers. used by DataFrameFormatter. +display.column_space 12 No description available. +display.date_dayfirst False When True, prints and parses dates + with the day first, eg 20/01/2005 +display.date_yearfirst False When True, prints and parses dates + with the year first, eg 2005/01/20 +display.encoding UTF-8 Defaults to the detected encoding + of the console. Specifies the encoding + to be used for strings returned by + to_string, these are generally strings + meant to be displayed on the console. +display.expand_frame_repr True Whether to print out the full DataFrame + repr for wide DataFrames across + multiple lines, `max_columns` is + still respected, but the output will + wrap-around across multiple "pages" + if it's width exceeds `display.width`. +display.float_format None The callable should accept a floating + point number and return a string with + the desired format of the number. + This is used in some places like + SeriesFormatter. + See core.format.EngFormatter for an example. +display.height 60 Deprecated. 
Use `display.max_rows` instead. +display.large_repr truncate For DataFrames exceeding max_rows/max_cols, + the repr (and HTML repr) can show + a truncated table (the default from 0.13), + or switch to the view from df.info() + (the behaviour in earlier versions of pandas). + allowable settings, ['truncate', 'info'] +display.line_width 80 Deprecated. Use `display.width` instead. +display.max_columns 20 max_rows and max_columns are used + in __repr__() methods to decide if + to_string() or info() is used to + render an object to a string. In + case python/IPython is running in + a terminal this can be set to 0 and + pandas will correctly auto-detect + the width the terminal and swap to + a smaller format in case all columns + would not fit vertically. The IPython + notebook, IPython qtconsole, or IDLE + do not run in a terminal and hence + it is not possible to do correct + auto-detection. 'None' value means + unlimited. +display.max_colwidth 50 The maximum width in characters of + a column in the repr of a pandas + data structure. When the column overflows, + a "..." placeholder is embedded in + the output. +display.max_info_columns 100 max_info_columns is used in DataFrame.info + method to decide if per column information + will be printed. +display.max_info_rows 1690785 df.info() will usually show null-counts + for each column. For large frames + this can be quite slow. max_info_rows + and max_info_cols limit this null + check only to frames with smaller + dimensions then specified. +display.max_rows 60 This sets the maximum number of rows + pandas should output when printing + out various output. For example, + this value determines whether the + repr() for a dataframe prints out + fully or just a summary repr. + 'None' value means unlimited. +display.max_seq_items 100 when pretty-printing a long sequence, + no more then `max_seq_items` will + be printed. If items are omitted, + they will be denoted by the addition + of "..." to the resulting string. + If set to None, the number of items + to be printed is unlimited. +display.mpl_style None Setting this to 'default' will modify + the rcParams used by matplotlib + to give plots a more pleasing visual + style by default. Setting this to + None/False restores the values to + their initial value. +display.multi_sparse True "Sparsify" MultiIndex display (don't + display repeated elements in outer + levels within groups) +display.notebook_repr_html True When True, IPython notebook will + use html representation for + pandas objects (if it is available). +display.pprint_nest_depth 3 Controls the number of nested levels + to process when pretty-printing +display.precision 7 Floating point output precision + (number of significant digits). This is + only a suggestion +display.show_dimensions truncate Whether to print out dimensions + at the end of DataFrame repr. + If 'truncate' is specified, only + print out the dimensions if the + frame is truncated (e.g. not display + all rows and/or columns) +display.width 80 Width of the display in characters. + In case python/IPython is running in + a terminal this can be set to None + and pandas will correctly auto-detect + the width. Note that the IPython notebook, + IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible + to correctly detect the width. +io.excel.xls.writer xlwt The default Excel writer engine for + 'xls' files. +io.excel.xlsm.writer openpyxl The default Excel writer engine for + 'xlsm' files. Available options: + 'openpyxl' (the default). 
+io.excel.xlsx.writer openpyxl The default Excel writer engine for + 'xlsx' files. +io.hdf.default_format None default format writing format, if + None, then put will default to + 'fixed' and append will default to + 'table' +io.hdf.dropna_table True drop ALL nan rows when appending + to a table +mode.chained_assignment warn Raise an exception, warn, or no + action if trying to use chained + assignment, The default is warn +mode.sim_interactive False Whether to simulate interactive mode + for purposes of testing +mode.use_inf_as_null False True means treat None, NaN, -INF, + INF as null (old way), False means + None and NaN are null, but INF, -INF + are not null (new way). +========================== ============ ================================== + +.. _basics.console_output: + +Number Formatting +------------------ + +pandas also allow you to set how numbers are displayed in the console. +This option is not set through the ``set_options`` API. + +Use the ``set_eng_float_format`` function +to alter the floating-point formatting of pandas objects to produce a particular +format. + +For instance: + +.. ipython:: python + + import numpy as np + + pd.set_eng_float_format(accuracy=3, use_eng_prefix=True) + s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s/1.e3 + s/1.e6 + +.. ipython:: python + :suppress: + + pd.reset_option('^display\.') diff --git a/doc/source/overview.rst b/doc/source/overview.rst new file mode 100644 index 00000000..8e474663 --- /dev/null +++ b/doc/source/overview.rst @@ -0,0 +1,121 @@ +.. _overview: + +.. currentmodule:: pandas + +**************** +Package overview +**************** + +:mod:`pandas` consists of the following things + + * A set of labeled array data structures, the primary of which are + Series/TimeSeries and DataFrame + * Index objects enabling both simple axis indexing and multi-level / + hierarchical axis indexing + * An integrated group by engine for aggregating and transforming data sets + * Date range generation (date_range) and custom date offsets enabling the + implementation of customized frequencies + * Input/Output tools: loading tabular data from flat files (CSV, delimited, + Excel 2003), and saving and loading pandas objects from the fast and + efficient PyTables/HDF5 format. + * Memory-efficent "sparse" versions of the standard data structures for storing + data that is mostly missing or mostly constant (some fixed value) + * Moving window statistics (rolling mean, rolling standard deviation, etc.) + * Static and moving window linear and `panel regression + `__ + +Data structures at a glance +--------------------------- + +.. csv-table:: + :header: "Dimensions", "Name", "Description" + :widths: 15, 20, 50 + + 1, Series, "1D labeled homogeneously-typed array" + 1, TimeSeries, "Series with index containing datetimes" + 2, DataFrame, "General 2D labeled, size-mutable tabular structure with + potentially heterogeneously-typed columns" + 3, Panel, "General 3D labeled, also size-mutable array" + +Why more than 1 data structure? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The best way to think about the pandas data structures is as flexible +containers for lower dimensional data. For example, DataFrame is a container +for Series, and Panel is a container for DataFrame objects. We would like to be +able to insert and remove objects from these containers in a dictionary-like +fashion. 
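+
+For example, one can insert and delete columns by key, much as with a
+``dict`` (a minimal sketch):
+
+::
+
+    import pandas as pd
+
+    df = pd.DataFrame({'A': [1, 2, 3]})
+    df['B'] = df['A'] * 2   # insert a new column (a Series) under the key 'B'
+    del df['A']             # remove an existing column by key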
+ +Also, we would like sensible default behaviors for the common API functions +which take into account the typical orientation of time series and +cross-sectional data sets. When using ndarrays to store 2- and 3-dimensional +data, a burden is placed on the user to consider the orientation of the data +set when writing functions; axes are considered more or less equivalent (except +when C- or Fortran-contiguousness matters for performance). In pandas, the axes +are intended to lend more semantic meaning to the data; i.e., for a particular +data set there is likely to be a "right" way to orient the data. The goal, +then, is to reduce the amount of mental effort required to code up data +transformations in downstream functions. + +For example, with tabular data (DataFrame) it is more semantically helpful to +think of the **index** (the rows) and the **columns** rather than axis 0 and +axis 1. And iterating through the columns of the DataFrame thus results in more +readable code: + +:: + + for col in df.columns: + series = df[col] + # do something with series + +Mutability and copying of data +------------------------------ + +All pandas data structures are value-mutable (the values they contain can be +altered) but not always size-mutable. The length of a Series cannot be +changed, but, for example, columns can be inserted into a DataFrame. However, +the vast majority of methods produce new objects and leave the input data +untouched. In general, though, we like to **favor immutability** where +sensible. + +Getting Support +--------------- + +The first stop for pandas issues and ideas is the `Github Issue Tracker +`__. If you have a general question, +pandas community experts can answer through `Stack Overflow +`__. + +Longer discussions occur on the `developer mailing list +`__, and commercial support +inquiries for Lambda Foundry should be sent to: support@lambdafoundry.com + +Credits +------- + +pandas development began at `AQR Capital Management `__ in +April 2008. It was open-sourced at the end of 2009. AQR continued to provide +resources for development through the end of 2011, and continues to contribute +bug reports today. + +Since January 2012, `Lambda Foundry `__, has +been providing development resources, as well as commercial support, +training, and consulting for pandas. + +pandas is only made possible by a group of people around the world like you +who have contributed new code, bug reports, fixes, comments and ideas. A +complete list can be found `on Github `__. + +Development Team +---------------- + +pandas is a part of the PyData project. The PyData Development Team is a +collection of developers focused on the improvement of Python's data +libraries. The core team that coordinates development can be found on `Github +`__. If you're interested in contributing, please +visit the `project website `__. + +License +------- + +.. literalinclude:: ../../LICENSE diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst new file mode 100644 index 00000000..98fc4edf --- /dev/null +++ b/doc/source/r_interface.rst @@ -0,0 +1,110 @@ +.. currentmodule:: pandas.rpy + +.. _rpy: + +.. ipython:: python + :suppress: + + from pandas import * + options.display.max_rows=15 + + +****************** +rpy2 / R interface +****************** + +.. note:: + + This is all highly experimental. 
I would like to get more people involved + with building a nice RPy2 interface for pandas + + +If your computer has R and rpy2 (> 2.2) installed (which will be left to the +reader), you will be able to leverage the below functionality. On Windows, +doing this is quite an ordeal at the moment, but users on Unix-like systems +should find it quite easy. rpy2 evolves in time, and is currently reaching +its release 2.3, while the current interface is +designed for the 2.2.x series. We recommend to use 2.2.x over other series +unless you are prepared to fix parts of the code, yet the rpy2-2.3.0 +introduces improvements such as a better R-Python bridge memory management +layer so it might be a good idea to bite the bullet and submit patches for +the few minor differences that need to be fixed. + + +:: + + # if installing for the first time + hg clone http://bitbucket.org/lgautier/rpy2 + + cd rpy2 + hg pull + hg update version_2.2.x + sudo python setup.py install + +.. note:: + + To use R packages with this interface, you will need to install + them inside R yourself. At the moment it cannot install them for + you. + +Once you have done installed R and rpy2, you should be able to import +``pandas.rpy.common`` without a hitch. + +Transferring R data sets into Python +------------------------------------ + +The **load_data** function retrieves an R data set and converts it to the +appropriate pandas object (most likely a DataFrame): + + +.. ipython:: python + + import pandas.rpy.common as com + infert = com.load_data('infert') + + infert.head() + + +Converting DataFrames into R objects +------------------------------------ + +.. versionadded:: 0.8 + +Starting from pandas 0.8, there is **experimental** support to convert +DataFrames into the equivalent R object (that is, **data.frame**): + +.. ipython:: python + + from pandas import DataFrame + + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, + index=["one", "two", "three"]) + r_dataframe = com.convert_to_r_dataframe(df) + + print(type(r_dataframe)) + print(r_dataframe) + + +The DataFrame's index is stored as the ``rownames`` attribute of the +data.frame instance. + +You can also use **convert_to_r_matrix** to obtain a ``Matrix`` instance, but +bear in mind that it will only work with homogeneously-typed DataFrames (as +R matrices bear no information on the data type): + + +.. ipython:: python + + r_matrix = com.convert_to_r_matrix(df) + + print(type(r_matrix)) + print(r_matrix) + + +Calling R functions with pandas objects +--------------------------------------- + + + +High-level interface to R estimators +------------------------------------ diff --git a/doc/source/release.rst b/doc/source/release.rst new file mode 100644 index 00000000..fb06dc4d --- /dev/null +++ b/doc/source/release.rst @@ -0,0 +1,4245 @@ +.. _release: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import os + import csv + from pandas.compat import StringIO + import pandas as pd + ExcelWriter = pd.ExcelWriter + + import numpy as np + np.random.seed(123456) + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + + import matplotlib.pyplot as plt + plt.close('all') + + from pandas import * + options.display.max_rows=15 + import pandas.util.testing as tm + +************* +Release Notes +************* + +This is the list of changes to pandas between each release. 
For full details, +see the commit logs at http://github.com/pydata/pandas + +**What is it** + +pandas is a Python package providing fast, flexible, and expressive data +structures designed to make working with “relational” or “labeled” data both +easy and intuitive. It aims to be the fundamental high-level building block for +doing practical, real world data analysis in Python. Additionally, it has the +broader goal of becoming the most powerful and flexible open source data +analysis / manipulation tool available in any language. + +**Where to get it** + +* Source code: http://github.com/pydata/pandas +* Binary installers on PyPI: http://pypi.python.org/pypi/pandas +* Documentation: http://pandas.pydata.org + +pandas 0.14.1 +------------- + +**Release date:** (July 11, 2014) + +This is a minor release from 0.14.0 and includes a small number of API changes, several new features, enhancements, and +performance improvements along with a large number of bug fixes. + +Highlights include: + +- New methods :meth:`~pandas.DataFrame.select_dtypes` to select columns + based on the dtype and :meth:`~pandas.Series.sem` to calculate the + standard error of the mean. +- Support for dateutil timezones (see :ref:`docs `). +- Support for ignoring full line comments in the :func:`~pandas.read_csv` + text parser. +- New documentation section on :ref:`Options and Settings `. +- Lots of bug fixes. + +See the :ref:`v0.14.1 Whatsnew ` overview or the issue tracker on GitHub for an extensive list +of all API changes, enhancements and bugs that have been fixed in 0.14.1. + +Thanks +~~~~~~ + +- Andrew Rosenfeld +- Andy Hayden +- Benjamin Adams +- Benjamin M. Gross +- Brian Quistorff +- Brian Wignall +- bwignall +- clham +- Daniel Waeber +- David Bew +- David Stephens +- DSM +- dsm054 +- helger +- immerrr +- Jacob Schaer +- jaimefrio +- Jan Schulz +- John David Reaver +- John W. O'Brien +- Joris Van den Bossche +- jreback +- Julien Danjou +- Kevin Sheppard +- K.-Michael Aye +- Kyle Meyer +- lexual +- Matthew Brett +- Matt Wittmann +- Michael Mueller +- Mortada Mehyar +- onesandzeroes +- Phillip Cloud +- Rob Levy +- rockg +- sanguineturtle +- Schaer, Jacob C +- seth-p +- sinhrks +- Stephan Hoyer +- Thomas Kluyver +- Todd Jennings +- TomAugspurger +- unknown +- yelite + +pandas 0.14.0 +------------- + +**Release date:** (May 31, 2014) + +This is a major release from 0.13.1 and includes a number of API changes, several new features, enhancements, and +performance improvements along with a large number of bug fixes. + +Highlights include: + +- Officially support Python 3.4 +- SQL interfaces updated to use ``sqlalchemy``, see :ref:`here`. +- Display interface changes, see :ref:`here` +- MultiIndexing using Slicers, see :ref:`here`. +- Ability to join a singly-indexed DataFrame with a multi-indexed DataFrame, see :ref:`here ` +- More consistency in groupby results and more flexible groupby specifications, see :ref:`here` +- Holiday calendars are now supported in ``CustomBusinessDay``, see :ref:`here ` +- Several improvements in plotting functions, including: hexbin, area and pie plots, see :ref:`here`. +- Performance doc section on I/O operations, see :ref:`here ` + +See the :ref:`v0.14.0 Whatsnew ` overview or the issue tracker on GitHub for an extensive list +of all API changes, enhancements and bugs that have been fixed in 0.14.0. 
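+
+As a brief sketch of the MultiIndex slicer highlight above (the frame and its
+labels are illustrative only, not taken from the release notes):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   midx = pd.MultiIndex.from_product([['one', 'two'], ['a', 'b']],
+                                     names=['first', 'second'])
+   df = pd.DataFrame(np.arange(8).reshape(4, 2),
+                     index=midx, columns=['x', 'y'])
+
+   # select rows labelled 'b' in the second level, across the whole first level
+   df.loc[(slice(None), 'b'), :]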
+ +Thanks +~~~~~~ + +- Acanthostega +- Adam Marcus +- agijsberts +- akittredge +- Alex Gaudio +- Alex Rothberg +- AllenDowney +- Andrew Rosenfeld +- Andy Hayden +- ankostis +- anomrake +- Antoine Mazières +- anton-d +- bashtage +- Benedikt Sauer +- benjamin +- Brad Buran +- bwignall +- cgohlke +- chebee7i +- Christopher Whelan +- Clark Fitzgerald +- clham +- Dale Jung +- Dan Allan +- Dan Birken +- danielballan +- Daniel Waeber +- David Jung +- David Stephens +- Douglas McNeil +- DSM +- Garrett Drapala +- Gouthaman Balaraman +- Guillaume Poulin +- hshimizu77 +- hugo +- immerrr +- ischwabacher +- Jacob Howard +- Jacob Schaer +- jaimefrio +- Jason Sexauer +- Jeff Reback +- Jeffrey Starr +- Jeff Tratner +- John David Reaver +- John McNamara +- John W. O'Brien +- Jonathan Chambers +- Joris Van den Bossche +- jreback +- jsexauer +- Julia Evans +- Júlio +- Katie Atkinson +- kdiether +- Kelsey Jordahl +- Kevin Sheppard +- K.-Michael Aye +- Matthias Kuhn +- Matt Wittmann +- Max Grender-Jones +- Michael E. Gruen +- michaelws +- mikebailey +- Mike Kelly +- Nipun Batra +- Noah Spies +- ojdo +- onesandzeroes +- Patrick O'Keeffe +- phaebz +- Phillip Cloud +- Pietro Battiston +- PKEuS +- Randy Carnevale +- ribonoous +- Robert Gibboni +- rockg +- sinhrks +- Skipper Seabold +- SplashDance +- Stephan Hoyer +- Tim Cera +- Tobias Brandt +- Todd Jennings +- TomAugspurger +- Tom Augspurger +- unutbu +- westurner +- Yaroslav Halchenko +- y-p +- zach powers + +pandas 0.13.1 +------------- + +**Release date:** (February 3, 2014) + +New Features +~~~~~~~~~~~~ + +- Added ``date_format`` and ``datetime_format`` attribute to ``ExcelWriter``. + (:issue:`4133`) + +API Changes +~~~~~~~~~~~ + +- ``Series.sort`` will raise a ``ValueError`` (rather than a ``TypeError``) on sorting an + object that is a view of another (:issue:`5856`, :issue:`5853`) +- Raise/Warn ``SettingWithCopyError`` (according to the option ``chained_assignment`` in more cases, + when detecting chained assignment, related (:issue:`5938`, :issue:`6025`) +- DataFrame.head(0) returns self instead of empty frame (:issue:`5846`) +- ``autocorrelation_plot`` now accepts ``**kwargs``. (:issue:`5623`) +- ``convert_objects`` now accepts a ``convert_timedeltas='coerce'`` argument to allow forced dtype conversion of + timedeltas (:issue:`5458`,:issue:`5689`) +- Add ``-NaN`` and ``-nan`` to the default set of NA values + (:issue:`5952`). See :ref:`NA Values `. +- ``NDFrame`` now has an ``equals`` method. (:issue:`5283`) +- ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a + ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is + empty (:issue:`6007`). + +Experimental Features +~~~~~~~~~~~~~~~~~~~~~ + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- perf improvements in Series datetime/timedelta binary operations (:issue:`5801`) +- `option_context` context manager now available as top-level API (:issue:`5752`) +- df.info() view now display dtype info per column (:issue:`5682`) +- df.info() now honors option max_info_rows, disable null counts for large frames (:issue:`5974`) +- perf improvements in DataFrame ``count/dropna`` for ``axis=1`` +- Series.str.contains now has a `regex=False` keyword which can be faster for plain (non-regex) string patterns. 
(:issue:`5879`) +- support ``dtypes`` property on ``Series/Panel/Panel4D`` +- extend ``Panel.apply`` to allow arbitrary functions (rather than only ufuncs) (:issue:`1148`) + allow multiple axes to be used to operate on slabs of a ``Panel`` +- The ``ArrayFormatter`` for ``datetime`` and ``timedelta64`` now intelligently + limit precision based on the values in the array (:issue:`3401`) +- ``pd.show_versions()`` is now available for convenience when reporting issues. +- perf improvements to Series.str.extract (:issue:`5944`) +- perf improvments in ``dtypes/ftypes`` methods (:issue:`5968`) +- perf improvments in indexing with object dtypes (:issue:`5968`) +- improved dtype inference for ``timedelta`` like passed to constructors (:issue:`5458`, :issue:`5689`) +- escape special characters when writing to latex (:issue: `5374`) +- perf improvements in ``DataFrame.apply`` (:issue:`6013`) +- ``pd.read_csv`` and ``pd.to_datetime`` learned a new ``infer_datetime_format`` keyword which greatly + improves parsing perf in many cases. Thanks to @lexual for suggesting and @danbirken + for rapidly implementing. (:issue:`5490`,:issue:`6021`) +- add ability to recognize '%p' format code (am/pm) to date parsers when the specific format + is supplied (:issue:`5361`) +- Fix performance regression in JSON IO (:issue:`5765`) +- performance regression in Index construction from Series (:issue:`6150`) + +.. _release.bug_fixes-0.13.1: + +Bug Fixes +~~~~~~~~~ + +- Bug in ``io.wb.get_countries`` not including all countries (:issue:`6008`) +- Bug in Series replace with timestamp dict (:issue:`5797`) +- read_csv/read_table now respects the `prefix` kwarg (:issue:`5732`). +- Bug in selection with missing values via ``.ix`` from a duplicate indexed DataFrame failing (:issue:`5835`) +- Fix issue of boolean comparison on empty DataFrames (:issue:`5808`) +- Bug in isnull handling ``NaT`` in an object array (:issue:`5443`) +- Bug in ``to_datetime`` when passed a ``np.nan`` or integer datelike and a format string (:issue:`5863`) +- Bug in groupby dtype conversion with datetimelike (:issue:`5869`) +- Regression in handling of empty Series as indexers to Series (:issue:`5877`) +- Bug in internal caching, related to (:issue:`5727`) +- Testing bug in reading json/msgpack from a non-filepath on windows under py3 (:issue:`5874`) +- Bug when assigning to .ix[tuple(...)] (:issue:`5896`) +- Bug in fully reindexing a Panel (:issue:`5905`) +- Bug in idxmin/max with object dtypes (:issue:`5914`) +- Bug in ``BusinessDay`` when adding n days to a date not on offset when n>5 and n%5==0 (:issue:`5890`) +- Bug in assigning to chained series with a series via ix (:issue:`5928`) +- Bug in creating an empty DataFrame, copying, then assigning (:issue:`5932`) +- Bug in DataFrame.tail with empty frame (:issue:`5846`) +- Bug in propogating metadata on ``resample`` (:issue:`5862`) +- Fixed string-representation of ``NaT`` to be "NaT" (:issue:`5708`) +- Fixed string-representation for Timestamp to show nanoseconds if present (:issue:`5912`) +- ``pd.match`` not returning passed sentinel +- ``Panel.to_frame()`` no longer fails when ``major_axis`` is a + ``MultiIndex`` (:issue:`5402`). 
+- Bug in ``pd.read_msgpack`` with inferring a ``DateTimeIndex`` frequency + incorrectly (:issue:`5947`) +- Fixed ``to_datetime`` for array with both Tz-aware datetimes and ``NaT``'s (:issue:`5961`) +- Bug in rolling skew/kurtosis when passed a Series with bad data (:issue:`5749`) +- Bug in scipy ``interpolate`` methods with a datetime index (:issue:`5975`) +- Bug in NaT comparison if a mixed datetime/np.datetime64 with NaT were passed (:issue:`5968`) +- Fixed bug with ``pd.concat`` losing dtype information if all inputs are empty (:issue:`5742`) +- Recent changes in IPython cause warnings to be emitted when using previous versions + of pandas in QTConsole, now fixed. If you're using an older version and + need to suppress the warnings, see (:issue:`5922`). +- Bug in merging ``timedelta`` dtypes (:issue:`5695`) +- Bug in plotting.scatter_matrix function. Wrong alignment among diagonal + and off-diagonal plots, see (:issue:`5497`). +- Regression in Series with a multi-index via ix (:issue:`6018`) +- Bug in Series.xs with a multi-index (:issue:`6018`) +- Bug in Series construction of mixed type with datelike and an integer (which should result in + object type and not automatic conversion) (:issue:`6028`) +- Possible segfault when chained indexing with an object array under numpy 1.7.1 (:issue:`6026`, :issue:`6056`) +- Bug in setting using fancy indexing a single element with a non-scalar (e.g. a list), + (:issue:`6043`) +- ``to_sql`` did not respect ``if_exists`` (:issue:`4110` :issue:`4304`) +- Regression in ``.get(None)`` indexing from 0.12 (:issue:`5652`) +- Subtle ``iloc`` indexing bug, surfaced in (:issue:`6059`) +- Bug with insert of strings into DatetimeIndex (:issue:`5818`) +- Fixed unicode bug in to_html/HTML repr (:issue:`6098`) +- Fixed missing arg validation in get_options_data (:issue:`6105`) +- Bug in assignment with duplicate columns in a frame where the locations + are a slice (e.g. next to each other) (:issue:`6120`) +- Bug in propogating _ref_locs during construction of a DataFrame with dups + index/columns (:issue:`6121`) +- Bug in ``DataFrame.apply`` when using mixed datelike reductions (:issue:`6125`) +- Bug in ``DataFrame.append`` when appending a row with different columns (:issue:`6129`) +- Bug in DataFrame construction with recarray and non-ns datetime dtype (:issue:`6140`) +- Bug in ``.loc`` setitem indexing with a dataframe on rhs, multiple item setting, and + a datetimelike (:issue:`6152`) +- Fixed a bug in ``query``/``eval`` during lexicographic string comparisons (:issue:`6155`). +- Fixed a bug in ``query`` where the index of a single-element ``Series`` was + being thrown away (:issue:`6148`). 
+- Bug in ``HDFStore`` on appending a dataframe with multi-indexed columns to + an existing table (:issue:`6167`) +- Consistency with dtypes in setting an empty DataFrame (:issue:`6171`) +- Bug in selecting on a multi-index ``HDFStore`` even in the presence of under + specified column spec (:issue:`6169`) +- Bug in ``nanops.var`` with ``ddof=1`` and 1 elements would sometimes return ``inf`` + rather than ``nan`` on some platforms (:issue:`6136`) +- Bug in Series and DataFrame bar plots ignoring the ``use_index`` keyword (:issue:`6209`) +- Bug in groupby with mixed str/int under python3 fixed; ``argsort`` was failing (:issue:`6212`) + +pandas 0.13.0 +------------- + +**Release date:** January 3, 2014 + +New Features +~~~~~~~~~~~~ + +- ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and + ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set + the bandwidth, and to gkde.evaluate() to specify the indicies at which it + is evaluated, respectively. See scipy docs. (:issue:`4298`) +- Added ``isin`` method to DataFrame (:issue:`4211`) +- ``df.to_clipboard()`` learned a new ``excel`` keyword that let's you + paste df data directly into excel (enabled by default). (:issue:`5070`). +- Clipboard functionality now works with PySide (:issue:`4282`) +- New ``extract`` string method returns regex matches more conveniently + (:issue:`4685`) +- Auto-detect field widths in read_fwf when unspecified (:issue:`4488`) +- ``to_csv()`` now outputs datetime objects according to a specified format + string via the ``date_format`` keyword (:issue:`4313`) +- Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`) +- Added ``cumcount`` groupby method (:issue:`4646`) +- Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`) +- Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the + statistical mode(s) of a column/series. (:issue:`5367`) + +Experimental Features +~~~~~~~~~~~~~~~~~~~~~ + +- The new :func:`~pandas.eval` function implements expression evaluation + using ``numexpr`` behind the scenes. This results in large speedups for + complicated expressions involving large DataFrames/Series. +- :class:`~pandas.DataFrame` has a new :meth:`~pandas.DataFrame.eval` that + evaluates an expression in the context of the ``DataFrame``; allows + inline expression assignment +- A :meth:`~pandas.DataFrame.query` method has been added that allows + you to select elements of a ``DataFrame`` using a natural query syntax + nearly identical to Python syntax. +- ``pd.eval`` and friends now evaluate operations involving ``datetime64`` + objects in Python space because ``numexpr`` cannot handle ``NaT`` values + (:issue:`4897`). +- Add msgpack support via ``pd.read_msgpack()`` and ``pd.to_msgpack()`` / + ``df.to_msgpack()`` for serialization of arbitrary pandas (and python + objects) in a lightweight portable binary format (:issue:`686`, :issue:`5506`) +- Added PySide support for the qtpandas DataFrameModel and DataFrameWidget. +- Added :mod:`pandas.io.gbq` for reading from (and writing to) Google + BigQuery into a DataFrame. (:issue:`4140`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``read_html`` now raises a ``URLError`` instead of catching and raising a + ``ValueError`` (:issue:`4303`, :issue:`4305`) +- ``read_excel`` now supports an integer in its ``sheetname`` argument giving + the index of the sheet to read in (:issue:`4301`). 
+- ``get_dummies`` works with NaN (:issue:`4446`) +- Added a test for ``read_clipboard()`` and ``to_clipboard()`` + (:issue:`4282`) +- Added bins argument to ``value_counts`` (:issue:`3945`), also sort and + ascending, now available in Series method as well as top-level function. +- Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf", + "iNf", etc.) to infinity. (:issue:`4220`, :issue:`4219`), affecting + ``read_table``, ``read_csv``, etc. +- Added a more informative error message when plot arguments contain + overlapping color and style arguments (:issue:`4402`) +- Significant table writing performance improvements in ``HDFStore`` +- JSON date serialization now performed in low-level C code. +- JSON support for encoding datetime.time +- Expanded JSON docs, more info about orient options and the use of the numpy + param when decoding. +- Add ``drop_level`` argument to xs (:issue:`4180`) +- Can now resample a DataFrame with ohlc (:issue:`2320`) +- ``Index.copy()`` and ``MultiIndex.copy()`` now accept keyword arguments to + change attributes (i.e., ``names``, ``levels``, ``labels``) + (:issue:`4039`) +- Add ``rename`` and ``set_names`` methods to ``Index`` as well as + ``set_names``, ``set_levels``, ``set_labels`` to ``MultiIndex``. + (:issue:`4039`) with improved validation for all (:issue:`4039`, + :issue:`4794`) +- A Series of dtype ``timedelta64[ns]`` can now be divided/multiplied + by an integer series (:issue:`4521`) +- A Series of dtype ``timedelta64[ns]`` can now be divided by another + ``timedelta64[ns]`` object to yield a ``float64`` dtyped Series. This + is frequency conversion; astyping is also supported. +- Timedelta64 support ``fillna/ffill/bfill`` with an integer interpreted as + seconds, or a ``timedelta`` (:issue:`3371`) +- Box numeric ops on ``timedelta`` Series (:issue:`4984`) +- Datetime64 support ``ffill/bfill`` +- Performance improvements with ``__getitem__`` on ``DataFrames`` with + when the key is a column +- Support for using a ``DatetimeIndex/PeriodsIndex`` directly in a datelike + calculation e.g. s-s.index (:issue:`4629`) +- Better/cleaned up exceptions in core/common, io/excel and core/format + (:issue:`4721`, :issue:`3954`), as well as cleaned up test cases in + tests/test_frame, tests/test_multilevel (:issue:`4732`). +- Performance improvement of timeseries plotting with PeriodIndex and added + test to vbench (:issue:`4705` and :issue:`4722`) +- Add ``axis`` and ``level`` keywords to ``where``, so that the ``other`` + argument can now be an alignable pandas object. +- ``to_datetime`` with a format of '%Y%m%d' now parses much faster +- It's now easier to hook new Excel writers into pandas (just subclass + ``ExcelWriter`` and register your engine). You can specify an ``engine`` in + ``to_excel`` or in ``ExcelWriter``. You can also specify which writers you + want to use by default with config options ``io.excel.xlsx.writer`` and + ``io.excel.xls.writer``. (:issue:`4745`, :issue:`4750`) +- ``Panel.to_excel()`` now accepts keyword arguments that will be passed to + its ``DataFrame``'s ``to_excel()`` methods. (:issue:`4750`) +- Added XlsxWriter as an optional ``ExcelWriter`` engine. This is about 5x + faster than the default openpyxl xlsx writer and is equivalent in speed + to the xlwt xls writer module. (:issue:`4542`) +- allow DataFrame constructor to accept more list-like objects, e.g. 
list of + ``collections.Sequence`` and ``array.Array`` objects (:issue:`3783`, + :issue:`4297`, :issue:`4851`), thanks @lgautier +- DataFrame constructor now accepts a numpy masked record array + (:issue:`3478`), thanks @jnothman +- ``__getitem__`` with ``tuple`` key (e.g., ``[:, 2]``) on ``Series`` + without ``MultiIndex`` raises ``ValueError`` (:issue:`4759`, :issue:`4837`) +- ``read_json`` now raises a (more informative) ``ValueError`` when the dict + contains a bad key and ``orient='split'`` (:issue:`4730`, :issue:`4838`) +- ``read_stata`` now accepts Stata 13 format (:issue:`4291`) +- ``ExcelWriter`` and ``ExcelFile`` can be used as contextmanagers. + (:issue:`3441`, :issue:`4933`) +- ``pandas`` is now tested with two different versions of ``statsmodels`` + (0.4.3 and 0.5.0) (:issue:`4981`). +- Better string representations of ``MultiIndex`` (including ability to + roundtrip via ``repr``). (:issue:`3347`, :issue:`4935`) +- Both ExcelFile and read_excel to accept an xlrd.Book for the io + (formerly path_or_buf) argument; this requires engine to be set. + (:issue:`4961`). +- ``concat`` now gives a more informative error message when passed objects + that cannot be concatenated (:issue:`4608`). +- Add ``halflife`` option to exponentially weighted moving functions (PR + :issue:`4998`) +- ``to_dict`` now takes ``records`` as a possible outtype. Returns an array + of column-keyed dictionaries. (:issue:`4936`) +- ``tz_localize`` can infer a fall daylight savings transition based on the + structure of unlocalized data (:issue:`4230`) +- DatetimeIndex is now in the API documentation +- Improve support for converting R datasets to pandas objects (more + informative index for timeseries and numeric, support for factors, dist, + and high-dimensional arrays). +- :func:`~pandas.read_html` now supports the ``parse_dates``, + ``tupleize_cols`` and ``thousands`` parameters (:issue:`4770`). +- :meth:`~pandas.io.json.json_normalize` is a new method to allow you to + create a flat table from semi-structured JSON data. :ref:`See the + docs` (:issue:`1067`) +- ``DataFrame.from_records()`` will now accept generators (:issue:`4910`) +- ``DataFrame.interpolate()`` and ``Series.interpolate()`` have been expanded + to include interpolation methods from scipy. (:issue:`4434`, :issue:`1892`) +- ``Series`` now supports a ``to_frame`` method to convert it to a + single-column DataFrame (:issue:`5164`) +- DatetimeIndex (and date_range) can now be constructed in a left- or + right-open fashion using the ``closed`` parameter (:issue:`4579`) +- Python csv parser now supports usecols (:issue:`4335`) +- Added support for Google Analytics v3 API segment IDs that also supports v2 + IDs. (:issue:`5271`) +- ``NDFrame.drop()`` now accepts names as well as integers for the axis + argument. (:issue:`5354`) +- Added short docstrings to a few methods that were missing them + fixed the + docstrings for Panel flex methods. (:issue:`5336`) +- ``NDFrame.drop()``, ``NDFrame.dropna()``, and ``.drop_duplicates()`` all + accept ``inplace`` as a keyword argument; however, this only means that the + wrapper is updated inplace, a copy is still made internally. + (:issue:`1960`, :issue:`5247`, :issue:`5628`, and related :issue:`2325` [still not + closed]) +- Fixed bug in `tools.plotting.andrews_curvres` so that lines are drawn grouped + by color as expected. +- ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int + by default. 
(:issue:`5394`) +- Excel writers now have a default option ``merge_cells`` in ``to_excel()`` + to merge cells in MultiIndex and Hierarchical Rows. Note: using this + option it is no longer possible to round trip Excel files with merged + MultiIndex and Hierarchical Rows. Set the ``merge_cells`` to ``False`` to + restore the previous behaviour. (:issue:`5254`) +- The FRED DataReader now accepts multiple series (:issue`3413`) +- StataWriter adjusts variable names to Stata's limitations (:issue:`5709`) + +API Changes +~~~~~~~~~~~ + +- ``DataFrame.reindex()`` and forward/backward filling now raises ValueError + if either index is not monotonic (:issue:`4483`, :issue:`4484`). +- ``pandas`` now is Python 2/3 compatible without the need for 2to3 thanks to + @jtratner. As a result, pandas now uses iterators more extensively. This + also led to the introduction of substantive parts of the Benjamin + Peterson's ``six`` library into compat. (:issue:`4384`, :issue:`4375`, + :issue:`4372`) +- ``pandas.util.compat`` and ``pandas.util.py3compat`` have been merged into + ``pandas.compat``. ``pandas.compat`` now includes many functions allowing + 2/3 compatibility. It contains both list and iterator versions of range, + filter, map and zip, plus other necessary elements for Python 3 + compatibility. ``lmap``, ``lzip``, ``lrange`` and ``lfilter`` all produce + lists instead of iterators, for compatibility with ``numpy``, subscripting + and ``pandas`` constructors.(:issue:`4384`, :issue:`4375`, :issue:`4372`) +- deprecated ``iterkv``, which will be removed in a future release (was just + an alias of iteritems used to get around ``2to3``'s changes). + (:issue:`4384`, :issue:`4375`, :issue:`4372`) +- ``Series.get`` with negative indexers now returns the same as ``[]`` + (:issue:`4390`) +- allow ``ix/loc`` for Series/DataFrame/Panel to set on any axis even when + the single-key is not currently contained in the index for that axis + (:issue:`2578`, :issue:`5226`, :issue:`5632`, :issue:`5720`, + :issue:`5744`, :issue:`5756`) +- Default export for ``to_clipboard`` is now csv with a sep of `\t` for + compat (:issue:`3368`) +- ``at`` now will enlarge the object inplace (and return the same) + (:issue:`2578`) +- ``DataFrame.plot`` will scatter plot x versus y by passing + ``kind='scatter'`` (:issue:`2215`) + +- ``HDFStore`` + + - ``append_to_multiple`` automatically synchronizes writing rows to multiple + tables and adds a ``dropna`` kwarg (:issue:`4698`) + - handle a passed ``Series`` in table format (:issue:`4330`) + - added an ``is_open`` property to indicate if the underlying file handle + is_open; a closed store will now report 'CLOSED' when viewing the store + (rather than raising an error) (:issue:`4409`) + - a close of a ``HDFStore`` now will close that instance of the + ``HDFStore`` but will only close the actual file if the ref count (by + ``PyTables``) w.r.t. all of the open handles are 0. Essentially you have + a local instance of ``HDFStore`` referenced by a variable. Once you close + it, it will report closed. Other references (to the same file) will + continue to operate until they themselves are closed. Performing an + action on a closed file will raise ``ClosedFileError`` + - removed the ``_quiet`` attribute, replace by a ``DuplicateWarning`` if + retrieving duplicate rows from a table (:issue:`4367`) + - removed the ``warn`` argument from ``open``. 
Instead a + ``PossibleDataLossError`` exception will be raised if you try to use + ``mode='w'`` with an OPEN file handle (:issue:`4367`) + - allow a passed locations array or mask as a ``where`` condition + (:issue:`4467`) + - add the keyword ``dropna=True`` to ``append`` to change whether ALL nan + rows are not written to the store (default is ``True``, ALL nan rows are + NOT written), also settable via the option ``io.hdf.dropna_table`` + (:issue:`4625`) + - the ``format`` keyword now replaces the ``table`` keyword; allowed values + are ``fixed(f)|table(t)`` the ``Storer`` format has been renamed to + ``Fixed`` + - a column multi-index will be recreated properly (:issue:`4710`); raise on + trying to use a multi-index with data_columns on the same axis + - ``select_as_coordinates`` will now return an ``Int64Index`` of the + resultant selection set + - support ``timedelta64[ns]`` as a serialization type (:issue:`3577`) + - store `datetime.date` objects as ordinals rather then timetuples to avoid + timezone issues (:issue:`2852`), thanks @tavistmorph and @numpand + - ``numexpr`` 2.2.2 fixes incompatiblity in PyTables 2.4 (:issue:`4908`) + - ``flush`` now accepts an ``fsync`` parameter, which defaults to ``False`` + (:issue:`5364`) + - ``unicode`` indices not supported on ``table`` formats (:issue:`5386`) + - pass thru store creation arguments; can be used to support in-memory stores +- ``JSON`` + + - added ``date_unit`` parameter to specify resolution of timestamps. + Options are seconds, milliseconds, microseconds and nanoseconds. + (:issue:`4362`, :issue:`4498`). + - added ``default_handler`` parameter to allow a callable to be passed + which will be responsible for handling otherwise unserialisable objects. + (:issue:`5138`) + +- ``Index`` and ``MultiIndex`` changes (:issue:`4039`): + + - Setting ``levels`` and ``labels`` directly on ``MultiIndex`` is now + deprecated. Instead, you can use the ``set_levels()`` and + ``set_labels()`` methods. + - ``levels``, ``labels`` and ``names`` properties no longer return lists, + but instead return containers that do not allow setting of items + ('mostly immutable') + - ``levels``, ``labels`` and ``names`` are validated upon setting and are + either copied or shallow-copied. + - inplace setting of ``levels`` or ``labels`` now correctly invalidates the + cached properties. (:issue:`5238`). + - ``__deepcopy__`` now returns a shallow copy (currently: a view) of the + data - allowing metadata changes. + - ``MultiIndex.astype()`` now only allows ``np.object_``-like dtypes and + now returns a ``MultiIndex`` rather than an ``Index``. (:issue:`4039`) + - Added ``is_`` method to ``Index`` that allows fast equality comparison of + views (similar to ``np.may_share_memory`` but no false positives, and + changes on ``levels`` and ``labels`` setting on ``MultiIndex``). + (:issue:`4859` , :issue:`4909`) + - Aliased ``__iadd__`` to ``__add__``. (:issue:`4996`) + - Added ``is_`` method to ``Index`` that allows fast equality comparison of + views (similar to ``np.may_share_memory`` but no false positives, and + changes on ``levels`` and ``labels`` setting on ``MultiIndex``). + (:issue:`4859`, :issue:`4909`) + +- Infer and downcast dtype if ``downcast='infer'`` is passed to + ``fillna/ffill/bfill`` (:issue:`4604`) +- ``__nonzero__`` for all NDFrame objects, will now raise a ``ValueError``, + this reverts back to (:issue:`1073`, :issue:`4633`) behavior. 
Add + ``.bool()`` method to ``NDFrame`` objects to facilitate evaluating of + single-element boolean Series +- ``DataFrame.update()`` no longer raises a ``DataConflictError``, it now + will raise a ``ValueError`` instead (if necessary) (:issue:`4732`) +- ``Series.isin()`` and ``DataFrame.isin()`` now raise a ``TypeError`` when + passed a string (:issue:`4763`). Pass a ``list`` of one element (containing + the string) instead. +- Remove undocumented/unused ``kind`` keyword argument from ``read_excel``, + and ``ExcelFile``. (:issue:`4713`, :issue:`4712`) +- The ``method`` argument of ``NDFrame.replace()`` is valid again, so that a + a list can be passed to ``to_replace`` (:issue:`4743`). +- provide automatic dtype conversions on _reduce operations (:issue:`3371`) +- exclude non-numerics if mixed types with datelike in _reduce operations + (:issue:`3371`) +- default for ``tupleize_cols`` is now ``False`` for both ``to_csv`` and + ``read_csv``. Fair warning in 0.12 (:issue:`3604`) +- moved timedeltas support to pandas.tseries.timedeltas.py; add timedeltas + string parsing, add top-level ``to_timedelta`` function +- ``NDFrame`` now is compatible with Python's toplevel ``abs()`` function + (:issue:`4821`). +- raise a ``TypeError`` on invalid comparison ops on Series/DataFrame (e.g. + integer/datetime) (:issue:`4968`) +- Added a new index type, ``Float64Index``. This will be automatically + created when passing floating values in index creation. This enables a + pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar + indexing and slicing work exactly the same. Indexing on other index types + are preserved (and positional fallback for ``[],ix``), with the exception, + that floating point slicing on indexes on non ``Float64Index`` will raise a + ``TypeError``, e.g. ``Series(range(5))[3.5:4.5]`` (:issue:`263`,:issue:`5375`) +- Make Categorical repr nicer (:issue:`4368`) +- Remove deprecated ``Factor`` (:issue:`3650`) +- Remove deprecated ``set_printoptions/reset_printoptions`` (:issue:``3046``) +- Remove deprecated ``_verbose_info`` (:issue:`3215`) +- Begin removing methods that don't make sense on ``GroupBy`` objects + (:issue:`4887`). +- Remove deprecated ``read_clipboard/to_clipboard/ExcelFile/ExcelWriter`` + from ``pandas.io.parsers`` (:issue:`3717`) +- All non-Index NDFrames (``Series``, ``DataFrame``, ``Panel``, ``Panel4D``, + ``SparsePanel``, etc.), now support the entire set of arithmetic operators + and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not + support ``pow`` or ``mod`` with non-scalars. (:issue:`3765`) +- Arithemtic func factories are now passed real names (suitable for using + with super) (:issue:`5240`) +- Provide numpy compatibility with 1.7 for a calling convention like + ``np.prod(pandas_object)`` as numpy call with additional keyword args + (:issue:`4435`) +- Provide __dir__ method (and local context) for tab completion / remove + ipython completers code (:issue:`4501`) +- Support non-unique axes in a Panel via indexing operations (:issue:`4960`) +- ``.truncate`` will raise a ``ValueError`` if invalid before and afters + dates are given (:issue:`5242`) +- ``Timestamp`` now supports ``now/today/utcnow`` class methods + (:issue:`5339`) +- default for `display.max_seq_len` is now 100 rather then `None`. This activates + truncated display ("...") of long sequences in various places. (:issue:`3391`) +- **All** division with ``NDFrame`` - likes is now truedivision, regardless + of the future import. 
You can use ``//`` and ``floordiv`` to do integer + division. + +.. code-block:: python + + In [3]: arr = np.array([1, 2, 3, 4]) + + In [4]: arr2 = np.array([5, 3, 2, 1]) + + In [5]: arr / arr2 + Out[5]: array([0, 0, 1, 4]) + + In [6]: pd.Series(arr) / pd.Series(arr2) # no future import required + Out[6]: + 0 0.200000 + 1 0.666667 + 2 1.500000 + 3 4.000000 + dtype: float64 + +- raise/warn ``SettingWithCopyError/Warning`` exception/warning when setting of a + copy thru chained assignment is detected, settable via option ``mode.chained_assignment`` +- test the list of ``NA`` values in the csv parser. add ``N/A``, ``#NA`` as independent default + na values (:issue:`5521`) +- The refactoring involving``Series`` deriving from ``NDFrame`` breaks ``rpy2<=2.3.8``. an Issue + has been opened against rpy2 and a workaround is detailed in :issue:`5698`. Thanks @JanSchulz. +- ``Series.argmin`` and ``Series.argmax`` are now aliased to ``Series.idxmin`` and ``Series.idxmax``. + These return the *index* of the min or max element respectively. Prior to 0.13.0 these would return + the position of the min / max element (:issue:`6214`) + +Internal Refactoring +~~~~~~~~~~~~~~~~~~~~ + +In 0.13.0 there is a major refactor primarily to subclass ``Series`` from +``NDFrame``, which is the base class currently for ``DataFrame`` and ``Panel``, +to unify methods and behaviors. Series formerly subclassed directly from +``ndarray``. (:issue:`4080`, :issue:`3862`, :issue:`816`) +See :ref:`Internal Refactoring` + +- Refactor of series.py/frame.py/panel.py to move common code to generic.py + + - added ``_setup_axes`` to created generic NDFrame structures + - moved methods + + - ``from_axes``, ``_wrap_array``, ``axes``, ``ix``, ``loc``, ``iloc``, + ``shape``, ``empty``, ``swapaxes``, ``transpose``, ``pop`` + - ``__iter__``, ``keys``, ``__contains__``, ``__len__``, ``__neg__``, + ``__invert__`` + - ``convert_objects``, ``as_blocks``, ``as_matrix``, ``values`` + - ``__getstate__``, ``__setstate__`` (compat remains in frame/panel) + - ``__getattr__``, ``__setattr__`` + - ``_indexed_same``, ``reindex_like``, ``align``, ``where``, ``mask`` + - ``fillna``, ``replace`` (``Series`` replace is now consistent with + ``DataFrame``) + - ``filter`` (also added axis argument to selectively filter on a different + axis) + - ``reindex``, ``reindex_axis``, ``take`` + - ``truncate`` (moved to become part of ``NDFrame``) + - ``isnull/notnull`` now available on ``NDFrame`` objects + +- These are API changes which make ``Panel`` more consistent with ``DataFrame`` + + - ``swapaxes`` on a ``Panel`` with the same axes specified now return a copy + - support attribute access for setting + - ``filter`` supports same api as original ``DataFrame`` filter + - ``fillna`` refactored to ``core/generic.py``, while > 3ndim is + ``NotImplemented`` + +- Series now inherits from ``NDFrame`` rather than directly from ``ndarray``. + There are several minor changes that affect the API. + + - numpy functions that do not support the array interface will now return + ``ndarrays`` rather than series, e.g. ``np.diff``, ``np.ones_like``, + ``np.where`` + - ``Series(0.5)`` would previously return the scalar ``0.5``, this is no + longer supported + - ``TimeSeries`` is now an alias for ``Series``. the property + ``is_time_series`` can be used to distinguish (if desired) + +- Refactor of Sparse objects to use BlockManager + + - Created a new block type in internals, ``SparseBlock``, which can hold + multi-dtypes and is non-consolidatable. 
``SparseSeries`` and + ``SparseDataFrame`` now inherit more methods from their hierarchy + (Series/DataFrame), and no longer inherit from ``SparseArray`` (which + instead is the object of the ``SparseBlock``) + - Sparse suite now supports integration with non-sparse data. Non-float + sparse data is supportable (partially implemented) + - Operations on sparse structures within DataFrames should preserve + sparseness; merging-type operations will convert to dense (and back to + sparse), so might be somewhat inefficient + - enable setitem on ``SparseSeries`` for boolean/integer/slices + - ``SparsePanels`` implementation is unchanged (e.g. not using BlockManager, + needs work) + +- added ``ftypes`` method to Series/DataFrame, similar to ``dtypes``, but + indicates if the underlying is sparse/dense (as well as the dtype) +- All ``NDFrame`` objects now have a ``_prop_attributes``, which can be used + to indicate various values to propagate to a new object from an existing + one (e.g. name in ``Series`` will follow more automatically now) +- Internal type checking is now done via a suite of generated classes, + allowing ``isinstance(value, klass)`` without having to directly import the + klass, courtesy of @jtratner +- Bug in Series update where the parent frame is not updating its cache based + on changes (:issue:`4080`, :issue:`5216`) or types (:issue:`3217`), fillna + (:issue:`3386`) +- Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`) +- Refactor ``Series.reindex`` to core/generic.py (:issue:`4604`, + :issue:`4618`), allow ``method=`` in reindexing on a Series to work +- ``Series.copy`` no longer accepts the ``order`` parameter and is now + consistent with ``NDFrame`` copy +- Refactor ``rename`` methods to core/generic.py; fixes ``Series.rename`` + (:issue:`4605`), and adds ``rename`` with the same signature for ``Panel`` +- Series (for index) / Panel (for items) now has attribute access to its + elements (:issue:`1903`) +- Refactor ``clip`` methods to core/generic.py (:issue:`4798`) +- Refactor of ``_get_numeric_data/_get_bool_data`` to core/generic.py, + allowing Series/Panel functionality +- Refactor of Series arithmetic with time-like objects + (datetime/timedelta/time etc.) into a separate, cleaned up wrapper class. + (:issue:`4613`) +- Complex compat for ``Series`` with ``ndarray``. (:issue:`4819`) +- Removed unnecessary ``rwproperty`` from codebase in favor of builtin + property. (:issue:`4843`) +- Refactor object level numeric methods (mean/sum/min/max...) from object + level modules to ``core/generic.py`` (:issue:`4435`). +- Refactor cum objects to core/generic.py (:issue:`4435`), note that these + have a more numpy-like function signature. +- :func:`~pandas.read_html` now uses ``TextParser`` to parse HTML data from + bs4/lxml (:issue:`4770`). +- Removed the ``keep_internal`` keyword parameter in + ``pandas/core/groupby.py`` because it wasn't being used (:issue:`5102`). +- Base ``DateOffsets`` are no longer all instantiated on importing pandas; + instead they are generated and cached on the fly. The internal + representation and handling of DateOffsets has also been clarified. + (:issue:`5189`, related :issue:`5004`) +- ``MultiIndex`` constructor now validates that passed levels and labels are + compatible.
(:issue:`5213`, :issue:`5214`) +- Unity ``dropna`` for Series/DataFrame signature (:issue:`5250`), + tests from :issue:`5234`, courtesy of @rockg +- Rewrite assert_almost_equal() in cython for performance (:issue:`4398`) +- Added an internal ``_update_inplace`` method to facilitate updating + ``NDFrame`` wrappers on inplace ops (only is for convenience of caller, + doesn't actually prevent copies). (:issue:`5247`) + +.. _release.bug_fixes-0.13.0: + + +Bug Fixes +~~~~~~~~~ + +- ``HDFStore`` + + - raising an invalid ``TypeError`` rather than ``ValueError`` when + appending with a different block ordering (:issue:`4096`) + - ``read_hdf`` was not respecting as passed ``mode`` (:issue:`4504`) + - appending a 0-len table will work correctly (:issue:`4273`) + - ``to_hdf`` was raising when passing both arguments ``append`` and + ``table`` (:issue:`4584`) + - reading from a store with duplicate columns across dtypes would raise + (:issue:`4767`) + - Fixed a bug where ``ValueError`` wasn't correctly raised when column + names weren't strings (:issue:`4956`) + - A zero length series written in Fixed format not deserializing properly. + (:issue:`4708`) + - Fixed decoding perf issue on pyt3 (:issue:`5441`) + - Validate levels in a multi-index before storing (:issue:`5527`) + - Correctly handle ``data_columns`` with a Panel (:issue:`5717`) +- Fixed bug in tslib.tz_convert(vals, tz1, tz2): it could raise IndexError + exception while trying to access trans[pos + 1] (:issue:`4496`) +- The ``by`` argument now works correctly with the ``layout`` argument + (:issue:`4102`, :issue:`4014`) in ``*.hist`` plotting methods +- Fixed bug in ``PeriodIndex.map`` where using ``str`` would return the str + representation of the index (:issue:`4136`) +- Fixed test failure ``test_time_series_plot_color_with_empty_kwargs`` when + using custom matplotlib default colors (:issue:`4345`) +- Fix running of stata IO tests. Now uses temporary files to write + (:issue:`4353`) +- Fixed an issue where ``DataFrame.sum`` was slower than ``DataFrame.mean`` + for integer valued frames (:issue:`4365`) +- ``read_html`` tests now work with Python 2.6 (:issue:`4351`) +- Fixed bug where ``network`` testing was throwing ``NameError`` because a + local variable was undefined (:issue:`4381`) +- In ``to_json``, raise if a passed ``orient`` would cause loss of data + because of a duplicate index (:issue:`4359`) +- In ``to_json``, fix date handling so milliseconds are the default timestamp + as the docstring says (:issue:`4362`). +- ``as_index`` is no longer ignored when doing groupby apply (:issue:`4648`, + :issue:`3417`) +- JSON NaT handling fixed, NaTs are now serialised to `null` (:issue:`4498`) +- Fixed JSON handling of escapable characters in JSON object keys + (:issue:`4593`) +- Fixed passing ``keep_default_na=False`` when ``na_values=None`` + (:issue:`4318`) +- Fixed bug with ``values`` raising an error on a DataFrame with duplicate + columns and mixed dtypes, surfaced in (:issue:`4377`) +- Fixed bug with duplicate columns and type conversion in ``read_json`` when + ``orient='split'`` (:issue:`4377`) +- Fixed JSON bug where locales with decimal separators other than '.' threw + exceptions when encoding / decoding certain values. 
(:issue:`4918`) +- Fix ``.iat`` indexing with a ``PeriodIndex`` (:issue:`4390`) +- Fixed an issue where ``PeriodIndex`` joining with self was returning a new + instance rather than the same instance (:issue:`4379`); also adds a test + for this for the other index types +- Fixed a bug with all the dtypes being converted to object when using the + CSV cparser with the usecols parameter (:issue:`3192`) +- Fix an issue in merging blocks where the resulting DataFrame had partially + set _ref_locs (:issue:`4403`) +- Fixed an issue where hist subplots were being overwritten when they were + called using the top level matplotlib API (:issue:`4408`) +- Fixed a bug where calling ``Series.astype(str)`` would truncate the string + (:issue:`4405`, :issue:`4437`) +- Fixed a py3 compat issue where bytes were being repr'd as tuples + (:issue:`4455`) +- Fixed Panel attribute naming conflict if item is named 'a' + (:issue:`3440`) +- Fixed an issue where duplicate indexes were raising when plotting + (:issue:`4486`) +- Fixed an issue where cumsum and cumprod didn't work with bool dtypes + (:issue:`4170`, :issue:`4440`) +- Fixed Panel slicing issued in ``xs`` that was returning an incorrect dimmed + object (:issue:`4016`) +- Fix resampling bug where custom reduce function not used if only one group + (:issue:`3849`, :issue:`4494`) +- Fixed Panel assignment with a transposed frame (:issue:`3830`) +- Raise on set indexing with a Panel and a Panel as a value which needs + alignment (:issue:`3777`) +- frozenset objects now raise in the ``Series`` constructor (:issue:`4482`, + :issue:`4480`) +- Fixed issue with sorting a duplicate multi-index that has multiple dtypes + (:issue:`4516`) +- Fixed bug in ``DataFrame.set_values`` which was causing name attributes to + be lost when expanding the index. (:issue:`3742`, :issue:`4039`) +- Fixed issue where individual ``names``, ``levels`` and ``labels`` could be + set on ``MultiIndex`` without validation (:issue:`3714`, :issue:`4039`) +- Fixed (:issue:`3334`) in pivot_table. Margins did not compute if values is + the index. +- Fix bug in having a rhs of ``np.timedelta64`` or ``np.offsets.DateOffset`` + when operating with datetimes (:issue:`4532`) +- Fix arithmetic with series/datetimeindex and ``np.timedelta64`` not working + the same (:issue:`4134`) and buggy timedelta in numpy 1.6 (:issue:`4135`) +- Fix bug in ``pd.read_clipboard`` on windows with PY3 (:issue:`4561`); not + decoding properly +- ``tslib.get_period_field()`` and ``tslib.get_period_field_arr()`` now raise + if code argument out of range (:issue:`4519`, :issue:`4520`) +- Fix boolean indexing on an empty series loses index names (:issue:`4235`), + infer_dtype works with empty arrays. +- Fix reindexing with multiple axes; if an axes match was not replacing the + current axes, leading to a possible lazay frequency inference issue + (:issue:`3317`) +- Fixed issue where ``DataFrame.apply`` was reraising exceptions incorrectly + (causing the original stack trace to be truncated). 
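+
+For example, a minimal sketch of the boolean ``cumsum``/``cumprod`` behavior
+noted above (the values in the comments are indicative):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   s = pd.Series([True, False, True, True])
+   s.cumsum()    # 1, 1, 2, 3 -- booleans accumulate as integers
+   s.cumprod()   # 1, 0, 0, 0
+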
+- Fix selection with ``ix/loc`` and non_unique selectors (:issue:`4619`) +- Fix assignment with iloc/loc involving a dtype change in an existing column + (:issue:`4312`, :issue:`5702`) have internal setitem_with_indexer in core/indexing + to use Block.setitem +- Fixed bug where thousands operator was not handled correctly for floating + point numbers in csv_import (:issue:`4322`) +- Fix an issue with CacheableOffset not properly being used by many + DateOffset; this prevented the DateOffset from being cached (:issue:`4609`) +- Fix boolean comparison with a DataFrame on the lhs, and a list/tuple on the + rhs (:issue:`4576`) +- Fix error/dtype conversion with setitem of ``None`` on ``Series/DataFrame`` + (:issue:`4667`) +- Fix decoding based on a passed in non-default encoding in ``pd.read_stata`` + (:issue:`4626`) +- Fix ``DataFrame.from_records`` with a plain-vanilla ``ndarray``. + (:issue:`4727`) +- Fix some inconsistencies with ``Index.rename`` and ``MultiIndex.rename``, + etc. (:issue:`4718`, :issue:`4628`) +- Bug in using ``iloc/loc`` with a cross-sectional and duplicate indicies + (:issue:`4726`) +- Bug with using ``QUOTE_NONE`` with ``to_csv`` causing ``Exception``. + (:issue:`4328`) +- Bug with Series indexing not raising an error when the right-hand-side has + an incorrect length (:issue:`2702`) +- Bug in multi-indexing with a partial string selection as one part of a + MultIndex (:issue:`4758`) +- Bug with reindexing on the index with a non-unique index will now raise + ``ValueError`` (:issue:`4746`) +- Bug in setting with ``loc/ix`` a single indexer with a multi-index axis and + a numpy array, related to (:issue:`3777`) +- Bug in concatenation with duplicate columns across dtypes not merging with + axis=0 (:issue:`4771`, :issue:`4975`) +- Bug in ``iloc`` with a slice index failing (:issue:`4771`) +- Incorrect error message with no colspecs or width in ``read_fwf``. + (:issue:`4774`) +- Fix bugs in indexing in a Series with a duplicate index (:issue:`4548`, + :issue:`4550`) +- Fixed bug with reading compressed files with ``read_fwf`` in Python 3. + (:issue:`3963`) +- Fixed an issue with a duplicate index and assignment with a dtype change + (:issue:`4686`) +- Fixed bug with reading compressed files in as ``bytes`` rather than ``str`` + in Python 3. Simplifies bytes-producing file-handling in Python 3 + (:issue:`3963`, :issue:`4785`). +- Fixed an issue related to ticklocs/ticklabels with log scale bar plots + across different versions of matplotlib (:issue:`4789`) +- Suppressed DeprecationWarning associated with internal calls issued by + repr() (:issue:`4391`) +- Fixed an issue with a duplicate index and duplicate selector with ``.loc`` + (:issue:`4825`) +- Fixed an issue with ``DataFrame.sort_index`` where, when sorting by a + single column and passing a list for ``ascending``, the argument for + ``ascending`` was being interpreted as ``True`` (:issue:`4839`, + :issue:`4846`) +- Fixed ``Panel.tshift`` not working. Added `freq` support to ``Panel.shift`` + (:issue:`4853`) +- Fix an issue in TextFileReader w/ Python engine (i.e. PythonParser) + with thousands != "," (:issue:`4596`) +- Bug in getitem with a duplicate index when using where (:issue:`4879`) +- Fix Type inference code coerces float column into datetime (:issue:`4601`) +- Fixed ``_ensure_numeric`` does not check for complex numbers + (:issue:`4902`) +- Fixed a bug in ``Series.hist`` where two figures were being created when + the ``by`` argument was passed (:issue:`4112`, :issue:`4113`). 
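+
+As a small illustration of the stricter length check on Series assignment
+noted above (a sketch; the exact error message may differ):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   s = pd.Series([1, 2, 3, 4])
+   # a right-hand side whose length does not match the sliced
+   # selection is no longer silently mis-assigned
+   s[:3] = [10, 20]   # raises ValueError
+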
+- Fixed a bug in ``convert_objects`` for > 2 ndims (:issue:`4937`) +- Fixed a bug in DataFrame/Panel cache insertion and subsequent indexing + (:issue:`4939`, :issue:`5424`) +- Fixed string methods for ``FrozenNDArray`` and ``FrozenList`` + (:issue:`4929`) +- Fixed a bug with setting invalid or out-of-range values in indexing + enlargement scenarios (:issue:`4940`) +- Tests for fillna on empty Series (:issue:`4346`), thanks @immerrr +- Fixed ``copy()`` to shallow copy axes/indices as well and thereby keep + separate metadata. (:issue:`4202`, :issue:`4830`) +- Fixed skiprows option in Python parser for read_csv (:issue:`4382`) +- Fixed bug preventing ``cut`` from working with ``np.inf`` levels without + explicitly passing labels (:issue:`3415`) +- Fixed wrong check for overlapping in ``DatetimeIndex.union`` + (:issue:`4564`) +- Fixed conflict between thousands separator and date parser in csv_parser + (:issue:`4678`) +- Fix appending when dtypes are not the same (error showing mixing + float/np.datetime64) (:issue:`4993`) +- Fix repr for DateOffset. No longer show duplicate entries in kwds. + Removed unused offset fields. (:issue:`4638`) +- Fixed wrong index name during read_csv if using usecols. Applies to c + parser only. (:issue:`4201`) +- ``Timestamp`` objects can now appear in the left hand side of a comparison + operation with a ``Series`` or ``DataFrame`` object (:issue:`4982`). +- Fix a bug when indexing with ``np.nan`` via ``iloc/loc`` (:issue:`5016`) +- Fixed a bug where low memory c parser could create different types in + different chunks of the same file. Now coerces to numerical type or raises + warning. (:issue:`3866`) +- Fix a bug where reshaping a ``Series`` to its own shape raised + ``TypeError`` (:issue:`4554`) and other reshaping issues. +- Bug in setting with ``ix/loc`` and a mixed int/string index (:issue:`4544`) +- Make sure series-series boolean comparions are label based (:issue:`4947`) +- Bug in multi-level indexing with a Timestamp partial indexer + (:issue:`4294`) +- Tests/fix for multi-index construction of an all-nan frame (:issue:`4078`) +- Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring + values of tables with commas (:issue:`5029`) +- Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable + ordering of returned tables (:issue:`4770`, :issue:`5029`). +- Fixed a bug where :func:`~pandas.read_html` was incorrectly parsing when + passed ``index_col=0`` (:issue:`5066`). +- Fixed a bug where :func:`~pandas.read_html` was incorrectly infering the + type of headers (:issue:`5048`). +- Fixed a bug where ``DatetimeIndex`` joins with ``PeriodIndex`` caused a + stack overflow (:issue:`3899`). +- Fixed a bug where ``groupby`` objects didn't allow plots (:issue:`5102`). +- Fixed a bug where ``groupby`` objects weren't tab-completing column names + (:issue:`5102`). +- Fixed a bug where ``groupby.plot()`` and friends were duplicating figures + multiple times (:issue:`5102`). +- Provide automatic conversion of ``object`` dtypes on fillna, related + (:issue:`5103`) +- Fixed a bug where default options were being overwritten in the option + parser cleaning (:issue:`5121`). +- Treat a list/ndarray identically for ``iloc`` indexing with list-like + (:issue:`5006`) +- Fix ``MultiIndex.get_level_values()`` with missing values (:issue:`5074`) +- Fix bound checking for Timestamp() with datetime64 input (:issue:`4065`) +- Fix a bug where ``TestReadHtml`` wasn't calling the correct ``read_html()`` + function (:issue:`5150`). 
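+
+A minimal sketch of the list/ndarray equivalence for ``iloc`` noted above
+(the column names are arbitrary):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   df = pd.DataFrame({'A': range(4), 'B': list('wxyz')})
+   # a python list and an ndarray of positions now select identically
+   df.iloc[[0, 2]]
+   df.iloc[np.array([0, 2])]
+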
+- Fix a bug with ``NDFrame.replace()`` which made replacement appear as + though it was (incorrectly) using regular expressions (:issue:`5143`). +- Fix better error message for to_datetime (:issue:`4928`) +- Made sure different locales are tested on travis-ci (:issue:`4918`). Also + adds a couple of utilities for getting locales and setting locales with a + context manager. +- Fixed segfault on ``isnull(MultiIndex)`` (now raises an error instead) + (:issue:`5123`, :issue:`5125`) +- Allow duplicate indices when performing operations that align + (:issue:`5185`, :issue:`5639`) +- Compound dtypes in a constructor raise ``NotImplementedError`` + (:issue:`5191`) +- Bug in comparing duplicate frames (:issue:`4421`) related +- Bug in describe on duplicate frames +- Bug in ``to_datetime`` with a format and ``coerce=True`` not raising + (:issue:`5195`) +- Bug in ``loc`` setting with multiple indexers and a rhs of a Series that + needs broadcasting (:issue:`5206`) +- Fixed bug where inplace setting of levels or labels on ``MultiIndex`` would + not clear cached ``values`` property and therefore return wrong ``values``. + (:issue:`5215`) +- Fixed bug where filtering a grouped DataFrame or Series did not maintain + the original ordering (:issue:`4621`). +- Fixed ``Period`` with a business date freq to always roll-forward if on a + non-business date. (:issue:`5203`) +- Fixed bug in Excel writers where frames with duplicate column names weren't + written correctly. (:issue:`5235`) +- Fixed issue with ``drop`` and a non-unique index on Series (:issue:`5248`) +- Fixed seg fault in C parser caused by passing more names than columns in + the file. (:issue:`5156`) +- Fix ``Series.isin`` with date/time-like dtypes (:issue:`5021`) +- C and Python Parser can now handle the more common multi-index column + format which doesn't have a row for index names (:issue:`4702`) +- Bug when trying to use an out-of-bounds date as an object dtype + (:issue:`5312`) +- Bug when trying to display an embedded PandasObject (:issue:`5324`) +- Allows operating of Timestamps to return a datetime if the result is out-of-bounds + related (:issue:`5312`) +- Fix return value/type signature of ``initObjToJSON()`` to be compatible + with numpy's ``import_array()`` (:issue:`5334`, :issue:`5326`) +- Bug when renaming then set_index on a DataFrame (:issue:`5344`) +- Test suite no longer leaves around temporary files when testing graphics. (:issue:`5347`) + (thanks for catching this @yarikoptic!) +- Fixed html tests on win32. (:issue:`4580`) +- Make sure that ``head/tail`` are ``iloc`` based, (:issue:`5370`) +- Fixed bug for ``PeriodIndex`` string representation if there are 1 or 2 + elements. (:issue:`5372`) +- The GroupBy methods ``transform`` and ``filter`` can be used on Series + and DataFrames that have repeated (non-unique) indices. (:issue:`4620`) +- Fix empty series not printing name in repr (:issue:`4651`) +- Make tests create temp files in temp directory by default. 
(:issue:`5419`) +- ``pd.to_timedelta`` of a scalar returns a scalar (:issue:`5410`) +- ``pd.to_timedelta`` accepts ``NaN`` and ``NaT``, returning ``NaT`` instead of raising (:issue:`5437`) +- performance improvements in ``isnull`` on larger size pandas objects +- Fixed various setitem with 1d ndarray that does not have a matching + length to the indexer (:issue:`5508`) +- Bug in getitem with a multi-index and ``iloc`` (:issue:`5528`) +- Bug in delitem on a Series (:issue:`5542`) +- Bug fix in apply when using custom function and objects are not mutated (:issue:`5545`) +- Bug in selecting from a non-unique index with ``loc`` (:issue:`5553`) +- Bug in groupby returning non-consistent types when user function returns a ``None``, (:issue:`5592`) +- Work around regression in numpy 1.7.0 which erroneously raises IndexError from ``ndarray.item`` (:issue:`5666`) +- Bug in repeated indexing of object with resultant non-unique index (:issue:`5678`) +- Bug in fillna with Series and a passed series/dict (:issue:`5703`) +- Bug in groupby transform with a datetime-like grouper (:issue:`5712`) +- Bug in multi-index selection in PY3 when using certain keys (:issue:`5725`) +- Row-wise concat of differing dtypes failing in certain cases (:issue:`5754`) + +pandas 0.12.0 +------------- + +**Release date:** 2013-07-24 + +New Features +~~~~~~~~~~~~ + +- ``pd.read_html()`` can now parse HTML strings, files or urls and returns a + list of ``DataFrame`` s courtesy of @cpcloud. (:issue:`3477`, + :issue:`3605`, :issue:`3606`) +- Support for reading Amazon S3 files. (:issue:`3504`) +- Added module for reading and writing JSON strings/files: pandas.io.json + includes ``to_json`` DataFrame/Series method, and a ``read_json`` top-level reader + various issues (:issue:`1226`, :issue:`3804`, :issue:`3876`, :issue:`3867`, :issue:`1305`) +- Added module for reading and writing Stata files: pandas.io.stata (:issue:`1512`) + includes ``to_stata`` DataFrame method, and a ``read_stata`` top-level reader +- Added support for writing in ``to_csv`` and reading in ``read_csv``, + multi-index columns. The ``header`` option in ``read_csv`` now accepts a + list of the rows from which to read the index. Added the option, + ``tupleize_cols`` to provide compatiblity for the pre 0.12 behavior of + writing and reading multi-index columns via a list of tuples. The default in + 0.12 is to write lists of tuples and *not* interpret list of tuples as a + multi-index column. + Note: The default value will change in 0.12 to make the default *to* write and + read multi-index columns in the new format. (:issue:`3571`, :issue:`1651`, :issue:`3141`) +- Add iterator to ``Series.str`` (:issue:`3638`) +- ``pd.set_option()`` now allows N option, value pairs (:issue:`3667`). +- Added keyword parameters for different types of scatter_matrix subplots +- A ``filter`` method on grouped Series or DataFrames returns a subset of + the original (:issue:`3680`, :issue:`919`) +- Access to historical Google Finance data in pandas.io.data (:issue:`3814`) +- DataFrame plotting methods can sample column colors from a Matplotlib + colormap via the ``colormap`` keyword. 
(:issue:`3860`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Fixed various issues with internal pprinting code, the repr() for various objects + including TimeStamp and Index now produces valid python code strings and + can be used to recreate the object, (:issue:`3038`, :issue:`3379`, :issue:`3251`, :issue:`3460`) +- ``convert_objects`` now accepts a ``copy`` parameter (defaults to ``True``) +- ``HDFStore`` + + - will retain index attributes (freq,tz,name) on recreation (:issue:`3499`,:issue:`4098`) + - will warn with a ``AttributeConflictWarning`` if you are attempting to append + an index with a different frequency than the existing, or attempting + to append an index with a different name than the existing + - support datelike columns with a timezone as data_columns (:issue:`2852`) + - table writing performance improvements. + - support python3 (via ``PyTables 3.0.0``) (:issue:`3750`) +- Add modulo operator to Series, DataFrame +- Add ``date`` method to DatetimeIndex +- Add ``dropna`` argument to pivot_table (:issue: `3820`) +- Simplified the API and added a describe method to Categorical +- ``melt`` now accepts the optional parameters ``var_name`` and ``value_name`` + to specify custom column names of the returned DataFrame (:issue:`3649`), + thanks @hoechenberger. If ``var_name`` is not specified and ``dataframe.columns.name`` + is not None, then this will be used as the ``var_name`` (:issue:`4144`). + Also support for MultiIndex columns. +- clipboard functions use pyperclip (no dependencies on Windows, alternative + dependencies offered for Linux) (:issue:`3837`). +- Plotting functions now raise a ``TypeError`` before trying to plot anything + if the associated objects have have a dtype of ``object`` (:issue:`1818`, + :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object + arrays to numeric arrays if possible so that you can still plot, for example, an + object array with floats. This happens before any drawing takes place which + elimnates any spurious plots from showing up. +- Added Faq section on repr display options, to help users customize their setup. +- ``where`` operations that result in block splitting are much faster (:issue:`3733`) +- Series and DataFrame hist methods now take a ``figsize`` argument (:issue:`3834`) +- DatetimeIndexes no longer try to convert mixed-integer indexes during join + operations (:issue:`3877`) +- Add ``unit`` keyword to ``Timestamp`` and ``to_datetime`` to enable passing of + integers or floats that are in an epoch unit of ``D, s, ms, us, ns``, thanks @mtkini (:issue:`3969`) + (e.g. unix timestamps or epoch ``s``, with fracional seconds allowed) (:issue:`3540`) +- DataFrame corr method (spearman) is now cythonized. +- Improved ``network`` test decorator to catch ``IOError`` (and therefore + ``URLError`` as well). Added ``with_connectivity_check`` decorator to allow + explicitly checking a website as a proxy for seeing if there is network + connectivity. Plus, new ``optional_args`` decorator factory for decorators. 
+ (:issue:`3910`, :issue:`3914`) +- ``read_csv`` will now throw a more informative error message when a file + contains no columns, e.g., all newline characters +- Added ``layout`` keyword to DataFrame.hist() for more customizable layout (:issue:`4050`) +- Timestamp.min and Timestamp.max now represent valid Timestamp instances instead + of the default datetime.min and datetime.max (respectively), thanks @SleepingPills +- ``read_html`` now raises when no tables are found and BeautifulSoup==4.2.0 + is detected (:issue:`4214`) + +API Changes +~~~~~~~~~~~ + +- ``HDFStore`` + + - When removing an object, ``remove(key)`` raises + ``KeyError`` if the key is not a valid store object. + - raise a ``TypeError`` on passing ``where`` or ``columns`` + to select with a Storer; these are invalid parameters at this time (:issue:`4189`) + - can now specify an ``encoding`` option to ``append/put`` + to enable alternate encodings (:issue:`3750`) + - enable support for ``iterator/chunksize`` with ``read_hdf`` +- The repr() for (Multi)Index now obeys display.max_seq_items rather + then numpy threshold print options. (:issue:`3426`, :issue:`3466`) +- Added mangle_dupe_cols option to read_table/csv, allowing users + to control legacy behaviour re dupe cols (A, A.1, A.2 vs A, A ) (:issue:`3468`) + Note: The default value will change in 0.12 to the "no mangle" behaviour, + If your code relies on this behaviour, explicitly specify mangle_dupe_cols=True + in your calls. +- Do not allow astypes on ``datetime64[ns]`` except to ``object``, and + ``timedelta64[ns]`` to ``object/int`` (:issue:`3425`) +- The behavior of ``datetime64`` dtypes has changed with respect to certain + so-called reduction operations (:issue:`3726`). The following operations now + raise a ``TypeError`` when perfomed on a ``Series`` and return an *empty* + ``Series`` when performed on a ``DataFrame`` similar to performing these + operations on, for example, a ``DataFrame`` of ``slice`` objects: + - sum, prod, mean, std, var, skew, kurt, corr, and cov +- Do not allow datetimelike/timedeltalike creation except with valid types + (e.g. cannot pass ``datetime64[ms]``) (:issue:`3423`) +- Add ``squeeze`` keyword to ``groupby`` to allow reduction from + DataFrame -> Series if groups are unique. Regression from 0.10.1, + partial revert on (:issue:`2893`) with (:issue:`3596`) +- Raise on ``iloc`` when boolean indexing with a label based indexer mask + e.g. a boolean Series, even with integer labels, will raise. Since ``iloc`` + is purely positional based, the labels on the Series are not alignable (:issue:`3631`) +- The ``raise_on_error`` option to plotting methods is obviated by :issue:`3572`, + so it is removed. Plots now always raise when data cannot be plotted or the + object being plotted has a dtype of ``object``. +- ``DataFrame.interpolate()`` is now deprecated. Please use + ``DataFrame.fillna()`` and ``DataFrame.replace()`` instead (:issue:`3582`, + :issue:`3675`, :issue:`3676`). +- the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are + deprecated +- ``DataFrame.replace`` 's ``infer_types`` parameter is removed and now + performs conversion by default. (:issue:`3907`) +- Deprecated display.height, display.width is now only a formatting option + does not control triggering of summary, similar to < 0.11.0. 
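+
+To illustrate the duplicate-column mangling described above, a minimal
+sketch (assuming Python 3 for ``io.StringIO``):
+
+.. code-block:: python
+
+   from io import StringIO
+
+   import pandas as pd
+
+   data = "A,A,B\n1,2,3\n4,5,6"
+   # with the default mangle_dupe_cols=True, duplicate column names
+   # are disambiguated as A, A.1
+   pd.read_csv(StringIO(data)).columns.tolist()   # ['A', 'A.1', 'B']
+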
+- Add the keyword ``allow_duplicates`` to ``DataFrame.insert`` to allow a duplicate column + to be inserted if ``True``, default is ``False`` (same as prior to 0.12) (:issue:`3679`) +- io API changes + + - added ``pandas.io.api`` for i/o imports + - removed ``Excel`` support to ``pandas.io.excel`` + - added top-level ``pd.read_sql`` and ``to_sql`` DataFrame methods + - removed ``clipboard`` support to ``pandas.io.clipboard`` + - replace top-level and instance methods ``save`` and ``load`` with + top-level ``read_pickle`` and ``to_pickle`` instance method, ``save`` and + ``load`` will give deprecation warning. +- the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are + deprecated +- set FutureWarning to require data_source, and to replace year/month with + expiry date in pandas.io options. This is in preparation to add options + data from google (:issue:`3822`) +- the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are + deprecated +- Implement ``__nonzero__`` for ``NDFrame`` objects (:issue:`3691`, :issue:`3696`) +- ``as_matrix`` with mixed signed and unsigned dtypes will result in 2 x the lcd of the unsigned + as an int, maxing with ``int64``, to avoid precision issues (:issue:`3733`) +- ``na_values`` in a list provided to ``read_csv/read_excel`` will match string and numeric versions + e.g. ``na_values=['99']`` will match 99 whether the column ends up being int, float, or string (:issue:`3611`) +- ``read_html`` now defaults to ``None`` when reading, and falls back on + ``bs4`` + ``html5lib`` when lxml fails to parse. a list of parsers to try + until success is also valid +- more consistency in the to_datetime return types (give string/array of string inputs) (:issue:`3888`) +- The internal ``pandas`` class hierarchy has changed (slightly). The + previous ``PandasObject`` now is called ``PandasContainer`` and a new + ``PandasObject`` has become the baseclass for ``PandasContainer`` as well + as ``Index``, ``Categorical``, ``GroupBy``, ``SparseList``, and + ``SparseArray`` (+ their base classes). Currently, ``PandasObject`` + provides string methods (from ``StringMixin``). (:issue:`4090`, :issue:`4092`) +- New ``StringMixin`` that, given a ``__unicode__`` method, gets python 2 and + python 3 compatible string methods (``__str__``, ``__bytes__``, and + ``__repr__``). Plus string safety throughout. Now employed in many places + throughout the pandas library. (:issue:`4090`, :issue:`4092`) + +Experimental Features +~~~~~~~~~~~~~~~~~~~~~ + +- Added experimental ``CustomBusinessDay`` class to support ``DateOffsets`` + with custom holiday calendars and custom weekmasks. (:issue:`2301`) + +Bug Fixes +~~~~~~~~~ + +- Fixed an esoteric excel reading bug, xlrd>= 0.9.0 now required for excel + support. Should provide python3 support (for reading) which has been + lacking. (:issue:`3164`) +- Disallow Series constructor called with MultiIndex which caused segfault (:issue:`4187`) +- Allow unioning of date ranges sharing a timezone (:issue:`3491`) +- Fix to_csv issue when having a large number of rows and ``NaT`` in some + columns (:issue:`3437`) +- ``.loc`` was not raising when passed an integer list (:issue:`3449`) +- Unordered time series selection was misbehaving when using label slicing (:issue:`3448`) +- Fix sorting in a frame with a list of columns which contains datetime64[ns] dtypes (:issue:`3461`) +- DataFrames fetched via FRED now handle '.' as a NaN. 
(:issue:`3469`) +- Fix regression in a DataFrame apply with axis=1, objects were not being converted back + to base dtypes correctly (:issue:`3480`) +- Fix issue when storing uint dtypes in an HDFStore. (:issue:`3493`) +- Non-unique index support clarified (:issue:`3468`) + + - Addressed handling of dupe columns in df.to_csv new and old (:issue:`3454`, :issue:`3457`) + - Fix assigning a new index to a duplicate index in a DataFrame would fail (:issue:`3468`) + - Fix construction of a DataFrame with a duplicate index + - ref_locs support to allow duplicative indices across dtypes, + allows iget support to always find the index (even across dtypes) (:issue:`2194`) + - applymap on a DataFrame with a non-unique index now works + (removed warning) (:issue:`2786`), and fix (:issue:`3230`) + - Fix to_csv to handle non-unique columns (:issue:`3495`) + - Duplicate indexes with getitem will return items in the correct order (:issue:`3455`, :issue:`3457`) + and handle missing elements like unique indices (:issue:`3561`) + - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (:issue:`3562`) + - Concat to produce a non-unique columns when duplicates are across dtypes is fixed (:issue:`3602`) + - Non-unique indexing with a slice via ``loc`` and friends fixed (:issue:`3659`) + - Allow insert/delete to non-unique columns (:issue:`3679`) + - Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`) + - ``DataFrame.itertuples()`` now works with frames with duplicate column + names (:issue:`3873`) + - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to + ``reindex`` for location-based taking + - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`) + - Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`) + +- Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`) +- Allow index name to be used in groupby for non MultiIndex (:issue:`4014`) +- Fixed bug in mixed-frame assignment with aligned series (:issue:`3492`) +- Fixed bug in selecting month/quarter/year from a series would not select the time element + on the last day (:issue:`3546`) +- Fixed a couple of MultiIndex rendering bugs in df.to_html() (:issue:`3547`, :issue:`3553`) +- Properly convert np.datetime64 objects in a Series (:issue:`3416`) +- Raise a ``TypeError`` on invalid datetime/timedelta operations + e.g. add datetimes, multiple timedelta x datetime +- Fix ``.diff`` on datelike and timedelta operations (:issue:`3100`) +- ``combine_first`` not returning the same dtype in cases where it can (:issue:`3552`) +- Fixed bug with ``Panel.transpose`` argument aliases (:issue:`3556`) +- Fixed platform bug in ``PeriodIndex.take`` (:issue:`3579`) +- Fixed bud in incorrect conversion of datetime64[ns] in ``combine_first`` (:issue:`3593`) +- Fixed bug in reset_index with ``NaN`` in a multi-index (:issue:`3586`) +- ``fillna`` methods now raise a ``TypeError`` when the ``value`` parameter + is a ``list`` or ``tuple``. 
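+
+A short sketch of the ``fillna`` restriction just described (scalar and
+dict values remain allowed):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   s = pd.Series([1.0, np.nan, 3.0])
+   s.fillna(0)        # a scalar is fine
+   s.fillna({1: 0})   # so is a dict keyed by label
+   s.fillna([0])      # now raises TypeError
+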
+- Fixed bug where a time-series was being selected in preference to an actual column name + in a frame (:issue:`3594`) +- Make secondary_y work properly for bar plots (:issue:`3598`) +- Fix modulo and integer division on Series,DataFrames to act similary to ``float`` dtypes to return + ``np.nan`` or ``np.inf`` as appropriate (:issue:`3590`) +- Fix incorrect dtype on groupby with ``as_index=False`` (:issue:`3610`) +- Fix ``read_csv/read_excel`` to correctly encode identical na_values, e.g. ``na_values=[-999.0,-999]`` + was failing (:issue:`3611`) +- Disable HTML output in qtconsole again. (:issue:`3657`) +- Reworked the new repr display logic, which users found confusing. (:issue:`3663`) +- Fix indexing issue in ndim >= 3 with ``iloc`` (:issue:`3617`) +- Correctly parse date columns with embedded (nan/NaT) into datetime64[ns] dtype in ``read_csv`` + when ``parse_dates`` is specified (:issue:`3062`) +- Fix not consolidating before to_csv (:issue:`3624`) +- Fix alignment issue when setitem in a DataFrame with a piece of a DataFrame (:issue:`3626`) or + a mixed DataFrame and a Series (:issue:`3668`) +- Fix plotting of unordered DatetimeIndex (:issue:`3601`) +- ``sql.write_frame`` failing when writing a single column to sqlite (:issue:`3628`), + thanks to @stonebig +- Fix pivoting with ``nan`` in the index (:issue:`3558`) +- Fix running of bs4 tests when it is not installed (:issue:`3605`) +- Fix parsing of html table (:issue:`3606`) +- ``read_html()`` now only allows a single backend: ``html5lib`` (:issue:`3616`) +- ``convert_objects`` with ``convert_dates='coerce'`` was parsing some single-letter strings into today's date +- ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`) +- ``DataFrame.to_csv`` will succeed with the deprecated option ``nanRep``, @tdsmith +- ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for + their first argument (:issue:`3702`) +- Fix file tokenization error with \r delimiter and quoted fields (:issue:`3453`) +- Groupby transform with item-by-item not upcasting correctly (:issue:`3740`) +- Incorrectly read a HDFStore multi-index Frame witha column specification (:issue:`3748`) +- ``read_html`` now correctly skips tests (:issue:`3741`) +- PandasObjects raise TypeError when trying to hash (:issue:`3882`) +- Fix incorrect arguments passed to concat that are not list-like (e.g. 
concat(df1,df2)) (:issue:`3481`) +- Correctly parse when passed the ``dtype=str`` (or other variable-len string dtypes) + in ``read_csv`` (:issue:`3795`) +- Fix index name not propogating when using ``loc/ix`` (:issue:`3880`) +- Fix groupby when applying a custom function resulting in a returned DataFrame was + not converting dtypes (:issue:`3911`) +- Fixed a bug where ``DataFrame.replace`` with a compiled regular expression + in the ``to_replace`` argument wasn't working (:issue:`3907`) +- Fixed ``__truediv__`` in Python 2.7 with ``numexpr`` installed to actually do true division when dividing + two integer arrays with at least 10000 cells total (:issue:`3764`) +- Indexing with a string with seconds resolution not selecting from a time index (:issue:`3925`) +- csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was + specified (:issue:`3967`), python parser failing with ``chunksize=1`` +- Fix index name not propogating when using ``shift`` +- Fixed dropna=False being ignored with multi-index stack (:issue:`3997`) +- Fixed flattening of columns when renaming MultiIndex columns DataFrame (:issue:`4004`) +- Fix ``Series.clip`` for datetime series. NA/NaN threshold values will now throw ValueError (:issue:`3996`) +- Fixed insertion issue into DataFrame, after rename (:issue:`4032`) +- Fixed testing issue where too many sockets where open thus leading to a + connection reset issue (:issue:`3982`, :issue:`3985`, :issue:`4028`, + :issue:`4054`) +- Fixed failing tests in test_yahoo, test_google where symbols were not + retrieved but were being accessed (:issue:`3982`, :issue:`3985`, + :issue:`4028`, :issue:`4054`) +- ``Series.hist`` will now take the figure from the current environment if + one is not passed +- Fixed bug where a 1xN DataFrame would barf on a 1xN mask (:issue:`4071`) +- Fixed running of ``tox`` under python3 where the pickle import was getting + rewritten in an incompatible way (:issue:`4062`, :issue:`4063`) +- Fixed bug where sharex and sharey were not being passed to grouped_hist + (:issue:`4089`) +- Fix bug where ``HDFStore`` will fail to append because of a different block + ordering on-disk (:issue:`4096`) +- Better error messages on inserting incompatible columns to a frame (:issue:`4107`) +- Fixed bug in ``DataFrame.replace`` where a nested dict wasn't being + iterated over when regex=False (:issue:`4115`) +- Fixed bug in ``convert_objects(convert_numeric=True)`` where a mixed numeric and + object Series/Frame was not converting properly (:issue:`4119`) +- Fixed bugs in multi-index selection with column multi-index and duplicates + (:issue:`4145`, :issue:`4146`) +- Fixed bug in the parsing of microseconds when using the ``format`` + argument in ``to_datetime`` (:issue:`4152`) +- Fixed bug in ``PandasAutoDateLocator`` where ``invert_xaxis`` triggered + incorrectly ``MilliSecondLocator`` (:issue:`3990`) +- Fixed bug in ``Series.where`` where broadcasting a single element input vector + to the length of the series resulted in multiplying the value + inside the input (:issue:`4192`) +- Fixed bug in plotting that wasn't raising on invalid colormap for + matplotlib 1.1.1 (:issue:`4215`) +- Fixed the legend displaying in ``DataFrame.plot(kind='kde')`` (:issue:`4216`) +- Fixed bug where Index slices weren't carrying the name attribute + (:issue:`4226`) +- Fixed bug in initializing ``DatetimeIndex`` with an array of strings + in a certain time zone (:issue:`4229`) +- Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`) +- Fixed bug where 
get_data_famafrench wasn't using the correct file edges + (:issue:`4281`) + +pandas 0.11.0 +------------- + +**Release date:** 2013-04-22 + +New Features +~~~~~~~~~~~~ + +- New documentation section, ``10 Minutes to Pandas`` +- New documentation section, ``Cookbook`` +- Allow mixed dtypes (e.g ``float32/float64/int32/int16/int8``) to coexist in DataFrames and propogate in operations +- Add function to pandas.io.data for retrieving stock index components from Yahoo! finance (:issue:`2795`) +- Support slicing with time objects (:issue:`2681`) +- Added ``.iloc`` attribute, to support strict integer based indexing, analogous to ``.ix`` (:issue:`2922`) +- Added ``.loc`` attribute, to support strict label based indexing, analagous to ``.ix`` (:issue:`3053`) +- Added ``.iat`` attribute, to support fast scalar access via integers (replaces ``iget_value/iset_value``) +- Added ``.at`` attribute, to support fast scalar access via labels (replaces ``get_value/set_value``) +- Moved functionaility from ``irow,icol,iget_value/iset_value`` to ``.iloc`` indexer (via ``_ixs`` methods in each object) +- Added support for expression evaluation using the ``numexpr`` library +- Added ``convert=boolean`` to ``take`` routines to translate negative indices to positive, defaults to True +- Added to_series() method to indices, to facilitate the creation of indexeres (:issue:`3275`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improved performance of df.to_csv() by up to 10x in some cases. (:issue:`3059`) +- added ``blocks`` attribute to DataFrames, to return a dict of dtypes to homogeneously dtyped DataFrames +- added keyword ``convert_numeric`` to ``convert_objects()`` to try to convert object dtypes to numeric types (default is False) +- ``convert_dates`` in ``convert_objects`` can now be ``coerce`` which will return + a datetime64[ns] dtype with non-convertibles set as ``NaT``; will preserve an all-nan object + (e.g. strings), default is True (to perform soft-conversion +- Series print output now includes the dtype by default +- Optimize internal reindexing routines (:issue:`2819`, :issue:`2867`) +- ``describe_option()`` now reports the default and current value of options. +- Add ``format`` option to ``pandas.to_datetime`` with faster conversion of strings that can be parsed with datetime.strptime +- Add ``axes`` property to ``Series`` for compatibility +- Add ``xs`` function to ``Series`` for compatibility +- Allow setitem in a frame where only mixed numerics are present (e.g. int and float), (:issue:`3037`) +- ``HDFStore`` + + - Provide dotted attribute access to ``get`` from stores (e.g. store.df == store['df']) + - New keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are provided to support iteration on ``select`` and ``select_as_multiple`` (:issue:`3076`) + - support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv`` (:issue:`3222`) + +- Add ``squeeze`` method to possibly remove length 1 dimensions from an object. + + .. ipython:: python + + p = Panel(randn(3,4,4),items=['ItemA','ItemB','ItemC'], + major_axis=date_range('20010102',periods=4), + minor_axis=['A','B','C','D']) + p + p.reindex(items=['ItemA']).squeeze() + p.reindex(items=['ItemA'],minor=['B']).squeeze() + +- Improvement to Yahoo API access in ``pd.io.data.Options`` (:issue:`2758`) +- added option `display.max_seq_items` to control the number of elements printed per sequence pprinting it. (:issue:`2979`) +- added option `display.chop_threshold` to control display of small numerical values. 
(:issue:`2739`) +- added option `display.max_info_rows` to prevent verbose_info from being + calculated for frames above 1M rows (configurable). (:issue:`2807`, :issue:`2918`) +- value_counts() now accepts a "normalize" argument, for normalized histograms. (:issue:`2710`). +- DataFrame.from_records now accepts not only dicts but any instance of the collections.Mapping ABC. +- Allow selection semantics via a string with a datelike index to work in both Series and DataFrames (:issue:`3070`) + + .. ipython:: python + + idx = date_range("2001-10-1", periods=5, freq='M') + ts = Series(np.random.rand(len(idx)),index=idx) + ts['2001'] + + df = DataFrame(dict(A = ts)) + df['2001'] + +- added option `display.mpl_style` providing a sleeker visual style for plots. Based on https://gist.github.com/huyng/816622 (:issue:`3075`). +- Improved performance across several core functions by taking memory ordering of + arrays into account. Courtesy of @stephenwlin (:issue:`3130`) +- Improved performance of groupby transform method (:issue:`2121`) +- Handle "ragged" CSV files missing trailing delimiters in rows with missing fields + when also providing explicit list of column names (so the parser knows how many columns to expect in the result) (:issue:`2981`) +- On a mixed DataFrame, allow setting with indexers with ndarray/DataFrame on rhs (:issue:`3216`) +- Treat boolean values as integers (values 1 and 0) for numeric operations. (:issue:`2641`) +- Add ``time`` method to DatetimeIndex (:issue:`3180`) +- Return NA when using Series.str[...] for values that are not long enough (:issue:`3223`) +- Display cursor coordinate information in time-series plots (:issue:`1670`) +- to_html() now accepts an optional "escape" argument to control reserved HTML character + escaping (enabled by default) and escapes ``&``, in addition to ``<`` and ``>``. (:issue:`2919`) + +API Changes +~~~~~~~~~~~ + +- Do not automatically upcast numeric specified dtypes to ``int64`` or + ``float64`` (:issue:`622` and :issue:`797`) +- DataFrame construction of lists and scalars, with no dtype present, will + result in casting to ``int64`` or ``float64``, regardless of platform. + This is not an apparent change in the API, but noting it. +- Guarantee that ``convert_objects()`` for Series/DataFrame always returns a + copy +- groupby operations will respect dtypes for numeric float operations + (float32/float64); other types will be operated on, and will try to cast + back to the input dtype (e.g. if an int is passed, as long as the output + doesn't have nans, then an int will be returned) +- backfill/pad/take/diff/ohlc will now support ``float32/int16/int8`` + operations +- Block types will upcast as needed in where/masking operations (:issue:`2793`) +- Series now automatically will try to set the correct dtype based on passed + datetimelike objects (datetime/Timestamp) + + - timedelta64 are returned in appropriate cases (e.g. 
Series - Series, + when both are datetime64) + - mixed datetimes and objects (:issue:`2751`) in a constructor will be cast + correctly + - astype on datetimes to object are now handled (as well as NaT + conversions to np.nan) + - all timedelta like objects will be correctly assigned to ``timedelta64`` + with mixed ``NaN`` and/or ``NaT`` allowed + +- arguments to DataFrame.clip were inconsistent to numpy and Series clipping + (:issue:`2747`) +- util.testing.assert_frame_equal now checks the column and index names (:issue:`2964`) +- Constructors will now return a more informative ValueError on failures + when invalid shapes are passed +- Don't suppress TypeError in GroupBy.agg (:issue:`3238`) +- Methods return None when inplace=True (:issue:`1893`) +- ``HDFStore`` + + - added the method ``select_column`` to select a single column from a table as a Series. + - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()`` + - ``min_itemsize`` parameter will now automatically create data_columns for passed keys + +- Downcast on pivot if possible (:issue:`3283`), adds argument ``downcast`` to ``fillna`` +- Introduced options `display.height/width` for explicitly specifying terminal + height/width in characters. Deprecated display.line_width, now replaced by display.width. + These defaults are in effect for scripts as well, so unless disabled, previously + very wide output will now be output as "expand_repr" style wrapped output. +- Various defaults for options (including display.max_rows) have been revised, + after a brief survey concluded they were wrong for everyone. Now at w=80,h=60. +- HTML repr output in IPython qtconsole is once again controlled by the option + `display.notebook_repr_html`, and on by default. + +Bug Fixes +~~~~~~~~~ + +- Fix seg fault on empty data frame when fillna with ``pad`` or ``backfill`` + (:issue:`2778`) +- Single element ndarrays of datetimelike objects are handled + (e.g. np.array(datetime(2001,1,1,0,0))), w/o dtype being passed +- 0-dim ndarrays with a passed dtype are handled correctly + (e.g. 
np.array(0.,dtype='float32')) +- Fix some boolean indexing inconsistencies in Series.__getitem__/__setitem__ + (:issue:`2776`) +- Fix issues with DataFrame and Series constructor with integers that + overflow ``int64`` and some mixed typed type lists (:issue:`2845`) + +- ``HDFStore`` + + - Fix weird PyTables error when using too many selectors in a where + also correctly filter on any number of values in a Term expression + (so not using numexpr filtering, but isin filtering) + - Internally, change all variables to be private-like (now have leading + underscore) + - Fixes for query parsing to correctly interpret boolean and != (:issue:`2849`, :issue:`2973`) + - Fixes for pathological case on SparseSeries with 0-len array and + compression (:issue:`2931`) + - Fixes bug with writing rows if part of a block was all-nan (:issue:`3012`) + - Exceptions are now ValueError or TypeError as needed + - A table will now raise if min_itemsize contains fields which are not queryables + +- Bug showing up in applymap where some object type columns are converted (:issue:`2909`) + had an incorrect default in convert_objects + +- TimeDeltas + + - Series ops with a Timestamp on the rhs was throwing an exception (:issue:`2898`) + added tests for Series ops with datetimes,timedeltas,Timestamps, and datelike + Series on both lhs and rhs + - Fixed subtle timedelta64 inference issue on py3 & numpy 1.7.0 (:issue:`3094`) + - Fixed some formatting issues on timedelta when negative + - Support null checking on timedelta64, representing (and formatting) with NaT + - Support setitem with np.nan value, converts to NaT + - Support min/max ops in a Dataframe (abs not working, nor do we error on non-supported ops) + - Support idxmin/idxmax/abs/max/min in a Series (:issue:`2989`, :issue:`2982`) + +- Bug on in-place putmasking on an ``integer`` series that needs to be converted to + ``float`` (:issue:`2746`) +- Bug in argsort of ``datetime64[ns]`` Series with ``NaT`` (:issue:`2967`) +- Bug in value_counts of ``datetime64[ns]`` Series (:issue:`3002`) +- Fixed printing of ``NaT`` in an index +- Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (:issue:`2982`) +- Bug in ``icol, take`` with negative indicies was producing incorrect return + values (see :issue:`2922`, :issue:`2892`), also check for out-of-bounds indices (:issue:`3029`) +- Bug in DataFrame column insertion when the column creation fails, existing frame is left in + an irrecoverable state (:issue:`3010`) +- Bug in DataFrame update, combine_first where non-specified values could cause + dtype changes (:issue:`3016`, :issue:`3041`) +- Bug in groupby with first/last where dtypes could change (:issue:`3041`, :issue:`2763`) +- Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from + other values), (:issue:`2850`) +- Unstack of a frame with no nans would always cause dtype upcasting (:issue:`2929`) +- Fix scalar datetime.datetime parsing bug in read_csv (:issue:`3071`) +- Fixed slow printing of large Dataframes, due to inefficient dtype + reporting (:issue:`2807`) +- Fixed a segfault when using a function as grouper in groupby (:issue:`3035`) +- Fix pretty-printing of infinite data structures (closes :issue:`2978`) +- Fixed exception when plotting timeseries bearing a timezone (closes :issue:`2877`) +- str.contains ignored na argument (:issue:`2806`) +- Substitute warning for segfault when grouping with categorical grouper + of mismatched length (:issue:`3011`) +- Fix exception in SparseSeries.density (:issue:`2083`) +- Fix 
upsampling bug with closed='left' and daily to daily data (:issue:`3020`) +- Fixed missing tick bars on scatter_matrix plot (:issue:`3063`) +- Fixed bug in Timestamp(d,tz=foo) when d is date() rather than datetime() (:issue:`2993`) +- series.plot(kind='bar') now respects pylab color scheme (:issue:`3115`) +- Fixed bug in reshape if not passed correct input, now raises TypeError (:issue:`2719`) +- Fixed a bug where Series ctor did not respect ordering if OrderedDict passed in (:issue:`3282`) +- Fix NameError issue on RESO_US (:issue:`2787`) +- Allow selection in an *unordered* timeseries to work similarly + to an *ordered* timeseries (:issue:`2437`). +- Fix ``.xs`` when called with ``axes=1`` and a level parameter (:issue:`2903`) +- Timestamp now supports the class method fromordinal similar to datetimes (:issue:`3042`); see the short sketch below +- Fix issue with indexing a series with a boolean key and specifying a 1-len list on the rhs (:issue:`2745`) + or a list on the rhs (:issue:`3235`) +- Fixed bug in groupby apply when the kernel generates lists of arrays of unequal length (:issue:`1738`) +- Fixed handling of rolling_corr with center=True which could produce corr>1 (:issue:`3155`) +- Fixed issues where indices can be passed as 'index/column' in addition to 0/1 for the axis parameter +- PeriodIndex.tolist now boxes to Period (:issue:`3178`) +- PeriodIndex.get_loc KeyError now reports Period instead of ordinal (:issue:`3179`) +- df.to_records bug when handling MultiIndex (:issue:`3189`) +- Fix Series.__getitem__ segfault when index less than -length (:issue:`3168`) +- Fix bug when using Timestamp as a date parser (:issue:`2932`) +- Fix bug creating date range from Timestamp with time zone and passing same + time zone (:issue:`2926`) +- Add comparison operators to Period object (:issue:`2781`) +- Fix bug when concatenating two Series into a DataFrame when they have the + same name (:issue:`2797`) +- Fix automatic color cycling when plotting consecutive timeseries + without color arguments (:issue:`2816`) +- Fixed bug in the pickling of PeriodIndex (:issue:`2891`) +- Upcast/split blocks when needed in a mixed DataFrame when setitem + with an indexer (:issue:`3216`) +- Invoking df.applymap on a dataframe with dupe cols now raises a ValueError (:issue:`2786`) +- Apply with invalid returned indices raises the correct Exception (:issue:`2808`) +- Fixed a bug in plotting log-scale bar plots (:issue:`3247`) +- df.plot() grid on/off now obeys the mpl default style, just like + series.plot(). (:issue:`3233`)
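+- For illustration only, a minimal sketch of the ``Timestamp.fromordinal``
+  class method mentioned above (the date used here is arbitrary)::
+
+      from datetime import datetime
+      import pandas as pd
+
+      ordinal = datetime(2013, 4, 1).toordinal()  # proleptic Gregorian ordinal
+      ts = pd.Timestamp.fromordinal(ordinal)      # Timestamp('2013-04-01 00:00:00')
+      assert ts == pd.Timestamp(datetime.fromordinal(ordinal))
+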
+- Fixed a bug in the legend of plotting.andrews_curves() (:issue:`3278`) +- Produce a series on apply if we only generate a singular series and have + a simple index (:issue:`2893`) +- Fix Python ascii file parsing when integer falls outside of floating point + spacing (:issue:`3258`) +- Fixed pretty printing of sets (:issue:`3294`) +- Panel() and Panel.from_dict() now respect ordering when given an OrderedDict (:issue:`3303`) +- DataFrame ``where`` with a datetimelike was incorrectly selecting (:issue:`3311`) +- Ensure index casts work even in Int64Index +- Fix set_index segfault when passing MultiIndex (:issue:`3308`) +- Ensure pickles created in py2 can be read in py3 +- Insert ellipsis in MultiIndex summary repr (:issue:`3348`) +- Groupby will handle mutation among an input group's columns (and fall back + to non-fast apply) (:issue:`3380`) +- Eliminated unicode errors on FreeBSD when using MPL GTK backend (:issue:`3360`) +- Period.strftime should always return unicode strings (:issue:`3363`) +- Respect passed read_* chunksize in get_chunk function (:issue:`3406`) + +pandas 0.10.1 +------------- + +**Release date:** 2013-01-22 + +New Features +~~~~~~~~~~~~ + +- Add data interface to World Bank WDI pandas.io.wb (:issue:`2592`) + +API Changes +~~~~~~~~~~~ + +- Restored inplace=True behavior returning self (same object) with + deprecation warning until 0.11 (:issue:`1893`) +- ``HDFStore`` + + - refactored HDFStore to deal with non-table stores as objects, will allow future enhancements + - removed keyword ``compression`` from ``put`` (replaced by keyword + ``complib`` to be consistent across library) + - warn `PerformanceWarning` if you are attempting to store types that will be pickled by PyTables + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``HDFStore`` + + - enables storing of multi-index dataframes (closes :issue:`1277`) + - support data column indexing and selection, via ``data_columns`` keyword + in append + - support write chunking to reduce memory footprint, via ``chunksize`` + keyword to append + - support automagic indexing via ``index`` keyword to append + - support ``expectedrows`` keyword in append to inform ``PyTables`` about + the expected tablesize + - support ``start`` and ``stop`` keywords in select to limit the row + selection space + - added ``get_store`` context manager to automatically import with pandas + - added column filtering via ``columns`` keyword in select + - added methods append_to_multiple/select_as_multiple/select_as_coordinates + to do multiple-table append/selection + - added support for datetime64 in columns + - added method ``unique`` to select the unique values in an indexable or + data column + - added method ``copy`` to copy an existing store (and possibly upgrade) + - show the shape of the data on disk for non-table stores when printing the + store + - added ability to read PyTables flavor tables (allows compatibility with + other HDF5 systems) + +- Add ``logx`` option to DataFrame/Series.plot (:issue:`2327`, :issue:`2565`) +- Support reading gzipped data from file-like object +- ``pivot_table`` aggfunc can be anything used in GroupBy.aggregate (:issue:`2643`) +- Implement DataFrame merges in case where set cardinalities might overflow + 64-bit integer (:issue:`2690`) +- Raise exception in C file parser if an integer dtype is specified and NA + values are present. 
(:issue:`2631`) +- Attempt to parse ISO8601 format dates when parse_dates=True in read_csv for + major performance boost in such cases (:issue:`2698`) +- Add methods ``neg`` and ``inv`` to Series +- Implement ``kind`` option in ``ExcelFile`` to indicate whether it's an XLS + or XLSX file (:issue:`2613`) +- Documented a fast-path in pd.read_Csv when parsing iso8601 datetime strings + yielding as much as a 20x speedup. (:issue:`5993`) + + +Bug Fixes +~~~~~~~~~ + +- Fix read_csv/read_table multithreading issues (:issue:`2608`) +- ``HDFStore`` + + - correctly handle ``nan`` elements in string columns; serialize via the + ``nan_rep`` keyword to append + - raise correctly on non-implemented column types (unicode/date) + - handle correctly ``Term`` passed types (e.g. ``index<1000``, when index + is ``Int64``), (closes :issue:`512`) + - handle Timestamp correctly in data_columns (closes :issue:`2637`) + - contains correctly matches on non-natural names + - correctly store ``float32`` dtypes in tables (if not other float types in + the same table) + +- Fix DataFrame.info bug with UTF8-encoded columns. (:issue:`2576`) +- Fix DatetimeIndex handling of FixedOffset tz (:issue:`2604`) +- More robust detection of being in IPython session for wide DataFrame + console formatting (:issue:`2585`) +- Fix platform issues with ``file:///`` in unit test (:issue:`2564`) +- Fix bug and possible segfault when grouping by hierarchical level that + contains NA values (:issue:`2616`) +- Ensure that MultiIndex tuples can be constructed with NAs (:issue:`2616`) +- Fix int64 overflow issue when unstacking MultiIndex with many levels + (:issue:`2616`) +- Exclude non-numeric data from DataFrame.quantile by default (:issue:`2625`) +- Fix a Cython C int64 boxing issue causing read_csv to return incorrect + results (:issue:`2599`) +- Fix groupby summing performance issue on boolean data (:issue:`2692`) +- Don't bork Series containing datetime64 values with to_datetime (:issue:`2699`) +- Fix DataFrame.from_records corner case when passed columns, index column, + but empty record list (:issue:`2633`) +- Fix C parser-tokenizer bug with trailing fields. (:issue:`2668`) +- Don't exclude non-numeric data from GroupBy.max/min (:issue:`2700`) +- Don't lose time zone when calling DatetimeIndex.drop (:issue:`2621`) +- Fix setitem on a Series with a boolean key and a non-scalar as value + (:issue:`2686`) +- Box datetime64 values in Series.apply/map (:issue:`2627`, :issue:`2689`) +- Upconvert datetime + datetime64 values when concatenating frames (:issue:`2624`) +- Raise a more helpful error message in merge operations when one DataFrame + has duplicate columns (:issue:`2649`) +- Fix partial date parsing issue occuring only when code is run at EOM + (:issue:`2618`) +- Prevent MemoryError when using counting sort in sortlevel with + high-cardinality MultiIndex objects (:issue:`2684`) +- Fix Period resampling bug when all values fall into a single bin (:issue:`2070`) +- Fix buggy interaction with usecols argument in read_csv when there is an + implicit first index column (:issue:`2654`) +- Fix bug in ``Index.summary()`` where string format methods were being called incorrectly. + (:issue:`3869`) + +pandas 0.10.0 +------------- + +**Release date:** 2012-12-17 + +New Features +~~~~~~~~~~~~ + +- Brand new high-performance delimited file parsing engine written in C and + Cython. 50% or better performance in many standard use cases with a + fraction as much memory usage. 
(:issue:`407`, :issue:`821`) +- Many new file parser (read_csv, read_table) features: + + - Support for on-the-fly gzip or bz2 decompression (`compression` option) + - Ability to get back numpy.recarray instead of DataFrame + (`as_recarray=True`) + - `dtype` option: explicit column dtypes + - `usecols` option: specify list of columns to be read from a file. Good + for reading very wide files with many irrelevant columns (:issue:`1216`, :issue:`926`, :issue:`2465`) + - Enhanced unicode decoding support via `encoding` option + - `skipinitialspace` dialect option + - Can specify strings to be recognized as True (`true_values`) or False + (`false_values`) + - High-performance `delim_whitespace` option for whitespace-delimited + files; a preferred alternative to the '\s+' regular expression delimiter + - Option to skip "bad" lines (wrong number of fields) that would otherwise + have caused an error in the past (`error_bad_lines` and `warn_bad_lines` + options) + - Substantially improved performance in the parsing of integers with + thousands markers and lines with comments + - Easier parsing of European (and other) decimal formats (`decimal` option) (:issue:`584`, :issue:`2466`) + - Custom line terminators (e.g. lineterminator='~') (:issue:`2457`) + - Handling of no trailing commas in CSV files (:issue:`2333`) + - Ability to handle fractional seconds in date_converters (:issue:`2209`) + - read_csv allows a scalar arg to na_values (:issue:`1944`) + - Explicit column dtype specification in read_* functions (:issue:`1858`) + - Easier CSV dialect specification (:issue:`1743`) + - Improve parser performance when handling special characters (:issue:`1204`) + +- Google Analytics API integration with easy oauth2 workflow (:issue:`2283`) +- Add error handling to Series.str.encode/decode (:issue:`2276`) +- Add ``where`` and ``mask`` to Series (:issue:`2337`) +- Grouped histogram via `by` keyword in Series/DataFrame.hist (:issue:`2186`) +- Support optional ``min_periods`` keyword in ``corr`` and ``cov`` + for both Series and DataFrame (:issue:`2002`) +- Add ``duplicated`` and ``drop_duplicates`` functions to Series (:issue:`1923`) +- Add docs for ``HDFStore table`` format +- 'density' property in `SparseSeries` (:issue:`2384`) +- Add ``ffill`` and ``bfill`` convenience functions for forward- and + backfilling time series data (:issue:`2284`) +- New option configuration system and functions `set_option`, `get_option`, + `describe_option`, and `reset_option`. Deprecate `set_printoptions` and + `reset_printoptions` (:issue:`2393`). + You can also access options as attributes via ``pandas.options.X`` +- Wide DataFrames can be viewed more easily in the console with new + `expand_frame_repr` and `line_width` configuration options. This is on by + default now (:issue:`2436`) +- Scikits.timeseries-like moving window functions via ``rolling_window`` (:issue:`1270`) + +Experimental Features +~~~~~~~~~~~~~~~~~~~~~ + +- Add support for Panel4D, a named 4-dimensional structure +- Add support for ndpanel factory functions, to create custom, + domain-specific N-Dimensional containers + +API Changes +~~~~~~~~~~~ + +- The default binning/labeling behavior for ``resample`` has been changed to + `closed='left', label='left'` for daily and lower frequencies. This had + been a large source of confusion for users. See "what's new" page for more + on this. (:issue:`2410`)
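+- A minimal sketch of the new ``resample`` defaults described above (the sample
+  data is illustrative only; the two calls below are now equivalent)::
+
+      import numpy as np
+      import pandas as pd
+
+      rng = pd.date_range('2012-12-01', periods=48, freq='H')
+      ts = pd.Series(np.arange(48), index=rng)
+
+      ts.resample('D', how='sum')
+      ts.resample('D', how='sum', closed='left', label='left')  # explicit form
+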
+- Methods with ``inplace`` option now return None instead of the calling + (modified) object (:issue:`1893`) +- The special case DataFrame - TimeSeries doing column-by-column broadcasting + has been deprecated. Users should explicitly do e.g. df.sub(ts, axis=0) + instead. This is a legacy hack and can lead to subtle bugs. +- inf/-inf are no longer considered as NA by isnull/notnull. To be clear, this + is legacy cruft from early pandas. This behavior can be globally re-enabled + using the new option ``mode.use_inf_as_null`` (:issue:`2050`, :issue:`1919`) +- ``pandas.merge`` will now default to ``sort=False``. For many use cases + sorting the join keys is not necessary, and doing it by default is wasteful +- Specify ``header=0`` explicitly to replace existing column names in a file in + read_* functions. +- Default column names for header-less parsed files (yielded by read_csv, + etc.) are now the integers 0, 1, .... A new argument `prefix` has been + added; to get the v0.9.x behavior specify ``prefix='X'`` (:issue:`2034`). This API + change was made to make the default column names more consistent with the + DataFrame constructor's default column names when none are specified. +- DataFrame selection using a boolean frame now preserves input shape +- If function passed to Series.apply yields a Series, result will be a + DataFrame (:issue:`2316`) +- Values like YES/NO/yes/no will not be considered as boolean by default any + longer in the file parsers. This can be customized using the new + ``true_values`` and ``false_values`` options (:issue:`2360`) +- `obj.fillna()` with no arguments is no longer valid; `method='pad'` is no longer the + default option, so be explicit about what kind of filling to + perform. Add `ffill/bfill` convenience functions per above (:issue:`2284`) +- `HDFStore.keys()` now returns an absolute path-name for each key +- `to_string()` now always returns a unicode string. (:issue:`2224`) +- File parsers will not handle NA sentinel values arising from passed + converter functions + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Add ``nrows`` option to DataFrame.from_records for iterators (:issue:`1794`) +- Unstack/reshape algorithm rewrite to avoid high memory use in cases where + the number of observed key-tuples is much smaller than the total possible + number that could occur (:issue:`2278`). Also improves performance in most cases. +- Support duplicate columns in DataFrame.from_records (:issue:`2179`) +- Add ``normalize`` option to Series/DataFrame.asfreq (:issue:`2137`) +- SparseSeries and SparseDataFrame construction from empty and scalar + values now no longer create dense ndarrays unnecessarily (:issue:`2322`) +- ``HDFStore`` now supports hierarchical keys (:issue:`2397`) +- Support multiple query selection formats for ``HDFStore tables`` (:issue:`1996`) +- Support ``del store['df']`` syntax to delete HDFStores +- Add multi-dtype support for ``HDFStore tables`` +- ``min_itemsize`` parameter can be specified in ``HDFStore table`` creation +- Indexing support in ``HDFStore tables`` (:issue:`698`) +- Add `line_terminator` option to DataFrame.to_csv (:issue:`2383`) +- Added implementation of str(x)/unicode(x)/bytes(x) to major pandas data + structures, which should do the right thing on both py2.x and py3.x. 
(:issue:`2224`) +- Reduce groupby.apply overhead substantially by low-level manipulation of + internal NumPy arrays in DataFrames (:issue:`535`) +- Implement ``value_vars`` in ``melt`` and add ``melt`` to pandas namespace + (:issue:`2412`) +- Added boolean comparison operators to Panel +- Enable ``Series.str.strip/lstrip/rstrip`` methods to take an argument (:issue:`2411`) +- The DataFrame ctor now respects column ordering when given + an OrderedDict (:issue:`2455`) +- Assigning DatetimeIndex to Series changes the class to TimeSeries (:issue:`2139`) +- Improve performance of .value_counts method on non-integer data (:issue:`2480`) +- ``get_level_values`` method for MultiIndex return Index instead of ndarray (:issue:`2449`) +- ``convert_to_r_dataframe`` conversion for datetime values (:issue:`2351`) +- Allow ``DataFrame.to_csv`` to represent inf and nan differently (:issue:`2026`) +- Add ``min_i`` argument to ``nancorr`` to specify minimum required observations (:issue:`2002`) +- Add ``inplace`` option to ``sortlevel`` / ``sort`` functions on DataFrame (:issue:`1873`) +- Enable DataFrame to accept scalar constructor values like Series (:issue:`1856`) +- DataFrame.from_records now takes optional ``size`` parameter (:issue:`1794`) +- include iris dataset (:issue:`1709`) +- No datetime64 DataFrame column conversion of datetime.datetime with tzinfo (:issue:`1581`) +- Micro-optimizations in DataFrame for tracking state of internal consolidation (:issue:`217`) +- Format parameter in DataFrame.to_csv (:issue:`1525`) +- Partial string slicing for ``DatetimeIndex`` for daily and higher frequencies (:issue:`2306`) +- Implement ``col_space`` parameter in ``to_html`` and ``to_string`` in DataFrame (:issue:`1000`) +- Override ``Series.tolist`` and box datetime64 types (:issue:`2447`) +- Optimize ``unstack`` memory usage by compressing indices (:issue:`2278`) +- Fix HTML repr in IPython qtconsole if opening window is small (:issue:`2275`) +- Escape more special characters in console output (:issue:`2492`) +- df.select now invokes bool on the result of crit(x) (:issue:`2487`) + +Bug Fixes +~~~~~~~~~ + +- Fix major performance regression in DataFrame.iteritems (:issue:`2273`) +- Fixes bug when negative period passed to Series/DataFrame.diff (:issue:`2266`) +- Escape tabs in console output to avoid alignment issues (:issue:`2038`) +- Properly box datetime64 values when retrieving cross-section from + mixed-dtype DataFrame (:issue:`2272`) +- Fix concatenation bug leading to :issue:`2057`, :issue:`2257` +- Fix regression in Index console formatting (:issue:`2319`) +- Box Period data when assigning PeriodIndex to frame column (:issue:`2243`, :issue:`2281`) +- Raise exception on calling reset_index on Series with inplace=True (:issue:`2277`) +- Enable setting multiple columns in DataFrame with hierarchical columns + (:issue:`2295`) +- Respect dtype=object in DataFrame constructor (:issue:`2291`) +- Fix DatetimeIndex.join bug with tz-aware indexes and how='outer' (:issue:`2317`) +- pop(...) 
and del work with DataFrame with duplicate columns (:issue:`2349`) +- Treat empty strings as NA in date parsing (rather than let dateutil do + something weird) (:issue:`2263`) +- Prevent uint64 -> int64 overflows (:issue:`2355`) +- Enable joins between MultiIndex and regular Index (:issue:`2024`) +- Fix time zone metadata issue when unioning non-overlapping DatetimeIndex + objects (:issue:`2367`) +- Raise/handle int64 overflows in parsers (:issue:`2247`) +- Deleting consecutive rows in ``HDFStore tables`` is much faster than before +- Appending on a HDFStore would fail if the table was not first created via ``put`` +- Use `col_space` argument as minimum column width in DataFrame.to_html (:issue:`2328`) +- Fix tz-aware DatetimeIndex.to_period (:issue:`2232`) +- Fix DataFrame row indexing case with MultiIndex (:issue:`2314`) +- Fix to_excel exporting issues with Timestamp objects in index (:issue:`2294`) +- Fixes assigning scalars and array to hierarchical column chunk (:issue:`1803`) +- Fixed a UnicodeDecodeError with series tidy_repr (:issue:`2225`) +- Fixed issues with duplicate keys in an index (:issue:`2347`, :issue:`2380`) +- Fixed issues re: Hash randomization, default on starting w/ py3.3 (:issue:`2331`) +- Fixed issue with missing attributes after loading a pickled dataframe (:issue:`2431`) +- Fix Timestamp formatting with tzoffset time zone in dateutil 2.1 (:issue:`2443`) +- Fix GroupBy.apply issue when using BinGrouper to do ts binning (:issue:`2300`) +- Fix issues resulting from datetime.datetime columns being converted to + datetime64 when calling DataFrame.apply. (:issue:`2374`) +- Raise exception when calling to_panel on non uniquely-indexed frame (:issue:`2441`) +- Improved detection of console encoding on IPython zmq frontends (:issue:`2458`) +- Preserve time zone when .append-ing two time series (:issue:`2260`) +- Box timestamps when calling reset_index on time-zone-aware index rather + than creating a tz-less datetime64 column (:issue:`2262`) +- Enable searching non-string columns in DataFrame.filter(like=...) (:issue:`2467`)
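+- A minimal sketch of ``DataFrame.filter(like=...)`` on a frame whose columns
+  include non-string labels, as referenced above (the data is illustrative only)::
+
+      import pandas as pd
+
+      df = pd.DataFrame([[1.0, 2.0, 3]], columns=[1, 'price_open', 'price_close'])
+      df.filter(like='price')  # selects 'price_open' and 'price_close'
+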
+- Fixed issue with losing nanosecond precision upon conversion to DatetimeIndex (:issue:`2252`) +- Handle timezones in Datetime.normalize (:issue:`2338`) +- Fix test case where dtype specification with endianness causes + failures on big endian machines (:issue:`2318`) +- Fix plotting bug where upsampling causes data to appear shifted in time (:issue:`2448`) +- Fix ``read_csv`` failure for UTF-16 with BOM and skiprows (:issue:`2298`) +- read_csv with names arg not implicitly setting header=None (:issue:`2459`) +- Unrecognized compression mode causes segfault in read_csv (:issue:`2474`) +- In read_csv, header=0 and passed names should discard first row (:issue:`2269`) +- Correctly route to stdout/stderr in read_table (:issue:`2071`) +- Fix exception when Timestamp.to_datetime is called on a Timestamp with tzoffset (:issue:`2471`) +- Fixed unintentional conversion of datetime64 to long in groupby.first() (:issue:`2133`) +- Union of empty DataFrames now returns empty with concatenated index (:issue:`2307`) +- DataFrame.sort_index raises more helpful exception if sorting by column + with duplicates (:issue:`2488`) +- DataFrame.to_string formatters can be list, too (:issue:`2520`) +- DataFrame.combine_first will always result in the union of the index and + columns, even if one DataFrame is length-zero (:issue:`2525`) +- Fix several DataFrame.icol/irow with duplicate indices issues (:issue:`2228`, :issue:`2259`) +- Use Series names for column names when using concat with axis=1 (:issue:`2489`) +- Raise Exception if start, end, periods all passed to date_range (:issue:`2538`) +- Fix Panel resampling issue (:issue:`2537`) + +pandas 0.9.1 +------------ + +**Release date:** 2012-11-14 + +New Features +~~~~~~~~~~~~ + +- Can specify multiple sort orders in DataFrame/Series.sort/sort_index (:issue:`928`) +- New `top` and `bottom` options for handling NAs in rank (:issue:`1508`, :issue:`2159`) +- Add `where` and `mask` functions to DataFrame (:issue:`2109`, :issue:`2151`) +- Add `at_time` and `between_time` functions to DataFrame (:issue:`2149`) +- Add flexible `pow` and `rpow` methods to DataFrame (:issue:`2190`) + +API Changes +~~~~~~~~~~~ + +- Upsampling period index "spans" intervals. Example: annual periods + upsampled to monthly will span all months in each year +- Period.end_time will yield timestamp at last nanosecond in the interval + (:issue:`2124`, :issue:`2125`, :issue:`1764`) +- File parsers no longer coerce to float or bool for columns that have custom + converters specified (:issue:`2184`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Time rule inference for week-of-month (e.g. WOM-2FRI) rules (:issue:`2140`) +- Improve performance of datetime + business day offset with large number of + offset periods +- Improve HTML display of DataFrame objects with hierarchical columns +- Enable referencing of Excel columns by their column names (:issue:`1936`) +- DataFrame.dot can accept ndarrays (:issue:`2042`) +- Support negative periods in Panel.shift (:issue:`2164`) +- Make .drop(...) 
work with non-unique indexes (:issue:`2101`) +- Improve performance of Series/DataFrame.diff (re: :issue:`2087`) +- Support unary ~ (__invert__) in DataFrame (:issue:`2110`) +- Turn off pandas-style tick locators and formatters (:issue:`2205`) +- DataFrame[DataFrame] uses DataFrame.where to compute masked frame (:issue:`2230`) + +Bug Fixes +~~~~~~~~~ + +- Fix some duplicate-column DataFrame constructor issues (:issue:`2079`) +- Fix bar plot color cycle issues (:issue:`2082`) +- Fix off-center grid for stacked bar plots (:issue:`2157`) +- Fix plotting bug if inferred frequency is offset with N > 1 (:issue:`2126`) +- Implement comparisons on date offsets with fixed delta (:issue:`2078`) +- Handle inf/-inf correctly in read_* parser functions (:issue:`2041`) +- Fix matplotlib unicode interaction bug +- Make WLS r-squared match statsmodels 0.5.0 fixed value +- Fix zero-trimming DataFrame formatting bug +- Correctly compute/box datetime64 min/max values from Series.min/max (:issue:`2083`) +- Fix unstacking edge case with unrepresented groups (:issue:`2100`) +- Fix Series.str failures when using pipe pattern '|' (:issue:`2119`) +- Fix pretty-printing of dict entries in Series, DataFrame (:issue:`2144`) +- Cast other datetime64 values to nanoseconds in DataFrame ctor (:issue:`2095`) +- Alias Timestamp.astimezone to tz_convert, so will yield Timestamp (:issue:`2060`) +- Fix timedelta64 formatting from Series (:issue:`2165`, :issue:`2146`) +- Handle None values gracefully in dict passed to Panel constructor (:issue:`2075`) +- Box datetime64 values as Timestamp objects in Series/DataFrame.iget (:issue:`2148`) +- Fix Timestamp indexing bug in DatetimeIndex.insert (:issue:`2155`) +- Use index name(s) (if any) in DataFrame.to_records (:issue:`2161`) +- Don't lose index names in Panel.to_frame/DataFrame.to_panel (:issue:`2163`) +- Work around length-0 boolean indexing NumPy bug (:issue:`2096`) +- Fix partial integer indexing bug in DataFrame.xs (:issue:`2107`) +- Fix variety of cut/qcut string-bin formatting bugs (:issue:`1978`, :issue:`1979`) +- Raise Exception when xs view not possible of MultiIndex'd DataFrame (:issue:`2117`) +- Fix groupby(...).first() issue with datetime64 (:issue:`2133`) +- Better floating point error robustness in some rolling_* functions + (:issue:`2114`, :issue:`2527`) +- Fix ewma NA handling in the middle of Series (:issue:`2128`) +- Fix numerical precision issues in diff with integer data (:issue:`2087`) +- Fix bug in MultiIndex.__getitem__ with NA values (:issue:`2008`) +- Fix DataFrame.from_records dict-arg bug when passing columns (:issue:`2179`) +- Fix Series and DataFrame.diff for integer dtypes (:issue:`2087`, :issue:`2174`) +- Fix bug when taking intersection of DatetimeIndex with empty index (:issue:`2129`) +- Pass through timezone information when calling DataFrame.align (:issue:`2127`) +- Properly sort when joining on datetime64 values (:issue:`2196`) +- Fix indexing bug in which False/True were being coerced to 0/1 (:issue:`2199`) +- Many unicode formatting fixes (:issue:`2201`) +- Fix improper MultiIndex conversion issue when assigning + e.g. 
DataFrame.index (:issue:`2200`) +- Fix conversion of mixed-type DataFrame to ndarray with dup columns (:issue:`2236`) +- Fix duplicate columns issue (:issue:`2218`, :issue:`2219`) +- Fix SparseSeries.__pow__ issue with NA input (:issue:`2220`) +- Fix icol with integer sequence failure (:issue:`2228`) +- Fixed resampling tz-aware time series issue (:issue:`2245`) +- SparseDataFrame.icol was not returning SparseSeries (:issue:`2227`, :issue:`2229`) +- Enable ExcelWriter to handle PeriodIndex (:issue:`2240`) +- Fix issue constructing DataFrame from empty Series with name (:issue:`2234`) +- Use console-width detection in interactive sessions only (:issue:`1610`) +- Fix parallel_coordinates legend bug with mpl 1.2.0 (:issue:`2237`) +- Make tz_localize work in corner case of empty Series (:issue:`2248`) + +pandas 0.9.0 +------------ + +**Release date:** 10/7/2012 + +New Features +~~~~~~~~~~~~ + +- Add ``str.encode`` and ``str.decode`` to Series (:issue:`1706`) +- Add `to_latex` method to DataFrame (:issue:`1735`) +- Add convenient expanding window equivalents of all rolling_* ops (:issue:`1785`) +- Add Options class to pandas.io.data for fetching options data from Yahoo! + Finance (:issue:`1748`, :issue:`1739`) +- Recognize and convert more boolean values in file parsing (Yes, No, TRUE, + FALSE, variants thereof) (:issue:`1691`, :issue:`1295`) +- Add Panel.update method, analogous to DataFrame.update (:issue:`1999`, :issue:`1988`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Proper handling of NA values in merge operations (:issue:`1990`) +- Add ``flags`` option for ``re.compile`` in some Series.str methods (:issue:`1659`) +- Parsing of UTC date strings in read_* functions (:issue:`1693`) +- Handle generator input to Series (:issue:`1679`) +- Add `na_action='ignore'` to Series.map to quietly propagate NAs (:issue:`1661`) +- Add args/kwds options to Series.apply (:issue:`1829`) +- Add inplace option to Series/DataFrame.reset_index (:issue:`1797`) +- Add ``level`` parameter to ``Series.reset_index`` +- Add quoting option for DataFrame.to_csv (:issue:`1902`) +- Indicate long column value truncation in DataFrame output with ... (:issue:`1854`) +- DataFrame.dot will not do data alignment, and also work with Series (:issue:`1915`) +- Add ``na`` option for missing data handling in some vectorized string + methods (:issue:`1689`) +- If index_label=False in DataFrame.to_csv, do not print fields/commas in the + text output. Results in easier importing into R (:issue:`1583`) +- Can pass tuple/list of axes to DataFrame.dropna to simplify repeated calls + (dropping both columns and rows) (:issue:`924`) +- Improve DataFrame.to_html output for hierarchically-indexed rows (do not + repeat levels) (:issue:`1929`) +- TimeSeries.between_time can now select times across midnight (:issue:`1871`) +- Enable `skip_footer` parameter in `ExcelFile.parse` (:issue:`1843`) + +API Changes +~~~~~~~~~~~ + +- Change default header names in read_* functions to more Pythonic X0, X1, + etc. instead of X.1, X.2. (:issue:`2000`) +- Deprecated ``day_of_year`` API removed from PeriodIndex, use ``dayofyear`` + (:issue:`1723`) +- Don't modify NumPy suppress printoption at import time +- The internal HDF5 data arrangement for DataFrames has been + transposed. 
Legacy files will still be readable by HDFStore (:issue:`1834`, :issue:`1824`) +- Legacy cruft removed: pandas.stats.misc.quantileTS +- Use ISO8601 format for Period repr: monthly, daily, and on down (:issue:`1776`) +- Empty DataFrame columns are now created as object dtype. This will prevent + a class of TypeErrors that was occurring in code where the dtype of a + column would depend on the presence of data or not (e.g. a SQL query having + results) (:issue:`1783`) +- Setting parts of DataFrame/Panel using ix now aligns input Series/DataFrame + (:issue:`1630`) +- `first` and `last` methods in `GroupBy` no longer drop non-numeric columns + (:issue:`1809`) +- Resolved inconsistencies in specifying custom NA values in text parser. + `na_values` of type dict no longer override default NAs unless + `keep_default_na` is set to false explicitly (:issue:`1657`) +- Enable `skipfooter` parameter in text parsers as an alias for `skip_footer` + +Bug Fixes +~~~~~~~~~ + +- Perform arithmetic column-by-column in mixed-type DataFrame to avoid type + upcasting issues. Caused downstream DataFrame.diff bug (:issue:`1896`) +- Fix matplotlib auto-color assignment when no custom spectrum passed. Also + respect passed color keyword argument (:issue:`1711`) +- Fix resampling logical error with closed='left' (:issue:`1726`) +- Fix critical DatetimeIndex.union bugs (:issue:`1730`, :issue:`1719`, :issue:`1745`, :issue:`1702`, :issue:`1753`) +- Fix critical DatetimeIndex.intersection bug with unanchored offsets (:issue:`1708`) +- Fix MM-YYYY time series indexing case (:issue:`1672`) +- Fix case where Categorical group key was not being passed into index in + GroupBy result (:issue:`1701`) +- Handle Ellipsis in Series.__getitem__/__setitem__ (:issue:`1721`) +- Fix some bugs with handling datetime64 scalars of other units in NumPy 1.6 + and 1.7 (:issue:`1717`) +- Fix performance issue in MultiIndex.format (:issue:`1746`) +- Fixed GroupBy bugs interacting with DatetimeIndex asof / map methods (:issue:`1677`) +- Handle factors with NAs in pandas.rpy (:issue:`1615`) +- Fix statsmodels import in pandas.stats.var (:issue:`1734`) +- Fix DataFrame repr/info summary with non-unique columns (:issue:`1700`) +- Fix Series.iget_value for non-unique indexes (:issue:`1694`) +- Don't lose tzinfo when passing DatetimeIndex as DataFrame column (:issue:`1682`) +- Fix tz conversion with time zones that haven't had any DST transitions since + first date in the array (:issue:`1673`) +- Fix field access with UTC->local conversion on unsorted arrays (:issue:`1756`) +- Fix isnull handling of array-like (list) inputs (:issue:`1755`) +- Fix regression in handling of Series in Series constructor (:issue:`1671`) +- Fix comparison of Int64Index with DatetimeIndex (:issue:`1681`) +- Fix min_periods handling in new rolling_max/min at array start (:issue:`1695`) +- Fix errors with how='median' and generic NumPy resampling in some cases + caused by SeriesBinGrouper (:issue:`1648`, :issue:`1688`) +- When grouping by level, exclude unobserved levels (:issue:`1697`) +- Don't lose tzinfo in DatetimeIndex when shifting by different offset (:issue:`1683`) +- Hack to support storing data with a zero-length axis in HDFStore (:issue:`1707`) +- Fix DatetimeIndex tz-aware range generation issue (:issue:`1674`) +- Fix method='time' interpolation with intraday data (:issue:`1698`) +- Don't plot all-NA DataFrame columns as zeros (:issue:`1696`) +- Fix bug in scatter_plot with by option (:issue:`1716`) +- Fix performance problem in infer_freq with lots of 
non-unique stamps (:issue:`1686`) +- Fix handling of PeriodIndex as argument to create MultiIndex (:issue:`1705`) +- Fix re: unicode MultiIndex level names in Series/DataFrame repr (:issue:`1736`) +- Handle PeriodIndex in to_datetime instance method (:issue:`1703`) +- Support StaticTzInfo in DatetimeIndex infrastructure (:issue:`1692`) +- Allow MultiIndex setops with length-0 other type indexes (:issue:`1727`) +- Fix handling of DatetimeIndex in DataFrame.to_records (:issue:`1720`) +- Fix handling of general objects in isnull on which bool(...) fails (:issue:`1749`) +- Fix .ix indexing with MultiIndex ambiguity (:issue:`1678`) +- Fix .ix setting logic error with non-unique MultiIndex (:issue:`1750`) +- Basic indexing now works on MultiIndex with > 1000000 elements, regression + from earlier version of pandas (:issue:`1757`) +- Handle non-float64 dtypes in fast DataFrame.corr/cov code paths (:issue:`1761`) +- Fix DatetimeIndex.isin to function properly (:issue:`1763`) +- Fix conversion of array of tz-aware datetime.datetime to DatetimeIndex with + right time zone (:issue:`1777`) +- Fix DST issues with generating ancxhored date ranges (:issue:`1778`) +- Fix issue calling sort on result of Series.unique (:issue:`1807`) +- Fix numerical issue leading to square root of negative number in + rolling_std (:issue:`1840`) +- Let Series.str.split accept no arguments (like str.split) (:issue:`1859`) +- Allow user to have dateutil 2.1 installed on a Python 2 system (:issue:`1851`) +- Catch ImportError less aggressively in pandas/__init__.py (:issue:`1845`) +- Fix pip source installation bug when installing from GitHub (:issue:`1805`) +- Fix error when window size > array size in rolling_apply (:issue:`1850`) +- Fix pip source installation issues via SSH from GitHub +- Fix OLS.summary when column is a tuple (:issue:`1837`) +- Fix bug in __doc__ patching when -OO passed to interpreter + (:issue:`1792` :issue:`1741` :issue:`1774`) +- Fix unicode console encoding issue in IPython notebook (:issue:`1782`, :issue:`1768`) +- Fix unicode formatting issue with Series.name (:issue:`1782`) +- Fix bug in DataFrame.duplicated with datetime64 columns (:issue:`1833`) +- Fix bug in Panel internals resulting in error when doing fillna after + truncate not changing size of panel (:issue:`1823`) +- Prevent segfault due to MultiIndex not being supported in HDFStore table + format (:issue:`1848`) +- Fix UnboundLocalError in Panel.__setitem__ and add better error (:issue:`1826`) +- Fix to_csv issues with list of string entries. 
Isnull works on list of + strings now too (:issue:`1791`) +- Fix Timestamp comparisons with datetime values outside the nanosecond range + (1677-2262) +- Revert to prior behavior of normalize_date with datetime.date objects + (return datetime) +- Fix broken interaction between np.nansum and Series.any/all +- Fix bug with multiple column date parsers (:issue:`1866`) +- DatetimeIndex.union(Int64Index) was broken +- Make plot x vs y interface consistent with integer indexing (:issue:`1842`) +- set_index inplace modified data even if unique check fails (:issue:`1831`) +- Only use Q-OCT/NOV/DEC in quarterly frequency inference (:issue:`1789`) +- Upcast to dtype=object when unstacking boolean DataFrame (:issue:`1820`) +- Fix float64/float32 merging bug (:issue:`1849`) +- Fixes to Period.start_time for non-daily frequencies (:issue:`1857`) +- Fix failure when converter used on index_col in read_csv (:issue:`1835`) +- Implement PeriodIndex.append so that pandas.concat works correctly (:issue:`1815`) +- Avoid Cython out-of-bounds access causing segfault sometimes in pad_2d, + backfill_2d +- Fix resampling error with intraday times and anchored target time (like + AS-DEC) (:issue:`1772`) +- Fix .ix indexing bugs with mixed-integer indexes (:issue:`1799`) +- Respect passed color keyword argument in Series.plot (:issue:`1890`) +- Fix rolling_min/max when the window is larger than the size of the input + array. Check other malformed inputs (:issue:`1899`, :issue:`1897`) +- Rolling variance / standard deviation with only a single observation in + window (:issue:`1884`) +- Fix unicode sheet name failure in to_excel (:issue:`1828`) +- Override DatetimeIndex.min/max to return Timestamp objects (:issue:`1895`) +- Fix column name formatting issue in length-truncated column (:issue:`1906`) +- Fix broken handling of copying Index metadata to new instances created by + view(...) calls inside the NumPy infrastructure +- Support datetime.date again in DateOffset.rollback/rollforward +- Raise Exception if set passed to Series constructor (:issue:`1913`) +- Add TypeError when appending HDFStore table w/ wrong index type (:issue:`1881`) +- Don't raise exception on empty inputs in EW functions (e.g. 
ewma) (:issue:`1900`) +- Make asof work correctly with PeriodIndex (:issue:`1883`) +- Fix extlinks in doc build +- Fill boolean DataFrame with NaN when calling shift (:issue:`1814`) +- Fix setuptools bug causing pip not to Cythonize .pyx files sometimes +- Fix negative integer indexing regression in .ix from 0.7.x (:issue:`1888`) +- Fix error while retrieving timezone and utc offset from subclasses of + datetime.tzinfo without .zone and ._utcoffset attributes (:issue:`1922`) +- Fix DataFrame formatting of small, non-zero FP numbers (:issue:`1911`) +- Various fixes by upcasting of date -> datetime (:issue:`1395`) +- Raise better exception when passing multiple functions with the same name, + such as lambdas, to GroupBy.aggregate +- Fix DataFrame.apply with axis=1 on a non-unique index (:issue:`1878`) +- Proper handling of Index subclasses in pandas.unique (:issue:`1759`) +- Set index names in DataFrame.from_records (:issue:`1744`) +- Fix time series indexing error with duplicates, under and over hash table + size cutoff (:issue:`1821`) +- Handle list keys in addition to tuples in DataFrame.xs when + partial-indexing a hierarchically-indexed DataFrame (:issue:`1796`) +- Support multiple column selection in DataFrame.__getitem__ with duplicate + columns (:issue:`1943`) +- Fix time zone localization bug causing improper fields (e.g. hours) in time + zones that have not had a UTC transition in a long time (:issue:`1946`) +- Fix errors when parsing and working with with fixed offset timezones + (:issue:`1922`, :issue:`1928`) +- Fix text parser bug when handling UTC datetime objects generated by + dateutil (:issue:`1693`) +- Fix plotting bug when 'B' is the inferred frequency but index actually + contains weekends (:issue:`1668`, :issue:`1669`) +- Fix plot styling bugs (:issue:`1666`, :issue:`1665`, :issue:`1658`) +- Fix plotting bug with index/columns with unicode (:issue:`1685`) +- Fix DataFrame constructor bug when passed Series with datetime64 dtype + in a dict (:issue:`1680`) +- Fixed regression in generating DatetimeIndex using timezone aware + datetime.datetime (:issue:`1676`) +- Fix DataFrame bug when printing concatenated DataFrames with duplicated + columns (:issue:`1675`) +- Fixed bug when plotting time series with multiple intraday frequencies + (:issue:`1732`) +- Fix bug in DataFrame.duplicated to enable iterables other than list-types + as input argument (:issue:`1773`) +- Fix resample bug when passed list of lambdas as `how` argument (:issue:`1808`) +- Repr fix for MultiIndex level with all NAs (:issue:`1971`) +- Fix PeriodIndex slicing bug when slice start/end are out-of-bounds (:issue:`1977`) +- Fix read_table bug when parsing unicode (:issue:`1975`) +- Fix BlockManager.iget bug when dealing with non-unique MultiIndex as columns + (:issue:`1970`) +- Fix reset_index bug if both drop and level are specified (:issue:`1957`) +- Work around unsafe NumPy object->int casting with Cython function (:issue:`1987`) +- Fix datetime64 formatting bug in DataFrame.to_csv (:issue:`1993`) +- Default start date in pandas.io.data to 1/1/2000 as the docs say (:issue:`2011`) + +pandas 0.8.1 +------------ + +**Release date:** July 22, 2012 + +New Features +~~~~~~~~~~~~ + +- Add vectorized, NA-friendly string methods to Series (:issue:`1621`, :issue:`620`) +- Can pass dict of per-column line styles to DataFrame.plot (:issue:`1559`) +- Selective plotting to secondary y-axis on same subplot (:issue:`1640`) +- Add new ``bootstrap_plot`` plot function +- Add new ``parallel_coordinates`` plot function 
(:issue:`1488`) +- Add ``radviz`` plot function (:issue:`1566`) +- Add ``multi_sparse`` option to ``set_printoptions`` to modify display of + hierarchical indexes (:issue:`1538`) +- Add ``dropna`` method to Panel (:issue:`171`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Use moving min/max algorithms from Bottleneck in rolling_min/rolling_max + for > 100x speedup. (:issue:`1504`, :issue:`50`) +- Add Cython group median method for >15x speedup (:issue:`1358`) +- Drastically improve ``to_datetime`` performance on ISO8601 datetime strings + (with no time zones) (:issue:`1571`) +- Improve single-key groupby performance on large data sets, accelerate use of + groupby with a Categorical variable +- Add ability to append hierarchical index levels with ``set_index`` and to + drop single levels with ``reset_index`` (:issue:`1569`, :issue:`1577`) +- Always apply passed functions in ``resample``, even if upsampling (:issue:`1596`) +- Avoid unnecessary copies in DataFrame constructor with explicit dtype (:issue:`1572`) +- Cleaner DatetimeIndex string representation with 1 or 2 elements (:issue:`1611`) +- Improve performance of array-of-Period to PeriodIndex, convert such arrays + to PeriodIndex inside Index (:issue:`1215`) +- More informative string representation for weekly Period objects (:issue:`1503`) +- Accelerate 3-axis multi data selection from homogeneous Panel (:issue:`979`) +- Add ``adjust`` option to ewma to disable adjustment factor (:issue:`1584`) +- Add new matplotlib converters for high frequency time series plotting (:issue:`1599`) +- Handling of tz-aware datetime.datetime objects in to_datetime; raise + Exception unless utc=True given (:issue:`1581`) + +Bug Fixes +~~~~~~~~~ + +- Fix NA handling in DataFrame.to_panel (:issue:`1582`) +- Handle TypeError issues inside PyObject_RichCompareBool calls in khash + (:issue:`1318`) +- Fix resampling bug to lower case daily frequency (:issue:`1588`) +- Fix kendall/spearman DataFrame.corr bug with no overlap (:issue:`1595`) +- Fix bug in DataFrame.set_index (:issue:`1592`) +- Don't ignore axes in boxplot if by specified (:issue:`1565`) +- Fix Panel .ix indexing with integers bug (:issue:`1603`) +- Fix Partial indexing bugs (years, months, ...) 
with PeriodIndex (:issue:`1601`) +- Fix MultiIndex console formatting issue (:issue:`1606`) +- Unordered index with duplicates doesn't yield scalar location for single + entry (:issue:`1586`) +- Fix resampling of tz-aware time series with "anchored" freq (:issue:`1591`) +- Fix DataFrame.rank error on integer data (:issue:`1589`) +- Selection of multiple SparseDataFrame columns by list in __getitem__ (:issue:`1585`) +- Override Index.tolist for compatibility with MultiIndex (:issue:`1576`) +- Fix hierarchical summing bug with MultiIndex of length 1 (:issue:`1568`) +- Work around numpy.concatenate use/bug in Series.set_value (:issue:`1561`) +- Ensure Series/DataFrame are sorted before resampling (:issue:`1580`) +- Fix unhandled IndexError when indexing very large time series (:issue:`1562`) +- Fix DatetimeIndex intersection logic error with irregular indexes (:issue:`1551`) +- Fix unit test errors on Python 3 (:issue:`1550`) +- Fix .ix indexing bugs in duplicate DataFrame index (:issue:`1201`) +- Better handle errors with non-existing objects in HDFStore (:issue:`1254`) +- Don't copy int64 array data in DatetimeIndex when copy=False (:issue:`1624`) +- Fix resampling of conforming periods quarterly to annual (:issue:`1622`) +- Don't lose index name on resampling (:issue:`1631`) +- Support python-dateutil version 2.1 (:issue:`1637`) +- Fix broken scatter_matrix axis labeling, esp. with time series (:issue:`1625`) +- Fix cases where extra keywords weren't being passed on to matplotlib from + Series.plot (:issue:`1636`) +- Fix BusinessMonthBegin logic for dates before 1st bday of month (:issue:`1645`) +- Ensure string alias converted (valid in DatetimeIndex.get_loc) in + DataFrame.xs / __getitem__ (:issue:`1644`) +- Fix use of string alias timestamps with tz-aware time series (:issue:`1647`) +- Fix Series.max/min and Series.describe on len-0 series (:issue:`1650`) +- Handle None values in dict passed to concat (:issue:`1649`) +- Fix Series.interpolate with method='values' and DatetimeIndex (:issue:`1646`) +- Fix IndexError in left merges on a DataFrame with 0-length (:issue:`1628`) +- Fix DataFrame column width display with UTF-8 encoded characters (:issue:`1620`) +- Handle case in pandas.io.data.get_data_yahoo where Yahoo! returns duplicate + dates for most recent business day +- Avoid downsampling when plotting mixed frequencies on the same subplot (:issue:`1619`) +- Fix read_csv bug when reading a single line (:issue:`1553`) +- Fix bug in C code causing monthly periods prior to December 1969 to be off (:issue:`1570`) + +pandas 0.8.0 +------------ + +**Release date:** 6/29/2012 + +New Features +~~~~~~~~~~~~ + +- New unified DatetimeIndex class for nanosecond-level timestamp data +- New Timestamp datetime.datetime subclass with easy time zone conversions, + and support for nanoseconds +- New PeriodIndex class for timespans, calendar logic, and Period scalar object +- High performance resampling of timestamp and period data. 
New `resample` + method of all pandas data structures +- New frequency names plus shortcut string aliases like '15h', '1h30min' +- Time series string indexing shorthand (:issue:`222`) +- Add week, dayofyear array and other timestamp array-valued field accessor + functions to DatetimeIndex +- Add GroupBy.prod optimized aggregation function and 'prod' fast time series + conversion method (:issue:`1018`) +- Implement robust frequency inference function and `inferred_freq` attribute + on DatetimeIndex (:issue:`391`) +- New ``tz_convert`` and ``tz_localize`` methods in Series / DataFrame +- Convert DatetimeIndexes to UTC if time zones are different in join/setops + (:issue:`864`) +- Add limit argument for forward/backward filling to reindex, fillna, + etc. (:issue:`825` and others) +- Add support for indexes (dates or otherwise) with duplicates and common + sense indexing/selection functionality +- Series/DataFrame.update methods, in-place variant of combine_first (:issue:`961`) +- Add ``match`` function to API (:issue:`502`) +- Add Cython-optimized first, last, min, max, prod functions to GroupBy (:issue:`994`, + :issue:`1043`) +- Dates can be split across multiple columns (:issue:`1227`, :issue:`1186`) +- Add experimental support for converting pandas DataFrame to R data.frame + via rpy2 (:issue:`350`, :issue:`1212`) +- Can pass list of (name, function) to GroupBy.aggregate to get aggregates in + a particular order (:issue:`610`) +- Can pass dicts with lists of functions or dicts to GroupBy aggregate to do + much more flexible multiple function aggregation (:issue:`642`, :issue:`610`) +- New ordered_merge functions for merging DataFrames with ordered + data. Also supports group-wise merging for panel data (:issue:`813`) +- Add keys() method to DataFrame +- Add flexible replace method for replacing potentially values to Series and + DataFrame (:issue:`929`, :issue:`1241`) +- Add 'kde' plot kind for Series/DataFrame.plot (:issue:`1059`) +- More flexible multiple function aggregation with GroupBy +- Add pct_change function to Series/DataFrame +- Add option to interpolate by Index values in Series.interpolate (:issue:`1206`) +- Add ``max_colwidth`` option for DataFrame, defaulting to 50 +- Conversion of DataFrame through rpy2 to R data.frame (:issue:`1282`, ) +- Add keys() method on DataFrame (:issue:`1240`) +- Add new ``match`` function to API (similar to R) (:issue:`502`) +- Add dayfirst option to parsers (:issue:`854`) +- Add ``method`` argument to ``align`` method for forward/backward fillin + (:issue:`216`) +- Add Panel.transpose method for rearranging axes (:issue:`695`) +- Add new ``cut`` function (patterned after R) for discretizing data into + equal range-length bins or arbitrary breaks of your choosing (:issue:`415`) +- Add new ``qcut`` for cutting with quantiles (:issue:`1378`) +- Add ``value_counts`` top level array method (:issue:`1392`) +- Added Andrews curves plot tupe (:issue:`1325`) +- Add lag plot (:issue:`1440`) +- Add autocorrelation_plot (:issue:`1425`) +- Add support for tox and Travis CI (:issue:`1382`) +- Add support for Categorical use in GroupBy (:issue:`292`) +- Add ``any`` and ``all`` methods to DataFrame (:issue:`1416`) +- Add ``secondary_y`` option to Series.plot +- Add experimental ``lreshape`` function for reshaping wide to long + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Switch to klib/khash-based hash tables in Index classes for better + performance in many cases and lower memory footprint +- Shipping some functions from 
scipy.stats to reduce dependency, + e.g. Series.describe and DataFrame.describe (:issue:`1092`) +- Can create MultiIndex by passing list of lists or list of arrays to Series, + DataFrame constructor, etc. (:issue:`831`) +- Can pass arrays in addition to column names to DataFrame.set_index (:issue:`402`) +- Improve the speed of "square" reindexing of homogeneous DataFrame objects + by significant margin (:issue:`836`) +- Handle more dtypes when passed MaskedArrays in DataFrame constructor (:issue:`406`) +- Improved performance of join operations on integer keys (:issue:`682`) +- Can pass multiple columns to GroupBy object, e.g. grouped[[col1, col2]] to + only aggregate a subset of the value columns (:issue:`383`) +- Add histogram / kde plot options for scatter_matrix diagonals (:issue:`1237`) +- Add inplace option to Series/DataFrame.rename and sort_index, + DataFrame.drop_duplicates (:issue:`805`, :issue:`207`) +- More helpful error message when nothing passed to Series.reindex (:issue:`1267`) +- Can mix array and scalars as dict-value inputs to DataFrame ctor (:issue:`1329`) +- Use DataFrame columns' name for legend title in plots +- Preserve frequency in DatetimeIndex when possible in boolean indexing + operations +- Promote datetime.date values in data alignment operations (:issue:`867`) +- Add ``order`` method to Index classes (:issue:`1028`) +- Avoid hash table creation in large monotonic hash table indexes (:issue:`1160`) +- Store time zones in HDFStore (:issue:`1232`) +- Enable storage of sparse data structures in HDFStore (:issue:`85`) +- Enable Series.asof to work with arrays of timestamp inputs +- Cython implementation of DataFrame.corr speeds up by > 100x (:issue:`1349`, :issue:`1354`) +- Exclude "nuisance" columns automatically in GroupBy.transform (:issue:`1364`) +- Support functions-as-strings in GroupBy.transform (:issue:`1362`) +- Use index name as xlabel/ylabel in plots (:issue:`1415`) +- Add ``convert_dtype`` option to Series.apply to be able to leave data as + dtype=object (:issue:`1414`) +- Can specify all index level names in concat (:issue:`1419`) +- Add ``dialect`` keyword to parsers for quoting conventions (:issue:`1363`) +- Enable DataFrame[bool_DataFrame] += value (:issue:`1366`) +- Add ``retries`` argument to ``get_data_yahoo`` to try to prevent Yahoo! API + 404s (:issue:`826`) +- Improve performance of reshaping by using O(N) categorical sorting +- Series names will be used for index of DataFrame if no index passed (:issue:`1494`) +- Header argument in DataFrame.to_csv can accept a list of column names to + use instead of the object's columns (:issue:`921`) +- Add ``raise_conflict`` argument to DataFrame.update (:issue:`1526`) +- Support file-like objects in ExcelFile (:issue:`1529`) + +API Changes +~~~~~~~~~~~ + +- Rename `pandas._tseries` to `pandas.lib` +- Rename Factor to Categorical and add improvements. Numerous Categorical bug + fixes +- Frequency name overhaul, WEEKDAY/EOM and rules with @ + deprecated. get_legacy_offset_name backwards compatibility function added +- Raise ValueError in DataFrame.__nonzero__, so "if df" no longer works + (:issue:`1073`) +- Change BDay (business day) to not normalize dates by default (:issue:`506`) +- Remove deprecated DataMatrix name +- Default merge suffixes for overlap now have underscores instead of periods + to facilitate tab completion, etc. 
(:issue:`1239`) +- Deprecation of offset, time_rule timeRule parameters throughout codebase +- Series.append and DataFrame.append no longer check for duplicate indexes + by default, add verify_integrity parameter (:issue:`1394`) +- Refactor Factor class, old constructor moved to Factor.from_array +- Modified internals of MultiIndex to use less memory (no longer represented + as array of tuples) internally, speed up construction time and many methods + which construct intermediate hierarchical indexes (:issue:`1467`) + +Bug Fixes +~~~~~~~~~ + +- Fix OverflowError from storing pre-1970 dates in HDFStore by switching to + datetime64 (:issue:`179`) +- Fix logical error with February leap year end in YearEnd offset +- Series([False, nan]) was getting casted to float64 (:issue:`1074`) +- Fix binary operations between boolean Series and object Series with + booleans and NAs (:issue:`1074`, :issue:`1079`) +- Couldn't assign whole array to column in mixed-type DataFrame via .ix + (:issue:`1142`) +- Fix label slicing issues with float index values (:issue:`1167`) +- Fix segfault caused by empty groups passed to groupby (:issue:`1048`) +- Fix occasionally misbehaved reindexing in the presence of NaN labels (:issue:`522`) +- Fix imprecise logic causing weird Series results from .apply (:issue:`1183`) +- Unstack multiple levels in one shot, avoiding empty columns in some + cases. Fix pivot table bug (:issue:`1181`) +- Fix formatting of MultiIndex on Series/DataFrame when index name coincides + with label (:issue:`1217`) +- Handle Excel 2003 #N/A as NaN from xlrd (:issue:`1213`, :issue:`1225`) +- Fix timestamp locale-related deserialization issues with HDFStore by moving + to datetime64 representation (:issue:`1081`, :issue:`809`) +- Fix DataFrame.duplicated/drop_duplicates NA value handling (:issue:`557`) +- Actually raise exceptions in fast reducer (:issue:`1243`) +- Fix various timezone-handling bugs from 0.7.3 (:issue:`969`) +- GroupBy on level=0 discarded index name (:issue:`1313`) +- Better error message with unmergeable DataFrames (:issue:`1307`) +- Series.__repr__ alignment fix with unicode index values (:issue:`1279`) +- Better error message if nothing passed to reindex (:issue:`1267`) +- More robust NA handling in DataFrame.drop_duplicates (:issue:`557`) +- Resolve locale-based and pre-epoch HDF5 timestamp deserialization issues + (:issue:`973`, :issue:`1081`, :issue:`179`) +- Implement Series.repeat (:issue:`1229`) +- Fix indexing with namedtuple and other tuple subclasses (:issue:`1026`) +- Fix float64 slicing bug (:issue:`1167`) +- Parsing integers with commas (:issue:`796`) +- Fix groupby improper data type when group consists of one value (:issue:`1065`) +- Fix negative variance possibility in nanvar resulting from floating point + error (:issue:`1090`) +- Consistently set name on groupby pieces (:issue:`184`) +- Treat dict return values as Series in GroupBy.apply (:issue:`823`) +- Respect column selection for DataFrame in in GroupBy.transform (:issue:`1365`) +- Fix MultiIndex partial indexing bug (:issue:`1352`) +- Enable assignment of rows in mixed-type DataFrame via .ix (:issue:`1432`) +- Reset index mapping when grouping Series in Cython (:issue:`1423`) +- Fix outer/inner DataFrame.join with non-unique indexes (:issue:`1421`) +- Fix MultiIndex groupby bugs with empty lower levels (:issue:`1401`) +- Calling fillna with a Series will have same behavior as with dict (:issue:`1486`) +- SparseSeries reduction bug (:issue:`1375`) +- Fix unicode serialization issue in HDFStore 
(:issue:`1361`)
+- Pass keywords to pyplot.boxplot in DataFrame.boxplot (:issue:`1493`)
+- Bug fixes in MonthBegin (:issue:`1483`)
+- Preserve MultiIndex names in drop (:issue:`1513`)
+- Fix Panel DataFrame slice-assignment bug (:issue:`1533`)
+- Don't use locals() in read_* functions (:issue:`1547`)
+
+pandas 0.7.3
+------------
+
+**Release date:** April 12, 2012
+
+New Features
+~~~~~~~~~~~~
+
+- Support for non-unique indexes: indexing and selection, many-to-one and many-to-many joins (:issue:`1306`)
+- Added fixed-width file reader, read_fwf (:issue:`952`)
+- Add group_keys argument to groupby to not add group names to MultiIndex in result of apply (:issue:`938`)
+- DataFrame can now accept non-integer label slicing (:issue:`946`). Previously only DataFrame.ix was able to do so.
+- DataFrame.apply now retains name attributes on Series objects (:issue:`983`)
+- Numeric DataFrame comparisons with non-numeric values now raise a proper TypeError (:issue:`943`). Previously this raised "PandasError: DataFrame constructor not properly called!"
+- Add ``kurt`` methods to Series and DataFrame (:issue:`964`)
+- Can pass dict of column -> list/set NA values for text parsers (:issue:`754`)
+- Allow user-specified NA values in text parsers (:issue:`754`)
+- Parsers check for the openpyxl dependency and raise ImportError if not found (:issue:`1007`)
+- New factory function to create HDFStore objects that can be used in a with statement so users do not have to explicitly call HDFStore.close (:issue:`1005`); see the sketch below
+- pivot_table is now more flexible with same parameters as groupby (:issue:`941`)
+- Added stacked bar plots (:issue:`987`)
+- scatter_matrix method in pandas/tools/plotting.py (:issue:`935`)
+- DataFrame.boxplot returns plot results for ex-post styling (:issue:`985`)
+- Short version number accessible as pandas.version.short_version (:issue:`930`)
+- Additional documentation in panel.to_frame (:issue:`942`)
+- More informative Series.apply docstring regarding element-wise apply (:issue:`977`)
+- Notes on rpy2 installation (:issue:`1006`)
+- Add rotation and font size options to hist method (:issue:`1012`)
+- Use exogenous / X variable index in result of OLS.y_predict. Add OLS.predict method (:issue:`1027`, :issue:`1008`)
+
+API Changes
+~~~~~~~~~~~
+
+- Calling apply on grouped Series, e.g. describe(), will no longer yield DataFrame by default. Will have to call unstack() to get prior behavior
+- NA handling in non-numeric comparisons has been tightened up (:issue:`933`, :issue:`953`)
+- No longer assign dummy names key_0, key_1, etc. to groupby index (:issue:`1291`)
+
+Bug Fixes
+~~~~~~~~~
+
+- Fix logic error when selecting part of a row in a DataFrame with a MultiIndex index (:issue:`1013`)
+- Series comparison with Series of differing length causes crash (:issue:`1016`).
+- Fix bug in indexing when selecting section of hierarchically-indexed row (:issue:`1013`)
+- DataFrame.plot(logy=True) has no effect (:issue:`1011`).
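+
+A minimal sketch of the with-statement usage described under New Features
+above (not part of the original notes; it assumes the factory is exposed as
+``pandas.get_store`` and that PyTables is installed):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   df = pd.DataFrame({"A": [1.0, 2.0, 3.0]})
+
+   # the factory returns an HDFStore that acts as a context manager,
+   # so the underlying file is closed automatically on exit
+   with pd.get_store("example_store.h5") as store:
+       store["df"] = df        # write the frame
+       restored = store["df"]  # read it back
+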
+- Broken arithmetic operations between SparsePanel-Panel (:issue:`1015`)
+- Unicode repr issues in MultiIndex with non-ascii characters (:issue:`1010`)
+- DataFrame.lookup() returns inconsistent results if exact match not present (:issue:`1001`)
+- DataFrame arithmetic operations not treating None as NA (:issue:`992`)
+- DataFrameGroupBy.apply returns incorrect result (:issue:`991`)
+- Series.reshape returns incorrect result for multiple dimensions (:issue:`989`)
+- Series.std and Series.var ignore the ddof parameter (:issue:`934`)
+- DataFrame.append loses index names (:issue:`980`)
+- DataFrame.plot(kind='bar') ignores color argument (:issue:`958`)
+- Inconsistent Index comparison results (:issue:`948`)
+- Improper int dtype DataFrame construction from data with NaN (:issue:`846`)
+- Remove default 'result' name in groupby results (:issue:`995`)
+- DataFrame.from_records no longer mutates input columns (:issue:`975`)
+- Use Index name when grouping by it (:issue:`1313`)
+
+pandas 0.7.2
+------------
+
+**Release date:** March 16, 2012
+
+New Features
+~~~~~~~~~~~~
+
+- Add additional tie-breaking methods in DataFrame.rank (:issue:`874`)
+- Add ascending parameter to rank in Series, DataFrame (:issue:`875`)
+- Add sort_columns parameter to allow unsorted plots (:issue:`918`)
+- IPython tab completion on GroupBy objects
+
+API Changes
+~~~~~~~~~~~
+
+- Series.sum returns 0 instead of NA when called on an empty series. Analogously for a DataFrame whose rows or columns are length 0 (:issue:`844`)
+
+Improvements to existing features
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Don't use groups dict in Grouper.size (:issue:`860`)
+- Use khash for Series.value_counts, add raw function to algorithms.py (:issue:`861`)
+- Enable column access via attributes on GroupBy (:issue:`882`)
+- Enable setting existing columns (only) via attributes on DataFrame, Panel (:issue:`883`)
+- Intercept __builtin__.sum in groupby (:issue:`885`)
+- Can pass dict to DataFrame.fillna to use different values per column (:issue:`661`)
+- Can select multiple hierarchical groups by passing list of values in .ix (:issue:`134`)
+- Add level keyword to ``drop`` for dropping values from a level (:issue:`159`)
+- Add ``coerce_float`` option on DataFrame.from_records (:issue:`893`)
+- Raise exception if passed date_parser fails in ``read_csv``
+- Add ``axis`` option to DataFrame.fillna (:issue:`174`)
+- Fixes to Panel to make it easier to subclass (:issue:`888`)
+
+Bug Fixes
+~~~~~~~~~
+
+- Fix overflow-related bugs in groupby (:issue:`850`, :issue:`851`)
+- Fix unhelpful error message in parsers (:issue:`856`)
+- Better error message for failed boolean slicing of DataFrame (:issue:`859`)
+- Series.count cannot accept a string (level name) in the level argument (:issue:`869`)
+- Group index platform int check (:issue:`870`)
+- concat on axis=1 and ignore_index=True raises TypeError (:issue:`871`)
+- Further unicode handling issues resolved (:issue:`795`)
+- Fix failure in multiindex-based access in Panel (:issue:`880`)
+- Fix DataFrame boolean slice assignment failure (:issue:`881`)
+- Fix combineAdd NotImplementedError for SparseDataFrame (:issue:`887`)
+- Fix DataFrame.to_html encoding and columns (:issue:`890`, :issue:`891`, :issue:`909`)
+- Fix na-filling handling in mixed-type DataFrame (:issue:`910`)
+- Fix to DataFrame.set_value with non-existent row/col (:issue:`911`)
+- Fix malformed block in groupby when excluding nuisance columns (:issue:`916`)
+- Fix inconsistent NA handling in dtype=object arrays (:issue:`925`)
+- Fix
missing center-of-mass computation in ewmcov (:issue:`862`) +- Don't raise exception when opening read-only HDF5 file (:issue:`847`) +- Fix possible out-of-bounds memory access in 0-length Series (:issue:`917`) + +pandas 0.7.1 +------------ + +**Release date:** February 29, 2012 + +New Features +~~~~~~~~~~~~ + +- Add ``to_clipboard`` function to pandas namespace for writing objects to + the system clipboard (:issue:`774`) +- Add ``itertuples`` method to DataFrame for iterating through the rows of a + dataframe as tuples (:issue:`818`) +- Add ability to pass fill_value and method to DataFrame and Series align + method (:issue:`806`, :issue:`807`) +- Add fill_value option to reindex, align methods (:issue:`784`) +- Enable concat to produce DataFrame from Series (:issue:`787`) +- Add ``between`` method to Series (:issue:`802`) +- Add HTML representation hook to DataFrame for the IPython HTML notebook + (:issue:`773`) +- Support for reading Excel 2007 XML documents using openpyxl + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improve performance and memory usage of fillna on DataFrame +- Can concatenate a list of Series along axis=1 to obtain a DataFrame (:issue:`787`) + +Bug Fixes +~~~~~~~~~ + +- Fix memory leak when inserting large number of columns into a single + DataFrame (:issue:`790`) +- Appending length-0 DataFrame with new columns would not result in those new + columns being part of the resulting concatenated DataFrame (:issue:`782`) +- Fixed groupby corner case when passing dictionary grouper and as_index is + False (:issue:`819`) +- Fixed bug whereby bool array sometimes had object dtype (:issue:`820`) +- Fix exception thrown on np.diff (:issue:`816`) +- Fix to_records where columns are non-strings (:issue:`822`) +- Fix Index.intersection where indices have incomparable types (:issue:`811`) +- Fix ExcelFile throwing an exception for two-line file (:issue:`837`) +- Add clearer error message in csv parser (:issue:`835`) +- Fix loss of fractional seconds in HDFStore (:issue:`513`) +- Fix DataFrame join where columns have datetimes (:issue:`787`) +- Work around numpy performance issue in take (:issue:`817`) +- Improve comparison operations for NA-friendliness (:issue:`801`) +- Fix indexing operation for floating point values (:issue:`780`, :issue:`798`) +- Fix groupby case resulting in malformed dataframe (:issue:`814`) +- Fix behavior of reindex of Series dropping name (:issue:`812`) +- Improve on redudant groupby computation (:issue:`775`) +- Catch possible NA assignment to int/bool series with exception (:issue:`839`) + +pandas 0.7.0 +------------ + +**Release date:** 2/9/2012 + +New Features +~~~~~~~~~~~~ + +- New ``merge`` function for efficiently performing full gamut of database / + relational-algebra operations. Refactored existing join methods to use the + new infrastructure, resulting in substantial performance gains (:issue:`220`, + :issue:`249`, :issue:`267`) +- New ``concat`` function for concatenating DataFrame or Panel objects along + an axis. Can form union or intersection of the other axes. 
Improves + performance of ``DataFrame.append`` (:issue:`468`, :issue:`479`, :issue:`273`) +- Handle differently-indexed output values in ``DataFrame.apply`` (:issue:`498`) +- Can pass list of dicts (e.g., a list of shallow JSON objects) to DataFrame + constructor (:issue:`526`) +- Add ``reorder_levels`` method to Series and DataFrame (:issue:`534`) +- Add dict-like ``get`` function to DataFrame and Panel (:issue:`521`) +- ``DataFrame.iterrows`` method for efficiently iterating through the rows of + a DataFrame +- Added ``DataFrame.to_panel`` with code adapted from ``LongPanel.to_long`` +- ``reindex_axis`` method added to DataFrame +- Add ``level`` option to binary arithmetic functions on ``DataFrame`` and + ``Series`` +- Add ``level`` option to the ``reindex`` and ``align`` methods on Series and + DataFrame for broadcasting values across a level (:issue:`542`, :issue:`552`, others) +- Add attribute-based item access to ``Panel`` and add IPython completion (PR + :issue:`554`) +- Add ``logy`` option to ``Series.plot`` for log-scaling on the Y axis +- Add ``index``, ``header``, and ``justify`` options to + ``DataFrame.to_string``. Add option to (:issue:`570`, :issue:`571`) +- Can pass multiple DataFrames to ``DataFrame.join`` to join on index (:issue:`115`) +- Can pass multiple Panels to ``Panel.join`` (:issue:`115`) +- Can pass multiple DataFrames to `DataFrame.append` to concatenate (stack) + and multiple Series to ``Series.append`` too +- Added ``justify`` argument to ``DataFrame.to_string`` to allow different + alignment of column headers +- Add ``sort`` option to GroupBy to allow disabling sorting of the group keys + for potential speedups (:issue:`595`) +- Can pass MaskedArray to Series constructor (:issue:`563`) +- Add Panel item access via attributes and IPython completion (:issue:`554`) +- Implement ``DataFrame.lookup``, fancy-indexing analogue for retrieving + values given a sequence of row and column labels (:issue:`338`) +- Add ``verbose`` option to ``read_csv`` and ``read_table`` to show number of + NA values inserted in non-numeric columns (:issue:`614`) +- Can pass a list of dicts or Series to ``DataFrame.append`` to concatenate + multiple rows (:issue:`464`) +- Add ``level`` argument to ``DataFrame.xs`` for selecting data from other + MultiIndex levels. 
Can take one or more levels with potentially a tuple of + keys for flexible retrieval of data (:issue:`371`, :issue:`629`) +- New ``crosstab`` function for easily computing frequency tables (:issue:`170`) +- Can pass a list of functions to aggregate with groupby on a DataFrame, + yielding an aggregated result with hierarchical columns (:issue:`166`) +- Add integer-indexing functions ``iget`` in Series and ``irow`` / ``iget`` + in DataFrame (:issue:`628`) +- Add new ``Series.unique`` function, significantly faster than + ``numpy.unique`` (:issue:`658`) +- Add new ``cummin`` and ``cummax`` instance methods to ``Series`` and + ``DataFrame`` (:issue:`647`) +- Add new ``value_range`` function to return min/max of a dataframe (:issue:`288`) +- Add ``drop`` parameter to ``reset_index`` method of ``DataFrame`` and added + method to ``Series`` as well (:issue:`699`) +- Add ``isin`` method to Index objects, works just like ``Series.isin`` (GH + :issue:`657`) +- Implement array interface on Panel so that ufuncs work (re: :issue:`740`) +- Add ``sort`` option to ``DataFrame.join`` (:issue:`731`) +- Improved handling of NAs (propagation) in binary operations with + dtype=object arrays (:issue:`737`) +- Add ``abs`` method to Pandas objects +- Added ``algorithms`` module to start collecting central algos + +API Changes +~~~~~~~~~~~ + +- Label-indexing with integer indexes now raises KeyError if a label is not + found instead of falling back on location-based indexing (:issue:`700`) +- Label-based slicing via ``ix`` or ``[]`` on Series will now only work if + exact matches for the labels are found or if the index is monotonic (for + range selections) +- Label-based slicing and sequences of labels can be passed to ``[]`` on a + Series for both getting and setting (:issue:`86`) +- `[]` operator (``__getitem__`` and ``__setitem__``) will raise KeyError + with integer indexes when an index is not contained in the index. The prior + behavior would fall back on position-based indexing if a key was not found + in the index which would lead to subtle bugs. This is now consistent with + the behavior of ``.ix`` on DataFrame and friends (:issue:`328`) +- Rename ``DataFrame.delevel`` to ``DataFrame.reset_index`` and add + deprecation warning +- `Series.sort` (an in-place operation) called on a Series which is a view on + a larger array (e.g. 
a column in a DataFrame) will generate an Exception to + prevent accidentally modifying the data source (:issue:`316`) +- Refactor to remove deprecated ``LongPanel`` class (:issue:`552`) +- Deprecated ``Panel.to_long``, renamed to ``to_frame`` +- Deprecated ``colSpace`` argument in ``DataFrame.to_string``, renamed to + ``col_space`` +- Rename ``precision`` to ``accuracy`` in engineering float formatter (GH + :issue:`395`) +- The default delimiter for ``read_csv`` is comma rather than letting + ``csv.Sniffer`` infer it +- Rename ``col_or_columns`` argument in ``DataFrame.drop_duplicates`` (GH + :issue:`734`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Better error message in DataFrame constructor when passed column labels + don't match data (:issue:`497`) +- Substantially improve performance of multi-GroupBy aggregation when a + Python function is passed, reuse ndarray object in Cython (:issue:`496`) +- Can store objects indexed by tuples and floats in HDFStore (:issue:`492`) +- Don't print length by default in Series.to_string, add `length` option (GH + :issue:`489`) +- Improve Cython code for multi-groupby to aggregate without having to sort + the data (:issue:`93`) +- Improve MultiIndex reindexing speed by storing tuples in the MultiIndex, + test for backwards unpickling compatibility +- Improve column reindexing performance by using specialized Cython take + function +- Further performance tweaking of Series.__getitem__ for standard use cases +- Avoid Index dict creation in some cases (i.e. when getting slices, etc.), + regression from prior versions +- Friendlier error message in setup.py if NumPy not installed +- Use common set of NA-handling operations (sum, mean, etc.) in Panel class + also (:issue:`536`) +- Default name assignment when calling ``reset_index`` on DataFrame with a + regular (non-hierarchical) index (:issue:`476`) +- Use Cythonized groupers when possible in Series/DataFrame stat ops with + ``level`` parameter passed (:issue:`545`) +- Ported skiplist data structure to C to speed up ``rolling_median`` by about + 5-10x in most typical use cases (:issue:`374`) +- Some performance enhancements in constructing a Panel from a dict of + DataFrame objects +- Made ``Index._get_duplicates`` a public method by removing the underscore +- Prettier printing of floats, and column spacing fix (:issue:`395`, :issue:`571`) +- Add ``bold_rows`` option to DataFrame.to_html (:issue:`586`) +- Improve the performance of ``DataFrame.sort_index`` by up to 5x or more + when sorting by multiple columns +- Substantially improve performance of DataFrame and Series constructors when + passed a nested dict or dict, respectively (:issue:`540`, :issue:`621`) +- Modified setup.py so that pip / setuptools will install dependencies (GH + :issue:`507`, various pull requests) +- Unstack called on DataFrame with non-MultiIndex will return Series (GH + :issue:`477`) +- Improve DataFrame.to_string and console formatting to be more consistent in + the number of displayed digits (:issue:`395`) +- Use bottleneck if available for performing NaN-friendly statistical + operations that it implemented (:issue:`91`) +- Monkey-patch context to traceback in ``DataFrame.apply`` to indicate which + row/column the function application failed on (:issue:`614`) +- Improved ability of read_table and read_clipboard to parse + console-formatted DataFrames (can read the row of index names, etc.) 
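+
+As a small illustration of the new ``crosstab`` function listed above (a
+sketch added for this note only; the data are made up):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   colors = pd.Series(["red", "blue", "red", "blue", "red"], name="color")
+   sizes = pd.Series(["S", "S", "L", "L", "L"], name="size")
+
+   # frequency table of color versus size
+   table = pd.crosstab(colors, sizes)
+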
+- Can pass list of group labels (without having to convert to an ndarray + yourself) to ``groupby`` in some cases (:issue:`659`) +- Use ``kind`` argument to Series.order for selecting different sort kinds + (:issue:`668`) +- Add option to Series.to_csv to omit the index (:issue:`684`) +- Add ``delimiter`` as an alternative to ``sep`` in ``read_csv`` and other + parsing functions +- Substantially improved performance of groupby on DataFrames with many + columns by aggregating blocks of columns all at once (:issue:`745`) +- Can pass a file handle or StringIO to Series/DataFrame.to_csv (:issue:`765`) +- Can pass sequence of integers to DataFrame.irow(icol) and Series.iget, (GH + :issue:`654`) +- Prototypes for some vectorized string functions +- Add float64 hash table to solve the Series.unique problem with NAs (:issue:`714`) +- Memoize objects when reading from file to reduce memory footprint +- Can get and set a column of a DataFrame with hierarchical columns + containing "empty" ('') lower levels without passing the empty levels (PR + :issue:`768`) + +Bug Fixes +~~~~~~~~~ + +- Raise exception in out-of-bounds indexing of Series instead of + seg-faulting, regression from earlier releases (:issue:`495`) +- Fix error when joining DataFrames of different dtypes within the same + typeclass (e.g. float32 and float64) (:issue:`486`) +- Fix bug in Series.min/Series.max on objects like datetime.datetime (GH + :issue:`487`) +- Preserve index names in Index.union (:issue:`501`) +- Fix bug in Index joining causing subclass information (like DateRange type) + to be lost in some cases (:issue:`500`) +- Accept empty list as input to DataFrame constructor, regression from 0.6.0 + (:issue:`491`) +- Can output DataFrame and Series with ndarray objects in a dtype=object + array (:issue:`490`) +- Return empty string from Series.to_string when called on empty Series (GH + :issue:`488`) +- Fix exception passing empty list to DataFrame.from_records +- Fix Index.format bug (excluding name field) with datetimes with time info +- Fix scalar value access in Series to always return NumPy scalars, + regression from prior versions (:issue:`510`) +- Handle rows skipped at beginning of file in read_* functions (:issue:`505`) +- Handle improper dtype casting in ``set_value`` methods +- Unary '-' / __neg__ operator on DataFrame was returning integer values +- Unbox 0-dim ndarrays from certain operators like all, any in Series +- Fix handling of missing columns (was combine_first-specific) in + DataFrame.combine for general case (:issue:`529`) +- Fix type inference logic with boolean lists and arrays in DataFrame indexing +- Use centered sum of squares in R-square computation if entity_effects=True + in panel regression +- Handle all NA case in Series.{corr, cov}, was raising exception (:issue:`548`) +- Aggregating by multiple levels with ``level`` argument to DataFrame, Series + stat method, was broken (:issue:`545`) +- Fix Cython buf when converter passed to read_csv produced a numeric array + (buffer dtype mismatch when passed to Cython type inference function) (GH + :issue:`546`) +- Fix exception when setting scalar value using .ix on a DataFrame with a + MultiIndex (:issue:`551`) +- Fix outer join between two DateRanges with different offsets that returned + an invalid DateRange +- Cleanup DataFrame.from_records failure where index argument is an integer +- Fix Data.from_records failure when passed a dictionary +- Fix NA handling in {Series, DataFrame}.rank with non-floating point dtypes +- Fix bug related to integer 
type-checking in .ix-based indexing +- Handle non-string index name passed to DataFrame.from_records +- DataFrame.insert caused the columns name(s) field to be discarded (:issue:`527`) +- Fix erroneous in monotonic many-to-one left joins +- Fix DataFrame.to_string to remove extra column white space (:issue:`571`) +- Format floats to default to same number of digits (:issue:`395`) +- Added decorator to copy docstring from one function to another (:issue:`449`) +- Fix error in monotonic many-to-one left joins +- Fix __eq__ comparison between DateOffsets with different relativedelta + keywords passed +- Fix exception caused by parser converter returning strings (:issue:`583`) +- Fix MultiIndex formatting bug with integer names (:issue:`601`) +- Fix bug in handling of non-numeric aggregates in Series.groupby (:issue:`612`) +- Fix TypeError with tuple subclasses (e.g. namedtuple) in + DataFrame.from_records (:issue:`611`) +- Catch misreported console size when running IPython within Emacs +- Fix minor bug in pivot table margins, loss of index names and length-1 + 'All' tuple in row labels +- Add support for legacy WidePanel objects to be read from HDFStore +- Fix out-of-bounds segfault in pad_object and backfill_object methods when + either source or target array are empty +- Could not create a new column in a DataFrame from a list of tuples +- Fix bugs preventing SparseDataFrame and SparseSeries working with groupby + (:issue:`666`) +- Use sort kind in Series.sort / argsort (:issue:`668`) +- Fix DataFrame operations on non-scalar, non-pandas objects (:issue:`672`) +- Don't convert DataFrame column to integer type when passing integer to + __setitem__ (:issue:`669`) +- Fix downstream bug in pivot_table caused by integer level names in + MultiIndex (:issue:`678`) +- Fix SparseSeries.combine_first when passed a dense Series (:issue:`687`) +- Fix performance regression in HDFStore loading when DataFrame or Panel + stored in table format with datetimes +- Raise Exception in DateRange when offset with n=0 is passed (:issue:`683`) +- Fix get/set inconsistency with .ix property and integer location but + non-integer index (:issue:`707`) +- Use right dropna function for SparseSeries. Return dense Series for NA fill + value (:issue:`730`) +- Fix Index.format bug causing incorrectly string-formatted Series with + datetime indexes (:issue:`726`, :issue:`758`) +- Fix errors caused by object dtype arrays passed to ols (:issue:`759`) +- Fix error where column names lost when passing list of labels to + DataFrame.__getitem__, (:issue:`662`) +- Fix error whereby top-level week iterator overwrote week instance +- Fix circular reference causing memory leak in sparse array / series / + frame, (:issue:`663`) +- Fix integer-slicing from integers-as-floats (:issue:`670`) +- Fix zero division errors in nanops from object dtype arrays in all NA case + (:issue:`676`) +- Fix csv encoding when using unicode (:issue:`705`, :issue:`717`, :issue:`738`) +- Fix assumption that each object contains every unique block type in concat, + (:issue:`708`) +- Fix sortedness check of multiindex in to_panel (:issue:`719`, 720) +- Fix that None was not treated as NA in PyObjectHashtable +- Fix hashing dtype because of endianness confusion (:issue:`747`, :issue:`748`) +- Fix SparseSeries.dropna to return dense Series in case of NA fill value (GH + :issue:`730`) +- Use map_infer instead of np.vectorize. 
handle NA sentinels if converter + yields numeric array, (:issue:`753`) +- Fixes and improvements to DataFrame.rank (:issue:`742`) +- Fix catching AttributeError instead of NameError for bottleneck +- Try to cast non-MultiIndex to better dtype when calling reset_index (:issue:`726` + :issue:`440`) +- Fix #1.QNAN0' float bug on 2.6/win64 +- Allow subclasses of dicts in DataFrame constructor, with tests +- Fix problem whereby set_index destroys column multiindex (:issue:`764`) +- Hack around bug in generating DateRange from naive DateOffset (:issue:`770`) +- Fix bug in DateRange.intersection causing incorrect results with some + overlapping ranges (:issue:`771`) + +Thanks +~~~~~~ + +- Craig Austin +- Chris Billington +- Marius Cobzarenco +- Mario Gamboa-Cavazos +- Hans-Martin Gaudecker +- Arthur Gerigk +- Yaroslav Halchenko +- Jeff Hammerbacher +- Matt Harrison +- Andreas Hilboll +- Luc Kesters +- Adam Klein +- Gregg Lind +- Solomon Negusse +- Wouter Overmeire +- Christian Prinoth +- Jeff Reback +- Sam Reckoner +- Craig Reeson +- Jan Schulz +- Skipper Seabold +- Ted Square +- Graham Taylor +- Aman Thakral +- Chris Uga +- Dieter Vandenbussche +- Texas P. +- Pinxing Ye +- ... and everyone I forgot + +pandas 0.6.1 +------------ + +**Release date:** 12/13/2011 + +API Changes +~~~~~~~~~~~ + +- Rename `names` argument in DataFrame.from_records to `columns`. Add + deprecation warning +- Boolean get/set operations on Series with boolean Series will reindex + instead of requiring that the indexes be exactly equal (:issue:`429`) + +New Features +~~~~~~~~~~~~ + +- Can pass Series to DataFrame.append with ignore_index=True for appending a + single row (:issue:`430`) +- Add Spearman and Kendall correlation options to Series.corr and + DataFrame.corr (:issue:`428`) +- Add new `get_value` and `set_value` methods to Series, DataFrame, and Panel + to very low-overhead access to scalar elements. df.get_value(row, column) + is about 3x faster than df[column][row] by handling fewer cases (:issue:`437`, + :issue:`438`). Add similar methods to sparse data structures for compatibility +- Add Qt table widget to sandbox (:issue:`435`) +- DataFrame.align can accept Series arguments, add axis keyword (:issue:`461`) +- Implement new SparseList and SparseArray data structures. 
SparseSeries now + derives from SparseArray (:issue:`463`) +- max_columns / max_rows options in set_printoptions (:issue:`453`) +- Implement Series.rank and DataFrame.rank, fast versions of + scipy.stats.rankdata (:issue:`428`) +- Implement DataFrame.from_items alternate constructor (:issue:`444`) +- DataFrame.convert_objects method for inferring better dtypes for object + columns (:issue:`302`) +- Add rolling_corr_pairwise function for computing Panel of correlation + matrices (:issue:`189`) +- Add `margins` option to `pivot_table` for computing subgroup aggregates (GH + :issue:`114`) +- Add `Series.from_csv` function (:issue:`482`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improve memory usage of `DataFrame.describe` (do not copy data + unnecessarily) (:issue:`425`) +- Use same formatting function for outputting floating point Series to console + as in DataFrame (:issue:`420`) +- DataFrame.delevel will try to infer better dtype for new columns (:issue:`440`) +- Exclude non-numeric types in DataFrame.{corr, cov} +- Override Index.astype to enable dtype casting (:issue:`412`) +- Use same float formatting function for Series.__repr__ (:issue:`420`) +- Use available console width to output DataFrame columns (:issue:`453`) +- Accept ndarrays when setting items in Panel (:issue:`452`) +- Infer console width when printing __repr__ of DataFrame to console (PR + :issue:`453`) +- Optimize scalar value lookups in the general case by 25% or more in Series + and DataFrame +- Can pass DataFrame/DataFrame and DataFrame/Series to + rolling_corr/rolling_cov (:issue:`462`) +- Fix performance regression in cross-sectional count in DataFrame, affecting + DataFrame.dropna speed +- Column deletion in DataFrame copies no data (computes views on blocks) (GH + :issue:`158`) +- MultiIndex.get_level_values can take the level name +- More helpful error message when DataFrame.plot fails on one of the columns + (:issue:`478`) +- Improve performance of DataFrame.{index, columns} attribute lookup + +Bug Fixes +~~~~~~~~~ + +- Fix O(K^2) memory leak caused by inserting many columns without + consolidating, had been present since 0.4.0 (:issue:`467`) +- `DataFrame.count` should return Series with zero instead of NA with length-0 + axis (:issue:`423`) +- Fix Yahoo! Finance API usage in pandas.io.data (:issue:`419`, :issue:`427`) +- Fix upstream bug causing failure in Series.align with empty Series (:issue:`434`) +- Function passed to DataFrame.apply can return a list, as long as it's the + right length. 
Regression from 0.4 (:issue:`432`) +- Don't "accidentally" upcast scalar values when indexing using .ix (:issue:`431`) +- Fix groupby exception raised with as_index=False and single column selected + (:issue:`421`) +- Implement DateOffset.__ne__ causing downstream bug (:issue:`456`) +- Fix __doc__-related issue when converting py -> pyo with py2exe +- Bug fix in left join Cython code with duplicate monotonic labels +- Fix bug when unstacking multiple levels described in :issue:`451` +- Exclude NA values in dtype=object arrays, regression from 0.5.0 (:issue:`469`) +- Use Cython map_infer function in DataFrame.applymap to properly infer + output type, handle tuple return values and other things that were breaking + (:issue:`465`) +- Handle floating point index values in HDFStore (:issue:`454`) +- Fixed stale column reference bug (cached Series object) caused by type + change / item deletion in DataFrame (:issue:`473`) +- Index.get_loc should always raise Exception when there are duplicates +- Handle differently-indexed Series input to DataFrame constructor (:issue:`475`) +- Omit nuisance columns in multi-groupby with Python function +- Buglet in handling of single grouping in general apply +- Handle type inference properly when passing list of lists or tuples to + DataFrame constructor (:issue:`484`) +- Preserve Index / MultiIndex names in GroupBy.apply concatenation step (GH + :issue:`481`) + +Thanks +~~~~~~ + +- Ralph Bean +- Luca Beltrame +- Marius Cobzarenco +- Andreas Hilboll +- Jev Kuznetsov +- Adam Lichtenstein +- Wouter Overmeire +- Fernando Perez +- Nathan Pinger +- Christian Prinoth +- Alex Reyfman +- Joon Ro +- Chang She +- Ted Square +- Chris Uga +- Dieter Vandenbussche + +pandas 0.6.0 +------------ + +**Release date:** 11/25/2011 + +API Changes +~~~~~~~~~~~ + +- Arithmetic methods like `sum` will attempt to sum dtype=object values by + default instead of excluding them (:issue:`382`) + +New Features +~~~~~~~~~~~~ + +- Add `melt` function to `pandas.core.reshape` +- Add `level` parameter to group by level in Series and DataFrame + descriptive statistics (:issue:`313`) +- Add `head` and `tail` methods to Series, analogous to to DataFrame (PR + :issue:`296`) +- Add `Series.isin` function which checks if each value is contained in a + passed sequence (:issue:`289`) +- Add `float_format` option to `Series.to_string` +- Add `skip_footer` (:issue:`291`) and `converters` (:issue:`343`) options to + `read_csv` and `read_table` +- Add proper, tested weighted least squares to standard and panel OLS (GH + :issue:`303`) +- Add `drop_duplicates` and `duplicated` functions for removing duplicate + DataFrame rows and checking for duplicate rows, respectively (:issue:`319`) +- Implement logical (boolean) operators ``&``, ``|``, ``^`` on DataFrame + (:issue:`347`) +- Add `Series.mad`, mean absolute deviation, matching DataFrame +- Add `QuarterEnd` DateOffset (:issue:`321`) +- Add matrix multiplication function `dot` to DataFrame (:issue:`65`) +- Add `orient` option to `Panel.from_dict` to ease creation of mixed-type + Panels (:issue:`359`, :issue:`301`) +- Add `DataFrame.from_dict` with similar `orient` option +- Can now pass list of tuples or list of lists to `DataFrame.from_records` + for fast conversion to DataFrame (:issue:`357`) +- Can pass multiple levels to groupby, e.g. 
`df.groupby(level=[0, 1])` (GH + :issue:`103`) +- Can sort by multiple columns in `DataFrame.sort_index` (:issue:`92`, :issue:`362`) +- Add fast `get_value` and `put_value` methods to DataFrame and + micro-performance tweaks (:issue:`360`) +- Add `cov` instance methods to Series and DataFrame (:issue:`194`, :issue:`362`) +- Add bar plot option to `DataFrame.plot` (:issue:`348`) +- Add `idxmin` and `idxmax` functions to Series and DataFrame for computing + index labels achieving maximum and minimum values (:issue:`286`) +- Add `read_clipboard` function for parsing DataFrame from OS clipboard, + should work across platforms (:issue:`300`) +- Add `nunique` function to Series for counting unique elements (:issue:`297`) +- DataFrame constructor will use Series name if no columns passed (:issue:`373`) +- Support regular expressions and longer delimiters in read_table/read_csv, + but does not handle quoted strings yet (:issue:`364`) +- Add `DataFrame.to_html` for formatting DataFrame to HTML (:issue:`387`) +- MaskedArray can be passed to DataFrame constructor and masked values will be + converted to NaN (:issue:`396`) +- Add `DataFrame.boxplot` function (:issue:`368`, others) +- Can pass extra args, kwds to DataFrame.apply (:issue:`376`) + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Raise more helpful exception if date parsing fails in DateRange (:issue:`298`) +- Vastly improved performance of GroupBy on axes with a MultiIndex (:issue:`299`) +- Print level names in hierarchical index in Series repr (:issue:`305`) +- Return DataFrame when performing GroupBy on selected column and + as_index=False (:issue:`308`) +- Can pass vector to `on` argument in `DataFrame.join` (:issue:`312`) +- Don't show Series name if it's None in the repr, also omit length for short + Series (:issue:`317`) +- Show legend by default in `DataFrame.plot`, add `legend` boolean flag (GH + :issue:`324`) +- Significantly improved performance of `Series.order`, which also makes + np.unique called on a Series faster (:issue:`327`) +- Faster cythonized count by level in Series and DataFrame (:issue:`341`) +- Raise exception if dateutil 2.0 installed on Python 2.x runtime (:issue:`346`) +- Significant GroupBy performance enhancement with multiple keys with many + "empty" combinations +- New Cython vectorized function `map_infer` speeds up `Series.apply` and + `Series.map` significantly when passed elementwise Python function, + motivated by :issue:`355` +- Cythonized `cache_readonly`, resulting in substantial micro-performance + enhancements throughout the codebase (:issue:`361`) +- Special Cython matrix iterator for applying arbitrary reduction operations + with 3-5x better performance than `np.apply_along_axis` (:issue:`309`) +- Add `raw` option to `DataFrame.apply` for getting better performance when + the passed function only requires an ndarray (:issue:`309`) +- Improve performance of `MultiIndex.from_tuples` +- Can pass multiple levels to `stack` and `unstack` (:issue:`370`) +- Can pass multiple values columns to `pivot_table` (:issue:`381`) +- Can call `DataFrame.delevel` with standard Index with name set (:issue:`393`) +- Use Series name in GroupBy for result index (:issue:`363`) +- Refactor Series/DataFrame stat methods to use common set of NaN-friendly + function +- Handle NumPy scalar integers at C level in Cython conversion routines + +Bug Fixes +~~~~~~~~~ + +- Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index + name (:issue:`290`) +- DataFrame should clear its Series 
caches on consolidation, was causing + "stale" Series to be returned in some corner cases (:issue:`304`) +- DataFrame constructor failed if a column had a list of tuples (:issue:`293`) +- Ensure that `Series.apply` always returns a Series and implement + `Series.round` (:issue:`314`) +- Support boolean columns in Cythonized groupby functions (:issue:`315`) +- `DataFrame.describe` should not fail if there are no numeric columns, + instead return categorical describe (:issue:`323`) +- Fixed bug which could cause columns to be printed in wrong order in + `DataFrame.to_string` if specific list of columns passed (:issue:`325`) +- Fix legend plotting failure if DataFrame columns are integers (:issue:`326`) +- Shift start date back by one month for Yahoo! Finance API in pandas.io.data + (:issue:`329`) +- Fix `DataFrame.join` failure on unconsolidated inputs (:issue:`331`) +- DataFrame.min/max will no longer fail on mixed-type DataFrame (:issue:`337`) +- Fix `read_csv` / `read_table` failure when passing list to index_col that is + not in ascending order (:issue:`349`) +- Fix failure passing Int64Index to Index.union when both are monotonic +- Fix error when passing SparseSeries to (dense) DataFrame constructor +- Added missing bang at top of setup.py (:issue:`352`) +- Change `is_monotonic` on MultiIndex so it properly compares the tuples +- Fix MultiIndex outer join logic (:issue:`351`) +- Set index name attribute with single-key groupby (:issue:`358`) +- Bug fix in reflexive binary addition in Series and DataFrame for + non-commutative operations (like string concatenation) (:issue:`353`) +- setupegg.py will invoke Cython (:issue:`192`) +- Fix block consolidation bug after inserting column into MultiIndex (:issue:`366`) +- Fix bug in join operations between Index and Int64Index (:issue:`367`) +- Handle min_periods=0 case in moving window functions (:issue:`365`) +- Fixed corner cases in DataFrame.apply/pivot with empty DataFrame (:issue:`378`) +- Fixed repr exception when Series name is a tuple +- Always return DateRange from `asfreq` (:issue:`390`) +- Pass level names to `swaplavel` (:issue:`379`) +- Don't lose index names in `MultiIndex.droplevel` (:issue:`394`) +- Infer more proper return type in `DataFrame.apply` when no columns or rows + depending on whether the passed function is a reduction (:issue:`389`) +- Always return NA/NaN from Series.min/max and DataFrame.min/max when all of a + row/column/values are NA (:issue:`384`) +- Enable partial setting with .ix / advanced indexing (:issue:`397`) +- Handle mixed-type DataFrames correctly in unstack, do not lose type + information (:issue:`403`) +- Fix integer name formatting bug in Index.format and in Series.__repr__ +- Handle label types other than string passed to groupby (:issue:`405`) +- Fix bug in .ix-based indexing with partial retrieval when a label is not + contained in a level +- Index name was not being pickled (:issue:`408`) +- Level name should be passed to result index in GroupBy.apply (:issue:`416`) + +Thanks +~~~~~~ + +- Craig Austin +- Marius Cobzarenco +- Joel Cross +- Jeff Hammerbacher +- Adam Klein +- Thomas Kluyver +- Jev Kuznetsov +- Kieran O'Mahony +- Wouter Overmeire +- Nathan Pinger +- Christian Prinoth +- Skipper Seabold +- Chang She +- Ted Square +- Aman Thakral +- Chris Uga +- Dieter Vandenbussche +- carljv +- rsamson + +pandas 0.5.0 +------------ + +**Release date:** 10/24/2011 + +This release of pandas includes a number of API changes (see below) and cleanup of deprecated APIs +from pre-0.4.0 releases. 
There are also bug fixes, new features, numerous significant performance enhancements, and a new ipython
+completer hook to enable tab completion of DataFrame column accesses and attributes.
+
+In addition to the changes listed here from 0.4.3 to 0.5.0, the minor releases 0.4.1,
+0.4.2, and 0.4.3 brought some significant new functionality and performance improvements that are worth taking a look at.
+
+Thanks to all for bug reports, contributed patches and generally providing feedback on the library.
+
+API Changes
+~~~~~~~~~~~
+
+- `read_table`, `read_csv`, and `ExcelFile.parse` default argument for `index_col` is now None. To use one or more of the columns as the resulting DataFrame's index, these must be explicitly specified now
+- Parsing functions like `read_csv` no longer parse dates by default (:issue:`225`)
+- Removed `weights` option in panel regression which was not doing anything principled (:issue:`155`)
+- Changed `buffer` argument name in `Series.to_string` to `buf`
+- `Series.to_string` and `DataFrame.to_string` now return strings by default instead of printing to sys.stdout
+- Deprecated `nanRep` argument in various `to_string` and `to_csv` functions in favor of `na_rep`. Will be removed in 0.6 (:issue:`275`)
+- Renamed `delimiter` to `sep` in `DataFrame.from_csv` for consistency
+- Changed order of `Series.clip` arguments to match those of `numpy.clip` and added (unimplemented) `out` argument so `numpy.clip` can be called on a Series (:issue:`272`)
+- Series functions renamed (and thus deprecated) in 0.4 series have been removed:
+
+  - `asOf`, use `asof`
+  - `toDict`, use `to_dict`
+  - `toString`, use `to_string`
+  - `toCSV`, use `to_csv`
+  - `merge`, use `map`
+  - `applymap`, use `apply`
+  - `combineFirst`, use `combine_first`
+  - `_firstTimeWithValue` use `first_valid_index`
+  - `_lastTimeWithValue` use `last_valid_index`
+
+- DataFrame functions renamed / deprecated in 0.4 series have been removed:
+
+  - `asMatrix` method, use `as_matrix` or `values` attribute
+  - `combineFirst`, use `combine_first`
+  - `getXS`, use `xs`
+  - `merge`, use `join`
+  - `fromRecords`, use `from_records`
+  - `fromcsv`, use `from_csv`
+  - `toRecords`, use `to_records`
+  - `toDict`, use `to_dict`
+  - `toString`, use `to_string`
+  - `toCSV`, use `to_csv`
+  - `_firstTimeWithValue` use `first_valid_index`
+  - `_lastTimeWithValue` use `last_valid_index`
+  - `toDataMatrix` is no longer needed
+  - `rows()` method, use `index` attribute
+  - `cols()` method, use `columns` attribute
+  - `dropEmptyRows()`, use `dropna(how='all')`
+  - `dropIncompleteRows()`, use `dropna()`
+  - `tapply(f)`, use `apply(f, axis=1)`
+  - `tgroupby(keyfunc, aggfunc)`, use `groupby` with `axis=1`
+
+Deprecations Removed
+~~~~~~~~~~~~~~~~~~~~
+
+ - `indexField` argument in `DataFrame.from_records`
+ - `missingAtEnd` argument in `Series.order`.
Use `na_last` instead + - `Series.fromValue` classmethod, use regular `Series` constructor instead + - Functions `parseCSV`, `parseText`, and `parseExcel` methods in + `pandas.io.parsers` have been removed + - `Index.asOfDate` function + - `Panel.getMinorXS` (use `minor_xs`) and `Panel.getMajorXS` (use + `major_xs`) + - `Panel.toWide`, use `Panel.to_wide` instead + +New Features +~~~~~~~~~~~~ + +- Added `DataFrame.align` method with standard join options +- Added `parse_dates` option to `read_csv` and `read_table` methods to + optionally try to parse dates in the index columns +- Add `nrows`, `chunksize`, and `iterator` arguments to `read_csv` and + `read_table`. The last two return a new `TextParser` class capable of + lazily iterating through chunks of a flat file (:issue:`242`) +- Added ability to join on multiple columns in `DataFrame.join` (:issue:`214`) +- Added private `_get_duplicates` function to `Index` for identifying + duplicate values more easily +- Added column attribute access to DataFrame, e.g. df.A equivalent to df['A'] + if 'A' is a column in the DataFrame (:issue:`213`) +- Added IPython tab completion hook for DataFrame columns. (:issue:`233`, :issue:`230`) +- Implement `Series.describe` for Series containing objects (:issue:`241`) +- Add inner join option to `DataFrame.join` when joining on key(s) (:issue:`248`) +- Can select set of DataFrame columns by passing a list to `__getitem__` (GH + :issue:`253`) +- Can use & and | to intersection / union Index objects, respectively (GH + :issue:`261`) +- Added `pivot_table` convenience function to pandas namespace (:issue:`234`) +- Implemented `Panel.rename_axis` function (:issue:`243`) +- DataFrame will show index level names in console output +- Implemented `Panel.take` +- Add `set_eng_float_format` function for setting alternate DataFrame + floating point string formatting +- Add convenience `set_index` function for creating a DataFrame index from + its existing columns + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Major performance improvements in file parsing functions `read_csv` and + `read_table` +- Added Cython function for converting tuples to ndarray very fast. Speeds up + many MultiIndex-related operations +- File parsing functions like `read_csv` and `read_table` will explicitly + check if a parsed index has duplicates and raise a more helpful exception + rather than deferring the check until later +- Refactored merging / joining code into a tidy class and disabled unnecessary + computations in the float/object case, thus getting about 10% better + performance (:issue:`211`) +- Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about + 5x, regression from 0.3.0 (:issue:`215`) +- With new `DataFrame.align` method, speeding up binary operations between + differently-indexed DataFrame objects by 10-25%. 
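+
+A rough sketch of the ``DataFrame.align`` method referred to above
+(illustrative data only, not taken from the original notes):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   left = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
+   right = pd.DataFrame({"y": [4, 5]}, index=["b", "c"])
+
+   # both frames are reindexed to a common (here: union) index, so that
+   # subsequent binary operations line up row-for-row
+   left_aligned, right_aligned = left.align(right, join="outer")
+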
+- Significantly sped up conversion of nested dict into DataFrame (:issue:`212`)
+- Can pass hierarchical index level name to `groupby` instead of the level number if desired (:issue:`223`)
+- Add support for different delimiters in `DataFrame.to_csv` (:issue:`244`)
+- Add more helpful error message when importing pandas post-installation from the source directory (:issue:`250`)
+- Significantly speed up DataFrame `__repr__` and `count` on large mixed-type DataFrame objects
+- Better handling of pyx file dependencies in Cython module build (:issue:`271`)
+
+Bug Fixes
+~~~~~~~~~
+
+- `read_csv` / `read_table` fixes
+
+  - Be less aggressive about converting float->int in cases of floating point representations of integers like 1.0, 2.0, etc.
+  - "True"/"False" will not get correctly converted to boolean
+  - Index name attribute will get set when specifying an index column
+  - Passing column names should force `header=None` (:issue:`257`)
+  - Don't modify passed column names when `index_col` is not None (:issue:`258`)
+  - Can sniff CSV separator in zip file (since seek is not supported, was failing before)
+
+- Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should be reported upstream to matplotlib (:issue:`224`)
+- DataFrame.iteritems was not returning Series with the name attribute set. Also neither was DataFrame._series
+- Can store datetime.date objects in HDFStore (:issue:`231`)
+- Index and Series names are now stored in HDFStore
+- Fixed problem in which data would get upcast to object dtype in GroupBy.apply operations (:issue:`237`)
+- Fixed outer join bug with empty DataFrame (:issue:`238`)
+- Can create empty Panel (:issue:`239`)
+- Fix join on single key when passing list with 1 entry (:issue:`246`)
+- Don't raise Exception on plotting DataFrame with an all-NA column (:issue:`251`, :issue:`254`)
+- Bug min/max errors when called on integer DataFrames (:issue:`241`)
+- `DataFrame.iteritems` and `DataFrame._series` not assigning name attribute
+- Panel.__repr__ raised exception on length-0 major/minor axes
+- `DataFrame.join` on key with empty DataFrame produced incorrect columns
+- Implemented `MultiIndex.diff` (:issue:`260`)
+- `Int64Index.take` and `MultiIndex.take` lost name field, fix downstream issue :issue:`262`
+- Can pass list of tuples to `Series` (:issue:`270`)
+- Can pass level name to `DataFrame.stack`
+- Support set operations between MultiIndex and Index
+- Fix many corner cases in MultiIndex set operations
+- Fix MultiIndex-handling bug with GroupBy.apply when returned groups are not indexed the same
+- Fix corner case bugs in DataFrame.apply
+- Setting DataFrame index did not cause Series cache to get cleared
+- Various int32 -> int64 platform-specific issues
+- Don't be too aggressive converting to integer when parsing file with MultiIndex (:issue:`285`)
+- Fix bug when slicing Series with negative indices before beginning
+
+Thanks
+~~~~~~
+
+- Thomas Kluyver
+- Daniel Fortunov
+- Aman Thakral
+- Luca Beltrame
+- Wouter Overmeire
+
+pandas 0.4.3
+------------
+
+**Release date:** 10/9/2011
+
+This is largely a bugfix release from 0.4.2 but also includes a handful of new
+and enhanced features. Also, pandas can now be installed and used on Python 3
+(thanks Thomas Kluyver!).
+
+New Features
+~~~~~~~~~~~~
+
+- Python 3 support using 2to3 (:issue:`200`, Thomas Kluyver)
+- Add `name` attribute to `Series` and added relevant logic and tests. Name now prints as part of `Series.__repr__`
+- Add `name` attribute to standard Index so that stacking / unstacking does not discard names and so that indexed DataFrame objects can be reliably round-tripped to flat files, pickle, HDF5, etc.
+- Add `isnull` and `notnull` as instance methods on Series (:issue:`209`, :issue:`203`)
+
+Improvements to existing features
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Skip xlrd-related unit tests if not installed
+- `Index.append` and `MultiIndex.append` can accept a list of Index objects to concatenate together
+- Altered binary operations on differently-indexed SparseSeries objects to use the integer-based (dense) alignment logic which is faster with a larger number of blocks (:issue:`205`)
+- Refactored `Series.__repr__` to be a bit more clean and consistent
+
+API Changes
+~~~~~~~~~~~
+
+- `Series.describe` and `DataFrame.describe` now return the 25% and 75% quartiles instead of the 10% and 90% deciles. The other outputs have not changed
+- `Series.toString` will print deprecation warning, has been de-camelCased to `to_string`
+
+Bug Fixes
+~~~~~~~~~
+
+- Fix broken interaction between `Index` and `Int64Index` when calling intersection. Implement `Int64Index.intersection`
+- `MultiIndex.sortlevel` discarded the level names (:issue:`202`)
+- Fix bugs in groupby, join, and append due to improper concatenation of `MultiIndex` objects (:issue:`201`)
+- Fix regression from 0.4.1, `isnull` and `notnull` ceased to work on other kinds of Python scalar objects like `datetime.datetime`
+- Raise more helpful exception when attempting to write empty DataFrame or LongPanel to `HDFStore` (:issue:`204`)
+- Use stdlib csv module to properly escape strings with commas in `DataFrame.to_csv` (:issue:`206`, Thomas Kluyver)
+- Fix Python ndarray access in Cython code for sparse blocked index integrity check
+- Fix bug writing Series to CSV in Python 3 (:issue:`209`)
+- Miscellaneous Python 3 bugfixes
+
+Thanks
+~~~~~~
+
+- Thomas Kluyver
+- rsamson
+
+pandas 0.4.2
+------------
+
+**Release date:** 10/3/2011
+
+This is a performance optimization release with several bug fixes. The new
+Int64Index and new merging / joining Cython code and related Python
+infrastructure are the main new additions.
+
+New Features
+~~~~~~~~~~~~
+
+- Added fast `Int64Index` type with specialized join, union, intersection (see the sketch below). Will result in significant performance enhancements for int64-based time series (e.g. using NumPy's datetime64 one day) and also faster operations on DataFrame objects storing record array-like data.
+- Refactored `Index` classes to have a `join` method and associated data alignment routines throughout the codebase to be able to leverage optimized joining / merging routines.
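+
+A short sketch of the specialized set operations on integer indexes mentioned
+above (illustrative only; an integer-valued ``Index`` uses the fast
+``Int64Index`` machinery):
+
+.. code-block:: python
+
+   import pandas as pd
+
+   left = pd.Index([1, 2, 3, 4])
+   right = pd.Index([3, 4, 5, 6])
+
+   left.union(right)         # union computed with the specialized routines
+   left.intersection(right)  # fast intersection on the shared values
+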
+- Added `Series.align` method for aligning two series with choice of join method
+- Wrote faster Cython data alignment / merging routines resulting in substantial speed increases
+- Added `is_monotonic` property to `Index` classes with associated Cython code to evaluate the monotonicity of the `Index` values
+- Add method `get_level_values` to `MultiIndex`
+- Implemented shallow copy of `BlockManager` object in `DataFrame` internals
+
+Improvements to existing features
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Improved performance of `isnull` and `notnull`, a regression from v0.3.0 (:issue:`187`)
+- Wrote templating / code generation script to auto-generate Cython code for various functions which need to be available for the 4 major data types used in pandas (float64, bool, object, int64)
+- Refactored code related to `DataFrame.join` so that intermediate aligned copies of the data in each `DataFrame` argument do not need to be created. Substantial performance increases result (:issue:`176`)
+- Substantially improved performance of generic `Index.intersection` and `Index.union`
+- Improved performance of `DateRange.union` with overlapping ranges and non-cacheable offsets (like Minute). Implemented analogous fast `DateRange.intersection` for overlapping ranges.
+- Implemented `BlockManager.take` resulting in significantly faster `take` performance on mixed-type `DataFrame` objects (:issue:`104`)
+- Improved performance of `Series.sort_index`
+- Significant groupby performance enhancement: removed unnecessary integrity checks in DataFrame internals that were slowing down slicing operations to retrieve groups
+- Added informative Exception when passing dict to DataFrame groupby aggregation with axis != 0
+
+API Changes
+~~~~~~~~~~~
+
+Bug Fixes
+~~~~~~~~~
+
+- Fixed minor unhandled exception in Cython code implementing fast groupby aggregation operations
+- Fixed bug in unstacking code manifesting with more than 3 hierarchical levels
+- Throw exception when step specified in label-based slice (:issue:`185`)
+- Fix isnull to correctly work with np.float32. Fix upstream bug described in :issue:`182`
+- Finish implementation of as_index=False in groupby for DataFrame aggregation (:issue:`181`)
+- Raise SkipTest for pre-epoch HDFStore failure. Real fix will be sorted out via datetime64 dtype
+
+Thanks
+~~~~~~
+
+- Uri Laserson
+- Scott Sinclair
+
+pandas 0.4.1
+------------
+
+**Release date:** 9/25/2011
+
+This is primarily a bug fix release but includes some new features and
+improvements.
+
+New Features
+~~~~~~~~~~~~
+
+- Added new `DataFrame` methods `get_dtype_counts` and property `dtypes`
+- Setting of values using ``.ix`` indexing attribute in mixed-type DataFrame objects has been implemented (fixes :issue:`135`)
+- `read_csv` can read multiple columns into a `MultiIndex`. DataFrame's `to_csv` method will properly write out a `MultiIndex` which can be read back (:issue:`151`, thanks to Skipper Seabold)
+- Wrote fast time series merging / joining methods in Cython.
Will be + integrated later into DataFrame.join and related functions +- Added `ignore_index` option to `DataFrame.append` for combining unindexed + records stored in a DataFrame + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Some speed enhancements with internal Index type-checking function +- `DataFrame.rename` has a new `copy` parameter which can rename a DataFrame + in place +- Enable unstacking by level name (:issue:`142`) +- Enable sortlevel to work by level name (:issue:`141`) +- `read_csv` can automatically "sniff" other kinds of delimiters using + `csv.Sniffer` (:issue:`146`) +- Improved speed of unit test suite by about 40% +- Exception will not be raised calling `HDFStore.remove` on non-existent node + with where clause +- Optimized `_ensure_index` function resulting in performance savings in + type-checking Index objects + +API Changes +~~~~~~~~~~~ + +Bug Fixes +~~~~~~~~~ + +- Fixed DataFrame constructor bug causing downstream problems (e.g. .copy() + failing) when passing a Series as the values along with a column name and + index +- Fixed single-key groupby on DataFrame with as_index=False (:issue:`160`) +- `Series.shift` was failing on integer Series (:issue:`154`) +- `unstack` methods were producing incorrect output in the case of duplicate + hierarchical labels. An exception will now be raised (:issue:`147`) +- Calling `count` with level argument caused reduceat failure or segfault in + earlier NumPy (:issue:`169`) +- Fixed `DataFrame.corrwith` to automatically exclude non-numeric data (GH + :issue:`144`) +- Unicode handling bug fixes in `DataFrame.to_string` (:issue:`138`) +- Excluding OLS degenerate unit test case that was causing platform specific + failure (:issue:`149`) +- Skip blosc-dependent unit tests for PyTables < 2.2 (:issue:`137`) +- Calling `copy` on `DateRange` did not copy over attributes to the new object + (:issue:`168`) +- Fix bug in `HDFStore` in which Panel data could be appended to a Table with + different item order, thus resulting in an incorrect result read back + +Thanks +~~~~~~ + +- Yaroslav Halchenko +- Jeff Reback +- Skipper Seabold +- Dan Lovell +- Nick Pentreath + +pandas 0.4.0 +------------ + +**Release date:** 9/12/2011 + +New Features +~~~~~~~~~~~~ + +- `pandas.core.sparse` module: "Sparse" (mostly-NA, or some other fill value) + versions of `Series`, `DataFrame`, and `Panel`. For low-density data, this + will result in significant performance boosts, and smaller memory + footprint. Added `to_sparse` methods to `Series`, `DataFrame`, and + `Panel`. See online documentation for more on these +- Fancy indexing operator on Series / DataFrame, e.g. via .ix operator. Both + getting and setting of values is supported; however, setting values will only + currently work on homogeneously-typed DataFrame objects. Things like: + + - series.ix[[d1, d2, d3]] + - frame.ix[5:10, ['C', 'B', 'A']], frame.ix[5:10, 'A':'C'] + - frame.ix[date1:date2] + +- Significantly enhanced `groupby` functionality + + - Can groupby multiple keys, e.g. df.groupby(['key1', 'key2']). Iteration with + multiple groupings products a flattened tuple + - "Nuisance" columns (non-aggregatable) will automatically be excluded from + DataFrame aggregation operations + - Added automatic "dispatching to Series / DataFrame methods to more easily + invoke methods on groups. e.g. s.groupby(crit).std() will work even though + `std` is not implemented on the `GroupBy` class + +- Hierarchical / multi-level indexing + + - New the `MultiIndex` class. 
Integrated `MultiIndex` into `Series` and + `DataFrame` fancy indexing, slicing, __getitem__ and __setitem, + reindexing, etc. Added `level` keyword argument to `groupby` to enable + grouping by a level of a `MultiIndex` + +- New data reshaping functions: `stack` and `unstack` on DataFrame and Series + + - Integrate with MultiIndex to enable sophisticated reshaping of data + +- `Index` objects (labels for axes) are now capable of holding tuples +- `Series.describe`, `DataFrame.describe`: produces an R-like table of summary + statistics about each data column +- `DataFrame.quantile`, `Series.quantile` for computing sample quantiles of data + across requested axis +- Added general `DataFrame.dropna` method to replace `dropIncompleteRows` and + `dropEmptyRows`, deprecated those. +- `Series` arithmetic methods with optional fill_value for missing data, + e.g. a.add(b, fill_value=0). If a location is missing for both it will still + be missing in the result though. +- fill_value option has been added to `DataFrame`.{add, mul, sub, div} methods + similar to `Series` +- Boolean indexing with `DataFrame` objects: data[data > 0.1] = 0.1 or + data[data> other] = 1. +- `pytz` / tzinfo support in `DateRange` + + - `tz_localize`, `tz_normalize`, and `tz_validate` methods added + +- Added `ExcelFile` class to `pandas.io.parsers` for parsing multiple sheets out + of a single Excel 2003 document +- `GroupBy` aggregations can now optionally *broadcast*, e.g. produce an object + of the same size with the aggregated value propagated +- Added `select` function in all data structures: reindex axis based on + arbitrary criterion (function returning boolean value), + e.g. frame.select(lambda x: 'foo' in x, axis=1) +- `DataFrame.consolidate` method, API function relating to redesigned internals +- `DataFrame.insert` method for inserting column at a specified location rather + than the default __setitem__ behavior (which puts it at the end) +- `HDFStore` class in `pandas.io.pytables` has been largely rewritten using + patches from Jeff Reback from others. It now supports mixed-type `DataFrame` + and `Series` data and can store `Panel` objects. It also has the option to + query `DataFrame` and `Panel` data. Loading data from legacy `HDFStore` + files is supported explicitly in the code +- Added `set_printoptions` method to modify appearance of DataFrame tabular + output +- `rolling_quantile` functions; a moving version of `Series.quantile` / + `DataFrame.quantile` +- Generic `rolling_apply` moving window function +- New `drop` method added to `Series`, `DataFrame`, etc. which can drop a set of + labels from an axis, producing a new object +- `reindex` methods now sport a `copy` option so that data is not forced to be + copied then the resulting object is indexed the same +- Added `sort_index` methods to Series and Panel. Renamed `DataFrame.sort` + to `sort_index`. Leaving `DataFrame.sort` for now. +- Added ``skipna`` option to statistical instance methods on all the data + structures +- `pandas.io.data` module providing a consistent interface for reading time + series data from several different sources + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- The 2-dimensional `DataFrame` and `DataMatrix` classes have been extensively + redesigned internally into a single class `DataFrame`, preserving where + possible their optimal performance characteristics. This should reduce + confusion from users about which class to use. 
+ + - Note that under the hood there is a new essentially "lazy evaluation" + scheme within respect to adding columns to DataFrame. During some + operations, like-typed blocks will be "consolidated" but not before. + +- `DataFrame` accessing columns repeatedly is now significantly faster than + `DataMatrix` used to be in 0.3.0 due to an internal Series caching mechanism + (which are all views on the underlying data) +- Column ordering for mixed type data is now completely consistent in + `DataFrame`. In prior releases, there was inconsistent column ordering in + `DataMatrix` +- Improved console / string formatting of DataMatrix with negative numbers +- Improved tabular data parsing functions, `read_table` and `read_csv`: + + - Added `skiprows` and `na_values` arguments to `pandas.io.parsers` functions + for more flexible IO + - `parseCSV` / `read_csv` functions and others in `pandas.io.parsers` now can + take a list of custom NA values, and also a list of rows to skip + +- Can slice `DataFrame` and get a view of the data (when homogeneously typed), + e.g. frame.xs(idx, copy=False) or frame.ix[idx] +- Many speed optimizations throughout `Series` and `DataFrame` +- Eager evaluation of groups when calling ``groupby`` functions, so if there is + an exception with the grouping function it will raised immediately versus + sometime later on when the groups are needed +- `datetools.WeekOfMonth` offset can be parameterized with `n` different than 1 + or -1. +- Statistical methods on DataFrame like `mean`, `std`, `var`, `skew` will now + ignore non-numerical data. Before a not very useful error message was + generated. A flag `numeric_only` has been added to `DataFrame.sum` and + `DataFrame.count` to enable this behavior in those methods if so desired + (disabled by default) +- `DataFrame.pivot` generalized to enable pivoting multiple columns into a + `DataFrame` with hierarchical columns +- `DataFrame` constructor can accept structured / record arrays +- `Panel` constructor can accept a dict of DataFrame-like objects. Do not + need to use `from_dict` anymore (`from_dict` is there to stay, though). + +API Changes +~~~~~~~~~~~ + +- The `DataMatrix` variable now refers to `DataFrame`, will be removed within + two releases +- `WidePanel` is now known as `Panel`. The `WidePanel` variable in the pandas + namespace now refers to the renamed `Panel` class +- `LongPanel` and `Panel` / `WidePanel` now no longer have a common + subclass. `LongPanel` is now a subclass of `DataFrame` having a number of + additional methods and a hierarchical index instead of the old + `LongPanelIndex` object, which has been removed. Legacy `LongPanel` pickles + may not load properly +- Cython is now required to build `pandas` from a development branch. This was + done to avoid continuing to check in cythonized C files into source + control. Builds from released source distributions will not require Cython +- Cython code has been moved up to a top level `pandas/src` directory. Cython + extension modules have been renamed and promoted from the `lib` subpackage to + the top level, i.e. + + - `pandas.lib.tseries` -> `pandas._tseries` + - `pandas.lib.sparse` -> `pandas._sparse` + +- `DataFrame` pickling format has changed. Backwards compatibility for legacy + pickles is provided, but it's recommended to consider PyTables-based + `HDFStore` for storing data with a longer expected shelf life +- A `copy` argument has been added to the `DataFrame` constructor to avoid + unnecessary copying of data. 
Data is no longer copied by default when passed + into the constructor +- Handling of boolean dtype in `DataFrame` has been improved to support storage + of boolean data with NA / NaN values. Before it was being converted to float64 + so this should not (in theory) cause API breakage +- To optimize performance, Index objects now only check that their labels are + unique when uniqueness matters (i.e. when someone goes to perform a + lookup). This is a potentially dangerous tradeoff, but will lead to much + better performance in many places (like groupby). +- Boolean indexing using Series must now have the same indices (labels) +- Backwards compatibility support for begin/end/nPeriods keyword arguments in + DateRange class has been removed +- More intuitive / shorter filling aliases `ffill` (for `pad`) and `bfill` (for + `backfill`) have been added to the functions that use them: `reindex`, + `asfreq`, `fillna`. +- `pandas.core.mixins` code moved to `pandas.core.generic` +- `buffer` keyword arguments (e.g. `DataFrame.toString`) renamed to `buf` to + avoid using Python built-in name +- `DataFrame.rows()` removed (use `DataFrame.index`) +- Added deprecation warning to `DataFrame.cols()`, to be removed in next release +- `DataFrame` deprecations and de-camelCasing: `merge`, `asMatrix`, + `toDataMatrix`, `_firstTimeWithValue`, `_lastTimeWithValue`, `toRecords`, + `fromRecords`, `tgroupby`, `toString` +- `pandas.io.parsers` method deprecations + + - `parseCSV` is now `read_csv` and keyword arguments have been de-camelCased + - `parseText` is now `read_table` + - `parseExcel` is replaced by the `ExcelFile` class and its `parse` method + +- `fillMethod` arguments (deprecated in prior release) removed, should be + replaced with `method` +- `Series.fill`, `DataFrame.fill`, and `Panel.fill` removed, use `fillna` + instead +- `groupby` functions now exclude NA / NaN values from the list of groups. This + matches R behavior with NAs in factors e.g. with the `tapply` function +- Removed `parseText`, `parseCSV` and `parseExcel` from pandas namespace +- `Series.combineFunc` renamed to `Series.combine` and made a bit more general + with a `fill_value` keyword argument defaulting to NaN +- Removed `pandas.core.pytools` module. Code has been moved to + `pandas.core.common` +- Tacked on `groupName` attribute for groups in GroupBy renamed to `name` +- Panel/LongPanel `dims` attribute renamed to `shape` to be more conformant +- Slicing a `Series` returns a view now +- More Series deprecations / renaming: `toCSV` to `to_csv`, `asOf` to `asof`, + `merge` to `map`, `applymap` to `apply`, `toDict` to `to_dict`, + `combineFirst` to `combine_first`. Will print `FutureWarning`. +- `DataFrame.to_csv` does not write an "index" column label by default + anymore since the output file can be read back without it. However, there + is a new ``index_label`` argument. So you can do ``index_label='index'`` to + emulate the old behavior +- `datetools.Week` argument renamed from `dayOfWeek` to `weekday` +- `timeRule` argument in `shift` has been deprecated in favor of using the + `offset` argument for everything. 
So you can still pass a time rule string + to `offset` +- Added optional `encoding` argument to `read_csv`, `read_table`, `to_csv`, + `from_csv` to handle unicode in python 2.x + +Bug Fixes +~~~~~~~~~ + +- Column ordering in `pandas.io.parsers.parseCSV` will match CSV in the presence + of mixed-type data +- Fixed handling of Excel 2003 dates in `pandas.io.parsers` +- `DateRange` caching was happening with high resolution `DateOffset` objects, + e.g. `DateOffset(seconds=1)`. This has been fixed +- Fixed __truediv__ issue in `DataFrame` +- Fixed `DataFrame.toCSV` bug preventing IO round trips in some cases +- Fixed bug in `Series.plot` causing matplotlib to barf in exceptional cases +- Disabled `Index` objects from being hashable, like ndarrays +- Added `__ne__` implementation to `Index` so that operations like ts[ts != idx] + will work +- Added `__ne__` implementation to `DataFrame` +- Bug / unintuitive result when calling `fillna` on unordered labels +- Bug calling `sum` on boolean DataFrame +- Bug fix when creating a DataFrame from a dict with scalar values +- Series.{sum, mean, std, ...} now return NA/NaN when the whole Series is NA +- NumPy 1.4 through 1.6 compatibility fixes +- Fixed bug in bias correction in `rolling_cov`, was affecting `rolling_corr` + too +- R-square value was incorrect in the presence of fixed and time effects in + the `PanelOLS` classes +- `HDFStore` can handle duplicates in table format, will take + +Thanks +~~~~~~ + +- Joon Ro +- Michael Pennington +- Chris Uga +- Chris Withers +- Jeff Reback +- Ted Square +- Craig Austin +- William Ferreira +- Daniel Fortunov +- Tony Roberts +- Martin Felder +- John Marino +- Tim McNamara +- Justin Berka +- Dieter Vandenbussche +- Shane Conway +- Skipper Seabold +- Chris Jordan-Squire + +pandas 0.3.0 +------------ + +**Release date:** February 20, 2011 + +New features +~~~~~~~~~~~~ + +- `corrwith` function to compute column- or row-wise correlations between two + DataFrame objects +- Can boolean-index DataFrame objects, e.g. df[df > 2] = 2, px[px > last_px] = 0 +- Added comparison magic methods (__lt__, __gt__, etc.) +- Flexible explicit arithmetic methods (add, mul, sub, div, etc.) +- Added `reindex_like` method +- Added `reindex_like` method to WidePanel +- Convenience functions for accessing SQL-like databases in `pandas.io.sql` + module +- Added (still experimental) HDFStore class for storing pandas data + structures using HDF5 / PyTables in `pandas.io.pytables` module +- Added WeekOfMonth date offset +- `pandas.rpy` (experimental) module created, provide some interfacing / + conversion between rpy2 and pandas + +Improvements to existing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Unit test coverage: 100% line coverage of core data structures +- Speed enhancement to rolling_{median, max, min} +- Column ordering between DataFrame and DataMatrix is now consistent: before + DataFrame would not respect column order +- Improved {Series, DataFrame}.plot methods to be more flexible (can pass + matplotlib Axis arguments, plot DataFrame columns in multiple subplots, + etc.) + +API Changes +~~~~~~~~~~~ + +- Exponentially-weighted moment functions in `pandas.stats.moments` have a + more consistent API and accept a min_periods argument like their regular + moving counterparts. +- **fillMethod** argument in Series, DataFrame changed to **method**, + `FutureWarning` added. 
+- **fill** method in Series, DataFrame/DataMatrix, WidePanel renamed to + **fillna**, `FutureWarning` added to **fill** +- Renamed **DataFrame.getXS** to **xs**, `FutureWarning` added +- Removed **cap** and **floor** functions from DataFrame, renamed to + **clip_upper** and **clip_lower** for consistency with NumPy + +Bug Fixes +~~~~~~~~~ + +- Fixed bug in IndexableSkiplist Cython code that was breaking rolling_max + function +- Numerous numpy.int64-related indexing fixes +- Several NumPy 1.4.0 NaN-handling fixes +- Bug fixes to pandas.io.parsers.parseCSV +- Fixed `DateRange` caching issue with unusual date offsets +- Fixed bug in `DateRange.union` +- Fixed corner case in `IndexableSkiplist` implementation diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst new file mode 100644 index 00000000..33db9de4 --- /dev/null +++ b/doc/source/remote_data.rst @@ -0,0 +1,256 @@ +.. _remote_data: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import os + import csv + import pandas as pd + + import numpy as np + np.random.seed(123456) + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + + import matplotlib.pyplot as plt + plt.close('all') + + from pandas import * + options.display.max_rows=15 + import pandas.util.testing as tm + +****************** +Remote Data Access +****************** + +.. _remote_data.data_reader: + +Functions from :mod:`pandas.io.data` extract data from various Internet +sources into a DataFrame. Currently the following sources are supported: + + - Yahoo! Finance + - Google Finance + - St. Louis FED (FRED) + - Kenneth French's data library + - World Bank + +It should be noted, that various sources support different kinds of data, so not all sources implement the same methods and the data elements returned might also differ. + +.. _remote_data.yahoo: + +Yahoo! Finance +-------------- + +.. ipython:: python + + import pandas.io.data as web + import datetime + start = datetime.datetime(2010, 1, 1) + end = datetime.datetime(2013, 1, 27) + f=web.DataReader("F", 'yahoo', start, end) + f.ix['2010-01-04'] + +.. _remote_data.yahoo_options: + +Yahoo! Finance Options +---------------------- +***Experimental*** + +The Options class allows the download of options data from Yahoo! Finance. + +The ``get_all_data`` method downloads and caches option data for all expiry months +and provides a formatted ``DataFrame`` with a hierarchical index, so its easy to get +to the specific option you want. + +.. ipython:: python + + from pandas.io.data import Options + aapl = Options('aapl', 'yahoo') + data = aapl.get_all_data() + data.iloc[0:5, 0:5] + + #Show the $100 strike puts at all expiry dates: + data.loc[(100, slice(None), 'put'),:].iloc[0:5, 0:5] + + #Show the volume traded of $100 strike puts at all expiry dates: + data.loc[(100, slice(None), 'put'),'Vol'].head() + +If you don't want to download all the data, more specific requests can be made. + +.. ipython:: python + + import datetime + expiry = datetime.date(2016, 1, 1) + data = aapl.get_call_data(expiry=expiry) + data.iloc[0:5:, 0:5] + +Note that if you call ``get_all_data`` first, this second call will happen much faster, as the data is cached. + + +.. _remote_data.google: + +Google Finance +-------------- + +.. ipython:: python + + import pandas.io.data as web + import datetime + start = datetime.datetime(2010, 1, 1) + end = datetime.datetime(2013, 1, 27) + f=web.DataReader("F", 'google', start, end) + f.ix['2010-01-04'] + +.. _remote_data.fred: + +FRED +---- + +.. 
ipython:: python + + import pandas.io.data as web + import datetime + start = datetime.datetime(2010, 1, 1) + end = datetime.datetime(2013, 1, 27) + gdp=web.DataReader("GDP", "fred", start, end) + gdp.ix['2013-01-01'] + + # Multiple series: + inflation = web.DataReader(["CPIAUCSL", "CPILFESL"], "fred", start, end) + inflation.head() +.. _remote_data.ff: + +Fama/French +----------- + +Dataset names are listed at `Fama/French Data Library +`__. + +.. ipython:: python + + import pandas.io.data as web + ip=web.DataReader("5_Industry_Portfolios", "famafrench") + ip[4].ix[192607] + +.. _remote_data.wb: + +World Bank +---------- + +``pandas`` users can easily access thousands of panel data series from the +`World Bank's World Development Indicators `__ +by using the ``wb`` I/O functions. + +For example, if you wanted to compare the Gross Domestic Products per capita in +constant dollars in North America, you would use the ``search`` function: + +.. code-block:: python + + In [1]: from pandas.io import wb + + In [2]: wb.search('gdp.*capita.*const').iloc[:,:2] + Out[2]: + id name + 3242 GDPPCKD GDP per Capita, constant US$, millions + 5143 NY.GDP.PCAP.KD GDP per capita (constant 2005 US$) + 5145 NY.GDP.PCAP.KN GDP per capita (constant LCU) + 5147 NY.GDP.PCAP.PP.KD GDP per capita, PPP (constant 2005 internation... + +Then you would use the ``download`` function to acquire the data from the World +Bank's servers: + +.. code-block:: python + + In [3]: dat = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008) + + In [4]: print(dat) + NY.GDP.PCAP.KD + country year + Canada 2008 36005.5004978584 + 2007 36182.9138439757 + 2006 35785.9698172849 + 2005 35087.8925933298 + Mexico 2008 8113.10219480083 + 2007 8119.21298908649 + 2006 7961.96818458178 + 2005 7666.69796097264 + United States 2008 43069.5819857208 + 2007 43635.5852068142 + 2006 43228.111147107 + 2005 42516.3934699993 + +The resulting dataset is a properly formatted ``DataFrame`` with a hierarchical +index, so it is easy to apply ``.groupby`` transformations to it: + +.. code-block:: python + + In [6]: dat['NY.GDP.PCAP.KD'].groupby(level=0).mean() + Out[6]: + country + Canada 35765.569188 + Mexico 7965.245332 + United States 43112.417952 + dtype: float64 + +Now imagine you want to compare GDP to the share of people with cellphone +contracts around the world. + +.. code-block:: python + + In [7]: wb.search('cell.*%').iloc[:,:2] + Out[7]: + id name + 3990 IT.CEL.SETS.FE.ZS Mobile cellular telephone users, female (% of ... + 3991 IT.CEL.SETS.MA.ZS Mobile cellular telephone users, male (% of po... + 4027 IT.MOB.COV.ZS Population coverage of mobile cellular telepho... + +Notice that this second search was much faster than the first one because +``pandas`` now has a cached list of available data series. + +.. code-block:: python + + In [13]: ind = ['NY.GDP.PCAP.KD', 'IT.MOB.COV.ZS'] + In [14]: dat = wb.download(indicator=ind, country='all', start=2011, end=2011).dropna() + In [15]: dat.columns = ['gdp', 'cellphone'] + In [16]: print(dat.tail()) + gdp cellphone + country year + Swaziland 2011 2413.952853 94.9 + Tunisia 2011 3687.340170 100.0 + Uganda 2011 405.332501 100.0 + Zambia 2011 767.911290 62.0 + Zimbabwe 2011 419.236086 72.4 + +Finally, we use the ``statsmodels`` package to assess the relationship between +our two variables using ordinary least squares regression. Unsurprisingly, +populations in rich countries tend to use cellphones at a higher rate: + +.. 
code-block:: python + + In [17]: import numpy as np + In [18]: import statsmodels.formula.api as smf + In [19]: mod = smf.ols("cellphone ~ np.log(gdp)", dat).fit() + In [20]: print(mod.summary()) + OLS Regression Results + ============================================================================== + Dep. Variable: cellphone R-squared: 0.297 + Model: OLS Adj. R-squared: 0.274 + Method: Least Squares F-statistic: 13.08 + Date: Thu, 25 Jul 2013 Prob (F-statistic): 0.00105 + Time: 15:24:42 Log-Likelihood: -139.16 + No. Observations: 33 AIC: 282.3 + Df Residuals: 31 BIC: 285.3 + Df Model: 1 + =============================================================================== + coef std err t P>|t| [95.0% Conf. Int.] + ------------------------------------------------------------------------------- + Intercept 16.5110 19.071 0.866 0.393 -22.384 55.406 + np.log(gdp) 9.9333 2.747 3.616 0.001 4.331 15.535 + ============================================================================== + Omnibus: 36.054 Durbin-Watson: 2.071 + Prob(Omnibus): 0.000 Jarque-Bera (JB): 119.133 + Skew: -2.314 Prob(JB): 1.35e-26 + Kurtosis: 11.077 Cond. No. 45.8 + ============================================================================== diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst new file mode 100644 index 00000000..db68c0eb --- /dev/null +++ b/doc/source/reshaping.rst @@ -0,0 +1,458 @@ +.. currentmodule:: pandas +.. _reshaping: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + from pandas.core.reshape import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + from pandas.tools.tile import * + from pandas.compat import zip + +************************** +Reshaping and Pivot Tables +************************** + +Reshaping by pivoting DataFrame objects +--------------------------------------- + +.. ipython:: + :suppress: + + In [1]: import pandas.util.testing as tm; tm.N = 3 + + In [2]: def unpivot(frame): + ...: N, K = frame.shape + ...: data = {'value' : frame.values.ravel('F'), + ...: 'variable' : np.asarray(frame.columns).repeat(N), + ...: 'date' : np.tile(np.asarray(frame.index), K)} + ...: columns = ['date', 'variable', 'value'] + ...: return DataFrame(data, columns=columns) + ...: + + In [3]: df = unpivot(tm.makeTimeDataFrame()) + +Data is often stored in CSV files or databases in so-called "stacked" or +"record" format: + +.. ipython:: python + + df + + +For the curious here is how the above DataFrame was created: + +.. code-block:: python + + import pandas.util.testing as tm; tm.N = 3 + def unpivot(frame): + N, K = frame.shape + data = {'value' : frame.values.ravel('F'), + 'variable' : np.asarray(frame.columns).repeat(N), + 'date' : np.tile(np.asarray(frame.index), K)} + return DataFrame(data, columns=['date', 'variable', 'value']) + df = unpivot(tm.makeTimeDataFrame()) + +To select out everything for variable ``A`` we could do: + +.. ipython:: python + + df[df['variable'] == 'A'] + +But suppose we wish to do time series operations with the variables. A better +representation would be where the ``columns`` are the unique variables and an +``index`` of dates identifies individual observations. To reshape the data into +this form, use the ``pivot`` function: + +.. 
ipython:: python + + df.pivot(index='date', columns='variable', values='value') + +If the ``values`` argument is omitted, and the input DataFrame has more than +one column of values which are not used as column or index inputs to ``pivot``, +then the resulting "pivoted" DataFrame will have :ref:`hierarchical columns +` whose topmost level indicates the respective value +column: + +.. ipython:: python + + df['value2'] = df['value'] * 2 + pivoted = df.pivot('date', 'variable') + pivoted + +You of course can then select subsets from the pivoted DataFrame: + +.. ipython:: python + + pivoted['value2'] + +Note that this returns a view on the underlying data in the case where the data +are homogeneously-typed. + +.. _reshaping.stacking: + +Reshaping by stacking and unstacking +------------------------------------ + +Closely related to the ``pivot`` function are the related ``stack`` and +``unstack`` functions currently available on Series and DataFrame. These +functions are designed to work together with ``MultiIndex`` objects (see the +section on :ref:`hierarchical indexing `). Here are +essentially what these functions do: + + - ``stack``: "pivot" a level of the (possibly hierarchical) column labels, + returning a DataFrame with an index with a new inner-most level of row + labels. + - ``unstack``: inverse operation from ``stack``: "pivot" a level of the + (possibly hierarchical) row index to the column axis, producing a reshaped + DataFrame with a new inner-most level of column labels. + +The clearest way to explain is by example. Let's take a prior example data set +from the hierarchical indexing section: + +.. ipython:: python + + tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']])) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = DataFrame(randn(8, 2), index=index, columns=['A', 'B']) + df2 = df[:4] + df2 + +The ``stack`` function "compresses" a level in the DataFrame's columns to +produce either: + + - A Series, in the case of a simple column Index + - A DataFrame, in the case of a ``MultiIndex`` in the columns + +If the columns have a ``MultiIndex``, you can choose which level to stack. The +stacked level becomes the new lowest level in a ``MultiIndex`` on the columns: + +.. ipython:: python + + stacked = df2.stack() + stacked + +With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the +``index``), the inverse operation of ``stack`` is ``unstack``, which by default +unstacks the **last level**: + +.. ipython:: python + + stacked.unstack() + stacked.unstack(1) + stacked.unstack(0) + +.. _reshaping.unstack_by_name: + +If the indexes have names, you can use the level names instead of specifying +the level numbers: + +.. ipython:: python + + stacked.unstack('second') + +You may also stack or unstack more than one level at a time by passing a list +of levels, in which case the end result is as if each level in the list were +processed individually. + +These functions are intelligent about handling missing data and do not expect +each subgroup within the hierarchical index to have the same set of labels. +They also can handle the index being unsorted (but you can make it sorted by +calling ``sortlevel``, of course). Here is a more complex example: + +.. 
ipython:: python + + columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), + ('B', 'cat'), ('A', 'dog')], + names=['exp', 'animal']) + df = DataFrame(randn(8, 4), index=index, columns=columns) + df2 = df.ix[[0, 1, 2, 4, 5, 7]] + df2 + +As mentioned above, ``stack`` can be called with a ``level`` argument to select +which level in the columns to stack: + +.. ipython:: python + + df2.stack('exp') + df2.stack('animal') + +Unstacking when the columns are a ``MultiIndex`` is also careful about doing +the right thing: + +.. ipython:: python + + df[:3].unstack(0) + df2.unstack(1) + +.. _reshaping.melt: + +Reshaping by Melt +----------------- + +The :func:`~pandas.melt` function is useful to massage a +DataFrame into a format where one or more columns are identifier variables, +while all other columns, considered measured variables, are "unpivoted" to the +row axis, leaving just two non-identifier columns, "variable" and "value". The +names of those columns can be customized by supplying the ``var_name`` and +``value_name`` parameters. + +For instance, + +.. ipython:: python + + cheese = DataFrame({'first' : ['John', 'Mary'], + 'last' : ['Doe', 'Bo'], + 'height' : [5.5, 6.0], + 'weight' : [130, 150]}) + cheese + melt(cheese, id_vars=['first', 'last']) + melt(cheese, id_vars=['first', 'last'], var_name='quantity') + +Another way to transform is to use the ``wide_to_long`` panel data convenience function. + +.. ipython:: python + + dft = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, + "A1980" : {0 : "d", 1 : "e", 2 : "f"}, + "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, + "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, + "X" : dict(zip(range(3), np.random.randn(3))) + }) + dft["id"] = dft.index + dft + pd.wide_to_long(dft, ["A", "B"], i="id", j="year") + +Combining with stats and GroupBy +-------------------------------- + +It should be no shock that combining ``pivot`` / ``stack`` / ``unstack`` with +GroupBy and the basic Series and DataFrame statistical functions can produce +some very expressive and fast data manipulations. + +.. ipython:: python + + df + df.stack().mean(1).unstack() + + # same result, another way + df.groupby(level=1, axis=1).mean() + + df.stack().groupby(level=1).mean() + + df.mean().unstack(0) + + +Pivot tables and cross-tabulations +---------------------------------- + +.. _reshaping.pivot: + +The function ``pandas.pivot_table`` can be used to create spreadsheet-style pivot +tables. See the :ref:`cookbook` for some advanced strategies + +It takes a number of arguments + +- ``data``: A DataFrame object +- ``values``: a column or a list of columns to aggregate +- ``index``: a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. +- ``columns``: a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. +- ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean`` + +Consider a data set like this: + +.. 
ipython:: python + + import datetime + df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6, + 'B' : ['A', 'B', 'C'] * 8, + 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, + 'D' : np.random.randn(24), + 'E' : np.random.randn(24), + 'F' : [datetime.datetime(2013, i, 1) for i in range(1, 13)] + + [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) + df + +We can produce pivot tables from this data very easily: + +.. ipython:: python + + pivot_table(df, values='D', index=['A', 'B'], columns=['C']) + pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) + pivot_table(df, values=['D','E'], index=['B'], columns=['A', 'C'], aggfunc=np.sum) + +The result object is a DataFrame having potentially hierarchical indexes on the +rows and columns. If the ``values`` column name is not given, the pivot table +will include all of the data that can be aggregated in an additional level of +hierarchy in the columns: + +.. ipython:: python + + pivot_table(df, index=['A', 'B'], columns=['C']) + +Also, you can use ``Grouper`` for ``index`` and ``columns`` keywords. For detail of ``Grouper``, see :ref:`Grouping with a Grouper specification `. + +.. ipython:: python + + pivot_table(df, values='D', index=Grouper(freq='M', key='F'), columns='C') + +You can render a nice output of the table omitting the missing values by +calling ``to_string`` if you wish: + +.. ipython:: python + + table = pivot_table(df, index=['A', 'B'], columns=['C']) + print(table.to_string(na_rep='')) + +Note that ``pivot_table`` is also available as an instance method on DataFrame. + +Cross tabulations +~~~~~~~~~~~~~~~~~ + +Use the ``crosstab`` function to compute a cross-tabulation of two (or more) +factors. By default ``crosstab`` computes a frequency table of the factors +unless an array of values and an aggregation function are passed. + +It takes a number of arguments + +- ``index``: array-like, values to group by in the rows +- ``columns``: array-like, values to group by in the columns +- ``values``: array-like, optional, array of values to aggregate according to + the factors +- ``aggfunc``: function, optional, If no values array is passed, computes a + frequency table +- ``rownames``: sequence, default None, must match number of row arrays passed +- ``colnames``: sequence, default None, if passed, must match number of column + arrays passed +- ``margins``: boolean, default False, Add row/column margins (subtotals) + +Any Series passed will have their name attributes used unless row or column +names for the cross-tabulation are specified + +For example: + +.. ipython:: python + + foo, bar, dull, shiny, one, two = 'foo', 'bar', 'dull', 'shiny', 'one', 'two' + a = np.array([foo, foo, bar, bar, foo, foo], dtype=object) + b = np.array([one, one, two, one, two, one], dtype=object) + c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object) + crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + +.. _reshaping.pivot.margins: + +Adding margins (partial aggregates) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you pass ``margins=True`` to ``pivot_table``, special ``All`` columns and +rows will be added with partial group aggregates across the categories on the +rows and columns: + +.. ipython:: python + + df.pivot_table(index=['A', 'B'], columns='C', margins=True, aggfunc=np.std) + +.. _reshaping.tile: + +Tiling +------ + +.. 
_reshaping.tile.cut: + +The ``cut`` function computes groupings for the values of the input array and +is often used to transform continuous variables to discrete or categorical +variables: + +.. ipython:: python + + ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]) + + + cut(ages, bins=3) + +If the ``bins`` keyword is an integer, then equal-width bins are formed. +Alternatively we can specify custom bin-edges: + +.. ipython:: python + + cut(ages, bins=[0, 18, 35, 70]) + + +.. _reshaping.dummies: + +Computing indicator / dummy variables +------------------------------------- + +To convert a categorical variable into a "dummy" or "indicator" DataFrame, for example +a column in a DataFrame (a Series) which has ``k`` distinct values, can derive a DataFrame +containing ``k`` columns of 1s and 0s: + +.. ipython:: python + + df = DataFrame({'key': list('bbacab'), 'data1': range(6)}) + + + get_dummies(df['key']) + +Sometimes it's useful to prefix the column names, for example when merging the result +with the original DataFrame: + +.. ipython:: python + + dummies = get_dummies(df['key'], prefix='key') + dummies + + + df[['data1']].join(dummies) + +This function is often used along with discretization functions like ``cut``: + +.. ipython:: python + + values = randn(10) + values + + + bins = [0, 0.2, 0.4, 0.6, 0.8, 1] + + + get_dummies(cut(values, bins)) + +See also :func:`Series.str.get_dummies `. + +Factorizing values +------------------ + +To encode 1-d values as an enumerated type use ``factorize``: + +.. ipython:: python + + x = pd.Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) + x + labels, uniques = pd.factorize(x) + labels + uniques + +Note that ``factorize`` is similar to ``numpy.unique``, but differs in its +handling of NaN: + +.. note:: + The following ``numpy.unique`` will fail under Python 3 with a ``TypeError`` + because of an ordering bug. See also + `Here `__ + +.. ipython:: python + + pd.factorize(x, sort=True) + np.unique(x, return_inverse=True)[::-1] diff --git a/doc/source/rplot.rst b/doc/source/rplot.rst new file mode 100644 index 00000000..cdecee39 --- /dev/null +++ b/doc/source/rplot.rst @@ -0,0 +1,179 @@ +.. currentmodule:: pandas +.. _rplot: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + options.display.max_rows=15 + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + tips_data = read_csv('data/tips.csv') + iris_data = read_csv('data/iris.data') + from pandas import read_csv + from pandas.tools.plotting import radviz + import pandas.tools.rplot as rplot + plt.close('all') + +************************** +Trellis plotting interface +************************** + +.. note:: + + The tips data set can be downloaded `here + `__. Once you download it execute + + .. code-block:: python + + from pandas import read_csv + tips_data = read_csv('tips.csv') + + from the directory where you downloaded the file. + +We import the rplot API: + +.. ipython:: python + + import pandas.tools.rplot as rplot + +-------- +Examples +-------- + +RPlot is a flexible API for producing Trellis plots. These plots allow you to arrange data in a rectangular grid by values of certain attributes. + +.. 
ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot1_tips.png + plot.render(plt.gcf()) + +In the example above, data from the tips data set is arranged by the attributes 'sex' and 'smoker'. Since both of those attributes can take on one of two values, the resulting grid has two columns and two rows. A histogram is displayed for each cell of the grid. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomDensity()) + + @savefig rplot2_tips.png + plot.render(plt.gcf()) + +Example above is the same as previous except the plot is set to kernel density estimation. This shows how easy it is to have different plots for the same Trellis structure. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomScatter()) + plot.add(rplot.GeomPolyFit(degree=2)) + + @savefig rplot3_tips.png + plot.render(plt.gcf()) + +The plot above shows that it is possible to have two or more plots for the same data displayed on the same Trellis grid cell. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomScatter()) + plot.add(rplot.GeomDensity2D()) + + @savefig rplot4_tips.png + plot.render(plt.gcf()) + +Above is a similar plot but with 2D kernel desnity estimation plot superimposed. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['sex', '.'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot5_tips.png + plot.render(plt.gcf()) + +It is possible to only use one attribute for grouping data. The example above only uses 'sex' attribute. If the second grouping attribute is not specified, the plots will be arranged in a column. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['.', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + @savefig rplot6_tips.png + plot.render(plt.gcf()) + +If the first grouping attribute is not specified the plots will be arranged in a row. + +.. ipython:: python + + plt.figure() + + plot = rplot.RPlot(tips_data, x='total_bill', y='tip') + plot.add(rplot.TrellisGrid(['.', 'smoker'])) + plot.add(rplot.GeomHistogram()) + + plot = rplot.RPlot(tips_data, x='tip', y='total_bill') + plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + plot.add(rplot.GeomPoint(size=80.0, colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'), alpha=1.0)) + + @savefig rplot7_tips.png + plot.render(plt.gcf()) + +As shown above, scatter plots are also possible. Scatter plots allow you to map various data attributes to graphical properties of the plot. In the example above the colour and shape of the scatter plot graphical objects is mapped to 'day' and 'size' attributes respectively. You use scale objects to specify these mappings. The list of scale classes is given below with initialization arguments for quick reference. + +------ +Scales +------ + +:: + + ScaleGradient(column, colour1, colour2) + +This one allows you to map an attribute (specified by parameter column) value to the colour of a graphical object. 
The larger the value of the attribute the closer the colour will be to colour2, the smaller the value, the closer it will be to colour1. + +:: + + ScaleGradient2(column, colour1, colour2, colour3) + +The same as ScaleGradient but interpolates linearly between three colours instead of two. + +:: + + ScaleSize(column, min_size, max_size, transform) + +Map attribute value to size of the graphical object. Parameter min_size (default 5.0) is the minimum size of the graphical object, max_size (default 100.0) is the maximum size and transform is a one argument function that will be used to transform the attribute value (defaults to lambda x: x). + +:: + + ScaleShape(column) + +Map the shape of the object to attribute value. The attribute has to be categorical. + +:: + + ScaleRandomColour(column) + +Assign a random colour to a value of categorical attribute specified by column. diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst new file mode 100644 index 00000000..391aae1c --- /dev/null +++ b/doc/source/sparse.rst @@ -0,0 +1,137 @@ +.. currentmodule:: pandas +.. _sparse: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + options.display.mpl_style='default' + options.display.max_rows = 15 + +********************** +Sparse data structures +********************** + +We have implemented "sparse" versions of Series, DataFrame, and Panel. These +are not sparse in the typical "mostly 0". You can view these objects as being +"compressed" where any data matching a specific value (NaN/missing by default, +though any value can be chosen) is omitted. A special ``SparseIndex`` object +tracks where data has been "sparsified". This will make much more sense in an +example. All of the standard pandas data structures have a ``to_sparse`` +method: + +.. ipython:: python + + ts = Series(randn(10)) + ts[2:-2] = np.nan + sts = ts.to_sparse() + sts + +The ``to_sparse`` method takes a ``kind`` argument (for the sparse index, see +below) and a ``fill_value``. So if we had a mostly zero Series, we could +convert it to sparse with ``fill_value=0``: + +.. ipython:: python + + ts.fillna(0).to_sparse(fill_value=0) + +The sparse objects exist for memory efficiency reasons. Suppose you had a +large, mostly NA DataFrame: + +.. ipython:: python + + df = DataFrame(randn(10000, 4)) + df.ix[:9998] = np.nan + sdf = df.to_sparse() + sdf + sdf.density + +As you can see, the density (% of values that have not been "compressed") is +extremely low. This sparse object takes up much less memory on disk (pickled) +and in the Python interpreter. Functionally, their behavior should be nearly +identical to their dense counterparts. + +Any sparse object can be converted back to the standard dense form by calling +``to_dense``: + +.. ipython:: python + + sts.to_dense() + +.. _sparse.array: + +SparseArray +----------- + +``SparseArray`` is the base layer for all of the sparse indexed data +structures. It is a 1-dimensional ndarray-like object storing only values +distinct from the ``fill_value``: + +.. ipython:: python + + arr = np.random.randn(10) + arr[2:5] = np.nan; arr[7:8] = np.nan + sparr = SparseArray(arr) + sparr + +Like the indexed objects (SparseSeries, SparseDataFrame, SparsePanel), a +``SparseArray`` can be converted back to a regular ndarray by calling +``to_dense``: + +.. 
ipython:: python + + sparr.to_dense() + +.. _sparse.list: + +SparseList +---------- + +``SparseList`` is a list-like data structure for managing a dynamic collection +of SparseArrays. To create one, simply call the ``SparseList`` constructor with +a ``fill_value`` (defaulting to ``NaN``): + +.. ipython:: python + + spl = SparseList() + spl + +The two important methods are ``append`` and ``to_array``. ``append`` can +accept scalar values or any 1-dimensional sequence: + +.. ipython:: python + :suppress: + + from numpy import nan + +.. ipython:: python + + spl.append(np.array([1., nan, nan, 2., 3.])) + spl.append(5) + spl.append(sparr) + spl + +As you can see, all of the contents are stored internally as a list of +memory-efficient ``SparseArray`` objects. Once you've accumulated all of the +data, you can call ``to_array`` to get a single ``SparseArray`` with all the +data: + +.. ipython:: python + + spl.to_array() + +SparseIndex objects +------------------- + +Two kinds of ``SparseIndex`` are implemented, ``block`` and ``integer``. We +recommend using ``block`` as it's more memory efficient. The ``integer`` format +keeps an arrays of all of the locations where the data are not equal to the +fill value. The ``block`` format tracks only the locations and sizes of blocks +of data. diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html new file mode 100644 index 00000000..3fae6ef5 --- /dev/null +++ b/doc/source/themes/nature_with_gtoc/layout.html @@ -0,0 +1,69 @@ +{# + +Subset of agogo theme +agogo/layout.html + +Sphinx layout template for the agogo theme, originally written +by Andi Albrecht. + +:copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. +:license: BSD, see LICENSE for details. +#} +{% extends "basic/layout.html" %} + +{%- block content %} +
+
+
+
+ {%- block sidebar1 %} + {%- block sidebartoc %} +

{{ _('Table Of Contents') }}

+ {{ toctree() }} + {%- endblock %} + {%- block sidebarsearch %} +

{{ _('Search') }}

+ + +

+ {{ _('Enter search terms or a module, class or function name.') }} +

+ +

+ +

+ +

+
+ {%- endblock %} + {# possible location for sidebar #} {% endblock %} + + + {%- block document %} +
+ {%- if render_sidebar %} +
+ {%- endif %} +
+ {% block body %} {% endblock %} +
+ {%- if render_sidebar %} +
+ {%- endif %} +
+ {%- endblock %} + + {%- block sidebar2 %} + + {% endblock %} +
+
+
+
+{%- endblock %} diff --git a/doc/source/themes/nature_with_gtoc/static/nature.css_t b/doc/source/themes/nature_with_gtoc/static/nature.css_t new file mode 100644 index 00000000..61b0e2cc --- /dev/null +++ b/doc/source/themes/nature_with_gtoc/static/nature.css_t @@ -0,0 +1,310 @@ +/* + * nature.css_t + * ~~~~~~~~~~~~ + * + * Sphinx stylesheet -- nature theme. + * + * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +@import url("basic.css"); + +/* -- page layout ----------------------------------------------------------- */ + +body { + font-family: Arial, sans-serif; + font-size: 100%; + background-color: #111; + color: #555; + margin: 0; + padding: 0; +} + + +div.documentwrapper { + width: 100%; +} + +div.bodywrapper { +/* ugly hack, probably not attractive with other font size for re*/ + margin: 0 0 0 {{ theme_sidebarwidth|toint}}px; + min-width: 540px; + max-width: 720px; +} + + +hr { + border: 1px solid #B1B4B6; +} + +div.document { + background-color: #eee; +} + +div.body { + background-color: #ffffff; + color: #3E4349; + padding: 0 30px 30px 30px; + font-size: 0.9em; +} + +div.footer { + color: #555; + width: 100%; + padding: 13px 0; + text-align: center; + font-size: 75%; +} + +div.footer a { + color: #444; + text-decoration: underline; +} + +div.related { + background-color: #6BA81E; + line-height: 32px; + color: #fff; + text-shadow: 0px 1px 0 #444; + font-size: 0.9em; +} + +div.related a { + color: #E2F3CC; +} + +div.sphinxsidebar { + font-size: 0.75em; + line-height: 1.5em; + width: {{ theme_sidebarwidth|toint }}px; + margin: 0 ; + float: left; + + background-color: #eee; +} +/* +div.sphinxsidebarwrapper{ + padding: 20px 0; +} +*/ +div.sphinxsidebar h3, +div.sphinxsidebar h4 { + font-family: Arial, sans-serif; + color: #222; + font-size: 1.2em; + font-weight: normal; + margin: 20px 0 0 0; + padding: 5px 10px; + background-color: #ddd; + text-shadow: 1px 1px 0 white +} + +div.sphinxsidebar h4{ + font-size: 1.1em; +} + +div.sphinxsidebar h3 a { + color: #444; +} + + +div.sphinxsidebar p { + color: #888; +/* padding: 5px 20px;*/ +} + +div.sphinxsidebar p.searchtip { + color: #888; + padding: 5px 20px; +} + + +div.sphinxsidebar p.topless { +} + +div.sphinxsidebar ul { + margin: 10px 20px; + padding: 0; + color: #000; +} + +div.sphinxsidebar a { + color: #444; +} + +div.sphinxsidebar input { + border: 1px solid #ccc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar input[type=text]{ + margin-left: 20px; +} + +/* -- body styles ----------------------------------------------------------- */ + +a { + color: #005B81; + text-decoration: none; +} + +a:hover { + color: #E32E00; + text-decoration: underline; +} + +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + font-family: Arial, sans-serif; + background-color: #BED4EB; + font-weight: normal; + color: #212224; + margin: 30px 0px 10px 0px; + padding: 5px 0 5px 10px; + text-shadow: 0px 1px 0 white +} + +div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 200%; } +div.body h2 { font-size: 150%; background-color: #C8D5E3; } +div.body h3 { font-size: 120%; background-color: #D8DEE3; } +div.body h4 { font-size: 110%; background-color: #D8DEE3; } +div.body h5 { font-size: 100%; background-color: #D8DEE3; } +div.body h6 { font-size: 100%; background-color: #D8DEE3; } + +p.rubric { + border-bottom: 1px solid rgb(201, 201, 201); +} + +a.headerlink { + color: #c60f0f; + font-size: 0.8em; + padding: 0 4px 0 
4px; + text-decoration: none; +} + +a.headerlink:hover { + background-color: #c60f0f; + color: white; +} + +div.body p, div.body dd, div.body li { + line-height: 1.5em; +} + +div.admonition p.admonition-title + p { + display: inline; +} + +div.highlight{ + background-color: white; +} + +div.note { + background-color: #eee; + border: 1px solid #ccc; +} + +div.seealso { + background-color: #ffc; + border: 1px solid #ff6; +} + +div.topic { + background-color: #eee; +} + +div.warning { + background-color: #ffe4e4; + border: 1px solid #f66; +} + +p.admonition-title { + display: inline; +} + +p.admonition-title:after { + content: ":"; +} + +pre { + padding: 10px; + background-color: rgb(250,250,250); + color: #222; + line-height: 1.2em; + border: 1px solid rgb(201,201,201); + font-size: 1.1em; + margin: 1.5em 0 1.5em 0; + -webkit-box-shadow: 1px 1px 1px #d8d8d8; + -moz-box-shadow: 1px 1px 1px #d8d8d8; +} + +tt { + background-color: #ecf0f3; + color: #222; + /* padding: 1px 2px; */ + font-size: 1.1em; + font-family: monospace; +} + +.viewcode-back { + font-family: Arial, sans-serif; +} + +div.viewcode-block:target { + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + + +/** + * Styling for field lists + */ + + /* grey highlighting of 'parameter' and 'returns' field */ +table.field-list { + border-collapse: separate; + border-spacing: 10px; + margin-left: 1px; + /* border-left: 5px solid rgb(238, 238, 238) !important; */ +} + +table.field-list th.field-name { + /* display: inline-block; */ + padding: 1px 8px 1px 5px; + white-space: nowrap; + background-color: rgb(238, 238, 238); +} + +/* italic font for parameter types */ +table.field-list td.field-body > p { + font-style: italic; +} + +table.field-list td.field-body > p > strong { + font-style: normal; +} + +/* reduced space around parameter description */ +td.field-body blockquote { + border-left: none; + margin: 0em 0em 0.3em; + padding-left: 30px; +} + + +/** + * See also + */ + +div.seealso dd { + margin-top: 0; + margin-bottom: 0; +} diff --git a/doc/source/themes/nature_with_gtoc/theme.conf b/doc/source/themes/nature_with_gtoc/theme.conf new file mode 100644 index 00000000..1cc40044 --- /dev/null +++ b/doc/source/themes/nature_with_gtoc/theme.conf @@ -0,0 +1,4 @@ +[theme] +inherit = basic +stylesheet = nature.css +pygments_style = tango diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst new file mode 100644 index 00000000..76bc796b --- /dev/null +++ b/doc/source/timeseries.rst @@ -0,0 +1,1611 @@ +.. currentmodule:: pandas +.. _timeseries: + +.. ipython:: python + :suppress: + + from datetime import datetime + import numpy as np + np.random.seed(123456) + from pandas import * + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + options.display.max_rows=15 + import dateutil + import pytz + from dateutil.relativedelta import relativedelta + from pandas.tseries.api import * + from pandas.tseries.offsets import * + +******************************** +Time Series / Date functionality +******************************** + +pandas has proven very successful as a tool for working with time series data, +especially in the financial data analysis space. With the 0.8 release, we have +further improved the time series API in pandas by leaps and bounds. 
Using the +new NumPy ``datetime64`` dtype, we have consolidated a large number of features +from other Python libraries like ``scikits.timeseries`` as well as created +a tremendous amount of new functionality for manipulating time series data. + +In working with time series data, we will frequently seek to: + + - generate sequences of fixed-frequency dates and time spans + - conform or convert time series to a particular frequency + - compute "relative" dates based on various non-standard time increments + (e.g. 5 business days before the last business day of the year), or "roll" + dates forward or backward + +pandas provides a relatively compact and self-contained set of tools for +performing the above tasks. + +Create a range of dates: + +.. ipython:: python + + # 72 hours starting with midnight Jan 1st, 2011 + rng = date_range('1/1/2011', periods=72, freq='H') + rng[:5] + +Index pandas objects with dates: + +.. ipython:: python + + ts = Series(randn(len(rng)), index=rng) + ts.head() + +Change frequency and fill gaps: + +.. ipython:: python + + # to 45 minute frequency and forward fill + converted = ts.asfreq('45Min', method='pad') + converted.head() + +Resample: + +.. ipython:: python + + # Daily means + ts.resample('D', how='mean') + + +.. _timeseries.representation: + +Time Stamps vs. Time Spans +-------------------------- + +Time-stamped data is the most basic type of timeseries data that associates +values with points in time. For pandas objects it means using the points in +time to create the index + +.. ipython:: python + + dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + ts = Series(np.random.randn(3), dates) + + type(ts.index) + + ts + +However, in many cases it is more natural to associate things like change +variables with a time span instead. + +For example: + +.. ipython:: python + + periods = PeriodIndex([Period('2012-01'), Period('2012-02'), + Period('2012-03')]) + + ts = Series(np.random.randn(3), periods) + + type(ts.index) + + ts + +Starting with 0.8, pandas allows you to capture both representations and +convert between them. Under the hood, pandas represents timestamps using +instances of ``Timestamp`` and sequences of timestamps using instances of +``DatetimeIndex``. For regular time spans, pandas uses ``Period`` objects for +scalar values and ``PeriodIndex`` for sequences of spans. Better support for +irregular intervals with arbitrary start and end points are forth-coming in +future releases. + + +.. _timeseries.converting: + +Converting to Timestamps +------------------------ + +To convert a Series or list-like object of date-like objects e.g. strings, +epochs, or a mixture, you can use the ``to_datetime`` function. When passed +a Series, this returns a Series (with the same index), while a list-like +is converted to a DatetimeIndex: + +.. ipython:: python + + to_datetime(Series(['Jul 31, 2009', '2010-01-10', None])) + + to_datetime(['2005/11/23', '2010.12.31']) + +If you use dates which start with the day first (i.e. European style), +you can pass the ``dayfirst`` flag: + +.. ipython:: python + + to_datetime(['04-01-2012 10:00'], dayfirst=True) + + to_datetime(['14-01-2012', '01-14-2012'], dayfirst=True) + +.. warning:: + + You see in the above example that ``dayfirst`` isn't strict, so if a date + can't be parsed with the day being first it will be parsed as if + ``dayfirst`` were False. + +.. 
note:: + Specifying a ``format`` argument will potentially speed up the conversion + considerably and on versions later then 0.13.0 explicitly specifying + a format string of '%Y%m%d' takes a faster path still. + + +Invalid Data +~~~~~~~~~~~~ + +Pass ``coerce=True`` to convert invalid data to ``NaT`` (not a time): + +.. ipython:: python + + to_datetime(['2009-07-31', 'asd']) + + to_datetime(['2009-07-31', 'asd'], coerce=True) + + +Take care, ``to_datetime`` may not act as you expect on mixed data: + +.. ipython:: python + + to_datetime([1, '1']) + +Epoch Timestamps +~~~~~~~~~~~~~~~~ + +It's also possible to convert integer or float epoch times. The default unit +for these is nanoseconds (since these are how Timestamps are stored). However, +often epochs are stored in another ``unit`` which can be specified: + +Typical epoch stored units + +.. ipython:: python + + to_datetime([1349720105, 1349806505, 1349892905, + 1349979305, 1350065705], unit='s') + + to_datetime([1349720105100, 1349720105200, 1349720105300, + 1349720105400, 1349720105500 ], unit='ms') + +These *work*, but the results may be unexpected. + +.. ipython:: python + + to_datetime([1]) + + to_datetime([1, 3.14], unit='s') + +.. note:: + + Epoch times will be rounded to the nearest nanosecond. + +.. _timeseries.daterange: + +Generating Ranges of Timestamps +------------------------------- + +To generate an index with time stamps, you can use either the DatetimeIndex or +Index constructor and pass in a list of datetime objects: + +.. ipython:: python + + dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + index = DatetimeIndex(dates) + index # Note the frequency information + + index = Index(dates) + index # Automatically converted to DatetimeIndex + +Practically, this becomes very cumbersome because we often need a very long +index with a large number of timestamps. If we need timestamps on a regular +frequency, we can use the pandas functions ``date_range`` and ``bdate_range`` +to create timestamp indexes. + +.. ipython:: python + + index = date_range('2000-1-1', periods=1000, freq='M') + index + + index = bdate_range('2012-1-1', periods=250) + index + +Convenience functions like ``date_range`` and ``bdate_range`` utilize a +variety of frequency aliases. The default frequency for ``date_range`` is a +**calendar day** while the default for ``bdate_range`` is a **business day** + +.. ipython:: python + + start = datetime(2011, 1, 1) + end = datetime(2012, 1, 1) + + rng = date_range(start, end) + rng + + rng = bdate_range(start, end) + rng + +``date_range`` and ``bdate_range`` makes it easy to generate a range of dates +using various combinations of parameters like ``start``, ``end``, +``periods``, and ``freq``: + +.. ipython:: python + + date_range(start, end, freq='BM') + + date_range(start, end, freq='W') + + bdate_range(end=end, periods=20) + + bdate_range(start=start, periods=20) + +The start and end dates are strictly inclusive. So it will not generate any +dates outside of those dates if specified. + +.. _timeseries.datetimeindex: + +DatetimeIndex +------------- + +One of the main uses for ``DatetimeIndex`` is as an index for pandas objects. 
+The ``DatetimeIndex`` class contains many timeseries related optimizations: + + - A large range of dates for various offsets are pre-computed and cached + under the hood in order to make generating subsequent date ranges very fast + (just have to grab a slice) + - Fast shifting using the ``shift`` and ``tshift`` method on pandas objects + - Unioning of overlapping DatetimeIndex objects with the same frequency is + very fast (important for fast data alignment) + - Quick access to date fields via properties such as ``year``, ``month``, etc. + - Regularization functions like ``snap`` and very fast ``asof`` logic + +DatetimeIndex objects has all the basic functionality of regular Index objects +and a smorgasbord of advanced timeseries-specific methods for easy frequency +processing. + +.. seealso:: + :ref:`Reindexing methods ` + +.. note:: + + While pandas does not force you to have a sorted date index, some of these + methods may have unexpected or incorrect behavior if the dates are + unsorted. So please be careful. + +``DatetimeIndex`` can be used like a regular index and offers all of its +intelligent functionality like selection, slicing, etc. + +.. ipython:: python + + rng = date_range(start, end, freq='BM') + ts = Series(randn(len(rng)), index=rng) + ts.index + ts[:5].index + ts[::2].index + +.. _timeseries.partialindexing: + +DatetimeIndex Partial String Indexing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can pass in dates and strings that parse to dates as indexing parameters: + +.. ipython:: python + + ts['1/31/2011'] + + ts[datetime(2011, 12, 25):] + + ts['10/31/2011':'12/31/2011'] + +To provide convenience for accessing longer time series, you can also pass in +the year or year and month as strings: + +.. ipython:: python + + ts['2011'] + + ts['2011-6'] + +This type of slicing will work on a DataFrame with a ``DateTimeIndex`` as well. Since the +partial string selection is a form of label slicing, the endpoints **will be** included. This +would include matching times on an included date. Here's an example: + +.. ipython:: python + + dft = DataFrame(randn(100000,1),columns=['A'],index=date_range('20130101',periods=100000,freq='T')) + dft + dft['2013'] + +This starts on the very first time in the month, and includes the last date & time for the month + +.. ipython:: python + + dft['2013-1':'2013-2'] + +This specifies a stop time **that includes all of the times on the last day** + +.. ipython:: python + + dft['2013-1':'2013-2-28'] + +This specifies an **exact** stop time (and is not the same as the above) + +.. ipython:: python + + dft['2013-1':'2013-2-28 00:00:00'] + +We are stopping on the included end-point as its part of the index + +.. ipython:: python + + dft['2013-1-15':'2013-1-15 12:30:00'] + +.. warning:: + + The following selection will raise a ``KeyError``; otherwise this selection methodology + would be inconsistent with other selection methods in pandas (as this is not a *slice*, nor does it + resolve to one) + + .. code-block:: python + + dft['2013-1-15 12:30:00'] + + To select a single row, use ``.loc`` + + .. ipython:: python + + dft.loc['2013-1-15 12:30:00'] + + +Datetime Indexing +~~~~~~~~~~~~~~~~~ + +Indexing a ``DateTimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the frequency of the index. In contrast, indexing with datetime objects is exact, because the objects have exact meaning. These also follow the sematics of *including both endpoints*. 
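+
+For instance, using the minute-frequency ``dft`` frame defined above (an
+illustrative sketch, not part of the original example set), a string at
+daily resolution is less precise than the index frequency and therefore
+selects every row falling within that calendar day:
+
+.. code-block:: python
+
+   # the partial string covers the whole of January 15th,
+   # so all of that day's minute-level rows are returned
+   dft['2013-1-15']
+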
+ +These ``datetime`` objects are specific ``hours, minutes,`` and ``seconds`` even though they were not explicity specified (they are ``0``). + +.. ipython:: python + + dft[datetime(2013, 1, 1):datetime(2013,2,28)] + +With no defaults. + +.. ipython:: python + + dft[datetime(2013, 1, 1, 10, 12, 0):datetime(2013, 2, 28, 10, 12, 0)] + + +Truncating & Fancy Indexing +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A ``truncate`` convenience function is provided that is equivalent to slicing: + +.. ipython:: python + + ts.truncate(before='10/31/2011', after='12/31/2011') + +Even complicated fancy indexing that breaks the DatetimeIndex's frequency +regularity will result in a ``DatetimeIndex`` (but frequency is lost): + +.. ipython:: python + + ts[[0, 2, 6]].index + +.. _timeseries.offsets: + +Time/Date Components +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are several time/date properties that one can access from ``Timestamp`` or a collection of timestamps like a ``DateTimeIndex``. + +.. csv-table:: + :header: "Property", "Description" + :widths: 15, 65 + + year, "The year of the datetime" + month,"The month of the datetime" + day,"The days of the datetime" + hour,"The hour of the datetime" + minute,"The minutes of the datetime" + second,"The seconds of the datetime" + microsecond,"The microseconds of the datetime" + nanosecond,"The nanoseconds of the datetime" + date,"Returns datetime.date" + time,"Returns datetime.time" + dayofyear,"The ordinal day of year" + weekofyear,"The week ordinal of the year" + week,"The week ordinal of the year" + dayofweek,"The day of the week with Monday=0, Sunday=6" + weekday,"The day of the week with Monday=0, Sunday=6" + quarter,"Quarter of the date: Jan=Mar = 1, Apr-Jun = 2, etc." + is_month_start,"Logical indicating if first day of month (defined by frequency)" + is_month_end,"Logical indicating if last day of month (defined by frequency)" + is_quarter_start,"Logical indicating if first day of quarter (defined by frequency)" + is_quarter_end,"Logical indicating if last day of quarter (defined by frequency)" + is_year_start,"Logical indicating if first day of year (defined by frequency)" + is_year_end,"Logical indicating if last day of year (defined by frequency)" + + +DateOffset objects +------------------ + +In the preceding examples, we created DatetimeIndex objects at various +frequencies by passing in frequency strings like 'M', 'W', and 'BM to the +``freq`` keyword. Under the hood, these frequency strings are being translated +into an instance of pandas ``DateOffset``, which represents a regular +frequency increment. Specific offset logic like "month", "business day", or +"one hour" is represented in its various subclasses. + +.. 
csv-table:: + :header: "Class name", "Description" + :widths: 15, 65 + + DateOffset, "Generic offset class, defaults to 1 calendar day" + BDay, "business day (weekday)" + CDay, "custom business day (experimental)" + Week, "one week, optionally anchored on a day of the week" + WeekOfMonth, "the x-th day of the y-th week of each month" + LastWeekOfMonth, "the x-th day of the last week of each month" + MonthEnd, "calendar month end" + MonthBegin, "calendar month begin" + BMonthEnd, "business month end" + BMonthBegin, "business month begin" + CBMonthEnd, "custom business month end" + CBMonthBegin, "custom business month begin" + QuarterEnd, "calendar quarter end" + QuarterBegin, "calendar quarter begin" + BQuarterEnd, "business quarter end" + BQuarterBegin, "business quarter begin" + FY5253Quarter, "retail (aka 52-53 week) quarter" + YearEnd, "calendar year end" + YearBegin, "calendar year begin" + BYearEnd, "business year end" + BYearBegin, "business year begin" + FY5253, "retail (aka 52-53 week) year" + Hour, "one hour" + Minute, "one minute" + Second, "one second" + Milli, "one millisecond" + Micro, "one microsecond" + + +The basic ``DateOffset`` takes the same arguments as +``dateutil.relativedelta``, which works like: + +.. ipython:: python + + d = datetime(2008, 8, 18, 9, 0) + d + relativedelta(months=4, days=5) + +We could have done the same thing with ``DateOffset``: + +.. ipython:: python + + from pandas.tseries.offsets import * + d + DateOffset(months=4, days=5) + +The key features of a ``DateOffset`` object are: + + - it can be added / subtracted to/from a datetime object to obtain a + shifted date + - it can be multiplied by an integer (positive or negative) so that the + increment will be applied multiple times + - it has ``rollforward`` and ``rollback`` methods for moving a date forward + or backward to the next or previous "offset date" + +Subclasses of ``DateOffset`` define the ``apply`` function which dictates +custom date increment logic, such as adding business days: + +.. code-block:: python + + class BDay(DateOffset): + """DateOffset increments between business days""" + def apply(self, other): + ... + +.. ipython:: python + + d - 5 * BDay() + d + BMonthEnd() + +The ``rollforward`` and ``rollback`` methods do exactly what you would expect: + +.. ipython:: python + + d + offset = BMonthEnd() + offset.rollforward(d) + offset.rollback(d) + +It's definitely worth exploring the ``pandas.tseries.offsets`` module and the +various docstrings for the classes. + +These operations (``apply``, ``rollforward`` and ``rollback``) preserves time (hour, minute, etc) information by default. To reset time, use ``normalize=True`` keyword when create offset instance. If ``normalize=True``, result is normalized after the function is applied. + + + .. ipython:: python + + day = Day() + day.apply(Timestamp('2014-01-01 09:00')) + + day = Day(normalize=True) + day.apply(Timestamp('2014-01-01 09:00')) + + hour = Hour() + hour.apply(Timestamp('2014-01-01 22:00')) + + hour = Hour(normalize=True) + hour.apply(Timestamp('2014-01-01 22:00')) + hour.apply(Timestamp('2014-01-01 23:00')) + + +Parametric offsets +~~~~~~~~~~~~~~~~~~ + +Some of the offsets can be "parameterized" when created to result in different +behavior. For example, the ``Week`` offset for generating weekly data accepts a +``weekday`` parameter which results in the generated dates always lying on a +particular day of the week: + +.. 
ipython:: python + + d + d + Week() + d + Week(weekday=4) + (d + Week(weekday=4)).weekday() + + d - Week() + +``normalize`` option will be effective for addition and subtraction. + +.. ipython:: python + + d + Week(normalize=True) + d - Week(normalize=True) + + +Another example is parameterizing ``YearEnd`` with the specific ending month: + +.. ipython:: python + + d + YearEnd() + d + YearEnd(month=6) + +.. _timeseries.alias: + +Custom Business Days (Experimental) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``CDay`` or ``CustomBusinessDay`` class provides a parametric +``BusinessDay`` class which can be used to create customized business day +calendars which account for local holidays and local weekend conventions. + +.. ipython:: python + + from pandas.tseries.offsets import CustomBusinessDay + # As an interesting example, let's look at Egypt where + # a Friday-Saturday weekend is observed. + weekmask_egypt = 'Sun Mon Tue Wed Thu' + # They also observe International Workers' Day so let's + # add that for a couple of years + holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] + bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) + dt = datetime(2013, 4, 30) + dt + 2 * bday_egypt + dts = date_range(dt, periods=5, freq=bday_egypt) + Series(dts.weekday, dts).map(Series('Mon Tue Wed Thu Fri Sat Sun'.split())) + +As of v0.14 holiday calendars can be used to provide the list of holidays. See the +:ref:`holiday calendar` section for more information. + +.. ipython:: python + + from pandas.tseries.holiday import USFederalHolidayCalendar + bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar()) + # Friday before MLK Day + dt = datetime(2014, 1, 17) + # Tuesday after MLK Day (Monday is skipped because it's a holiday) + dt + bday_us + +Monthly offsets that respect a certain holiday calendar can be defined +in the usual way. + +.. ipython:: python + + from pandas.tseries.offsets import CustomBusinessMonthBegin + bmth_us = CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar()) + # Skip new years + dt = datetime(2013, 12, 17) + dt + bmth_us + + # Define date index with custom offset + from pandas import DatetimeIndex + DatetimeIndex(start='20100101',end='20120101',freq=bmth_us) + +.. note:: + + The frequency string 'C' is used to indicate that a CustomBusinessDay + DateOffset is used, it is important to note that since CustomBusinessDay is + a parameterised type, instances of CustomBusinessDay may differ and this is + not detectable from the 'C' frequency string. The user therefore needs to + ensure that the 'C' frequency string is used consistently within the user's + application. + + +.. note:: + + This uses the ``numpy.busdaycalendar`` API introduced in Numpy 1.7 and + therefore requires Numpy 1.7.0 or newer. + +.. warning:: + + There are known problems with the timezone handling in Numpy 1.7 and users + should therefore use this **experimental(!)** feature with caution and at + their own risk. + + To the extent that the ``datetime64`` and ``busdaycalendar`` APIs in Numpy + have to change to fix the timezone issues, the behaviour of the + ``CustomBusinessDay`` class may have to change in future versions. + +Offset Aliases +~~~~~~~~~~~~~~ + +A number of string aliases are given to useful common time series +frequencies. We will refer to these aliases as *offset aliases* +(referred to as *time rules* prior to v0.8.0). + +.. 
csv-table:: + :header: "Alias", "Description" + :widths: 15, 100 + + "B", "business day frequency" + "C", "custom business day frequency (experimental)" + "D", "calendar day frequency" + "W", "weekly frequency" + "M", "month end frequency" + "BM", "business month end frequency" + "CBM", "custom business month end frequency" + "MS", "month start frequency" + "BMS", "business month start frequency" + "CBMS", "custom business month start frequency" + "Q", "quarter end frequency" + "BQ", "business quarter endfrequency" + "QS", "quarter start frequency" + "BQS", "business quarter start frequency" + "A", "year end frequency" + "BA", "business year end frequency" + "AS", "year start frequency" + "BAS", "business year start frequency" + "H", "hourly frequency" + "T", "minutely frequency" + "S", "secondly frequency" + "L", "milliseonds" + "U", "microseconds" + +Combining Aliases +~~~~~~~~~~~~~~~~~ + +As we have seen previously, the alias and the offset instance are fungible in +most functions: + +.. ipython:: python + + date_range(start, periods=5, freq='B') + + date_range(start, periods=5, freq=BDay()) + +You can combine together day and intraday offsets: + +.. ipython:: python + + date_range(start, periods=10, freq='2h20min') + + date_range(start, periods=10, freq='1D10U') + +Anchored Offsets +~~~~~~~~~~~~~~~~ + +For some frequencies you can specify an anchoring suffix: + +.. csv-table:: + :header: "Alias", "Description" + :widths: 15, 100 + + "W\-SUN", "weekly frequency (sundays). Same as 'W'" + "W\-MON", "weekly frequency (mondays)" + "W\-TUE", "weekly frequency (tuesdays)" + "W\-WED", "weekly frequency (wednesdays)" + "W\-THU", "weekly frequency (thursdays)" + "W\-FRI", "weekly frequency (fridays)" + "W\-SAT", "weekly frequency (saturdays)" + "(B)Q(S)\-DEC", "quarterly frequency, year ends in December. Same as 'Q'" + "(B)Q(S)\-JAN", "quarterly frequency, year ends in January" + "(B)Q(S)\-FEB", "quarterly frequency, year ends in February" + "(B)Q(S)\-MAR", "quarterly frequency, year ends in March" + "(B)Q(S)\-APR", "quarterly frequency, year ends in April" + "(B)Q(S)\-MAY", "quarterly frequency, year ends in May" + "(B)Q(S)\-JUN", "quarterly frequency, year ends in June" + "(B)Q(S)\-JUL", "quarterly frequency, year ends in July" + "(B)Q(S)\-AUG", "quarterly frequency, year ends in August" + "(B)Q(S)\-SEP", "quarterly frequency, year ends in September" + "(B)Q(S)\-OCT", "quarterly frequency, year ends in October" + "(B)Q(S)\-NOV", "quarterly frequency, year ends in November" + "(B)A(S)\-DEC", "annual frequency, anchored end of December. Same as 'A'" + "(B)A(S)\-JAN", "annual frequency, anchored end of January" + "(B)A(S)\-FEB", "annual frequency, anchored end of February" + "(B)A(S)\-MAR", "annual frequency, anchored end of March" + "(B)A(S)\-APR", "annual frequency, anchored end of April" + "(B)A(S)\-MAY", "annual frequency, anchored end of May" + "(B)A(S)\-JUN", "annual frequency, anchored end of June" + "(B)A(S)\-JUL", "annual frequency, anchored end of July" + "(B)A(S)\-AUG", "annual frequency, anchored end of August" + "(B)A(S)\-SEP", "annual frequency, anchored end of September" + "(B)A(S)\-OCT", "annual frequency, anchored end of October" + "(B)A(S)\-NOV", "annual frequency, anchored end of November" + +These can be used as arguments to ``date_range``, ``bdate_range``, constructors +for ``DatetimeIndex``, as well as various other timeseries-related functions +in pandas. + +Legacy Aliases +~~~~~~~~~~~~~~ +Note that prior to v0.8.0, time rules had a slightly different look. 
pandas +will continue to support the legacy time rules for the time being but it is +strongly recommended that you switch to using the new offset aliases. + +.. csv-table:: + :header: "Legacy Time Rule", "Offset Alias" + :widths: 15, 65 + + "WEEKDAY", "B" + "EOM", "BM" + "W\@MON", "W\-MON" + "W\@TUE", "W\-TUE" + "W\@WED", "W\-WED" + "W\@THU", "W\-THU" + "W\@FRI", "W\-FRI" + "W\@SAT", "W\-SAT" + "W\@SUN", "W\-SUN" + "Q\@JAN", "BQ\-JAN" + "Q\@FEB", "BQ\-FEB" + "Q\@MAR", "BQ\-MAR" + "A\@JAN", "BA\-JAN" + "A\@FEB", "BA\-FEB" + "A\@MAR", "BA\-MAR" + "A\@APR", "BA\-APR" + "A\@MAY", "BA\-MAY" + "A\@JUN", "BA\-JUN" + "A\@JUL", "BA\-JUL" + "A\@AUG", "BA\-AUG" + "A\@SEP", "BA\-SEP" + "A\@OCT", "BA\-OCT" + "A\@NOV", "BA\-NOV" + "A\@DEC", "BA\-DEC" + "min", "T" + "ms", "L" + "us", "U" + +As you can see, legacy quarterly and annual frequencies are business quarter +and business year ends. Please also note the legacy time rule for milliseconds +``ms`` versus the new offset alias for month start ``MS``. This means that +offset alias parsing is case sensitive. + +.. _timeseries.holiday: + +Holidays / Holiday Calendars +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Holidays and calendars provide a simple way to define holiday rules to be used +with ``CustomBusinessDay`` or in other analysis that requires a predefined +set of holidays. The ``AbstractHolidayCalendar`` class provides all the necessary +methods to return a list of holidays and only ``rules`` need to be defined +in a specific holiday calendar class. Further, ``start_date`` and ``end_date`` +class attributes determine over what date range holidays are generated. These +should be overwritten on the ``AbstractHolidayCalendar`` class to have the range +apply to all calendar subclasses. ``USFederalHolidayCalendar`` is the +only calendar that exists and primarily serves as an example for developing +other calendars. + +For holidays that occur on fixed dates (e.g., US Memorial Day or July 4th) an +observance rule determines when that holiday is observed if it falls on a weekend +or some other non-observed day. Defined observance rules are: + +.. csv-table:: + :header: "Rule", "Description" + :widths: 15, 70 + + "nearest_workday", "move Saturday to Friday and Sunday to Monday" + "sunday_to_monday", "move Sunday to following Monday" + "next_monday_or_tuesday", "move Saturday to Monday and Sunday/Monday to Tuesday" + "previous_friday", move Saturday and Sunday to previous Friday" + "next_monday", "move Saturday and Sunday to following Monday" + +An example of how holidays and holiday calendars are defined: + +.. ipython:: python + + from pandas.tseries.holiday import Holiday, USMemorialDay,\ + AbstractHolidayCalendar, nearest_workday, MO + class ExampleCalendar(AbstractHolidayCalendar): + rules = [ + USMemorialDay, + Holiday('July 4th', month=7, day=4, observance=nearest_workday), + Holiday('Columbus Day', month=10, day=1, + offset=DateOffset(weekday=MO(2))), #same as 2*Week(weekday=2) + ] + cal = ExampleCalendar() + cal.holidays(datetime(2012, 1, 1), datetime(2012, 12, 31)) + +Using this calendar, creating an index or doing offset arithmetic skips weekends +and holidays (i.e., Memorial Day/July 4th). + +.. 
ipython:: python + + DatetimeIndex(start='7/1/2012', end='7/10/2012', + freq=CDay(calendar=cal)).to_pydatetime() + offset = CustomBusinessDay(calendar=cal) + datetime(2012, 5, 25) + offset + datetime(2012, 7, 3) + offset + datetime(2012, 7, 3) + 2 * offset + datetime(2012, 7, 6) + offset + +Ranges are defined by the ``start_date`` and ``end_date`` class attributes +of ``AbstractHolidayCalendar``. The defaults are below. + +.. ipython:: python + + AbstractHolidayCalendar.start_date + AbstractHolidayCalendar.end_date + +These dates can be overwritten by setting the attributes as +datetime/Timestamp/string. + +.. ipython:: python + + AbstractHolidayCalendar.start_date = datetime(2012, 1, 1) + AbstractHolidayCalendar.end_date = datetime(2012, 12, 31) + cal.holidays() + +Every calendar class is accessible by name using the ``get_calendar`` function +which returns a holiday class instance. Any imported calendar class will +automatically be available by this function. Also, ``HolidayCalendarFactory`` +provides an easy interface to create calendars that are combinations of calendars +or calendars with additional rules. + +.. ipython:: python + + from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory,\ + USLaborDay + cal = get_calendar('ExampleCalendar') + cal.rules + new_cal = HolidayCalendarFactory('NewExampleCalendar', cal, USLaborDay) + new_cal.rules + +.. _timeseries.advanced_datetime: + +Time series-related instance methods +------------------------------------ + +Shifting / lagging +~~~~~~~~~~~~~~~~~~ + +One may want to *shift* or *lag* the values in a TimeSeries back and forward in +time. The method for this is ``shift``, which is available on all of the pandas +objects. In DataFrame, ``shift`` will currently only shift along the ``index`` +and in Panel along the ``major_axis``. + +.. ipython:: python + + ts = ts[:5] + ts.shift(1) + +The shift method accepts an ``freq`` argument which can accept a +``DateOffset`` class or other ``timedelta``-like object or also a :ref:`offset alias `: + +.. ipython:: python + + ts.shift(5, freq=datetools.bday) + ts.shift(5, freq='BM') + +Rather than changing the alignment of the data and the index, ``DataFrame`` and +``TimeSeries`` objects also have a ``tshift`` convenience method that changes +all the dates in the index by a specified number of offsets: + +.. ipython:: python + + ts.tshift(5, freq='D') + +Note that with ``tshift``, the leading entry is no longer NaN because the data +is not being realigned. + +Frequency conversion +~~~~~~~~~~~~~~~~~~~~ + +The primary function for changing frequencies is the ``asfreq`` function. +For a ``DatetimeIndex``, this is basically just a thin, but convenient wrapper +around ``reindex`` which generates a ``date_range`` and calls ``reindex``. + +.. ipython:: python + + dr = date_range('1/1/2010', periods=3, freq=3 * datetools.bday) + ts = Series(randn(3), index=dr) + ts + ts.asfreq(BDay()) + +``asfreq`` provides a further convenience so you can specify an interpolation +method for any gaps that may appear after the frequency conversion + +.. ipython:: python + + ts.asfreq(BDay(), method='pad') + +Filling forward / backward +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Related to ``asfreq`` and ``reindex`` is the ``fillna`` function documented in +the :ref:`missing data section `. + +Converting to Python datetimes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``DatetimeIndex`` can be converted to an array of Python native datetime.datetime objects using the +``to_pydatetime`` method. + +.. 
_timeseries.resampling: + +Up- and downsampling +-------------------- + +With 0.8, pandas introduces simple, powerful, and efficient functionality for +performing resampling operations during frequency conversion (e.g., converting +secondly data into 5-minutely data). This is extremely common in, but not +limited to, financial applications. + +See some :ref:`cookbook examples ` for some advanced strategies + +.. ipython:: python + + rng = date_range('1/1/2012', periods=100, freq='S') + + ts = Series(randint(0, 500, len(rng)), index=rng) + + ts.resample('5Min', how='sum') + +The ``resample`` function is very flexible and allows you to specify many +different parameters to control the frequency conversion and resampling +operation. + +The ``how`` parameter can be a function name or numpy array function that takes +an array and produces aggregated values: + +.. ipython:: python + + ts.resample('5Min') # default is mean + + ts.resample('5Min', how='ohlc') + + ts.resample('5Min', how=np.max) + +Any function available via :ref:`dispatching ` can be given to +the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``sem``, +``max``, ``min``, ``median``, ``first``, ``last``, ``ohlc``. + +For downsampling, ``closed`` can be set to 'left' or 'right' to specify which +end of the interval is closed: + +.. ipython:: python + + ts.resample('5Min', closed='right') + + ts.resample('5Min', closed='left') + +For upsampling, the ``fill_method`` and ``limit`` parameters can be specified +to interpolate over the gaps that are created: + +.. ipython:: python + + # from secondly to every 250 milliseconds + + ts[:2].resample('250L') + + ts[:2].resample('250L', fill_method='pad') + + ts[:2].resample('250L', fill_method='pad', limit=2) + +Parameters like ``label`` and ``loffset`` are used to manipulate the resulting +labels. ``label`` specifies whether the result is labeled with the beginning or +the end of the interval. ``loffset`` performs a time adjustment on the output +labels. + +.. ipython:: python + + ts.resample('5Min') # by default label='right' + + ts.resample('5Min', label='left') + + ts.resample('5Min', label='left', loffset='1s') + +The ``axis`` parameter can be set to 0 or 1 and allows you to resample the +specified axis for a DataFrame. + +``kind`` can be set to 'timestamp' or 'period' to convert the resulting index +to/from time-stamp and time-span representations. By default ``resample`` +retains the input representation. + +``convention`` can be set to 'start' or 'end' when resampling period data +(detail below). It specifies how low frequency periods are converted to higher +frequency periods. + +Note that 0.8 marks a watershed in the timeseries functionality in pandas. In +previous versions, resampling had to be done using a combination of +``date_range``, ``groupby`` with ``asof``, and then calling an aggregation +function on the grouped object. This was not nearly convenient or performant as +the new pandas timeseries API. + +.. _timeseries.periods: + +Time Span Representation +------------------------ + +Regular intervals of time are represented by ``Period`` objects in pandas while +sequences of ``Period`` objects are collected in a ``PeriodIndex``, which can +be created with the convenience function ``period_range``. + +Period +~~~~~~ +A ``Period`` represents a span of time (e.g., a day, a month, a quarter, etc). +It can be created using a frequency alias: + +.. 
ipython:: python + + Period('2012', freq='A-DEC') + + Period('2012-1-1', freq='D') + + Period('2012-1-1 19:00', freq='H') + +Unlike time stamped data, pandas does not support frequencies at multiples of +DateOffsets (e.g., '3Min') for periods. + +Adding and subtracting integers from periods shifts the period by its own +frequency. + +.. ipython:: python + + p = Period('2012', freq='A-DEC') + + p + 1 + + p - 3 + +Taking the difference of ``Period`` instances with the same frequency will +return the number of frequency units between them: + +.. ipython:: python + + Period('2012', freq='A-DEC') - Period('2002', freq='A-DEC') + +PeriodIndex and period_range +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Regular sequences of ``Period`` objects can be collected in a ``PeriodIndex``, +which can be constructed using the ``period_range`` convenience function: + +.. ipython:: python + + prng = period_range('1/1/2011', '1/1/2012', freq='M') + prng + +The ``PeriodIndex`` constructor can also be used directly: + +.. ipython:: python + + PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + +Just like ``DatetimeIndex``, a ``PeriodIndex`` can also be used to index pandas +objects: + +.. ipython:: python + + ps = Series(randn(len(prng)), prng) + ps + +PeriodIndex Partial String Indexing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can pass in dates and strings to `Series` and `DataFrame` with `PeriodIndex`, as the same manner as `DatetimeIndex`. For details, refer to :ref:`DatetimeIndex Partial String Indexing `. + +.. ipython:: python + + ps['2011-01'] + + ps[datetime(2011, 12, 25):] + + ps['10/31/2011':'12/31/2011'] + +Passing string represents lower frequency than `PeriodIndex` returns partial sliced data. + +.. ipython:: python + + ps['2011'] + + dfp = DataFrame(randn(600,1), columns=['A'], + index=period_range('2013-01-01 9:00', periods=600, freq='T')) + dfp + dfp['2013-01-01 10H'] + +As the same as `DatetimeIndex`, the endpoints will be included in the result. Below example slices data starting from 10:00 to 11:59. + +.. ipython:: python + + dfp['2013-01-01 10H':'2013-01-01 11H'] + +Frequency Conversion and Resampling with PeriodIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The frequency of Periods and PeriodIndex can be converted via the ``asfreq`` +method. Let's start with the fiscal year 2011, ending in December: + +.. ipython:: python + + p = Period('2011', freq='A-DEC') + p + +We can convert it to a monthly frequency. Using the ``how`` parameter, we can +specify whether to return the starting or ending month: + +.. ipython:: python + + p.asfreq('M', how='start') + + p.asfreq('M', how='end') + +The shorthands 's' and 'e' are provided for convenience: + +.. ipython:: python + + p.asfreq('M', 's') + p.asfreq('M', 'e') + +Converting to a "super-period" (e.g., annual frequency is a super-period of +quarterly frequency) automatically returns the super-period that includes the +input period: + +.. ipython:: python + + p = Period('2011-12', freq='M') + + p.asfreq('A-NOV') + +Note that since we converted to an annual frequency that ends the year in +November, the monthly period of December 2011 is actually in the 2012 A-NOV +period. + +.. _timeseries.quarterly: + +Period conversions with anchored frequencies are particularly useful for +working with various quarterly data common to economics, business, and other +fields. Many organizations define quarters relative to the month in which their +fiscal year start and ends. Thus, first quarter of 2011 could start in 2010 or +a few months into 2011. 
Via anchored frequencies, pandas works all quarterly +frequencies ``Q-JAN`` through ``Q-DEC``. + +``Q-DEC`` define regular calendar quarters: + +.. ipython:: python + + p = Period('2012Q1', freq='Q-DEC') + + p.asfreq('D', 's') + + p.asfreq('D', 'e') + +``Q-MAR`` defines fiscal year end in March: + +.. ipython:: python + + p = Period('2011Q4', freq='Q-MAR') + + p.asfreq('D', 's') + + p.asfreq('D', 'e') + +.. _timeseries.interchange: + +Converting between Representations +---------------------------------- + +Timestamped data can be converted to PeriodIndex-ed data using ``to_period`` +and vice-versa using ``to_timestamp``: + +.. ipython:: python + + rng = date_range('1/1/2012', periods=5, freq='M') + + ts = Series(randn(len(rng)), index=rng) + + ts + + ps = ts.to_period() + + ps + + ps.to_timestamp() + +Remember that 's' and 'e' can be used to return the timestamps at the start or +end of the period: + +.. ipython:: python + + ps.to_timestamp('D', how='s') + +Converting between period and timestamp enables some convenient arithmetic +functions to be used. In the following example, we convert a quarterly +frequency with year ending in November to 9am of the end of the month following +the quarter end: + +.. ipython:: python + + prng = period_range('1990Q1', '2000Q4', freq='Q-NOV') + + ts = Series(randn(len(prng)), prng) + + ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + + ts.head() + +.. _timeseries.timezone: + +Time Zone Handling +------------------ + +Pandas provides rich support for working with timestamps in different time zones using ``pytz`` and ``dateutil`` libraries. +``dateutil`` support is new [in 0.14.1] and currently only supported for fixed offset and tzfile zones. The default library is ``pytz``. +Support for ``dateutil`` is provided for compatibility with other applications e.g. if you use ``dateutil`` in other python packages. + +By default, pandas objects are time zone unaware: + +.. ipython:: python + + rng = date_range('3/6/2012 00:00', periods=15, freq='D') + rng.tz is None + +To supply the time zone, you can use the ``tz`` keyword to ``date_range`` and +other functions. Dateutil time zone strings are distinguished from ``pytz`` +time zones by starting with ``dateutil/``. + +- In ``pytz`` you can find a list of common (and less common) time zones using + ``from pytz import common_timezones, all_timezones``. +- ``dateutil`` uses the OS timezones so there isn't a fixed list available. For + common zones, the names are the same as ``pytz``. + +.. ipython:: python + + # pytz + rng_pytz = date_range('3/6/2012 00:00', periods=10, freq='D', + tz='Europe/London') + rng_pytz.tz + + # dateutil + rng_dateutil = date_range('3/6/2012 00:00', periods=10, freq='D', + tz='dateutil/Europe/London') + rng_dateutil.tz + + # dateutil - utc special case + rng_utc = date_range('3/6/2012 00:00', periods=10, freq='D', + tz=dateutil.tz.tzutc()) + rng_utc.tz + +Note that the ``UTC`` timezone is a special case in ``dateutil`` and should be constructed explicitly +as an instance of ``dateutil.tz.tzutc``. You can also construct other timezones explicitly first, +which gives you more control over which time zone is used: + +.. 
ipython:: python + + # pytz + tz_pytz = pytz.timezone('Europe/London') + rng_pytz = date_range('3/6/2012 00:00', periods=10, freq='D', + tz=tz_pytz) + rng_pytz.tz == tz_pytz + + # dateutil + tz_dateutil = dateutil.tz.gettz('Europe/London') + rng_dateutil = date_range('3/6/2012 00:00', periods=10, freq='D', + tz=tz_dateutil) + rng_dateutil.tz == tz_dateutil + +Timestamps, like Python's ``datetime.datetime`` object can be either time zone +naive or time zone aware. Naive time series and DatetimeIndex objects can be +*localized* using ``tz_localize``: + +.. ipython:: python + + ts = Series(randn(len(rng)), rng) + + ts_utc = ts.tz_localize('UTC') + ts_utc + +Again, you can explicitly construct the timezone object first. +You can use the ``tz_convert`` method to convert pandas objects to convert +tz-aware data to another time zone: + +.. ipython:: python + + ts_utc.tz_convert('US/Eastern') + +.. warning:: + + Be wary of conversions between libraries. For some zones ``pytz`` and ``dateutil`` have different + definitions of the zone. This is more of a problem for unusual timezones than for + 'standard' zones like ``US/Eastern``. + +.. warning:: + + Be aware that a timezone definition across versions of timezone libraries may not + be considered equal. This may cause problems when working with stored data that + is localized using one version and operated on with a different version. + See :ref:`here` for how to handle such a situation. + +Under the hood, all timestamps are stored in UTC. Scalar values from a +``DatetimeIndex`` with a time zone will have their fields (day, hour, minute) +localized to the time zone. However, timestamps with the same UTC value are +still considered to be equal even if they are in different time zones: + +.. ipython:: python + + rng_eastern = rng_utc.tz_convert('US/Eastern') + rng_berlin = rng_utc.tz_convert('Europe/Berlin') + + rng_eastern[5] + rng_berlin[5] + rng_eastern[5] == rng_berlin[5] + +Like Series, DataFrame, and DatetimeIndex, Timestamps can be converted to other +time zones using ``tz_convert``: + +.. ipython:: python + + rng_eastern[5] + rng_berlin[5] + rng_eastern[5].tz_convert('Europe/Berlin') + +Localization of Timestamps functions just like DatetimeIndex and TimeSeries: + +.. ipython:: python + + rng[5] + rng[5].tz_localize('Asia/Shanghai') + + +Operations between TimeSeries in different time zones will yield UTC +TimeSeries, aligning the data on the UTC timestamps: + +.. ipython:: python + + eastern = ts_utc.tz_convert('US/Eastern') + berlin = ts_utc.tz_convert('Europe/Berlin') + result = eastern + berlin + result + result.index + +In some cases, localize cannot determine the DST and non-DST hours when there are +duplicates. This often happens when reading files that simply duplicate the hours. +The infer_dst argument in tz_localize will attempt +to determine the right offset. + +.. ipython:: python + :okexcept: + + rng_hourly = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', + '11/06/2011 01:00', '11/06/2011 02:00', + '11/06/2011 03:00']) + rng_hourly.tz_localize('US/Eastern') + rng_hourly_eastern = rng_hourly.tz_localize('US/Eastern', infer_dst=True) + rng_hourly_eastern.values + +.. _timeseries.timedeltas: + +Time Deltas +----------- + +Timedeltas are differences in times, expressed in difference units, e.g. days,hours,minutes,seconds. +They can be both positive and negative. :ref:`DateOffsets` that are absolute in nature +(``Day, Hour, Minute, Second, Milli, Micro, Nano``) can be used as ``timedeltas``. + +.. 
ipython:: python + + from datetime import datetime, timedelta + s = Series(date_range('2012-1-1', periods=3, freq='D')) + td = Series([ timedelta(days=i) for i in range(3) ]) + df = DataFrame(dict(A = s, B = td)) + df + df['C'] = df['A'] + df['B'] + df + df.dtypes + + s - s.max() + s - datetime(2011,1,1,3,5) + s + timedelta(minutes=5) + s + Minute(5) + s + Minute(5) + Milli(5) + +Getting scalar results from a ``timedelta64[ns]`` series + +.. ipython:: python + + y = s - s[0] + y + +Series of timedeltas with ``NaT`` values are supported + +.. ipython:: python + + y = s - s.shift() + y + +Elements can be set to ``NaT`` using ``np.nan`` analagously to datetimes + +.. ipython:: python + + y[1] = np.nan + y + +Operands can also appear in a reversed order (a singular object operated with a Series) + +.. ipython:: python + + s.max() - s + datetime(2011,1,1,3,5) - s + timedelta(minutes=5) + s + +Some timedelta numeric like operations are supported. + +.. ipython:: python + + td - timedelta(minutes=5, seconds=5, microseconds=5) + +``min, max`` and the corresponding ``idxmin, idxmax`` operations are supported on frames + +.. ipython:: python + + A = s - Timestamp('20120101') - timedelta(minutes=5, seconds=5) + B = s - Series(date_range('2012-1-2', periods=3, freq='D')) + + df = DataFrame(dict(A=A, B=B)) + df + + df.min() + df.min(axis=1) + + df.idxmin() + df.idxmax() + +``min, max`` operations are supported on series; these return a single element +``timedelta64[ns]`` Series (this avoids having to deal with numpy timedelta64 +issues). ``idxmin, idxmax`` are supported as well. + +.. ipython:: python + + df.min().max() + df.min(axis=1).min() + + df.min().idxmax() + df.min(axis=1).idxmin() + +You can fillna on timedeltas. Integers will be interpreted as seconds. You can +pass a timedelta to get a particular value. + +.. ipython:: python + + y.fillna(0) + y.fillna(10) + y.fillna(timedelta(days=-1,seconds=5)) + +.. _timeseries.timedeltas_reductions: + +Time Deltas & Reductions +------------------------ + +.. warning:: + + A numeric reduction operation for ``timedelta64[ns]`` can return a single-element ``Series`` of + dtype ``timedelta64[ns]``. + +You can do numeric reduction operations on timedeltas. + +.. ipython:: python + + y2 = y.fillna(timedelta(days=-1,seconds=5)) + y2 + y2.mean() + y2.quantile(.1) + +.. _timeseries.timedeltas_convert: + +Time Deltas & Conversions +------------------------- + +.. versionadded:: 0.13 + +**string/integer conversion** + +Using the top-level ``to_timedelta``, you can convert a scalar or array from the standard +timedelta format (produced by ``to_csv``) into a timedelta type (``np.timedelta64`` in ``nanoseconds``). +It can also construct Series. + +.. warning:: + + This requires ``numpy >= 1.7`` + +.. ipython:: python + + to_timedelta('1 days 06:05:01.00003') + to_timedelta('15.5us') + to_timedelta(['1 days 06:05:01.00003','15.5us','nan']) + to_timedelta(np.arange(5),unit='s') + to_timedelta(np.arange(5),unit='d') + +**frequency conversion** + +Timedeltas can be converted to other 'frequencies' by dividing by another timedelta, +or by astyping to a specific timedelta type. These operations yield ``float64`` dtyped Series. + +.. 
ipython:: python + + td = Series(date_range('20130101',periods=4))-Series(date_range('20121201',periods=4)) + td[2] += np.timedelta64(timedelta(minutes=5,seconds=3)) + td[3] = np.nan + td + + # to days + td / np.timedelta64(1,'D') + td.astype('timedelta64[D]') + + # to seconds + td / np.timedelta64(1,'s') + td.astype('timedelta64[s]') + +Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series +yields another ``timedelta64[ns]`` dtypes Series. + +.. ipython:: python + + td * -1 + td * Series([1,2,3,4]) + +Numpy < 1.7 Compatibility +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Numpy < 1.7 has a broken ``timedelta64`` type that does not correctly work +for arithmetic. pandas bypasses this, but for frequency conversion as above, +you need to create the divisor yourself. The ``np.timetimedelta64`` type only +has 1 argument, the number of **micro** seconds. + +The following are equivalent statements in the two versions of numpy. + +.. code-block:: python + + from distutils.version import LooseVersion + if LooseVersion(np.__version__) <= '1.6.2': + y / np.timedelta(86400*int(1e6)) + y / np.timedelta(int(1e6)) + else: + y / np.timedelta64(1,'D') + y / np.timedelta64(1,'s') diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst new file mode 100644 index 00000000..421304bb --- /dev/null +++ b/doc/source/tutorials.rst @@ -0,0 +1,126 @@ +.. _tutorials: + +********* +Tutorials +********* + +This is a guide to many pandas tutorials, geared mainly for new users. + +Internal Guides +--------------- + +pandas own :ref:`10 Minutes to pandas<10min>` + +More complex recipes are in the :ref:`Cookbook` + +pandas Cookbook +--------------- + +The goal of this cookbook (by `Julia Evans `_) is to +give you some concrete examples for getting started with pandas. These +are examples with real-world data, and all the bugs and weirdness that +that entails. + +Here are links to the v0.1 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub +repository `_. To run the examples in this tutorial, you'll need to +clone the GitHub repository and get IPython Notebook running. +See `How to use this cookbook `_. + +- `A quick tour of the IPython Notebook: `_ + Shows off IPython's awesome tab completion and magic functions. +- `Chapter 1: `_ + Reading your data into pandas is pretty much the easiest thing. Even + when the encoding is wrong! +- `Chapter 2: `_ + It's not totally obvious how to select data from a pandas dataframe. + Here we explain the basics (how to take slices and get columns) +- `Chapter 3: `_ + Here we get into serious slicing and dicing and learn how to filter + dataframes in complicated ways, really fast. +- `Chapter 4: `_ + Groupby/aggregate is seriously my favorite thing about pandas + and I use it all the time. You should probably read this. +- `Chapter 5: `_ + Here you get to find out if it's cold in Montreal in the winter + (spoiler: yes). Web scraping with pandas is fun! Here we combine dataframes. +- `Chapter 6: `_ + Strings with pandas are great. It has all these vectorized string + operations and they're the best. We will turn a bunch of strings + containing "Snow" into vectors of numbers in a trice. +- `Chapter 7: `_ + Cleaning up messy data is never a joy, but with pandas it's easier. +- `Chapter 8: `_ + Parsing Unix timestamps is confusing at first but it turns out + to be really easy. + + +Lessons for New pandas Users +---------------------------- + +For more resources, please visit the main `repository `_. 
+ +- `01 - Lesson: `_ + - Importing libraries + - Creating data sets + - Creating data frames + - Reading from CSV + - Exporting to CSV + - Finding maximums + - Plotting data + +- `02 - Lesson: `_ + - Reading from TXT + - Exporting to TXT + - Selecting top/bottom records + - Descriptive statistics + - Grouping/sorting data + +- `03 - Lesson: `_ + - Creating functions + - Reading from EXCEL + - Exporting to EXCEL + - Outliers + - Lambda functions + - Slice and dice data + +- `04 - Lesson: `_ + - Adding/deleting columns + - Index operations + +- `05 - Lesson: `_ + - Stack/Unstack/Transpose functions + +- `06 - Lesson: `_ + - GroupBy function + +- `07 - Lesson: `_ + - Ways to calculate outliers + +- `08 - Lesson: `_ + - Read from Microsoft SQL databases + +- `09 - Lesson: `_ + - Export to CSV/EXCEL/TXT + +- `10 - Lesson: `_ + - Converting between different kinds of formats + +- `11 - Lesson: `_ + - Combining data from various sources + + +Excel charts with pandas, vincent and xlsxwriter +------------------------------------------------ + +- `Using Pandas and XlsxWriter to create Excel charts `_ + +Various Tutorials +----------------- + +- `Wes McKinney's (pandas BDFL) blog `_ +- `Statistical analysis made easy in Python with SciPy and pandas DataFrames, by Randal Olson `_ +- `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ +- `Financial analysis in python, by Thomas Wiecki `_ +- `Intro to pandas data structures, by Greg Reda `_ +- `Pandas and Python: Top 10, by Manish Amde `_ +- `Pandas Tutorial, by Mikhail Semeniuk `_ diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt new file mode 100644 index 00000000..93ab3b91 --- /dev/null +++ b/doc/source/v0.10.0.txt @@ -0,0 +1,356 @@ +.. _whatsnew_0100: + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +v0.10.0 (December 17, 2012) +--------------------------- + +This is a major release from 0.9.1 and includes many new features and +enhancements along with a large number of bug fixes. There are also a number of +important API changes that long-time pandas users should pay close attention +to. + +File parsing new features +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The delimited file parsing engine (the guts of ``read_csv`` and ``read_table``) +has been rewritten from the ground up and now uses a fraction the amount of +memory while parsing, while being 40% or more faster in most use cases (in some +cases much faster). + +There are also many new features: + +- Much-improved Unicode handling via the ``encoding`` option. +- Column filtering (``usecols``) +- Dtype specification (``dtype`` argument) +- Ability to specify strings to be recognized as True/False +- Ability to yield NumPy record arrays (``as_recarray``) +- High performance ``delim_whitespace`` option +- Decimal format (e.g. European format) specification +- Easier CSV dialect options: ``escapechar``, ``lineterminator``, + ``quotechar``, etc. +- More robust handling of many exceptional kinds of files observed in the wild + +API changes +~~~~~~~~~~~ + +**Deprecated DataFrame BINOP TimeSeries special case behavior** + +The default behavior of binary operations between a DataFrame and a Series has +always been to align on the DataFrame's columns and broadcast down the rows, +**except** in the special case that the DataFrame contains time series. 
Since +there are now method for each binary operator enabling you to specify how you +want to broadcast, we are phasing out this special case (Zen of Python: +*Special cases aren't special enough to break the rules*). Here's what I'm +talking about: + +.. ipython:: python + + import pandas as pd + df = pd.DataFrame(np.random.randn(6, 4), + index=pd.date_range('1/1/2000', periods=6)) + df + # deprecated now + df - df[0] + # Change your code to + df.sub(df[0], axis=0) # align on axis 0 (rows) + +You will get a deprecation warning in the 0.10.x series, and the deprecated +functionality will be removed in 0.11 or later. + +**Altered resample default behavior** + +The default time series ``resample`` binning behavior of daily ``D`` and +*higher* frequencies has been changed to ``closed='left', label='left'``. Lower +nfrequencies are unaffected. The prior defaults were causing a great deal of +confusion for users, especially resampling data to daily frequency (which +labeled the aggregated group with the end of the interval: the next day). + +Note: + +.. ipython:: python + + dates = pd.date_range('1/1/2000', '1/5/2000', freq='4h') + series = Series(np.arange(len(dates)), index=dates) + series + series.resample('D', how='sum') + # old behavior + series.resample('D', how='sum', closed='right', label='right') + +- Infinity and negative infinity are no longer treated as NA by ``isnull`` and + ``notnull``. That they every were was a relic of early pandas. This behavior + can be re-enabled globally by the ``mode.use_inf_as_null`` option: + +.. ipython:: python + + s = pd.Series([1.5, np.inf, 3.4, -np.inf]) + pd.isnull(s) + s.fillna(0) + pd.set_option('use_inf_as_null', True) + pd.isnull(s) + s.fillna(0) + pd.reset_option('use_inf_as_null') + +- Methods with the ``inplace`` option now all return ``None`` instead of the + calling object. E.g. code written like ``df = df.fillna(0, inplace=True)`` + may stop working. To fix, simply delete the unnecessary variable assignment. + +- ``pandas.merge`` no longer sorts the group keys (``sort=False``) by + default. This was done for performance reasons: the group-key sorting is + often one of the more expensive parts of the computation and is often + unnecessary. + +- The default column names for a file with no header have been changed to the + integers ``0`` through ``N - 1``. This is to create consistency with the + DataFrame constructor with no columns specified. The v0.9.0 behavior (names + ``X0``, ``X1``, ...) can be reproduced by specifying ``prefix='X'``: + +.. ipython:: python + + data= 'a,b,c\n1,Yes,2\n3,No,4' + print(data) + pd.read_csv(StringIO(data), header=None) + pd.read_csv(StringIO(data), header=None, prefix='X') + +- Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, + though this can be controlled by new ``true_values`` and ``false_values`` + arguments: + +.. ipython:: python + + print(data) + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) + +- The file parsers will not recognize non-string values arising from a + converter function as NA if passed in the ``na_values`` argument. It's better + to do post-processing using the ``replace`` function instead. + +- Calling ``fillna`` on Series or DataFrame with no arguments is no longer + valid code. You must either specify a fill value or an interpolation method: + +.. 
ipython:: python + + s = Series([np.nan, 1., 2., np.nan, 4]) + s + s.fillna(0) + s.fillna(method='pad') + +Convenience methods ``ffill`` and ``bfill`` have been added: + +.. ipython:: python + + s.ffill() + + +- ``Series.apply`` will now operate on a returned value from the applied + function, that is itself a series, and possibly upcast the result to a + DataFrame + + .. ipython:: python + + def f(x): + return Series([ x, x**2 ], index = ['x', 'x^2']) + + s = Series(np.random.rand(5)) + s + s.apply(f) + +- New API functions for working with pandas options (:issue:`2097`): + + - ``get_option`` / ``set_option`` - get/set the value of an option. Partial + names are accepted. - ``reset_option`` - reset one or more options to + their default value. Partial names are accepted. - ``describe_option`` - + print a description of one or more options. When called with no + arguments. print all registered options. + + Note: ``set_printoptions``/ ``reset_printoptions`` are now deprecated (but + functioning), the print options now live under "display.XYZ". For example: + + .. ipython:: python + + get_option("display.max_rows") + +- to_string() methods now always return unicode strings (:issue:`2224`). + +New features +~~~~~~~~~~~~ + +Wide DataFrame Printing +~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of printing the summary information, pandas now splits the string +representation across multiple rows by default: + +.. ipython:: python + + wide_frame = DataFrame(randn(5, 16)) + + wide_frame + +The old behavior of printing out summary information can be achieved via the +'expand_frame_repr' print option: + +.. ipython:: python + + pd.set_option('expand_frame_repr', False) + + wide_frame + +.. ipython:: python + :suppress: + + pd.reset_option('expand_frame_repr') + +The width of each line can be changed via 'line_width' (80 by default): + +.. ipython:: python + + pd.set_option('line_width', 40) + + wide_frame + +.. ipython:: python + :suppress: + + pd.reset_option('line_width') + + +Updated PyTables Support +~~~~~~~~~~~~~~~~~~~~~~~~ + +:ref:`Docs ` for PyTables ``Table`` format & several enhancements to the api. Here is a taste of what to expect. + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +.. ipython:: python + + store = HDFStore('store.h5') + df = DataFrame(randn(8, 3), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C']) + df + + # appending data frames + df1 = df[0:4] + df2 = df[4:] + store.append('df', df1) + store.append('df', df2) + store + + # selecting the entire store + store.select('df') + +.. ipython:: python + + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + wp + + # storing a panel + store.append('wp',wp) + + # selecting via A QUERY + store.select('wp', + [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) + + # removing data from tables + store.remove('wp', Term('major_axis>20000103')) + store.select('wp') + + # deleting a store + del store['df'] + store + +**Enhancements** + +- added ability to hierarchical keys + + .. ipython:: python + + store.put('foo/bar/bah', df) + store.append('food/orange', df) + store.append('food/apple', df) + store + + # remove all nodes under this level + store.remove('food') + store + +- added mixed-dtype support! + + .. 
ipython:: python + + df['string'] = 'string' + df['int'] = 1 + store.append('df',df) + df1 = store.select('df') + df1 + df1.get_dtype_counts() + +- performance improvments on table writing +- support for arbitrarily indexed dimensions +- ``SparseSeries`` now has a ``density`` property (:issue:`2384`) +- enable ``Series.str.strip/lstrip/rstrip`` methods to take an input argument + to strip arbitrary characters (:issue:`2411`) +- implement ``value_vars`` in ``melt`` to limit values to certain columns + and add ``melt`` to pandas namespace (:issue:`2412`) + +**Bug Fixes** + +- added ``Term`` method of specifying where conditions (:issue:`1996`). +- ``del store['df']`` now call ``store.remove('df')`` for store deletion +- deleting of consecutive rows is much faster than before +- ``min_itemsize`` parameter can be specified in table creation to force a + minimum size for indexing columns (the previous implementation would set the + column size based on the first append) +- indexing support via ``create_table_index`` (requires PyTables >= 2.3) + (:issue:`698`). +- appending on a store would fail if the table was not first created via ``put`` +- fixed issue with missing attributes after loading a pickled dataframe (GH2431) +- minor change to select and remove: require a table ONLY if where is also + provided (and not None) + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') + +**Compatibility** + +0.10 of ``HDFStore`` is backwards compatible for reading tables created in a prior version of pandas, +however, query terms using the prior (undocumented) methodology are unsupported. You must read in the entire +file and write it out using the new format to take advantage of the updates. + +N Dimensional Panels (Experimental) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Adding experimental support for Panel4D and factory functions to create n-dimensional named panels. +:ref:`Docs ` for NDim. Here is a taste of what to expect. + + .. ipython:: python + + p4d = Panel4D(randn(2, 2, 5, 4), + labels=['Label1','Label2'], + items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + p4d + + + + + +See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list. + diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt new file mode 100644 index 00000000..b61d3193 --- /dev/null +++ b/doc/source/v0.10.1.txt @@ -0,0 +1,211 @@ +.. _whatsnew_0101: + +v0.10.1 (January 22, 2013) +--------------------------- + +This is a minor release from 0.10.0 and includes new features, enhancements, +and bug fixes. In particular, there is substantial new HDFStore functionality +contributed by Jeff Reback. + +An undesired API breakage with functions taking the ``inplace`` option has been +reverted and deprecation warnings added. + +API changes +~~~~~~~~~~~ + +- Functions taking an ``inplace`` option return the calling object as before. 
A + deprecation message has been added +- Groupby aggregations Max/Min no longer exclude non-numeric data (:issue:`2700`) +- Resampling an empty DataFrame now returns an empty DataFrame instead of + raising an exception (:issue:`2640`) +- The file reader will now raise an exception when NA values are found in an + explicitly specified integer column instead of converting the column to float + (:issue:`2631`) +- DatetimeIndex.unique now returns a DatetimeIndex with the same name and +- timezone instead of an array (:issue:`2563`) + +New features +~~~~~~~~~~~~ + +- MySQL support for database (contribution from Dan Allan) + +HDFStore +~~~~~~~~ + +You may need to upgrade your existing data files. Please visit the +**compatibility** section in the main docs. + + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +You can designate (and index) certain columns that you want to be able to +perform queries on a table, by passing a list to ``data_columns`` + +.. ipython:: python + + store = HDFStore('store.h5') + df = DataFrame(randn(8, 3), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C']) + df['string'] = 'foo' + df.ix[4:6,'string'] = np.nan + df.ix[7:9,'string'] = 'bar' + df['string2'] = 'cool' + df + + # on-disk operations + store.append('df', df, data_columns = ['B','C','string','string2']) + store.select('df',[ 'B > 0', 'string == foo' ]) + + # this is in-memory version of this type of selection + df[(df.B > 0) & (df.string == 'foo')] + +Retrieving unique values in an indexable or data column. + +.. code-block:: python + + # note that this is deprecated as of 0.14.0 + # can be replicated by: store.select_column('df','index').unique() + store.unique('df','index') + store.unique('df','string') + +You can now store ``datetime64`` in data columns + +.. ipython:: python + + df_mixed = df.copy() + df_mixed['datetime64'] = Timestamp('20010102') + df_mixed.ix[3:4,['A','B']] = np.nan + + store.append('df_mixed', df_mixed) + df_mixed1 = store.select('df_mixed') + df_mixed1 + df_mixed1.get_dtype_counts() + +You can pass ``columns`` keyword to select to filter a list of the return +columns, this is equivalent to passing a +``Term('columns',list_of_columns_to_filter)`` + +.. ipython:: python + + store.select('df',columns = ['A','B']) + +``HDFStore`` now serializes multi-index dataframes when appending tables. + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + df + + store.append('mi',df) + store.select('mi') + + # the levels are automatically included as data columns + store.select('mi', Term('foo=bar')) + +Multi-table creation via ``append_to_multiple`` and selection via +``select_as_multiple`` can create/select from multiple tables and return a +combined result, by using ``where`` on a selector table. + +.. ipython:: python + + df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt['foo'] = 'bar' + + # you can also create the tables individually + store.append_to_multiple({ 'df1_mt' : ['A','B'], 'df2_mt' : None }, df_mt, selector = 'df1_mt') + store + + # indiviual tables were created + store.select('df1_mt') + store.select('df2_mt') + + # as a multiple + store.select_as_multiple(['df1_mt','df2_mt'], where = [ 'A>0','B>0' ], selector = 'df1_mt') + +.. 
+   :suppress:
+
+   store.close()
+   import os
+   os.remove('store.h5')
+
+**Enhancements**
+
+- ``HDFStore`` can now read native PyTables table format tables
+
+- You can pass ``nan_rep = 'my_nan_rep'`` to append, to change the default nan
+  representation on disk (which converts to/from `np.nan`); this defaults to
+  `nan`.
+
+- You can pass ``index`` to ``append``. This defaults to ``True``. This will
+  automagically create indices on the *indexables* and *data columns* of the
+  table
+
+- You can pass ``chunksize=an integer`` to ``append``, to change the writing
+  chunksize (default is 50000). This will significantly lower your memory usage
+  on writing.
+
+- You can pass ``expectedrows=an integer`` to the first ``append``, to set the
+  TOTAL number of rows that ``PyTables`` will expect. This will optimize
+  read/write performance.
+
+- ``Select`` now supports passing ``start`` and ``stop`` to limit the selection
+  space; a short sketch combining these new keywords appears at the end of this
+  section.
+
+- Greatly improved ISO8601 (e.g., yyyy-mm-dd) date parsing for file parsers (:issue:`2698`)
+- Allow ``DataFrame.merge`` to handle combinatorial sizes too large for 64-bit
+  integer (:issue:`2690`)
+- Series now has unary negation (-series) and inversion (~series) operators (:issue:`2686`)
+- DataFrame.plot now includes a ``logx`` parameter to change the x-axis to log scale (:issue:`2327`)
+- Series arithmetic operators can now handle constant and ndarray input (:issue:`2574`)
+- ExcelFile now takes a ``kind`` argument to specify the file type (:issue:`2613`)
+- A faster implementation for Series.str methods (:issue:`2602`)
+
+**Bug Fixes**
+
+- ``HDFStore`` tables can now store ``float32`` types correctly (cannot be
+  mixed with ``float64`` however)
+- Fixed Google Analytics prefix when specifying request segment (:issue:`2713`).
+- Function to reset Google Analytics token store so users can recover from
+  improperly set up client secrets (:issue:`2687`).
+- Fixed groupby bug resulting in segfault when passing in MultiIndex (:issue:`2706`)
+- Fixed bug where passing a Series with datetime64 values into `to_datetime`
+  results in bogus output values (:issue:`2699`)
+- Fixed bug in ``pattern in HDFStore`` expressions when pattern is not a valid
+  regex (:issue:`2694`)
+- Fixed performance issues while aggregating boolean data (:issue:`2692`)
+- When given a boolean mask key and a Series of new values, Series __setitem__
+  will now align the incoming values with the original Series (:issue:`2686`)
+- Fixed MemoryError caused by performing counting sort on sorting MultiIndex
+  levels with a very large number of combinatorial values (:issue:`2684`)
+- Fixed bug that caused plotting to fail when the index is a DatetimeIndex with
+  a fixed-offset timezone (:issue:`2683`)
+- Corrected businessday subtraction logic when the offset is more than 5 bdays
+  and the starting date is on a weekend (:issue:`2680`)
+- Fixed C file parser behavior when the file has more columns than data
+  (:issue:`2668`)
+- Fixed file reader bug that misaligned columns with data in the presence of an
+  implicit column and a specified `usecols` value
+- DataFrames with numerical or datetime indices are now sorted prior to
+  plotting (:issue:`2609`)
+- Fixed DataFrame.from_records error when passed columns, index, but empty
+  records (:issue:`2633`)
+- Several bugs fixed for Series operations when dtype is datetime64 (:issue:`2689`,
+  :issue:`2629`, :issue:`2626`)
+
+
+See the :ref:`full release notes
+` or issue tracker
+on GitHub for a complete list.
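+
+As a quick addendum, the following is a minimal sketch (not executed here) that
+pulls together the new ``HDFStore`` ``append``/``select`` keywords described in
+the Enhancements above; the store key, sizes, and keyword values are purely
+illustrative.
+
+.. code-block:: python
+
+   store = HDFStore('store.h5')
+   df = DataFrame(randn(100000, 3), columns=['A', 'B', 'C'])
+
+   # control the on-disk nan representation, index creation and the write
+   # chunksize when appending (key name and values are only examples)
+   store.append('wide', df, nan_rep='my_nan_rep', index=True,
+                chunksize=10000, expectedrows=100000)
+
+   # limit the selection space by row number with start/stop
+   store.select('wide', start=0, stop=10)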
+ diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt new file mode 100644 index 00000000..3a567941 --- /dev/null +++ b/doc/source/v0.11.0.txt @@ -0,0 +1,332 @@ +.. _whatsnew_0110: + +v0.11.0 (April 22, 2013) +------------------------ + +This is a major release from 0.10.1 and includes many new features and +enhancements along with a large number of bug fixes. The methods of Selecting +Data have had quite a number of additions, and Dtype support is now full-fledged. +There are also a number of important API changes that long-time pandas users should +pay close attention to. + +There is a new section in the documentation, :ref:`10 Minutes to Pandas <10min>`, +primarily geared to new users. + +There is a new section in the documentation, :ref:`Cookbook `, a collection +of useful recipes in pandas (and that we want contributions!). + +There are several libraries that are now :ref:`Recommended Dependencies ` + +Selection Choices +~~~~~~~~~~~~~~~~~ + +Starting in 0.11.0, object selection has had a number of user-requested additions in +order to support more explicit location based indexing. Pandas now supports +three types of multi-axis indexing. + +- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) + - A list or array of labels ``['a', 'b', 'c']`` + - A slice object with labels ``'a':'f'``, (note that contrary to usual python slices, **both** the start and the stop are included!) + - A boolean array + + See more at :ref:`Selection by Label ` + +- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. Allowed inputs are: + + - An integer e.g. ``5`` + - A list or array of integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` + - A boolean array + + See more at :ref:`Selection by Position ` + +- ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. ``.ix`` is the most general and will support + any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. ``.ix`` is especially useful when dealing with mixed positional and label + based hierarchial indexes. + + As using integer slices with ``.ix`` have different behavior depending on whether the slice + is interpreted as position based or label based, it's usually better to be + explicit and use ``.iloc`` or ``.loc``. + + See more at :ref:`Advanced Indexing `, :ref:`Advanced Hierarchical ` and + :ref:`Fallback Indexing ` + + +Selection Deprecations +~~~~~~~~~~~~~~~~~~~~~~ + +Starting in version 0.11.0, these methods *may* be deprecated in future versions. + +- ``irow`` +- ``icol`` +- ``iget_value`` + +See the section :ref:`Selection by Position ` for substitutes. + +Dtypes +~~~~~~ + +Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. + +.. 
ipython:: python + + df1 = DataFrame(randn(8, 1), columns = ['A'], dtype = 'float32') + df1 + df1.dtypes + df2 = DataFrame(dict( A = Series(randn(8),dtype='float16'), + B = Series(randn(8)), + C = Series(randn(8),dtype='uint8') )) + df2 + df2.dtypes + + # here you get some upcasting + df3 = df1.reindex_like(df2).fillna(value=0.0) + df2 + df3 + df3.dtypes + +Dtype Conversion +~~~~~~~~~~~~~~~~ + +This is lower-common-denomicator upcasting, meaning you get the dtype which can accomodate all of the types + +.. ipython:: python + + df3.values.dtype + +Conversion + +.. ipython:: python + + df3.astype('float32').dtypes + +Mixed Conversion + +.. ipython:: python + + df3['D'] = '1.' + df3['E'] = '1' + df3.convert_objects(convert_numeric=True).dtypes + + # same, but specific dtype conversion + df3['D'] = df3['D'].astype('float16') + df3['E'] = df3['E'].astype('int32') + df3.dtypes + +Forcing Date coercion (and setting ``NaT`` when not datelike) + +.. ipython:: python + + from datetime import datetime + s = Series([datetime(2001,1,1,0,0), 'foo', 1.0, 1, + Timestamp('20010104'), '20010105'],dtype='O') + s.convert_objects(convert_dates='coerce') + +Dtype Gotchas +~~~~~~~~~~~~~ + +**Platform Gotchas** + +Starting in 0.11.0, construction of DataFrame/Series will use default dtypes of ``int64`` and ``float64``, +*regardless of platform*. This is not an apparent change from earlier versions of pandas. If you specify +dtypes, they *WILL* be respected, however (:issue:`2837`) + +The following will all result in ``int64`` dtypes + +.. ipython:: python + + DataFrame([1,2],columns=['a']).dtypes + DataFrame({'a' : [1,2] }).dtypes + DataFrame({'a' : 1 }, index=range(2)).dtypes + +Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms! + + +**Upcasting Gotchas** + +Performing indexing operations on integer type data can easily upcast the data. +The dtype of the input data will be preserved in cases where ``nans`` are not introduced. + +.. ipython:: python + + dfi = df3.astype('int32') + dfi['D'] = dfi['D'].astype('int64') + dfi + dfi.dtypes + + casted = dfi[dfi>0] + casted + casted.dtypes + +While float dtypes are unchanged. + +.. ipython:: python + + df4 = df3.copy() + df4['A'] = df4['A'].astype('float32') + df4.dtypes + + casted = df4[df4>0] + casted + casted.dtypes + +Datetimes Conversion +~~~~~~~~~~~~~~~~~~~~ + +Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, +in addition to the traditional ``NaT``, or not-a-time. This allows convenient nan setting in a generic way. +Furthermore ``datetime64[ns]`` columns are created by default, when passed datetimelike objects (*this change was introduced in 0.10.1*) +(:issue:`2809`, :issue:`2810`) + +.. ipython:: python + + df = DataFrame(randn(6,2),date_range('20010102',periods=6),columns=['A','B']) + df['timestamp'] = Timestamp('20010103') + df + + # datetime64[ns] out of the box + df.get_dtype_counts() + + # use the traditional nan, which is mapped to NaT internally + df.ix[2:4,['A','timestamp']] = np.nan + df + +Astype conversion on ``datetime64[ns]`` to ``object``, implicity converts ``NaT`` to ``np.nan`` + +.. 
ipython:: python + + import datetime + s = Series([datetime.datetime(2001, 1, 2, 0, 0) for i in range(3)]) + s.dtype + s[1] = np.nan + s + s.dtype + s = s.astype('O') + s + s.dtype + + +API changes +~~~~~~~~~~~ + + - Added to_series() method to indicies, to facilitate the creation of indexers + (:issue:`3275`) + + - ``HDFStore`` + + - added the method ``select_column`` to select a single column from a table as a Series. + - deprecated the ``unique`` method, can be replicated by ``select_column(key,column).unique()`` + - ``min_itemsize`` parameter to ``append`` will now automatically create data_columns for passed keys + +Enhancements +~~~~~~~~~~~~ + + - Improved performance of df.to_csv() by up to 10x in some cases. (:issue:`3059`) + + - Numexpr is now a :ref:`Recommended Dependencies `, to accelerate certain + types of numerical and boolean operations + + - Bottleneck is now a :ref:`Recommended Dependencies `, to accelerate certain + types of ``nan`` operations + + - ``HDFStore`` + + - support ``read_hdf/to_hdf`` API similar to ``read_csv/to_csv`` + + .. ipython:: python + :suppress: + + from pandas.compat import lrange + + .. ipython:: python + + df = DataFrame(dict(A=lrange(5), B=lrange(5))) + df.to_hdf('store.h5','table',append=True) + read_hdf('store.h5', 'table', where = ['index>2']) + + .. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + + - provide dotted attribute access to ``get`` from stores, e.g. ``store.df == store['df']`` + + - new keywords ``iterator=boolean``, and ``chunksize=number_in_a_chunk`` are + provided to support iteration on ``select`` and ``select_as_multiple`` (:issue:`3076`) + + - You can now select timestamps from an *unordered* timeseries similarly to an *ordered* timeseries (:issue:`2437`) + + - You can now select with a string from a DataFrame with a datelike index, in a similar way to a Series (:issue:`3070`) + + .. ipython:: python + + idx = date_range("2001-10-1", periods=5, freq='M') + ts = Series(np.random.rand(len(idx)),index=idx) + ts['2001'] + + df = DataFrame(dict(A = ts)) + df['2001'] + + - ``Squeeze`` to possibly remove length 1 dimensions from an object. + + .. ipython:: python + + p = Panel(randn(3,4,4),items=['ItemA','ItemB','ItemC'], + major_axis=date_range('20010102',periods=4), + minor_axis=['A','B','C','D']) + p + p.reindex(items=['ItemA']).squeeze() + p.reindex(items=['ItemA'],minor=['B']).squeeze() + + - In ``pd.io.data.Options``, + + + Fix bug when trying to fetch data for the current month when already + past expiry. + + Now using lxml to scrape html instead of BeautifulSoup (lxml was faster). + + New instance variables for calls and puts are automatically created + when a method that creates them is called. This works for current month + where the instance variables are simply ``calls`` and ``puts``. Also + works for future expiry months and save the instance variable as + ``callsMMYY`` or ``putsMMYY``, where ``MMYY`` are, respectively, the + month and year of the option's expiry. + + ``Options.get_near_stock_price`` now allows the user to specify the + month for which to get relevant options data. + + ``Options.get_forward_data`` now has optional kwargs ``near`` and + ``above_below``. This allows the user to specify if they would like to + only return forward looking data for options near the current stock + price. This just obtains the data from Options.get_near_stock_price + instead of Options.get_xxx_data() (:issue:`2758`). + + - Cursor coordinate information is now displayed in time-series plots. 
+ + - added option `display.max_seq_items` to control the number of + elements printed per sequence pprinting it. (:issue:`2979`) + + - added option `display.chop_threshold` to control display of small numerical + values. (:issue:`2739`) + + - added option `display.max_info_rows` to prevent verbose_info from being + calculated for frames above 1M rows (configurable). (:issue:`2807`, :issue:`2918`) + + - value_counts() now accepts a "normalize" argument, for normalized + histograms. (:issue:`2710`). + + - DataFrame.from_records now accepts not only dicts but any instance of + the collections.Mapping ABC. + + - added option `display.mpl_style` providing a sleeker visual style + for plots. Based on https://gist.github.com/huyng/816622 (:issue:`3075`). + + - Treat boolean values as integers (values 1 and 0) for numeric + operations. (:issue:`2641`) + + - to_html() now accepts an optional "escape" argument to control reserved + HTML character escaping (enabled by default) and escapes ``&``, in addition + to ``<`` and ``>``. (:issue:`2919`) + +See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list. + diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt new file mode 100644 index 00000000..fd726af3 --- /dev/null +++ b/doc/source/v0.12.0.txt @@ -0,0 +1,494 @@ +.. _whatsnew_0120: + +v0.12.0 (July 24, 2013) +------------------------ + +This is a major release from 0.11.0 and includes several new features and +enhancements along with a large number of bug fixes. + +Highlights include a consistent I/O API naming scheme, routines to read html, +write multi-indexes to csv files, read & write STATA data files, read & write JSON format +files, Python 3 support for ``HDFStore``, filtering of groupby expressions via ``filter``, and a +revamped ``replace`` routine that accepts regular expressions. + +API changes +~~~~~~~~~~~ + + - The I/O API is now much more consistent with a set of top level ``reader`` functions + accessed like ``pd.read_csv()`` that generally return a ``pandas`` object. + + * ``read_csv`` + * ``read_excel`` + * ``read_hdf`` + * ``read_sql`` + * ``read_json`` + * ``read_html`` + * ``read_stata`` + * ``read_clipboard`` + + The corresponding ``writer`` functions are object methods that are accessed like ``df.to_csv()`` + + * ``to_csv`` + * ``to_excel`` + * ``to_hdf`` + * ``to_sql`` + * ``to_json`` + * ``to_html`` + * ``to_stata`` + * ``to_clipboard`` + + + - Fix modulo and integer division on Series,DataFrames to act similary to ``float`` dtypes to return + ``np.nan`` or ``np.inf`` as appropriate (:issue:`3590`). This correct a numpy bug that treats ``integer`` + and ``float`` dtypes differently. + + .. ipython:: python + + p = DataFrame({ 'first' : [4,5,8], 'second' : [0,0,3] }) + p % 0 + p % p + p / p + p / 0 + + - Add ``squeeze`` keyword to ``groupby`` to allow reduction from + DataFrame -> Series if groups are unique. This is a Regression from 0.10.1. + We are reverting back to the prior behavior. This means groupby will return the + same shaped objects whether the groups are unique or not. Revert this issue (:issue:`2893`) + with (:issue:`3596`). + + .. 
ipython:: python + + df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19}, + {"val1":1, "val2": 27}, {"val1":1, "val2": 12}]) + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + # squeezing the result frame to a series (because we have unique groups) + df2.groupby("val1", squeeze=True).apply(func) + + # no squeezing (the default, and behavior in 0.10.1) + df2.groupby("val1").apply(func) + + - Raise on ``iloc`` when boolean indexing with a label based indexer mask + e.g. a boolean Series, even with integer labels, will raise. Since ``iloc`` + is purely positional based, the labels on the Series are not alignable (:issue:`3631`) + + This case is rarely used, and there are plently of alternatives. This preserves the + ``iloc`` API to be *purely* positional based. + + .. ipython:: python + :suppress: + + from pandas.compat import lrange + + .. ipython:: python + + df = DataFrame(lrange(5), list('ABCDE'), columns=['a']) + mask = (df.a%2 == 0) + mask + + # this is what you should use + df.loc[mask] + + # this will work as well + df.iloc[mask.values] + + ``df.iloc[mask]`` will raise a ``ValueError`` + + - The ``raise_on_error`` argument to plotting functions is removed. Instead, + plotting functions raise a ``TypeError`` when the ``dtype`` of the object + is ``object`` to remind you to avoid ``object`` arrays whenever possible + and thus you should cast to an appropriate numeric dtype if you need to + plot something. + + - Add ``colormap`` keyword to DataFrame plotting methods. Accepts either a + matplotlib colormap object (ie, matplotlib.cm.jet) or a string name of such + an object (ie, 'jet'). The colormap is sampled to select the color for each + column. Please see :ref:`visualization.colormaps` for more information. + (:issue:`3860`) + + - ``DataFrame.interpolate()`` is now deprecated. Please use + ``DataFrame.fillna()`` and ``DataFrame.replace()`` instead. (:issue:`3582`, + :issue:`3675`, :issue:`3676`) + + - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are + deprecated + + - ``DataFrame.replace`` 's ``infer_types`` parameter is removed and now + performs conversion by default. (:issue:`3907`) + + - Add the keyword ``allow_duplicates`` to ``DataFrame.insert`` to allow a duplicate column + to be inserted if ``True``, default is ``False`` (same as prior to 0.12) (:issue:`3679`) + - Implement ``__nonzero__`` for ``NDFrame`` objects (:issue:`3691`, :issue:`3696`) + + - IO api + + - added top-level function ``read_excel`` to replace the following, + The original API is deprecated and will be removed in a future version + + .. code-block:: python + + from pandas.io.parsers import ExcelFile + xls = ExcelFile('path_to_file.xls') + xls.parse('Sheet1', index_col=None, na_values=['NA']) + + With + + .. code-block:: python + + import pandas as pd + pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + + - added top-level function ``read_sql`` that is equivalent to the following + + .. code-block:: python + + from pandas.io.sql import read_frame + read_frame(....) + + - ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for + their first argument (:issue:`3702`) + + - Do not allow astypes on ``datetime64[ns]`` except to ``object``, and + ``timedelta64[ns]`` to ``object/int`` (:issue:`3425`) + + - The behavior of ``datetime64`` dtypes has changed with respect to certain + so-called reduction operations (:issue:`3726`). 
The following operations now + raise a ``TypeError`` when perfomed on a ``Series`` and return an *empty* + ``Series`` when performed on a ``DataFrame`` similar to performing these + operations on, for example, a ``DataFrame`` of ``slice`` objects: + + - sum, prod, mean, std, var, skew, kurt, corr, and cov + + - ``read_html`` now defaults to ``None`` when reading, and falls back on + ``bs4`` + ``html5lib`` when lxml fails to parse. a list of parsers to try + until success is also valid + + - The internal ``pandas`` class hierarchy has changed (slightly). The + previous ``PandasObject`` now is called ``PandasContainer`` and a new + ``PandasObject`` has become the baseclass for ``PandasContainer`` as well + as ``Index``, ``Categorical``, ``GroupBy``, ``SparseList``, and + ``SparseArray`` (+ their base classes). Currently, ``PandasObject`` + provides string methods (from ``StringMixin``). (:issue:`4090`, :issue:`4092`) + + - New ``StringMixin`` that, given a ``__unicode__`` method, gets python 2 and + python 3 compatible string methods (``__str__``, ``__bytes__``, and + ``__repr__``). Plus string safety throughout. Now employed in many places + throughout the pandas library. (:issue:`4090`, :issue:`4092`) + +I/O Enhancements +~~~~~~~~~~~~~~~~ + + - ``pd.read_html()`` can now parse HTML strings, files or urls and return + DataFrames, courtesy of @cpcloud. (:issue:`3477`, :issue:`3605`, :issue:`3606`, :issue:`3616`). + It works with a *single* parser backend: BeautifulSoup4 + html5lib :ref:`See the docs` + + You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so + + .. ipython :: python + :okwarning: + + df = DataFrame({'a': range(3), 'b': list('abc')}) + print(df) + html = df.to_html() + alist = pd.read_html(html, infer_types=True, index_col=0) + print(df == alist[0]) + + Note that ``alist`` here is a Python ``list`` so ``pd.read_html()`` and + ``DataFrame.to_html()`` are not inverses. + + - ``pd.read_html()`` no longer performs hard conversion of date strings + (:issue:`3656`). + + .. warning:: + + You may have to install an older version of BeautifulSoup4, + :ref:`See the installation docs` + + - Added module for reading and writing Stata files: ``pandas.io.stata`` (:issue:`1512`) + accessable via ``read_stata`` top-level function for reading, + and ``to_stata`` DataFrame method for writing, :ref:`See the docs` + + - Added module for reading and writing json format files: ``pandas.io.json`` + accessable via ``read_json`` top-level function for reading, + and ``to_json`` DataFrame method for writing, :ref:`See the docs` + various issues (:issue:`1226`, :issue:`3804`, :issue:`3876`, :issue:`3867`, :issue:`1305`) + + - ``MultiIndex`` column support for reading and writing csv format files + + - The ``header`` option in ``read_csv`` now accepts a + list of the rows from which to read the index. + + - The option, ``tupleize_cols`` can now be specified in both ``to_csv`` and + ``read_csv``, to provide compatiblity for the pre 0.12 behavior of + writing and reading ``MultIndex`` columns via a list of tuples. The default in + 0.12 is to write lists of tuples and *not* interpret list of tuples as a + ``MultiIndex`` column. + + Note: The default behavior in 0.12 remains unchanged from prior versions, but starting with 0.13, + the default *to* write and read ``MultiIndex`` columns will be in the new + format. (:issue:`3571`, :issue:`1651`, :issue:`3141`) + + - If an ``index_col`` is not specified (e.g. 
you don't have an index, or wrote it + with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will + be *lost*. + + .. ipython:: python + + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv('mi.csv',tupleize_cols=False) + print(open('mi.csv').read()) + pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) + + .. ipython:: python + :suppress: + + import os + os.remove('mi.csv') + + - Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3 + + - Iterator support via ``read_hdf`` that automatically opens and closes the + store when iteration is finished. This is only for *tables* + + .. ipython:: python + :okwarning: + + path = 'store_iterator.h5' + DataFrame(randn(10,2)).to_hdf(path,'df',table=True) + for df in read_hdf(path,'df', chunksize=3): + print(df) + + .. ipython:: python + :suppress: + + import os + os.remove(path) + + - ``read_csv`` will now throw a more informative error message when a file + contains no columns, e.g., all newline characters + +Other Enhancements +~~~~~~~~~~~~~~~~~~ + + - ``DataFrame.replace()`` now allows regular expressions on contained + ``Series`` with object dtype. See the examples section in the regular docs + :ref:`Replacing via String Expression ` + + For example you can do + + .. ipython :: python + + df = DataFrame({'a': list('ab..'), 'b': [1, 2, 3, 4]}) + df.replace(regex=r'\s*\.\s*', value=np.nan) + + to replace all occurrences of the string ``'.'`` with zero or more + instances of surrounding whitespace with ``NaN``. + + Regular string replacement still works as expected. For example, you can do + + .. ipython :: python + + df.replace('.', np.nan) + + to replace all occurrences of the string ``'.'`` with ``NaN``. + + - ``pd.melt()`` now accepts the optional parameters ``var_name`` and ``value_name`` + to specify custom column names of the returned DataFrame. + + - ``pd.set_option()`` now allows N option, value pairs (:issue:`3667`). + + Let's say that we had an option ``'a.b'`` and another option ``'b.c'``. + We can set them at the same time: + + .. ipython:: python + :suppress: + + pd.core.config.register_option('a.b', 2, 'ay dot bee') + pd.core.config.register_option('b.c', 3, 'bee dot cee') + + .. ipython:: python + + pd.get_option('a.b') + pd.get_option('b.c') + pd.set_option('a.b', 1, 'b.c', 4) + pd.get_option('a.b') + pd.get_option('b.c') + + - The ``filter`` method for group objects returns a subset of the original + object. Suppose we want to take only elements that belong to groups with a + group sum greater than 2. + + .. ipython:: python + + sf = Series([1, 1, 2, 3, 3, 3]) + sf.groupby(sf).filter(lambda x: x.sum() > 2) + + The argument of ``filter`` must a function that, applied to the group as a + whole, returns ``True`` or ``False``. + + Another useful operation is filtering out elements that belong to groups + with only a couple members. + + .. ipython:: python + + dff = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) + dff.groupby('B').filter(lambda x: len(x) > 2) + + Alternatively, instead of dropping the offending groups, we can return a + like-indexed objects where the groups that do not pass the filter are + filled with NaNs. + + .. 
ipython:: python + + dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + + - Series and DataFrame hist methods now take a ``figsize`` argument (:issue:`3834`) + + - DatetimeIndexes no longer try to convert mixed-integer indexes during join + operations (:issue:`3877`) + + - Timestamp.min and Timestamp.max now represent valid Timestamp instances instead + of the default datetime.min and datetime.max (respectively), thanks @SleepingPills + + - ``read_html`` now raises when no tables are found and BeautifulSoup==4.2.0 + is detected (:issue:`4214`) + + +Experimental Features +~~~~~~~~~~~~~~~~~~~~~ + + - Added experimental ``CustomBusinessDay`` class to support ``DateOffsets`` + with custom holiday calendars and custom weekmasks. (:issue:`2301`) + + .. note:: + + This uses the ``numpy.busdaycalendar`` API introduced in Numpy 1.7 and + therefore requires Numpy 1.7.0 or newer. + + .. ipython:: python + + from pandas.tseries.offsets import CustomBusinessDay + from datetime import datetime + # As an interesting example, let's look at Egypt where + # a Friday-Saturday weekend is observed. + weekmask_egypt = 'Sun Mon Tue Wed Thu' + # They also observe International Workers' Day so let's + # add that for a couple of years + holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] + bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) + dt = datetime(2013, 4, 30) + print(dt + 2 * bday_egypt) + dts = date_range(dt, periods=5, freq=bday_egypt) + print(Series(dts.weekday, dts).map(Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) + +Bug Fixes +~~~~~~~~~ + + - Plotting functions now raise a ``TypeError`` before trying to plot anything + if the associated objects have have a dtype of ``object`` (:issue:`1818`, + :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to + numeric arrays if possible so that you can still plot, for example, an + object array with floats. This happens before any drawing takes place which + elimnates any spurious plots from showing up. + + - ``fillna`` methods now raise a ``TypeError`` if the ``value`` parameter is + a list or tuple. + + - ``Series.str`` now supports iteration (:issue:`3638`). You can iterate over the + individual elements of each string in the ``Series``. Each iteration yields + yields a ``Series`` with either a single character at each index of the + original ``Series`` or ``NaN``. For example, + + .. ipython:: python + + strs = 'go', 'bow', 'joe', 'slow' + ds = Series(strs) + + for s in ds.str: + print(s) + + s + s.dropna().values.item() == 'w' + + The last element yielded by the iterator will be a ``Series`` containing + the last element of the longest string in the ``Series`` with all other + elements being ``NaN``. Here since ``'slow'`` is the longest string + and there are no other strings with the same length ``'w'`` is the only + non-null string in the yielded ``Series``. + + - ``HDFStore`` + + - will retain index attributes (freq,tz,name) on recreation (:issue:`3499`) + - will warn with a ``AttributeConflictWarning`` if you are attempting to append + an index with a different frequency than the existing, or attempting + to append an index with a different name than the existing + - support datelike columns with a timezone as data_columns (:issue:`2852`) + + - Non-unique index support clarified (:issue:`3468`). 
+ + - Fix assigning a new index to a duplicate index in a DataFrame would fail (:issue:`3468`) + - Fix construction of a DataFrame with a duplicate index + - ref_locs support to allow duplicative indices across dtypes, + allows iget support to always find the index (even across dtypes) (:issue:`2194`) + - applymap on a DataFrame with a non-unique index now works + (removed warning) (:issue:`2786`), and fix (:issue:`3230`) + - Fix to_csv to handle non-unique columns (:issue:`3495`) + - Duplicate indexes with getitem will return items in the correct order (:issue:`3455`, :issue:`3457`) + and handle missing elements like unique indices (:issue:`3561`) + - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (:issue:`3562`) + - Concat to produce a non-unique columns when duplicates are across dtypes is fixed (:issue:`3602`) + - Allow insert/delete to non-unique columns (:issue:`3679`) + - Non-unique indexing with a slice via ``loc`` and friends fixed (:issue:`3659`) + - Allow insert/delete to non-unique columns (:issue:`3679`) + - Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`) + - ``DataFrame.itertuples()`` now works with frames with duplicate column + names (:issue:`3873`) + - Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to + ``reindex`` for location-based taking + - Allow non-unique indexing in series via ``.ix/.loc`` and ``__getitem__`` (:issue:`4246`) + - Fixed non-unique indexing memory allocation issue with ``.ix/.loc`` (:issue:`4280`) + + - ``DataFrame.from_records`` did not accept empty recarrays (:issue:`3682`) + - ``read_html`` now correctly skips tests (:issue:`3741`) + - Fixed a bug where ``DataFrame.replace`` with a compiled regular expression + in the ``to_replace`` argument wasn't working (:issue:`3907`) + - Improved ``network`` test decorator to catch ``IOError`` (and therefore + ``URLError`` as well). Added ``with_connectivity_check`` decorator to allow + explicitly checking a website as a proxy for seeing if there is network + connectivity. Plus, new ``optional_args`` decorator factory for decorators. 
+ (:issue:`3910`, :issue:`3914`) + - Fixed testing issue where too many sockets where open thus leading to a + connection reset issue (:issue:`3982`, :issue:`3985`, :issue:`4028`, + :issue:`4054`) + - Fixed failing tests in test_yahoo, test_google where symbols were not + retrieved but were being accessed (:issue:`3982`, :issue:`3985`, + :issue:`4028`, :issue:`4054`) + - ``Series.hist`` will now take the figure from the current environment if + one is not passed + - Fixed bug where a 1xN DataFrame would barf on a 1xN mask (:issue:`4071`) + - Fixed running of ``tox`` under python3 where the pickle import was getting + rewritten in an incompatible way (:issue:`4062`, :issue:`4063`) + - Fixed bug where sharex and sharey were not being passed to grouped_hist + (:issue:`4089`) + - Fixed bug in ``DataFrame.replace`` where a nested dict wasn't being + iterated over when regex=False (:issue:`4115`) + - Fixed bug in the parsing of microseconds when using the ``format`` + argument in ``to_datetime`` (:issue:`4152`) + - Fixed bug in ``PandasAutoDateLocator`` where ``invert_xaxis`` triggered + incorrectly ``MilliSecondLocator`` (:issue:`3990`) + - Fixed bug in plotting that wasn't raising on invalid colormap for + matplotlib 1.1.1 (:issue:`4215`) + - Fixed the legend displaying in ``DataFrame.plot(kind='kde')`` (:issue:`4216`) + - Fixed bug where Index slices weren't carrying the name attribute + (:issue:`4226`) + - Fixed bug in initializing ``DatetimeIndex`` with an array of strings + in a certain time zone (:issue:`4229`) + - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`) + - Fixed bug where get_data_famafrench wasn't using the correct file edges + (:issue:`4281`) + +See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list. diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt new file mode 100644 index 00000000..ac0a14f4 --- /dev/null +++ b/doc/source/v0.13.0.txt @@ -0,0 +1,983 @@ +.. _whatsnew_0130: + +v0.13.0 (January 3, 2014) +--------------------------- + +This is a major release from 0.12.0 and includes a number of API changes, several new features and +enhancements along with a large number of bug fixes. + +Highlights include: + +- support for a new index type ``Float64Index``, and other Indexing enhancements +- ``HDFStore`` has a new string based syntax for query specification +- support for new methods of interpolation +- updated ``timedelta`` operations +- a new string manipulation method ``extract`` +- Nanosecond support for Offsets +- ``isin`` for DataFrames + +Several experimental features are added, including: + +- new ``eval/query`` methods for expression evaluation +- support for ``msgpack`` serialization +- an i/o interface to Google's ``BigQuery`` + +Their are several new or updated docs sections including: + +- :ref:`Comparison with SQL`, which should be useful for those familiar with SQL but still learning pandas. +- :ref:`Comparison with R`, idiom translations from R to pandas. +- :ref:`Enhancing Performance`, ways to enhance pandas performance with ``eval/query``. + +.. warning:: + + In 0.13.0 ``Series`` has internally been refactored to no longer sub-class ``ndarray`` + but instead subclass ``NDFrame``, similar to the rest of the pandas containers. This should be + a transparent change with only very limited API implications. See :ref:`Internal Refactoring` + +API changes +~~~~~~~~~~~ + +- ``read_excel`` now supports an integer in its ``sheetname`` argument giving + the index of the sheet to read in (:issue:`4301`). 
+- Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf", + "iNf", etc.) as infinity. (:issue:`4220`, :issue:`4219`), affecting + ``read_table``, ``read_csv``, etc. +- ``pandas`` now is Python 2/3 compatible without the need for 2to3 thanks to + @jtratner. As a result, pandas now uses iterators more extensively. This + also led to the introduction of substantive parts of the Benjamin + Peterson's ``six`` library into compat. (:issue:`4384`, :issue:`4375`, + :issue:`4372`) +- ``pandas.util.compat`` and ``pandas.util.py3compat`` have been merged into + ``pandas.compat``. ``pandas.compat`` now includes many functions allowing + 2/3 compatibility. It contains both list and iterator versions of range, + filter, map and zip, plus other necessary elements for Python 3 + compatibility. ``lmap``, ``lzip``, ``lrange`` and ``lfilter`` all produce + lists instead of iterators, for compatibility with ``numpy``, subscripting + and ``pandas`` constructors.(:issue:`4384`, :issue:`4375`, :issue:`4372`) +- ``Series.get`` with negative indexers now returns the same as ``[]`` (:issue:`4390`) +- Changes to how ``Index`` and ``MultiIndex`` handle metadata (``levels``, + ``labels``, and ``names``) (:issue:`4039`): + + .. code-block:: python + + # previously, you would have set levels or labels directly + index.levels = [[1, 2, 3, 4], [1, 2, 4, 4]] + + # now, you use the set_levels or set_labels methods + index = index.set_levels([[1, 2, 3, 4], [1, 2, 4, 4]]) + + # similarly, for names, you can rename the object + # but setting names is not deprecated + index = index.set_names(["bob", "cranberry"]) + + # and all methods take an inplace kwarg - but return None + index.set_names(["bob", "cranberry"], inplace=True) + +- **All** division with ``NDFrame`` objects is now *truedivision*, regardless + of the future import. This means that operating on pandas objects will by default + use *floating point* division, and return a floating point dtype. + You can use ``//`` and ``floordiv`` to do integer division. + + Integer division + + .. code-block:: python + + In [3]: arr = np.array([1, 2, 3, 4]) + + In [4]: arr2 = np.array([5, 3, 2, 1]) + + In [5]: arr / arr2 + Out[5]: array([0, 0, 1, 4]) + + In [6]: Series(arr) // Series(arr2) + Out[6]: + 0 0 + 1 0 + 2 1 + 3 4 + dtype: int64 + + True Division + + .. code-block:: python + + In [7]: pd.Series(arr) / pd.Series(arr2) # no future import required + Out[7]: + 0 0.200000 + 1 0.666667 + 2 1.500000 + 3 4.000000 + dtype: float64 + +- Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`) +- ``__nonzero__`` for all NDFrame objects, will now raise a ``ValueError``, this reverts back to (:issue:`1073`, :issue:`4633`) + behavior. See :ref:`gotchas` for a more detailed discussion. + + This prevents doing boolean comparison on *entire* pandas objects, which is inherently ambiguous. These all will raise a ``ValueError``. + + .. code-block:: python + + if df: + .... + df1 and df2 + s1 and s2 + + Added the ``.bool()`` method to ``NDFrame`` objects to facilitate evaluating of single-element boolean Series: + + .. ipython:: python + + Series([True]).bool() + Series([False]).bool() + DataFrame([[True]]).bool() + DataFrame([[False]]).bool() + +- All non-Index NDFrames (``Series``, ``DataFrame``, ``Panel``, ``Panel4D``, + ``SparsePanel``, etc.), now support the entire set of arithmetic operators + and arithmetic flex methods (add, sub, mul, etc.). ``SparsePanel`` does not + support ``pow`` or ``mod`` with non-scalars. 
(:issue:`3765`) +- ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the + statistical mode(s) by axis/Series. (:issue:`5367`) + +- Chained assignment will now by default warn if the user is assigning to a copy. This can be changed + with the option ``mode.chained_assignment``, allowed options are ``raise/warn/None``. See :ref:`the docs`. + + .. ipython:: python + + dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + pd.set_option('chained_assignment','warn') + + The following warning / exception will show if this is attempted. + + .. ipython:: python + :okwarning: + + dfc.loc[0]['A'] = 1111 + + :: + + Traceback (most recent call last) + ... + SettingWithCopyWarning: + A value is trying to be set on a copy of a slice from a DataFrame. + Try using .loc[row_index,col_indexer] = value instead + + Here is the correct method of assignment. + + .. ipython:: python + + dfc.loc[0,'A'] = 11 + dfc + +- ``Panel.reindex`` has the following call signature ``Panel.reindex(items=None, major_axis=None, minor_axis=None, **kwargs)`` + to conform with other ``NDFrame`` objects. See :ref:`Internal Refactoring` for more information. + +- ``Series.argmin`` and ``Series.argmax`` are now aliased to ``Series.idxmin`` and ``Series.idxmax``. These return the *index* of the + min or max element respectively. Prior to 0.13.0 these would return the position of the min / max element. (:issue:`6214`) + +Prior Version Deprecations/Changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These were announced changes in 0.12 or prior that are taking effect as of 0.13.0 + +- Remove deprecated ``Factor`` (:issue:`3650`) +- Remove deprecated ``set_printoptions/reset_printoptions`` (:issue:`3046`) +- Remove deprecated ``_verbose_info`` (:issue:`3215`) +- Remove deprecated ``read_clipboard/to_clipboard/ExcelFile/ExcelWriter`` from ``pandas.io.parsers`` (:issue:`3717`) + These are available as functions in the main pandas namespace (e.g. ``pd.read_clipboard``) +- default for ``tupleize_cols`` is now ``False`` for both ``to_csv`` and ``read_csv``. Fair warning in 0.12 (:issue:`3604`) +- default for `display.max_seq_len` is now 100 rather then `None`. This activates + truncated display ("...") of long sequences in various places. (:issue:`3391`) + +Deprecations +~~~~~~~~~~~~ + +Deprecated in 0.13.0 + +- deprecated ``iterkv``, which will be removed in a future release (this was + an alias of iteritems used to bypass ``2to3``'s changes). + (:issue:`4384`, :issue:`4375`, :issue:`4372`) +- deprecated the string method ``match``, whose role is now performed more + idiomatically by ``extract``. In a future release, the default behavior + of ``match`` will change to become analogous to ``contains``, which returns + a boolean indexer. (Their + distinction is strictness: ``match`` relies on ``re.match`` while + ``contains`` relies on ``re.search``.) In this release, the deprecated + behavior is the default, but the new behavior is available through the + keyword argument ``as_indexer=True``. + +Indexing API Changes +~~~~~~~~~~~~~~~~~~~~ + +Prior to 0.13, it was impossible to use a label indexer (``.loc/.ix``) to set a value that +was not contained in the index of a particular axis. (:issue:`2578`). See :ref:`the docs` + +In the ``Series`` case this is effectively an appending operation + +.. ipython:: python + + s = Series([1,2,3]) + s + s[5] = 5. + s + +.. ipython:: python + + dfi = DataFrame(np.arange(6).reshape(3,2), + columns=['A','B']) + dfi + +This would previously ``KeyError`` + +.. 
ipython:: python + + dfi.loc[:,'C'] = dfi.loc[:,'A'] + dfi + +This is like an ``append`` operation. + +.. ipython:: python + + dfi.loc[3] = 5 + dfi + +A Panel setting operation on an arbitrary axis aligns the input to the Panel + +.. ipython:: python + + p = pd.Panel(np.arange(16).reshape(2,4,2), + items=['Item1','Item2'], + major_axis=pd.date_range('2001/1/12',periods=4), + minor_axis=['A','B'],dtype='float64') + p + p.loc[:,:,'C'] = Series([30,32],index=p.items) + p + p.loc[:,:,'C'] + +Float64Index API Change +~~~~~~~~~~~~~~~~~~~~~~~ + +- Added a new index type, ``Float64Index``. This will be automatically created when passing floating values in index creation. + This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the + same. See :ref:`the docs`, (:issue:`263`) + + Construction is by default for floating type values. + + .. ipython:: python + + index = Index([1.5, 2, 3, 4.5, 5]) + index + s = Series(range(5),index=index) + s + + Scalar selection for ``[],.ix,.loc`` will always be label based. An integer will match an equal float index (e.g. ``3`` is equivalent to ``3.0``) + + .. ipython:: python + + s[3] + s.ix[3] + s.loc[3] + + The only positional indexing is via ``iloc`` + + .. ipython:: python + + s.iloc[3] + + A scalar index that is not found will raise ``KeyError`` + + Slicing is ALWAYS on the values of the index, for ``[],ix,loc`` and ALWAYS positional with ``iloc`` + + .. ipython:: python + + s[2:4] + s.ix[2:4] + s.loc[2:4] + s.iloc[2:4] + + In float indexes, slicing using floats are allowed + + .. ipython:: python + + s[2.1:4.6] + s.loc[2.1:4.6] + +- Indexing on other index types are preserved (and positional fallback for ``[],ix``), with the exception, that floating point slicing + on indexes on non ``Float64Index`` will now raise a ``TypeError``. + + .. code-block:: python + + In [1]: Series(range(5))[3.5] + TypeError: the label [3.5] is not a proper indexer for this index type (Int64Index) + + In [1]: Series(range(5))[3.5:4.5] + TypeError: the slice start [3.5] is not a proper indexer for this index type (Int64Index) + + Using a scalar float indexer will be deprecated in a future version, but is allowed for now. + + .. code-block:: python + + In [3]: Series(range(5))[3.0] + Out[3]: 3 + +HDFStore API Changes +~~~~~~~~~~~~~~~~~~~~ + +- Query Format Changes. A much more string-like query format is now supported. See :ref:`the docs`. + + .. ipython:: python + + path = 'test.h5' + dfq = DataFrame(randn(10,4), + columns=list('ABCD'), + index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table',data_columns=True) + + Use boolean expressions, with in-line function evaluation. + + .. ipython:: python + + read_hdf(path,'dfq', + where="index>Timestamp('20130104') & columns=['A', 'B']") + + Use an inline column reference + + .. ipython:: python + + read_hdf(path,'dfq', + where="A>0 or C>0") + + .. ipython:: python + :suppress: + + import os + os.remove(path) + +- the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)`` + the same defaults as prior < 0.13.0 remain, e.g. ``put`` implies ``fixed`` format and ``append`` implies + ``table`` format. This default format can be set as an option by setting ``io.hdf.default_format``. + + .. 
ipython:: python + + path = 'test.h5' + df = DataFrame(randn(10,2)) + df.to_hdf(path,'df_table',format='table') + df.to_hdf(path,'df_table2',append=True) + df.to_hdf(path,'df_fixed') + with get_store(path) as store: + print(store) + + .. ipython:: python + :suppress: + + import os + os.remove(path) + +- Significant table writing performance improvements +- handle a passed ``Series`` in table format (:issue:`4330`) +- can now serialize a ``timedelta64[ns]`` dtype in a table (:issue:`3577`), See :ref:`the docs`. +- added an ``is_open`` property to indicate if the underlying file handle is_open; + a closed store will now report 'CLOSED' when viewing the store (rather than raising an error) + (:issue:`4409`) +- a close of a ``HDFStore`` now will close that instance of the ``HDFStore`` + but will only close the actual file if the ref count (by ``PyTables``) w.r.t. all of the open handles + are 0. Essentially you have a local instance of ``HDFStore`` referenced by a variable. Once you + close it, it will report closed. Other references (to the same file) will continue to operate + until they themselves are closed. Performing an action on a closed file will raise + ``ClosedFileError`` + + .. ipython:: python + + path = 'test.h5' + df = DataFrame(randn(10,2)) + store1 = HDFStore(path) + store2 = HDFStore(path) + store1.append('df',df) + store2.append('df2',df) + + store1 + store2 + store1.close() + store2 + store2.close() + store2 + + .. ipython:: python + :suppress: + + import os + os.remove(path) + +- removed the ``_quiet`` attribute, replace by a ``DuplicateWarning`` if retrieving + duplicate rows from a table (:issue:`4367`) +- removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will + be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`) +- allow a passed locations array or mask as a ``where`` condition (:issue:`4467`). + See :ref:`the docs` for an example. +- add the keyword ``dropna=True`` to ``append`` to change whether ALL nan rows are not written + to the store (default is ``True``, ALL nan rows are NOT written), also settable + via the option ``io.hdf.dropna_table`` (:issue:`4625`) +- pass thru store creation arguments; can be used to support in-memory stores + +DataFrame repr Changes +~~~~~~~~~~~~~~~~~~~~~~ + +The HTML and plain text representations of :class:`DataFrame` now show +a truncated view of the table once it exceeds a certain size, rather +than switching to the short info view (:issue:`4886`, :issue:`5550`). +This makes the representation more consistent as small DataFrames get +larger. + +.. image:: _static/df_repr_truncated.png + :alt: Truncated HTML representation of a DataFrame + +To get the info view, call :meth:`DataFrame.info`. If you prefer the +info view as the repr for large DataFrames, you can set this by running +``set_option('display.large_repr', 'info')``. + +Enhancements +~~~~~~~~~~~~ + +- ``df.to_clipboard()`` learned a new ``excel`` keyword that let's you + paste df data directly into excel (enabled by default). (:issue:`5070`). +- ``read_html`` now raises a ``URLError`` instead of catching and raising a + ``ValueError`` (:issue:`4303`, :issue:`4305`) +- Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`) +- Clipboard functionality now works with PySide (:issue:`4282`) +- Added a more informative error message when plot arguments contain + overlapping color and style arguments (:issue:`4402`) +- ``to_dict`` now takes ``records`` as a possible outtype. 
Returns an array + of column-keyed dictionaries. (:issue:`4936`) + +- ``NaN`` handing in get_dummies (:issue:`4446`) with `dummy_na` + + .. ipython:: python + + # previously, nan was erroneously counted as 2 here + # now it is not counted at all + get_dummies([1, 2, np.nan]) + + # unless requested + get_dummies([1, 2, np.nan], dummy_na=True) + + +- ``timedelta64[ns]`` operations. See :ref:`the docs`. + + .. warning:: + + Most of these operations require ``numpy >= 1.7`` + + Using the new top-level ``to_timedelta``, you can convert a scalar or array from the standard + timedelta format (produced by ``to_csv``) into a timedelta type (``np.timedelta64`` in ``nanoseconds``). + + .. ipython:: python + + to_timedelta('1 days 06:05:01.00003') + to_timedelta('15.5us') + to_timedelta(['1 days 06:05:01.00003','15.5us','nan']) + to_timedelta(np.arange(5),unit='s') + to_timedelta(np.arange(5),unit='d') + + A Series of dtype ``timedelta64[ns]`` can now be divided by another + ``timedelta64[ns]`` object, or astyped to yield a ``float64`` dtyped Series. This + is frequency conversion. See :ref:`the docs` for the docs. + + .. ipython:: python + + from datetime import timedelta + td = Series(date_range('20130101',periods=4))-Series(date_range('20121201',periods=4)) + td[2] += np.timedelta64(timedelta(minutes=5,seconds=3)) + td[3] = np.nan + td + + # to days + td / np.timedelta64(1,'D') + td.astype('timedelta64[D]') + + # to seconds + td / np.timedelta64(1,'s') + td.astype('timedelta64[s]') + + Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series + + .. ipython:: python + + td * -1 + td * Series([1,2,3,4]) + + Absolute ``DateOffset`` objects can act equivalently to ``timedeltas`` + + .. ipython:: python + + from pandas import offsets + td + offsets.Minute(5) + offsets.Milli(5) + + Fillna is now supported for timedeltas + + .. ipython:: python + + td.fillna(0) + td.fillna(timedelta(days=1,seconds=5)) + + You can do numeric reduction operations on timedeltas. + + .. ipython:: python + + td.mean() + td.quantile(.1) + +- ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and + ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set + the bandwidth, and to gkde.evaluate() to specify the indices at which it + is evaluated, respectively. See scipy docs. (:issue:`4298`) + +- DataFrame constructor now accepts a numpy masked record array (:issue:`3478`) + +- The new vectorized string method ``extract`` return regular expression + matches more conveniently. + + .. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)') + + Elements that do not match return ``NaN``. Extracting a regular expression + with more than one group returns a DataFrame with one column per group. + + + .. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') + + Elements that do not match return a row of ``NaN``. + Thus, a Series of messy strings can be *converted* into a + like-indexed Series or DataFrame of cleaned-up or more useful strings, + without necessitating ``get()`` to access tuples or ``re.match`` objects. + + Named groups like + + .. ipython:: python + + Series(['a1', 'b2', 'c3']).str.extract( + '(?P[ab])(?P\d)') + + and optional groups can also be used. + + .. 
ipython:: python + + Series(['a1', 'b2', '3']).str.extract( + '(?P[ab])?(?P\d)') + +- ``read_stata`` now accepts Stata 13 format (:issue:`4291`) + +- ``read_fwf`` now infers the column specifications from the first 100 rows of + the file if the data has correctly separated and properly aligned columns + using the delimiter provided to the function (:issue:`4488`). + +- support for nanosecond times as an offset + + .. warning:: + + These operations require ``numpy >= 1.7`` + + Period conversions in the range of seconds and below were reworked and extended + up to nanoseconds. Periods in the nanosecond range are now available. + + .. ipython:: python + + date_range('2013-01-01', periods=5, freq='5N') + + or with frequency as offset + + .. ipython:: python + + date_range('2013-01-01', periods=5, freq=pd.offsets.Nano(5)) + + Timestamps can be modified in the nanosecond range + + .. ipython:: python + + t = Timestamp('20130101 09:01:02') + t + pd.datetools.Nano(123) + +- A new method, ``isin`` for DataFrames, which plays nicely with boolean indexing. The argument to ``isin``, what we're comparing the DataFrame to, can be a DataFrame, Series, dict, or array of values. See :ref:`the docs` for more. + + To get the rows where any of the conditions are met: + + .. ipython:: python + + dfi = DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'f', 'n']}) + dfi + other = DataFrame({'A': [1, 3, 3, 7], 'B': ['e', 'f', 'f', 'e']}) + mask = dfi.isin(other) + mask + dfi[mask.any(1)] + +- ``Series`` now supports a ``to_frame`` method to convert it to a single-column DataFrame (:issue:`5164`) + +- All R datasets listed here http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html can now be loaded into Pandas objects + + .. code-block:: python + + import pandas.rpy.common as com + com.load_data('Titanic') + +- ``tz_localize`` can infer a fall daylight savings transition based on the structure + of the unlocalized data (:issue:`4230`), see :ref:`the docs` + +- ``DatetimeIndex`` is now in the API documentation, see :ref:`the docs` + +- :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table + from semi-structured JSON data. See :ref:`the docs` (:issue:`1067`) + +- Added PySide support for the qtpandas DataFrameModel and DataFrameWidget. + +- Python csv parser now supports usecols (:issue:`4335`) + +- Frequencies gained several new offsets: + + * ``LastWeekOfMonth`` (:issue:`4637`) + * ``FY5253``, and ``FY5253Quarter`` (:issue:`4511`) + + +- DataFrame has a new ``interpolate`` method, similar to Series (:issue:`4434`, :issue:`1892`) + + .. ipython:: python + + df = DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], + 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df.interpolate() + + Additionally, the ``method`` argument to ``interpolate`` has been expanded + to include ``'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'krogh', 'piecewise_polynomial', 'pchip', `polynomial`, 'spline'`` + The new methods require scipy_. Consult the Scipy reference guide_ and documentation_ for more information + about when the various methods are appropriate. See :ref:`the docs`. + + Interpolate now also accepts a ``limit`` keyword argument. + This works similar to ``fillna``'s limit: + + .. ipython:: python + + ser = Series([1, 3, np.nan, np.nan, np.nan, 11]) + ser.interpolate(limit=2) + +- Added ``wide_to_long`` panel data convenience function. See :ref:`the docs`. + + .. 
ipython:: python + + np.random.seed(123) + df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, + "A1980" : {0 : "d", 1 : "e", 2 : "f"}, + "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, + "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, + "X" : dict(zip(range(3), np.random.randn(3))) + }) + df["id"] = df.index + df + wide_to_long(df, ["A", "B"], i="id", j="year") + +.. _scipy: http://www.scipy.org +.. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation +.. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html + +- ``to_csv`` now takes a ``date_format`` keyword argument that specifies how + output datetime objects should be formatted. Datetimes encountered in the + index, columns, and values will all have this formatting applied. (:issue:`4313`) +- ``DataFrame.plot`` will scatter plot x versus y by passing ``kind='scatter'`` (:issue:`2215`) +- Added support for Google Analytics v3 API segment IDs that also supports v2 IDs. (:issue:`5271`) + +.. _whatsnew_0130.experimental: + +Experimental +~~~~~~~~~~~~ + +- The new :func:`~pandas.eval` function implements expression evaluation using + ``numexpr`` behind the scenes. This results in large speedups for + complicated expressions involving large DataFrames/Series. For example, + + .. ipython:: python + + nrows, ncols = 20000, 100 + df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols)) + for _ in range(4)] + + .. ipython:: python + + # eval with NumExpr backend + %timeit pd.eval('df1 + df2 + df3 + df4') + + .. ipython:: python + + # pure Python evaluation + %timeit df1 + df2 + df3 + df4 + + For more details, see the :ref:`the docs` + +- Similar to ``pandas.eval``, :class:`~pandas.DataFrame` has a new + ``DataFrame.eval`` method that evaluates an expression in the context of + the ``DataFrame``. For example, + + .. ipython:: python + :suppress: + + try: + del a + except NameError: + pass + + try: + del b + except NameError: + pass + + .. ipython:: python + + df = DataFrame(randn(10, 2), columns=['a', 'b']) + df.eval('a + b') + +- :meth:`~pandas.DataFrame.query` method has been added that allows + you to select elements of a ``DataFrame`` using a natural query syntax + nearly identical to Python syntax. For example, + + .. ipython:: python + :suppress: + + try: + del a + except NameError: + pass + + try: + del b + except NameError: + pass + + try: + del c + except NameError: + pass + + .. ipython:: python + + n = 20 + df = DataFrame(np.random.randint(n, size=(n, 3)), columns=['a', 'b', 'c']) + df.query('a < b < c') + + selects all the rows of ``df`` where ``a < b < c`` evaluates to ``True``. + For more details see the :ref:`the docs`. + +- ``pd.read_msgpack()`` and ``pd.to_msgpack()`` are now a supported method of serialization + of arbitrary pandas (and python objects) in a lightweight portable binary format. See :ref:`the docs` + + .. warning:: + + Since this is an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release. + + .. ipython:: python + + df = DataFrame(np.random.rand(5,2),columns=list('AB')) + df.to_msgpack('foo.msg') + pd.read_msgpack('foo.msg') + + s = Series(np.random.rand(5),index=date_range('20130101',periods=5)) + pd.to_msgpack('foo.msg', df, s) + pd.read_msgpack('foo.msg') + + You can pass ``iterator=True`` to iterator over the unpacked results + + .. ipython:: python + + for o in pd.read_msgpack('foo.msg',iterator=True): + print o + + .. 
ipython:: python + :suppress: + :okexcept: + + os.remove('foo.msg') + +- ``pandas.io.gbq`` provides a simple way to extract from, and load data into, + Google's BigQuery Data Sets by way of pandas DataFrames. BigQuery is a high + performance SQL-like database service, useful for performing ad-hoc queries + against extremely large datasets. :ref:`See the docs ` + + .. code-block:: python + + from pandas.io import gbq + + # A query to select the average monthly temperatures in the + # in the year 2000 across the USA. The dataset, + # publicata:samples.gsod, is available on all BigQuery accounts, + # and is based on NOAA gsod data. + + query = """SELECT station_number as STATION, + month as MONTH, AVG(mean_temp) as MEAN_TEMP + FROM publicdata:samples.gsod + WHERE YEAR = 2000 + GROUP BY STATION, MONTH + ORDER BY STATION, MONTH ASC""" + + # Fetch the result set for this query + + # Your Google BigQuery Project ID + # To find this, see your dashboard: + # https://code.google.com/apis/console/b/0/?noredirect + projectid = xxxxxxxxx; + + df = gbq.read_gbq(query, project_id = projectid) + + # Use pandas to process and reshape the dataset + + df2 = df.pivot(index='STATION', columns='MONTH', values='MEAN_TEMP') + df3 = pandas.concat([df2.min(), df2.mean(), df2.max()], + axis=1,keys=["Min Tem", "Mean Temp", "Max Temp"]) + + The resulting DataFrame is:: + + > df3 + Min Tem Mean Temp Max Temp + MONTH + 1 -53.336667 39.827892 89.770968 + 2 -49.837500 43.685219 93.437932 + 3 -77.926087 48.708355 96.099998 + 4 -82.892858 55.070087 97.317240 + 5 -92.378261 61.428117 102.042856 + 6 -77.703334 65.858888 102.900000 + 7 -87.821428 68.169663 106.510714 + 8 -89.431999 68.614215 105.500000 + 9 -86.611112 63.436935 107.142856 + 10 -78.209677 56.880838 92.103333 + 11 -50.125000 48.861228 94.996428 + 12 -50.332258 42.286879 94.396774 + + .. warning:: + + To use this module, you will need a BigQuery account. See + for details. + + As of 10/10/13, there is a bug in Google's API preventing result sets + from being larger than 100,000 rows. A patch is scheduled for the week of + 10/14/13. + +.. _whatsnew_0130.refactoring: + +Internal Refactoring +~~~~~~~~~~~~~~~~~~~~ + +In 0.13.0 there is a major refactor primarily to subclass ``Series`` from +``NDFrame``, which is the base class currently for ``DataFrame`` and ``Panel``, +to unify methods and behaviors. Series formerly subclassed directly from +``ndarray``. (:issue:`4080`, :issue:`3862`, :issue:`816`) + +.. warning:: + + There are two potential incompatibilities from < 0.13.0 + + - Using certain numpy functions would previously return a ``Series`` if passed a ``Series`` + as an argument. This seems only to affect ``np.ones_like``, ``np.empty_like``, + ``np.diff`` and ``np.where``. These now return ``ndarrays``. + + .. ipython:: python + + s = Series([1,2,3,4]) + + Numpy Usage + + .. ipython:: python + + np.ones_like(s) + np.diff(s) + np.where(s>1,s,np.nan) + + Pandonic Usage + + .. ipython:: python + + Series(1,index=s.index) + s.diff() + s.where(s>1) + + - Passing a ``Series`` directly to a cython function expecting an ``ndarray`` type will no + long work directly, you must pass ``Series.values``, See :ref:`Enhancing Performance` + + - ``Series(0.5)`` would previously return the scalar ``0.5``, instead this will return a 1-element ``Series`` + + - This change breaks ``rpy2<=2.3.8``. an Issue has been opened against rpy2 and a workaround + is detailed in :issue:`5698`. Thanks @JanSchulz. + +- Pickle compatibility is preserved for pickles created prior to 0.13. 
These must be unpickled with ``pd.read_pickle``, see :ref:`Pickling`. + +- Refactor of series.py/frame.py/panel.py to move common code to generic.py + + - added ``_setup_axes`` to create generic NDFrame structures + - moved methods + + - ``from_axes,_wrap_array,axes,ix,loc,iloc,shape,empty,swapaxes,transpose,pop`` + - ``__iter__,keys,__contains__,__len__,__neg__,__invert__`` + - ``convert_objects,as_blocks,as_matrix,values`` + - ``__getstate__,__setstate__`` (compat remains in frame/panel) + - ``__getattr__,__setattr__`` + - ``_indexed_same,reindex_like,align,where,mask`` + - ``fillna,replace`` (``Series`` replace is now consistent with ``DataFrame``) + - ``filter`` (also added axis argument to selectively filter on a different axis) + - ``reindex,reindex_axis,take`` + - ``truncate`` (moved to become part of ``NDFrame``) + +- These are API changes which make ``Panel`` more consistent with ``DataFrame`` + + - ``swapaxes`` on a ``Panel`` with the same axes specified now returns a copy + - support attribute access for setting + - filter supports the same API as the original ``DataFrame`` filter + +- Reindex called with no arguments will now return a copy of the input object + +- ``TimeSeries`` is now an alias for ``Series``. The property ``is_time_series`` + can be used to distinguish (if desired) + +- Refactor of Sparse objects to use BlockManager + + - Created a new block type in internals, ``SparseBlock``, which can hold multi-dtypes + and is non-consolidatable. ``SparseSeries`` and ``SparseDataFrame`` now inherit + more methods from their hierarchy (Series/DataFrame), and no longer inherit + from ``SparseArray`` (which instead is the object of the ``SparseBlock``) + - Sparse suite now supports integration with non-sparse data. Non-float sparse + data is supportable (partially implemented) + - Operations on sparse structures within DataFrames should preserve sparseness, + merging-type operations will convert to dense (and back to sparse), so might + be somewhat inefficient + - enable setitem on ``SparseSeries`` for boolean/integer/slices + - ``SparsePanels`` implementation is unchanged (e.g. not using BlockManager, needs work) + +- added ``ftypes`` method to Series/DataFrame, similar to ``dtypes``, but indicates + if the underlying is sparse/dense (as well as the dtype) +- All ``NDFrame`` objects can now use ``__finalize__()`` to specify various + values to propagate to new objects from an existing one (e.g. 
``name`` in ``Series`` will + follow more automatically now) +- Internal type checking is now done via a suite of generated classes, allowing ``isinstance(value, klass)`` + without having to directly import the klass, courtesy of @jtratner +- Bug in Series update where the parent frame is not updating its cache based on + changes (:issue:`4080`) or types (:issue:`3217`), fillna (:issue:`3386`) +- Indexing with dtype conversions fixed (:issue:`4463`, :issue:`4204`) +- Refactor ``Series.reindex`` to core/generic.py (:issue:`4604`, :issue:`4618`), allow ``method=`` in reindexing + on a Series to work +- ``Series.copy`` no longer accepts the ``order`` parameter and is now consistent with ``NDFrame`` copy +- Refactor ``rename`` methods to core/generic.py; fixes ``Series.rename`` for (:issue:`4605`), and adds ``rename`` + with the same signature for ``Panel`` +- Refactor ``clip`` methods to core/generic.py (:issue:`4798`) +- Refactor of ``_get_numeric_data/_get_bool_data`` to core/generic.py, allowing Series/Panel functionality +- ``Series`` (for index) / ``Panel`` (for items) now allow attribute access to its elements (:issue:`1903`) + + .. ipython:: python + + s = Series([1,2,3],index=list('abc')) + s.b + s.a = 5 + s + +Bug Fixes +~~~~~~~~~ + +See :ref:`V0.13.0 Bug Fixes` for an extensive list of bugs that have been fixed in 0.13.0. + +See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list of all API changes, Enhancements and Bug Fixes. diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt new file mode 100644 index 00000000..b48f555f --- /dev/null +++ b/doc/source/v0.13.1.txt @@ -0,0 +1,286 @@ +.. _whatsnew_0131: + +v0.13.1 (February 3, 2014) +-------------------------- + +This is a minor release from 0.13.0 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +Highlights include: + +- Added ``infer_datetime_format`` keyword to ``read_csv/to_datetime`` to allow speedups for homogeneously formatted datetimes. +- Will intelligently limit display precision for datetime/timedelta formats. +- Enhanced Panel :meth:`~pandas.Panel.apply` method. +- Suggested tutorials in new :ref:`Tutorials` section. +- Our pandas ecosystem is growing, We now feature related projects in a new :ref:`Pandas Ecosystem` section. +- Much work has been taking place on improving the docs, and a new :ref:`Contributing` section has been added. +- Even though it may only be of interest to devs, we <3 our new CI status page: `ScatterCI `__. + +.. warning:: + + 0.13.1 fixes a bug that was caused by a combination of having numpy < 1.8, and doing + chained assignment on a string-like array. Please review :ref:`the docs`, + chained indexing can have unexpected results and should generally be avoided. + + This would previously segfault: + + .. ipython:: python + + df = DataFrame(dict(A = np.array(['foo','bar','bah','foo','bar']))) + df['A'].iloc[0] = np.nan + df + + The recommended way to do this type of assignment is: + + .. ipython:: python + + df = DataFrame(dict(A = np.array(['foo','bar','bah','foo','bar']))) + df.ix[0,'A'] = np.nan + df + +Output Formatting Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- df.info() view now display dtype info per column (:issue:`5682`) + +- df.info() now honors the option ``max_info_rows``, to disable null counts for large frames (:issue:`5974`) + + .. 
ipython:: python + + max_info_rows = pd.get_option('max_info_rows') + + df = DataFrame(dict(A = np.random.randn(10), + B = np.random.randn(10), + C = date_range('20130101',periods=10))) + df.iloc[3:6,[0,2]] = np.nan + + .. ipython:: python + + # set to not display the null counts + pd.set_option('max_info_rows',0) + df.info() + + .. ipython:: python + + # this is the default (same as in 0.13.0) + pd.set_option('max_info_rows',max_info_rows) + df.info() + +- Add ``show_dimensions`` display option for the new DataFrame repr to control whether the dimensions print. + + .. ipython:: python + + df = DataFrame([[1, 2], [3, 4]]) + pd.set_option('show_dimensions', False) + df + + pd.set_option('show_dimensions', True) + df + +- The ``ArrayFormatter`` for ``datetime`` and ``timedelta64`` now intelligently + limits precision based on the values in the array (:issue:`3401`) + + Previously output might look like: + + .. code-block:: python + + age today diff + 0 2001-01-01 00:00:00 2013-04-19 00:00:00 4491 days, 00:00:00 + 1 2004-06-01 00:00:00 2013-04-19 00:00:00 3244 days, 00:00:00 + + Now the output looks like: + + .. ipython:: python + + df = DataFrame([ Timestamp('20010101'), + Timestamp('20040601') ], columns=['age']) + df['today'] = Timestamp('20130419') + df['diff'] = df['today']-df['age'] + df + +API changes +~~~~~~~~~~~ + +- Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`). + See :ref:`NA Values `. + +- Added ``Series.str.get_dummies`` vectorized string method (:issue:`6021`), to extract + dummy/indicator variables for separated string columns: + + .. ipython:: python + + s = Series(['a', 'a|b', np.nan, 'a|c']) + s.str.get_dummies(sep='|') + +- Added the ``NDFrame.equals()`` method to compare if two NDFrames are + equal, i.e. have equal axes, dtypes, and values. Added the + ``array_equivalent`` function to compare if two ndarrays are + equal. NaNs in identical locations are treated as + equal. (:issue:`5283`) See also :ref:`the docs` for a motivating example. + + .. ipython:: python + + df = DataFrame({'col':['foo', 0, np.nan]}).sort() + df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) + df.equals(df) + + import pandas.core.common as com + com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) + np.array_equal(np.array([0, np.nan]), np.array([0, np.nan])) + +- ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a + ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is + empty (:issue:`6007`). + + Previously, calling ``DataFrame.apply`` on an empty ``DataFrame`` would return + either a ``DataFrame`` if there were no columns, or the function being + applied would be called with an empty ``Series`` to guess whether a + ``Series`` or ``DataFrame`` should be returned: + + .. ipython:: python + + def applied_func(col): + print("Apply function being called with: ", col) + return col.sum() + + empty = DataFrame(columns=['a', 'b']) + empty.apply(applied_func) + + Now, when ``apply`` is called on an empty ``DataFrame``: if the ``reduce`` + argument is ``True`` a ``Series`` will be returned, if it is ``False`` a + ``DataFrame`` will be returned, and if it is ``None`` (the default) the + function being applied will be called with an empty ``Series`` to try to guess + the return type. + + .. 
ipython:: python + + empty.apply(applied_func, reduce=True) + empty.apply(applied_func, reduce=False) + +Prior Version Deprecations/Changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are no announced changes in 0.13 or prior that are taking effect as of 0.13.1 + +Deprecations +~~~~~~~~~~~~ + +There are no deprecations of prior behavior in 0.13.1 + +Enhancements +~~~~~~~~~~~~ + +- ``pd.read_csv`` and ``pd.to_datetime`` learned a new ``infer_datetime_format`` keyword which greatly + improves parsing perf in many cases. Thanks to @lexual for suggesting and @danbirken + for rapidly implementing. (:issue:`5490`, :issue:`6021`) + + If ``parse_dates`` is enabled and this flag is set, pandas will attempt to + infer the format of the datetime strings in the columns, and if it can + be inferred, switch to a faster method of parsing them. In some cases + this can increase the parsing speed by ~5-10x. + + .. code-block:: python + + # Try to infer the format for the index column + df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, + infer_datetime_format=True) + +- ``date_format`` and ``datetime_format`` keywords can now be specified when writing to ``excel`` + files (:issue:`4133`) + +- ``MultiIndex.from_product`` convenience function for creating a MultiIndex from + the cartesian product of a set of iterables (:issue:`6055`): + + .. ipython:: python + + shades = ['light', 'dark'] + colors = ['red', 'green', 'blue'] + + MultiIndex.from_product([shades, colors], names=['shade', 'color']) + +- Panel :meth:`~pandas.Panel.apply` will work on non-ufuncs. See :ref:`the docs`. + + .. ipython:: python + + import pandas.util.testing as tm + panel = tm.makePanel(5) + panel + panel['ItemA'] + + Specifying an ``apply`` that operates on a Series (to return a single element) + + .. ipython:: python + + panel.apply(lambda x: x.dtype, axis='items') + + A similar reduction type operation + + .. ipython:: python + + panel.apply(lambda x: x.sum(), axis='major_axis') + + This is equivalent to + + .. ipython:: python + + panel.sum('major_axis') + + A transformation operation that returns a Panel, but is computing + the z-score across the major_axis + + .. ipython:: python + + result = panel.apply( + lambda x: (x-x.mean())/x.std(), + axis='major_axis') + result + result['ItemA'] + +- Panel :meth:`~pandas.Panel.apply` operating on cross-sectional slabs. (:issue:`1148`) + + .. ipython:: python + + f = lambda x: ((x.T-x.mean(1))/x.std(1)).T + + result = panel.apply(f, axis = ['items','major_axis']) + result + result.loc[:,:,'ItemA'] + + This is equivalent to the following + + .. ipython:: python + + result = Panel(dict([ (ax,f(panel.loc[:,:,ax])) + for ax in panel.minor_axis ])) + result + result.loc[:,:,'ItemA'] + +Performance +~~~~~~~~~~~ + +Performance improvements for 0.13.1 + +- Series datetime/timedelta binary operations (:issue:`5801`) +- DataFrame ``count/dropna`` for ``axis=1`` +- Series.str.contains now has a `regex=False` keyword which can be faster for plain (non-regex) string patterns. (:issue:`5879`) +- Series.str.extract (:issue:`5944`) +- ``dtypes/ftypes`` methods (:issue:`5968`) +- indexing with object dtypes (:issue:`5968`) +- ``DataFrame.apply`` (:issue:`6013`) +- Regression in JSON IO (:issue:`5765`) +- Index construction from Series (:issue:`6150`) + +Experimental +~~~~~~~~~~~~ + +There are no experimental changes in 0.13.1 + +Bug Fixes +~~~~~~~~~ + +See :ref:`V0.13.1 Bug Fixes` for an extensive list of bugs that have been fixed in 0.13.1. 
+ +See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list of all API changes, Enhancements and Bug Fixes. diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt new file mode 100644 index 00000000..96ab3d1e --- /dev/null +++ b/doc/source/v0.14.0.txt @@ -0,0 +1,1042 @@ +.. _whatsnew_0140: + +v0.14.0 (May 31 , 2014) +----------------------- + +This is a major release from 0.13.1 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +- Highlights include: + + - Officially support Python 3.4 + - SQL interfaces updated to use ``sqlalchemy``, See :ref:`Here`. + - Display interface changes, See :ref:`Here` + - MultiIndexing Using Slicers, See :ref:`Here`. + - Ability to join a singly-indexed DataFrame with a multi-indexed DataFrame, see :ref:`Here ` + - More consistency in groupby results and more flexible groupby specifications, See :ref:`Here` + - Holiday calendars are now supported in ``CustomBusinessDay``, see :ref:`Here ` + - Several improvements in plotting functions, including: hexbin, area and pie plots, see :ref:`Here`. + - Performance doc section on I/O operations, See :ref:`Here ` + +- :ref:`Other Enhancements ` + +- :ref:`API Changes ` + +- :ref:`Text Parsing API Changes ` + +- :ref:`Groupby API Changes ` + +- :ref:`Performance Improvements ` + +- :ref:`Prior Deprecations ` + +- :ref:`Deprecations ` + +- :ref:`Known Issues ` + +- :ref:`Bug Fixes ` + +.. warning:: + + In 0.14.0 all ``NDFrame`` based containers have undergone significant internal refactoring. Before that each block of + homogeneous data had its own labels and extra care was necessary to keep those in sync with the parent container's labels. + This should not have any visible user/API behavior changes (:issue:`6745`) + +.. _whatsnew_0140.api: + +API changes +~~~~~~~~~~~ + +- ``read_excel`` uses 0 as the default sheet (:issue:`6573`) +- ``iloc`` will now accept out-of-bounds indexers for slices, e.g. a value that exceeds the length of the object being + indexed. These will be excluded. This will make pandas conform more with python/numpy indexing of out-of-bounds + values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise + ``IndexError`` (:issue:`6296`, :issue:`6299`). This could result in an empty axis (e.g. an empty DataFrame being returned) + + .. ipython:: python + + dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + dfl + dfl.iloc[:,2:3] + dfl.iloc[:,1:3] + dfl.iloc[4:6] + + These are out-of-bounds selections + + .. code-block:: python + + dfl.iloc[[4,5,6]] + IndexError: positional indexers are out-of-bounds + + dfl.iloc[:,4] + IndexError: single positional indexer is out-of-bounds + +- Slicing with negative start, stop & step values handles corner cases better (:issue:`6531`): + + - ``df.iloc[:-len(df)]`` is now empty + - ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse + +- The :meth:`DataFrame.interpolate` keyword ``downcast`` default has been changed from ``infer`` to + ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). +- When converting a dataframe to HTML it used to return `Empty DataFrame`. This special case has + been removed, instead a header with the column names is returned (:issue:`6062`). +- ``Series`` and ``Index`` now internall share more common operations, e.g. 
``factorize(),nunique(),value_counts()`` are + now supported on ``Index`` types as well. The ``Series.weekday`` property from is removed + from Series for API consistency. Using a ``DatetimeIndex/PeriodIndex`` method on a Series will now raise a ``TypeError``. + (:issue:`4551`, :issue:`4056`, :issue:`5519`, :issue:`6380`, :issue:`7206`). + +- Add ``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end`` accessors for ``DateTimeIndex`` / ``Timestamp`` which return a boolean array of whether the timestamp(s) are at the start/end of the month/quarter/year defined by the frequency of the ``DateTimeIndex`` / ``Timestamp`` (:issue:`4565`, :issue:`6998`) + +- Local variable usage has changed in + :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` + (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have + changed + + - Column names are now given precedence over locals + - Local variables must be referred to explicitly. This means that even if + you have a local variable that is *not* a column you must still refer to + it with the ``'@'`` prefix. + - You can have an expression like ``df.query('@a < a')`` with no complaints + from ``pandas`` about ambiguity of the name ``a``. + - The top-level :func:`pandas.eval` function does not allow you use the + ``'@'`` prefix and provides you with an error message telling you so. + - ``NameResolutionError`` was removed because it isn't necessary anymore. + +- Define and document the order of column vs index names in query/eval (:issue:`6676`) +- ``concat`` will now concatenate mixed Series and DataFrames using the Series name + or numbering columns as needed (:issue:`2385`). See :ref:`the docs ` +- Slicing and advanced/boolean indexing operations on ``Index`` classes as well + as :meth:`Index.delete` and :meth:`Index.drop` methods will no longer change the type of the + resulting index (:issue:`6440`, :issue:`7040`) + + .. ipython:: python + + i = pd.Index([1, 2, 3, 'a' , 'b', 'c']) + i[[0,1,2]] + i.drop(['a', 'b', 'c']) + + Previously, the above operation would return ``Int64Index``. If you'd like + to do this manually, use :meth:`Index.astype` + + .. ipython:: python + + i[[0,1,2]].astype(np.int_) + +- ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example, + the old behavior returned an Index in this case (:issue:`6459`): + + .. ipython:: python + :suppress: + + np.random.seed(1234) + from itertools import product + tuples = list(product(('a', 'b'), ('c', 'd'))) + mi = MultiIndex.from_tuples(tuples) + df_multi = DataFrame(np.random.randn(4, 2), index=mi) + tuple_ind = pd.Index(tuples,tupleize_cols=False) + df_multi.index + + .. ipython:: python + + # Old behavior, casted MultiIndex to an Index + tuple_ind + df_multi.set_index(tuple_ind) + + # New behavior + mi + df_multi.set_index(mi) + + This also applies when passing multiple indices to ``set_index``: + + .. ipython:: python + + @suppress + df_multi.index = tuple_ind + + # Old output, 2-level MultiIndex of tuples + df_multi.set_index([df_multi.index, df_multi.index]) + + @suppress + df_multi.index = mi + + # New output, 4-level MultiIndex + df_multi.set_index([df_multi.index, df_multi.index]) + +- ``pairwise`` keyword was added to the statistical moment functions + ``rolling_cov``, ``rolling_corr``, ``ewmcov``, ``ewmcorr``, + ``expanding_cov``, ``expanding_corr`` to allow the calculation of moving + window covariance and correlation matrices (:issue:`4950`). 
See + :ref:`Computing rolling pairwise covariances and correlations + ` in the docs. + + .. ipython:: python + + df = DataFrame(np.random.randn(10,4),columns=list('ABCD')) + covs = rolling_cov(df[['A','B','C']], df[['B','C','D']], 5, pairwise=True) + covs[df.index[-1]] + +- ``Series.iteritems()`` is now lazy (returns an iterator rather than a list). This was the documented behavior prior to 0.14. (:issue:`6760`) + +- Added ``nunique`` and ``value_counts`` functions to ``Index`` for counting unique elements. (:issue:`6734`) +- ``stack`` and ``unstack`` now raise a ``ValueError`` when the ``level`` keyword refers + to a non-unique item in the ``Index`` (previously raised a ``KeyError``). (:issue:`6738`) +- drop unused order argument from ``Series.sort``; args are now in the same order as ``Series.order``; + add ``na_position`` arg to conform to ``Series.order`` (:issue:`6847`) +- default sorting algorithm for ``Series.order`` is now ``quicksort``, to conform with ``Series.sort`` + (and numpy defaults) +- add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`) +- ``DataFrame.sort`` now places NaNs at the beginning or end of the sort according to the ``na_position`` parameter. (:issue:`3917`) +- accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`); this was a regression + from 0.13.1 +- Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`) +- ``describe`` on a DataFrame with a mix of Timestamp and string-like objects returns a different Index (:issue:`7088`). + Previously the index was unintentionally sorted. +- Arithmetic operations with **only** ``bool`` dtypes now give a warning indicating + that they are evaluated in Python space for ``+``, ``-``, + and ``*`` operations and raise for all others (:issue:`7011`, :issue:`6762`, + :issue:`7015`, :issue:`7210`) + + .. 
code-block:: python + + x = pd.Series(np.random.rand(10) > 0.5) + y = True + x + y # warning generated: should do x | y instead + x / y # this raises because it doesn't make sense + + NotImplementedError: operator '/' not implemented for bool dtypes + +- In ``HDFStore``, ``select_as_multiple`` will always raise a ``KeyError``, when a key or the selector is not found (:issue:`6177`) +- ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent; + previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`) +- ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`) +- ``df.to_csv`` will now return a string of the CSV data if neither a target path nor a buffer is provided + (:issue:`6061`) +- ``pd.infer_freq()`` will now raise a ``TypeError`` if given an invalid ``Series/Index`` + type (:issue:`6407`, :issue:`6463`) +- A tuple passed to ``DataFame.sort_index`` will be interpreted as the levels of + the index, rather than requiring a list of tuple (:issue:`4370`) +- all offset operations now return ``Timestamp`` types (rather than datetime), Business/Week frequencies were incorrect (:issue:`4069`) +- ``to_excel`` now converts ``np.inf`` into a string representation, + customizable by the ``inf_rep`` keyword argument (Excel has no native inf + representation) (:issue:`6782`) +- Replace ``pandas.compat.scipy.scoreatpercentile`` with ``numpy.percentile`` (:issue:`6810`) +- ``.quantile`` on a ``datetime[ns]`` series now returns ``Timestamp`` instead + of ``np.datetime64`` objects (:issue:`6810`) +- change ``AssertionError`` to ``TypeError`` for invalid types passed to ``concat`` (:issue:`6583`) +- Raise a ``TypeError`` when ``DataFrame`` is passed an iterator as the + ``data`` argument (:issue:`5357`) + + +.. _whatsnew_0140.display: + +Display Changes +~~~~~~~~~~~~~~~ + +- The default way of printing large DataFrames has changed. DataFrames + exceeding ``max_rows`` and/or ``max_columns`` are now displayed in a + centrally truncated view, consistent with the printing of a + :class:`pandas.Series` (:issue:`5603`). + + In previous versions, a DataFrame was truncated once the dimension + constraints were reached and an ellipse (...) signaled that part of + the data was cut off. + + .. image:: _static/trunc_before.png + :alt: The previous look of truncate. + + In the current version, large DataFrames are centrally truncated, + showing a preview of head and tail in both dimensions. + + .. image:: _static/trunc_after.png + :alt: The new look. + +- allow option ``'truncate'`` for ``display.show_dimensions`` to only show the dimensions if the + frame is truncated (:issue:`6547`). + + The default for ``display.show_dimensions`` will now be ``truncate``. This is consistent with + how Series display length. + + .. 
ipython:: python + + dfd = pd.DataFrame(np.arange(25).reshape(-1,5), index=[0,1,2,3,4], columns=[0,1,2,3,4]) + + # show dimensions since this is truncated + with pd.option_context('display.max_rows', 2, 'display.max_columns', 2, + 'display.show_dimensions', 'truncate'): + print(dfd) + + # will not show dimensions since it is not truncated + with pd.option_context('display.max_rows', 10, 'display.max_columns', 40, + 'display.show_dimensions', 'truncate'): + print(dfd) + +- Regression in the display of a MultiIndexed Series with ``display.max_rows`` is less than the + length of the series (:issue:`7101`) +- Fixed a bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the + `large_repr` set to 'info' (:issue:`7105`) +- The `verbose` keyword in ``DataFrame.info()``, which controls whether to shorten the ``info`` + representation, is now ``None`` by default. This will follow the global setting in + ``display.max_info_columns``. The global setting can be overriden with ``verbose=True`` or + ``verbose=False``. +- Fixed a bug with the `info` repr not honoring the `display.max_info_columns` setting (:issue:`6939`) +- Offset/freq info now in Timestamp __repr__ (:issue:`4553`) + +.. _whatsnew_0140.parsing: + +Text Parsing API Changes +~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`read_csv`/:func:`read_table` will now be noiser w.r.t invalid options rather than falling back to the ``PythonParser``. + +- Raise ``ValueError`` when ``sep`` specified with + ``delim_whitespace=True`` in :func:`read_csv`/:func:`read_table` + (:issue:`6607`) +- Raise ``ValueError`` when ``engine='c'`` specified with unsupported + options in :func:`read_csv`/:func:`read_table` (:issue:`6607`) +- Raise ``ValueError`` when fallback to python parser causes options to be + ignored (:issue:`6607`) +- Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python + parser when no options are ignored (:issue:`6607`) +- Translate ``sep='\s+'`` to ``delim_whitespace=True`` in + :func:`read_csv`/:func:`read_table` if no other C-unsupported options + specified (:issue:`6607`) + +.. _whatsnew_0140.groupby: + +Groupby API Changes +~~~~~~~~~~~~~~~~~~~ + +More consistent behaviour for some groupby methods: + +- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: + + .. ipython:: python + + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.head(1) # filters DataFrame + + g.apply(lambda x: x.head(1)) # used to simply fall-through + +- groupby head and tail respect column selection: + + .. ipython:: python + + g[['B']].head(1) + +- groupby ``nth`` now reduces by default; filtering can be achieved by passing ``as_index=False``. With an optional ``dropna`` argument to ignore + NaN. See :ref:`the docs `. + + Reducing + + .. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) + + # this is equivalent to g.first() + g.nth(0, dropna='any') + + # this is equivalent to g.last() + g.nth(-1, dropna='any') + + Filtering + + .. ipython:: python + + gf = df.groupby('A',as_index=False) + gf.nth(0) + gf.nth(0, dropna='any') + +- groupby will now not return the grouped column for non-cython functions (:issue:`5610`, :issue:`5614`, :issue:`6732`), + as its already the index + + .. 
ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) + g = df.groupby('A') + g.count() + g.describe() + +- passing ``as_index`` will leave the grouped column in-place (this is not change in 0.14.0) + + .. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6], [5, 8]], columns=['A', 'B']) + g = df.groupby('A',as_index=False) + g.count() + g.describe() + +- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping + by a Time and a string field simultaneously. See :ref:`the docs `. (:issue:`3794`) + +- Better propagation/preservation of Series names when performing groupby + operations: + + - ``SeriesGroupBy.agg`` will ensure that the name attribute of the original + series is propagated to the result (:issue:`6265`). + - If the function provided to ``GroupBy.apply`` returns a named series, the + name of the series will be kept as the name of the column index of the + DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates + ``DataFrame.stack`` operations where the name of the column index is used as + the name of the inserted column containing the pivoted data. + + +.. _whatsnew_0140.sql: + +SQL +~~~ + +The SQL reading and writing functions now support more database flavors +through SQLAlchemy (:issue:`2717`, :issue:`4163`, :issue:`5950`, :issue:`6292`). +All databases supported by SQLAlchemy can be used, such +as PostgreSQL, MySQL, Oracle, Microsoft SQL server (see documentation of +SQLAlchemy on `included dialects +`_). + +The functionality of providing DBAPI connection objects will only be supported +for sqlite3 in the future. The ``'mysql'`` flavor is deprecated. + +The new functions :func:`~pandas.read_sql_query` and :func:`~pandas.read_sql_table` +are introduced. The function :func:`~pandas.read_sql` is kept as a convenience +wrapper around the other two and will delegate to specific function depending on +the provided input (database table name or sql query). + +In practice, you have to provide a SQLAlchemy ``engine`` to the sql functions. +To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine +object from database URI. You only need to create the engine once per database you are +connecting to. For an in-memory sqlite database: + +.. ipython:: python + + from sqlalchemy import create_engine + # Create your connection. + engine = create_engine('sqlite:///:memory:') + +This ``engine`` can then be used to write or read data to/from this database: + +.. ipython:: python + + df = pd.DataFrame({'A': [1,2,3], 'B': ['a', 'b', 'c']}) + df.to_sql('db_table', engine, index=False) + +You can read data from a database by specifying the table name: + +.. ipython:: python + + pd.read_sql_table('db_table', engine) + +or by specifying a sql query: + +.. ipython:: python + + pd.read_sql_query('SELECT * FROM db_table', engine) + +Some other enhancements to the sql functions include: + +- support for writing the index. This can be controlled with the ``index`` + keyword (default is True). +- specify the column label to use when writing the index with ``index_label``. +- specify string columns to parse as datetimes withh the ``parse_dates`` + keyword in :func:`~pandas.read_sql_query` and :func:`~pandas.read_sql_table`. + +.. warning:: + + Some of the existing functions or function aliases have been deprecated + and will be removed in future versions. This includes: ``tquery``, ``uquery``, + ``read_frame``, ``frame_query``, ``write_frame``. + +.. 
warning:: + + The support for the 'mysql' flavor when using DBAPI connection objects has been deprecated. + MySQL will be further supported with SQLAlchemy engines (:issue:`6900`). + + +.. _whatsnew_0140.slicers: + +MultiIndexing Using Slicers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In 0.14.0 we added a new way to slice multi-indexed objects. +You can slice a multi-index by providing multiple indexers. + +You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label `, +including slices, lists of labels, labels, and boolean indexers. + +You can use ``slice(None)`` to select all the contents of *that* level. You do not need to specify all the +*deeper* levels, they will be implied as ``slice(None)``. + +As usual, **both sides** of the slicers are included as this is label indexing. + +See :ref:`the docs` +See also issues (:issue:`6134`, :issue:`4036`, :issue:`3057`, :issue:`2598`, :issue:`5641`, :issue:`7106`) + +.. warning:: + + You should specify all axes in the ``.loc`` specifier, meaning the indexer for the **index** and + for the **columns**. Their are some ambiguous cases where the passed indexer could be mis-interpreted + as indexing *both* axes, rather than into say the MuliIndex for the rows. + + You should do this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....),:] + + rather than this: + + .. code-block:: python + + df.loc[(slice('A1','A3'),.....)] + +.. warning:: + + You will need to make sure that the selection axes are fully lexsorted! + +.. ipython:: python + + def mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + + index = MultiIndex.from_product([mklbl('A',4), + mklbl('B',2), + mklbl('C',4), + mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns)).reshape((len(index),len(columns))), + index=index, + columns=columns).sortlevel().sortlevel(axis=1) + df + +Basic multi-index slicing using slices, lists, and labels. + +.. ipython:: python + + df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + +You can use a ``pd.IndexSlice`` to shortcut the creation of these slices + +.. ipython:: python + + idx = pd.IndexSlice + df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +It is possible to perform quite complicated selections using this method on multiple +axes at the same time. + +.. ipython:: python + + df.loc['A1',(slice(None),'foo')] + df.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + +Using a boolean indexer you can provide selection related to the *values*. + +.. ipython:: python + + mask = df[('a','foo')]>200 + df.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + +You can also specify the ``axis`` argument to ``.loc`` to interpret the passed +slicers on a single axis. + +.. ipython:: python + + df.loc(axis=0)[:,:,['C1','C3']] + +Furthermore you can *set* the values using these methods + +.. ipython:: python + + df2 = df.copy() + df2.loc(axis=0)[:,:,['C1','C3']] = -10 + df2 + +You can use a right-hand-side of an alignable object as well. + +.. ipython:: python + + df2 = df.copy() + df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 + df2 + +.. _whatsnew_0140.plotting: + +Plotting +~~~~~~~~ + +- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`), See :ref:`the docs`. 
+- ``DataFrame.plot`` and ``Series.plot`` now supports area plot with specifying ``kind='area'`` (:issue:`6656`), See :ref:`the docs` +- Pie plots from ``Series.plot`` and ``DataFrame.plot`` with ``kind='pie'`` (:issue:`6976`), See :ref:`the docs`. +- Plotting with Error Bars is now supported in the ``.plot`` method of ``DataFrame`` and ``Series`` objects (:issue:`3796`, :issue:`6834`), See :ref:`the docs`. +- ``DataFrame.plot`` and ``Series.plot`` now support a ``table`` keyword for plotting ``matplotlib.Table``, See :ref:`the docs`. The ``table`` keyword can receive the following values. + + - ``False``: Do nothing (default). + - ``True``: Draw a table using the ``DataFrame`` or ``Series`` called ``plot`` method. Data will be transposed to meet matplotlib's default layout. + - ``DataFrame`` or ``Series``: Draw matplotlib.table using the passed data. The data will be drawn as displayed in print method (not transposed automatically). + Also, helper function ``pandas.tools.plotting.table`` is added to create a table from ``DataFrame`` and ``Series``, and add it to an ``matplotlib.Axes``. + +- ``plot(legend='reverse')`` will now reverse the order of legend labels for + most plot kinds. (:issue:`6014`) +- Line plot and area plot can be stacked by ``stacked=True`` (:issue:`6656`) + +- Following keywords are now acceptable for :meth:`DataFrame.plot` with ``kind='bar'`` and ``kind='barh'``: + + - `width`: Specify the bar width. In previous versions, static value 0.5 was passed to matplotlib and it cannot be overwritten. (:issue:`6604`) + - `align`: Specify the bar alignment. Default is `center` (different from matplotlib). In previous versions, pandas passes `align='edge'` to matplotlib and adjust the location to `center` by itself, and it results `align` keyword is not applied as expected. (:issue:`4525`) + - `position`: Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1(right/top-end). Default is 0.5 (center). (:issue:`6604`) + + Because of the default `align` value changes, coordinates of bar plots are now located on integer values (0.0, 1.0, 2.0 ...). This is intended to make bar plot be located on the same coodinates as line plot. However, bar plot may differs unexpectedly when you manually adjust the bar location or drawing area, such as using `set_xlim`, `set_ylim`, etc. In this cases, please modify your script to meet with new coordinates. + +- The :func:`parallel_coordinates` function now takes argument ``color`` + instead of ``colors``. A ``FutureWarning`` is raised to alert that + the old ``colors`` argument will not be supported in a future release. (:issue:`6956`) + +- The :func:`parallel_coordinates` and :func:`andrews_curves` functions now take + positional argument ``frame`` instead of ``data``. A ``FutureWarning`` is + raised if the old ``data`` argument is used by name. (:issue:`6956`) + +- :meth:`DataFrame.boxplot` now supports ``layout`` keyword (:issue:`6769`) +- :meth:`DataFrame.boxplot` has a new keyword argument, `return_type`. It accepts ``'dict'``, + ``'axes'``, or ``'both'``, in which case a namedtuple with the matplotlib + axes and a dict of matplotlib Lines is returned. + + +.. _whatsnew_0140.prior_deprecations: + +Prior Version Deprecations/Changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are prior version deprecations that are taking effect as of 0.14.0. 
+ +- Remove :class:`DateRange` in favor of :class:`DatetimeIndex` (:issue:`6816`) +- Remove ``column`` keyword from ``DataFrame.sort`` (:issue:`4370`) +- Remove ``precision`` keyword from :func:`set_eng_float_format` (:issue:`395`) +- Remove ``force_unicode`` keyword from :meth:`DataFrame.to_string`, + :meth:`DataFrame.to_latex`, and :meth:`DataFrame.to_html`; these function + encode in unicode by default (:issue:`2224`, :issue:`2225`) +- Remove ``nanRep`` keyword from :meth:`DataFrame.to_csv` and + :meth:`DataFrame.to_string` (:issue:`275`) +- Remove ``unique`` keyword from :meth:`HDFStore.select_column` (:issue:`3256`) +- Remove ``inferTimeRule`` keyword from :func:`Timestamp.offset` (:issue:`391`) +- Remove ``name`` keyword from :func:`get_data_yahoo` and + :func:`get_data_google` ( `commit b921d1a `__ ) +- Remove ``offset`` keyword from :class:`DatetimeIndex` constructor + ( `commit 3136390 `__ ) +- Remove ``time_rule`` from several rolling-moment statistical functions, such + as :func:`rolling_sum` (:issue:`1042`) +- Removed neg ``-`` boolean operations on numpy arrays in favor of inv ``~``, as this is going to + be deprecated in numpy 1.9 (:issue:`6960`) + +.. _whatsnew_0140.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- The :func:`pivot_table`/:meth:`DataFrame.pivot_table` and :func:`crosstab` functions + now take arguments ``index`` and ``columns`` instead of ``rows`` and ``cols``. A + ``FutureWarning`` is raised to alert that the old ``rows`` and ``cols`` arguments + will not be supported in a future release (:issue:`5505`) + +- The :meth:`DataFrame.drop_duplicates` and :meth:`DataFrame.duplicated` methods + now take argument ``subset`` instead of ``cols`` to better align with + :meth:`DataFrame.dropna`. A ``FutureWarning`` is raised to alert that the old + ``cols`` arguments will not be supported in a future release (:issue:`6680`) + +- The :meth:`DataFrame.to_csv` and :meth:`DataFrame.to_excel` functions + now takes argument ``columns`` instead of ``cols``. A + ``FutureWarning`` is raised to alert that the old ``cols`` arguments + will not be supported in a future release (:issue:`6645`) + +- Indexers will warn ``FutureWarning`` when used with a scalar indexer and + a non-floating point Index (:issue:`4892`, :issue:`6960`) + + .. code-block:: python + + # non-floating point indexes can only be indexed by integers / labels + In [1]: Series(1,np.arange(5))[3.0] + pandas/core/index.py:469: FutureWarning: scalar indexers for index type Int64Index should be integers and not floating point + Out[1]: 1 + + In [2]: Series(1,np.arange(5)).iloc[3.0] + pandas/core/index.py:469: FutureWarning: scalar indexers for index type Int64Index should be integers and not floating point + Out[2]: 1 + + In [3]: Series(1,np.arange(5)).iloc[3.0:4] + pandas/core/index.py:527: FutureWarning: slice indexers when using iloc should be integers and not floating point + Out[3]: + 3 1 + dtype: int64 + + # these are Float64Indexes, so integer or floating point is acceptable + In [4]: Series(1,np.arange(5.))[3] + Out[4]: 1 + + In [5]: Series(1,np.arange(5.))[3.0] + Out[6]: 1 + +- Numpy 1.9 compat w.r.t. deprecation warnings (:issue:`6960`) + +- :meth:`Panel.shift` now has a function signature that matches :meth:`DataFrame.shift`. + The old positional argument ``lags`` has been changed to a keyword argument + ``periods`` with a default value of 1. A ``FutureWarning`` is raised if the + old argument ``lags`` is used by name. (:issue:`6910`) +- The ``order`` keyword argument of :func:`factorize` will be removed. 
(:issue:`6926`). + +- Remove the ``copy`` keyword from :meth:`DataFrame.xs`, :meth:`Panel.major_xs`, :meth:`Panel.minor_xs`. A view will be + returned if possible, otherwise a copy will be made. Previously the user could think that ``copy=False`` would + ALWAYS return a view. (:issue:`6894`) + +- The :func:`parallel_coordinates` function now takes argument ``color`` + instead of ``colors``. A ``FutureWarning`` is raised to alert that + the old ``colors`` argument will not be supported in a future release. (:issue:`6956`) + +- The :func:`parallel_coordinates` and :func:`andrews_curves` functions now take + positional argument ``frame`` instead of ``data``. A ``FutureWarning`` is + raised if the old ``data`` argument is used by name. (:issue:`6956`) + +- The support for the 'mysql' flavor when using DBAPI connection objects has been deprecated. + MySQL will be further supported with SQLAlchemy engines (:issue:`6900`). + +- The following ``io.sql`` functions have been deprecated: ``tquery``, ``uquery``, ``read_frame``, ``frame_query``, ``write_frame``. + +- The `percentile_width` keyword argument in :meth:`~DataFrame.describe` has been deprecated. + Use the `percentiles` keyword instead, which takes a list of percentiles to display. The + default output is unchanged. + +- The default return type of :func:`boxplot` will change from a dict to a matpltolib Axes + in a future release. You can use the future behavior now by passing ``return_type='axes'`` + to boxplot. + +.. _whatsnew_0140.knownissues: + +Known Issues +~~~~~~~~~~~~ + +- OpenPyXL 2.0.0 breaks backwards compatibility (:issue:`7169`) + + +.. _whatsnew_0140.enhancements: + +Enhancements +~~~~~~~~~~~~ + +- DataFrame and Series will create a MultiIndex object if passed a tuples dict, See :ref:`the docs` (:issue:`3323`) + + .. ipython:: python + + Series({('a', 'b'): 1, ('a', 'a'): 0, + ('a', 'c'): 2, ('b', 'a'): 3, ('b', 'b'): 4}) + DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, + ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, + ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, + ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, + ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + +- Added the ``sym_diff`` method to ``Index`` (:issue:`5543`) +- ``DataFrame.to_latex`` now takes a longtable keyword, which if True will return a table in a longtable environment. (:issue:`6617`) +- Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`) +- ``pd.read_clipboard`` will, if the keyword ``sep`` is unspecified, try to detect data copied from a spreadsheet + and parse accordingly. (:issue:`6223`) +- Joining a singly-indexed DataFrame with a multi-indexed DataFrame (:issue:`3662`) + + See :ref:`the docs`. Joining multi-index DataFrames on both the left and right is not yet supported ATM. + + .. 
ipython:: python + + household = DataFrame(dict(household_id = [1,2,3], + male = [0,1,0], + wealth = [196087.3,316478.7,294750]), + columns = ['household_id','male','wealth'] + ).set_index('household_id') + household + portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29", + "gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell", + "AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','name','share'] + ).set_index(['household_id','asset_id']) + portfolio + + household.join(portfolio, how='inner') + +- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when + using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) +- Partially sort by only the specified levels of a MultiIndex with the + ``sort_remaining`` boolean kwarg. (:issue:`3984`) +- Added ``to_julian_date`` to ``TimeStamp`` and ``DatetimeIndex``. The Julian + Date is used primarily in astronomy and represents the number of days from + noon, January 1, 4713 BC. Because nanoseconds are used to define the time + in pandas the actual range of dates that you can use is 1678 AD to 2262 AD. (:issue:`4041`) +- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types + and will upcast when needed. When it is not possible to losslessly upcast, a warning + is issued (:issue:`6327`) +- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp + and data_label which allow the time stamp and dataset label to be set when creating a + file. (:issue:`6545`) +- ``pandas.io.gbq`` now handles reading unicode strings properly. (:issue:`5940`) +- :ref:`Holidays Calendars` are now available and can be used with the ``CustomBusinessDay`` offset (:issue:`6719`) +- ``Float64Index`` is now backed by a ``float64`` dtype ndarray instead of an + ``object`` dtype array (:issue:`6471`). +- Implemented ``Panel.pct_change`` (:issue:`6904`) +- Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:`rolling_max` defaults to max, + :func:`rolling_min` defaults to min, and all others default to mean (:issue:`6297`) +- ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`) +- :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of + quantiles. +- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`) +- ``pivot_table`` can now accept ``Grouper`` by ``index`` and ``columns`` keywords (:issue:`6913`) + + .. 
ipython:: python + + import datetime + df = DataFrame({ + 'Branch' : 'A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe'.split(), + 'Quantity': [1, 3, 5, 1, 8, 1], + 'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5), + datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0)], + 'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5), + datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0), + datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0)]}) + df + + pivot_table(df, index=Grouper(freq='M', key='Date'), + columns=Grouper(freq='M', key='PayDay'), + values='Quantity', aggfunc=np.sum) + +- Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`) +- Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`) + +- `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`) + + .. ipython:: python + + prng = period_range('2013-01-01 09:00', periods=100, freq='H') + ps = Series(np.random.randn(len(prng)), index=prng) + ps + ps['2013-01-02'] + +- ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`) +- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`) +- pd.expanding_apply and pd.rolling_apply now take args and kwargs that are passed on to + the func (:issue:`6289`) +- ``DataFrame.rank()`` now has a percentage rank option (:issue:`5971`) +- ``Series.rank()`` now has a percentage rank option (:issue:`5971`) +- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`) +- Support passing ``encoding`` with xlwt (:issue:`3710`) +- Refactor Block classes removing `Block.items` attributes to avoid duplication + in item handling (:issue:`6745`, :issue:`6988`). +- Testing statements updated to use specialized asserts (:issue:`6175`) + + + +.. _whatsnew_0140.performance: + +Performance +~~~~~~~~~~~ + +- Performance improvement when converting ``DatetimeIndex`` to floating ordinals + using ``DatetimeConverter`` (:issue:`6636`) +- Performance improvement for ``DataFrame.shift`` (:issue:`5609`) +- Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) +- Performance improvements in single-dtyped indexing (:issue:`6484`) +- Improve performance of DataFrame construction with certain offsets, by removing faulty caching + (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) +- Improve performance of ``CustomBusinessDay`` (:issue:`6584`) +- improve performance of slice indexing on Series with string keys (:issue:`6341`, :issue:`6372`) +- Performance improvement for ``DataFrame.from_records`` when reading a + specified number of rows from an iterable (:issue:`6700`) +- Performance improvements in timedelta conversions for integer dtypes (:issue:`6754`) +- Improved performance of compatible pickles (:issue:`6899`) +- Improve performance in certain reindexing operations by optimizing ``take_2d`` (:issue:`6749`) +- ``GroupBy.count()`` is now implemented in Cython and is much faster for large + numbers of groups (:issue:`7016`). + +Experimental +~~~~~~~~~~~~ + +There are no experimental changes in 0.14.0 + + +.. 
_whatsnew_0140.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +- Bug in Series ValueError when index doesn't match data (:issue:`6532`) +- Prevent segfault due to MultiIndex not being supported in HDFStore table + format (:issue:`1848`) +- Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`) +- Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`) +- Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`) +- Inconsistent tz parsing ``Timestamp`` / ``to_datetime`` for current year (:issue:`5958`) +- Indexing bugs with reordered indexes (:issue:`6252`, :issue:`6254`) +- Bug in ``.xs`` with a Series multiindex (:issue:`6258`, :issue:`5684`) +- Bug in conversion of a string types to a DatetimeIndex with a specified frequency (:issue:`6273`, :issue:`6274`) +- Bug in ``eval`` where type-promotion failed for large expressions (:issue:`6205`) +- Bug in interpolate with ``inplace=True`` (:issue:`6281`) +- ``HDFStore.remove`` now handles start and stop (:issue:`6177`) +- ``HDFStore.select_as_multiple`` handles start and stop the same way as ``select`` (:issue:`6177`) +- ``HDFStore.select_as_coordinates`` and ``select_column`` works with a ``where`` clause that results in filters (:issue:`6177`) +- Regression in join of non_unique_indexes (:issue:`6329`) +- Issue with groupby ``agg`` with a single function and a a mixed-type frame (:issue:`6337`) +- Bug in ``DataFrame.replace()`` when passing a non- ``bool`` + ``to_replace`` argument (:issue:`6332`) +- Raise when trying to align on different levels of a multi-index assignment (:issue:`3738`) +- Bug in setting complex dtypes via boolean indexing (:issue:`6345`) +- Bug in TimeGrouper/resample when presented with a non-monotonic DatetimeIndex that would return invalid results. (:issue:`4161`) +- Bug in index name propogation in TimeGrouper/resample (:issue:`4161`) +- TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`) +- Bug in multiple grouping with a TimeGrouper depending on target column order (:issue:`6764`) +- Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'`` + (:issue:`6351`) +- Bug correctly handle placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`) +- ``DataFrame.shift`` with ``axis=1`` was raising (:issue:`6371`) +- Disabled clipboard tests until release time (run locally with ``nosetests -A disabled``) (:issue:`6048`). +- Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained + keys not in the values to be replaced (:issue:`6342`) +- ``str.match`` ignored the na flag (:issue:`6609`). +- Bug in take with duplicate columns that were not consolidated (:issue:`6240`) +- Bug in interpolate changing dtypes (:issue:`6290`) +- Bug in ``Series.get``, was using a buggy access method (:issue:`6383`) +- Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`) +- Bug in ``DataFrame.dropna`` with duplicate indices (:issue:`6355`) +- Regression in chained getitem indexing with embedded list-like from 0.12 (:issue:`6394`) +- ``Float64Index`` with nans not comparing correctly (:issue:`6401`) +- ``eval``/``query`` expressions with strings containing the ``@`` character + will now work (:issue:`6366`). 
+- Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`) +- Bug in :meth:`DataFrame.replace` where nested dicts were erroneously + depending on the order of dictionary keys and values (:issue:`5338`). +- Perf issue in concatting with empty objects (:issue:`3259`) +- Clarify sorting of ``sym_diff`` on ``Index`` objects with ``NaN`` values (:issue:`6444`) +- Regression in ``MultiIndex.from_product`` with a ``DatetimeIndex`` as input (:issue:`6439`) +- Bug in ``str.extract`` when passed a non-default index (:issue:`6348`) +- Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`) +- Bug in ``io.data.DataReader`` when passed ``"F-F_Momentum_Factor"`` and ``data_source="famafrench"`` (:issue:`6460`) +- Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`) +- Bug in ``resample`` with a timezone and certain offsets (:issue:`6397`) +- Bug in ``iat/iloc`` with duplicate indices on a Series (:issue:`6493`) +- Bug in ``read_html`` where nan's were incorrectly being used to indicate + missing values in text. Should use the empty string for consistency with the + rest of pandas (:issue:`5129`). +- Bug in ``read_html`` tests where redirected invalid URLs would make one test + fail (:issue:`6445`). +- Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`) +- Bug that caused _ref_locs corruption when slice indexing across columns axis of a DataFrame (:issue:`6525`) +- Regression from 0.13 in the treatment of numpy ``datetime64`` non-ns dtypes in Series creation (:issue:`6529`) +- ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`). +- Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`) +- Bug in setitem with ``.loc`` on mixed integer Indexes (:issue:`6546`) +- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`) +- Bug in ``DataFrame.to_stata`` that lead to data loss in certain cases, and could be exported using the + wrong data types and missing values (:issue:`6335`) +- ``StataWriter`` replaces missing values in string columns by empty string (:issue:`6802`) +- Inconsistent types in ``Timestamp`` addition/subtraction (:issue:`6543`) +- Bug in preserving frequency across Timestamp addition/subtraction (:issue:`4547`) +- Bug in empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`) +- ``Series.quantile`` raising on an ``object`` dtype (:issue:`6555`) +- Bug in ``.xs`` with a ``nan`` in level when dropped (:issue:`6574`) +- Bug in fillna with ``method='bfill/ffill'`` and ``datetime64[ns]`` dtype (:issue:`6587`) +- Bug in sql writing with mixed dtypes possibly leading to data loss (:issue:`6509`) +- Bug in ``Series.pop`` (:issue:`6600`) +- Bug in ``iloc`` indexing when positional indexer matched ``Int64Index`` of the corresponding axis and no reordering happened (:issue:`6612`) +- Bug in ``fillna`` with ``limit`` and ``value`` specified +- Bug in ``DataFrame.to_stata`` when columns have non-string names (:issue:`4558`) +- Bug in compat with ``np.compress``, surfaced in (:issue:`6658`) +- Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`) +- Bug in ``DataFrame.to_stata`` which incorrectly handles nan values and ignores ``with_index`` keyword argument (:issue:`6685`) +- Bug in resample with extra bins when using an evenly divisible frequency (:issue:`4076`) +- Bug in consistency of groupby aggregation when passing a custom 
function (:issue:`6715`) +- Bug in resample when ``how=None`` resample freq is the same as the axis frequency (:issue:`5955`) +- Bug in downcasting inference with empty arrays (:issue:`6733`) +- Bug in ``obj.blocks`` on sparse containers dropping all but the last items of same for dtype (:issue:`6748`) +- Bug in unpickling ``NaT (NaTType)`` (:issue:`4606`) +- Bug in ``DataFrame.replace()`` where regex metacharacters were being treated + as regexs even when ``regex=False`` (:issue:`6777`). +- Bug in timedelta ops on 32-bit platforms (:issue:`6808`) +- Bug in setting a tz-aware index directly via ``.index`` (:issue:`6785`) +- Bug in expressions.py where numexpr would try to evaluate arithmetic ops + (:issue:`6762`). +- Bug in Makefile where it didn't remove Cython generated C files with ``make + clean`` (:issue:`6768`) +- Bug with numpy < 1.7.2 when reading long strings from ``HDFStore`` (:issue:`6166`) +- Bug in ``DataFrame._reduce`` where non bool-like (0/1) integers were being + coverted into bools. (:issue:`6806`) +- Regression from 0.13 with ``fillna`` and a Series on datetime-like (:issue:`6344`) +- Bug in adding ``np.timedelta64`` to ``DatetimeIndex`` with timezone outputs incorrect results (:issue:`6818`) +- Bug in ``DataFrame.replace()`` where changing a dtype through replacement + would only replace the first occurrence of a value (:issue:`6689`) +- Better error message when passing a frequency of 'MS' in ``Period`` construction (GH5332) +- Bug in ``Series.__unicode__`` when ``max_rows=None`` and the Series has more than 1000 rows. (:issue:`6863`) +- Bug in ``groupby.get_group`` where a datetlike wasn't always accepted (:issue:`5267`) +- Bug in ``groupBy.get_group`` created by ``TimeGrouper`` raises ``AttributeError`` (:issue:`6914`) +- Bug in ``DatetimeIndex.tz_localize`` and ``DatetimeIndex.tz_convert`` converting ``NaT`` incorrectly (:issue:`5546`) +- Bug in arithmetic operations affecting ``NaT`` (:issue:`6873`) +- Bug in ``Series.str.extract`` where the resulting ``Series`` from a single + group match wasn't renamed to the group name +- Bug in ``DataFrame.to_csv`` where setting ``index=False`` ignored the + ``header`` kwarg (:issue:`6186`) +- Bug in ``DataFrame.plot`` and ``Series.plot``, where the legend behave inconsistently when plotting to the same axes repeatedly (:issue:`6678`) +- Internal tests for patching ``__finalize__`` / bug in merge not finalizing (:issue:`6923`, :issue:`6927`) +- accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`) +- Bug in C parser with leading whitespace (:issue:`3374`) +- Bug in C parser with ``delim_whitespace=True`` and ``\r``-delimited lines +- Bug in python parser with explicit multi-index in row following column header (:issue:`6893`) +- Bug in ``Series.rank`` and ``DataFrame.rank`` that caused small floats (<1e-13) to all receive the same rank (:issue:`6886`) +- Bug in ``DataFrame.apply`` with functions that used \*args`` or \*\*kwargs and returned + an empty result (:issue:`6952`) +- Bug in sum/mean on 32-bit platforms on overflows (:issue:`6915`) +- Moved ``Panel.shift`` to ``NDFrame.slice_shift`` and fixed to respect multiple dtypes. 
(:issue:`6959`) +- Bug in enabling ``subplots=True`` in ``DataFrame.plot`` only has single column raises ``TypeError``, and ``Series.plot`` raises ``AttributeError`` (:issue:`6951`) +- Bug in ``DataFrame.plot`` draws unnecessary axes when enabling ``subplots`` and ``kind=scatter`` (:issue:`6951`) +- Bug in ``read_csv`` from a filesystem with non-utf-8 encoding (:issue:`6807`) +- Bug in ``iloc`` when setting / aligning (:issue:`6766`) +- Bug causing UnicodeEncodeError when get_dummies called with unicode values and a prefix (:issue:`6885`) +- Bug in timeseries-with-frequency plot cursor display (:issue:`5453`) +- Bug surfaced in ``groupby.plot`` when using a ``Float64Index`` (:issue:`7025`) +- Stopped tests from failing if options data isn't able to be downloaded from Yahoo (:issue:`7034`) +- Bug in ``parallel_coordinates`` and ``radviz`` where reordering of class column + caused possible color/class mismatch (:issue:`6956`) +- Bug in ``radviz`` and ``andrews_curves`` where multiple values of 'color' + were being passed to plotting method (:issue:`6956`) +- Bug in ``Float64Index.isin()`` where containing ``nan`` s would make indices + claim that they contained all the things (:issue:`7066`). +- Bug in ``DataFrame.boxplot`` where it failed to use the axis passed as the ``ax`` argument (:issue:`3578`) +- Bug in the ``XlsxWriter`` and ``XlwtWriter`` implementations that resulted in datetime columns being formatted without the time (:issue:`7075`) + were being passed to plotting method +- :func:`read_fwf` treats ``None`` in ``colspec`` like regular python slices. It now reads from the beginning + or until the end of the line when ``colspec`` contains a ``None`` (previously raised a ``TypeError``) +- Bug in cache coherence with chained indexing and slicing; add ``_is_view`` property to ``NDFrame`` to correctly predict + views; mark ``is_copy`` on ``xs`` only if its an actual copy (and not a view) (:issue:`7084`) +- Bug in DatetimeIndex creation from string ndarray with ``dayfirst=True`` (:issue:`5917`) +- Bug in ``MultiIndex.from_arrays`` created from ``DatetimeIndex`` doesn't preserve ``freq`` and ``tz`` (:issue:`7090`) +- Bug in ``unstack`` raises ``ValueError`` when ``MultiIndex`` contains ``PeriodIndex`` (:issue:`4342`) +- Bug in ``boxplot`` and ``hist`` draws unnecessary axes (:issue:`6769`) +- Regression in ``groupby.nth()`` for out-of-bounds indexers (:issue:`6621`) +- Bug in ``quantile`` with datetime values (:issue:`6965`) +- Bug in ``Dataframe.set_index``, ``reindex`` and ``pivot`` don't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`3950`, :issue:`5878`, :issue:`6631`) +- Bug in ``MultiIndex.get_level_values`` doesn't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`7092`) +- Bug in ``Groupby`` doesn't preserve ``tz`` (:issue:`3950`) +- Bug in ``PeriodIndex`` partial string slicing (:issue:`6716`) +- Bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the `large_repr` set to 'info' + (:issue:`7105`) +- Bug in ``DatetimeIndex`` specifying ``freq`` raises ``ValueError`` when passed value is too short (:issue:`7098`) +- Fixed a bug with the `info` repr not honoring the `display.max_info_columns` setting (:issue:`6939`) +- Bug ``PeriodIndex`` string slicing with out of bounds values (:issue:`5407`) +- Fixed a memory error in the hashtable implementation/factorizer on resizing of large tables (:issue:`7157`) +- Bug in ``isnull`` when applied to 0-dimensional object arrays (:issue:`7176`) +- Bug in ``query``/``eval`` 
where global constants were not looked up correctly + (:issue:`7178`) +- Bug in recognizing out-of-bounds positional list indexers with ``iloc`` and a multi-axis tuple indexer (:issue:`7189`) +- Bug in setitem with a single value, multi-index and integer indices (:issue:`7190`, :issue:`7218`) +- Bug in expressions evaluation with reversed ops, showing in series-dataframe ops (:issue:`7198`, :issue:`7192`) +- Bug in multi-axis indexing with > 2 ndim and a multi-index (:issue:`7199`) +- Fix a bug where invalid eval/query operations would blow the stack (:issue:`5198`) diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt new file mode 100644 index 00000000..2b5f8b2d --- /dev/null +++ b/doc/source/v0.14.1.txt @@ -0,0 +1,271 @@ +.. _whatsnew_0141: + +v0.14.1 (July 11, 2014) +----------------------- + +This is a minor release from 0.14.0 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +- Highlights include: + + - New methods :meth:`~pandas.DataFrame.select_dtypes` to select columns + based on the dtype and :meth:`~pandas.Series.sem` to calculate the + standard error of the mean. + - Support for dateutil timezones (see :ref:`docs `). + - Support for ignoring full line comments in the :func:`~pandas.read_csv` + text parser. + - New documentation section on :ref:`Options and Settings `. + - Lots of bug fixes. + +- :ref:`Enhancements ` +- :ref:`API Changes ` +- :ref:`Performance Improvements ` +- :ref:`Experimental Changes ` +- :ref:`Bug Fixes ` + +.. _whatsnew_0141.api: + +API changes +~~~~~~~~~~~ + +- Openpyxl now raises a ValueError on construction of the openpyxl writer + instead of warning on pandas import (:issue:`7284`). + +- For ``StringMethods.extract``, when no match is found, the result - only + containing ``NaN`` values - now also has ``dtype=object`` instead of + ``float`` (:issue:`7242`) + +- ``Period`` objects no longer raise a ``TypeError`` when compared using ``==`` + with another object that *isn't* a ``Period``. Instead + when comparing a ``Period`` with another object using ``==`` if the other + object isn't a ``Period`` ``False`` is returned. (:issue:`7376`) + +- Previously, the behaviour on resetting the time or not in + ``offsets.apply``, ``rollforward`` and ``rollback`` operations differed + between offsets. With the support of the ``normalize`` keyword for all offsets(see + below) with a default value of False (preserve time), the behaviour changed for certain + offsets (BusinessMonthBegin, MonthEnd, BusinessMonthEnd, CustomBusinessMonthEnd, + BusinessYearBegin, LastWeekOfMonth, FY5253Quarter, LastWeekOfMonth, Easter): + + .. code-block:: python + + In [6]: from pandas.tseries import offsets + + In [7]: d = pd.Timestamp('2014-01-01 09:00') + + # old behaviour < 0.14.1 + In [8]: d + offsets.MonthEnd() + Out[8]: Timestamp('2014-01-31 00:00:00') + + Starting from 0.14.1 all offsets preserve time by default. The old + behaviour can be obtained with ``normalize=True`` + + .. ipython:: python + :suppress: + + import pandas.tseries.offsets as offsets + d = pd.Timestamp('2014-01-01 09:00') + + .. ipython:: python + + # new behaviour + d + offsets.MonthEnd() + d + offsets.MonthEnd(normalize=True) + + Note that for the other offsets the default behaviour did not change. 
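+
+  For example, the same pattern applies to the other affected offsets listed
+  above (a minimal sketch following the ``MonthEnd`` example; ``d`` and
+  ``offsets`` are as defined there, and the commented results are the
+  expected values rather than captured output):
+
+  .. code-block:: python
+
+     # 0.14.1 default: the time component (09:00) is preserved
+     d + offsets.BusinessMonthEnd()
+
+     # pass normalize=True to recover the old behaviour (time reset to midnight)
+     d + offsets.BusinessMonthEnd(normalize=True)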
+
+- Add back ``#N/A N/A`` as a default NA value in text parsing (regression from 0.12) (:issue:`5521`)
+- Raise a ``TypeError`` on inplace-setting with a ``.where`` and a non-``np.nan`` value as this is inconsistent
+  with a set-item expression like ``df[mask] = None`` (:issue:`7656`)
+
+
+.. _whatsnew_0141.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+- Add ``dropna`` argument to ``value_counts`` and ``nunique`` (:issue:`5569`).
+- Add :meth:`~pandas.DataFrame.select_dtypes` method to allow selection of
+  columns based on dtype (:issue:`7316`). See :ref:`the docs `.
+- All ``offsets`` now support the ``normalize`` keyword to specify whether
+  ``offsets.apply``, ``rollforward`` and ``rollback`` reset the time (hour,
+  minute, etc.) or not (default ``False``, preserves time) (:issue:`7156`):
+
+  .. ipython:: python
+
+     import pandas.tseries.offsets as offsets
+
+     day = offsets.Day()
+     day.apply(Timestamp('2014-01-01 09:00'))
+
+     day = offsets.Day(normalize=True)
+     day.apply(Timestamp('2014-01-01 09:00'))
+
+- ``PeriodIndex`` is represented in the same format as ``DatetimeIndex`` (:issue:`7601`)
+- ``StringMethods`` now work on empty Series (:issue:`7242`)
+- The file parsers ``read_csv`` and ``read_table`` now ignore line comments provided by
+  the parameter ``comment``, which accepts only a single character for the C reader.
+  In particular, they allow for comments before file data begins (:issue:`2685`)
+- Add ``NotImplementedError`` for simultaneous use of ``chunksize`` and ``nrows``
+  for ``read_csv()`` (:issue:`6774`).
+- Tests for basic reading of public S3 buckets now exist (:issue:`7281`).
+- ``read_html`` now sports an ``encoding`` argument that is passed to the
+  underlying parser library. You can use this to read non-ascii encoded web
+  pages (:issue:`7323`).
+- ``read_excel`` now supports reading from URLs in the same way
+  that ``read_csv`` does. (:issue:`6809`)
+- Support for dateutil timezones, which can now be used in the same way as
+  pytz timezones across pandas. (:issue:`4688`)
+
+  .. ipython:: python
+
+     rng = date_range('3/6/2012 00:00', periods=10, freq='D',
+                      tz='dateutil/Europe/London')
+     rng.tz
+
+  See :ref:`the docs `.
+
+- Implemented ``sem`` (standard error of the mean) operation for ``Series``,
+  ``DataFrame``, ``Panel``, and ``GroupBy`` (:issue:`6897`)
+- Add ``nlargest`` and ``nsmallest`` to the ``Series`` ``groupby`` whitelist,
+  which means you can now use these methods on a ``SeriesGroupBy`` object
+  (:issue:`7053`).
+- All offsets ``apply``, ``rollforward`` and ``rollback`` can now handle ``np.datetime64``; previously this raised ``ApplyTypeError`` (:issue:`7452`)
+- ``Period`` and ``PeriodIndex`` can contain ``NaT`` in their values (:issue:`7485`)
+- Support pickling ``Series``, ``DataFrame`` and ``Panel`` objects with
+  non-unique labels along *item* axis (``index``, ``columns`` and ``items``
+  respectively) (:issue:`7370`).
+- Improved inference of datetime/timedelta with mixed null objects; fixes a regression from 0.13.1 in the
+  interpretation of an object Index with all null elements (:issue:`7431`)
+
+.. _whatsnew_0141.performance:
+
+Performance
+~~~~~~~~~~~
+- Improvements in dtype inference for numeric operations, yielding performance gains for the ``int64``, ``timedelta64`` and ``datetime64`` dtypes (:issue:`7223`)
+- Improvements in ``Series.transform`` for significant performance gains (:issue:`6496`)
+- Improvements in ``DataFrame.transform`` with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`)
+- Regression in groupby aggregation of datetime64 dtypes (:issue:`7555`)
+- Improvements in ``MultiIndex.from_product`` for large iterables (:issue:`7627`)
+
+
+.. _whatsnew_0141.experimental:
+
+Experimental
+~~~~~~~~~~~~
+
+- ``pandas.io.data.Options`` has a new method, ``get_all_data``, and now consistently returns a
+  multi-indexed ``DataFrame``, see :ref:`the docs `. (:issue:`5602`)
+- ``io.gbq.read_gbq`` and ``io.gbq.to_gbq`` were refactored to remove the
+  dependency on the Google ``bq.py`` command line client. This submodule
+  now uses ``httplib2`` and the Google ``apiclient`` and ``oauth2client`` API client
+  libraries, which should be more stable and, therefore, reliable than
+  ``bq.py``. See :ref:`the docs `. (:issue:`6937`).
+
+
+.. _whatsnew_0141.bug_fixes:
+
+Bug Fixes
+~~~~~~~~~
+- Bug in ``DataFrame.where`` with a symmetric shaped frame and a passed other of a DataFrame (:issue:`7506`)
+- Bug in Panel indexing with a multi-index axis (:issue:`7516`)
+- Regression in datetimelike slice indexing with a duplicated index and non-exact end-points (:issue:`7523`)
+- Bug in setitem with list-of-lists and single vs mixed types (:issue:`7551`)
+- Bug in timeops with non-aligned Series (:issue:`7500`)
+- Bug in timedelta inference when assigning an incomplete Series (:issue:`7592`)
+- Bug in groupby ``.nth`` with a Series and integer-like column name (:issue:`7559`)
+- Bug in ``Series.get`` with a boolean accessor (:issue:`7407`)
+- Bug in ``value_counts`` where ``NaT`` did not qualify as missing (``NaN``) (:issue:`7423`)
+- Bug in ``to_timedelta`` that accepted invalid units and misinterpreted 'm/h' (:issue:`7611`, :issue:`6423`)
+- Bug in line plot not setting the correct ``xlim`` if ``secondary_y=True`` (:issue:`7459`)
+- Bug in grouped ``hist`` and ``scatter`` plots using the old ``figsize`` default (:issue:`7394`)
+- Bug in plotting subplots with ``DataFrame.plot``, ``hist`` clears passed ``ax`` even if the number of subplots is one (:issue:`7391`).
+- Bug in plotting subplots with ``DataFrame.boxplot`` with ``by`` kw raises ``ValueError`` if the number of subplots exceeds 1 (:issue:`7391`).
+- Bug in subplots displaying ``ticklabels`` and ``labels`` according to different rules (:issue:`5897`)
+- Bug in ``Panel.apply`` with a multi-index as an axis (:issue:`7469`)
+- Bug in ``DatetimeIndex.insert`` doesn't preserve ``name`` and ``tz`` (:issue:`7299`)
+- Bug in ``DatetimeIndex.asobject`` doesn't preserve ``name`` (:issue:`7299`)
+- Bug in multi-index slicing with datetimelike ranges (strings and Timestamps) (:issue:`7429`)
+- Bug in ``Index.min`` and ``max`` doesn't handle ``nan`` and ``NaT`` properly (:issue:`7261`)
+- Bug in ``PeriodIndex.min/max`` results in ``int`` (:issue:`7609`)
+- Bug in ``resample`` where ``fill_method`` was ignored if you passed ``how`` (:issue:`2073`)
+- Bug in ``TimeGrouper`` doesn't exclude column specified by ``key`` (:issue:`7227`)
+- Bug in ``DataFrame`` and ``Series`` bar and barh plot raises ``TypeError`` when the ``bottom``
+  and ``left`` keywords are specified (:issue:`7226`)
+- Bug in ``DataFrame.hist`` raises ``TypeError`` when it contains a non-numeric column (:issue:`7277`)
+- Bug in ``Index.delete`` does not preserve ``name`` and ``freq`` attributes (:issue:`7302`)
+- Bug in ``DataFrame.query()``/``eval`` where local string variables with the @
+  sign were being treated as temporaries attempting to be deleted
+  (:issue:`7300`).
+- Bug in ``Float64Index`` which didn't allow duplicates (:issue:`7149`).
+- Bug in ``DataFrame.replace()`` where truthy values were being replaced
+  (:issue:`7140`).
+- Bug in ``StringMethods.extract()`` where a single match group Series
+  would use the matcher's name instead of the group name (:issue:`7313`).
+- Bug in ``isnull()`` when ``mode.use_inf_as_null == True`` where isnull
+  wouldn't test ``True`` when it encountered an ``inf``/``-inf``
+  (:issue:`7315`).
+- Bug in ``inferred_freq`` returning ``None`` for eastern hemisphere timezones (:issue:`7310`)
+- Bug in ``Easter`` returns incorrect date when offset is negative (:issue:`7195`)
+- Bug in broadcasting with ``.div``, integer dtypes and divide-by-zero (:issue:`7325`)
+- Bug in ``CustomBusinessDay.apply`` raises ``NameError`` when ``np.datetime64`` object is passed (:issue:`7196`)
+- Bug in ``MultiIndex.append``, ``concat`` and ``pivot_table`` don't preserve timezone (:issue:`6606`)
+- Bug in ``.loc`` with a list of indexers on a single-multi index level (that is not nested) (:issue:`7349`)
+- Bug in ``Series.map`` when mapping a dict with tuple keys of different lengths (:issue:`7333`)
+- Bug where ``StringMethods`` did not work on an empty ``Series`` (:issue:`7242`)
+- Fix delegation of ``read_sql`` to ``read_sql_query`` when query does not contain 'select' (:issue:`7324`).
+- Bug where a string column name assignment to a ``DataFrame`` with a
+  ``Float64Index`` raised a ``TypeError`` during a call to ``np.isnan``
+  (:issue:`7366`).
+- Bug where ``NDFrame.replace()`` didn't correctly replace objects with
+  ``Period`` values (:issue:`7379`).
+- Bug in ``.ix`` getitem should always return a Series (:issue:`7150`)
+- Bug in multi-index slicing with incomplete indexers (:issue:`7399`)
+- Bug in multi-index slicing with a step in a sliced level (:issue:`7400`)
+- Bug where negative indexers in ``DatetimeIndex`` were not correctly sliced
+  (:issue:`7408`)
+- Bug where ``NaT`` wasn't repr'd correctly in a ``MultiIndex`` (:issue:`7406`,
+  :issue:`7409`).
+- Bug where bool objects were converted to ``nan`` in ``convert_objects``
+  (:issue:`7416`).
+- Bug in ``quantile`` ignoring the axis keyword argument (:issue`7306`) +- Bug where ``nanops._maybe_null_out`` doesn't work with complex numbers + (:issue:`7353`) +- Bug in several ``nanops`` functions when ``axis==0`` for + 1-dimensional ``nan`` arrays (:issue:`7354`) +- Bug where ``nanops.nanmedian`` doesn't work when ``axis==None`` + (:issue:`7352`) +- Bug where ``nanops._has_infs`` doesn't work with many dtypes + (:issue:`7357`) +- Bug in ``StataReader.data`` where reading a 0-observation dta failed (:issue:`7369`) +- Bug in when reading Stata 13 (117) files containing fixed width strings (:issue:`7360`) +- Bug in when writing Stata files where the encoding was ignored (:issue:`7286`) +- Bug in ``DatetimeIndex`` comparison doesn't handle ``NaT`` properly (:issue:`7529`) +- Bug in passing input with ``tzinfo`` to some offsets ``apply``, ``rollforward`` or ``rollback`` resets ``tzinfo`` or raises ``ValueError`` (:issue:`7465`) +- Bug in ``DatetimeIndex.to_period``, ``PeriodIndex.asobject``, ``PeriodIndex.to_timestamp`` doesn't preserve ``name`` (:issue:`7485`) +- Bug in ``DatetimeIndex.to_period`` and ``PeriodIndex.to_timestanp`` handle ``NaT`` incorrectly (:issue:`7228`) +- Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may return normal ``datetime`` (:issue:`7502`) +- Bug in ``resample`` raises ``ValueError`` when target contains ``NaT`` (:issue:`7227`) +- Bug in ``Timestamp.tz_localize`` resets ``nanosecond`` info (:issue:`7534`) +- Bug in ``DatetimeIndex.asobject`` raises ``ValueError`` when it contains ``NaT`` (:issue:`7539`) +- Bug in ``Timestamp.__new__`` doesn't preserve nanosecond properly (:issue:`7610`) +- Bug in ``Index.astype(float)`` where it would return an ``object`` dtype + ``Index`` (:issue:`7464`). +- Bug in ``DataFrame.reset_index`` loses ``tz`` (:issue:`3950`) +- Bug in ``DatetimeIndex.freqstr`` raises ``AttributeError`` when ``freq`` is ``None`` (:issue:`7606`) +- Bug in ``GroupBy.size`` created by ``TimeGrouper`` raises ``AttributeError`` (:issue:`7453`) +- Bug in single column bar plot is misaligned (:issue:`7498`). +- Bug in area plot with tz-aware time series raises ``ValueError`` (:issue:`7471`) +- Bug in non-monotonic ``Index.union`` may preserve ``name`` incorrectly (:issue:`7458`) +- Bug in ``DatetimeIndex.intersection`` doesn't preserve timezone (:issue:`4690`) +- Bug in ``rolling_var`` where a window larger than the array would raise an error(:issue:`7297`) +- Bug with last plotted timeseries dictating ``xlim`` (:issue:`2960`) +- Bug with ``secondary_y`` axis not being considered for timeseries ``xlim`` (:issue:`3490`) +- Bug in ``Float64Index`` assignment with a non scalar indexer (:issue:`7586`) +- Bug in ``pandas.core.strings.str_contains`` does not properly match in a case insensitive fashion when ``regex=False`` and ``case=False`` (:issue:`7505`) +- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, and ``rolling_corr`` for two arguments with mismatched index (:issue:`7512`) +- Bug in ``to_sql`` taking the boolean column as text column (:issue:`7678`) +- Bug in grouped `hist` doesn't handle `rot` kw and `sharex` kw properly (:issue:`7234`) +- Bug in ``.loc`` performing fallback integer indexing with ``object`` dtype indices (:issue:`7496`) +- Bug (regression) in ``PeriodIndex`` constructor when passed ``Series`` objects (:issue:`7701`). diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt new file mode 100644 index 00000000..d776848d --- /dev/null +++ b/doc/source/v0.15.0.txt @@ -0,0 +1,225 @@ +.. 
_whatsnew_0150: + +v0.15.0 (???) +------------- + +This is a major release from 0.14.1 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +- Highlights include: + + - Add highlites here + +- :ref:`Other Enhancements ` + +- :ref:`API Changes ` + +- :ref:`Performance Improvements ` + +- :ref:`Prior Deprecations ` + +- :ref:`Deprecations ` + +- :ref:`Known Issues ` + +- :ref:`Bug Fixes ` + +.. _whatsnew_0150.api: + +API changes +~~~~~~~~~~~ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. _whatsnew_0150.prior_deprecations: + +Prior Version Deprecations/Changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are prior version deprecations that are taking effect as of 0.15.0. + +.. _whatsnew_0150.deprecations: + +Deprecations +~~~~~~~~~~~~ + +.. _whatsnew_0150.knownissues: + +Known Issues +~~~~~~~~~~~~ + +.. _whatsnew_0150.enhancements: + +Enhancements +~~~~~~~~~~~~ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. _whatsnew_0150.performance: + +Performance +~~~~~~~~~~~ + + + + + + + + + + + + + + + + + + + + + + + +.. _whatsnew_0150.experimental: + +Experimental +~~~~~~~~~~~~ + +There are no experimental changes in 0.15.0 + +.. _whatsnew_0150.bug_fixes: + +Bug Fixes +~~~~~~~~~ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/source/v0.4.x.txt b/doc/source/v0.4.x.txt new file mode 100644 index 00000000..5333bb9f --- /dev/null +++ b/doc/source/v0.4.x.txt @@ -0,0 +1,63 @@ +.. _whatsnew_04x: + +v.0.4.3 through v0.4.1 (September 25 - October 9, 2011) +------------------------------------------------------- + +New Features +~~~~~~~~~~~~ + +- Added Python 3 support using 2to3 (:issue:`200`) +- :ref:`Added ` ``name`` attribute to ``Series``, now + prints as part of ``Series.__repr__`` +- :ref:`Added ` instance methods ``isnull`` and ``notnull`` to + Series (:issue:`209`, :issue:`203`) +- :ref:`Added ` ``Series.align`` method for aligning two series + with choice of join method (ENH56_) +- :ref:`Added ` method ``get_level_values`` to + ``MultiIndex`` (:issue:`188`) +- Set values in mixed-type ``DataFrame`` objects via ``.ix`` indexing attribute (:issue:`135`) +- Added new ``DataFrame`` :ref:`methods ` + ``get_dtype_counts`` and property ``dtypes`` (ENHdc_) +- Added :ref:`ignore_index ` option to + ``DataFrame.append`` to stack DataFrames (ENH1b_) +- ``read_csv`` tries to :ref:`sniff ` delimiters using + ``csv.Sniffer`` (:issue:`146`) +- ``read_csv`` can :ref:`read ` multiple columns into a + ``MultiIndex``; DataFrame's ``to_csv`` method writes out a corresponding + ``MultiIndex`` (:issue:`151`) +- ``DataFrame.rename`` has a new ``copy`` parameter to :ref:`rename + ` a DataFrame in place (ENHed_) +- :ref:`Enable ` unstacking by name (:issue:`142`) +- :ref:`Enable ` ``sortlevel`` to work by level (:issue:`141`) + +Performance Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Altered binary operations on differently-indexed SparseSeries objects + to use the integer-based (dense) alignment logic which is faster with a + larger number of blocks (:issue:`205`) +- Wrote faster Cython data alignment / merging routines resulting in + substantial speed increases +- Improved performance of ``isnull`` and ``notnull``, a regression from v0.3.0 + (:issue:`187`) +- Refactored code related to ``DataFrame.join`` so that 
intermediate aligned + copies of the data in each ``DataFrame`` argument do not need to be created. + Substantial performance increases result (:issue:`176`) +- Substantially improved performance of generic ``Index.intersection`` and + ``Index.union`` +- Implemented ``BlockManager.take`` resulting in significantly faster ``take`` + performance on mixed-type ``DataFrame`` objects (:issue:`104`) +- Improved performance of ``Series.sort_index`` +- Significant groupby performance enhancement: removed unnecessary integrity + checks in DataFrame internals that were slowing down slicing operations to + retrieve groups +- Optimized ``_ensure_index`` function resulting in performance savings in + type-checking Index objects +- Wrote fast time series merging / joining methods in Cython. Will be + integrated later into DataFrame.join and related functions + +.. _ENH1b: https://github.com/pydata/pandas/commit/1ba56251f0013ff7cd8834e9486cef2b10098371 +.. _ENHdc: https://github.com/pydata/pandas/commit/dca3c5c5a6a3769ee01465baca04cfdfa66a4f76 +.. _ENHed: https://github.com/pydata/pandas/commit/edd9f1945fc010a57fa0ae3b3444d1fffe592591 +.. _ENH56: https://github.com/pydata/pandas/commit/56e0c9ffafac79ce262b55a6a13e1b10a88fbe93 + diff --git a/doc/source/v0.5.0.txt b/doc/source/v0.5.0.txt new file mode 100644 index 00000000..d0550fd5 --- /dev/null +++ b/doc/source/v0.5.0.txt @@ -0,0 +1,43 @@ + +.. _whatsnew_050: + +v.0.5.0 (October 24, 2011) +-------------------------- + +New Features +~~~~~~~~~~~~ + +- :ref:`Added ` ``DataFrame.align`` method with standard join options +- :ref:`Added ` ``parse_dates`` option to ``read_csv`` and ``read_table`` methods to optionally try to parse dates in the index columns +- :ref:`Added ` ``nrows``, ``chunksize``, and ``iterator`` arguments to ``read_csv`` and ``read_table``. The last two return a new ``TextParser`` class capable of lazily iterating through chunks of a flat file (:issue:`242`) +- :ref:`Added ` ability to join on multiple columns in ``DataFrame.join`` (:issue:`214`) +- Added private ``_get_duplicates`` function to ``Index`` for identifying duplicate values more easily (ENH5c_) +- :ref:`Added ` column attribute access to DataFrame. +- :ref:`Added ` Python tab completion hook for DataFrame columns. 
(:issue:`233`, :issue:`230`) +- :ref:`Implemented ` ``Series.describe`` for Series containing objects (:issue:`241`) +- :ref:`Added ` inner join option to ``DataFrame.join`` when joining on key(s) (:issue:`248`) +- :ref:`Implemented ` selecting DataFrame columns by passing a list to ``__getitem__`` (:issue:`253`) +- :ref:`Implemented ` & and | to intersect / union Index objects, respectively (:issue:`261`) +- :ref:`Added` ``pivot_table`` convenience function to pandas namespace (:issue:`234`) +- :ref:`Implemented ` ``Panel.rename_axis`` function (:issue:`243`) +- DataFrame will show index level names in console output (:issue:`334`) +- :ref:`Implemented ` ``Panel.take`` +- :ref:`Added` ``set_eng_float_format`` for alternate DataFrame floating point string formatting (ENH61_) +- :ref:`Added ` convenience ``set_index`` function for creating a DataFrame index from its existing columns +- :ref:`Implemented ` ``groupby`` hierarchical index level name (:issue:`223`) +- :ref:`Added ` support for different delimiters in ``DataFrame.to_csv`` (:issue:`244`) +- TODO: DOCS ABOUT TAKE METHODS + +Performance Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- VBENCH Major performance improvements in file parsing functions ``read_csv`` and ``read_table`` +- VBENCH Added Cython function for converting tuples to ndarray very fast. Speeds up many MultiIndex-related operations +- VBENCH Refactored merging / joining code into a tidy class and disabled unnecessary computations in the float/object case, thus getting about 10% better performance (:issue:`211`) +- VBENCH Improved speed of ``DataFrame.xs`` on mixed-type DataFrame objects by about 5x, regression from 0.3.0 (:issue:`215`) +- VBENCH With new ``DataFrame.align`` method, speeding up binary operations between differently-indexed DataFrame objects by 10-25%. +- VBENCH Significantly sped up conversion of nested dict into DataFrame (:issue:`212`) +- VBENCH Significantly speed up DataFrame ``__repr__`` and ``count`` on large mixed-type DataFrame objects + +.. _ENH61: https://github.com/pydata/pandas/commit/6141961 +.. _ENH5c: https://github.com/pydata/pandas/commit/5ca6ff5d822ee4ddef1ec0d87b6d83d8b4bbd3eb diff --git a/doc/source/v0.6.0.txt b/doc/source/v0.6.0.txt new file mode 100644 index 00000000..55a67a75 --- /dev/null +++ b/doc/source/v0.6.0.txt @@ -0,0 +1,56 @@ +.. 
_whatsnew_060: + +v.0.6.0 (November 25, 2011) +--------------------------- + +New Features +~~~~~~~~~~~~ +- :ref:`Added ` ``melt`` function to ``pandas.core.reshape`` +- :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) +- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to to DataFrame (:issue:`296`) +- :ref:`Added ` ``Series.isin`` function which checks if each value is contained in a passed sequence (:issue:`289`) +- :ref:`Added ` ``float_format`` option to ``Series.to_string`` +- :ref:`Added ` ``skip_footer`` (:issue:`291`) and ``converters`` (:issue:`343`) options to ``read_csv`` and ``read_table`` +- :ref:`Added ` ``drop_duplicates`` and ``duplicated`` functions for removing duplicate DataFrame rows and checking for duplicate rows, respectively (:issue:`319`) +- :ref:`Implemented ` operators '&', '|', '^', '-' on DataFrame (:issue:`347`) +- :ref:`Added ` ``Series.mad``, mean absolute deviation +- :ref:`Added ` ``QuarterEnd`` DateOffset (:issue:`321`) +- :ref:`Added ` ``dot`` to DataFrame (:issue:`65`) +- :ref:`Added ` ``orient`` option to ``Panel.from_dict`` (:issue:`359`, :issue:`301`) +- :ref:`Added ` ``orient`` option to ``DataFrame.from_dict`` +- :ref:`Added ` passing list of tuples or list of lists to ``DataFrame.from_records`` (:issue:`357`) +- :ref:`Added ` multiple levels to groupby (:issue:`103`) +- :ref:`Allow ` multiple columns in ``by`` argument of ``DataFrame.sort_index`` (:issue:`92`, :issue:`362`) +- :ref:`Added ` fast ``get_value`` and ``put_value`` methods to DataFrame (:issue:`360`) +- :ref:`Added ` ``cov`` instance methods to Series and DataFrame (:issue:`194`, :issue:`362`) +- :ref:`Added ` ``kind='bar'`` option to ``DataFrame.plot`` (:issue:`348`) +- :ref:`Added ` ``idxmin`` and ``idxmax`` to Series and DataFrame (:issue:`286`) +- :ref:`Added ` ``read_clipboard`` function to parse DataFrame from clipboard (:issue:`300`) +- :ref:`Added ` ``nunique`` function to Series for counting unique elements (:issue:`297`) +- :ref:`Made ` DataFrame constructor use Series name if no columns passed (:issue:`373`) +- :ref:`Support ` regular expressions in read_table/read_csv (:issue:`364`) +- :ref:`Added ` ``DataFrame.to_html`` for writing DataFrame to HTML (:issue:`387`) +- :ref:`Added ` support for MaskedArray data in DataFrame, masked values converted to NaN (:issue:`396`) +- :ref:`Added ` ``DataFrame.boxplot`` function (:issue:`368`) +- :ref:`Can ` pass extra args, kwds to DataFrame.apply (:issue:`376`) +- :ref:`Implement ` ``DataFrame.join`` with vector ``on`` argument (:issue:`312`) +- :ref:`Added ` ``legend`` boolean flag to ``DataFrame.plot`` (:issue:`324`) +- :ref:`Can ` pass multiple levels to ``stack`` and ``unstack`` (:issue:`370`) +- :ref:`Can ` pass multiple values columns to ``pivot_table`` (:issue:`381`) +- :ref:`Use ` Series name in GroupBy for result index (:issue:`363`) +- :ref:`Added ` ``raw`` option to ``DataFrame.apply`` for performance if only need ndarray (:issue:`309`) +- Added proper, tested weighted least squares to standard and panel OLS (:issue:`303`) + +Performance Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~ +- VBENCH Cythonized ``cache_readonly``, resulting in substantial micro-performance enhancements throughout the codebase (:issue:`361`) +- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than `np.apply_along_axis` (:issue:`309`) +- VBENCH Improved performance of ``MultiIndex.from_tuples`` +- VBENCH Special 
Cython matrix iterator for applying arbitrary reduction operations +- VBENCH + DOCUMENT Add ``raw`` option to ``DataFrame.apply`` for getting better performance when +- VBENCH Faster cythonized count by level in Series and DataFrame (:issue:`341`) +- VBENCH? Significant GroupBy performance enhancement with multiple keys with many "empty" combinations +- VBENCH New Cython vectorized function ``map_infer`` speeds up ``Series.apply`` and ``Series.map`` significantly when passed elementwise Python function, motivated by (:issue:`355`) +- VBENCH Significantly improved performance of ``Series.order``, which also makes np.unique called on a Series faster (:issue:`327`) +- VBENCH Vastly improved performance of GroupBy on axes with a MultiIndex (:issue:`299`) + diff --git a/doc/source/v0.6.1.txt b/doc/source/v0.6.1.txt new file mode 100644 index 00000000..7e593d07 --- /dev/null +++ b/doc/source/v0.6.1.txt @@ -0,0 +1,50 @@ + +.. _whatsnew_061: + +v.0.6.1 (December 13, 2011) +--------------------------- + +New features +~~~~~~~~~~~~ +- Can :ref:`append single rows ` (as Series) to a DataFrame +- Add Spearman and Kendall rank :ref:`correlation ` + options to Series.corr and DataFrame.corr (:issue:`428`) +- :ref:`Added ` ``get_value`` and ``set_value`` methods to + Series, DataFrame, and Panel for very low-overhead access (>2x faster in many + cases) to scalar elements (:issue:`437`, :issue:`438`). ``set_value`` is capable of + producing an enlarged object. +- Add PyQt table widget to sandbox (:issue:`435`) +- DataFrame.align can :ref:`accept Series arguments ` + and an :ref:`axis option ` (:issue:`461`) +- Implement new :ref:`SparseArray ` and :ref:`SparseList ` + data structures. SparseSeries now derives from SparseArray (:issue:`463`) +- :ref:`Better console printing options ` (:issue:`453`) +- Implement fast :ref:`data ranking ` for Series and + DataFrame, fast versions of scipy.stats.rankdata (:issue:`428`) +- Implement :ref:`DataFrame.from_items ` alternate + constructor (:issue:`444`) +- DataFrame.convert_objects method for :ref:`inferring better dtypes ` + for object columns (:issue:`302`) +- Add :ref:`rolling_corr_pairwise ` function for + computing Panel of correlation matrices (:issue:`189`) +- Add :ref:`margins ` option to :ref:`pivot_table + ` for computing subgroup aggregates (:issue:`114`) +- Add ``Series.from_csv`` function (:issue:`482`) +- :ref:`Can pass ` DataFrame/DataFrame and + DataFrame/Series to rolling_corr/rolling_cov (GH #462) +- MultiIndex.get_level_values can :ref:`accept the level name ` + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improve memory usage of `DataFrame.describe` (do not copy data + unnecessarily) (PR #425) + +- Optimize scalar value lookups in the general case by 25% or more in Series + and DataFrame + +- Fix performance regression in cross-sectional count in DataFrame, affecting + DataFrame.dropna speed +- Column deletion in DataFrame copies no data (computes views on blocks) (GH + #158) + diff --git a/doc/source/v0.7.0.txt b/doc/source/v0.7.0.txt new file mode 100644 index 00000000..bf7acd38 --- /dev/null +++ b/doc/source/v0.7.0.txt @@ -0,0 +1,272 @@ +.. _whatsnew_0700: + +v.0.7.0 (February 9, 2012) +-------------------------- + +New features +~~~~~~~~~~~~ + +- New unified :ref:`merge function ` for efficiently performing + full gamut of database / relational-algebra operations. 
Refactored existing + join methods to use the new infrastructure, resulting in substantial + performance gains (:issue:`220`, :issue:`249`, :issue:`267`) + +- New :ref:`unified concatenation function ` for concatenating + Series, DataFrame or Panel objects along an axis. Can form union or + intersection of the other axes. Improves performance of ``Series.append`` and + ``DataFrame.append`` (:issue:`468`, :issue:`479`, :issue:`273`) + +- :ref:`Can ` pass multiple DataFrames to + `DataFrame.append` to concatenate (stack) and multiple Series to + ``Series.append`` too + +- :ref:`Can` pass list of dicts (e.g., a + list of JSON objects) to DataFrame constructor (:issue:`526`) + +- You can now :ref:`set multiple columns ` in a + DataFrame via ``__getitem__``, useful for transformation (:issue:`342`) + +- Handle differently-indexed output values in ``DataFrame.apply`` (:issue:`498`) + +.. ipython:: python + + df = DataFrame(randn(10, 4)) + df.apply(lambda x: x.describe()) + +- :ref:`Add` ``reorder_levels`` method to Series and + DataFrame (:issue:`534`) + +- :ref:`Add` dict-like ``get`` function to DataFrame + and Panel (:issue:`521`) + +- :ref:`Add` ``DataFrame.iterrows`` method for efficiently + iterating through the rows of a DataFrame + +- :ref:`Add` ``DataFrame.to_panel`` with code adapted from + ``LongPanel.to_long`` + +- :ref:`Add ` ``reindex_axis`` method added to DataFrame + +- :ref:`Add ` ``level`` option to binary arithmetic functions on + ``DataFrame`` and ``Series`` + +- :ref:`Add ` ``level`` option to the ``reindex`` + and ``align`` methods on Series and DataFrame for broadcasting values across + a level (:issue:`542`, :issue:`552`, others) + +- :ref:`Add ` attribute-based item access to + ``Panel`` and add IPython completion (:issue:`563`) + +- :ref:`Add ` ``logy`` option to ``Series.plot`` for + log-scaling on the Y axis + +- :ref:`Add ` ``index`` and ``header`` options to + ``DataFrame.to_string`` + +- :ref:`Can ` pass multiple DataFrames to + ``DataFrame.join`` to join on index (:issue:`115`) + +- :ref:`Can ` pass multiple Panels to ``Panel.join`` + (:issue:`115`) + +- :ref:`Added ` ``justify`` argument to ``DataFrame.to_string`` + to allow different alignment of column headers + +- :ref:`Add ` ``sort`` option to GroupBy to allow disabling + sorting of the group keys for potential speedups (:issue:`595`) + +- :ref:`Can ` pass MaskedArray to Series + constructor (:issue:`563`) + +- :ref:`Add ` Panel item access via attributes + and IPython completion (:issue:`554`) + +- Implement ``DataFrame.lookup``, fancy-indexing analogue for retrieving values + given a sequence of row and column labels (:issue:`338`) + +- Can pass a :ref:`list of functions ` to + aggregate with groupby on a DataFrame, yielding an aggregated result with + hierarchical columns (:issue:`166`) + +- Can call ``cummin`` and ``cummax`` on Series and DataFrame to get cumulative + minimum and maximum, respectively (:issue:`647`) + +- ``value_range`` added as utility function to get min and max of a dataframe + (:issue:`288`) + +- Added ``encoding`` argument to ``read_csv``, ``read_table``, ``to_csv`` and + ``from_csv`` for non-ascii text (:issue:`717`) + +- :ref:`Added ` ``abs`` method to pandas objects + +- :ref:`Added ` ``crosstab`` function for easily computing frequency tables + +- :ref:`Added ` ``isin`` method to index objects + +- :ref:`Added ` ``level`` argument to ``xs`` method of DataFrame. 
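+
+As a brief illustration of the ``DataFrame.lookup`` method mentioned above (a
+minimal sketch; the row and column labels used here are purely illustrative):
+
+.. code-block:: python
+
+   df = DataFrame(randn(4, 3), columns=['A', 'B', 'C'])
+
+   # fancy-indexing analogue: returns one value per (row label, column label) pair
+   df.lookup([0, 1, 2, 3], ['A', 'C', 'B', 'A'])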
+ + +API Changes to integer indexing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the potentially riskiest API changes in 0.7.0, but also one of the most +important, was a complete review of how **integer indexes** are handled with +regard to label-based indexing. Here is an example: + +.. ipython:: python + + s = Series(randn(10), index=range(0, 20, 2)) + s + s[0] + s[2] + s[4] + +This is all exactly identical to the behavior before. However, if you ask for a +key **not** contained in the Series, in versions 0.6.1 and prior, Series would +*fall back* on a location-based lookup. This now raises a ``KeyError``: + +.. code-block:: ipython + + In [2]: s[1] + KeyError: 1 + +This change also has the same impact on DataFrame: + +.. code-block:: ipython + + In [3]: df = DataFrame(randn(8, 4), index=range(0, 16, 2)) + + In [4]: df + 0 1 2 3 + 0 0.88427 0.3363 -0.1787 0.03162 + 2 0.14451 -0.1415 0.2504 0.58374 + 4 -1.44779 -0.9186 -1.4996 0.27163 + 6 -0.26598 -2.4184 -0.2658 0.11503 + 8 -0.58776 0.3144 -0.8566 0.61941 + 10 0.10940 -0.7175 -1.0108 0.47990 + 12 -1.16919 -0.3087 -0.6049 -0.43544 + 14 -0.07337 0.3410 0.0424 -0.16037 + + In [5]: df.ix[3] + KeyError: 3 + +In order to support purely integer-based indexing, the following methods have +been added: + +.. csv-table:: + :header: "Method","Description" + :widths: 40,60 + + ``Series.iget_value(i)``, Retrieve value stored at location ``i`` + ``Series.iget(i)``, Alias for ``iget_value`` + ``DataFrame.irow(i)``, Retrieve the ``i``-th row + ``DataFrame.icol(j)``, Retrieve the ``j``-th column + "``DataFrame.iget_value(i, j)``", Retrieve the value at row ``i`` and column ``j`` + +API tweaks regarding label-based slicing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Label-based slicing using ``ix`` now requires that the index be sorted +(monotonic) **unless** both the start and endpoint are contained in the index: + +.. ipython:: python + + s = Series(randn(6), index=list('gmkaec')) + s + +Then this is OK: + +.. ipython:: python + + s.ix['k':'e'] + +But this is not: + +.. code-block:: ipython + + In [12]: s.ix['b':'h'] + KeyError 'b' + +If the index had been sorted, the "range selection" would have been possible: + +.. ipython:: python + + s2 = s.sort_index() + s2 + s2.ix['b':'h'] + +Changes to Series ``[]`` operator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As as notational convenience, you can pass a sequence of labels or a label +slice to a Series when getting and setting values via ``[]`` (i.e. the +``__getitem__`` and ``__setitem__`` methods). The behavior will be the same as +passing similar input to ``ix`` **except in the case of integer indexing**: + +.. ipython:: python + + s = Series(randn(6), index=list('acegkm')) + s + s[['m', 'a', 'c', 'e']] + s['b':'l'] + s['c':'k'] + +In the case of integer indexes, the behavior will be exactly as before +(shadowing ``ndarray``): + +.. ipython:: python + + s = Series(randn(6), index=range(0, 12, 2)) + s[[4, 0, 2]] + s[1:5] + +If you wish to do indexing with sequences and slicing on an integer index with +label semantics, use ``ix``. + +Other API Changes +~~~~~~~~~~~~~~~~~ + +- The deprecated ``LongPanel`` class has been completely removed + +- If ``Series.sort`` is called on a column of a DataFrame, an exception will + now be raised. 
Before it was possible to accidentally mutate a DataFrame's + column by doing ``df[col].sort()`` instead of the side-effect free method + ``df[col].order()`` (:issue:`316`) + +- Miscellaneous renames and deprecations which will (harmlessly) raise + ``FutureWarning`` + +- ``drop`` added as an optional parameter to ``DataFrame.reset_index`` (:issue:`699`) + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- :ref:`Cythonized GroupBy aggregations ` no longer + presort the data, thus achieving a significant speedup (:issue:`93`). GroupBy + aggregations with Python functions significantly sped up by clever + manipulation of the ndarray data type in Cython (:issue:`496`). +- Better error message in DataFrame constructor when passed column labels + don't match data (:issue:`497`) +- Substantially improve performance of multi-GroupBy aggregation when a + Python function is passed, reuse ndarray object in Cython (:issue:`496`) +- Can store objects indexed by tuples and floats in HDFStore (:issue:`492`) +- Don't print length by default in Series.to_string, add `length` option (:issue:`489`) +- Improve Cython code for multi-groupby to aggregate without having to sort + the data (:issue:`93`) +- Improve MultiIndex reindexing speed by storing tuples in the MultiIndex, + test for backwards unpickling compatibility +- Improve column reindexing performance by using specialized Cython take + function +- Further performance tweaking of Series.__getitem__ for standard use cases +- Avoid Index dict creation in some cases (i.e. when getting slices, etc.), + regression from prior versions +- Friendlier error message in setup.py if NumPy not installed +- Use common set of NA-handling operations (sum, mean, etc.) in Panel class + also (:issue:`536`) +- Default name assignment when calling ``reset_index`` on DataFrame with a + regular (non-hierarchical) index (:issue:`476`) +- Use Cythonized groupers when possible in Series/DataFrame stat ops with + ``level`` parameter passed (:issue:`545`) +- Ported skiplist data structure to C to speed up ``rolling_median`` by about + 5-10x in most typical use cases (:issue:`374`) + diff --git a/doc/source/v0.7.1.txt b/doc/source/v0.7.1.txt new file mode 100644 index 00000000..bc12cb8d --- /dev/null +++ b/doc/source/v0.7.1.txt @@ -0,0 +1,30 @@ +.. _whatsnew_0701: + +v.0.7.1 (February 29, 2012) +--------------------------- + +This release includes a few new features and addresses over a dozen bugs in +0.7.0. 
+ +New features +~~~~~~~~~~~~ + + - Add ``to_clipboard`` function to pandas namespace for writing objects to + the system clipboard (:issue:`774`) + - Add ``itertuples`` method to DataFrame for iterating through the rows of a + dataframe as tuples (:issue:`818`) + - Add ability to pass fill_value and method to DataFrame and Series align + method (:issue:`806`, :issue:`807`) + - Add fill_value option to reindex, align methods (:issue:`784`) + - Enable concat to produce DataFrame from Series (:issue:`787`) + - Add ``between`` method to Series (:issue:`802`) + - Add HTML representation hook to DataFrame for the IPython HTML notebook + (:issue:`773`) + - Support for reading Excel 2007 XML documents using openpyxl + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + - Improve performance and memory usage of fillna on DataFrame + - Can concatenate a list of Series along axis=1 to obtain a DataFrame (:issue:`787`) + diff --git a/doc/source/v0.7.2.txt b/doc/source/v0.7.2.txt new file mode 100644 index 00000000..c7116393 --- /dev/null +++ b/doc/source/v0.7.2.txt @@ -0,0 +1,27 @@ +.. _whatsnew_0702: + +v.0.7.2 (March 16, 2012) +--------------------------- + +This release targets bugs in 0.7.1, and adds a few minor features. + +New features +~~~~~~~~~~~~ + + - Add additional tie-breaking methods in DataFrame.rank (:issue:`874`) + - Add ascending parameter to rank in Series, DataFrame (:issue:`875`) + - Add coerce_float option to DataFrame.from_records (:issue:`893`) + - Add sort_columns parameter to allow unsorted plots (:issue:`918`) + - Enable column access via attributes on GroupBy (:issue:`882`) + - Can pass dict of values to DataFrame.fillna (:issue:`661`) + - Can select multiple hierarchical groups by passing list of values in .ix + (:issue:`134`) + - Add ``axis`` option to DataFrame.fillna (:issue:`174`) + - Add level keyword to ``drop`` for dropping values from a level (:issue:`159`) + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + - Use khash for Series.value_counts, add raw function to algorithms.py (:issue:`861`) + - Intercept __builtin__.sum in groupby (:issue:`885`) + diff --git a/doc/source/v0.7.3.txt b/doc/source/v0.7.3.txt new file mode 100644 index 00000000..afb4b8fa --- /dev/null +++ b/doc/source/v0.7.3.txt @@ -0,0 +1,96 @@ +.. _whatsnew_0703: + +v.0.7.3 (April 12, 2012) +------------------------ + +This is a minor release from 0.7.2 and fixes many minor bugs and adds a number +of nice new features. There are also a couple of API changes to note; these +should not affect very many users, and we are inclined to call them "bug fixes" +even though they do constitute a change in behavior. See the :ref:`full release +notes ` or issue +tracker on GitHub for a complete list. + +New features +~~~~~~~~~~~~ + +- New :ref:`fixed width file reader `, ``read_fwf`` +- New :ref:`scatter_matrix ` function for making + a scatter plot matrix + +.. code-block:: python + + from pandas.tools.plotting import scatter_matrix + scatter_matrix(df, alpha=0.2) + +.. image:: _static/scatter_matrix_kde.png + :width: 5in + +- Add ``stacked`` argument to Series and DataFrame's ``plot`` method for + :ref:`stacked bar plots `. + +.. code-block:: python + + df.plot(kind='bar', stacked=True) + +.. image:: _static/bar_plot_stacked_ex.png + :width: 4in + +.. code-block:: python + + df.plot(kind='barh', stacked=True) + +.. 
image:: _static/barh_plot_stacked_ex.png + :width: 4in + +- Add log x and y :ref:`scaling options ` to + ``DataFrame.plot`` and ``Series.plot`` +- Add ``kurt`` methods to Series and DataFrame for computing kurtosis + + +NA Boolean Comparison API Change +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Reverted some changes to how NA values (represented typically as ``NaN`` or +``None``) are handled in non-numeric Series: + +.. ipython:: python + + series = Series(['Steve', np.nan, 'Joe']) + series == 'Steve' + series != 'Steve' + +In comparisons, NA / NaN will always come through as ``False`` except with +``!=`` which is ``True``. *Be very careful* with boolean arithmetic, especially +negation, in the presence of NA data. You may wish to add an explicit NA +filter into boolean array operations if you are worried about this: + +.. ipython:: python + + mask = series == 'Steve' + series[mask & series.notnull()] + +While propagating NA in comparisons may seem like the right behavior to some +users (and you could argue on purely technical grounds that this is the right +thing to do), the evaluation was made that propagating NA everywhere, including +in numerical arrays, would cause a large amount of problems for users. Thus, a +"practicality beats purity" approach was taken. This issue may be revisited at +some point in the future. + +Other API Changes +~~~~~~~~~~~~~~~~~ + +When calling ``apply`` on a grouped Series, the return value will also be a +Series, to be more consistent with the ``groupby`` behavior with DataFrame: + +.. ipython:: python + + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : np.random.randn(8), 'D' : np.random.randn(8)}) + df + grouped = df.groupby('A')['C'] + grouped.describe() + grouped.apply(lambda x: x.order()[-2:]) # top 2 values + diff --git a/doc/source/v0.8.0.txt b/doc/source/v0.8.0.txt new file mode 100644 index 00000000..a76c4e48 --- /dev/null +++ b/doc/source/v0.8.0.txt @@ -0,0 +1,274 @@ +.. _whatsnew_080: + +v0.8.0 (June 29, 2012) +------------------------ + +This is a major release from 0.7.3 and includes extensive work on the time +series handling and processing infrastructure as well as a great deal of new +functionality throughout the library. It includes over 700 commits from more +than 20 distinct authors. Most pandas 0.7.3 and earlier users should not +experience any issues upgrading, but due to the migration to the NumPy +datetime64 dtype, there may be a number of bugs and incompatibilities +lurking. Lingering incompatibilities will be fixed ASAP in a 0.8.1 release if +necessary. See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list. + +Support for non-unique indexes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +All objects can now work with non-unique indexes. Data alignment / join +operations work according to SQL join semantics (including, if application, +index duplication in many-to-many joins) + +NumPy datetime64 dtype and 1.6 dependency +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Time series data are now represented using NumPy's datetime64 dtype; thus, +pandas 0.8.0 now requires at least NumPy 1.6. It has been tested and verified +to work with the development version (1.7+) of NumPy as well which includes +some significant user-facing API changes. 
NumPy 1.6 also has a number of bugs +having to do with nanosecond resolution data, so I recommend that you steer +clear of NumPy 1.6's datetime64 API functions (though limited as they are) and +only interact with this data using the interface that pandas provides. + +See the end of the 0.8.0 section for a "porting" guide listing potential issues +for users migrating legacy codebases from pandas 0.7 or earlier to 0.8.0. + +Bug fixes to the 0.7.x series for legacy NumPy < 1.6 users will be provided as +they arise. There will be no more further development in 0.7.x beyond bug +fixes. + +Time series changes and improvements +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + With this release, legacy scikits.timeseries users should be able to port + their code to use pandas. + +.. note:: + + See :ref:`documentation ` for overview of pandas timeseries API. + +- New datetime64 representation **speeds up join operations and data + alignment**, **reduces memory usage**, and improve serialization / + deserialization performance significantly over datetime.datetime +- High performance and flexible **resample** method for converting from + high-to-low and low-to-high frequency. Supports interpolation, user-defined + aggregation functions, and control over how the intervals and result labeling + are defined. A suite of high performance Cython/C-based resampling functions + (including Open-High-Low-Close) have also been implemented. +- Revamp of :ref:`frequency aliases ` and support for + **frequency shortcuts** like '15min', or '1h30min' +- New :ref:`DatetimeIndex class ` supports both fixed + frequency and irregular time + series. Replaces now deprecated DateRange class +- New ``PeriodIndex`` and ``Period`` classes for representing + :ref:`time spans ` and performing **calendar logic**, + including the `12 fiscal quarterly frequencies `. + This is a partial port of, and a substantial enhancement to, + elements of the scikits.timeseries codebase. Support for conversion between + PeriodIndex and DatetimeIndex +- New Timestamp data type subclasses `datetime.datetime`, providing the same + interface while enabling working with nanosecond-resolution data. Also + provides :ref:`easy time zone conversions `. +- Enhanced support for :ref:`time zones `. Add + `tz_convert` and ``tz_lcoalize`` methods to TimeSeries and DataFrame. All + timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time + zone set will be localized to localtime. Time zone conversions are therefore + essentially free. User needs to know very little about pytz library now; only + time zone names as as strings are required. Time zone-aware timestamps are + equal if and only if their UTC timestamps match. Operations between time + zone-aware time series with different time zones will result in a UTC-indexed + time series. +- Time series **string indexing conveniences** / shortcuts: slice years, year + and month, and index values with strings +- Enhanced time series **plotting**; adaptation of scikits.timeseries + matplotlib-based plotting code +- New ``date_range``, ``bdate_range``, and ``period_range`` :ref:`factory + functions ` +- Robust **frequency inference** function `infer_freq` and ``inferred_freq`` + property of DatetimeIndex, with option to infer frequency on construction of + DatetimeIndex +- to_datetime function efficiently **parses array of strings** to + DatetimeIndex. 
DatetimeIndex will parse array or list of strings to + datetime64 +- **Optimized** support for datetime64-dtype data in Series and DataFrame + columns +- New NaT (Not-a-Time) type to represent **NA** in timestamp arrays +- Optimize Series.asof for looking up **"as of" values** for arrays of + timestamps +- Milli, Micro, Nano date offset objects +- Can index time series with datetime.time objects to select all data at + particular **time of day** (``TimeSeries.at_time``) or **between two times** + (``TimeSeries.between_time``) +- Add :ref:`tshift ` method for leading/lagging + using the frequency (if any) of the index, as opposed to a naive lead/lag + using shift + +Other new features +~~~~~~~~~~~~~~~~~~ + +- New :ref:`cut ` and ``qcut`` functions (like R's cut + function) for computing a categorical variable from a continuous variable by + binning values either into value-based (``cut``) or quantile-based (``qcut``) + bins +- Rename ``Factor`` to ``Categorical`` and add a number of usability features +- Add :ref:`limit ` argument to fillna/reindex +- More flexible multiple function application in GroupBy, and can pass list + (name, function) tuples to get result in particular order with given names +- Add flexible :ref:`replace ` method for efficiently + substituting values +- Enhanced :ref:`read_csv/read_table ` for reading time series + data and converting multiple columns to dates +- Add :ref:`comments ` option to parser functions: read_csv, etc. +- Add :ref`dayfirst ` option to parser functions for parsing + international DD/MM/YYYY dates +- Allow the user to specify the CSV reader :ref:`dialect ` to + control quoting etc. +- Handling :ref:`thousands ` separators in read_csv to improve + integer parsing. +- Enable unstacking of multiple levels in one shot. Alleviate ``pivot_table`` + bugs (empty columns being introduced) +- Move to klib-based hash tables for indexing; better performance and less + memory usage than Python's dict +- Add first, last, min, max, and prod optimized GroupBy functions +- New :ref:`ordered_merge ` function +- Add flexible :ref:`comparison ` instance methods eq, ne, lt, + gt, etc. to DataFrame, Series +- Improve :ref:`scatter_matrix ` plotting + function and add histogram or kernel density estimates to diagonal +- Add :ref:`'kde' ` plot option for density plots +- Support for converting DataFrame to R data.frame through rpy2 +- Improved support for complex numbers in Series and DataFrame +- Add :ref:`pct_change ` method to all data structures +- Add max_colwidth configuration option for DataFrame console output +- :ref:`Interpolate ` Series values using index values +- Can select multiple columns from GroupBy +- Add :ref:`update ` methods to Series/DataFrame + for updating values in place +- Add ``any`` and ``all`` method to DataFrame + +New plotting methods +~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + :suppress: + + import pandas as pd + fx = pd.read_pickle('data/fx_prices') + import matplotlib.pyplot as plt + +``Series.plot`` now supports a ``secondary_y`` option: + +.. ipython:: python + + plt.figure() + + fx['FR'].plot(style='g') + + @savefig whatsnew_secondary_y.png + fx['IT'].plot(style='k--', secondary_y=True) + +Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot +types. For example, ``'kde'`` is a new option: + +.. 
ipython:: python + + s = Series(np.concatenate((np.random.randn(1000), + np.random.randn(1000) * 0.5 + 3))) + plt.figure() + s.hist(normed=True, alpha=0.2) + @savefig whatsnew_kde.png + s.plot(kind='kde') + +See :ref:`the plotting page ` for much more. + +Other API changes +~~~~~~~~~~~~~~~~~ + +- Deprecation of ``offset``, ``time_rule``, and ``timeRule`` arguments names in + time series functions. Warnings will be printed until pandas 0.9 or 1.0. + +Potential porting issues for pandas <= 0.7.3 users +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The major change that may affect you in pandas 0.8.0 is that time series +indexes use NumPy's ``datetime64`` data type instead of ``dtype=object`` arrays +of Python's built-in ``datetime.datetime`` objects. ``DateRange`` has been +replaced by ``DatetimeIndex`` but otherwise behaved identically. But, if you +have code that converts ``DateRange`` or ``Index`` objects that used to contain +``datetime.datetime`` values to plain NumPy arrays, you may have bugs lurking +with code using scalar values because you are handing control over to NumPy: + +.. ipython:: python + + import datetime + rng = date_range('1/1/2000', periods=10) + rng[5] + isinstance(rng[5], datetime.datetime) + rng_asarray = np.asarray(rng) + scalar_val = rng_asarray[5] + type(scalar_val) + +pandas's ``Timestamp`` object is a subclass of ``datetime.datetime`` that has +nanosecond support (the ``nanosecond`` field store the nanosecond value between +0 and 999). It should substitute directly into any code that used +``datetime.datetime`` values before. Thus, I recommend not casting +``DatetimeIndex`` to regular NumPy arrays. + +If you have code that requires an array of ``datetime.datetime`` objects, you +have a couple of options. First, the ``asobject`` property of ``DatetimeIndex`` +produces an array of ``Timestamp`` objects: + +.. ipython:: python + + stamp_array = rng.asobject + stamp_array + stamp_array[5] + +To get an array of proper ``datetime.datetime`` objects, use the +``to_pydatetime`` method: + +.. ipython:: python + + dt_array = rng.to_pydatetime() + dt_array + dt_array[5] + +matplotlib knows how to handle ``datetime.datetime`` but not Timestamp +objects. While I recommend that you plot time series using ``TimeSeries.plot``, +you can either use ``to_pydatetime`` or register a converter for the Timestamp +type. See `matplotlib documentation +`__ for more on this. + +.. warning:: + + There are bugs in the user-facing API with the nanosecond datetime64 unit + in NumPy 1.6. In particular, the string version of the array shows garbage + values, and conversion to ``dtype=object`` is similarly broken. + + .. ipython:: python + + rng = date_range('1/1/2000', periods=10) + rng + np.asarray(rng) + converted = np.asarray(rng, dtype=object) + converted[5] + + **Trust me: don't panic**. If you are using NumPy 1.6 and restrict your + interaction with ``datetime64`` values to pandas's API you will be just + fine. There is nothing wrong with the data-type (a 64-bit integer + internally); all of the important data processing happens in pandas and is + heavily tested. I strongly recommend that you **do not work directly with + datetime64 arrays in NumPy 1.6** and only use the pandas API. + + +**Support for non-unique indexes**: In the latter case, you may have code +inside a ``try:... catch:`` block that failed due to the index not being +unique. In many cases it will no longer fail (some method like ``append`` still +check for uniqueness unless disabled). 
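+For instance, code that previously relied on catching an exception can check
+for duplicate labels up front; a minimal sketch with made-up data:
+
+.. code-block:: python
+
+   # sketch: branch on Index.is_unique instead of relying on an exception
+   from pandas import Series
+
+   s = Series([1, 2, 3], index=['a', 'a', 'b'])   # duplicate index labels
+
+   if not s.index.is_unique:
+       raise ValueError("expected a unique index")
+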
However, all is not lost: you can +inspect ``index.is_unique`` and raise an exception explicitly if it is +``False`` or go to a different code branch. + diff --git a/doc/source/v0.8.1.txt b/doc/source/v0.8.1.txt new file mode 100644 index 00000000..cecf6f16 --- /dev/null +++ b/doc/source/v0.8.1.txt @@ -0,0 +1,36 @@ +.. _whatsnew_0801: + +v0.8.1 (July 22, 2012) +---------------------- + +This release includes a few new features, performance enhancements, and over 30 +bug fixes from 0.8.0. New features include notably NA friendly string +processing functionality and a series of new plot types and options. + +New features +~~~~~~~~~~~~ + + - Add :ref:`vectorized string processing methods ` + accessible via Series.str (:issue:`620`) + - Add option to disable adjustment in EWMA (:issue:`1584`) + - :ref:`Radviz plot ` (:issue:`1566`) + - :ref:`Parallel coordinates plot ` + - :ref:`Bootstrap plot ` + - Per column styles and secondary y-axis plotting (:issue:`1559`) + - New datetime converters millisecond plotting (:issue:`1599`) + - Add option to disable "sparse" display of hierarchical indexes (:issue:`1538`) + - Series/DataFrame's ``set_index`` method can :ref:`append levels + ` to an existing Index/MultiIndex (:issue:`1569`, :issue:`1577`) + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + - Improved implementation of rolling min and max (thanks to `Bottleneck + `__ !) + - Add accelerated ``'median'`` GroupBy option (:issue:`1358`) + - Significantly improve the performance of parsing ISO8601-format date + strings with ``DatetimeIndex`` or ``to_datetime`` (:issue:`1571`) + - Improve the performance of GroupBy on single-key aggregations and use with + Categorical types + - Significant datetime parsing performance improvments + diff --git a/doc/source/v0.9.0.txt b/doc/source/v0.9.0.txt new file mode 100644 index 00000000..2b385a7e --- /dev/null +++ b/doc/source/v0.9.0.txt @@ -0,0 +1,97 @@ +.. _whatsnew_0900: + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +v0.9.0 (October 7, 2012) +------------------------ + +This is a major release from 0.8.1 and includes several new features and +enhancements along with a large number of bug fixes. New features include +vectorized unicode encoding/decoding for `Series.str`, `to_latex` method to +DataFrame, more flexible parsing of boolean values, and enabling the download of +options data from Yahoo! Finance. + +New features +~~~~~~~~~~~~ + + - Add ``encode`` and ``decode`` for unicode handling to :ref:`vectorized + string processing methods ` in Series.str (:issue:`1706`) + - Add ``DataFrame.to_latex`` method (:issue:`1735`) + - Add convenient expanding window equivalents of all rolling_* ops (:issue:`1785`) + - Add Options class to pandas.io.data for fetching options data from Yahoo! + Finance (:issue:`1748`, :issue:`1739`) + - More flexible parsing of boolean values (Yes, No, TRUE, FALSE, etc) + (:issue:`1691`, :issue:`1295`) + - Add ``level`` parameter to ``Series.reset_index`` + - ``TimeSeries.between_time`` can now select times across midnight (:issue:`1871`) + - Series constructor can now handle generator as input (:issue:`1679`) + - ``DataFrame.dropna`` can now take multiple axes (tuple/list) as input + (:issue:`924`) + - Enable ``skip_footer`` parameter in ``ExcelFile.parse`` (:issue:`1843`) + +API changes +~~~~~~~~~~~ + + - The default column names when ``header=None`` and no columns names passed to + functions like ``read_csv`` has changed to be more Pythonic and amenable to + attribute access: + +.. 
ipython:: python + + data = '0,0,1\n1,1,0\n0,1,0' + df = read_csv(StringIO(data), header=None) + df + + +- Creating a Series from another Series, passing an index, will cause reindexing + to happen inside rather than treating the Series like an ndarray. Technically + improper usages like ``Series(df[col1], index=df[col2])`` that worked before + "by accident" (this was never intended) will lead to all NA Series in some + cases. To be perfectly clear: + +.. ipython:: python + + s1 = Series([1, 2, 3]) + s1 + + s2 = Series(s1, index=['foo', 'bar', 'baz']) + s2 + +- Deprecated ``day_of_year`` API removed from PeriodIndex, use ``dayofyear`` + (:issue:`1723`) + +- Don't modify NumPy suppress printoption to True at import time + +- The internal HDF5 data arrangement for DataFrames has been transposed. Legacy + files will still be readable by HDFStore (:issue:`1834`, :issue:`1824`) + +- Legacy cruft removed: pandas.stats.misc.quantileTS + +- Use ISO8601 format for Period repr: monthly, daily, and on down (:issue:`1776`) + +- Empty DataFrame columns are now created as object dtype. This will prevent a + class of TypeErrors that was occurring in code where the dtype of a column + would depend on the presence of data or not (e.g. a SQL query having results) + (:issue:`1783`) + +- Setting parts of DataFrame/Panel using ix now aligns input Series/DataFrame + (:issue:`1630`) + +- ``first`` and ``last`` methods in ``GroupBy`` no longer drop non-numeric + columns (:issue:`1809`) + +- Resolved inconsistencies in specifying custom NA values in text parser. + ``na_values`` of type dict no longer override default NAs unless + ``keep_default_na`` is set to false explicitly (:issue:`1657`) + +- ``DataFrame.dot`` will not do data alignment, and also work with Series + (:issue:`1915`) + + +See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list. + diff --git a/doc/source/v0.9.1.txt b/doc/source/v0.9.1.txt new file mode 100644 index 00000000..6718a049 --- /dev/null +++ b/doc/source/v0.9.1.txt @@ -0,0 +1,145 @@ +.. _whatsnew_0901: + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +v0.9.1 (November 14, 2012) +-------------------------- + +This is a bugfix release from 0.9.0 and includes several new features and +enhancements along with a large number of bug fixes. The new features include +by-column sort order for DataFrame and Series, improved NA handling for the rank +method, masking functions for DataFrame, and intraday time-series filtering for +DataFrame. + +New features +~~~~~~~~~~~~ + + - `Series.sort`, `DataFrame.sort`, and `DataFrame.sort_index` can now be + specified in a per-column manner to support multiple sort orders (:issue:`928`) + + .. ipython:: python + + df = DataFrame(np.random.randint(0, 2, (6, 3)), columns=['A', 'B', 'C']) + + df.sort(['A', 'B'], ascending=[1, 0]) + + + - `DataFrame.rank` now supports additional argument values for the + `na_option` parameter so missing values can be assigned either the largest + or the smallest rank (:issue:`1508`, :issue:`2159`) + + .. ipython:: python + + df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + + df.ix[2:4] = np.nan + + df.rank() + + df.rank(na_option='top') + + df.rank(na_option='bottom') + + + - DataFrame has new `where` and `mask` methods to select values according to a + given boolean mask (:issue:`2109`, :issue:`2151`) + + DataFrame currently supports slicing via a boolean vector the same length as the DataFrame (inside the `[]`). 
+ The returned DataFrame has the same number of columns as the original, but is sliced on its index. + + .. ipython:: python + + df = DataFrame(np.random.randn(5, 3), columns = ['A','B','C']) + + df + + df[df['A'] > 0] + + If a DataFrame is sliced with a DataFrame based boolean condition (with the same size as the original DataFrame), + then a DataFrame the same size (index and columns) as the original is returned, with + elements that do not meet the boolean condition as `NaN`. This is accomplished via + the new method `DataFrame.where`. In addition, `where` takes an optional `other` argument for replacement. + + .. ipython:: python + + df[df>0] + + df.where(df>0) + + df.where(df>0,-df) + + Furthermore, `where` now aligns the input boolean condition (ndarray or DataFrame), such that partial selection + with setting is possible. This is analagous to partial setting via `.ix` (but on the contents rather than the axis labels) + + .. ipython:: python + + df2 = df.copy() + df2[ df2[1:4] > 0 ] = 3 + df2 + + `DataFrame.mask` is the inverse boolean operation of `where`. + + .. ipython:: python + + df.mask(df<=0) + + - Enable referencing of Excel columns by their column names (:issue:`1936`) + + .. ipython:: python + + xl = ExcelFile('data/test.xls') + xl.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A:D') + + + - Added option to disable pandas-style tick locators and formatters + using `series.plot(x_compat=True)` or `pandas.plot_params['x_compat'] = + True` (:issue:`2205`) + - Existing TimeSeries methods `at_time` and `between_time` were added to + DataFrame (:issue:`2149`) + - DataFrame.dot can now accept ndarrays (:issue:`2042`) + - DataFrame.drop now supports non-unique indexes (:issue:`2101`) + - Panel.shift now supports negative periods (:issue:`2164`) + - DataFrame now support unary ~ operator (:issue:`2110`) + +API changes +~~~~~~~~~~~ + + - Upsampling data with a PeriodIndex will result in a higher frequency + TimeSeries that spans the original time window + + .. ipython:: python + + prng = period_range('2012Q1', periods=2, freq='Q') + + s = Series(np.random.randn(len(prng)), prng) + + s.resample('M') + + + - Period.end_time now returns the last nanosecond in the time interval + (:issue:`2124`, :issue:`2125`, :issue:`1764`) + + .. ipython:: python + + p = Period('2012') + + p.end_time + + + - File parsers no longer coerce to float or bool for columns that have custom + converters specified (:issue:`2184`) + + .. ipython:: python + + data = 'A,B,C\n00001,001,5\n00002,002,6' + + read_csv(StringIO(data), converters={'A' : lambda x: x.strip()}) + + +See the :ref:`full release notes +` or issue tracker +on GitHub for a complete list. diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst new file mode 100644 index 00000000..630e40c4 --- /dev/null +++ b/doc/source/visualization.rst @@ -0,0 +1,1146 @@ +.. currentmodule:: pandas +.. _visualization: + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + from numpy.random import randn, rand, randint + np.random.seed(123456) + from pandas import DataFrame, Series, date_range, options + import pandas.util.testing as tm + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + options.display.mpl_style = 'default' + options.display.max_rows = 15 + from pandas.compat import lrange + +******** +Plotting +******** + +We use the standard convention for referencing the matplotlib API: + +.. ipython:: python + + import matplotlib.pyplot as plt + +.. 
versionadded:: 0.11.0 + +The ``display.mpl_style`` produces more appealing plots. +When set, matplotlib's ``rcParams`` are changed (globally!) to nicer-looking settings. +All the plots in the documentation are rendered with this option set to the +'default' style. + +.. ipython:: python + + pd.options.display.mpl_style = 'default' + +We provide the basics in pandas to easily create decent looking plots. +See the :ref:`ecosystem ` section for visualization +libraries that go beyond the basics documented here. + +.. note:: + + All calls to ``np.random`` are seeded with 123456. + +.. _visualization.basic: + +Basic Plotting: ``plot`` +------------------------ + +See the :ref:`cookbook` for some advanced strategies + +The ``plot`` method on Series and DataFrame is just a simple wrapper around +:meth:`plt.plot() `: + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + @savefig series_plot_basic.png + ts.plot() + +If the index consists of dates, it calls :meth:`gcf().autofmt_xdate() ` +to try to format the x-axis nicely as per above. + +On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the columns with labels: + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = df.cumsum() + + @savefig frame_plot_basic.png + plt.figure(); df.plot(); + +You can plot one column versus another using the `x` and `y` keywords in +:meth:`~DataFrame.plot`: + +.. ipython:: python + :suppress: + + plt.figure() + np.random.seed(123456) + +.. ipython:: python + + df3 = DataFrame(randn(1000, 2), columns=['B', 'C']).cumsum() + df3['A'] = Series(list(range(len(df)))) + + @savefig df_plot_xy.png + df3.plot(x='A', y='B') + +.. note:: + + For more formatting and sytling options, see :ref:`below `. + +.. ipython:: python + :suppress: + + plt.close('all') + +.. _visualization.other: + +Other Plots +----------- + +The ``kind`` keyword argument of :meth:`~DataFrame.plot` accepts +a handful of values for plots other than the default Line plot. +These include: + +* :ref:`'bar' ` or :ref:`'barh' ` for bar plots +* :ref:`'kde' ` or ``'density'`` for density plots +* :ref:`'area' ` for area plots +* :ref:`'scatter' ` for scatter plots +* :ref:`'hexbin' ` for hexagonal bin plots +* :ref:`'pie' ` for pie plots + +In addition to these ``kind`` s, there are the :ref:`DataFrame.hist() `, +and :ref:`DataFrame.boxplot() ` methods, which use a separate interface. + +Finally, there are several :ref:`plotting functions ` in ``pandas.tools.plotting`` +that take a :class:`Series` or :class:`DataFrame` as an argument. These +include + +* :ref:`Scatter Matrix ` +* :ref:`Andrews Curves ` +* :ref:`Parallel Coordinates ` +* :ref:`Lag Plot ` +* :ref:`Autocorrelation Plot ` +* :ref:`Bootstrap Plot ` +* :ref:`RadViz ` + +Plots may also be adorned with :ref:`errorbars ` +or :ref:`tables `. + +.. _visualization.barplot: + +Bar plots +~~~~~~~~~ + +For labeled, non-time series data, you may wish to produce a bar plot: + +.. ipython:: python + + plt.figure(); + + @savefig bar_plot_ex.png + df.ix[5].plot(kind='bar'); plt.axhline(0, color='k') + +Calling a DataFrame's :meth:`~DataFrame.plot` method with ``kind='bar'`` produces a multiple +bar plot: + +.. ipython:: python + :suppress: + + plt.figure() + np.random.seed(123456) + +.. 
ipython:: python + + df2 = DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd']) + + @savefig bar_plot_multi_ex.png + df2.plot(kind='bar'); + +To produce a stacked bar plot, pass ``stacked=True``: + +.. ipython:: python + :suppress: + + plt.figure() + +.. ipython:: python + + @savefig bar_plot_stacked_ex.png + df2.plot(kind='bar', stacked=True); + +To get horizontal bar plots, pass ``kind='barh'``: + +.. ipython:: python + :suppress: + + plt.figure() + +.. ipython:: python + + @savefig barh_plot_stacked_ex.png + df2.plot(kind='barh', stacked=True); + +.. _visualization.hist: + +Histograms +~~~~~~~~~~ +.. ipython:: python + + plt.figure(); + + @savefig hist_plot_ex.png + df['A'].diff().hist() + + +:meth:`DataFrame.hist` plots the histograms of the columns on multiple +subplots: + +.. ipython:: python + + plt.figure() + + @savefig frame_hist_ex.png + df.diff().hist(color='k', alpha=0.5, bins=50) + + +.. versionadded:: 0.10.0 + +The ``by`` keyword can be specified to plot grouped histograms: + +.. ipython:: python + :suppress: + + plt.figure() + np.random.seed(123456) + +.. ipython:: python + + data = Series(randn(1000)) + + @savefig grouped_hist.png + data.hist(by=randint(0, 4, 1000), figsize=(6, 4)) + + +.. _visualization.box: + +Box Plots +~~~~~~~~~ + +DataFrame has a :meth:`~DataFrame.boxplot` method that allows you to visualize the +distribution of values within each column. + +For instance, here is a boxplot representing five trials of 10 observations of +a uniform random variable on [0,1). + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + :okwarning: + + df = DataFrame(rand(10,5)) + plt.figure(); + + @savefig box_plot_ex.png + bp = df.boxplot() + +You can create a stratified boxplot using the ``by`` keyword argument to create +groupings. For instance, + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + :okwarning: + + df = DataFrame(rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + + plt.figure(); + + @savefig box_plot_ex2.png + bp = df.boxplot(by='X') + +You can also pass a subset of columns to plot, as well as group by multiple +columns: + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + :okwarning: + + df = DataFrame(rand(10,3), columns=['Col1', 'Col2', 'Col3']) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + df['Y'] = Series(['A','B','A','B','A','B','A','B','A','B']) + + plt.figure(); + + @savefig box_plot_ex3.png + bp = df.boxplot(column=['Col1','Col2'], by=['X','Y']) + +.. ipython:: python + :suppress: + + plt.close('all') + +.. _visualization.box.return: + +The return type of ``boxplot`` depends on two keyword arguments: ``by`` and ``return_type``. +When ``by`` is ``None``: + +* if ``return_type`` is ``'dict'``, a dictionary containing the :class:`matplotlib Lines ` is returned. The keys are "boxes", "caps", "fliers", "medians", and "whiskers". + This is the deafult. +* if ``return_type`` is ``'axes'``, a :class:`matplotlib Axes ` containing the boxplot is returned. +* if ``return_type`` is ``'both'`` a namedtuple containging the :class:`matplotlib Axes ` + and :class:`matplotlib Lines ` is returned + +When ``by`` is some column of the DataFrame, a dict of ``return_type`` is returned, where +the keys are the columns of the DataFrame. The plot has a facet for each column of +the DataFrame, with a separate box for each value of ``by``. 
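+A small sketch (with fabricated data) of the three ``return_type`` values
+listed above:
+
+.. code-block:: python
+
+   # sketch: the three return_type values when by is None
+   import numpy as np
+   from pandas import DataFrame
+
+   df_rt = DataFrame(np.random.randn(10, 2), columns=['Col1', 'Col2'])
+
+   lines = df_rt.boxplot(return_type='dict')   # dict of matplotlib Lines
+   ax = df_rt.boxplot(return_type='axes')      # a single matplotlib Axes
+   both = df_rt.boxplot(return_type='both')    # namedtuple of (ax, lines)
+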
+ +Finally, when calling boxplot on a :class:`Groupby` object, a dict of ``return_type`` +is returned, where the keys are the same as the Groupby object. The plot has a +facet for each key, with each facet containing a box for each column of the +DataFrame. + +.. ipython:: python + :okwarning: + + np.random.seed(1234) + df_box = DataFrame(np.random.randn(50, 2)) + df_box['g'] = np.random.choice(['A', 'B'], size=50) + df_box.loc[df_box['g'] == 'B', 1] += 3 + + @savefig boxplot_groupby.png + bp = df_box.boxplot(by='g') + +Compare to: + +.. ipython:: python + :okwarning: + + @savefig groupby_boxplot_vis.png + bp = df_box.groupby('g').boxplot() + +.. _visualization.area_plot: + +Area Plot +~~~~~~~~~ + +.. versionadded:: 0.14 + +You can create area plots with ``Series.plot`` and ``DataFrame.plot`` by passing ``kind='area'``. Area plots are stacked by default. To produce stacked area plot, each column must be either all positive or all negative values. + +When input data contains `NaN`, it will be automatically filled by 0. If you want to drop or fill by different values, use :func:`dataframe.dropna` or :func:`dataframe.fillna` before calling `plot`. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + plt.figure() + +.. ipython:: python + + df = DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd']) + + @savefig area_plot_stacked.png + df.plot(kind='area'); + +To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 unless otherwise specified: + +.. ipython:: python + :suppress: + + plt.figure() + +.. ipython:: python + + @savefig area_plot_unstacked.png + df.plot(kind='area', stacked=False); + +.. _visualization.hexbin: + +Hexagonal Bin Plot +~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14 + +You can create hexagonal bin plots with :meth:`DataFrame.plot` and +``kind='hexbin'``. +Hexbin plots can be a useful alternative to scatter plots if your data are +too dense to plot each point individually. + +.. ipython:: python + :suppress: + + plt.figure() + np.random.seed(123456) + +.. ipython:: python + + df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df['b'] = df['b'] + np.arange(1000) + + @savefig hexbin_plot.png + df.plot(kind='hexbin', x='a', y='b', gridsize=25) + + +A useful keyword argument is ``gridsize``; it controls the number of hexagons +in the x-direction, and defaults to 100. A larger ``gridsize`` means more, smaller +bins. + +By default, a histogram of the counts around each ``(x, y)`` point is computed. +You can specify alternative aggregations by passing values to the ``C`` and +``reduce_C_function`` arguments. ``C`` specifies the value at each ``(x, y)`` point +and ``reduce_C_function`` is a function of one argument that reduces all the +values in a bin to a single number (e.g. ``mean``, ``max``, ``sum``, ``std``). In this +example the positions are given by columns ``a`` and ``b``, while the value is +given by column ``z``. The bins are aggregated with numpy's ``max`` function. + +.. ipython:: python + :suppress: + + plt.figure() + np.random.seed(123456) + +.. ipython:: python + + df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df['b'] = df['b'] = df['b'] + np.arange(1000) + df['z'] = np.random.uniform(0, 3, 1000) + + @savefig hexbin_plot_agg.png + df.plot(kind='hexbin', x='a', y='b', C='z', reduce_C_function=np.max, + gridsize=25) + + +See the :meth:`hexbin ` method and the +`matplotlib hexbin documenation `__ for more. + +.. _visualization.pie: + +Pie plot +~~~~~~~~ + +.. 
versionadded:: 0.14 + +You can create a pie plot with :meth:`DataFrame.plot` or :meth:`Series.plot` with ``kind='pie'``. +If your data includes any ``NaN``, they will be automatically filled with 0. +A ``ValueError`` will be raised if there are any negative values in your data. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + plt.figure() + +.. ipython:: python + + series = Series(3 * rand(4), index=['a', 'b', 'c', 'd'], name='series') + + @savefig series_pie_plot.png + series.plot(kind='pie') + +Note that pie plot with :class:`DataFrame` requires that you either specify a target column by the ``y`` +argument or ``subplots=True``. When ``y`` is specified, pie plot of selected column +will be drawn. If ``subplots=True`` is specified, pie plots for each column are drawn as subplots. +A legend will be drawn in each pie plots by default; specify ``legend=False`` to hide it. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + plt.figure() + +.. ipython:: python + + df = DataFrame(3 * rand(4, 2), index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + + @savefig df_pie_plot.png + df.plot(kind='pie', subplots=True) + +You can use the ``labels`` and ``colors`` keywords to specify the labels and colors of each wedge. + +.. warning:: + + Most pandas plots use the the ``label`` and ``color`` arguments (not the lack of "s" on those). + To be consistent with :func:`matplotlib.pyplot.pie` you must use ``labels`` and ``colors``. + +If you want to hide wedge labels, specify ``labels=None``. +If ``fontsize`` is specified, the value will be applied to wedge labels. +Also, other keywords supported by :func:`matplotlib.pyplot.pie` can be used. + + +.. ipython:: python + :suppress: + + plt.figure() + +.. ipython:: python + + @savefig series_pie_plot_options.png + series.plot(kind='pie', labels=['AA', 'BB', 'CC', 'DD'], colors=['r', 'g', 'b', 'c'], + autopct='%.2f', fontsize=20) + +If you pass values whose sum total is less than 1.0, matplotlib draws a semicircle. + +.. ipython:: python + :suppress: + + plt.figure() + +.. ipython:: python + + series = Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') + + @savefig series_pie_plot_semi.png + series.plot(kind='pie') + +See the `matplotlib pie documenation `__ for more. + +.. ipython:: python + :suppress: + + plt.close('all') + +.. _visualization.tools: + +Plotting Tools +-------------- + +These functions can be imported from ``pandas.tools.plotting`` +and take a :class:`Series` or :class:`DataFrame` as an argument. + +.. _visualization.scatter_matrix: + +Scatter Matrix Plot +~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.7.3 + +You can create a scatter plot matrix using the + ``scatter_matrix`` method in ``pandas.tools.plotting``: + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + from pandas.tools.plotting import scatter_matrix + df = DataFrame(randn(1000, 4), columns=['a', 'b', 'c', 'd']) + + @savefig scatter_matrix_kde.png + scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde') + +.. _visualization.kde: + +Density Plot +~~~~~~~~~~~~ + +.. versionadded:: 0.8.0 + +You can create density plots using the Series/DataFrame.plot and +setting ``kind='kde'``: + +.. ipython:: python + :suppress: + + plt.figure() + np.random.seed(123456) + +.. ipython:: python + + ser = Series(randn(1000)) + + @savefig kde_plot.png + ser.plot(kind='kde') + +.. 
_visualization.andrews_curves: + +Andrews Curves +~~~~~~~~~~~~~~ + +Andrews curves allow one to plot multivariate data as a large number +of curves that are created using the attributes of samples as coefficients +for Fourier series. By coloring these curves differently for each class +it is possible to visualize data clustering. Curves belonging to samples +of the same class will usually be closer together and form larger structures. + +**Note**: The "Iris" dataset is available `here `__. + +.. ipython:: python + + from pandas import read_csv + from pandas.tools.plotting import andrews_curves + + data = read_csv('data/iris.data') + + plt.figure() + + @savefig andrews_curves.png + andrews_curves(data, 'Name') + +.. _visualization.parallel_coordinates: + +Parallel Coordinates +~~~~~~~~~~~~~~~~~~~~ + +Parallel coordinates is a plotting technique for plotting multivariate data. +It allows one to see clusters in data and to estimate other statistics visually. +Using parallel coordinates points are represented as connected line segments. +Each vertical line represents one attribute. One set of connected line segments +represents one data point. Points that tend to cluster will appear closer together. + +.. ipython:: python + + from pandas import read_csv + from pandas.tools.plotting import parallel_coordinates + + data = read_csv('data/iris.data') + + plt.figure() + + @savefig parallel_coordinates.png + parallel_coordinates(data, 'Name') + +.. _visualization.lag: + +Lag Plot +~~~~~~~~ + +Lag plots are used to check if a data set or time series is random. Random +data should not exhibit any structure in the lag plot. Non-random structure +implies that the underlying data are not random. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + from pandas.tools.plotting import lag_plot + + plt.figure() + + data = Series(0.1 * rand(1000) + + 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) + + @savefig lag_plot.png + lag_plot(data) + +.. _visualization.autocorrelation: + +Autocorrelation Plot +~~~~~~~~~~~~~~~~~~~~ + +Autocorrelation plots are often used for checking randomness in time series. +This is done by computing autocorrelations for data values at varying time lags. +If time series is random, such autocorrelations should be near zero for any and +all time-lag separations. If time series is non-random then one or more of the +autocorrelations will be significantly non-zero. The horizontal lines displayed +in the plot correspond to 95% and 99% confidence bands. The dashed line is 99% +confidence band. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + from pandas.tools.plotting import autocorrelation_plot + + plt.figure() + + data = Series(0.7 * rand(1000) + + 0.3 * np.sin(np.linspace(-9 * np.pi, 9 * np.pi, num=1000))) + + @savefig autocorrelation_plot.png + autocorrelation_plot(data) + +.. _visualization.bootstrap: + +Bootstrap Plot +~~~~~~~~~~~~~~ + +Bootstrap plots are used to visually assess the uncertainty of a statistic, such +as mean, median, midrange, etc. A random subset of a specified size is selected +from a data set, the statistic in question is computed for this subset and the +process is repeated a specified number of times. Resulting plots and histograms +are what constitutes the bootstrap plot. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. 
ipython:: python + + from pandas.tools.plotting import bootstrap_plot + + data = Series(rand(1000)) + + @savefig bootstrap_plot.png + bootstrap_plot(data, size=50, samples=500, color='grey') + +.. ipython:: python + :suppress: + + plt.close('all') + +.. _visualization.radviz: + +RadViz +~~~~~~ + +RadViz is a way of visualizing multi-variate data. It is based on a simple +spring tension minimization algorithm. Basically you set up a bunch of points in +a plane. In our case they are equally spaced on a unit circle. Each point +represents a single attribute. You then pretend that each sample in the data set +is attached to each of these points by a spring, the stiffness of which is +proportional to the numerical value of that attribute (they are normalized to +unit interval). The point in the plane, where our sample settles to (where the +forces acting on our sample are at an equilibrium) is where a dot representing +our sample will be drawn. Depending on which class that sample belongs it will +be colored differently. + +**Note**: The "Iris" dataset is available `here `__. + +.. ipython:: python + + from pandas import read_csv + from pandas.tools.plotting import radviz + + data = read_csv('data/iris.data') + + plt.figure() + + @savefig radviz.png + radviz(data, 'Name') + +.. _visualization.formatting: + +Plot Formatting +--------------- + +Most plotting methods have a set of keyword arguments that control the +layout and formatting of the returned plot: + +.. ipython:: python + + @savefig series_plot_basic2.png + plt.figure(); ts.plot(style='k--', label='Series'); + +For each kind of plot (e.g. `line`, `bar`, `scatter`) any additional arguments +keywords are passed alogn to the corresponding matplotlib function +(:meth:`ax.plot() `, +:meth:`ax.bar() `, +:meth:`ax.scatter() `). These can be used +to control additional styling, beyond what pandas provides. + +Controlling the Legend +~~~~~~~~~~~~~~~~~~~~~~ + +You may set the ``legend`` argument to ``False`` to hide the legend, which is +shown by default. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = df.cumsum() + + @savefig frame_plot_basic_noleg.png + df.plot(legend=False) + +Scales +~~~~~~ + +You may pass ``logy`` to get a log-scale Y axis. + +.. ipython:: python + :suppress: + + plt.figure() + np.random.seed(123456) + + +.. ipython:: python + + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = np.exp(ts.cumsum()) + + @savefig series_plot_logy.png + ts.plot(logy=True) + +See also the ``logx`` and ``loglog`` keyword arguments. + +Plotting on a Secondary Y-axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To plot data on a secondary y-axis, use the ``secondary_y`` keyword: + +.. ipython:: python + :suppress: + + plt.figure() + +.. ipython:: python + + df.A.plot() + + @savefig series_plot_secondary_y.png + df.B.plot(secondary_y=True, style='g') + +To plot some columns in a DataFrame, give the column names to the ``secondary_y`` +keyword: + +.. ipython:: python + + plt.figure() + ax = df.plot(secondary_y=['A', 'B']) + ax.set_ylabel('CD scale') + @savefig frame_plot_secondary_y.png + ax.right_ax.set_ylabel('AB scale') + + +Note that the columns plotted on the secondary y-axis is automatically marked +with "(right)" in the legend. To turn off the automatic marking, use the +``mark_right=False`` keyword: + +.. 
ipython:: python + + plt.figure() + + @savefig frame_plot_secondary_y_no_right.png + df.plot(secondary_y=['A', 'B'], mark_right=False) + + +Suppressing Tick Resolution Adjustment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +pandas includes automatically tick resolution adjustment for regular frequency +time-series data. For limited cases where pandas cannot infer the frequency +information (e.g., in an externally created ``twinx``), you can choose to +suppress this behavior for alignment purposes. + +Here is the default behavior, notice how the x-axis tick labelling is performed: + +.. ipython:: python + + plt.figure() + + @savefig ser_plot_suppress.png + df.A.plot() + + +Using the ``x_compat`` parameter, you can suppress this behavior: + +.. ipython:: python + + plt.figure() + + @savefig ser_plot_suppress_parm.png + df.A.plot(x_compat=True) + + +If you have more than one plot that needs to be suppressed, the ``use`` method +in ``pandas.plot_params`` can be used in a `with statement`: + +.. ipython:: python + + import pandas as pd + + plt.figure() + + @savefig ser_plot_suppress_context.png + with pd.plot_params.use('x_compat', True): + df.A.plot(color='r') + df.B.plot(color='g') + df.C.plot(color='b') + +Subplots +~~~~~~~~ + +Each Series in a DataFrame can be plotted on a different axis +with the ``subplots`` keyword: + +.. ipython:: python + + @savefig frame_plot_subplots.png + df.plot(subplots=True, figsize=(6, 6)); + +Targeting Different Subplots +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can pass an ``ax`` argument to :meth:`Series.plot` to plot on a particular axis: + +.. ipython:: python + :suppress: + + np.random.seed(123456) + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = df.cumsum() + +.. ipython:: python + + fig, axes = plt.subplots(nrows=2, ncols=2) + df['A'].plot(ax=axes[0,0]); axes[0,0].set_title('A') + df['B'].plot(ax=axes[0,1]); axes[0,1].set_title('B') + df['C'].plot(ax=axes[1,0]); axes[1,0].set_title('C') + + @savefig series_plot_multi.png + df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D') + +.. ipython:: python + :suppress: + + plt.close('all') + +.. _visualization.errorbars: + +Plotting With Error Bars +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14 + +Plotting with error bars is now supported in the :meth:`DataFrame.plot` and :meth:`Series.plot` + +Horizontal and vertical errorbars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot()`. The error values can be specified using a variety of formats. + +- As a :class:`DataFrame` or ``dict`` of errors with column names matching the ``columns`` attribute of the plotting :class:`DataFrame` or matching the ``name`` attribute of the :class:`Series` +- As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values +- As raw values (``list``, ``tuple``, or ``np.ndarray``). Must be the same length as the plotting :class:`DataFrame`/:class:`Series` + +Asymmetrical error bars are also supported, however raw error values must be provided in this case. For a ``M`` length :class:`Series`, a ``Mx2`` array should be provided indicating lower and upper (or left and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors should be in a ``Mx2xN`` array. + +Here is an example of one way to easily plot group means with standard deviations from the raw data. + +.. 
ipython:: python + + # Generate the data + ix3 = pd.MultiIndex.from_arrays([['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'], ['foo', 'foo', 'bar', 'bar', 'foo', 'foo', 'bar', 'bar']], names=['letter', 'word']) + df3 = pd.DataFrame({'data1': [3, 2, 4, 3, 2, 4, 3, 2], 'data2': [6, 5, 7, 5, 4, 5, 6, 5]}, index=ix3) + + # Group by index labels and take the means and standard deviations for each group + gp3 = df3.groupby(level=('letter', 'word')) + means = gp3.mean() + errors = gp3.std() + means + errors + + # Plot + fig, ax = plt.subplots() + @savefig errorbar_example.png + means.plot(yerr=errors, ax=ax, kind='bar') + +.. _visualization.table: + +Plotting Tables +~~~~~~~~~~~~~~~ + +.. versionadded:: 0.14 + +Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and :meth:`Series.plot` with a ``table`` keyword. The ``table`` keyword can accept ``bool``, :class:`DataFrame` or :class:`Series`. The simple way to draw a table is to specify ``table=True``. Data will be transposed to meet matplotlib's default layout. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + fig, ax = plt.subplots(1, 1) + df = DataFrame(rand(5, 3), columns=['a', 'b', 'c']) + ax.get_xaxis().set_visible(False) # Hide Ticks + + @savefig line_plot_table_true.png + df.plot(table=True, ax=ax) + +Also, you can pass different :class:`DataFrame` or :class:`Series` for ``table`` keyword. The data will be drawn as displayed in print method (not transposed automatically). If required, it should be transposed manually as below example. + +.. ipython:: python + + fig, ax = plt.subplots(1, 1) + ax.get_xaxis().set_visible(False) # Hide Ticks + @savefig line_plot_table_data.png + df.plot(table=np.round(df.T, 2), ax=ax) + + +Finally, there is a helper function ``pandas.tools.plotting.table`` to create a table from :class:`DataFrame` and :class:`Series`, and add it to an ``matplotlib.Axes``. This function can accept keywords which matplotlib table has. + +.. ipython:: python + + from pandas.tools.plotting import table + fig, ax = plt.subplots(1, 1) + + table(ax, np.round(df.describe(), 2), + loc='upper right', colWidths=[0.2, 0.2, 0.2]) + + @savefig line_plot_table_describe.png + df.plot(ax=ax, ylim=(0, 2), legend=None) + +**Note**: You can get table instances on the axes using ``axes.tables`` property for further decorations. See the `matplotlib table documenation `__ for more. + +.. _visualization.colormaps: + +Colormaps +~~~~~~~~~ + +A potential issue when plotting a large number of columns is that it can be +difficult to distinguish some series due to repetition in the default colors. To +remedy this, DataFrame plotting supports the use of the ``colormap=`` argument, +which accepts either a Matplotlib `colormap `__ +or a string that is a name of a colormap registered with Matplotlib. A +visualization of the default matplotlib colormaps is available `here +`__. + +As matplotlib does not directly support colormaps for line-based plots, the +colors are selected based on an even spacing determined by the number of columns +in the DataFrame. There is no consideration made for background color, so some +colormaps will produce lines that are not easily visible. + +To use the cubhelix colormap, we can simply pass ``'cubehelix'`` to ``colormap=`` + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. 
ipython:: python + + df = DataFrame(randn(1000, 10), index=ts.index) + df = df.cumsum() + + plt.figure() + + @savefig cubehelix.png + df.plot(colormap='cubehelix') + +or we can pass the colormap itself + +.. ipython:: python + + from matplotlib import cm + + plt.figure() + + @savefig cubehelix_cm.png + df.plot(colormap=cm.cubehelix) + +Colormaps can also be used other plot types, like bar charts: + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + dd = DataFrame(randn(10, 10)).applymap(abs) + dd = dd.cumsum() + + plt.figure() + + @savefig greens.png + dd.plot(kind='bar', colormap='Greens') + +Parallel coordinates charts: + +.. ipython:: python + + plt.figure() + + @savefig parallel_gist_rainbow.png + parallel_coordinates(data, 'Name', colormap='gist_rainbow') + +Andrews curves charts: + +.. ipython:: python + + plt.figure() + + @savefig andrews_curve_winter.png + andrews_curves(data, 'Name', colormap='winter') + + +Plotting directly with matplotlib +--------------------------------- + +In some situations it may still be preferable or necessary to prepare plots +directly with matplotlib, for instance when a certain type of plot or +customization is not (yet) supported by pandas. Series and DataFrame objects +behave like arrays and can therefore be passed directly to matplotlib functions +without explicit casts. + +pandas also automatically registers formatters and locators that recognize date +indices, thereby extending date and time support to practically all plot types +available in matplotlib. Although this formatting does not provide the same +level of refinement you would get when plotting via pandas, it can be faster +when plotting a large number of points. + +.. note:: + + The speed up for large data sets only applies to pandas 0.14.0 and later. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + +.. ipython:: python + + price = Series(randn(150).cumsum(), + index=date_range('2000-1-1', periods=150, freq='B')) + ma = pd.rolling_mean(price, 20) + mstd = pd.rolling_std(price, 20) + + plt.figure() + + plt.plot(price.index, price, 'k') + plt.plot(ma.index, ma, 'b') + @savefig bollinger.png + plt.fill_between(mstd.index, ma-2*mstd, ma+2*mstd, color='b', alpha=0.2) + +.. ipython:: python + :suppress: + + plt.close('all') diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst new file mode 100644 index 00000000..047f37db --- /dev/null +++ b/doc/source/whatsnew.rst @@ -0,0 +1,58 @@ +.. _whatsnew: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + options.display.max_rows = 15 + +********** +What's New +********** + +These are new features and improvements of note in each release. + +.. include:: v0.14.1.txt + +.. include:: v0.14.0.txt + +.. include:: v0.13.1.txt + +.. include:: v0.13.0.txt + +.. include:: v0.12.0.txt + +.. include:: v0.11.0.txt + +.. include:: v0.10.1.txt + +.. include:: v0.10.0.txt + +.. include:: v0.9.1.txt + +.. include:: v0.9.0.txt + +.. include:: v0.8.1.txt + +.. include:: v0.8.0.txt + +.. include:: v0.7.3.txt + +.. include:: v0.7.2.txt + +.. include:: v0.7.1.txt + +.. include:: v0.7.0.txt + +.. include:: v0.6.1.txt + +.. include:: v0.6.0.txt + +.. include:: v0.5.0.txt + +.. 
include:: v0.4.x.txt diff --git a/doc/sphinxext/README.rst b/doc/sphinxext/README.rst new file mode 100644 index 00000000..e39cf8da --- /dev/null +++ b/doc/sphinxext/README.rst @@ -0,0 +1,17 @@ +sphinxext +========= + +This directory contains copies of different sphinx extensions in use in the +pandas documentation. These copies originate from other projects: + +- ``numpydoc`` - Numpy's Sphinx extensions: this can be found at its own + repository: https://github.com/numpy/numpydoc +- ``ipython_directive`` and ``ipython_console_highlighting`` in the folder + `ipython_sphinxext` - Sphinx extensions from IPython: these are included + in IPython: https://github.com/ipython/ipython/tree/master/IPython/sphinxext + +.. note:: + + These copies are maintained at the respective projects, so fixes should, + to the extent possible, be pushed upstream instead of only adapting our + local copy to avoid divergence between the the local and upstream version. diff --git a/doc/sphinxext/ipython_sphinxext/__init__.py b/doc/sphinxext/ipython_sphinxext/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py b/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py new file mode 100644 index 00000000..dfb489e4 --- /dev/null +++ b/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py @@ -0,0 +1,116 @@ +"""reST directive for syntax-highlighting ipython interactive sessions. + +XXX - See what improvements can be made based on the new (as of Sept 2009) +'pycon' lexer for the python console. At the very least it will give better +highlighted tracebacks. +""" + +#----------------------------------------------------------------------------- +# Needed modules + +# Standard library +import re + +# Third party +from pygments.lexer import Lexer, do_insertions +from pygments.lexers.agile import (PythonConsoleLexer, PythonLexer, + PythonTracebackLexer) +from pygments.token import Comment, Generic + +from sphinx import highlighting + +#----------------------------------------------------------------------------- +# Global constants +line_re = re.compile('.*?\n') + +#----------------------------------------------------------------------------- +# Code begins - classes and functions + + +class IPythonConsoleLexer(Lexer): + + """ + For IPython console output or doctests, such as: + + .. sourcecode:: ipython + + In [1]: a = 'foo' + + In [2]: a + Out[2]: 'foo' + + In [3]: print(a) + foo + + In [4]: 1 / 0 + + Notes: + + - Tracebacks are not currently supported. + + - It assumes the default IPython prompts, not customized ones. 
+ """ + + name = 'IPython console session' + aliases = ['ipython'] + mimetypes = ['text/x-ipython-console'] + input_prompt = re.compile("(In \[[0-9]+\]: )|( \.\.\.+:)") + output_prompt = re.compile("(Out\[[0-9]+\]: )|( \.\.\.+:)") + continue_prompt = re.compile(" \.\.\.+:") + tb_start = re.compile("\-+") + + def get_tokens_unprocessed(self, text): + pylexer = PythonLexer(**self.options) + tblexer = PythonTracebackLexer(**self.options) + + curcode = '' + insertions = [] + for match in line_re.finditer(text): + line = match.group() + input_prompt = self.input_prompt.match(line) + continue_prompt = self.continue_prompt.match(line.rstrip()) + output_prompt = self.output_prompt.match(line) + if line.startswith("#"): + insertions.append((len(curcode), + [(0, Comment, line)])) + elif input_prompt is not None: + insertions.append((len(curcode), + [(0, Generic.Prompt, input_prompt.group())])) + curcode += line[input_prompt.end():] + elif continue_prompt is not None: + insertions.append((len(curcode), + [(0, Generic.Prompt, continue_prompt.group())])) + curcode += line[continue_prompt.end():] + elif output_prompt is not None: + # Use the 'error' token for output. We should probably make + # our own token, but error is typicaly in a bright color like + # red, so it works fine for our output prompts. + insertions.append((len(curcode), + [(0, Generic.Error, output_prompt.group())])) + curcode += line[output_prompt.end():] + else: + if curcode: + for item in do_insertions(insertions, + pylexer.get_tokens_unprocessed(curcode)): + yield item + curcode = '' + insertions = [] + yield match.start(), Generic.Output, line + if curcode: + for item in do_insertions(insertions, + pylexer.get_tokens_unprocessed(curcode)): + yield item + + +def setup(app): + """Setup as a sphinx extension.""" + + # This is only a lexer, so adding it below to pygments appears sufficient. + # But if somebody knows that the right API usage should be to do that via + # sphinx, by all means fix it here. At least having this setup.py + # suppresses the sphinx warning we'd get without it. + pass + +#----------------------------------------------------------------------------- +# Register the extension as a valid pygments lexer +highlighting.lexers['ipython'] = IPythonConsoleLexer() diff --git a/doc/sphinxext/ipython_sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py new file mode 100644 index 00000000..3f9be956 --- /dev/null +++ b/doc/sphinxext/ipython_sphinxext/ipython_directive.py @@ -0,0 +1,1085 @@ +# -*- coding: utf-8 -*- +""" +Sphinx directive to support embedded IPython code. + +This directive allows pasting of entire interactive IPython sessions, prompts +and all, and their code will actually get re-executed at doc build time, with +all prompts renumbered sequentially. It also allows you to input code as a pure +python input by giving the argument python to the directive. The output looks +like an interactive ipython section. + +To enable this directive, simply list it in your Sphinx ``conf.py`` file +(making sure the directory where you placed it is visible to sphinx, as is +needed for all Sphinx directives). For example, to enable syntax highlighting +and the IPython directive:: + + extensions = ['IPython.sphinxext.ipython_console_highlighting', + 'IPython.sphinxext.ipython_directive'] + +The IPython directive outputs code-blocks with the language 'ipython'. So +if you do not have the syntax highlighting extension enabled as well, then +all rendered code-blocks will be uncolored. 
By default this directive assumes +that your prompts are unchanged IPython ones, but this can be customized. +The configurable options that can be placed in conf.py are: + +ipython_savefig_dir: + The directory in which to save the figures. This is relative to the + Sphinx source directory. The default is `html_static_path`. +ipython_rgxin: + The compiled regular expression to denote the start of IPython input + lines. The default is re.compile('In \[(\d+)\]:\s?(.*)\s*'). You + shouldn't need to change this. +ipython_rgxout: + The compiled regular expression to denote the start of IPython output + lines. The default is re.compile('Out\[(\d+)\]:\s?(.*)\s*'). You + shouldn't need to change this. +ipython_promptin: + The string to represent the IPython input prompt in the generated ReST. + The default is 'In [%d]:'. This expects that the line numbers are used + in the prompt. +ipython_promptout: + The string to represent the IPython prompt in the generated ReST. The + default is 'Out [%d]:'. This expects that the line numbers are used + in the prompt. +ipython_mplbackend: + The string which specifies if the embedded Sphinx shell should import + Matplotlib and set the backend. The value specifies a backend that is + passed to `matplotlib.use()` before any lines in `ipython_execlines` are + executed. If not specified in conf.py, then the default value of 'agg' is + used. To use the IPython directive without matplotlib as a dependency, set + the value to `None`. It may end up that matplotlib is still imported + if the user specifies so in `ipython_execlines` or makes use of the + @savefig pseudo decorator. +ipython_execlines: + A list of strings to be exec'd in the embedded Sphinx shell. Typical + usage is to make certain packages always available. Set this to an empty + list if you wish to have no imports always available. If specified in + conf.py as `None`, then it has the effect of making no imports available. + If omitted from conf.py altogether, then the default value of + ['import numpy as np', 'import matplotlib.pyplot as plt'] is used. +ipython_holdcount + When the @suppress pseudo-decorator is used, the execution count can be + incremented or not. The default behavior is to hold the execution count, + corresponding to a value of `True`. Set this to `False` to increment + the execution count after each suppressed command. + +As an example, to use the IPython directive when `matplotlib` is not available, +one sets the backend to `None`:: + + ipython_mplbackend = None + +An example usage of the directive is: + +.. code-block:: rst + + .. ipython:: + + In [1]: x = 1 + + In [2]: y = x**2 + + In [3]: print(y) + +See http://matplotlib.org/sampledoc/ipython_directive.html for additional +documentation. + +ToDo +---- + +- Turn the ad-hoc test() function into a real test suite. +- Break up ipython-specific functionality from matplotlib stuff into better + separated code. + +Authors +------- + +- John D Hunter: orignal author. +- Fernando Perez: refactoring, documentation, cleanups, port to 0.11. +- VáclavŠmilauer : Prompt generalizations. 
+- Skipper Seabold, refactoring, cleanups, pure python addition +""" +from __future__ import print_function +from __future__ import unicode_literals + +#----------------------------------------------------------------------------- +# Imports +#----------------------------------------------------------------------------- + +# Stdlib +import os +import re +import sys +import tempfile +import ast +from pandas.compat import zip, range, map, lmap, u, cStringIO as StringIO +import warnings + +# To keep compatibility with various python versions +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + +# Third-party +import sphinx +from docutils.parsers.rst import directives +from docutils import nodes +from sphinx.util.compat import Directive + +# Our own +from IPython import Config, InteractiveShell +from IPython.core.profiledir import ProfileDir +from IPython.utils import io +from IPython.utils.py3compat import PY3 + +if PY3: + from io import StringIO + text_type = str +else: + from StringIO import StringIO + text_type = unicode + +#----------------------------------------------------------------------------- +# Globals +#----------------------------------------------------------------------------- +# for tokenizing blocks +COMMENT, INPUT, OUTPUT = range(3) + +#----------------------------------------------------------------------------- +# Functions and class declarations +#----------------------------------------------------------------------------- + +def block_parser(part, rgxin, rgxout, fmtin, fmtout): + """ + part is a string of ipython text, comprised of at most one + input, one ouput, comments, and blank lines. The block parser + parses the text into a list of:: + + blocks = [ (TOKEN0, data0), (TOKEN1, data1), ...] + + where TOKEN is one of [COMMENT | INPUT | OUTPUT ] and + data is, depending on the type of token:: + + COMMENT : the comment string + + INPUT: the (DECORATOR, INPUT_LINE, REST) where + DECORATOR: the input decorator (or None) + INPUT_LINE: the input as string (possibly multi-line) + REST : any stdout generated by the input line (not OUTPUT) + + OUTPUT: the output string, possibly multi-line + + """ + block = [] + lines = part.split('\n') + N = len(lines) + i = 0 + decorator = None + while 1: + + if i==N: + # nothing left to parse -- the last line + break + + line = lines[i] + i += 1 + line_stripped = line.strip() + if line_stripped.startswith('#'): + block.append((COMMENT, line)) + continue + + if line_stripped.startswith('@'): + # we're assuming at most one decorator -- may need to + # rethink + decorator = line_stripped + continue + + # does this look like an input line? + matchin = rgxin.match(line) + if matchin: + lineno, inputline = int(matchin.group(1)), matchin.group(2) + + # the ....: continuation string + continuation = ' %s:'%''.join(['.']*(len(str(lineno))+2)) + Nc = len(continuation) + # input lines can continue on for more than one line, if + # we have a '\' line continuation char or a function call + # echo line 'print'. 
The input line can only be + # terminated by the end of the block or an output line, so + # we parse out the rest of the input line if it is + # multiline as well as any echo text + + rest = [] + while i 1: + if input_lines[-1] != "": + input_lines.append('') # make sure there's a blank line + # so splitter buffer gets reset + + continuation = ' %s:'%''.join(['.']*(len(str(lineno))+2)) + + if is_savefig: + image_file, image_directive = self.process_image(decorator) + + ret = [] + is_semicolon = False + + # Hold the execution count, if requested to do so. + if is_suppress and self.hold_count: + store_history = False + else: + store_history = True + + # Note: catch_warnings is not thread safe + with warnings.catch_warnings(record=True) as ws: + for i, line in enumerate(input_lines): + if line.endswith(';'): + is_semicolon = True + + if i == 0: + # process the first input line + if is_verbatim: + self.process_input_line('') + self.IP.execution_count += 1 # increment it anyway + else: + # only submit the line in non-verbatim mode + self.process_input_line(line, store_history=store_history) + formatted_line = '%s %s'%(input_prompt, line) + else: + # process a continuation line + if not is_verbatim: + self.process_input_line(line, store_history=store_history) + + formatted_line = '%s %s'%(continuation, line) + + if not is_suppress: + ret.append(formatted_line) + + if not is_suppress and len(rest.strip()) and is_verbatim: + # the "rest" is the standard output of the + # input, which needs to be added in + # verbatim mode + ret.append(rest) + + self.cout.seek(0) + output = self.cout.read() + if not is_suppress and not is_semicolon: + ret.append(output) + elif is_semicolon: # get spacing right + ret.append('') + + # context information + filename = self.state.document.current_source + lineno = self.state.document.current_line + + # output any exceptions raised during execution to stdout + # unless :okexcept: has been specified. + if not is_okexcept and "Traceback" in output: + s = "\nException in %s at block ending on line %s\n" % (filename, lineno) + s += "Specify :okexcept: as an option in the ipython:: block to suppress this message\n" + sys.stdout.write('\n\n>>>' + ('-' * 73)) + sys.stdout.write(s) + sys.stdout.write(output) + sys.stdout.write('<<<' + ('-' * 73) + '\n\n') + + # output any warning raised during execution to stdout + # unless :okwarning: has been specified. + if not is_okwarning: + for w in ws: + s = "\nWarning in %s at block ending on line %s\n" % (filename, lineno) + s += "Specify :okwarning: as an option in the ipython:: block to suppress this message\n" + sys.stdout.write('\n\n>>>' + ('-' * 73)) + sys.stdout.write(s) + sys.stdout.write('-' * 76 + '\n') + s=warnings.formatwarning(w.message, w.category, + w.filename, w.lineno, w.line) + sys.stdout.write(s) + sys.stdout.write('<<<' + ('-' * 73) + '\n') + + self.cout.truncate(0) + return (ret, input_lines, output, is_doctest, decorator, image_file, + image_directive) + + + def process_output(self, data, output_prompt, + input_lines, output, is_doctest, decorator, image_file): + """ + Process data block for OUTPUT token. + + """ + TAB = ' ' * 4 + + if is_doctest and output is not None: + + found = output + found = found.strip() + submitted = data.strip() + + if self.directive is None: + source = 'Unavailable' + content = 'Unavailable' + else: + source = self.directive.state.document.current_source + content = self.directive.content + # Add tabs and join into a single string. 
+ content = '\n'.join([TAB + line for line in content]) + + # Make sure the output contains the output prompt. + ind = found.find(output_prompt) + if ind < 0: + e = ('output does not contain output prompt\n\n' + 'Document source: {0}\n\n' + 'Raw content: \n{1}\n\n' + 'Input line(s):\n{TAB}{2}\n\n' + 'Output line(s):\n{TAB}{3}\n\n') + e = e.format(source, content, '\n'.join(input_lines), + repr(found), TAB=TAB) + raise RuntimeError(e) + found = found[len(output_prompt):].strip() + + # Handle the actual doctest comparison. + if decorator.strip() == '@doctest': + # Standard doctest + if found != submitted: + e = ('doctest failure\n\n' + 'Document source: {0}\n\n' + 'Raw content: \n{1}\n\n' + 'On input line(s):\n{TAB}{2}\n\n' + 'we found output:\n{TAB}{3}\n\n' + 'instead of the expected:\n{TAB}{4}\n\n') + e = e.format(source, content, '\n'.join(input_lines), + repr(found), repr(submitted), TAB=TAB) + raise RuntimeError(e) + else: + self.custom_doctest(decorator, input_lines, found, submitted) + + def process_comment(self, data): + """Process data fPblock for COMMENT token.""" + if not self.is_suppress: + return [data] + + def save_image(self, image_file): + """ + Saves the image file to disk. + """ + self.ensure_pyplot() + command = ('plt.gcf().savefig("%s", bbox_inches="tight", ' + 'dpi=100)' % image_file) + + #print 'SAVEFIG', command # dbg + self.process_input_line('bookmark ipy_thisdir', store_history=False) + self.process_input_line('cd -b ipy_savedir', store_history=False) + self.process_input_line(command, store_history=False) + self.process_input_line('cd -b ipy_thisdir', store_history=False) + self.process_input_line('bookmark -d ipy_thisdir', store_history=False) + self.clear_cout() + + def process_block(self, block): + """ + process block from the block_parser and return a list of processed lines + """ + ret = [] + output = None + input_lines = None + lineno = self.IP.execution_count + + input_prompt = self.promptin % lineno + output_prompt = self.promptout % lineno + image_file = None + image_directive = None + + for token, data in block: + if token == COMMENT: + out_data = self.process_comment(data) + elif token == INPUT: + (out_data, input_lines, output, is_doctest, decorator, + image_file, image_directive) = \ + self.process_input(data, input_prompt, lineno) + elif token == OUTPUT: + out_data = \ + self.process_output(data, output_prompt, + input_lines, output, is_doctest, + decorator, image_file) + if out_data: + ret.extend(out_data) + + # save the image files + if image_file is not None: + self.save_image(image_file) + + return ret, image_directive + + def ensure_pyplot(self): + """ + Ensures that pyplot has been imported into the embedded IPython shell. + + Also, makes sure to set the backend appropriately if not set already. + + """ + # We are here if the @figure pseudo decorator was used. Thus, it's + # possible that we could be here even if python_mplbackend were set to + # `None`. That's also strange and perhaps worthy of raising an + # exception, but for now, we just set the backend to 'agg'. + + if not self._pyplot_imported: + if 'matplotlib.backends' not in sys.modules: + # Then ipython_matplotlib was set to None but there was a + # call to the @figure decorator (and ipython_execlines did + # not set a backend). + #raise Exception("No backend was set, but @figure was used!") + import matplotlib + matplotlib.use('agg') + + # Always import pyplot into embedded shell. 
+ self.process_input_line('import matplotlib.pyplot as plt', + store_history=False) + self._pyplot_imported = True + + def process_pure_python(self, content): + """ + content is a list of strings. it is unedited directive content + + This runs it line by line in the InteractiveShell, prepends + prompts as needed capturing stderr and stdout, then returns + the content as a list as if it were ipython code + """ + output = [] + savefig = False # keep up with this to clear figure + multiline = False # to handle line continuation + multiline_start = None + fmtin = self.promptin + + ct = 0 + + for lineno, line in enumerate(content): + + line_stripped = line.strip() + if not len(line): + output.append(line) + continue + + # handle decorators + if line_stripped.startswith('@'): + output.extend([line]) + if 'savefig' in line: + savefig = True # and need to clear figure + continue + + # handle comments + if line_stripped.startswith('#'): + output.extend([line]) + continue + + # deal with lines checking for multiline + continuation = u' %s:'% ''.join(['.']*(len(str(ct))+2)) + if not multiline: + modified = u"%s %s" % (fmtin % ct, line_stripped) + output.append(modified) + ct += 1 + try: + ast.parse(line_stripped) + output.append(u'') + except Exception: # on a multiline + multiline = True + multiline_start = lineno + else: # still on a multiline + modified = u'%s %s' % (continuation, line) + output.append(modified) + + # if the next line is indented, it should be part of multiline + if len(content) > lineno + 1: + nextline = content[lineno + 1] + if len(nextline) - len(nextline.lstrip()) > 3: + continue + try: + mod = ast.parse( + '\n'.join(content[multiline_start:lineno+1])) + if isinstance(mod.body[0], ast.FunctionDef): + # check to see if we have the whole function + for element in mod.body[0].body: + if isinstance(element, ast.Return): + multiline = False + else: + output.append(u'') + multiline = False + except Exception: + pass + + if savefig: # clear figure if plotted + self.ensure_pyplot() + self.process_input_line('plt.clf()', store_history=False) + self.clear_cout() + savefig = False + + return output + + def custom_doctest(self, decorator, input_lines, found, submitted): + """ + Perform a specialized doctest. + + """ + from .custom_doctests import doctests + + args = decorator.split() + doctest_type = args[1] + if doctest_type in doctests: + doctests[doctest_type](self, args, input_lines, found, submitted) + else: + e = "Invalid option to @doctest: {0}".format(doctest_type) + raise Exception(e) + + +class IPythonDirective(Directive): + + has_content = True + required_arguments = 0 + optional_arguments = 4 # python, suppress, verbatim, doctest + final_argumuent_whitespace = True + option_spec = { 'python': directives.unchanged, + 'suppress' : directives.flag, + 'verbatim' : directives.flag, + 'doctest' : directives.flag, + 'okexcept': directives.flag, + 'okwarning': directives.flag, + 'output_encoding': directives.unchanged_required + } + + shell = None + + seen_docs = set() + + def get_config_options(self): + # contains sphinx configuration variables + config = self.state.document.settings.env.config + + # get config variables to set figure output directory + confdir = self.state.document.settings.env.app.confdir + savefig_dir = config.ipython_savefig_dir + source_dir = os.path.dirname(self.state.document.current_source) + if savefig_dir is None: + savefig_dir = config.html_static_path + if isinstance(savefig_dir, list): + savefig_dir = savefig_dir[0] # safe to assume only one path? 
+ savefig_dir = os.path.join(confdir, savefig_dir) + + # get regex and prompt stuff + rgxin = config.ipython_rgxin + rgxout = config.ipython_rgxout + promptin = config.ipython_promptin + promptout = config.ipython_promptout + mplbackend = config.ipython_mplbackend + exec_lines = config.ipython_execlines + hold_count = config.ipython_holdcount + + return (savefig_dir, source_dir, rgxin, rgxout, + promptin, promptout, mplbackend, exec_lines, hold_count) + + def setup(self): + # Get configuration values. + (savefig_dir, source_dir, rgxin, rgxout, promptin, promptout, + mplbackend, exec_lines, hold_count) = self.get_config_options() + + if self.shell is None: + # We will be here many times. However, when the + # EmbeddedSphinxShell is created, its interactive shell member + # is the same for each instance. + + if mplbackend: + import matplotlib + # Repeated calls to use() will not hurt us since `mplbackend` + # is the same each time. + matplotlib.use(mplbackend) + + # Must be called after (potentially) importing matplotlib and + # setting its backend since exec_lines might import pylab. + self.shell = EmbeddedSphinxShell(exec_lines, self.state) + + # Store IPython directive to enable better error messages + self.shell.directive = self + + # reset the execution count if we haven't processed this doc + #NOTE: this may be borked if there are multiple seen_doc tmp files + #check time stamp? + if not self.state.document.current_source in self.seen_docs: + self.shell.IP.history_manager.reset() + self.shell.IP.execution_count = 1 + self.shell.IP.prompt_manager.width = 0 + self.seen_docs.add(self.state.document.current_source) + + # and attach to shell so we don't have to pass them around + self.shell.rgxin = rgxin + self.shell.rgxout = rgxout + self.shell.promptin = promptin + self.shell.promptout = promptout + self.shell.savefig_dir = savefig_dir + self.shell.source_dir = source_dir + self.shell.hold_count = hold_count + + # setup bookmark for saving figures directory + self.shell.process_input_line('bookmark ipy_savedir %s'%savefig_dir, + store_history=False) + self.shell.clear_cout() + + return rgxin, rgxout, promptin, promptout + + def teardown(self): + # delete last bookmark + self.shell.process_input_line('bookmark -d ipy_savedir', + store_history=False) + self.shell.clear_cout() + + def run(self): + debug = False + + #TODO, any reason block_parser can't be a method of embeddable shell + # then we wouldn't have to carry these around + rgxin, rgxout, promptin, promptout = self.setup() + + options = self.options + self.shell.is_suppress = 'suppress' in options + self.shell.is_doctest = 'doctest' in options + self.shell.is_verbatim = 'verbatim' in options + self.shell.is_okexcept = 'okexcept' in options + self.shell.is_okwarning = 'okwarning' in options + + self.shell.output_encoding = [options.get('output_encoding', 'utf8')] + + # handle pure python code + if 'python' in self.arguments: + content = self.content + self.content = self.shell.process_pure_python(content) + + parts = '\n'.join(self.content).split('\n\n') + + lines = ['.. 
code-block:: ipython', ''] + figures = [] + + for part in parts: + block = block_parser(part, rgxin, rgxout, promptin, promptout) + if len(block): + rows, figure = self.shell.process_block(block) + for row in rows: + lines.extend([' %s'%line for line in row.split('\n')]) + + if figure is not None: + figures.append(figure) + + for figure in figures: + lines.append('') + lines.extend(figure.split('\n')) + lines.append('') + + if len(lines)>2: + if debug: + print('\n'.join(lines)) + else: + # This has to do with input, not output. But if we comment + # these lines out, then no IPython code will appear in the + # final output. + self.state_machine.insert_input( + lines, self.state_machine.input_lines.source(0)) + + # cleanup + self.teardown() + + return [] + +# Enable as a proper Sphinx directive +def setup(app): + setup.app = app + + app.add_directive('ipython', IPythonDirective) + app.add_config_value('ipython_savefig_dir', None, 'env') + app.add_config_value('ipython_rgxin', + re.compile('In \[(\d+)\]:\s?(.*)\s*'), 'env') + app.add_config_value('ipython_rgxout', + re.compile('Out\[(\d+)\]:\s?(.*)\s*'), 'env') + app.add_config_value('ipython_promptin', 'In [%d]:', 'env') + app.add_config_value('ipython_promptout', 'Out[%d]:', 'env') + + # We could just let matplotlib pick whatever is specified as the default + # backend in the matplotlibrc file, but this would cause issues if the + # backend didn't work in headless environments. For this reason, 'agg' + # is a good default backend choice. + app.add_config_value('ipython_mplbackend', 'agg', 'env') + + # If the user sets this config value to `None`, then EmbeddedSphinxShell's + # __init__ method will treat it as []. + execlines = ['import numpy as np', 'import matplotlib.pyplot as plt'] + app.add_config_value('ipython_execlines', execlines, 'env') + + app.add_config_value('ipython_holdcount', True, 'env') + +# Simple smoke test, needs to be converted to a proper automatic test. 
+def test(): + + examples = [ + r""" +In [9]: pwd +Out[9]: '/home/jdhunter/py4science/book' + +In [10]: cd bookdata/ +/home/jdhunter/py4science/book/bookdata + +In [2]: from pylab import * + +In [2]: ion() + +In [3]: im = imread('stinkbug.png') + +@savefig mystinkbug.png width=4in +In [4]: imshow(im) +Out[4]: + +""", + r""" + +In [1]: x = 'hello world' + +# string methods can be +# used to alter the string +@doctest +In [2]: x.upper() +Out[2]: 'HELLO WORLD' + +@verbatim +In [3]: x.st +x.startswith x.strip +""", + r""" + +In [130]: url = 'http://ichart.finance.yahoo.com/table.csv?s=CROX\ + .....: &d=9&e=22&f=2009&g=d&a=1&br=8&c=2006&ignore=.csv' + +In [131]: print url.split('&') +['http://ichart.finance.yahoo.com/table.csv?s=CROX', 'd=9', 'e=22', 'f=2009', 'g=d', 'a=1', 'b=8', 'c=2006', 'ignore=.csv'] + +In [60]: import urllib + +""", + r"""\ + +In [133]: import numpy.random + +@suppress +In [134]: numpy.random.seed(2358) + +@doctest +In [135]: numpy.random.rand(10,2) +Out[135]: +array([[ 0.64524308, 0.59943846], + [ 0.47102322, 0.8715456 ], + [ 0.29370834, 0.74776844], + [ 0.99539577, 0.1313423 ], + [ 0.16250302, 0.21103583], + [ 0.81626524, 0.1312433 ], + [ 0.67338089, 0.72302393], + [ 0.7566368 , 0.07033696], + [ 0.22591016, 0.77731835], + [ 0.0072729 , 0.34273127]]) + +""", + + r""" +In [106]: print x +jdh + +In [109]: for i in range(10): + .....: print i + .....: + .....: +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +""", + + r""" + +In [144]: from pylab import * + +In [145]: ion() + +# use a semicolon to suppress the output +@savefig test_hist.png width=4in +In [151]: hist(np.random.randn(10000), 100); + + +@savefig test_plot.png width=4in +In [151]: plot(np.random.randn(10000), 'o'); + """, + + r""" +# use a semicolon to suppress the output +In [151]: plt.clf() + +@savefig plot_simple.png width=4in +In [151]: plot([1,2,3]) + +@savefig hist_simple.png width=4in +In [151]: hist(np.random.randn(10000), 100); + +""", + r""" +# update the current fig +In [151]: ylabel('number') + +In [152]: title('normal distribution') + + +@savefig hist_with_text.png +In [153]: grid(True) + +@doctest float +In [154]: 0.1 + 0.2 +Out[154]: 0.3 + +@doctest float +In [155]: np.arange(16).reshape(4,4) +Out[155]: +array([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11], + [12, 13, 14, 15]]) + +In [1]: x = np.arange(16, dtype=float).reshape(4,4) + +In [2]: x[0,0] = np.inf + +In [3]: x[0,1] = np.nan + +@doctest float +In [4]: x +Out[4]: +array([[ inf, nan, 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [ 12., 13., 14., 15.]]) + + + """, + ] + # skip local-file depending first example: + examples = examples[1:] + + #ipython_directive.DEBUG = True # dbg + #options = dict(suppress=True) # dbg + options = dict() + for example in examples: + content = example.split('\n') + IPythonDirective('debug', arguments=None, options=options, + content=content, lineno=0, + content_offset=None, block_text=None, + state=None, state_machine=None, + ) + +# Run test suite as a script +if __name__=='__main__': + if not os.path.isdir('_static'): + os.mkdir('_static') + test() + print('All OK? 
Check figures in _static/') diff --git a/doc/sphinxext/numpydoc/LICENSE.txt b/doc/sphinxext/numpydoc/LICENSE.txt new file mode 100755 index 00000000..b15c699d --- /dev/null +++ b/doc/sphinxext/numpydoc/LICENSE.txt @@ -0,0 +1,94 @@ +------------------------------------------------------------------------------- + The files + - numpydoc.py + - docscrape.py + - docscrape_sphinx.py + - phantom_import.py + have the following license: + +Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + The files + - compiler_unparse.py + - comment_eater.py + - traitsdoc.py + have the following license: + +This software is OSI Certified Open Source Software. +OSI Certified is a certification mark of the Open Source Initiative. + +Copyright (c) 2006, Enthought, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Enthought, Inc. nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +------------------------------------------------------------------------------- + The file + - plot_directive.py + originates from Matplotlib (http://matplotlib.sf.net/) which has + the following license: + +Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. + +1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. + +4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. + +5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. + diff --git a/doc/sphinxext/numpydoc/README.rst b/doc/sphinxext/numpydoc/README.rst new file mode 100755 index 00000000..89b9f2fd --- /dev/null +++ b/doc/sphinxext/numpydoc/README.rst @@ -0,0 +1,51 @@ +===================================== +numpydoc -- Numpy's Sphinx extensions +===================================== + +Numpy's documentation uses several custom extensions to Sphinx. These +are shipped in this ``numpydoc`` package, in case you want to make use +of them in third-party projects. + +The following extensions are available: + + - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add + the code description directives ``np:function``, ``np-c:function``, etc. + that support the Numpy docstring syntax. + + - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. + + - ``numpydoc.plot_directive``: Adaptation of Matplotlib's ``plot::`` + directive. 
Note that this implementation may still undergo severe + changes or eventually be deprecated. + + +numpydoc +======== + +Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings +following the Numpy/Scipy format to a form palatable to Sphinx. + +Options +------- + +The following options can be set in conf.py: + +- numpydoc_use_plots: bool + + Whether to produce ``plot::`` directives for Examples sections that + contain ``import matplotlib``. + +- numpydoc_show_class_members: bool + + Whether to show all members of a class in the Methods and Attributes + sections automatically. + +- numpydoc_class_members_toctree: bool + + Whether to create a Sphinx table of contents for the lists of class + methods and attributes. If a table of contents is made, Sphinx expects + each entry to have a separate page. + +- numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) + + Whether to insert an edit link after docstrings. diff --git a/doc/sphinxext/numpydoc/__init__.py b/doc/sphinxext/numpydoc/__init__.py new file mode 100755 index 00000000..0fce2cf7 --- /dev/null +++ b/doc/sphinxext/numpydoc/__init__.py @@ -0,0 +1,3 @@ +from __future__ import division, absolute_import, print_function + +from .numpydoc import setup diff --git a/doc/sphinxext/numpydoc/comment_eater.py b/doc/sphinxext/numpydoc/comment_eater.py new file mode 100755 index 00000000..8cddd330 --- /dev/null +++ b/doc/sphinxext/numpydoc/comment_eater.py @@ -0,0 +1,169 @@ +from __future__ import division, absolute_import, print_function + +import sys +if sys.version_info[0] >= 3: + from io import StringIO +else: + from io import StringIO + +import compiler +import inspect +import textwrap +import tokenize + +from .compiler_unparse import unparse + + +class Comment(object): + """ A comment block. + """ + is_comment = True + def __init__(self, start_lineno, end_lineno, text): + # int : The first line number in the block. 1-indexed. + self.start_lineno = start_lineno + # int : The last line number. Inclusive! + self.end_lineno = end_lineno + # str : The text block including '#' character but not any leading spaces. + self.text = text + + def add(self, string, start, end, line): + """ Add a new comment line. + """ + self.start_lineno = min(self.start_lineno, start[0]) + self.end_lineno = max(self.end_lineno, end[0]) + self.text += string + + def __repr__(self): + return '%s(%r, %r, %r)' % (self.__class__.__name__, self.start_lineno, + self.end_lineno, self.text) + + +class NonComment(object): + """ A non-comment block of code. + """ + is_comment = False + def __init__(self, start_lineno, end_lineno): + self.start_lineno = start_lineno + self.end_lineno = end_lineno + + def add(self, string, start, end, line): + """ Add lines to the block. + """ + if string.strip(): + # Only add if not entirely whitespace. + self.start_lineno = min(self.start_lineno, start[0]) + self.end_lineno = max(self.end_lineno, end[0]) + + def __repr__(self): + return '%s(%r, %r)' % (self.__class__.__name__, self.start_lineno, + self.end_lineno) + + +class CommentBlocker(object): + """ Pull out contiguous comment blocks. + """ + def __init__(self): + # Start with a dummy. + self.current_block = NonComment(0, 0) + + # All of the blocks seen so far. + self.blocks = [] + + # The index mapping lines of code to their associated comment blocks. + self.index = {} + + def process_file(self, file): + """ Process a file object. 
+ """ + if sys.version_info[0] >= 3: + nxt = file.__next__ + else: + nxt = file.next + for token in tokenize.generate_tokens(nxt): + self.process_token(*token) + self.make_index() + + def process_token(self, kind, string, start, end, line): + """ Process a single token. + """ + if self.current_block.is_comment: + if kind == tokenize.COMMENT: + self.current_block.add(string, start, end, line) + else: + self.new_noncomment(start[0], end[0]) + else: + if kind == tokenize.COMMENT: + self.new_comment(string, start, end, line) + else: + self.current_block.add(string, start, end, line) + + def new_noncomment(self, start_lineno, end_lineno): + """ We are transitioning from a noncomment to a comment. + """ + block = NonComment(start_lineno, end_lineno) + self.blocks.append(block) + self.current_block = block + + def new_comment(self, string, start, end, line): + """ Possibly add a new comment. + + Only adds a new comment if this comment is the only thing on the line. + Otherwise, it extends the noncomment block. + """ + prefix = line[:start[1]] + if prefix.strip(): + # Oops! Trailing comment, not a comment block. + self.current_block.add(string, start, end, line) + else: + # A comment block. + block = Comment(start[0], end[0], string) + self.blocks.append(block) + self.current_block = block + + def make_index(self): + """ Make the index mapping lines of actual code to their associated + prefix comments. + """ + for prev, block in zip(self.blocks[:-1], self.blocks[1:]): + if not block.is_comment: + self.index[block.start_lineno] = prev + + def search_for_comment(self, lineno, default=None): + """ Find the comment block just before the given line number. + + Returns None (or the specified default) if there is no such block. + """ + if not self.index: + self.make_index() + block = self.index.get(lineno, None) + text = getattr(block, 'text', default) + return text + + +def strip_comment_marker(text): + """ Strip # markers at the front of a block of comment text. + """ + lines = [] + for line in text.splitlines(): + lines.append(line.lstrip('#')) + text = textwrap.dedent('\n'.join(lines)) + return text + + +def get_class_traits(klass): + """ Yield all of the documentation for trait definitions on a class object. + """ + # FIXME: gracefully handle errors here or in the caller? + source = inspect.getsource(klass) + cb = CommentBlocker() + cb.process_file(StringIO(source)) + mod_ast = compiler.parse(source) + class_ast = mod_ast.node.nodes[0] + for node in class_ast.code.nodes: + # FIXME: handle other kinds of assignments? + if isinstance(node, compiler.ast.Assign): + name = node.nodes[0].name + rhs = unparse(node.expr).strip() + doc = strip_comment_marker(cb.search_for_comment(node.lineno, default='')) + yield name, rhs, doc + diff --git a/doc/sphinxext/numpydoc/compiler_unparse.py b/doc/sphinxext/numpydoc/compiler_unparse.py new file mode 100755 index 00000000..8933a83d --- /dev/null +++ b/doc/sphinxext/numpydoc/compiler_unparse.py @@ -0,0 +1,865 @@ +""" Turn compiler.ast structures back into executable python code. + + The unparse method takes a compiler.ast tree and transforms it back into + valid python code. It is incomplete and currently only works for + import statements, function calls, function definitions, assignments, and + basic expressions. + + Inspired by python-2.5-svn/Demo/parser/unparse.py + + fixme: We may want to move to using _ast trees because the compiler for + them is about 6 times faster than compiler.compile. 
+""" +from __future__ import division, absolute_import, print_function + +import sys +from compiler.ast import Const, Name, Tuple, Div, Mul, Sub, Add + +if sys.version_info[0] >= 3: + from io import StringIO +else: + from StringIO import StringIO + +def unparse(ast, single_line_functions=False): + s = StringIO() + UnparseCompilerAst(ast, s, single_line_functions) + return s.getvalue().lstrip() + +op_precedence = { 'compiler.ast.Power':3, 'compiler.ast.Mul':2, 'compiler.ast.Div':2, + 'compiler.ast.Add':1, 'compiler.ast.Sub':1 } + +class UnparseCompilerAst: + """ Methods in this class recursively traverse an AST and + output source code for the abstract syntax; original formatting + is disregarged. + """ + + ######################################################################### + # object interface. + ######################################################################### + + def __init__(self, tree, file = sys.stdout, single_line_functions=False): + """ Unparser(tree, file=sys.stdout) -> None. + + Print the source for tree to file. + """ + self.f = file + self._single_func = single_line_functions + self._do_indent = True + self._indent = 0 + self._dispatch(tree) + self._write("\n") + self.f.flush() + + ######################################################################### + # Unparser private interface. + ######################################################################### + + ### format, output, and dispatch methods ################################ + + def _fill(self, text = ""): + "Indent a piece of text, according to the current indentation level" + if self._do_indent: + self._write("\n"+" "*self._indent + text) + else: + self._write(text) + + def _write(self, text): + "Append a piece of text to the current line." + self.f.write(text) + + def _enter(self): + "Print ':', and increase the indentation." + self._write(": ") + self._indent += 1 + + def _leave(self): + "Decrease the indentation level." + self._indent -= 1 + + def _dispatch(self, tree): + "_dispatcher function, _dispatching tree type T to method _T." + if isinstance(tree, list): + for t in tree: + self._dispatch(t) + return + meth = getattr(self, "_"+tree.__class__.__name__) + if tree.__class__.__name__ == 'NoneType' and not self._do_indent: + return + meth(tree) + + + ######################################################################### + # compiler.ast unparsing methods. + # + # There should be one method per concrete grammar type. They are + # organized in alphabetical order. + ######################################################################### + + def _Add(self, t): + self.__binary_op(t, '+') + + def _And(self, t): + self._write(" (") + for i, node in enumerate(t.nodes): + self._dispatch(node) + if i != len(t.nodes)-1: + self._write(") and (") + self._write(")") + + def _AssAttr(self, t): + """ Handle assigning an attribute of an object + """ + self._dispatch(t.expr) + self._write('.'+t.attrname) + + def _Assign(self, t): + """ Expression Assignment such as "a = 1". + + This only handles assignment in expressions. Keyword assignment + is handled separately. + """ + self._fill() + for target in t.nodes: + self._dispatch(target) + self._write(" = ") + self._dispatch(t.expr) + if not self._do_indent: + self._write('; ') + + def _AssName(self, t): + """ Name on left hand side of expression. + + Treat just like a name on the right side of an expression. + """ + self._Name(t) + + def _AssTuple(self, t): + """ Tuple on left hand side of an expression. + """ + + # _write each elements, separated by a comma. 
+ for element in t.nodes[:-1]: + self._dispatch(element) + self._write(", ") + + # Handle the last one without writing comma + last_element = t.nodes[-1] + self._dispatch(last_element) + + def _AugAssign(self, t): + """ +=,-=,*=,/=,**=, etc. operations + """ + + self._fill() + self._dispatch(t.node) + self._write(' '+t.op+' ') + self._dispatch(t.expr) + if not self._do_indent: + self._write(';') + + def _Bitand(self, t): + """ Bit and operation. + """ + + for i, node in enumerate(t.nodes): + self._write("(") + self._dispatch(node) + self._write(")") + if i != len(t.nodes)-1: + self._write(" & ") + + def _Bitor(self, t): + """ Bit or operation + """ + + for i, node in enumerate(t.nodes): + self._write("(") + self._dispatch(node) + self._write(")") + if i != len(t.nodes)-1: + self._write(" | ") + + def _CallFunc(self, t): + """ Function call. + """ + self._dispatch(t.node) + self._write("(") + comma = False + for e in t.args: + if comma: self._write(", ") + else: comma = True + self._dispatch(e) + if t.star_args: + if comma: self._write(", ") + else: comma = True + self._write("*") + self._dispatch(t.star_args) + if t.dstar_args: + if comma: self._write(", ") + else: comma = True + self._write("**") + self._dispatch(t.dstar_args) + self._write(")") + + def _Compare(self, t): + self._dispatch(t.expr) + for op, expr in t.ops: + self._write(" " + op + " ") + self._dispatch(expr) + + def _Const(self, t): + """ A constant value such as an integer value, 3, or a string, "hello". + """ + self._dispatch(t.value) + + def _Decorators(self, t): + """ Handle function decorators (eg. @has_units) + """ + for node in t.nodes: + self._dispatch(node) + + def _Dict(self, t): + self._write("{") + for i, (k, v) in enumerate(t.items): + self._dispatch(k) + self._write(": ") + self._dispatch(v) + if i < len(t.items)-1: + self._write(", ") + self._write("}") + + def _Discard(self, t): + """ Node for when return value is ignored such as in "foo(a)". + """ + self._fill() + self._dispatch(t.expr) + + def _Div(self, t): + self.__binary_op(t, '/') + + def _Ellipsis(self, t): + self._write("...") + + def _From(self, t): + """ Handle "from xyz import foo, bar as baz". + """ + # fixme: Are From and ImportFrom handled differently? 
+ self._fill("from ") + self._write(t.modname) + self._write(" import ") + for i, (name,asname) in enumerate(t.names): + if i != 0: + self._write(", ") + self._write(name) + if asname is not None: + self._write(" as "+asname) + + def _Function(self, t): + """ Handle function definitions + """ + if t.decorators is not None: + self._fill("@") + self._dispatch(t.decorators) + self._fill("def "+t.name + "(") + defaults = [None] * (len(t.argnames) - len(t.defaults)) + list(t.defaults) + for i, arg in enumerate(zip(t.argnames, defaults)): + self._write(arg[0]) + if arg[1] is not None: + self._write('=') + self._dispatch(arg[1]) + if i < len(t.argnames)-1: + self._write(', ') + self._write(")") + if self._single_func: + self._do_indent = False + self._enter() + self._dispatch(t.code) + self._leave() + self._do_indent = True + + def _Getattr(self, t): + """ Handle getting an attribute of an object + """ + if isinstance(t.expr, (Div, Mul, Sub, Add)): + self._write('(') + self._dispatch(t.expr) + self._write(')') + else: + self._dispatch(t.expr) + + self._write('.'+t.attrname) + + def _If(self, t): + self._fill() + + for i, (compare,code) in enumerate(t.tests): + if i == 0: + self._write("if ") + else: + self._write("elif ") + self._dispatch(compare) + self._enter() + self._fill() + self._dispatch(code) + self._leave() + self._write("\n") + + if t.else_ is not None: + self._write("else") + self._enter() + self._fill() + self._dispatch(t.else_) + self._leave() + self._write("\n") + + def _IfExp(self, t): + self._dispatch(t.then) + self._write(" if ") + self._dispatch(t.test) + + if t.else_ is not None: + self._write(" else (") + self._dispatch(t.else_) + self._write(")") + + def _Import(self, t): + """ Handle "import xyz.foo". + """ + self._fill("import ") + + for i, (name,asname) in enumerate(t.names): + if i != 0: + self._write(", ") + self._write(name) + if asname is not None: + self._write(" as "+asname) + + def _Keyword(self, t): + """ Keyword value assignment within function calls and definitions. 
+ """ + self._write(t.name) + self._write("=") + self._dispatch(t.expr) + + def _List(self, t): + self._write("[") + for i,node in enumerate(t.nodes): + self._dispatch(node) + if i < len(t.nodes)-1: + self._write(", ") + self._write("]") + + def _Module(self, t): + if t.doc is not None: + self._dispatch(t.doc) + self._dispatch(t.node) + + def _Mul(self, t): + self.__binary_op(t, '*') + + def _Name(self, t): + self._write(t.name) + + def _NoneType(self, t): + self._write("None") + + def _Not(self, t): + self._write('not (') + self._dispatch(t.expr) + self._write(')') + + def _Or(self, t): + self._write(" (") + for i, node in enumerate(t.nodes): + self._dispatch(node) + if i != len(t.nodes)-1: + self._write(") or (") + self._write(")") + + def _Pass(self, t): + self._write("pass\n") + + def _Printnl(self, t): + self._fill("print ") + if t.dest: + self._write(">> ") + self._dispatch(t.dest) + self._write(", ") + comma = False + for node in t.nodes: + if comma: self._write(', ') + else: comma = True + self._dispatch(node) + + def _Power(self, t): + self.__binary_op(t, '**') + + def _Return(self, t): + self._fill("return ") + if t.value: + if isinstance(t.value, Tuple): + text = ', '.join([ name.name for name in t.value.asList() ]) + self._write(text) + else: + self._dispatch(t.value) + if not self._do_indent: + self._write('; ') + + def _Slice(self, t): + self._dispatch(t.expr) + self._write("[") + if t.lower: + self._dispatch(t.lower) + self._write(":") + if t.upper: + self._dispatch(t.upper) + #if t.step: + # self._write(":") + # self._dispatch(t.step) + self._write("]") + + def _Sliceobj(self, t): + for i, node in enumerate(t.nodes): + if i != 0: + self._write(":") + if not (isinstance(node, Const) and node.value is None): + self._dispatch(node) + + def _Stmt(self, tree): + for node in tree.nodes: + self._dispatch(node) + + def _Sub(self, t): + self.__binary_op(t, '-') + + def _Subscript(self, t): + self._dispatch(t.expr) + self._write("[") + for i, value in enumerate(t.subs): + if i != 0: + self._write(",") + self._dispatch(value) + self._write("]") + + def _TryExcept(self, t): + self._fill("try") + self._enter() + self._dispatch(t.body) + self._leave() + + for handler in t.handlers: + self._fill('except ') + self._dispatch(handler[0]) + if handler[1] is not None: + self._write(', ') + self._dispatch(handler[1]) + self._enter() + self._dispatch(handler[2]) + self._leave() + + if t.else_: + self._fill("else") + self._enter() + self._dispatch(t.else_) + self._leave() + + def _Tuple(self, t): + + if not t.nodes: + # Empty tuple. + self._write("()") + else: + self._write("(") + + # _write each elements, separated by a comma. 
+ for element in t.nodes[:-1]: + self._dispatch(element) + self._write(", ") + + # Handle the last one without writing comma + last_element = t.nodes[-1] + self._dispatch(last_element) + + self._write(")") + + def _UnaryAdd(self, t): + self._write("+") + self._dispatch(t.expr) + + def _UnarySub(self, t): + self._write("-") + self._dispatch(t.expr) + + def _With(self, t): + self._fill('with ') + self._dispatch(t.expr) + if t.vars: + self._write(' as ') + self._dispatch(t.vars.name) + self._enter() + self._dispatch(t.body) + self._leave() + self._write('\n') + + def _int(self, t): + self._write(repr(t)) + + def __binary_op(self, t, symbol): + # Check if parenthesis are needed on left side and then dispatch + has_paren = False + left_class = str(t.left.__class__) + if (left_class in op_precedence.keys() and + op_precedence[left_class] < op_precedence[str(t.__class__)]): + has_paren = True + if has_paren: + self._write('(') + self._dispatch(t.left) + if has_paren: + self._write(')') + # Write the appropriate symbol for operator + self._write(symbol) + # Check if parenthesis are needed on the right side and then dispatch + has_paren = False + right_class = str(t.right.__class__) + if (right_class in op_precedence.keys() and + op_precedence[right_class] < op_precedence[str(t.__class__)]): + has_paren = True + if has_paren: + self._write('(') + self._dispatch(t.right) + if has_paren: + self._write(')') + + def _float(self, t): + # if t is 0.1, str(t)->'0.1' while repr(t)->'0.1000000000001' + # We prefer str here. + self._write(str(t)) + + def _str(self, t): + self._write(repr(t)) + + def _tuple(self, t): + self._write(str(t)) + + ######################################################################### + # These are the methods from the _ast modules unparse. + # + # As our needs to handle more advanced code increase, we may want to + # modify some of the methods below so that they work for compiler.ast. + ######################################################################### + +# # stmt +# def _Expr(self, tree): +# self._fill() +# self._dispatch(tree.value) +# +# def _Import(self, t): +# self._fill("import ") +# first = True +# for a in t.names: +# if first: +# first = False +# else: +# self._write(", ") +# self._write(a.name) +# if a.asname: +# self._write(" as "+a.asname) +# +## def _ImportFrom(self, t): +## self._fill("from ") +## self._write(t.module) +## self._write(" import ") +## for i, a in enumerate(t.names): +## if i == 0: +## self._write(", ") +## self._write(a.name) +## if a.asname: +## self._write(" as "+a.asname) +## # XXX(jpe) what is level for? 
+## +# +# def _Break(self, t): +# self._fill("break") +# +# def _Continue(self, t): +# self._fill("continue") +# +# def _Delete(self, t): +# self._fill("del ") +# self._dispatch(t.targets) +# +# def _Assert(self, t): +# self._fill("assert ") +# self._dispatch(t.test) +# if t.msg: +# self._write(", ") +# self._dispatch(t.msg) +# +# def _Exec(self, t): +# self._fill("exec ") +# self._dispatch(t.body) +# if t.globals: +# self._write(" in ") +# self._dispatch(t.globals) +# if t.locals: +# self._write(", ") +# self._dispatch(t.locals) +# +# def _Print(self, t): +# self._fill("print ") +# do_comma = False +# if t.dest: +# self._write(">>") +# self._dispatch(t.dest) +# do_comma = True +# for e in t.values: +# if do_comma:self._write(", ") +# else:do_comma=True +# self._dispatch(e) +# if not t.nl: +# self._write(",") +# +# def _Global(self, t): +# self._fill("global") +# for i, n in enumerate(t.names): +# if i != 0: +# self._write(",") +# self._write(" " + n) +# +# def _Yield(self, t): +# self._fill("yield") +# if t.value: +# self._write(" (") +# self._dispatch(t.value) +# self._write(")") +# +# def _Raise(self, t): +# self._fill('raise ') +# if t.type: +# self._dispatch(t.type) +# if t.inst: +# self._write(", ") +# self._dispatch(t.inst) +# if t.tback: +# self._write(", ") +# self._dispatch(t.tback) +# +# +# def _TryFinally(self, t): +# self._fill("try") +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# self._fill("finally") +# self._enter() +# self._dispatch(t.finalbody) +# self._leave() +# +# def _excepthandler(self, t): +# self._fill("except ") +# if t.type: +# self._dispatch(t.type) +# if t.name: +# self._write(", ") +# self._dispatch(t.name) +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# def _ClassDef(self, t): +# self._write("\n") +# self._fill("class "+t.name) +# if t.bases: +# self._write("(") +# for a in t.bases: +# self._dispatch(a) +# self._write(", ") +# self._write(")") +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# def _FunctionDef(self, t): +# self._write("\n") +# for deco in t.decorators: +# self._fill("@") +# self._dispatch(deco) +# self._fill("def "+t.name + "(") +# self._dispatch(t.args) +# self._write(")") +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# def _For(self, t): +# self._fill("for ") +# self._dispatch(t.target) +# self._write(" in ") +# self._dispatch(t.iter) +# self._enter() +# self._dispatch(t.body) +# self._leave() +# if t.orelse: +# self._fill("else") +# self._enter() +# self._dispatch(t.orelse) +# self._leave +# +# def _While(self, t): +# self._fill("while ") +# self._dispatch(t.test) +# self._enter() +# self._dispatch(t.body) +# self._leave() +# if t.orelse: +# self._fill("else") +# self._enter() +# self._dispatch(t.orelse) +# self._leave +# +# # expr +# def _Str(self, tree): +# self._write(repr(tree.s)) +## +# def _Repr(self, t): +# self._write("`") +# self._dispatch(t.value) +# self._write("`") +# +# def _Num(self, t): +# self._write(repr(t.n)) +# +# def _ListComp(self, t): +# self._write("[") +# self._dispatch(t.elt) +# for gen in t.generators: +# self._dispatch(gen) +# self._write("]") +# +# def _GeneratorExp(self, t): +# self._write("(") +# self._dispatch(t.elt) +# for gen in t.generators: +# self._dispatch(gen) +# self._write(")") +# +# def _comprehension(self, t): +# self._write(" for ") +# self._dispatch(t.target) +# self._write(" in ") +# self._dispatch(t.iter) +# for if_clause in t.ifs: +# self._write(" if ") +# self._dispatch(if_clause) +# +# def _IfExp(self, t): +# 
self._dispatch(t.body) +# self._write(" if ") +# self._dispatch(t.test) +# if t.orelse: +# self._write(" else ") +# self._dispatch(t.orelse) +# +# unop = {"Invert":"~", "Not": "not", "UAdd":"+", "USub":"-"} +# def _UnaryOp(self, t): +# self._write(self.unop[t.op.__class__.__name__]) +# self._write("(") +# self._dispatch(t.operand) +# self._write(")") +# +# binop = { "Add":"+", "Sub":"-", "Mult":"*", "Div":"/", "Mod":"%", +# "LShift":">>", "RShift":"<<", "BitOr":"|", "BitXor":"^", "BitAnd":"&", +# "FloorDiv":"//", "Pow": "**"} +# def _BinOp(self, t): +# self._write("(") +# self._dispatch(t.left) +# self._write(")" + self.binop[t.op.__class__.__name__] + "(") +# self._dispatch(t.right) +# self._write(")") +# +# boolops = {_ast.And: 'and', _ast.Or: 'or'} +# def _BoolOp(self, t): +# self._write("(") +# self._dispatch(t.values[0]) +# for v in t.values[1:]: +# self._write(" %s " % self.boolops[t.op.__class__]) +# self._dispatch(v) +# self._write(")") +# +# def _Attribute(self,t): +# self._dispatch(t.value) +# self._write(".") +# self._write(t.attr) +# +## def _Call(self, t): +## self._dispatch(t.func) +## self._write("(") +## comma = False +## for e in t.args: +## if comma: self._write(", ") +## else: comma = True +## self._dispatch(e) +## for e in t.keywords: +## if comma: self._write(", ") +## else: comma = True +## self._dispatch(e) +## if t.starargs: +## if comma: self._write(", ") +## else: comma = True +## self._write("*") +## self._dispatch(t.starargs) +## if t.kwargs: +## if comma: self._write(", ") +## else: comma = True +## self._write("**") +## self._dispatch(t.kwargs) +## self._write(")") +# +# # slice +# def _Index(self, t): +# self._dispatch(t.value) +# +# def _ExtSlice(self, t): +# for i, d in enumerate(t.dims): +# if i != 0: +# self._write(': ') +# self._dispatch(d) +# +# # others +# def _arguments(self, t): +# first = True +# nonDef = len(t.args)-len(t.defaults) +# for a in t.args[0:nonDef]: +# if first:first = False +# else: self._write(", ") +# self._dispatch(a) +# for a,d in zip(t.args[nonDef:], t.defaults): +# if first:first = False +# else: self._write(", ") +# self._dispatch(a), +# self._write("=") +# self._dispatch(d) +# if t.vararg: +# if first:first = False +# else: self._write(", ") +# self._write("*"+t.vararg) +# if t.kwarg: +# if first:first = False +# else: self._write(", ") +# self._write("**"+t.kwarg) +# +## def _keyword(self, t): +## self._write(t.arg) +## self._write("=") +## self._dispatch(t.value) +# +# def _Lambda(self, t): +# self._write("lambda ") +# self._dispatch(t.args) +# self._write(": ") +# self._dispatch(t.body) + + + diff --git a/doc/sphinxext/numpydoc/docscrape.py b/doc/sphinxext/numpydoc/docscrape.py new file mode 100755 index 00000000..2c49ed84 --- /dev/null +++ b/doc/sphinxext/numpydoc/docscrape.py @@ -0,0 +1,527 @@ +"""Extract reference documentation from the NumPy source tree. + +""" +from __future__ import division, absolute_import, print_function + +import inspect +import textwrap +import re +import pydoc +from warnings import warn +import collections + + +class Reader(object): + """A line-based string reader. + + """ + def __init__(self, data): + """ + Parameters + ---------- + data : str + String with lines separated by '\n'. 
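        For instance (an illustrative sketch, not from the original
        docstring)::

            r = Reader("a\nb\n\nc")
            r.read()                     # -> 'a'
            r.read_to_next_empty_line()  # -> ['b']
            r.eof()                      # -> False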
+ + """ + if isinstance(data,list): + self._str = data + else: + self._str = data.split('\n') # store string as list of lines + + self.reset() + + def __getitem__(self, n): + return self._str[n] + + def reset(self): + self._l = 0 # current line nr + + def read(self): + if not self.eof(): + out = self[self._l] + self._l += 1 + return out + else: + return '' + + def seek_next_non_empty_line(self): + for l in self[self._l:]: + if l.strip(): + break + else: + self._l += 1 + + def eof(self): + return self._l >= len(self._str) + + def read_to_condition(self, condition_func): + start = self._l + for line in self[start:]: + if condition_func(line): + return self[start:self._l] + self._l += 1 + if self.eof(): + return self[start:self._l+1] + return [] + + def read_to_next_empty_line(self): + self.seek_next_non_empty_line() + def is_empty(line): + return not line.strip() + return self.read_to_condition(is_empty) + + def read_to_next_unindented_line(self): + def is_unindented(line): + return (line.strip() and (len(line.lstrip()) == len(line))) + return self.read_to_condition(is_unindented) + + def peek(self,n=0): + if self._l + n < len(self._str): + return self[self._l + n] + else: + return '' + + def is_empty(self): + return not ''.join(self._str).strip() + + +class NumpyDocString(object): + def __init__(self, docstring, config={}): + docstring = textwrap.dedent(docstring).split('\n') + + self._doc = Reader(docstring) + self._parsed_data = { + 'Signature': '', + 'Summary': [''], + 'Extended Summary': [], + 'Parameters': [], + 'Returns': [], + 'Raises': [], + 'Warns': [], + 'Other Parameters': [], + 'Attributes': [], + 'Methods': [], + 'See Also': [], + 'Notes': [], + 'Warnings': [], + 'References': '', + 'Examples': '', + 'index': {} + } + + self._parse() + + def __getitem__(self,key): + return self._parsed_data[key] + + def __setitem__(self,key,val): + if key not in self._parsed_data: + warn("Unknown section %s" % key) + else: + self._parsed_data[key] = val + + def _is_at_section(self): + self._doc.seek_next_non_empty_line() + + if self._doc.eof(): + return False + + l1 = self._doc.peek().strip() # e.g. Parameters + + if l1.startswith('.. 
index::'): + return True + + l2 = self._doc.peek(1).strip() # ---------- or ========== + return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) + + def _strip(self,doc): + i = 0 + j = 0 + for i,line in enumerate(doc): + if line.strip(): break + + for j,line in enumerate(doc[::-1]): + if line.strip(): break + + return doc[i:len(doc)-j] + + def _read_to_next_section(self): + section = self._doc.read_to_next_empty_line() + + while not self._is_at_section() and not self._doc.eof(): + if not self._doc.peek(-1).strip(): # previous line was empty + section += [''] + + section += self._doc.read_to_next_empty_line() + + return section + + def _read_sections(self): + while not self._doc.eof(): + data = self._read_to_next_section() + name = data[0].strip() + + if name.startswith('..'): # index section + yield name, data[1:] + elif len(data) < 2: + yield StopIteration + else: + yield name, self._strip(data[2:]) + + def _parse_param_list(self,content): + r = Reader(content) + params = [] + while not r.eof(): + header = r.read().strip() + if ' : ' in header: + arg_name, arg_type = header.split(' : ')[:2] + else: + arg_name, arg_type = header, '' + + desc = r.read_to_next_unindented_line() + desc = dedent_lines(desc) + + params.append((arg_name,arg_type,desc)) + + return params + + + _name_rgx = re.compile(r"^\s*(:(?P\w+):`(?P[a-zA-Z0-9_.-]+)`|" + r" (?P[a-zA-Z0-9_.-]+))\s*", re.X) + def _parse_see_also(self, content): + """ + func_name : Descriptive text + continued text + another_func_name : Descriptive text + func_name1, func_name2, :meth:`func_name`, func_name3 + + """ + items = [] + + def parse_item_name(text): + """Match ':role:`name`' or 'name'""" + m = self._name_rgx.match(text) + if m: + g = m.groups() + if g[1] is None: + return g[3], None + else: + return g[2], g[1] + raise ValueError("%s is not a item name" % text) + + def push_item(name, rest): + if not name: + return + name, role = parse_item_name(name) + items.append((name, list(rest), role)) + del rest[:] + + current_func = None + rest = [] + + for line in content: + if not line.strip(): continue + + m = self._name_rgx.match(line) + if m and line[m.end():].strip().startswith(':'): + push_item(current_func, rest) + current_func, line = line[:m.end()], line[m.end():] + rest = [line.split(':', 1)[1].strip()] + if not rest[0]: + rest = [] + elif not line.startswith(' '): + push_item(current_func, rest) + current_func = None + if ',' in line: + for func in line.split(','): + if func.strip(): + push_item(func, []) + elif line.strip(): + current_func = line + elif current_func is not None: + rest.append(line.strip()) + push_item(current_func, rest) + return items + + def _parse_index(self, section, content): + """ + .. 
index: default + :refguide: something, else, and more + + """ + def strip_each_in(lst): + return [s.strip() for s in lst] + + out = {} + section = section.split('::') + if len(section) > 1: + out['default'] = strip_each_in(section[1].split(','))[0] + for line in content: + line = line.split(':') + if len(line) > 2: + out[line[1]] = strip_each_in(line[2].split(',')) + return out + + def _parse_summary(self): + """Grab signature (if given) and summary""" + if self._is_at_section(): + return + + # If several signatures present, take the last one + while True: + summary = self._doc.read_to_next_empty_line() + summary_str = " ".join([s.strip() for s in summary]).strip() + if re.compile('^([\w., ]+=)?\s*[\w\.]+\(.*\)$').match(summary_str): + self['Signature'] = summary_str + if not self._is_at_section(): + continue + break + + if summary is not None: + self['Summary'] = summary + + if not self._is_at_section(): + self['Extended Summary'] = self._read_to_next_section() + + def _parse(self): + self._doc.reset() + self._parse_summary() + + for (section,content) in self._read_sections(): + if not section.startswith('..'): + section = ' '.join([s.capitalize() for s in section.split(' ')]) + if section in ('Parameters', 'Returns', 'Raises', 'Warns', + 'Other Parameters', 'Attributes', 'Methods'): + self[section] = self._parse_param_list(content) + elif section.startswith('.. index::'): + self['index'] = self._parse_index(section, content) + elif section == 'See Also': + self['See Also'] = self._parse_see_also(content) + else: + self[section] = content + + # string conversion routines + + def _str_header(self, name, symbol='-'): + return [name, len(name)*symbol] + + def _str_indent(self, doc, indent=4): + out = [] + for line in doc: + out += [' '*indent + line] + return out + + def _str_signature(self): + if self['Signature']: + return [self['Signature'].replace('*','\*')] + [''] + else: + return [''] + + def _str_summary(self): + if self['Summary']: + return self['Summary'] + [''] + else: + return [] + + def _str_extended_summary(self): + if self['Extended Summary']: + return self['Extended Summary'] + [''] + else: + return [] + + def _str_param_list(self, name): + out = [] + if self[name]: + out += self._str_header(name) + for param,param_type,desc in self[name]: + if param_type: + out += ['%s : %s' % (param, param_type)] + else: + out += [param] + out += self._str_indent(desc) + out += [''] + return out + + def _str_section(self, name): + out = [] + if self[name]: + out += self._str_header(name) + out += self[name] + out += [''] + return out + + def _str_see_also(self, func_role): + if not self['See Also']: return [] + out = [] + out += self._str_header("See Also") + last_had_desc = True + for func, desc, role in self['See Also']: + if role: + link = ':%s:`%s`' % (role, func) + elif func_role: + link = ':%s:`%s`' % (func_role, func) + else: + link = "`%s`_" % func + if desc or last_had_desc: + out += [''] + out += [link] + else: + out[-1] += ", %s" % link + if desc: + out += self._str_indent([' '.join(desc)]) + last_had_desc = True + else: + last_had_desc = False + out += [''] + return out + + def _str_index(self): + idx = self['index'] + out = [] + out += ['.. 
index:: %s' % idx.get('default','')] + for section, references in idx.items(): + if section == 'default': + continue + out += [' :%s: %s' % (section, ', '.join(references))] + return out + + def __str__(self, func_role=''): + out = [] + out += self._str_signature() + out += self._str_summary() + out += self._str_extended_summary() + for param_list in ('Parameters', 'Returns', 'Other Parameters', + 'Raises', 'Warns'): + out += self._str_param_list(param_list) + out += self._str_section('Warnings') + out += self._str_see_also(func_role) + for s in ('Notes','References','Examples'): + out += self._str_section(s) + for param_list in ('Attributes', 'Methods'): + out += self._str_param_list(param_list) + out += self._str_index() + return '\n'.join(out) + + +def indent(str,indent=4): + indent_str = ' '*indent + if str is None: + return indent_str + lines = str.split('\n') + return '\n'.join(indent_str + l for l in lines) + +def dedent_lines(lines): + """Deindent a list of lines maximally""" + return textwrap.dedent("\n".join(lines)).split("\n") + +def header(text, style='-'): + return text + '\n' + style*len(text) + '\n' + + +class FunctionDoc(NumpyDocString): + def __init__(self, func, role='func', doc=None, config={}): + self._f = func + self._role = role # e.g. "func" or "meth" + + if doc is None: + if func is None: + raise ValueError("No function or docstring given") + doc = inspect.getdoc(func) or '' + NumpyDocString.__init__(self, doc) + + if not self['Signature'] and func is not None: + func, func_name = self.get_func() + try: + # try to read signature + argspec = inspect.getargspec(func) + argspec = inspect.formatargspec(*argspec) + argspec = argspec.replace('*','\*') + signature = '%s%s' % (func_name, argspec) + except TypeError as e: + signature = '%s()' % func_name + self['Signature'] = signature + + def get_func(self): + func_name = getattr(self._f, '__name__', self.__class__.__name__) + if inspect.isclass(self._f): + func = getattr(self._f, '__call__', self._f.__init__) + else: + func = self._f + return func, func_name + + def __str__(self): + out = '' + + func, func_name = self.get_func() + signature = self['Signature'].replace('*', '\*') + + roles = {'func': 'function', + 'meth': 'method'} + + if self._role: + if self._role not in roles: + print("Warning: invalid role %s" % self._role) + out += '.. %s:: %s\n \n\n' % (roles.get(self._role,''), + func_name) + + out += super(FunctionDoc, self).__str__(func_role=self._role) + return out + + +class ClassDoc(NumpyDocString): + + extra_public_methods = ['__call__'] + + def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc, + config={}): + if not inspect.isclass(cls) and cls is not None: + raise ValueError("Expected a class or None, but got %r" % cls) + self._cls = cls + + if modulename and not modulename.endswith('.'): + modulename += '.' 
+ self._mod = modulename + + if doc is None: + if cls is None: + raise ValueError("No class or documentation string given") + doc = pydoc.getdoc(cls) + + NumpyDocString.__init__(self, doc) + + if config.get('show_class_members', True): + def splitlines_x(s): + if not s: + return [] + else: + return s.splitlines() + + for field, items in [('Methods', self.methods), + ('Attributes', self.properties)]: + if not self[field]: + doc_list = [] + for name in sorted(items): + try: + doc_item = pydoc.getdoc(getattr(self._cls, name)) + doc_list.append((name, '', splitlines_x(doc_item))) + except AttributeError: + pass # method doesn't exist + self[field] = doc_list + + @property + def methods(self): + if self._cls is None: + return [] + return [name for name,func in inspect.getmembers(self._cls) + if ((not name.startswith('_') + or name in self.extra_public_methods) + and isinstance(func, collections.Callable))] + + @property + def properties(self): + if self._cls is None: + return [] + return [name for name,func in inspect.getmembers(self._cls) + if not name.startswith('_') and + (func is None or isinstance(func, property) or + inspect.isgetsetdescriptor(func))] diff --git a/doc/sphinxext/numpydoc/docscrape_sphinx.py b/doc/sphinxext/numpydoc/docscrape_sphinx.py new file mode 100755 index 00000000..ba93b2ea --- /dev/null +++ b/doc/sphinxext/numpydoc/docscrape_sphinx.py @@ -0,0 +1,274 @@ +from __future__ import division, absolute_import, print_function + +import sys, re, inspect, textwrap, pydoc +import sphinx +import collections +from .docscrape import NumpyDocString, FunctionDoc, ClassDoc + +if sys.version_info[0] >= 3: + sixu = lambda s: s +else: + sixu = lambda s: unicode(s, 'unicode_escape') + + +class SphinxDocString(NumpyDocString): + def __init__(self, docstring, config={}): + # Subclasses seemingly do not call this. + NumpyDocString.__init__(self, docstring, config=config) + + def load_config(self, config): + self.use_plots = config.get('use_plots', False) + self.class_members_toctree = config.get('class_members_toctree', True) + + # string conversion routines + def _str_header(self, name, symbol='`'): + return ['.. 
rubric:: ' + name, ''] + + def _str_field_list(self, name): + return [':' + name + ':'] + + def _str_indent(self, doc, indent=4): + out = [] + for line in doc: + out += [' '*indent + line] + return out + + def _str_signature(self): + return [''] + if self['Signature']: + return ['``%s``' % self['Signature']] + [''] + else: + return [''] + + def _str_summary(self): + return self['Summary'] + [''] + + def _str_extended_summary(self): + return self['Extended Summary'] + [''] + + def _str_returns(self): + out = [] + if self['Returns']: + out += self._str_field_list('Returns') + out += [''] + for param, param_type, desc in self['Returns']: + if param_type: + out += self._str_indent(['**%s** : %s' % (param.strip(), + param_type)]) + else: + out += self._str_indent([param.strip()]) + if desc: + out += [''] + out += self._str_indent(desc, 8) + out += [''] + return out + + def _str_param_list(self, name): + out = [] + if self[name]: + out += self._str_field_list(name) + out += [''] + for param, param_type, desc in self[name]: + if param_type: + out += self._str_indent(['**%s** : %s' % (param.strip(), + param_type)]) + else: + out += self._str_indent(['**%s**' % param.strip()]) + if desc: + out += [''] + out += self._str_indent(desc, 8) + out += [''] + return out + + @property + def _obj(self): + if hasattr(self, '_cls'): + return self._cls + elif hasattr(self, '_f'): + return self._f + return None + + def _str_member_list(self, name): + """ + Generate a member listing, autosummary:: table where possible, + and a table where not. + + """ + out = [] + if self[name]: + out += ['.. rubric:: %s' % name, ''] + prefix = getattr(self, '_name', '') + + if prefix: + prefix = '~%s.' % prefix + + autosum = [] + others = [] + for param, param_type, desc in self[name]: + param = param.strip() + + # Check if the referenced member can have a docstring or not + param_obj = getattr(self._obj, param, None) + if not (callable(param_obj) + or isinstance(param_obj, property) + or inspect.isgetsetdescriptor(param_obj)): + param_obj = None + + if param_obj and (pydoc.getdoc(param_obj) or not desc): + # Referenced object has a docstring + autosum += [" %s%s" % (prefix, param)] + else: + others.append((param, param_type, desc)) + + if autosum: + out += ['.. autosummary::'] + if self.class_members_toctree: + out += [' :toctree:'] + out += [''] + autosum + + if others: + maxlen_0 = max(3, max([len(x[0]) for x in others])) + hdr = sixu("=")*maxlen_0 + sixu(" ") + sixu("=")*10 + fmt = sixu('%%%ds %%s ') % (maxlen_0,) + out += ['', hdr] + for param, param_type, desc in others: + desc = sixu(" ").join(x.strip() for x in desc).strip() + if param_type: + desc = "(%s) %s" % (param_type, desc) + out += [fmt % (param.strip(), desc)] + out += [hdr] + out += [''] + return out + + def _str_section(self, name): + out = [] + if self[name]: + out += self._str_header(name) + out += [''] + content = textwrap.dedent("\n".join(self[name])).split("\n") + out += content + out += [''] + return out + + def _str_see_also(self, func_role): + out = [] + if self['See Also']: + see_also = super(SphinxDocString, self)._str_see_also(func_role) + out = ['.. seealso::', ''] + out += self._str_indent(see_also[2:]) + return out + + def _str_warnings(self): + out = [] + if self['Warnings']: + out = ['.. warning::', ''] + out += self._str_indent(self['Warnings']) + return out + + def _str_index(self): + idx = self['index'] + out = [] + if len(idx) == 0: + return out + + out += ['.. 
index:: %s' % idx.get('default','')] + for section, references in idx.items(): + if section == 'default': + continue + elif section == 'refguide': + out += [' single: %s' % (', '.join(references))] + else: + out += [' %s: %s' % (section, ','.join(references))] + return out + + def _str_references(self): + out = [] + if self['References']: + out += self._str_header('References') + if isinstance(self['References'], str): + self['References'] = [self['References']] + out.extend(self['References']) + out += [''] + # Latex collects all references to a separate bibliography, + # so we need to insert links to it + if sphinx.__version__ >= "0.6": + out += ['.. only:: latex',''] + else: + out += ['.. latexonly::',''] + items = [] + for line in self['References']: + m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I) + if m: + items.append(m.group(1)) + out += [' ' + ", ".join(["[%s]_" % item for item in items]), ''] + return out + + def _str_examples(self): + examples_str = "\n".join(self['Examples']) + + if (self.use_plots and 'import matplotlib' in examples_str + and 'plot::' not in examples_str): + out = [] + out += self._str_header('Examples') + out += ['.. plot::', ''] + out += self._str_indent(self['Examples']) + out += [''] + return out + else: + return self._str_section('Examples') + + def __str__(self, indent=0, func_role="obj"): + out = [] + out += self._str_signature() + out += self._str_index() + [''] + out += self._str_summary() + out += self._str_extended_summary() + out += self._str_param_list('Parameters') + out += self._str_returns() + for param_list in ('Other Parameters', 'Raises', 'Warns'): + out += self._str_param_list(param_list) + out += self._str_warnings() + out += self._str_see_also(func_role) + out += self._str_section('Notes') + out += self._str_references() + out += self._str_examples() + for param_list in ('Attributes', 'Methods'): + out += self._str_member_list(param_list) + out = self._str_indent(out,indent) + return '\n'.join(out) + +class SphinxFunctionDoc(SphinxDocString, FunctionDoc): + def __init__(self, obj, doc=None, config={}): + self.load_config(config) + FunctionDoc.__init__(self, obj, doc=doc, config=config) + +class SphinxClassDoc(SphinxDocString, ClassDoc): + def __init__(self, obj, doc=None, func_doc=None, config={}): + self.load_config(config) + ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config) + +class SphinxObjDoc(SphinxDocString): + def __init__(self, obj, doc=None, config={}): + self._f = obj + self.load_config(config) + SphinxDocString.__init__(self, doc, config=config) + +def get_doc_object(obj, what=None, doc=None, config={}): + if what is None: + if inspect.isclass(obj): + what = 'class' + elif inspect.ismodule(obj): + what = 'module' + elif isinstance(obj, collections.Callable): + what = 'function' + else: + what = 'object' + if what == 'class': + return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc, + config=config) + elif what in ('function', 'method'): + return SphinxFunctionDoc(obj, doc=doc, config=config) + else: + if doc is None: + doc = pydoc.getdoc(obj) + return SphinxObjDoc(obj, doc, config=config) diff --git a/doc/sphinxext/numpydoc/linkcode.py b/doc/sphinxext/numpydoc/linkcode.py new file mode 100644 index 00000000..1ad3ab82 --- /dev/null +++ b/doc/sphinxext/numpydoc/linkcode.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +""" + linkcode + ~~~~~~~~ + + Add external links to module code in Python object descriptions. + + :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 
+ :license: BSD, see LICENSE for details. + +""" +from __future__ import division, absolute_import, print_function + +import warnings +import collections + +warnings.warn("This extension has been accepted to Sphinx upstream. " + "Use the version from there (Sphinx >= 1.2) " + "https://bitbucket.org/birkenfeld/sphinx/pull-request/47/sphinxextlinkcode", + FutureWarning, stacklevel=1) + + +from docutils import nodes + +from sphinx import addnodes +from sphinx.locale import _ +from sphinx.errors import SphinxError + +class LinkcodeError(SphinxError): + category = "linkcode error" + +def doctree_read(app, doctree): + env = app.builder.env + + resolve_target = getattr(env.config, 'linkcode_resolve', None) + if not isinstance(env.config.linkcode_resolve, collections.Callable): + raise LinkcodeError( + "Function `linkcode_resolve` is not given in conf.py") + + domain_keys = dict( + py=['module', 'fullname'], + c=['names'], + cpp=['names'], + js=['object', 'fullname'], + ) + + for objnode in doctree.traverse(addnodes.desc): + domain = objnode.get('domain') + uris = set() + for signode in objnode: + if not isinstance(signode, addnodes.desc_signature): + continue + + # Convert signode to a specified format + info = {} + for key in domain_keys.get(domain, []): + value = signode.get(key) + if not value: + value = '' + info[key] = value + if not info: + continue + + # Call user code to resolve the link + uri = resolve_target(domain, info) + if not uri: + # no source + continue + + if uri in uris or not uri: + # only one link per name, please + continue + uris.add(uri) + + onlynode = addnodes.only(expr='html') + onlynode += nodes.reference('', '', internal=False, refuri=uri) + onlynode[0] += nodes.inline('', _('[source]'), + classes=['viewcode-link']) + signode += onlynode + +def setup(app): + app.connect('doctree-read', doctree_read) + app.add_config_value('linkcode_resolve', None, '') diff --git a/doc/sphinxext/numpydoc/numpydoc.py b/doc/sphinxext/numpydoc/numpydoc.py new file mode 100755 index 00000000..2bc2d1e9 --- /dev/null +++ b/doc/sphinxext/numpydoc/numpydoc.py @@ -0,0 +1,187 @@ +""" +======== +numpydoc +======== + +Sphinx extension that handles docstrings in the Numpy standard format. [1] + +It will: + +- Convert Parameters etc. sections to field lists. +- Convert See Also section to a See also entry. +- Renumber references. +- Extract the signature from the docstring, if it can't be determined otherwise. + +.. 
[1] https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt + +""" +from __future__ import division, absolute_import, print_function + +import os, sys, re, pydoc +import sphinx +import inspect +import collections + +if sphinx.__version__ < '1.0.1': + raise RuntimeError("Sphinx 1.0.1 or newer is required") + +from .docscrape_sphinx import get_doc_object, SphinxDocString +from sphinx.util.compat import Directive + +if sys.version_info[0] >= 3: + sixu = lambda s: s +else: + sixu = lambda s: unicode(s, 'unicode_escape') + + +def mangle_docstrings(app, what, name, obj, options, lines, + reference_offset=[0]): + + cfg = dict(use_plots=app.config.numpydoc_use_plots, + show_class_members=app.config.numpydoc_show_class_members, + class_members_toctree=app.config.numpydoc_class_members_toctree, + ) + + if what == 'module': + # Strip top title + title_re = re.compile(sixu('^\\s*[#*=]{4,}\\n[a-z0-9 -]+\\n[#*=]{4,}\\s*'), + re.I|re.S) + lines[:] = title_re.sub(sixu(''), sixu("\n").join(lines)).split(sixu("\n")) + else: + doc = get_doc_object(obj, what, sixu("\n").join(lines), config=cfg) + if sys.version_info[0] >= 3: + doc = str(doc) + else: + doc = unicode(doc) + lines[:] = doc.split(sixu("\n")) + + if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ + obj.__name__: + if hasattr(obj, '__module__'): + v = dict(full_name=sixu("%s.%s") % (obj.__module__, obj.__name__)) + else: + v = dict(full_name=obj.__name__) + lines += [sixu(''), sixu('.. htmlonly::'), sixu('')] + lines += [sixu(' %s') % x for x in + (app.config.numpydoc_edit_link % v).split("\n")] + + # replace reference numbers so that there are no duplicates + references = [] + for line in lines: + line = line.strip() + m = re.match(sixu('^.. \\[([a-z0-9_.-])\\]'), line, re.I) + if m: + references.append(m.group(1)) + + # start renaming from the longest string, to avoid overwriting parts + references.sort(key=lambda x: -len(x)) + if references: + for i, line in enumerate(lines): + for r in references: + if re.match(sixu('^\\d+$'), r): + new_r = sixu("R%d") % (reference_offset[0] + int(r)) + else: + new_r = sixu("%s%d") % (r, reference_offset[0]) + lines[i] = lines[i].replace(sixu('[%s]_') % r, + sixu('[%s]_') % new_r) + lines[i] = lines[i].replace(sixu('.. [%s]') % r, + sixu('.. 
[%s]') % new_r) + + reference_offset[0] += len(references) + +def mangle_signature(app, what, name, obj, options, sig, retann): + # Do not try to inspect classes that don't define `__init__` + if (inspect.isclass(obj) and + (not hasattr(obj, '__init__') or + 'initializes x; see ' in pydoc.getdoc(obj.__init__))): + return '', '' + + if not (isinstance(obj, collections.Callable) or hasattr(obj, '__argspec_is_invalid_')): return + if not hasattr(obj, '__doc__'): return + + doc = SphinxDocString(pydoc.getdoc(obj)) + if doc['Signature']: + sig = re.sub(sixu("^[^(]*"), sixu(""), doc['Signature']) + return sig, sixu('') + +def setup(app, get_doc_object_=get_doc_object): + if not hasattr(app, 'add_config_value'): + return # probably called by nose, better bail out + + global get_doc_object + get_doc_object = get_doc_object_ + + app.connect('autodoc-process-docstring', mangle_docstrings) + app.connect('autodoc-process-signature', mangle_signature) + app.add_config_value('numpydoc_edit_link', None, False) + app.add_config_value('numpydoc_use_plots', None, False) + app.add_config_value('numpydoc_show_class_members', True, True) + app.add_config_value('numpydoc_class_members_toctree', True, True) + + # Extra mangling domains + app.add_domain(NumpyPythonDomain) + app.add_domain(NumpyCDomain) + +#------------------------------------------------------------------------------ +# Docstring-mangling domains +#------------------------------------------------------------------------------ + +from docutils.statemachine import ViewList +from sphinx.domains.c import CDomain +from sphinx.domains.python import PythonDomain + +class ManglingDomainBase(object): + directive_mangling_map = {} + + def __init__(self, *a, **kw): + super(ManglingDomainBase, self).__init__(*a, **kw) + self.wrap_mangling_directives() + + def wrap_mangling_directives(self): + for name, objtype in list(self.directive_mangling_map.items()): + self.directives[name] = wrap_mangling_directive( + self.directives[name], objtype) + +class NumpyPythonDomain(ManglingDomainBase, PythonDomain): + name = 'np' + directive_mangling_map = { + 'function': 'function', + 'class': 'class', + 'exception': 'class', + 'method': 'function', + 'classmethod': 'function', + 'staticmethod': 'function', + 'attribute': 'attribute', + } + indices = [] + +class NumpyCDomain(ManglingDomainBase, CDomain): + name = 'np-c' + directive_mangling_map = { + 'function': 'function', + 'member': 'attribute', + 'macro': 'function', + 'type': 'class', + 'var': 'object', + } + +def wrap_mangling_directive(base_directive, objtype): + class directive(base_directive): + def run(self): + env = self.state.document.settings.env + + name = None + if self.arguments: + m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0]) + name = m.group(2).strip() + + if not name: + name = self.arguments[0] + + lines = list(self.content) + mangle_docstrings(env.app, objtype, name, None, None, lines) + self.content = ViewList(lines, self.content.parent) + + return base_directive.run(self) + + return directive diff --git a/doc/sphinxext/numpydoc/phantom_import.py b/doc/sphinxext/numpydoc/phantom_import.py new file mode 100755 index 00000000..9a60b4a3 --- /dev/null +++ b/doc/sphinxext/numpydoc/phantom_import.py @@ -0,0 +1,167 @@ +""" +============== +phantom_import +============== + +Sphinx extension to make directives from ``sphinx.ext.autodoc`` and similar +extensions to use docstrings loaded from an XML file. 
+ +This extension loads an XML file in the Pydocweb format [1] and +creates a dummy module that contains the specified docstrings. This +can be used to get the current docstrings from a Pydocweb instance +without needing to rebuild the documented module. + +.. [1] http://code.google.com/p/pydocweb + +""" +from __future__ import division, absolute_import, print_function + +import imp, sys, compiler, types, os, inspect, re + +def setup(app): + app.connect('builder-inited', initialize) + app.add_config_value('phantom_import_file', None, True) + +def initialize(app): + fn = app.config.phantom_import_file + if (fn and os.path.isfile(fn)): + print("[numpydoc] Phantom importing modules from", fn, "...") + import_phantom_module(fn) + +#------------------------------------------------------------------------------ +# Creating 'phantom' modules from an XML description +#------------------------------------------------------------------------------ +def import_phantom_module(xml_file): + """ + Insert a fake Python module to sys.modules, based on a XML file. + + The XML file is expected to conform to Pydocweb DTD. The fake + module will contain dummy objects, which guarantee the following: + + - Docstrings are correct. + - Class inheritance relationships are correct (if present in XML). + - Function argspec is *NOT* correct (even if present in XML). + Instead, the function signature is prepended to the function docstring. + - Class attributes are *NOT* correct; instead, they are dummy objects. + + Parameters + ---------- + xml_file : str + Name of an XML file to read + + """ + import lxml.etree as etree + + object_cache = {} + + tree = etree.parse(xml_file) + root = tree.getroot() + + # Sort items so that + # - Base classes come before classes inherited from them + # - Modules come before their contents + all_nodes = dict([(n.attrib['id'], n) for n in root]) + + def _get_bases(node, recurse=False): + bases = [x.attrib['ref'] for x in node.findall('base')] + if recurse: + j = 0 + while True: + try: + b = bases[j] + except IndexError: break + if b in all_nodes: + bases.extend(_get_bases(all_nodes[b])) + j += 1 + return bases + + type_index = ['module', 'class', 'callable', 'object'] + + def base_cmp(a, b): + x = cmp(type_index.index(a.tag), type_index.index(b.tag)) + if x != 0: return x + + if a.tag == 'class' and b.tag == 'class': + a_bases = _get_bases(a, recurse=True) + b_bases = _get_bases(b, recurse=True) + x = cmp(len(a_bases), len(b_bases)) + if x != 0: return x + if a.attrib['id'] in b_bases: return -1 + if b.attrib['id'] in a_bases: return 1 + + return cmp(a.attrib['id'].count('.'), b.attrib['id'].count('.')) + + nodes = root.getchildren() + nodes.sort(base_cmp) + + # Create phantom items + for node in nodes: + name = node.attrib['id'] + doc = (node.text or '').decode('string-escape') + "\n" + if doc == "\n": doc = "" + + # create parent, if missing + parent = name + while True: + parent = '.'.join(parent.split('.')[:-1]) + if not parent: break + if parent in object_cache: break + obj = imp.new_module(parent) + object_cache[parent] = obj + sys.modules[parent] = obj + + # create object + if node.tag == 'module': + obj = imp.new_module(name) + obj.__doc__ = doc + sys.modules[name] = obj + elif node.tag == 'class': + bases = [object_cache[b] for b in _get_bases(node) + if b in object_cache] + bases.append(object) + init = lambda self: None + init.__doc__ = doc + obj = type(name, tuple(bases), {'__doc__': doc, '__init__': init}) + obj.__name__ = name.split('.')[-1] + elif node.tag == 'callable': + 
funcname = node.attrib['id'].split('.')[-1] + argspec = node.attrib.get('argspec') + if argspec: + argspec = re.sub('^[^(]*', '', argspec) + doc = "%s%s\n\n%s" % (funcname, argspec, doc) + obj = lambda: 0 + obj.__argspec_is_invalid_ = True + if sys.version_info[0] >= 3: + obj.__name__ = funcname + else: + obj.func_name = funcname + obj.__name__ = name + obj.__doc__ = doc + if inspect.isclass(object_cache[parent]): + obj.__objclass__ = object_cache[parent] + else: + class Dummy(object): pass + obj = Dummy() + obj.__name__ = name + obj.__doc__ = doc + if inspect.isclass(object_cache[parent]): + obj.__get__ = lambda: None + object_cache[name] = obj + + if parent: + if inspect.ismodule(object_cache[parent]): + obj.__module__ = parent + setattr(object_cache[parent], name.split('.')[-1], obj) + + # Populate items + for node in root: + obj = object_cache.get(node.attrib['id']) + if obj is None: continue + for ref in node.findall('ref'): + if node.tag == 'class': + if ref.attrib['ref'].startswith(node.attrib['id'] + '.'): + setattr(obj, ref.attrib['name'], + object_cache.get(ref.attrib['ref'])) + else: + setattr(obj, ref.attrib['name'], + object_cache.get(ref.attrib['ref'])) diff --git a/doc/sphinxext/numpydoc/plot_directive.py b/doc/sphinxext/numpydoc/plot_directive.py new file mode 100755 index 00000000..2014f857 --- /dev/null +++ b/doc/sphinxext/numpydoc/plot_directive.py @@ -0,0 +1,642 @@ +""" +A special directive for generating a matplotlib plot. + +.. warning:: + + This is a hacked version of plot_directive.py from Matplotlib. + It's very much subject to change! + + +Usage +----- + +Can be used like this:: + + .. plot:: examples/example.py + + .. plot:: + + import matplotlib.pyplot as plt + plt.plot([1,2,3], [4,5,6]) + + .. plot:: + + A plotting example: + + >>> import matplotlib.pyplot as plt + >>> plt.plot([1,2,3], [4,5,6]) + +The content is interpreted as doctest formatted if it has a line starting +with ``>>>``. + +The ``plot`` directive supports the options + + format : {'python', 'doctest'} + Specify the format of the input + + include-source : bool + Whether to display the source code. Default can be changed in conf.py + +and the ``image`` directive options ``alt``, ``height``, ``width``, +``scale``, ``align``, ``class``. + +Configuration options +--------------------- + +The plot directive has the following configuration options: + + plot_include_source + Default value for the include-source option + + plot_pre_code + Code that should be executed before each plot. + + plot_basedir + Base directory, to which plot:: file names are relative to. + (If None or empty, file names are relative to the directoly where + the file containing the directive is.) + + plot_formats + File formats to generate. List of tuples or strings:: + + [(suffix, dpi), suffix, ...] + + that determine the file format and the DPI. For entries whose + DPI was omitted, sensible defaults are chosen. + + plot_html_show_formats + Whether to show links to the files in HTML. + +TODO +---- + +* Refactor Latex output; now it's plain images, but it would be nice + to make them appear side-by-side, or in floats. 
+ +""" +from __future__ import division, absolute_import, print_function + +import sys, os, glob, shutil, imp, warnings, re, textwrap, traceback +import sphinx + +if sys.version_info[0] >= 3: + from io import StringIO +else: + from io import StringIO + +import warnings +warnings.warn("A plot_directive module is also available under " + "matplotlib.sphinxext; expect this numpydoc.plot_directive " + "module to be deprecated after relevant features have been " + "integrated there.", + FutureWarning, stacklevel=2) + + +#------------------------------------------------------------------------------ +# Registration hook +#------------------------------------------------------------------------------ + +def setup(app): + setup.app = app + setup.config = app.config + setup.confdir = app.confdir + + app.add_config_value('plot_pre_code', '', True) + app.add_config_value('plot_include_source', False, True) + app.add_config_value('plot_formats', ['png', 'hires.png', 'pdf'], True) + app.add_config_value('plot_basedir', None, True) + app.add_config_value('plot_html_show_formats', True, True) + + app.add_directive('plot', plot_directive, True, (0, 1, False), + **plot_directive_options) + +#------------------------------------------------------------------------------ +# plot:: directive +#------------------------------------------------------------------------------ +from docutils.parsers.rst import directives +from docutils import nodes + +def plot_directive(name, arguments, options, content, lineno, + content_offset, block_text, state, state_machine): + return run(arguments, content, options, state_machine, state, lineno) +plot_directive.__doc__ = __doc__ + +def _option_boolean(arg): + if not arg or not arg.strip(): + # no argument given, assume used as a flag + return True + elif arg.strip().lower() in ('no', '0', 'false'): + return False + elif arg.strip().lower() in ('yes', '1', 'true'): + return True + else: + raise ValueError('"%s" unknown boolean' % arg) + +def _option_format(arg): + return directives.choice(arg, ('python', 'lisp')) + +def _option_align(arg): + return directives.choice(arg, ("top", "middle", "bottom", "left", "center", + "right")) + +plot_directive_options = {'alt': directives.unchanged, + 'height': directives.length_or_unitless, + 'width': directives.length_or_percentage_or_unitless, + 'scale': directives.nonnegative_int, + 'align': _option_align, + 'class': directives.class_option, + 'include-source': _option_boolean, + 'format': _option_format, + } + +#------------------------------------------------------------------------------ +# Generating output +#------------------------------------------------------------------------------ + +from docutils import nodes, utils + +try: + # Sphinx depends on either Jinja or Jinja2 + import jinja2 + def format_template(template, **kw): + return jinja2.Template(template).render(**kw) +except ImportError: + import jinja + def format_template(template, **kw): + return jinja.from_string(template, **kw) + +TEMPLATE = """ +{{ source_code }} + +{{ only_html }} + + {% if source_link or (html_show_formats and not multi_image) %} + ( + {%- if source_link -%} + `Source code <{{ source_link }}>`__ + {%- endif -%} + {%- if html_show_formats and not multi_image -%} + {%- for img in images -%} + {%- for fmt in img.formats -%} + {%- if source_link or not loop.first -%}, {% endif -%} + `{{ fmt }} <{{ dest_dir }}/{{ img.basename }}.{{ fmt }}>`__ + {%- endfor -%} + {%- endfor -%} + {%- endif -%} + ) + {% endif %} + + {% for img in images %} + .. 
figure:: {{ build_dir }}/{{ img.basename }}.png + {%- for option in options %} + {{ option }} + {% endfor %} + + {% if html_show_formats and multi_image -%} + ( + {%- for fmt in img.formats -%} + {%- if not loop.first -%}, {% endif -%} + `{{ fmt }} <{{ dest_dir }}/{{ img.basename }}.{{ fmt }}>`__ + {%- endfor -%} + ) + {%- endif -%} + {% endfor %} + +{{ only_latex }} + + {% for img in images %} + .. image:: {{ build_dir }}/{{ img.basename }}.pdf + {% endfor %} + +""" + +class ImageFile(object): + def __init__(self, basename, dirname): + self.basename = basename + self.dirname = dirname + self.formats = [] + + def filename(self, format): + return os.path.join(self.dirname, "%s.%s" % (self.basename, format)) + + def filenames(self): + return [self.filename(fmt) for fmt in self.formats] + +def run(arguments, content, options, state_machine, state, lineno): + if arguments and content: + raise RuntimeError("plot:: directive can't have both args and content") + + document = state_machine.document + config = document.settings.env.config + + options.setdefault('include-source', config.plot_include_source) + + # determine input + rst_file = document.attributes['source'] + rst_dir = os.path.dirname(rst_file) + + if arguments: + if not config.plot_basedir: + source_file_name = os.path.join(rst_dir, + directives.uri(arguments[0])) + else: + source_file_name = os.path.join(setup.confdir, config.plot_basedir, + directives.uri(arguments[0])) + code = open(source_file_name, 'r').read() + output_base = os.path.basename(source_file_name) + else: + source_file_name = rst_file + code = textwrap.dedent("\n".join(map(str, content))) + counter = document.attributes.get('_plot_counter', 0) + 1 + document.attributes['_plot_counter'] = counter + base, ext = os.path.splitext(os.path.basename(source_file_name)) + output_base = '%s-%d.py' % (base, counter) + + base, source_ext = os.path.splitext(output_base) + if source_ext in ('.py', '.rst', '.txt'): + output_base = base + else: + source_ext = '' + + # ensure that LaTeX includegraphics doesn't choke in foo.bar.pdf filenames + output_base = output_base.replace('.', '-') + + # is it in doctest format? 
+ is_doctest = contains_doctest(code) + if 'format' in options: + if options['format'] == 'python': + is_doctest = False + else: + is_doctest = True + + # determine output directory name fragment + source_rel_name = relpath(source_file_name, setup.confdir) + source_rel_dir = os.path.dirname(source_rel_name) + while source_rel_dir.startswith(os.path.sep): + source_rel_dir = source_rel_dir[1:] + + # build_dir: where to place output files (temporarily) + build_dir = os.path.join(os.path.dirname(setup.app.doctreedir), + 'plot_directive', + source_rel_dir) + if not os.path.exists(build_dir): + os.makedirs(build_dir) + + # output_dir: final location in the builder's directory + dest_dir = os.path.abspath(os.path.join(setup.app.builder.outdir, + source_rel_dir)) + + # how to link to files from the RST file + dest_dir_link = os.path.join(relpath(setup.confdir, rst_dir), + source_rel_dir).replace(os.path.sep, '/') + build_dir_link = relpath(build_dir, rst_dir).replace(os.path.sep, '/') + source_link = dest_dir_link + '/' + output_base + source_ext + + # make figures + try: + results = makefig(code, source_file_name, build_dir, output_base, + config) + errors = [] + except PlotError as err: + reporter = state.memo.reporter + sm = reporter.system_message( + 2, "Exception occurred in plotting %s: %s" % (output_base, err), + line=lineno) + results = [(code, [])] + errors = [sm] + + # generate output restructuredtext + total_lines = [] + for j, (code_piece, images) in enumerate(results): + if options['include-source']: + if is_doctest: + lines = [''] + lines += [row.rstrip() for row in code_piece.split('\n')] + else: + lines = ['.. code-block:: python', ''] + lines += [' %s' % row.rstrip() + for row in code_piece.split('\n')] + source_code = "\n".join(lines) + else: + source_code = "" + + opts = [':%s: %s' % (key, val) for key, val in list(options.items()) + if key in ('alt', 'height', 'width', 'scale', 'align', 'class')] + + only_html = ".. only:: html" + only_latex = ".. 
only:: latex" + + if j == 0: + src_link = source_link + else: + src_link = None + + result = format_template( + TEMPLATE, + dest_dir=dest_dir_link, + build_dir=build_dir_link, + source_link=src_link, + multi_image=len(images) > 1, + only_html=only_html, + only_latex=only_latex, + options=opts, + images=images, + source_code=source_code, + html_show_formats=config.plot_html_show_formats) + + total_lines.extend(result.split("\n")) + total_lines.extend("\n") + + if total_lines: + state_machine.insert_input(total_lines, source=source_file_name) + + # copy image files to builder's output directory + if not os.path.exists(dest_dir): + os.makedirs(dest_dir) + + for code_piece, images in results: + for img in images: + for fn in img.filenames(): + shutil.copyfile(fn, os.path.join(dest_dir, + os.path.basename(fn))) + + # copy script (if necessary) + if source_file_name == rst_file: + target_name = os.path.join(dest_dir, output_base + source_ext) + f = open(target_name, 'w') + f.write(unescape_doctest(code)) + f.close() + + return errors + + +#------------------------------------------------------------------------------ +# Run code and capture figures +#------------------------------------------------------------------------------ + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.image as image +from matplotlib import _pylab_helpers + +import exceptions + +def contains_doctest(text): + try: + # check if it's valid Python as-is + compile(text, '', 'exec') + return False + except SyntaxError: + pass + r = re.compile(r'^\s*>>>', re.M) + m = r.search(text) + return bool(m) + +def unescape_doctest(text): + """ + Extract code from a piece of text, which contains either Python code + or doctests. + + """ + if not contains_doctest(text): + return text + + code = "" + for line in text.split("\n"): + m = re.match(r'^\s*(>>>|\.\.\.) (.*)$', line) + if m: + code += m.group(2) + "\n" + elif line.strip(): + code += "# " + line.strip() + "\n" + else: + code += "\n" + return code + +def split_code_at_show(text): + """ + Split code at plt.show() + + """ + + parts = [] + is_doctest = contains_doctest(text) + + part = [] + for line in text.split("\n"): + if (not is_doctest and line.strip() == 'plt.show()') or \ + (is_doctest and line.strip() == '>>> plt.show()'): + part.append(line) + parts.append("\n".join(part)) + part = [] + else: + part.append(line) + if "\n".join(part).strip(): + parts.append("\n".join(part)) + return parts + +class PlotError(RuntimeError): + pass + +def run_code(code, code_path, ns=None): + # Change the working directory to the directory of the example, so + # it can get at its data files, if any. 
+ pwd = os.getcwd() + old_sys_path = list(sys.path) + if code_path is not None: + dirname = os.path.abspath(os.path.dirname(code_path)) + os.chdir(dirname) + sys.path.insert(0, dirname) + + # Redirect stdout + stdout = sys.stdout + sys.stdout = StringIO() + + # Reset sys.argv + old_sys_argv = sys.argv + sys.argv = [code_path] + + try: + try: + code = unescape_doctest(code) + if ns is None: + ns = {} + if not ns: + exec(setup.config.plot_pre_code, ns) + exec(code, ns) + except (Exception, SystemExit) as err: + raise PlotError(traceback.format_exc()) + finally: + os.chdir(pwd) + sys.argv = old_sys_argv + sys.path[:] = old_sys_path + sys.stdout = stdout + return ns + + +#------------------------------------------------------------------------------ +# Generating figures +#------------------------------------------------------------------------------ + +def out_of_date(original, derived): + """ + Returns True if derivative is out-of-date wrt original, + both of which are full file paths. + """ + return (not os.path.exists(derived) + or os.stat(derived).st_mtime < os.stat(original).st_mtime) + + +def makefig(code, code_path, output_dir, output_base, config): + """ + Run a pyplot script *code* and save the images under *output_dir* + with file names derived from *output_base* + + """ + + # -- Parse format list + default_dpi = {'png': 80, 'hires.png': 200, 'pdf': 50} + formats = [] + for fmt in config.plot_formats: + if isinstance(fmt, str): + formats.append((fmt, default_dpi.get(fmt, 80))) + elif type(fmt) in (tuple, list) and len(fmt)==2: + formats.append((str(fmt[0]), int(fmt[1]))) + else: + raise PlotError('invalid image format "%r" in plot_formats' % fmt) + + # -- Try to determine if all images already exist + + code_pieces = split_code_at_show(code) + + # Look for single-figure output files first + all_exists = True + img = ImageFile(output_base, output_dir) + for format, dpi in formats: + if out_of_date(code_path, img.filename(format)): + all_exists = False + break + img.formats.append(format) + + if all_exists: + return [(code, [img])] + + # Then look for multi-figure output files + results = [] + all_exists = True + for i, code_piece in enumerate(code_pieces): + images = [] + for j in range(1000): + img = ImageFile('%s_%02d_%02d' % (output_base, i, j), output_dir) + for format, dpi in formats: + if out_of_date(code_path, img.filename(format)): + all_exists = False + break + img.formats.append(format) + + # assume that if we have one, we have them all + if not all_exists: + all_exists = (j > 0) + break + images.append(img) + if not all_exists: + break + results.append((code_piece, images)) + + if all_exists: + return results + + # -- We didn't find the files, so build them + + results = [] + ns = {} + + for i, code_piece in enumerate(code_pieces): + # Clear between runs + plt.close('all') + + # Run code + run_code(code_piece, code_path, ns) + + # Collect images + images = [] + fig_managers = _pylab_helpers.Gcf.get_all_fig_managers() + for j, figman in enumerate(fig_managers): + if len(fig_managers) == 1 and len(code_pieces) == 1: + img = ImageFile(output_base, output_dir) + else: + img = ImageFile("%s_%02d_%02d" % (output_base, i, j), + output_dir) + images.append(img) + for format, dpi in formats: + try: + figman.canvas.figure.savefig(img.filename(format), dpi=dpi) + except exceptions.BaseException as err: + raise PlotError(traceback.format_exc()) + img.formats.append(format) + + # Results + results.append((code_piece, images)) + + return results + + 
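The contains_doctest() helper above drives both the rendering decision in run() (``is_doctest``) and unescape_doctest(): a ``plot::`` body that is not valid Python on its own but contains a ``>>>`` prompt is echoed verbatim as a doctest, anything else is wrapped in a ``.. code-block:: python``. A minimal, self-contained sketch of that heuristic, using only the standard library (``looks_like_doctest`` is an illustrative name, not part of the module)::

    import re

    def looks_like_doctest(text):
        # Mirrors the contains_doctest() heuristic: source that fails to
        # compile as plain Python but contains a '>>>' prompt is treated
        # as doctest-formatted input.
        try:
            compile(text, '<plot-directive>', 'exec')
            return False  # compiles as-is -> ordinary Python source
        except SyntaxError:
            pass
        return bool(re.search(r'^\s*>>>', text, re.M))

    plain = "import matplotlib.pyplot as plt\nplt.plot([1, 2, 3], [4, 5, 6])\n"
    dt = ">>> import matplotlib.pyplot as plt\n>>> plt.plot([1, 2, 3], [4, 5, 6])\n"

    print(looks_like_doctest(plain))  # False -> rendered as a code-block
    print(looks_like_doctest(dt))     # True  -> rendered as a doctest
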
+#------------------------------------------------------------------------------ +# Relative pathnames +#------------------------------------------------------------------------------ + +try: + from os.path import relpath +except ImportError: + # Copied from Python 2.7 + if 'posix' in sys.builtin_module_names: + def relpath(path, start=os.path.curdir): + """Return a relative version of a path""" + from os.path import sep, curdir, join, abspath, commonprefix, \ + pardir + + if not path: + raise ValueError("no path specified") + + start_list = abspath(start).split(sep) + path_list = abspath(path).split(sep) + + # Work out how much of the filepath is shared by start and path. + i = len(commonprefix([start_list, path_list])) + + rel_list = [pardir] * (len(start_list)-i) + path_list[i:] + if not rel_list: + return curdir + return join(*rel_list) + elif 'nt' in sys.builtin_module_names: + def relpath(path, start=os.path.curdir): + """Return a relative version of a path""" + from os.path import sep, curdir, join, abspath, commonprefix, \ + pardir, splitunc + + if not path: + raise ValueError("no path specified") + start_list = abspath(start).split(sep) + path_list = abspath(path).split(sep) + if start_list[0].lower() != path_list[0].lower(): + unc_path, rest = splitunc(path) + unc_start, rest = splitunc(start) + if bool(unc_path) ^ bool(unc_start): + raise ValueError("Cannot mix UNC and non-UNC paths (%s and %s)" + % (path, start)) + else: + raise ValueError("path is on drive %s, start on drive %s" + % (path_list[0], start_list[0])) + # Work out how much of the filepath is shared by start and path. + for i in range(min(len(start_list), len(path_list))): + if start_list[i].lower() != path_list[i].lower(): + break + else: + i += 1 + + rel_list = [pardir] * (len(start_list)-i) + path_list[i:] + if not rel_list: + return curdir + return join(*rel_list) + else: + raise RuntimeError("Unsupported platform (no relpath available!)") diff --git a/doc/sphinxext/numpydoc/tests/test_docscrape.py b/doc/sphinxext/numpydoc/tests/test_docscrape.py new file mode 100755 index 00000000..b682504e --- /dev/null +++ b/doc/sphinxext/numpydoc/tests/test_docscrape.py @@ -0,0 +1,767 @@ +# -*- encoding:utf-8 -*- +from __future__ import division, absolute_import, print_function + +import sys, textwrap + +from numpydoc.docscrape import NumpyDocString, FunctionDoc, ClassDoc +from numpydoc.docscrape_sphinx import SphinxDocString, SphinxClassDoc +from nose.tools import * + +if sys.version_info[0] >= 3: + sixu = lambda s: s +else: + sixu = lambda s: unicode(s, 'unicode_escape') + + +doc_txt = '''\ + numpy.multivariate_normal(mean, cov, shape=None, spam=None) + + Draw values from a multivariate normal distribution with specified + mean and covariance. + + The multivariate normal or Gaussian distribution is a generalisation + of the one-dimensional normal distribution to higher dimensions. + + Parameters + ---------- + mean : (N,) ndarray + Mean of the N-dimensional distribution. + + .. math:: + + (1+2+3)/3 + + cov : (N, N) ndarray + Covariance matrix of the distribution. + shape : tuple of ints + Given a shape of, for example, (m,n,k), m*n*k samples are + generated, and packed in an m-by-n-by-k arrangement. Because + each sample is N-dimensional, the output shape is (m,n,k,N). + + Returns + ------- + out : ndarray + The drawn samples, arranged according to `shape`. If the + shape given is (m,n,...), then the shape of `out` is is + (m,n,...,N). 
+ + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + list of str + This is not a real return value. It exists to test + anonymous return values. + + Other Parameters + ---------------- + spam : parrot + A parrot off its mortal coil. + + Raises + ------ + RuntimeError + Some error + + Warns + ----- + RuntimeWarning + Some warning + + Warnings + -------- + Certain warnings apply. + + Notes + ----- + Instead of specifying the full covariance matrix, popular + approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements only on the diagonal) + + This geometrical property can be seen in two dimensions by plotting + generated data-points: + + >>> mean = [0,0] + >>> cov = [[1,0],[0,100]] # diagonal covariance, points lie on x or y-axis + + >>> x,y = multivariate_normal(mean,cov,5000).T + >>> plt.plot(x,y,'x'); plt.axis('equal'); plt.show() + + Note that the covariance matrix must be symmetric and non-negative + definite. + + References + ---------- + .. [1] A. Papoulis, "Probability, Random Variables, and Stochastic + Processes," 3rd ed., McGraw-Hill Companies, 1991 + .. [2] R.O. Duda, P.E. Hart, and D.G. Stork, "Pattern Classification," + 2nd ed., Wiley, 2001. + + See Also + -------- + some, other, funcs + otherfunc : relationship + + Examples + -------- + >>> mean = (1,2) + >>> cov = [[1,0],[1,0]] + >>> x = multivariate_normal(mean,cov,(3,3)) + >>> print x.shape + (3, 3, 2) + + The following is probably true, given that 0.6 is roughly twice the + standard deviation: + + >>> print list( (x[0,0,:] - mean) < 0.6 ) + [True, True] + + .. index:: random + :refguide: random;distributions, random;gauss + + ''' +doc = NumpyDocString(doc_txt) + + +def test_signature(): + assert doc['Signature'].startswith('numpy.multivariate_normal(') + assert doc['Signature'].endswith('spam=None)') + +def test_summary(): + assert doc['Summary'][0].startswith('Draw values') + assert doc['Summary'][-1].endswith('covariance.') + +def test_extended_summary(): + assert doc['Extended Summary'][0].startswith('The multivariate normal') + +def test_parameters(): + assert_equal(len(doc['Parameters']), 3) + assert_equal([n for n,_,_ in doc['Parameters']], ['mean','cov','shape']) + + arg, arg_type, desc = doc['Parameters'][1] + assert_equal(arg_type, '(N, N) ndarray') + assert desc[0].startswith('Covariance matrix') + assert doc['Parameters'][0][-1][-2] == ' (1+2+3)/3' + +def test_other_parameters(): + assert_equal(len(doc['Other Parameters']), 1) + assert_equal([n for n,_,_ in doc['Other Parameters']], ['spam']) + arg, arg_type, desc = doc['Other Parameters'][0] + assert_equal(arg_type, 'parrot') + assert desc[0].startswith('A parrot off its mortal coil') + +def test_returns(): + assert_equal(len(doc['Returns']), 2) + arg, arg_type, desc = doc['Returns'][0] + assert_equal(arg, 'out') + assert_equal(arg_type, 'ndarray') + assert desc[0].startswith('The drawn samples') + assert desc[-1].endswith('distribution.') + + arg, arg_type, desc = doc['Returns'][1] + assert_equal(arg, 'list of str') + assert_equal(arg_type, '') + assert desc[0].startswith('This is not a real') + assert desc[-1].endswith('anonymous return values.') + +def test_notes(): + assert doc['Notes'][0].startswith('Instead') + assert doc['Notes'][-1].endswith('definite.') + assert_equal(len(doc['Notes']), 17) + +def test_references(): + assert doc['References'][0].startswith('..') + assert 
doc['References'][-1].endswith('2001.') + +def test_examples(): + assert doc['Examples'][0].startswith('>>>') + assert doc['Examples'][-1].endswith('True]') + +def test_index(): + assert_equal(doc['index']['default'], 'random') + assert_equal(len(doc['index']), 2) + assert_equal(len(doc['index']['refguide']), 2) + +def non_blank_line_by_line_compare(a,b): + a = textwrap.dedent(a) + b = textwrap.dedent(b) + a = [l.rstrip() for l in a.split('\n') if l.strip()] + b = [l.rstrip() for l in b.split('\n') if l.strip()] + for n,line in enumerate(a): + if not line == b[n]: + raise AssertionError("Lines %s of a and b differ: " + "\n>>> %s\n<<< %s\n" % + (n,line,b[n])) +def test_str(): + non_blank_line_by_line_compare(str(doc), +"""numpy.multivariate_normal(mean, cov, shape=None, spam=None) + +Draw values from a multivariate normal distribution with specified +mean and covariance. + +The multivariate normal or Gaussian distribution is a generalisation +of the one-dimensional normal distribution to higher dimensions. + +Parameters +---------- +mean : (N,) ndarray + Mean of the N-dimensional distribution. + + .. math:: + + (1+2+3)/3 + +cov : (N, N) ndarray + Covariance matrix of the distribution. +shape : tuple of ints + Given a shape of, for example, (m,n,k), m*n*k samples are + generated, and packed in an m-by-n-by-k arrangement. Because + each sample is N-dimensional, the output shape is (m,n,k,N). + +Returns +------- +out : ndarray + The drawn samples, arranged according to `shape`. If the + shape given is (m,n,...), then the shape of `out` is is + (m,n,...,N). + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. +list of str + This is not a real return value. It exists to test + anonymous return values. + +Other Parameters +---------------- +spam : parrot + A parrot off its mortal coil. + +Raises +------ +RuntimeError + Some error + +Warns +----- +RuntimeWarning + Some warning + +Warnings +-------- +Certain warnings apply. + +See Also +-------- +`some`_, `other`_, `funcs`_ + +`otherfunc`_ + relationship + +Notes +----- +Instead of specifying the full covariance matrix, popular +approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements only on the diagonal) + +This geometrical property can be seen in two dimensions by plotting +generated data-points: + +>>> mean = [0,0] +>>> cov = [[1,0],[0,100]] # diagonal covariance, points lie on x or y-axis + +>>> x,y = multivariate_normal(mean,cov,5000).T +>>> plt.plot(x,y,'x'); plt.axis('equal'); plt.show() + +Note that the covariance matrix must be symmetric and non-negative +definite. + +References +---------- +.. [1] A. Papoulis, "Probability, Random Variables, and Stochastic + Processes," 3rd ed., McGraw-Hill Companies, 1991 +.. [2] R.O. Duda, P.E. Hart, and D.G. Stork, "Pattern Classification," + 2nd ed., Wiley, 2001. + +Examples +-------- +>>> mean = (1,2) +>>> cov = [[1,0],[1,0]] +>>> x = multivariate_normal(mean,cov,(3,3)) +>>> print x.shape +(3, 3, 2) + +The following is probably true, given that 0.6 is roughly twice the +standard deviation: + +>>> print list( (x[0,0,:] - mean) < 0.6 ) +[True, True] + +.. index:: random + :refguide: random;distributions, random;gauss""") + + +def test_sphinx_str(): + sphinx_doc = SphinxDocString(doc_txt) + non_blank_line_by_line_compare(str(sphinx_doc), +""" +.. 
index:: random + single: random;distributions, random;gauss + +Draw values from a multivariate normal distribution with specified +mean and covariance. + +The multivariate normal or Gaussian distribution is a generalisation +of the one-dimensional normal distribution to higher dimensions. + +:Parameters: + + **mean** : (N,) ndarray + + Mean of the N-dimensional distribution. + + .. math:: + + (1+2+3)/3 + + **cov** : (N, N) ndarray + + Covariance matrix of the distribution. + + **shape** : tuple of ints + + Given a shape of, for example, (m,n,k), m*n*k samples are + generated, and packed in an m-by-n-by-k arrangement. Because + each sample is N-dimensional, the output shape is (m,n,k,N). + +:Returns: + + **out** : ndarray + + The drawn samples, arranged according to `shape`. If the + shape given is (m,n,...), then the shape of `out` is is + (m,n,...,N). + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + + list of str + + This is not a real return value. It exists to test + anonymous return values. + +:Other Parameters: + + **spam** : parrot + + A parrot off its mortal coil. + +:Raises: + + **RuntimeError** + + Some error + +:Warns: + + **RuntimeWarning** + + Some warning + +.. warning:: + + Certain warnings apply. + +.. seealso:: + + :obj:`some`, :obj:`other`, :obj:`funcs` + + :obj:`otherfunc` + relationship + +.. rubric:: Notes + +Instead of specifying the full covariance matrix, popular +approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements only on the diagonal) + +This geometrical property can be seen in two dimensions by plotting +generated data-points: + +>>> mean = [0,0] +>>> cov = [[1,0],[0,100]] # diagonal covariance, points lie on x or y-axis + +>>> x,y = multivariate_normal(mean,cov,5000).T +>>> plt.plot(x,y,'x'); plt.axis('equal'); plt.show() + +Note that the covariance matrix must be symmetric and non-negative +definite. + +.. rubric:: References + +.. [1] A. Papoulis, "Probability, Random Variables, and Stochastic + Processes," 3rd ed., McGraw-Hill Companies, 1991 +.. [2] R.O. Duda, P.E. Hart, and D.G. Stork, "Pattern Classification," + 2nd ed., Wiley, 2001. + +.. only:: latex + + [1]_, [2]_ + +.. rubric:: Examples + +>>> mean = (1,2) +>>> cov = [[1,0],[1,0]] +>>> x = multivariate_normal(mean,cov,(3,3)) +>>> print x.shape +(3, 3, 2) + +The following is probably true, given that 0.6 is roughly twice the +standard deviation: + +>>> print list( (x[0,0,:] - mean) < 0.6 ) +[True, True] +""") + + +doc2 = NumpyDocString(""" + Returns array of indices of the maximum values of along the given axis. + + Parameters + ---------- + a : {array_like} + Array to look in. + axis : {None, integer} + If None, the index is into the flattened array, otherwise along + the specified axis""") + +def test_parameters_without_extended_description(): + assert_equal(len(doc2['Parameters']), 2) + +doc3 = NumpyDocString(""" + my_signature(*params, **kwds) + + Return this and that. + """) + +def test_escape_stars(): + signature = str(doc3).split('\n')[0] + assert_equal(signature, 'my_signature(\*params, \*\*kwds)') + +doc4 = NumpyDocString( + """a.conj() + + Return an array with all complex-valued elements conjugated.""") + +def test_empty_extended_summary(): + assert_equal(doc4['Extended Summary'], []) + +doc5 = NumpyDocString( + """ + a.something() + + Raises + ------ + LinAlgException + If array is singular. 
+ + Warns + ----- + SomeWarning + If needed + """) + +def test_raises(): + assert_equal(len(doc5['Raises']), 1) + name,_,desc = doc5['Raises'][0] + assert_equal(name,'LinAlgException') + assert_equal(desc,['If array is singular.']) + +def test_warns(): + assert_equal(len(doc5['Warns']), 1) + name,_,desc = doc5['Warns'][0] + assert_equal(name,'SomeWarning') + assert_equal(desc,['If needed']) + +def test_see_also(): + doc6 = NumpyDocString( + """ + z(x,theta) + + See Also + -------- + func_a, func_b, func_c + func_d : some equivalent func + foo.func_e : some other func over + multiple lines + func_f, func_g, :meth:`func_h`, func_j, + func_k + :obj:`baz.obj_q` + :class:`class_j`: fubar + foobar + """) + + assert len(doc6['See Also']) == 12 + for func, desc, role in doc6['See Also']: + if func in ('func_a', 'func_b', 'func_c', 'func_f', + 'func_g', 'func_h', 'func_j', 'func_k', 'baz.obj_q'): + assert(not desc) + else: + assert(desc) + + if func == 'func_h': + assert role == 'meth' + elif func == 'baz.obj_q': + assert role == 'obj' + elif func == 'class_j': + assert role == 'class' + else: + assert role is None + + if func == 'func_d': + assert desc == ['some equivalent func'] + elif func == 'foo.func_e': + assert desc == ['some other func over', 'multiple lines'] + elif func == 'class_j': + assert desc == ['fubar', 'foobar'] + +def test_see_also_print(): + class Dummy(object): + """ + See Also + -------- + func_a, func_b + func_c : some relationship + goes here + func_d + """ + pass + + obj = Dummy() + s = str(FunctionDoc(obj, role='func')) + assert(':func:`func_a`, :func:`func_b`' in s) + assert(' some relationship' in s) + assert(':func:`func_d`' in s) + +doc7 = NumpyDocString(""" + + Doc starts on second line. + + """) + +def test_empty_first_line(): + assert doc7['Summary'][0].startswith('Doc starts') + + +def test_no_summary(): + str(SphinxDocString(""" + Parameters + ----------""")) + + +def test_unicode(): + doc = SphinxDocString(""" + öäöäöäöäöåååå + + öäöäöäööäååå + + Parameters + ---------- + ååå : äää + ööö + + Returns + ------- + ååå : ööö + äää + + """) + assert isinstance(doc['Summary'][0], str) + assert doc['Summary'][0] == 'öäöäöäöäöåååå' + +def test_plot_examples(): + cfg = dict(use_plots=True) + + doc = SphinxDocString(""" + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> plt.plot([1,2,3],[4,5,6]) + >>> plt.show() + """, config=cfg) + assert 'plot::' in str(doc), str(doc) + + doc = SphinxDocString(""" + Examples + -------- + .. plot:: + + import matplotlib.pyplot as plt + plt.plot([1,2,3],[4,5,6]) + plt.show() + """, config=cfg) + assert str(doc).count('plot::') == 1, str(doc) + +def test_class_members(): + + class Dummy(object): + """ + Dummy class. 
+ + """ + def spam(self, a, b): + """Spam\n\nSpam spam.""" + pass + def ham(self, c, d): + """Cheese\n\nNo cheese.""" + pass + @property + def spammity(self): + """Spammity index""" + return 0.95 + + class Ignorable(object): + """local class, to be ignored""" + pass + + for cls in (ClassDoc, SphinxClassDoc): + doc = cls(Dummy, config=dict(show_class_members=False)) + assert 'Methods' not in str(doc), (cls, str(doc)) + assert 'spam' not in str(doc), (cls, str(doc)) + assert 'ham' not in str(doc), (cls, str(doc)) + assert 'spammity' not in str(doc), (cls, str(doc)) + assert 'Spammity index' not in str(doc), (cls, str(doc)) + + doc = cls(Dummy, config=dict(show_class_members=True)) + assert 'Methods' in str(doc), (cls, str(doc)) + assert 'spam' in str(doc), (cls, str(doc)) + assert 'ham' in str(doc), (cls, str(doc)) + assert 'spammity' in str(doc), (cls, str(doc)) + + if cls is SphinxClassDoc: + assert '.. autosummary::' in str(doc), str(doc) + else: + assert 'Spammity index' in str(doc), str(doc) + +def test_duplicate_signature(): + # Duplicate function signatures occur e.g. in ufuncs, when the + # automatic mechanism adds one, and a more detailed comes from the + # docstring itself. + + doc = NumpyDocString( + """ + z(x1, x2) + + z(a, theta) + """) + + assert doc['Signature'].strip() == 'z(a, theta)' + + +class_doc_txt = """ + Foo + + Parameters + ---------- + f : callable ``f(t, y, *f_args)`` + Aaa. + jac : callable ``jac(t, y, *jac_args)`` + Bbb. + + Attributes + ---------- + t : float + Current time. + y : ndarray + Current variable values. + + Methods + ------- + a + b + c + + Examples + -------- + For usage examples, see `ode`. +""" + +def test_class_members_doc(): + doc = ClassDoc(None, class_doc_txt) + non_blank_line_by_line_compare(str(doc), + """ + Foo + + Parameters + ---------- + f : callable ``f(t, y, *f_args)`` + Aaa. + jac : callable ``jac(t, y, *jac_args)`` + Bbb. + + Examples + -------- + For usage examples, see `ode`. + + Attributes + ---------- + t : float + Current time. + y : ndarray + Current variable values. + + Methods + ------- + a + + b + + c + + .. index:: + + """) + +def test_class_members_doc_sphinx(): + doc = SphinxClassDoc(None, class_doc_txt) + non_blank_line_by_line_compare(str(doc), + """ + Foo + + :Parameters: + + **f** : callable ``f(t, y, *f_args)`` + + Aaa. + + **jac** : callable ``jac(t, y, *jac_args)`` + + Bbb. + + .. rubric:: Examples + + For usage examples, see `ode`. + + .. rubric:: Attributes + + === ========== + t (float) Current time. + y (ndarray) Current variable values. + === ========== + + .. rubric:: Methods + + === ========== + a + b + c + === ========== + + """) + +if __name__ == "__main__": + import nose + nose.run() diff --git a/doc/sphinxext/numpydoc/tests/test_linkcode.py b/doc/sphinxext/numpydoc/tests/test_linkcode.py new file mode 100644 index 00000000..340166a4 --- /dev/null +++ b/doc/sphinxext/numpydoc/tests/test_linkcode.py @@ -0,0 +1,5 @@ +from __future__ import division, absolute_import, print_function + +import numpydoc.linkcode + +# No tests at the moment... diff --git a/doc/sphinxext/numpydoc/tests/test_phantom_import.py b/doc/sphinxext/numpydoc/tests/test_phantom_import.py new file mode 100644 index 00000000..173b5662 --- /dev/null +++ b/doc/sphinxext/numpydoc/tests/test_phantom_import.py @@ -0,0 +1,5 @@ +from __future__ import division, absolute_import, print_function + +import numpydoc.phantom_import + +# No tests at the moment... 
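The test_docscrape.py cases above drive the vendored numpydoc parser purely through dict-style section access on a NumpyDocString. As a minimal illustrative sketch of that same usage outside the test suite, with a made-up docstring (and assuming the vendored package is importable as ``numpydoc``, as the neighbouring test modules assume)::

    from numpydoc.docscrape import NumpyDocString

    example = NumpyDocString("""
        Add two numbers.

        Parameters
        ----------
        a : int
            First addend.
        b : int
            Second addend.

        Returns
        -------
        int
            The sum of ``a`` and ``b``.
        """)

    # Sections behave like lists; each Parameters entry is a
    # (name, type, description-lines) tuple, exactly as the tests above index them.
    print([name for name, _, _ in example['Parameters']])   # ['a', 'b']
    print(example['Returns'][0])                             # unnamed return, empty type field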
diff --git a/doc/sphinxext/numpydoc/tests/test_plot_directive.py b/doc/sphinxext/numpydoc/tests/test_plot_directive.py new file mode 100644 index 00000000..0e511fcb --- /dev/null +++ b/doc/sphinxext/numpydoc/tests/test_plot_directive.py @@ -0,0 +1,5 @@ +from __future__ import division, absolute_import, print_function + +import numpydoc.plot_directive + +# No tests at the moment... diff --git a/doc/sphinxext/numpydoc/tests/test_traitsdoc.py b/doc/sphinxext/numpydoc/tests/test_traitsdoc.py new file mode 100644 index 00000000..d36e5ddb --- /dev/null +++ b/doc/sphinxext/numpydoc/tests/test_traitsdoc.py @@ -0,0 +1,5 @@ +from __future__ import division, absolute_import, print_function + +import numpydoc.traitsdoc + +# No tests at the moment... diff --git a/doc/sphinxext/numpydoc/traitsdoc.py b/doc/sphinxext/numpydoc/traitsdoc.py new file mode 100755 index 00000000..596c54eb --- /dev/null +++ b/doc/sphinxext/numpydoc/traitsdoc.py @@ -0,0 +1,142 @@ +""" +========= +traitsdoc +========= + +Sphinx extension that handles docstrings in the Numpy standard format, [1] +and support Traits [2]. + +This extension can be used as a replacement for ``numpydoc`` when support +for Traits is required. + +.. [1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard +.. [2] http://code.enthought.com/projects/traits/ + +""" +from __future__ import division, absolute_import, print_function + +import inspect +import os +import pydoc +import collections + +from . import docscrape +from . import docscrape_sphinx +from .docscrape_sphinx import SphinxClassDoc, SphinxFunctionDoc, SphinxDocString + +from . import numpydoc + +from . import comment_eater + +class SphinxTraitsDoc(SphinxClassDoc): + def __init__(self, cls, modulename='', func_doc=SphinxFunctionDoc): + if not inspect.isclass(cls): + raise ValueError("Initialise using a class. Got %r" % cls) + self._cls = cls + + if modulename and not modulename.endswith('.'): + modulename += '.' 
+ self._mod = modulename + self._name = cls.__name__ + self._func_doc = func_doc + + docstring = pydoc.getdoc(cls) + docstring = docstring.split('\n') + + # De-indent paragraph + try: + indent = min(len(s) - len(s.lstrip()) for s in docstring + if s.strip()) + except ValueError: + indent = 0 + + for n,line in enumerate(docstring): + docstring[n] = docstring[n][indent:] + + self._doc = docscrape.Reader(docstring) + self._parsed_data = { + 'Signature': '', + 'Summary': '', + 'Description': [], + 'Extended Summary': [], + 'Parameters': [], + 'Returns': [], + 'Raises': [], + 'Warns': [], + 'Other Parameters': [], + 'Traits': [], + 'Methods': [], + 'See Also': [], + 'Notes': [], + 'References': '', + 'Example': '', + 'Examples': '', + 'index': {} + } + + self._parse() + + def _str_summary(self): + return self['Summary'] + [''] + + def _str_extended_summary(self): + return self['Description'] + self['Extended Summary'] + [''] + + def __str__(self, indent=0, func_role="func"): + out = [] + out += self._str_signature() + out += self._str_index() + [''] + out += self._str_summary() + out += self._str_extended_summary() + for param_list in ('Parameters', 'Traits', 'Methods', + 'Returns','Raises'): + out += self._str_param_list(param_list) + out += self._str_see_also("obj") + out += self._str_section('Notes') + out += self._str_references() + out += self._str_section('Example') + out += self._str_section('Examples') + out = self._str_indent(out,indent) + return '\n'.join(out) + +def looks_like_issubclass(obj, classname): + """ Return True if the object has a class or superclass with the given class + name. + + Ignores old-style classes. + """ + t = obj + if t.__name__ == classname: + return True + for klass in t.__mro__: + if klass.__name__ == classname: + return True + return False + +def get_doc_object(obj, what=None, config=None): + if what is None: + if inspect.isclass(obj): + what = 'class' + elif inspect.ismodule(obj): + what = 'module' + elif isinstance(obj, collections.Callable): + what = 'function' + else: + what = 'object' + if what == 'class': + doc = SphinxTraitsDoc(obj, '', func_doc=SphinxFunctionDoc, config=config) + if looks_like_issubclass(obj, 'HasTraits'): + for name, trait, comment in comment_eater.get_class_traits(obj): + # Exclude private traits. 
+ if not name.startswith('_'): + doc['Traits'].append((name, trait, comment.splitlines())) + return doc + elif what in ('function', 'method'): + return SphinxFunctionDoc(obj, '', config=config) + else: + return SphinxDocString(pydoc.getdoc(obj), config=config) + +def setup(app): + # init numpydoc + numpydoc.setup(app, get_doc_object) + diff --git a/examples/data/SOURCES b/examples/data/SOURCES new file mode 100644 index 00000000..e69de29b diff --git a/examples/finance.py b/examples/finance.py new file mode 100644 index 00000000..91ac57f6 --- /dev/null +++ b/examples/finance.py @@ -0,0 +1,86 @@ +""" +Some examples playing around with yahoo finance data +""" + +from datetime import datetime +from pandas.compat import zip + +import matplotlib.finance as fin +import numpy as np +from pylab import show + + +from pandas import Index, DataFrame +from pandas.core.datetools import BMonthEnd +from pandas import ols + +startDate = datetime(2008, 1, 1) +endDate = datetime(2009, 9, 1) + + +def getQuotes(symbol, start, end): + quotes = fin.quotes_historical_yahoo(symbol, start, end) + dates, open, close, high, low, volume = zip(*quotes) + + data = { + 'open': open, + 'close': close, + 'high': high, + 'low': low, + 'volume': volume + } + + dates = Index([datetime.fromordinal(int(d)) for d in dates]) + return DataFrame(data, index=dates) + +msft = getQuotes('MSFT', startDate, endDate) +aapl = getQuotes('AAPL', startDate, endDate) +goog = getQuotes('GOOG', startDate, endDate) +ibm = getQuotes('IBM', startDate, endDate) + +px = DataFrame({'MSFT': msft['close'], + 'IBM': ibm['close'], + 'GOOG': goog['close'], + 'AAPL': aapl['close']}) +returns = px / px.shift(1) - 1 + +# Select dates + +subIndex = ibm.index[(ibm['close'] > 95) & (ibm['close'] < 100)] +msftOnSameDates = msft.reindex(subIndex) + +# Insert columns + +msft['hi-lo spread'] = msft['high'] - msft['low'] +ibm['hi-lo spread'] = ibm['high'] - ibm['low'] + +# Aggregate monthly + + +def toMonthly(frame, how): + offset = BMonthEnd() + + return frame.groupby(offset.rollforward).aggregate(how) + +msftMonthly = toMonthly(msft, np.mean) +ibmMonthly = toMonthly(ibm, np.mean) + +# Statistics + +stdev = DataFrame({ + 'MSFT': msft.std(), + 'IBM': ibm.std() +}) + +# Arithmetic + +ratios = ibm / msft + +# Works with different indices + +ratio = ibm / ibmMonthly +monthlyRatio = ratio.reindex(ibmMonthly.index) + +# Ratio relative to past month average + +filledRatio = ibm / ibmMonthly.reindex(ibm.index, method='pad') diff --git a/examples/regressions.py b/examples/regressions.py new file mode 100644 index 00000000..bc58408a --- /dev/null +++ b/examples/regressions.py @@ -0,0 +1,51 @@ +from datetime import datetime +import string + +import numpy as np + +from pandas.core.api import Series, DataFrame, DatetimeIndex +from pandas.stats.api import ols + +N = 100 + +start = datetime(2009, 9, 2) +dateRange = DatetimeIndex(start, periods=N) + + +def makeDataFrame(): + data = DataFrame(np.random.randn(N, 7), + columns=list(string.ascii_uppercase[:7]), + index=dateRange) + + return data + + +def makeSeries(): + return Series(np.random.randn(N), index=dateRange) + +#------------------------------------------------------------------------------- +# Standard rolling linear regression + +X = makeDataFrame() +Y = makeSeries() + +model = ols(y=Y, x=X) + +print(model) + +#------------------------------------------------------------------------------- +# Panel regression + +data = { + 'A': makeDataFrame(), + 'B': makeDataFrame(), + 'C': makeDataFrame() +} + +Y = makeDataFrame() + 
+panelModel = ols(y=Y, x=data, window=50) + +model = ols(y=Y, x=data) + +print(panelModel) diff --git a/ez_setup.py b/ez_setup.py new file mode 100644 index 00000000..6f63b856 --- /dev/null +++ b/ez_setup.py @@ -0,0 +1,264 @@ +#!python +"""Bootstrap setuptools installation + +If you want to use setuptools in your package's setup.py, just include this +file in the same directory with it, and add this to the top of your setup.py:: + + from ez_setup import use_setuptools + use_setuptools() + +If you want to require a specific version of setuptools, set a download +mirror, or use an alternate download directory, you can do so by supplying +the appropriate options to ``use_setuptools()``. + +This file can also be run as a script to install or upgrade setuptools. +""" +from __future__ import print_function +import sys +DEFAULT_VERSION = "0.6c11" +DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[ + :3] + +md5_data = { + 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', + 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', + 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', + 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', + 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', + 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', + 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', + 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', + 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', + 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', + 'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090', + 'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4', + 'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7', + 'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5', + 'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de', + 'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b', + 'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2', + 'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086', + 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', + 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', + 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', + 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', + 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', + 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', + 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', + 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', + 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', + 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', + 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', + 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', + 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', + 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', + 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', + 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', + 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', + 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', + 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', + 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', + 
'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', + 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', + 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', + 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', +} + +import sys +import os +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + + +def _validate_md5(egg_name, data): + if egg_name in md5_data: + digest = md5(data).hexdigest() + if digest != md5_data[egg_name]: + print(( + "md5 validation of %s failed! (Possible download problem?)" + % egg_name + ), file=sys.stderr) + sys.exit(2) + return data + + +def use_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + download_delay=15 +): + """Automatically find/download setuptools and make it available on sys.path + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end with + a '/'). `to_dir` is the directory where setuptools will be downloaded, if + it is not already available. If `download_delay` is specified, it should + be the number of seconds that will be paused before initiating a download, + should one be required. If an older version of setuptools is installed, + this routine will print a message to ``sys.stderr`` and raise SystemExit in + an attempt to abort the calling script. + """ + was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules + + def do_download(): + egg = download_setuptools( + version, download_base, to_dir, download_delay) + sys.path.insert(0, egg) + import setuptools + setuptools.bootstrap_install_from = egg + try: + import pkg_resources + except ImportError: + return do_download() + try: + pkg_resources.require("setuptools>=" + version) + return + except pkg_resources.VersionConflict as e: + if was_imported: + print(( + "The required version of setuptools (>=%s) is not available, and\n" + "can't be installed while this script is running. Please install\n" + " a more recent version first, using 'easy_install -U setuptools'." + "\n\n(Currently using %r)" + ) % (version, e.args[0]), file=sys.stderr) + sys.exit(2) + else: + del pkg_resources, sys.modules['pkg_resources'] # reload ok + return do_download() + except pkg_resources.DistributionNotFound: + return do_download() + + +def download_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + delay=15 +): + """Download setuptools from a specified location and return its filename + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end + with a '/'). `to_dir` is the directory where the egg will be downloaded. + `delay` is the number of seconds to pause before an actual download attempt. + """ + import urllib2 + import shutil + egg_name = "setuptools-%s-py%s.egg" % (version, sys.version[:3]) + url = download_base + egg_name + saveto = os.path.join(to_dir, egg_name) + src = dst = None + if not os.path.exists(saveto): # Avoid repeated downloads + try: + from distutils import log + if delay: + log.warn(""" +--------------------------------------------------------------------------- +This script requires setuptools version %s to run (even to display +help). I will attempt to download it for you (from +%s), but +you may need to enable firewall access for this script first. +I will start the download in %d seconds. 
+ +(Note: if this machine does not have network access, please obtain the file + + %s + +and place it in this directory before rerunning this script.) +---------------------------------------------------------------------------""", + version, download_base, delay, url + ) + from time import sleep + sleep(delay) + log.warn("Downloading %s", url) + src = urllib2.urlopen(url) + # Read/write all in one block, so we don't create a corrupt file + # if the download is interrupted. + data = _validate_md5(egg_name, src.read()) + dst = open(saveto, "wb") + dst.write(data) + finally: + if src: + src.close() + if dst: + dst.close() + return os.path.realpath(saveto) + + +def main(argv, version=DEFAULT_VERSION): + """Install or upgrade setuptools and EasyInstall""" + try: + import setuptools + except ImportError: + egg = None + try: + egg = download_setuptools(version, delay=0) + sys.path.insert(0, egg) + from setuptools.command.easy_install import main + return main(list(argv) + [egg]) # we're done here + finally: + if egg and os.path.exists(egg): + os.unlink(egg) + else: + if setuptools.__version__ == '0.0.1': + print(( + "You have an obsolete version of setuptools installed. Please\n" + "remove it from your system entirely before rerunning this script." + ), file=sys.stderr) + sys.exit(2) + + req = "setuptools>=" + version + import pkg_resources + try: + pkg_resources.require(req) + except pkg_resources.VersionConflict: + try: + from setuptools.command.easy_install import main + except ImportError: + from easy_install import main + main(list(argv) + [download_setuptools(delay=0)]) + sys.exit(0) # try to force an exit + else: + if argv: + from setuptools.command.easy_install import main + main(argv) + else: + print("Setuptools version", version, "or greater has been installed.") + print('(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)') + + +def update_md5(filenames): + """Update our built-in md5 registry""" + + import re + + for name in filenames: + base = os.path.basename(name) + f = open(name, 'rb') + md5_data[base] = md5(f.read()).hexdigest() + f.close() + + data = sorted([" %r: %r,\n" % it for it in md5_data.items()]) + repl = "".join(data) + + import inspect + srcfile = inspect.getsourcefile(sys.modules[__name__]) + f = open(srcfile, 'rb') + src = f.read() + f.close() + + match = re.search("\nmd5_data = {\n([^}]+)}", src) + if not match: + print("Internal error!", file=sys.stderr) + sys.exit(2) + + src = src[:match.start(1)] + repl + src[match.end(1):] + f = open(srcfile, 'w') + f.write(src) + f.close() + + +if __name__ == '__main__': + if len(sys.argv) > 2 and sys.argv[1] == '--md5update': + update_md5(sys.argv[2:]) + else: + main(sys.argv[1:]) diff --git a/fake_pyrex/Pyrex/Distutils/__init__.py b/fake_pyrex/Pyrex/Distutils/__init__.py new file mode 100644 index 00000000..51c8e16b --- /dev/null +++ b/fake_pyrex/Pyrex/Distutils/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/fake_pyrex/Pyrex/Distutils/build_ext.py b/fake_pyrex/Pyrex/Distutils/build_ext.py new file mode 100644 index 00000000..4f846f62 --- /dev/null +++ b/fake_pyrex/Pyrex/Distutils/build_ext.py @@ -0,0 +1 @@ +build_ext = "yes, it's there!" 
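ez_setup.py above is the standard setuptools bootstrap script shipped with the source tree; its module docstring describes the intended usage: keep the file next to a setup.py, call use_setuptools() first, and only then import from setuptools. A minimal sketch of a setup.py wired that way (the project name and metadata below are invented purely for illustration)::

    # Hypothetical setup.py; only the two bootstrap lines mirror the usage
    # described in ez_setup.py above.
    from ez_setup import use_setuptools
    use_setuptools()          # downloads/activates setuptools if it is missing or too old

    from setuptools import setup

    setup(
        name='example-project',
        version='0.1.0',
        packages=['example_project'],
    )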
diff --git a/fake_pyrex/Pyrex/__init__.py b/fake_pyrex/Pyrex/__init__.py new file mode 100644 index 00000000..51c8e16b --- /dev/null +++ b/fake_pyrex/Pyrex/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/pandas/__init__.py b/pandas/__init__.py new file mode 100644 index 00000000..6eda0498 --- /dev/null +++ b/pandas/__init__.py @@ -0,0 +1,55 @@ +# pylint: disable-msg=W0614,W0401,W0611,W0622 + +__docformat__ = 'restructuredtext' + +try: + from . import hashtable, tslib, lib +except Exception: # pragma: no cover + import sys + e = sys.exc_info()[1] # Py25 and Py3 current exception syntax conflict + print(e) + if 'No module named lib' in str(e): + raise ImportError('C extensions not built: if you installed already ' + 'verify that you are not importing from the source ' + 'directory') + else: + raise + +from datetime import datetime +import numpy as np + +# XXX: HACK for NumPy 1.5.1 to suppress warnings +try: + np.seterr(all='ignore') +except Exception: # pragma: no cover + pass + +# numpy versioning +from distutils.version import LooseVersion +_np_version = np.version.short_version +_np_version_under1p6 = LooseVersion(_np_version) < '1.6' +_np_version_under1p7 = LooseVersion(_np_version) < '1.7' +_np_version_under1p8 = LooseVersion(_np_version) < '1.8' +_np_version_under1p9 = LooseVersion(_np_version) < '1.9' + +from pandas.version import version as __version__ +from pandas.info import __doc__ + +# let init-time option registration happen +import pandas.core.config_init + +from pandas.core.api import * +from pandas.sparse.api import * +from pandas.stats.api import * +from pandas.tseries.api import * +from pandas.io.api import * +from pandas.computation.api import * + +from pandas.tools.describe import value_range +from pandas.tools.merge import merge, concat, ordered_merge +from pandas.tools.pivot import pivot_table, crosstab +from pandas.tools.plotting import scatter_matrix, plot_params +from pandas.tools.tile import cut, qcut +from pandas.core.reshape import melt +from pandas.util.print_versions import show_versions +import pandas.util.testing diff --git a/pandas/algos.pyx b/pandas/algos.pyx new file mode 100644 index 00000000..2a07272a --- /dev/null +++ b/pandas/algos.pyx @@ -0,0 +1,2276 @@ +from numpy cimport * +cimport numpy as np +import numpy as np + +cimport cython + +import_array() + +cdef float64_t FP_ERR = 1e-13 +cdef float64_t REL_TOL = 1e-07 + +cimport util + +from libc.stdlib cimport malloc, free + +from numpy cimport NPY_INT8 as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as NPY_float64 + +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef np.int8_t MINint8 = np.iinfo(np.int8).min +cdef np.int16_t MINint16 = np.iinfo(np.int16).min +cdef np.int32_t MINint32 = np.iinfo(np.int32).min +cdef np.int64_t MINint64 = np.iinfo(np.int64).min +cdef np.float16_t MINfloat16 = np.NINF +cdef np.float32_t MINfloat32 = np.NINF +cdef np.float64_t MINfloat64 = np.NINF + +cdef np.int8_t MAXint8 = np.iinfo(np.int8).max +cdef np.int16_t MAXint16 = 
np.iinfo(np.int16).max +cdef np.int32_t MAXint32 = np.iinfo(np.int32).max +cdef np.int64_t MAXint64 = np.iinfo(np.int64).max +cdef np.float16_t MAXfloat16 = np.inf +cdef np.float32_t MAXfloat32 = np.inf +cdef np.float64_t MAXfloat64 = np.inf + +cdef double NaN = np.NaN +cdef double nan = NaN + + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + + +cdef extern from "src/headers/math.h": + double sqrt(double x) + double fabs(double) + int signbit(double) + +from pandas import lib + +include "skiplist.pyx" + + +cdef: + int TIEBREAK_AVERAGE = 0 + int TIEBREAK_MIN = 1 + int TIEBREAK_MAX = 2 + int TIEBREAK_FIRST = 3 + int TIEBREAK_FIRST_DESCENDING = 4 + int TIEBREAK_DENSE = 5 + +tiebreakers = { + 'average' : TIEBREAK_AVERAGE, + 'min' : TIEBREAK_MIN, + 'max' : TIEBREAK_MAX, + 'first' : TIEBREAK_FIRST, + 'dense' : TIEBREAK_DENSE, +} + + +# ctypedef fused pvalue_t: +# float64_t +# int64_t +# object + +# from cython cimport floating, integral + +cdef _take_2d_float64(ndarray[float64_t, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[float64_t, ndim=2] result + object val + + N, K = ( values).shape + result = np.empty_like(values) + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +cdef _take_2d_int64(ndarray[int64_t, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[int64_t, ndim=2] result + object val + + N, K = ( values).shape + result = np.empty_like(values) + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +cdef _take_2d_object(ndarray[object, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[object, ndim=2] result + object val + + N, K = ( values).shape + result = values.copy() + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + + +cdef inline bint float64_are_diff(float64_t left, float64_t right): + cdef double abs_diff, allowed + if right == MAXfloat64 or right == -MAXfloat64: + if left == right: + return False + else: + return True + else: + abs_diff = fabs(left - right) + allowed = REL_TOL * fabs(right) + return abs_diff > allowed + +def rank_1d_float64(object in_arr, ties_method='average', ascending=True, + na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 + ndarray[float64_t] sorted_data, ranks, values + ndarray[int64_t] argsorted + float64_t val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + tiebreak = tiebreakers[ties_method] + + values = np.asarray(in_arr).copy() + + keep_na = na_option == 'keep' + + if ascending ^ (na_option == 'top'): + nan_value = np.inf + else: + nan_value = -np.inf + mask = np.isnan(values) + np.putmask(values, mask, nan_value) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + # py2.5/win32 hack, can't pass i8 + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + + for i in range(n): + 
sum_ranks += i + 1 + dups += 1 + val = sorted_data[i] + if (val == nan_value) and keep_na: + ranks[argsorted[i]] = nan + continue + count += 1.0 + if i == n - 1 or float64_are_diff(sorted_data[i + 1], val): + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + if pct: + return ranks / count + else: + return ranks + + +def rank_1d_int64(object in_arr, ties_method='average', ascending=True, + na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 + ndarray[int64_t] sorted_data, values + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + int64_t val + float64_t sum_ranks = 0 + int tiebreak = 0 + float count = 0.0 + tiebreak = tiebreakers[ties_method] + + values = np.asarray(in_arr) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + # py2.5/win32 hack, can't pass i8 + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + + for i in range(n): + sum_ranks += i + 1 + dups += 1 + val = sorted_data[i] + count += 1.0 + if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + if pct: + return ranks / count + else: + return ranks + + +def rank_2d_float64(object in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + ndarray[float64_t, ndim=2] ranks, values + ndarray[int64_t, ndim=2] argsorted + float64_t val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + in_arr = np.asarray(in_arr) + + if axis == 0: + values = in_arr.T.copy() + else: + values = in_arr.copy() + + if ascending ^ (na_option == 'top'): + nan_value = np.inf + else: + nan_value = -np.inf + + np.putmask(values, np.isnan(values), 
nan_value) + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_float64(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + dups = sum_ranks = 0 + total_tie_count = 0 + count = 0.0 + for j in range(k): + sum_ranks += j + 1 + dups += 1 + val = values[i, j] + if val == nan_value and keep_na: + ranks[i, argsorted[i, j]] = nan + continue + count += 1.0 + if j == k - 1 or float64_are_diff(values[i, j + 1], val): + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + + +def rank_2d_int64(object in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + ndarray[float64_t, ndim=2] ranks + ndarray[int64_t, ndim=2] argsorted + ndarray[int64_t, ndim=2, cast=True] values + int64_t val + float64_t sum_ranks = 0 + int tiebreak = 0 + float count = 0.0 + tiebreak = tiebreakers[ties_method] + + if axis == 0: + values = np.asarray(in_arr).T + else: + values = np.asarray(in_arr) + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_int64(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + dups = sum_ranks = 0 + total_tie_count = 0 + count = 0.0 + for j in range(k): + sum_ranks += j + 1 + dups += 1 + val = values[i, j] + count += 1.0 + if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + ranks[i, :] /= count + if 
axis == 0: + return ranks.T + else: + return ranks + + +def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 + ndarray[float64_t] ranks + ndarray sorted_data, values + ndarray[int64_t] argsorted + object val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + + if ascending ^ (na_option == 'top'): + # always greater than everything + nan_value = Infinity() + else: + nan_value = NegInfinity() + + mask = lib.isnullobj(values) + np.putmask(values, mask, nan_value) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + # py2.5/win32 hack, can't pass i8 + try: + _as = values.argsort() + except TypeError: + if not retry: + raise + + valid_locs = (~mask).nonzero()[0] + ranks.put(valid_locs, rank_1d_generic(values.take(valid_locs), 0, + ties_method=ties_method, + ascending=ascending)) + np.putmask(ranks, mask, np.nan) + return ranks + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + for i in range(n): + sum_ranks += i + 1 + dups += 1 + val = util.get_value_at(sorted_data, i) + if val is nan_value and keep_na: + ranks[argsorted[i]] = nan + continue + if (i == n - 1 or + are_diff(util.get_value_at(sorted_data, i + 1), val)): + count += 1.0 + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + raise ValueError('first not supported for non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + if pct: + return ranks / count + else: + return ranks + +cdef inline are_diff(object left, object right): + try: + return fabs(left - right) > FP_ERR + except TypeError: + return left != right + +_return_false = lambda self, other: False +_return_true = lambda self, other: True + +class Infinity(object): + + __lt__ = _return_false + __le__ = _return_false + __eq__ = _return_false + __ne__ = _return_true + __gt__ = _return_true + __ge__ = _return_true + __cmp__ = _return_false + +class NegInfinity(object): + + __lt__ = _return_true + __le__ = _return_true + __eq__ = _return_false + __ne__ = _return_true + __gt__ = _return_false + __ge__ = _return_false + __cmp__ = _return_true + +def rank_2d_generic(object in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, infs, dups = 0 + Py_ssize_t total_tie_count = 0 + ndarray[float64_t, ndim=2] ranks + ndarray[object, ndim=2] values + ndarray[int64_t, ndim=2] argsorted + object val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float count = 0.0 + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + in_arr = np.asarray(in_arr) + + if axis == 0: + values = in_arr.T.copy() + else: + values = 
in_arr.copy() + + if values.dtype != np.object_: + values = values.astype('O') + + if ascending ^ (na_option == 'top'): + # always greater than everything + nan_value = Infinity() + else: + nan_value = NegInfinity() + + mask = lib.isnullobj2d(values) + np.putmask(values, mask, nan_value) + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_generic(in_arr[i], + ties_method=ties_method, + ascending=ascending, + pct=pct) + if axis == 0: + return ranks.T + else: + return ranks + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_object(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + dups = sum_ranks = infs = 0 + total_tie_count = 0 + count = 0.0 + for j in range(k): + val = values[i, j] + if val is nan_value and keep_na: + ranks[i, argsorted[i, j]] = nan + infs += 1 + continue + count += 1.0 + sum_ranks += (j - infs) + 1 + dups += 1 + if j == k - 1 or are_diff(values[i, j + 1], val): + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + raise ValueError('first not supported for ' + 'non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + +# def _take_indexer_2d(ndarray[float64_t, ndim=2] values, +# ndarray[Py_ssize_t, ndim=2, cast=True] indexer): +# cdef: +# Py_ssize_t i, j, N, K +# ndarray[float64_t, ndim=2] result + +# N, K = ( values).shape +# result = np.empty_like(values) +# for i in range(N): +# for j in range(K): +# result[i, j] = values[i, indexer[i, j]] +# return result + + +# Cython implementations of rolling sum, mean, variance, skewness, +# other statistical moment functions +# +# Misc implementation notes +# ------------------------- +# +# - In Cython x * x is faster than x ** 2 for C types, this should be +# periodically revisited to see if it's still true. +# +# - + +def _check_minp(win, minp, N): + if minp > win: + raise ValueError('min_periods (%d) must be <= window (%d)' + % (minp, win)) + elif minp > N: + minp = N + 1 + elif minp == 0: + minp = 1 + elif minp < 0: + raise ValueError('min_periods must be >= 0') + return minp + +# original C implementation by N. Devillard. +# This code in public domain. +# Function : kth_smallest() +# In : array of elements, # of elements in the array, rank k +# Out : one element +# Job : find the kth smallest element in the array + +# Reference: + +# Author: Wirth, Niklaus +# Title: Algorithms + data structures = programs +# Publisher: Englewood Cliffs: Prentice-Hall, 1976 +# Physical description: 366 p. 
+# Series: Prentice-Hall Series in Automatic Computation + + +ctypedef fused numeric: + int8_t + int16_t + int32_t + int64_t + + uint8_t + uint16_t + uint32_t + uint64_t + + float32_t + float64_t + + +cdef inline Py_ssize_t swap(numeric *a, numeric *b) except -1: + cdef numeric t + + # cython doesn't allow pointer dereference so use array syntax + t = a[0] + a[0] = b[0] + b[0] = t + return 0 + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k): + cdef: + Py_ssize_t i, j, l, m, n = a.size + numeric x + + l = 0 + m = n - 1 + + while l < m: + x = a[k] + i = l + j = m + + while 1: + while a[i] < x: i += 1 + while x < a[j]: j -= 1 + if i <= j: + swap(&a[i], &a[j]) + i += 1; j -= 1 + + if i > j: break + + if j < k: l = i + if k < i: m = j + return a[k] + + +cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): + cdef: + Py_ssize_t i,j,l,m + double_t x, t + + l = 0 + m = n-1 + while (l j: break + + if j < k: l = i + if k < i: m = j + return a[k] + + +cpdef numeric median(numeric[:] arr): + ''' + A faster median + ''' + cdef Py_ssize_t n = arr.size + + if n == 0: + return np.NaN + + arr = arr.copy() + + if n % 2: + return kth_smallest(arr, n // 2) + else: + return (kth_smallest(arr, n // 2) + + kth_smallest(arr, n // 2 - 1)) / 2 + + +# -------------- Min, Max subsequence + +def max_subseq(ndarray[double_t] arr): + cdef: + Py_ssize_t i=0,s=0,e=0,T,n + double m, S + + n = len(arr) + + if len(arr) == 0: + return (-1,-1,None) + + m = arr[0] + S = m + T = 0 + + for i in range(1, n): + # S = max { S + A[i], A[i] ) + if (S > 0): + S = S + arr[i] + else: + S = arr[i] + T = i + if S > m: + s = T + e = i + m = S + + return (s, e, m) + +def min_subseq(ndarray[double_t] arr): + cdef: + Py_ssize_t s, e + double m + + (s, e, m) = max_subseq(-arr) + + return (s, e, -m) + +#------------------------------------------------------------------------------- +# Rolling sum + +def roll_sum(ndarray[double_t] input, int win, int minp): + cdef double val, prev, sum_x = 0 + cdef int nobs = 0, i + cdef int N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + sum_x += val + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + + if nobs >= minp: + output[i] = sum_x + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling mean + +def roll_mean(ndarray[double_t] input, + int win, int minp): + cdef: + double val, prev, result, sum_x = 0 + Py_ssize_t nobs = 0, i, neg_ct = 0 + Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + if signbit(val): + neg_ct += 1 + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + sum_x += val + if signbit(val): + neg_ct += 1 + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + if signbit(prev): + neg_ct -= 1 + + if nobs >= minp: + result = sum_x / nobs + if neg_ct == 0 and result < 0: + # all positive + output[i] = 0 + elif neg_ct == nobs and result > 0: + # all negative + 
output[i] = 0 + else: + output[i] = result + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Exponentially weighted moving average + +def ewma(ndarray[double_t] input, double_t com, int adjust): + ''' + Compute exponentially-weighted moving average using center-of-mass. + + Parameters + ---------- + input : ndarray (float64 type) + com : float64 + + Returns + ------- + y : ndarray + ''' + + cdef double cur, prev, neww, oldw, adj + cdef Py_ssize_t i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + if N == 0: + return output + + neww = 1. / (1. + com) + oldw = 1. - neww + adj = oldw + + if adjust: + output[0] = neww * input[0] + else: + output[0] = input[0] + + for i from 1 <= i < N: + cur = input[i] + prev = output[i - 1] + + if cur == cur: + if prev == prev: + output[i] = oldw * prev + neww * cur + else: + output[i] = neww * cur + else: + output[i] = prev + + if adjust: + for i from 0 <= i < N: + cur = input[i] + + if cur == cur: + output[i] = output[i] / (1. - adj) + adj *= oldw + else: + if i >= 1: + output[i] = output[i - 1] + + return output + +#---------------------------------------------------------------------- +# Pairwise correlation/covariance + +@cython.boundscheck(False) +@cython.wraparound(False) +def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None): + cdef: + Py_ssize_t i, j, xi, yi, N, K + ndarray[float64_t, ndim=2] result + ndarray[uint8_t, ndim=2] mask + int64_t nobs = 0 + float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor + + N, K = ( mat).shape + + if minp is None: + minp = 1 + + result = np.empty((K, K), dtype=np.float64) + mask = np.isfinite(mat).view(np.uint8) + + for xi in range(K): + for yi in range(xi + 1): + nobs = sumxx = sumyy = sumx = sumy = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] + vy = mat[i, yi] + nobs += 1 + sumx += vx + sumy += vy + + if nobs < minp: + result[xi, yi] = result[yi, xi] = np.NaN + else: + meanx = sumx / nobs + meany = sumy / nobs + + # now the cov numerator + sumx = 0 + + for i in range(N): + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] - meanx + vy = mat[i, yi] - meany + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy + + divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy) + + if divisor != 0: + result[xi, yi] = result[yi, xi] = sumx / divisor + else: + result[xi, yi] = result[yi, xi] = np.NaN + + return result + +#---------------------------------------------------------------------- +# Pairwise Spearman correlation + +@cython.boundscheck(False) +@cython.wraparound(False) +def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): + cdef: + Py_ssize_t i, j, xi, yi, N, K + ndarray[float64_t, ndim=2] result + ndarray[float64_t, ndim=1] maskedx + ndarray[float64_t, ndim=1] maskedy + ndarray[uint8_t, ndim=2] mask + int64_t nobs = 0 + float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor + + N, K = ( mat).shape + + result = np.empty((K, K), dtype=np.float64) + mask = np.isfinite(mat).view(np.uint8) + + for xi in range(K): + for yi in range(xi + 1): + nobs = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + nobs += 1 + + if nobs < minp: + result[xi, yi] = result[yi, xi] = np.NaN + else: + maskedx = np.empty(nobs, dtype=np.float64) + maskedy = np.empty(nobs, dtype=np.float64) + j = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + maskedx[j] = mat[i, xi] + maskedy[j] = mat[i, yi] + j += 1 + maskedx = 
rank_1d_float64(maskedx) + maskedy = rank_1d_float64(maskedy) + + mean = (nobs + 1) / 2. + + # now the cov numerator + sumx = sumxx = sumyy = 0 + + for i in range(nobs): + vx = maskedx[i] - mean + vy = maskedy[i] - mean + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy + + divisor = sqrt(sumxx * sumyy) + + if divisor != 0: + result[xi, yi] = result[yi, xi] = sumx / divisor + else: + result[xi, yi] = result[yi, xi] = np.NaN + + return result + +#---------------------------------------------------------------------- +# Rolling variance + +def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): + """ + Numerically stable implementation using Welford's method. + """ + cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + cdef Py_ssize_t i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + minp = _check_minp(win, minp, N) + + # Check for windows larger than array, addresses #7297 + win = min(win, N) + + # Over the first window, observations can only be added, never removed + for i from 0 <= i < win: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) + + if nobs >= minp: + #pathological case + if nobs == 1: + val = 0 + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 + else: + val = NaN + + output[i] = val + + # After the first window, observations can both be added and removed + for i from win <= i < N: + val = input[i] + prev = input[i - win] + + if val == val: + if prev == prev: + # Adding one observation and removing another one + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + else: + # Adding one observation and not removing any + nobs += 1 + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) + elif prev == prev: + # Adding no new observation, but removing one + nobs -= 1 + if nobs: + delta = (prev - mean_x) + mean_x -= delta / nobs + ssqdm_x -= delta * (prev - mean_x) + else: + mean_x = 0 + ssqdm_x = 0 + # Variance is unchanged if no observation is added or removed + + if nobs >= minp: + #pathological case + if nobs == 1: + val = 0 + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 + else: + val = NaN + + output[i] = val + + return output + + +#------------------------------------------------------------------------------- +# Rolling skewness + +def roll_skew(ndarray[double_t] input, int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0 + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + # 3 components of the skewness equation + cdef double A, B, C, R + + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + + nobs -= 1 + + if nobs >= minp: + A = x / nobs + B = xx / nobs - A * A + C = xxx / nobs - A * A * A - 3 * A * B + + R = sqrt(B) + + if B == 0 or nobs < 3: + output[i] = NaN + else: + output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / + ((nobs-2) * R * R * R)) + else: + output[i] = NaN + + 
return output + +#------------------------------------------------------------------------------- +# Rolling kurtosis + + +def roll_kurt(ndarray[double_t] input, + int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + # 5 components of the kurtosis equation + cdef double A, B, C, D, R, K + + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + + # seriously don't ask me why this is faster + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + xxxx -= prev * prev * prev * prev + + nobs -= 1 + + if nobs >= minp: + A = x / nobs + R = A * A + B = xx / nobs - R + R = R * A + C = xxx / nobs - R - 3 * A * B + R = R * A + D = xxxx / nobs - R - 6*B*A*A - 4*C*A + + if B == 0 or nobs < 4: + output[i] = NaN + + else: + K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) + K = K / ((nobs - 2.)*(nobs-3.)) + + output[i] = K + + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling median, min, max + +ctypedef double_t (* skiplist_f)(object sl, int n, int p) + +cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): + cdef ndarray[double_t] input = arg + cdef double val, prev, midpoint + cdef IndexableSkiplist skiplist + cdef Py_ssize_t nobs = 0, i + + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + skiplist = IndexableSkiplist(win) + + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + + if prev == prev: + skiplist.remove(prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = op(skiplist, nobs, minp) + + return output + +from skiplist cimport * + +def roll_median_c(ndarray[float64_t] arg, int win, int minp): + cdef double val, res, prev + cdef: + int ret=0 + skiplist_t *sl + Py_ssize_t midpoint, nobs = 0, i + + + cdef Py_ssize_t N = len(arg) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + sl = skiplist_init(win) + + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = arg[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist_insert(sl, val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = arg[i] + + if i > win - 1: + prev = arg[i - win] + + if prev == prev: + skiplist_remove(sl, prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist_insert(sl, val) + + if nobs >= minp: + midpoint = nobs / 2 + if nobs % 2: + res = skiplist_get(sl, midpoint, &ret) + else: + res = (skiplist_get(sl, midpoint, &ret) + + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + res = NaN + + output[i] = res + + skiplist_destroy(sl) + + return output + +def roll_median_cython(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, 
_get_median) + +# Unfortunately had to resort to some hackery here, would like for +# Cython to be able to get this right. + +cdef double_t _get_median(object sl, int nobs, int minp): + cdef Py_ssize_t midpoint + cdef IndexableSkiplist skiplist = sl + if nobs >= minp: + midpoint = nobs / 2 + if nobs % 2: + return skiplist.get(midpoint) + else: + return (skiplist.get(midpoint) + + skiplist.get(midpoint - 1)) / 2 + else: + return NaN + +#---------------------------------------------------------------------- + +# Moving maximum / minimum code taken from Bottleneck under the terms +# of its Simplified BSD license +# https://github.com/kwgoodman/bottleneck + +cdef struct pairs: + double value + int death + +from libc cimport stdlib + +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_max2(ndarray[float64_t] a, int window, int minp): + "Moving max of 1d array of dtype=float64 along axis=0 ignoring NaNs." + cdef np.float64_t ai, aold + cdef Py_ssize_t count + cdef pairs* ring + cdef pairs* minpair + cdef pairs* end + cdef pairs* last + cdef Py_ssize_t i0 + cdef np.npy_intp *dim + dim = PyArray_DIMS(a) + cdef Py_ssize_t n0 = dim[0] + cdef np.npy_intp *dims = [n0] + cdef np.ndarray[np.float64_t, ndim=1] y = PyArray_EMPTY(1, dims, + NPY_float64, 0) + + if window < 1: + raise ValueError('Invalid window size %d' + % (window)) + + if minp > window: + raise ValueError('Invalid min_periods size %d greater than window %d' + % (minp, window)) + + minp = _check_minp(window, minp, n0) + + window = min(window, n0) + + ring = stdlib.malloc(window * sizeof(pairs)) + end = ring + window + last = ring + + minpair = ring + ai = a[0] + if ai == ai: + minpair.value = ai + else: + minpair.value = MINfloat64 + minpair.death = window + + count = 0 + for i0 in range(n0): + ai = a[i0] + if ai == ai: + count += 1 + else: + ai = MINfloat64 + if i0 >= window: + aold = a[i0 - window] + if aold == aold: + count -= 1 + if minpair.death == i0: + minpair += 1 + if minpair >= end: + minpair = ring + if ai >= minpair.value: + minpair.value = ai + minpair.death = i0 + window + last = minpair + else: + while last.value <= ai: + if last == ring: + last = end + last -= 1 + last += 1 + if last == end: + last = ring + last.value = ai + last.death = i0 + window + if count >= minp: + y[i0] = minpair.value + else: + y[i0] = NaN + + for i0 in range(minp - 1): + y[i0] = NaN + + stdlib.free(ring) + return y + +def roll_max(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_max) + + +cdef double_t _get_max(object skiplist, int nobs, int minp): + if nobs >= minp: + return skiplist.get(nobs - 1) + else: + return NaN + +def roll_min(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_min) + +@cython.boundscheck(False) +@cython.wraparound(False) +def roll_min2(np.ndarray[np.float64_t, ndim=1] a, int window, int minp): + "Moving min of 1d array of dtype=float64 along axis=0 ignoring NaNs." 
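+    # Same ring-of-pairs algorithm as roll_max2 above (taken from Bottleneck), but tracking the window minimum:
+    # NaNs are replaced by MAXfloat64 so they can never win a comparison, and `count` holds the number of valid observations in the current window.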
+ cdef np.float64_t ai, aold + cdef Py_ssize_t count + cdef pairs* ring + cdef pairs* minpair + cdef pairs* end + cdef pairs* last + cdef Py_ssize_t i0 + cdef np.npy_intp *dim + dim = PyArray_DIMS(a) + cdef Py_ssize_t n0 = dim[0] + cdef np.npy_intp *dims = [n0] + cdef np.ndarray[np.float64_t, ndim=1] y = PyArray_EMPTY(1, dims, + NPY_float64, 0) + + if window < 1: + raise ValueError('Invalid window size %d' + % (window)) + + if minp > window: + raise ValueError('Invalid min_periods size %d greater than window %d' + % (minp, window)) + + window = min(window, n0) + + minp = _check_minp(window, minp, n0) + + ring = stdlib.malloc(window * sizeof(pairs)) + end = ring + window + last = ring + + minpair = ring + ai = a[0] + if ai == ai: + minpair.value = ai + else: + minpair.value = MAXfloat64 + minpair.death = window + + count = 0 + for i0 in range(n0): + ai = a[i0] + if ai == ai: + count += 1 + else: + ai = MAXfloat64 + if i0 >= window: + aold = a[i0 - window] + if aold == aold: + count -= 1 + if minpair.death == i0: + minpair += 1 + if minpair >= end: + minpair = ring + if ai <= minpair.value: + minpair.value = ai + minpair.death = i0 + window + last = minpair + else: + while last.value >= ai: + if last == ring: + last = end + last -= 1 + last += 1 + if last == end: + last = ring + last.value = ai + last.death = i0 + window + if count >= minp: + y[i0] = minpair.value + else: + y[i0] = NaN + + for i0 in range(minp - 1): + y[i0] = NaN + + stdlib.free(ring) + return y + +cdef double_t _get_min(object skiplist, int nobs, int minp): + if nobs >= minp: + return skiplist.get(0) + else: + return NaN + +def roll_quantile(ndarray[float64_t, cast=True] input, int win, + int minp, double quantile): + ''' + O(N log(window)) implementation using skip list + ''' + cdef double val, prev, midpoint + cdef IndexableSkiplist skiplist + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + skiplist = IndexableSkiplist(win) + + minp = _check_minp(win, minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + + if prev == prev: + skiplist.remove(prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist.insert(val) + + if nobs >= minp: + idx = int((quantile / 1.) 
* (nobs - 1)) + output[i] = skiplist.get(idx) + else: + output[i] = NaN + + return output + +def roll_generic(ndarray[float64_t, cast=True] input, int win, + int minp, object func, object args, object kwargs): + cdef ndarray[double_t] output, counts, bufarr + cdef Py_ssize_t i, n + cdef float64_t *buf + cdef float64_t *oldbuf + + if not input.flags.c_contiguous: + input = input.copy('C') + + buf = input.data + + n = len(input) + if n == 0: + return input + + minp = _check_minp(win, minp, n) + output = np.empty(n, dtype=float) + counts = roll_sum(np.isfinite(input).astype(float), win, minp) + + bufarr = np.empty(win, dtype=float) + oldbuf = bufarr.data + + n = len(input) + for i from 0 <= i < int_min(win, n): + if counts[i] >= minp: + output[i] = func(input[int_max(i - win + 1, 0) : i + 1], *args, + **kwargs) + else: + output[i] = NaN + + for i from win <= i < n: + buf = buf + 1 + bufarr.data = buf + if counts[i] >= minp: + output[i] = func(bufarr, *args, **kwargs) + else: + output[i] = NaN + + bufarr.data = oldbuf + + return output + + +def roll_window(ndarray[float64_t, ndim=1, cast=True] input, + ndarray[float64_t, ndim=1, cast=True] weights, + int minp, bint avg=True, bint avg_wgt=False): + """ + Assume len(weights) << len(input) + """ + cdef: + ndarray[double_t] output, tot_wgt, counts + Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k + float64_t val_in, val_win, c, w + + in_n = len(input) + win_n = len(weights) + output = np.zeros(in_n, dtype=float) + counts = np.zeros(in_n, dtype=float) + if avg: + tot_wgt = np.zeros(in_n, dtype=float) + + minp = _check_minp(len(weights), minp, in_n) + + if avg_wgt: + for win_i from 0 <= win_i < win_n: + val_win = weights[win_i] + if val_win != val_win: + continue + + for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: + val_in = input[in_i] + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 + tot_wgt[in_i + (win_n - win_i) - 1] += val_win + + for in_i from 0 <= in_i < in_n: + c = counts[in_i] + if c < minp: + output[in_i] = NaN + else: + w = tot_wgt[in_i] + if w == 0: + output[in_i] = NaN + else: + output[in_i] /= tot_wgt[in_i] + + else: + for win_i from 0 <= win_i < win_n: + val_win = weights[win_i] + if val_win != val_win: + continue + + for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: + val_in = input[in_i] + + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 + + for in_i from 0 <= in_i < in_n: + c = counts[in_i] + if c < minp: + output[in_i] = NaN + elif avg: + output[in_i] /= c + + return output + + +#---------------------------------------------------------------------- +# group operations + + +@cython.wraparound(False) +@cython.boundscheck(False) +def is_lexsorted(list list_of_arrays): + cdef: + int i + Py_ssize_t n, nlevels + int64_t k, cur, pre + ndarray arr + + nlevels = len(list_of_arrays) + n = len(list_of_arrays[0]) + + cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) + for i from 0 <= i < nlevels: + # vecs[i] = ( list_of_arrays[i]).data + + arr = list_of_arrays[i] + vecs[i] = arr.data + # assume uniqueness?? 
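+    # scan consecutive rows level by level: equal values defer to the next level, a strictly larger value means this row is in order, and a strictly smaller value means the arrays are not lexsorted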
+ + for i from 1 <= i < n: + for k from 0 <= k < nlevels: + cur = vecs[k][i] + pre = vecs[k][i-1] + if cur == pre: + continue + elif cur > pre: + break + else: + return False + free(vecs) + return True + + +@cython.boundscheck(False) +def groupby_indices(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels, counts, arr, seen + int64_t loc + dict ids = {} + object val + int64_t k + + ids, labels, counts = group_labels(values) + seen = np.zeros_like(counts) + + # try not to get in trouble here... + cdef int64_t **vecs = malloc(len(ids) * sizeof(int64_t*)) + result = {} + for i from 0 <= i < len(counts): + arr = np.empty(counts[i], dtype=np.int64) + result[ids[i]] = arr + vecs[i] = arr.data + + for i from 0 <= i < n: + k = labels[i] + + # was NaN + if k == -1: + continue + + loc = seen[k] + vecs[k][loc] = i + seen[k] = loc + 1 + + free(vecs) + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_labels(ndarray[object] values): + ''' + Compute label vector from input values and associated useful data + + Returns + ------- + ''' + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + ndarray[int64_t] counts = np.empty(n, dtype=np.int64) + dict ids = {}, reverse = {} + int64_t idx + object val + int64_t count = 0 + + for i from 0 <= i < n: + val = values[i] + + # is NaN + if val != val: + labels[i] = -1 + continue + + # for large number of groups, not doing try: except: makes a big + # difference + if val in ids: + idx = ids[val] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + ids[val] = count + reverse[count] = val + labels[i] = count + counts[count] = 1 + count += 1 + + return reverse, labels, counts[:count].copy() + + +@cython.boundscheck(False) +@cython.wraparound(False) +def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): + cdef: + Py_ssize_t i, loc, label, n + ndarray[int64_t] counts, where, result + + # count group sizes, location 0 for NA + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + for i from 0 <= i < n: + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + where = np.zeros(ngroups + 1, dtype=np.int64) + for i from 1 <= i < ngroups + 1: + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + result = np.zeros(n, dtype=np.int64) + for i from 0 <= i < n: + label = index[i] + 1 + result[where[label]] = i + where[label] += 1 + + return result, counts + +# TODO: aggregate multiple columns in single pass +#---------------------------------------------------------------------- +# first, nth, last + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels, + int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + object val + float64_t count + ndarray[int64_t, ndim=2] nobs + ndarray[object, ndim=2] resx + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty(( out).shape, dtype=object) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) 
+@cython.wraparound(False) +def group_nth_bin_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + object val + float64_t count + ndarray[object, ndim=2] resx + ndarray[float64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.float64) + resx = np.empty(( out).shape, dtype=object) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + object val + float64_t count + ndarray[object, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty(( out).shape, dtype=object) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last_bin_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + object val + float64_t count + ndarray[object, ndim=2] resx + ndarray[float64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.float64) + resx = np.empty(( out).shape, dtype=object) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + + + +#---------------------------------------------------------------------- +# median + +def group_median(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, size + ndarray[int64_t] _counts + ndarray data + float64_t* ptr + ngroups = len(counts) + N, K = ( values).shape + + indexer, _counts = groupsort_indexer(labels, ngroups) + counts[:] = _counts[1:] + + data = np.empty((K, N), dtype=np.float64) + ptr = data.data + + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = _median_linear(ptr, size) + ptr += size + + +cdef inline float64_t 
_median_linear(float64_t* a, int n): + cdef int i, j, na_count = 0 + cdef float64_t result + cdef float64_t* tmp + + # count NAs + for i in range(n): + if a[i] != a[i]: + na_count += 1 + + if na_count: + if na_count == n: + return NaN + + tmp = malloc((n - na_count) * sizeof(float64_t)) + + j = 0 + for i in range(n): + if a[i] == a[i]: + tmp[j] = a[i] + j += 1 + + a = tmp + n -= na_count + + + if n % 2: + result = kth_smallest_c( a, n / 2, n) + else: + result = (kth_smallest_c(a, n / 2, n) + + kth_smallest_c(a, n / 2 - 1, n)) / 2 + + if na_count: + free(a) + + return result + +include "join.pyx" +include "generated.pyx" diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py new file mode 100644 index 00000000..bff6eb1f --- /dev/null +++ b/pandas/compat/__init__.py @@ -0,0 +1,754 @@ +""" +compat +====== + +Cross-compatible functions for Python 2 and 3. + +Key items to import for 2/3 compatible code: +* iterators: range(), map(), zip(), filter(), reduce() +* lists: lrange(), lmap(), lzip(), lfilter() +* unicode: u() [u"" is a syntax error in Python 3.0-3.2] +* longs: long (int in Python 3) +* callable +* iterable method compatibility: iteritems, iterkeys, itervalues + * Uses the original method if available, otherwise uses items, keys, values. +* types: + * text_type: unicode in Python 2, str in Python 3 + * binary_type: str in Python 2, bythes in Python 3 + * string_types: basestring in Python 2, str in Python 3 +* bind_method: binds functions to classes +* add_metaclass(metaclass) - class decorator that recreates class with with the + given metaclass instead (and avoids intermediary class creation) + +Python 2.6 compatibility: +* OrderedDict +* Counter + +Other items: +* OrderedDefaultDict +""" +# pylint disable=W0611 +import functools +import itertools +from distutils.version import LooseVersion +from itertools import product +import sys +import types + +PY3 = (sys.version_info[0] >= 3) +PY3_2 = sys.version_info[:2] == (3, 2) + +try: + import __builtin__ as builtins + # not writeable when instantiated with string, doesn't handle unicode well + from cStringIO import StringIO as cStringIO + # always writeable + from StringIO import StringIO + BytesIO = StringIO + import cPickle + import httplib +except ImportError: + import builtins + from io import StringIO, BytesIO + cStringIO = StringIO + import pickle as cPickle + import http.client as httplib + +from pandas.compat.chainmap import DeepChainMap + + +if PY3: + def isidentifier(s): + return s.isidentifier() + + def str_to_bytes(s, encoding=None): + return s.encode(encoding or 'ascii') + + def bytes_to_str(b, encoding=None): + return b.decode(encoding or 'utf-8') + + # have to explicitly put builtins into the namespace + range = range + map = map + zip = zip + filter = filter + reduce = functools.reduce + long = int + unichr = chr + + # list-producing versions of the major Python iterating functions + def lrange(*args, **kwargs): + return list(range(*args, **kwargs)) + + def lzip(*args, **kwargs): + return list(zip(*args, **kwargs)) + + def lmap(*args, **kwargs): + return list(map(*args, **kwargs)) + + def lfilter(*args, **kwargs): + return list(filter(*args, **kwargs)) +else: + # Python 2 + import re + _name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*$") + + def isidentifier(s, dotted=False): + return bool(_name_re.match(s)) + + def str_to_bytes(s, encoding='ascii'): + return s + + def bytes_to_str(b, encoding='ascii'): + return b + + # import iterator versions of these functions + range = xrange + zip = itertools.izip + filter = 
itertools.ifilter + map = itertools.imap + reduce = reduce + long = long + unichr = unichr + + # Python 2-builtin ranges produce lists + lrange = builtins.range + lzip = builtins.zip + lmap = builtins.map + lfilter = builtins.filter + + +def iteritems(obj, **kwargs): + """replacement for six's iteritems for Python2/3 compat + uses 'iteritems' if available and otherwise uses 'items'. + + Passes kwargs to method. + """ + func = getattr(obj, "iteritems", None) + if not func: + func = obj.items + return func(**kwargs) + + +def iterkeys(obj, **kwargs): + func = getattr(obj, "iterkeys", None) + if not func: + func = obj.keys + return func(**kwargs) + + +def itervalues(obj, **kwargs): + func = getattr(obj, "itervalues", None) + if not func: + func = obj.values + return func(**kwargs) + + +def bind_method(cls, name, func): + """Bind a method to class, python 2 and python 3 compatible. + + Parameters + ---------- + + cls : type + class to receive bound method + name : basestring + name of method on class instance + func : function + function to be bound as method + + + Returns + ------- + None + """ + # only python 2 has bound/unbound method issue + if not PY3: + setattr(cls, name, types.MethodType(func, None, cls)) + else: + setattr(cls, name, func) +# ---------------------------------------------------------------------------- +# functions largely based / taken from the six module + +# Much of the code in this module comes from Benjamin Peterson's six library. +# The license for this library can be found in LICENSES/SIX and the code can be +# found at https://bitbucket.org/gutworth/six + +if PY3: + string_types = str, + integer_types = int, + class_types = type, + text_type = str + binary_type = bytes + + def u(s): + return s + + def u_safe(s): + return s +else: + string_types = basestring, + integer_types = (int, long) + class_types = (type, types.ClassType) + text_type = unicode + binary_type = str + + def u(s): + return unicode(s, "unicode_escape") + + def u_safe(s): + try: + return unicode(s, "unicode_escape") + except: + return s + + +string_and_binary_types = string_types + (binary_type,) + + +try: + # callable reintroduced in later versions of Python + callable = callable +except NameError: + def callable(obj): + return any("__call__" in klass.__dict__ for klass in type(obj).__mro__) + + +def add_metaclass(metaclass): + """Class decorator for creating a class with a metaclass.""" + def wrapper(cls): + orig_vars = cls.__dict__.copy() + orig_vars.pop('__dict__', None) + orig_vars.pop('__weakref__', None) + for slots_var in orig_vars.get('__slots__', ()): + orig_vars.pop(slots_var) + return metaclass(cls.__name__, cls.__bases__, orig_vars) + return wrapper + + +# ---------------------------------------------------------------------------- +# Python 2.6 compatibility shims +# + +# OrderedDict Shim from Raymond Hettinger, python core dev +# http://code.activestate.com/recipes/576693-ordered-dictionary-for-py24/ +# here to support versions before 2.6 +if not PY3: + # don't need this except in 2.6 + try: + from thread import get_ident as _get_ident + except ImportError: + from dummy_thread import get_ident as _get_ident + +try: + from _abcoll import KeysView, ValuesView, ItemsView +except ImportError: + pass + + +class _OrderedDict(dict): + + """Dictionary that remembers insertion order""" + # An inherited dict maps keys to values. + # The inherited dict provides __getitem__, __len__, __contains__, and get. + # The remaining methods are order-aware. 
+ # Big-O running times for all methods are the same as for regular + # dictionaries. + + # The internal self.__map dictionary maps keys to links in a doubly linked + # list. The circular doubly linked list starts and ends with a sentinel + # element. The sentinel element never gets deleted (this simplifies the + # algorithm). Each link is stored as a list of length three: [PREV, NEXT, + # KEY]. + + def __init__(self, *args, **kwds): + """Initialize an ordered dictionary. Signature is the same as for + regular dictionaries, but keyword arguments are not recommended + because their insertion order is arbitrary. + """ + if len(args) > 1: + raise TypeError('expected at most 1 arguments, got %d' % len(args)) + try: + self.__root + except AttributeError: + self.__root = root = [] # sentinel node + root[:] = [root, root, None] + self.__map = {} + self.__update(*args, **kwds) + + def __setitem__(self, key, value, dict_setitem=dict.__setitem__): + """od.__setitem__(i, y) <==> od[i]=y""" + # Setting a new item creates a new link which goes at the end of the + # linked list, and the inherited dictionary is updated with the new + # key/value pair. + if key not in self: + root = self.__root + last = root[0] + last[1] = root[0] = self.__map[key] = [last, root, key] + dict_setitem(self, key, value) + + def __delitem__(self, key, dict_delitem=dict.__delitem__): + """od.__delitem__(y) <==> del od[y]""" + # Deleting an existing item uses self.__map to find the link which is + # then removed by updating the links in the predecessor and successor + # nodes. + dict_delitem(self, key) + link_prev, link_next, key = self.__map.pop(key) + link_prev[1] = link_next + link_next[0] = link_prev + + def __iter__(self): + """od.__iter__() <==> iter(od)""" + root = self.__root + curr = root[1] + while curr is not root: + yield curr[2] + curr = curr[1] + + def __reversed__(self): + """od.__reversed__() <==> reversed(od)""" + root = self.__root + curr = root[0] + while curr is not root: + yield curr[2] + curr = curr[0] + + def clear(self): + """od.clear() -> None. Remove all items from od.""" + try: + for node in itervalues(self.__map): + del node[:] + root = self.__root + root[:] = [root, root, None] + self.__map.clear() + except AttributeError: + pass + dict.clear(self) + + def popitem(self, last=True): + """od.popitem() -> (k, v), return and remove a (key, value) pair. + + Pairs are returned in LIFO order if last is true or FIFO order if + false. 
+ """ + if not self: + raise KeyError('dictionary is empty') + root = self.__root + if last: + link = root[0] + link_prev = link[0] + link_prev[1] = root + root[0] = link_prev + else: + link = root[1] + link_next = link[1] + root[1] = link_next + link_next[0] = root + key = link[2] + del self.__map[key] + value = dict.pop(self, key) + return key, value + + # -- the following methods do not depend on the internal structure -- + + def keys(self): + """od.keys() -> list of keys in od""" + return list(self) + + def values(self): + """od.values() -> list of values in od""" + return [self[key] for key in self] + + def items(self): + """od.items() -> list of (key, value) pairs in od""" + return [(key, self[key]) for key in self] + + def iterkeys(self): + """od.iterkeys() -> an iterator over the keys in od""" + return iter(self) + + def itervalues(self): + """od.itervalues -> an iterator over the values in od""" + for k in self: + yield self[k] + + def iteritems(self): + """od.iteritems -> an iterator over the (key, value) items in od""" + for k in self: + yield (k, self[k]) + + def update(*args, **kwds): + """od.update(E, **F) -> None. Update od from dict/iterable E and F. + + If E is a dict instance, does: for k in E: od[k] = E[k] + If E has a .keys() method, does: for k in E.keys(): od[k] = E[k] + Or if E is an iterable of items, does:for k, v in E: od[k] = v + In either case, this is followed by: for k, v in F.items(): od[k] = v + """ + if len(args) > 2: + raise TypeError('update() takes at most 2 positional ' + 'arguments (%d given)' % (len(args),)) + elif not args: + raise TypeError('update() takes at least 1 argument (0 given)') + self = args[0] + # Make progressively weaker assumptions about "other" + other = () + if len(args) == 2: + other = args[1] + if isinstance(other, dict): + for key in other: + self[key] = other[key] + elif hasattr(other, 'keys'): + for key in other.keys(): + self[key] = other[key] + else: + for key, value in other: + self[key] = value + for key, value in kwds.items(): + self[key] = value + # let subclasses override update without breaking __init__ + __update = update + + __marker = object() + + def pop(self, key, default=__marker): + """od.pop(k[,d]) -> v, remove specified key and return the + corresponding value. If key is not found, d is returned if given, + otherwise KeyError is raised. + """ + if key in self: + result = self[key] + del self[key] + return result + if default is self.__marker: + raise KeyError(key) + return default + + def setdefault(self, key, default=None): + """od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od + """ + if key in self: + return self[key] + self[key] = default + return default + + def __repr__(self, _repr_running={}): + """od.__repr__() <==> repr(od)""" + call_key = id(self), _get_ident() + if call_key in _repr_running: + return '...' 
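+        # record that a repr for this (object id, thread) pair is in progress, so a recursive call hits the guard above and returns '...' instead of recursing forever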
+ _repr_running[call_key] = 1 + try: + if not self: + return '%s()' % (self.__class__.__name__,) + return '%s(%r)' % (self.__class__.__name__, list(self.items())) + finally: + del _repr_running[call_key] + + def __reduce__(self): + """Return state information for pickling""" + items = [[k, self[k]] for k in self] + inst_dict = vars(self).copy() + for k in vars(OrderedDict()): + inst_dict.pop(k, None) + if inst_dict: + return (self.__class__, (items,), inst_dict) + return self.__class__, (items,) + + def copy(self): + """od.copy() -> a shallow copy of od""" + return self.__class__(self) + + @classmethod + def fromkeys(cls, iterable, value=None): + """OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S and + values equal to v (which defaults to None). + """ + d = cls() + for key in iterable: + d[key] = value + return d + + def __eq__(self, other): + """od.__eq__(y) <==> od==y. Comparison to another OD is + order-sensitive while comparison to a regular mapping is + order-insensitive. + """ + if isinstance(other, OrderedDict): + return (len(self) == len(other) and + list(self.items()) == list(other.items())) + return dict.__eq__(self, other) + + def __ne__(self, other): + return not self == other + + # -- the following methods are only used in Python 2.7 -- + + def viewkeys(self): + """od.viewkeys() -> a set-like object providing a view on od's keys""" + return KeysView(self) + + def viewvalues(self): + """od.viewvalues() -> an object providing a view on od's values""" + return ValuesView(self) + + def viewitems(self): + """od.viewitems() -> a set-like object providing a view on od's items + """ + return ItemsView(self) + + +# {{{ http://code.activestate.com/recipes/576611/ (r11) + +try: + from operator import itemgetter + from heapq import nlargest +except ImportError: + pass + + +class _Counter(dict): + + """Dict subclass for counting hashable objects. Sometimes called a bag + or multiset. Elements are stored as dictionary keys and their counts + are stored as dictionary values. + + >>> Counter('zyzygy') + Counter({'y': 3, 'z': 2, 'g': 1}) + + """ + + def __init__(self, iterable=None, **kwds): + """Create a new, empty Counter object. And if given, count elements + from an input iterable. Or, initialize the count from another mapping + of elements to their counts. + + >>> c = Counter() # a new, empty counter + >>> c = Counter('gallahad') # a new counter from an iterable + >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping + >>> c = Counter(a=4, b=2) # a new counter from keyword args + + """ + self.update(iterable, **kwds) + + def __missing__(self, key): + return 0 + + def most_common(self, n=None): + """List the n most common elements and their counts from the most + common to the least. If n is None, then list all element counts. + + >>> Counter('abracadabra').most_common(3) + [('a', 5), ('r', 2), ('b', 2)] + + """ + if n is None: + return sorted(iteritems(self), key=itemgetter(1), reverse=True) + return nlargest(n, iteritems(self), key=itemgetter(1)) + + def elements(self): + """Iterator over elements repeating each as many times as its count. + + >>> c = Counter('ABCABC') + >>> sorted(c.elements()) + ['A', 'A', 'B', 'B', 'C', 'C'] + + If an element's count has been set to zero or is a negative number, + elements() will ignore it. + + """ + for elem, count in iteritems(self): + for _ in range(count): + yield elem + + # Override dict methods where the meaning changes for Counter objects. 
+ + @classmethod + def fromkeys(cls, iterable, v=None): + raise NotImplementedError( + 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') + + def update(self, iterable=None, **kwds): + """Like dict.update() but add counts instead of replacing them. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.update('witch') # add elements from another iterable + >>> d = Counter('watch') + >>> c.update(d) # add elements from another counter + >>> c['h'] # four 'h' in which, witch, and watch + 4 + + """ + if iterable is not None: + if hasattr(iterable, 'iteritems'): + if self: + self_get = self.get + for elem, count in iteritems(iterable): + self[elem] = self_get(elem, 0) + count + else: + dict.update( + self, iterable) # fast path when counter is empty + else: + self_get = self.get + for elem in iterable: + self[elem] = self_get(elem, 0) + 1 + if kwds: + self.update(kwds) + + def copy(self): + """Like dict.copy() but returns a Counter instance instead of a dict. + """ + return Counter(self) + + def __delitem__(self, elem): + """Like dict.__delitem__() but does not raise KeyError for missing + values. + """ + if elem in self: + dict.__delitem__(self, elem) + + def __repr__(self): + if not self: + return '%s()' % self.__class__.__name__ + items = ', '.join(map('%r: %r'.__mod__, self.most_common())) + return '%s({%s})' % (self.__class__.__name__, items) + + # Multiset-style mathematical operations discussed in: + # Knuth TAOCP Volume II section 4.6.3 exercise 19 + # and at http://en.wikipedia.org/wiki/Multiset + # + # Outputs guaranteed to only include positive counts. + # + # To strip negative and zero counts, add-in an empty counter: + # c += Counter() + + def __add__(self, other): + """Add counts from two counters. + + >>> Counter('abbb') + Counter('bcc') + Counter({'b': 4, 'c': 2, 'a': 1}) + + """ + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem in set(self) | set(other): + newcount = self[elem] + other[elem] + if newcount > 0: + result[elem] = newcount + return result + + def __sub__(self, other): + """Subtract count, but keep only results with positive counts. + + >>> Counter('abbbc') - Counter('bccd') + Counter({'b': 2, 'a': 1}) + + """ + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem in set(self) | set(other): + newcount = self[elem] - other[elem] + if newcount > 0: + result[elem] = newcount + return result + + def __or__(self, other): + """Union is the maximum of value in either of the input counters. + + >>> Counter('abbb') | Counter('bcc') + Counter({'b': 3, 'c': 2, 'a': 1}) + + """ + if not isinstance(other, Counter): + return NotImplemented + _max = max + result = Counter() + for elem in set(self) | set(other): + newcount = _max(self[elem], other[elem]) + if newcount > 0: + result[elem] = newcount + return result + + def __and__(self, other): + """Intersection is the minimum of corresponding counts. 
+ + >>> Counter('abbb') & Counter('bcc') + Counter({'b': 1}) + + """ + if not isinstance(other, Counter): + return NotImplemented + _min = min + result = Counter() + if len(self) < len(other): + self, other = other, self + for elem in filter(self.__contains__, other): + newcount = _min(self[elem], other[elem]) + if newcount > 0: + result[elem] = newcount + return result + +if sys.version_info[:2] < (2, 7): + OrderedDict = _OrderedDict + Counter = _Counter +else: + from collections import OrderedDict, Counter + +if PY3: + def raise_with_traceback(exc, traceback=Ellipsis): + if traceback == Ellipsis: + _, _, traceback = sys.exc_info() + raise exc.with_traceback(traceback) +else: + # this version of raise is a syntax error in Python 3 + exec(""" +def raise_with_traceback(exc, traceback=Ellipsis): + if traceback == Ellipsis: + _, _, traceback = sys.exc_info() + raise exc, None, traceback +""") + +raise_with_traceback.__doc__ = """Raise exception with existing traceback. +If traceback is not passed, uses sys.exc_info() to get traceback.""" + + +# http://stackoverflow.com/questions/4126348 +# Thanks to @martineau at SO + +from dateutil import parser as _date_parser +import dateutil +if LooseVersion(dateutil.__version__) < '2.0': + @functools.wraps(_date_parser.parse) + def parse_date(timestr, *args, **kwargs): + timestr = bytes(timestr) + return _date_parser.parse(timestr, *args, **kwargs) +else: + parse_date = _date_parser.parse + + +class OrderedDefaultdict(OrderedDict): + + def __init__(self, *args, **kwargs): + newdefault = None + newargs = () + if args: + newdefault = args[0] + if not (newdefault is None or callable(newdefault)): + raise TypeError('first argument must be callable or None') + newargs = args[1:] + self.default_factory = newdefault + super(self.__class__, self).__init__(*newargs, **kwargs) + + def __missing__(self, key): + if self.default_factory is None: + raise KeyError(key) + self[key] = value = self.default_factory() + return value + + def __reduce__(self): # optional, for pickle support + args = self.default_factory if self.default_factory else tuple() + return type(self), args, None, None, list(self.items()) diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py new file mode 100644 index 00000000..9edd2ef0 --- /dev/null +++ b/pandas/compat/chainmap.py @@ -0,0 +1,26 @@ +try: + from collections import ChainMap +except ImportError: + from pandas.compat.chainmap_impl import ChainMap + + +class DeepChainMap(ChainMap): + def __setitem__(self, key, value): + for mapping in self.maps: + if key in mapping: + mapping[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key): + for mapping in self.maps: + if key in mapping: + del mapping[key] + return + raise KeyError(key) + + # override because the m parameter is introduced in Python 3.4 + def new_child(self, m=None): + if m is None: + m = {} + return self.__class__(m, *self.maps) diff --git a/pandas/compat/chainmap_impl.py b/pandas/compat/chainmap_impl.py new file mode 100644 index 00000000..92d24240 --- /dev/null +++ b/pandas/compat/chainmap_impl.py @@ -0,0 +1,136 @@ +from collections import MutableMapping + +try: + from thread import get_ident +except ImportError: + from _thread import get_ident + + +def recursive_repr(fillvalue='...'): + 'Decorator to make a repr function return fillvalue for a recursive call' + + def decorating_function(user_function): + repr_running = set() + + def wrapper(self): + key = id(self), get_ident() + if key in repr_running: + return fillvalue + 
repr_running.add(key) + try: + result = user_function(self) + finally: + repr_running.discard(key) + return result + + # Can't use functools.wraps() here because of bootstrap issues + wrapper.__module__ = getattr(user_function, '__module__') + wrapper.__doc__ = getattr(user_function, '__doc__') + wrapper.__name__ = getattr(user_function, '__name__') + return wrapper + + return decorating_function + + +class ChainMap(MutableMapping): + ''' A ChainMap groups multiple dicts (or other mappings) together + to create a single, updateable view. + + The underlying mappings are stored in a list. That list is public and can + accessed or updated using the *maps* attribute. There is no other state. + + Lookups search the underlying mappings successively until a key is found. + In contrast, writes, updates, and deletions only operate on the first + mapping. + + ''' + + def __init__(self, *maps): + '''Initialize a ChainMap by setting *maps* to the given mappings. + If no mappings are provided, a single empty dictionary is used. + + ''' + self.maps = list(maps) or [{}] # always at least one map + + def __missing__(self, key): + raise KeyError(key) + + def __getitem__(self, key): + for mapping in self.maps: + try: + return mapping[key] # can't use 'key in mapping' with defaultdict + except KeyError: + pass + return self.__missing__(key) # support subclasses that define __missing__ + + def get(self, key, default=None): + return self[key] if key in self else default + + def __len__(self): + return len(set().union(*self.maps)) # reuses stored hash values if possible + + def __iter__(self): + return iter(set().union(*self.maps)) + + def __contains__(self, key): + return any(key in m for m in self.maps) + + def __bool__(self): + return any(self.maps) + + @recursive_repr() + def __repr__(self): + return '{0.__class__.__name__}({1})'.format( + self, ', '.join(repr(m) for m in self.maps)) + + @classmethod + def fromkeys(cls, iterable, *args): + 'Create a ChainMap with a single dict created from the iterable.' + return cls(dict.fromkeys(iterable, *args)) + + def copy(self): + 'New ChainMap or subclass with a new copy of maps[0] and refs to maps[1:]' + return self.__class__(self.maps[0].copy(), *self.maps[1:]) + + __copy__ = copy + + def new_child(self, m=None): # like Django's Context.push() + ''' + New ChainMap with a new map followed by all previous maps. If no + map is provided, an empty dict is used. + ''' + if m is None: + m = {} + return self.__class__(m, *self.maps) + + @property + def parents(self): # like Django's Context.pop() + 'New ChainMap from maps[1:].' + return self.__class__(*self.maps[1:]) + + def __setitem__(self, key, value): + self.maps[0][key] = value + + def __delitem__(self, key): + try: + del self.maps[0][key] + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def popitem(self): + 'Remove and return an item pair from maps[0]. Raise KeyError is maps[0] is empty.' + try: + return self.maps[0].popitem() + except KeyError: + raise KeyError('No keys found in the first mapping.') + + def pop(self, key, *args): + 'Remove *key* from maps[0] and return its value. Raise KeyError if *key* not in maps[0].' + try: + return self.maps[0].pop(key, *args) + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def clear(self): + 'Clear maps[0], leaving maps[1:] intact.' 
+ self.maps[0].clear() diff --git a/pandas/compat/openpyxl_compat.py b/pandas/compat/openpyxl_compat.py new file mode 100644 index 00000000..25ba83d5 --- /dev/null +++ b/pandas/compat/openpyxl_compat.py @@ -0,0 +1,24 @@ +""" +Detect incompatible version of OpenPyXL + +GH7169 +""" + +from distutils.version import LooseVersion + +start_ver = '1.6.1' +stop_ver = '2.0.0' + + +def is_compat(): + """Detect whether the installed version of openpyxl is supported. + + Returns + ------- + compat : bool + ``True`` if openpyxl is installed and is between versions 1.6.1 and + 2.0.0, ``False`` otherwise. + """ + import openpyxl + ver = LooseVersion(openpyxl.__version__) + return LooseVersion(start_ver) < ver <= LooseVersion(stop_ver) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py new file mode 100644 index 00000000..03b45336 --- /dev/null +++ b/pandas/compat/pickle_compat.py @@ -0,0 +1,113 @@ +""" support pre 0.12 series pickle compatibility """ + +import sys +import numpy as np +import pandas +import copy +import pickle as pkl +from pandas import compat +from pandas.compat import u, string_types +from pandas.core.series import Series, TimeSeries +from pandas.sparse.series import SparseSeries, SparseTimeSeries + + +def load_reduce(self): + stack = self.stack + args = stack.pop() + func = stack[-1] + if type(args[0]) is type: + n = args[0].__name__ + if n == u('DeprecatedSeries') or n == u('DeprecatedTimeSeries'): + stack[-1] = object.__new__(Series) + return + elif (n == u('DeprecatedSparseSeries') or + n == u('DeprecatedSparseTimeSeries')): + stack[-1] = object.__new__(SparseSeries) + return + + try: + value = func(*args) + except: + + # try to reencode the arguments + if getattr(self,'encoding',None) is not None: + args = tuple([arg.encode(self.encoding) + if isinstance(arg, string_types) + else arg for arg in args]) + try: + stack[-1] = func(*args) + return + except: + pass + + if getattr(self,'is_verbose',None): + print(sys.exc_info()) + print(func, args) + raise + + stack[-1] = value + +if compat.PY3: + class Unpickler(pkl._Unpickler): + pass +else: + class Unpickler(pkl.Unpickler): + pass + +Unpickler.dispatch = copy.copy(Unpickler.dispatch) +Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce + + +def load(fh, encoding=None, compat=False, is_verbose=False): + """load a pickle, with a provided encoding + + if compat is True: + fake the old class hierarchy + if it works, then return the new type objects + + Parameters + ---------- + fh: a filelike object + encoding: an optional encoding + compat: provide Series compatibility mode, boolean, default False + is_verbose: show exception output + """ + + try: + if compat: + pandas.core.series.Series = DeprecatedSeries + pandas.core.series.TimeSeries = DeprecatedTimeSeries + pandas.sparse.series.SparseSeries = DeprecatedSparseSeries + pandas.sparse.series.SparseTimeSeries = DeprecatedSparseTimeSeries + fh.seek(0) + if encoding is not None: + up = Unpickler(fh, encoding=encoding) + else: + up = Unpickler(fh) + up.is_verbose = is_verbose + + return up.load() + except: + raise + finally: + if compat: + pandas.core.series.Series = Series + pandas.core.series.Series = TimeSeries + pandas.sparse.series.SparseSeries = SparseSeries + pandas.sparse.series.SparseTimeSeries = SparseTimeSeries + + +class DeprecatedSeries(np.ndarray, Series): + pass + + +class DeprecatedTimeSeries(DeprecatedSeries): + pass + + +class DeprecatedSparseSeries(DeprecatedSeries): + pass + + +class DeprecatedSparseTimeSeries(DeprecatedSparseSeries): + pass diff 
--git a/pandas/computation/__init__.py b/pandas/computation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/computation/align.py b/pandas/computation/align.py new file mode 100644 index 00000000..2e0845bd --- /dev/null +++ b/pandas/computation/align.py @@ -0,0 +1,183 @@ +"""Core eval alignment algorithms +""" + +import warnings +from functools import partial, wraps +from pandas.compat import zip, range + +import numpy as np + +import pandas as pd +from pandas import compat +import pandas.core.common as com +from pandas.computation.common import _result_type_many + + +def _align_core_single_unary_op(term): + if isinstance(term.value, np.ndarray): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + ret = typ, + + if not hasattr(term.value, 'axes'): + ret += None, + else: + ret += _zip_axes_from_type(typ, term.value.axes), + return ret + + +def _zip_axes_from_type(typ, new_axes): + axes = {} + for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES): + axes[ax_name] = new_axes[ax_ind] + return axes + + +def _any_pandas_objects(terms): + """Check a sequence of terms for instances of PandasObject.""" + return any(isinstance(term.value, pd.core.generic.PandasObject) + for term in terms) + + +def _filter_special_cases(f): + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + term_values = (term.value for term in terms) + + # we don't have any pandas objects + if not _any_pandas_objects(terms): + return _result_type_many(*term_values), None + + return f(terms) + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) + if hasattr(term.value, 'axes')] + term_dims = [terms[i].value.ndim for i in term_index] + ndims = pd.Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + gt_than_one_axis = naxes > 1 + + for value in (terms[i].value for i in term_index): + is_series = isinstance(value, pd.Series) + is_series_and_gt_one_axis = is_series and gt_than_one_axis + + for axis, items in enumerate(value.axes): + if is_series_and_gt_one_axis: + ax, itm = naxes - 1, value.index + else: + ax, itm = axis, items + + if not axes[ax].is_(itm): + axes[ax] = axes[ax].join(itm, how='outer') + + for i, ndim in compat.iteritems(ndims): + for axis, items in zip(range(ndim), axes): + ti = terms[i].value + + if hasattr(ti, 'reindex_axis'): + transpose = isinstance(ti, pd.Series) and naxes > 1 + reindexer = axes[naxes - 1] if transpose else items + + term_axis_size = len(ti.axes[axis]) + reindexer_size = len(reindexer) + + ordm = np.log10(abs(reindexer_size - term_axis_size)) + if ordm >= 1 and reindexer_size >= 10000: + warnings.warn('Alignment difference on axis {0} is larger ' + 'than an order of magnitude on term {1!r}, ' + 'by more than {2:.4g}; performance may ' + 'suffer'.format(axis, terms[i].name, ordm), + category=pd.io.common.PerformanceWarning) + + if transpose: + f = partial(ti.reindex, index=reindexer, copy=False) + else: + f = partial(ti.reindex_axis, reindexer, axis=axis, + copy=False) + + terms[i].update(f()) + + terms[i].update(terms[i].value.values) + + return typ, _zip_axes_from_type(typ, axes) + + +def _align(terms): + """Align a set of terms""" + try: + # flatten the parse tree (a nested list, really) + terms = list(com.flatten(terms)) + 
except TypeError: + # can't iterate so it must just be a constant or single variable + if isinstance(terms.value, pd.core.generic.NDFrame): + typ = type(terms.value) + return typ, _zip_axes_from_type(typ, terms.value.axes) + return np.result_type(terms.type), None + + # if all resolved variables are numeric scalars + if all(term.isscalar for term in terms): + return _result_type_many(*(term.value for term in terms)).type, None + + # perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def _reconstruct_object(typ, obj, axes, dtype): + """Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + ret : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. + """ + try: + typ = typ.type + except AttributeError: + pass + + res_t = np.result_type(obj.dtype, dtype) + + if (not isinstance(typ, partial) and + issubclass(typ, pd.core.generic.PandasObject)): + return typ(obj, dtype=res_t, **axes) + + # special case for pathological things like ~True/~False + if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) + + try: + ret = ret_value.item() + except (ValueError, IndexError): + # XXX: we catch IndexError to absorb a + # regression in numpy 1.7.0 + # fixed by numpy/numpy@04b89c63 + ret = ret_value + return ret diff --git a/pandas/computation/api.py b/pandas/computation/api.py new file mode 100644 index 00000000..db8269a4 --- /dev/null +++ b/pandas/computation/api.py @@ -0,0 +1,2 @@ +from pandas.computation.eval import eval +from pandas.computation.expr import Expr diff --git a/pandas/computation/common.py b/pandas/computation/common.py new file mode 100644 index 00000000..105cc497 --- /dev/null +++ b/pandas/computation/common.py @@ -0,0 +1,24 @@ +import numpy as np +import pandas as pd +from pandas.compat import reduce + + +def _ensure_decoded(s): + """ if we have bytes, decode them to unicode """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode(pd.get_option('display.encoding')) + return s + + +def _result_type_many(*arrays_and_dtypes): + """ wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) + argument limit """ + try: + return np.result_type(*arrays_and_dtypes) + except ValueError: + # we have > NPY_MAXARGS terms in our expression + return reduce(np.result_type, arrays_and_dtypes) + + +class NameResolutionError(NameError): + pass diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py new file mode 100644 index 00000000..58b822af --- /dev/null +++ b/pandas/computation/engines.py @@ -0,0 +1,147 @@ +"""Engine classes for :func:`~pandas.eval` +""" + +import abc + +from pandas import compat +from pandas.compat import DeepChainMap, map +from pandas.core import common as com +from pandas.computation.align import _align, _reconstruct_object +from pandas.computation.ops import UndefinedVariableError, _mathops, _reductions + + +_ne_builtins = frozenset(_mathops + _reductions) + + +class NumExprClobberingError(NameError): + pass + + +def _check_ne_builtin_clash(expr): + """Attempt to prevent foot-shooting in a helpful way. 
+ + Parameters + ---------- + terms : Term + Terms can contain + """ + names = expr.names + overlap = names & _ne_builtins + + if overlap: + s = ', '.join(map(repr, overlap)) + raise NumExprClobberingError('Variables in expression "%s" overlap with ' + 'numexpr builtins: (%s)' % (expr, s)) + + +class AbstractEngine(object): + + """Object serving as a base class for all engines.""" + + __metaclass__ = abc.ABCMeta + + has_neg_frac = False + + def __init__(self, expr): + self.expr = expr + self.aligned_axes = None + self.result_type = None + + def convert(self): + """Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return com.pprint_thing(self.expr) + + def evaluate(self): + """Run the engine on the expression + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + obj : object + The result of the passed expression. + """ + if not self._is_aligned: + self.result_type, self.aligned_axes = _align(self.expr.terms) + + # make sure no names in resolvers and locals/globals clash + res = self._evaluate() + return _reconstruct_object(self.result_type, res, self.aligned_axes, + self.expr.terms.return_type) + + @property + def _is_aligned(self): + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self): + """Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + Must be implemented by subclasses. + """ + pass + + +class NumExprEngine(AbstractEngine): + + """NumExpr engine class""" + has_neg_frac = True + + def __init__(self, expr): + super(NumExprEngine, self).__init__(expr) + + def convert(self): + return str(super(NumExprEngine, self).convert()) + + def _evaluate(self): + import numexpr as ne + + # convert the expression to a valid numexpr expression + s = self.convert() + + try: + env = self.expr.env + scope = env.full_scope + truediv = scope['truediv'] + _check_ne_builtin_clash(self.expr) + return ne.evaluate(s, local_dict=scope, truediv=truediv) + except KeyError as e: + # python 3 compat kludge + try: + msg = e.message + except AttributeError: + msg = compat.text_type(e) + raise UndefinedVariableError(msg) + + +class PythonEngine(AbstractEngine): + + """Evaluate an expression in Python space. + + Mostly for testing purposes. + """ + has_neg_frac = False + + def __init__(self, expr): + super(PythonEngine, self).__init__(expr) + + def evaluate(self): + return self.expr() + + def _evaluate(self): + pass + + +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py new file mode 100644 index 00000000..82c68fb1 --- /dev/null +++ b/pandas/computation/eval.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python + +"""Top level ``eval`` module. +""" + +import tokenize +from pandas.core import common as com +from pandas.computation.expr import Expr, _parsers, tokenize_string +from pandas.computation.scope import _ensure_scope +from pandas.compat import DeepChainMap, builtins +from pandas.computation.engines import _engines +from distutils.version import LooseVersion + + +def _check_engine(engine): + """Make sure a valid engine is passed. 
+ + Parameters + ---------- + engine : str + + Raises + ------ + KeyError + * If an invalid engine is passed + ImportError + * If numexpr was requested but doesn't exist + """ + if engine not in _engines: + raise KeyError('Invalid engine {0!r} passed, valid engines are' + ' {1}'.format(engine, list(_engines.keys()))) + + # TODO: validate this in a more general way (thinking of future engines + # that won't necessarily be import-able) + # Could potentially be done on engine instantiation + if engine == 'numexpr': + try: + import numexpr + except ImportError: + raise ImportError("'numexpr' not found. Cannot use " + "engine='numexpr' for query/eval " + "if 'numexpr' is not installed") + else: + ne_version = numexpr.__version__ + if ne_version < LooseVersion('2.0'): + raise ImportError("'numexpr' version is %s, " + "must be >= 2.0" % ne_version) + + +def _check_parser(parser): + """Make sure a valid parser is passed. + + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ + if parser not in _parsers: + raise KeyError('Invalid parser {0!r} passed, valid parsers are' + ' {1}'.format(parser, _parsers.keys())) + + +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, '__getitem__'): + name = type(resolver).__name__ + raise TypeError('Resolver of type %r does not implement ' + 'the __getitem__ method' % name) + + +def _check_expression(expr): + """Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr): + """Convert an object to an expression. + + Thus function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + s : unicode + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. + """ + s = com.pprint_thing(expr) + _check_expression(s) + return s + + +def _check_for_locals(expr, stack_level, parser): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != 'pandas' + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ("The '@' prefix is not allowed in " + "top-level eval calls, \nplease refer to " + "your variables by name without the '@' " + "prefix") + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval in tokenize_string(expr): + if toknum == tokenize.OP and tokval == '@': + raise SyntaxError(msg) + + +def eval(expr, parser='pandas', engine='numexpr', truediv=True, + local_dict=None, global_dict=None, resolvers=(), level=0, + target=None): + """Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. 
:class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + Parameters + ---------- + expr : str or unicode + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser : string, default 'pandas', {'pandas', 'python'} + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine : string, default 'numexpr', {'python', 'numexpr'} + + The engine used to evaluate the expression. Supported engines are + + - ``'numexpr'``: This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions + with large frames. + - ``'python'``: Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + + truediv : bool, optional + Whether to use true division, like in Python >= 3 + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. + global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~pandas.DataFrame.query` method to inject the + :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + target : a target object for assignment, optional, default is None + essentially this is a passed in resolver + + Returns + ------- + ndarray, numeric scalar, DataFrame, Series + + Notes + ----- + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. 
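+
+    For example (an illustrative sketch; ``df`` is assumed to be a
+    DataFrame with numeric columns ``a`` and ``b``):
+
+    >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    >>> pd.eval('df.a + df.b')
+    0    4
+    1    6
+    dtype: int64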
+ + See Also + -------- + pandas.DataFrame.query + pandas.DataFrame.eval + """ + expr = _convert_expression(expr) + _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + _check_for_locals(expr, level, parser) + + # get our (possibly passed-in) scope + level += 1 + env = _ensure_scope(level, global_dict=global_dict, + local_dict=local_dict, resolvers=resolvers, + target=target) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, + truediv=truediv) + + # construct the engine and evaluate the parsed expression + eng = _engines[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + + # assign if needed + if env.target is not None and parsed_expr.assigner is not None: + env.target[parsed_expr.assigner] = ret + return None + + return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py new file mode 100644 index 00000000..b6a1fcbe --- /dev/null +++ b/pandas/computation/expr.py @@ -0,0 +1,662 @@ +""":func:`~pandas.eval` parsers +""" + +import ast +import operator +import sys +import inspect +import tokenize +import datetime + +from functools import partial + +import pandas as pd +from pandas import compat +from pandas.compat import StringIO, lmap, zip, reduce, string_types +from pandas.core.base import StringMixin +from pandas.core import common as com +from pandas.tools.util import compose +from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, + _arith_ops_syms, _unary_ops_syms, is_term) +from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG +from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div +from pandas.computation.ops import UndefinedVariableError +from pandas.computation.scope import Scope, _ensure_scope + + +def tokenize_string(source): + """Tokenize a Python source code string. + + Parameters + ---------- + source : str + A Python source code string + """ + line_reader = StringIO(source).readline + for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader): + yield toknum, tokval + + +def _rewrite_assign(tok): + """Rewrite the assignment operator for PyTables expressions that use ``=`` + as a substitute for ``==``. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + return toknum, '==' if tokval == '=' else tokval + + +def _replace_booleans(tok): + """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == tokenize.OP: + if tokval == '&': + return tokenize.NAME, 'and' + elif tokval == '|': + return tokenize.NAME, 'or' + return toknum, tokval + return toknum, tokval + + +def _replace_locals(tok): + """Replace local variables with a syntactically valid name. 
+ + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + + Notes + ----- + This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as + ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` + is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. + """ + toknum, tokval = tok + if toknum == tokenize.OP and tokval == '@': + return tokenize.OP, _LOCAL_TAG + return toknum, tokval + + +def _preparse(source, f=compose(_replace_locals, _replace_booleans, + _rewrite_assign)): + """Compose a collection of tokenization functions + + Parameters + ---------- + source : str + A Python source code string + f : callable + This takes a tuple of (toknum, tokval) as its argument and returns a + tuple with the same structure but possibly different elements. Defaults + to the composition of ``_rewrite_assign``, ``_replace_booleans``, and + ``_replace_locals``. + + Returns + ------- + s : str + Valid Python source code + + Notes + ----- + The `f` parameter can be any callable that takes *and* returns input of the + form ``(toknum, tokval)``, where ``toknum`` is one of the constants from + the ``tokenize`` module and ``tokval`` is a string. + """ + assert callable(f), 'f must be callable' + return tokenize.untokenize(lmap(f, tokenize_string(source))) + + +def _is_type(t): + """Factory for a type checking function of type ``t`` or tuple of types.""" + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(string_types) + + +# partition all AST nodes +_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and + issubclass(x, ast.AST), + (getattr(ast, node) for node in dir(ast)))) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + """Filter out AST nodes that are subclasses of ``superclass``.""" + node_names = (node.__name__ for node in all_nodes + if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes)) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_slice_nodes = _filter_nodes(ast.slice) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(['Assign', 'Module', 'Expr']) + + +_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp', + 'DictComp', 'SetComp', 'Repr', 'Lambda', + 'Set', 'AST', 'Is', 'IsNot']) + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | + _arguments_nodes | _keyword_nodes | _alias_nodes | + _expr_context_nodes | _unsupported_expr_nodes) - + _hacked_nodes) + +# we're adding a different assignment in some cases to be equality comparison +# and we don't want `stmt` and friends in their so get only the class whose +# names are capitalized +_base_supported_nodes = (_all_node_names - 
_unsupported_nodes) | _hacked_nodes +_msg = 'cannot both support and not support {0}'.format(_unsupported_nodes & + _base_supported_nodes) +assert not _unsupported_nodes & _base_supported_nodes, _msg + + +def _node_not_implemented(node_name, cls): + """Return a function that raises a NotImplementedError with a passed node + name. + """ + + def f(self, *args, **kwargs): + raise NotImplementedError("{0!r} nodes are not " + "implemented".format(node_name)) + return f + + +def disallow(nodes): + """Decorator to disallow certain nodes from parsing. Raises a + NotImplementedError instead. + + Returns + ------- + disallowed : callable + """ + def disallowed(cls): + cls.unsupported_nodes = () + for node in nodes: + new_method = _node_not_implemented(node, cls) + name = 'visit_{0}'.format(node) + cls.unsupported_nodes += (name,) + setattr(cls, name, new_method) + return cls + return disallowed + + +def _op_maker(op_class, op_symbol): + """Return a function to create an op class with its symbol already passed. + + Returns + ------- + f : callable + """ + + def f(self, node, *args, **kwargs): + """Return a partial function with an Op subclass with an operator + already passed. + + Returns + ------- + f : callable + """ + return partial(op_class, op_symbol, *args, **kwargs) + return f + + +_op_classes = {'binary': BinOp, 'unary': UnaryOp} + + +def add_ops(op_classes): + """Decorator to add default implementation of ops.""" + def f(cls): + for op_attr_name, op_class in compat.iteritems(op_classes): + ops = getattr(cls, '{0}_ops'.format(op_attr_name)) + ops_map = getattr(cls, '{0}_op_nodes_map'.format(op_attr_name)) + for op in ops: + op_node = ops_map[op] + if op_node is not None: + made_op = _op_maker(op_class, op) + setattr(cls, 'visit_{0}'.format(op_node), made_op) + return cls + return f + + +@disallow(_unsupported_nodes) +@add_ops(_op_classes) +class BaseExprVisitor(ast.NodeVisitor): + + """Custom ast walker. Parsers of other engines should subclass this class + if necessary. 
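+
+    The visitor walks the (preparsed) Python AST of an expression and turns
+    each node into the ``Term``/``Op`` objects defined in
+    ``pandas.computation.ops``, producing a tree of terms that an engine can
+    evaluate.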
+ + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ + const_type = Constant + term_type = Term + + binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn', + 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', + None, 'Pow', 'FloorDiv', 'Mod') + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) + + unary_ops = _unary_ops_syms + unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn + } + + def __init__(self, env, engine, parser, preparser=_preparse): + self.env = env + self.engine = engine + self.parser = parser + self.preparser = preparser + self.assigner = None + + def visit(self, node, **kwargs): + if isinstance(node, string_types): + clean = self.preparser(node) + node = ast.fix_missing_locations(ast.parse(clean)) + + method = 'visit_' + node.__class__.__name__ + visitor = getattr(self, method) + return visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError('only a single expression is allowed') + expr = node.body[0] + return self.visit(expr, **kwargs) + + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) + + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + if left_str: + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) + + op = self.visit(op_instance) + return op, op_instance, left, right + + def _possibly_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side='left') + if right is None: + right = self.visit(node.right, side='right') + op, op_class, left, right = self._rewrite_membership_op(node, left, + right) + return op, op_class, left, right + + def _possibly_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. 
for example, + # [1,2] in a + 2 * b + # in that case a + 2 * b will be evaluated using numexpr, and the "in" + # call will be evaluated using isin (in python space) + return binop.evaluate(self.env, self.engine, self.parser, + self.term_type, eval_in_python) + + def _possibly_evaluate_binop(self, op, op_class, lhs, rhs, + eval_in_python=('in', 'not in'), + maybe_eval_in_python=('==', '!=', '<', '>', + '<=', '>=')): + res = op(lhs, rhs) + + if res.has_invalid_return_type: + raise TypeError("unsupported operand type(s) for {0}:" + " '{1}' and '{2}'".format(res.op, lhs.type, + rhs.type)) + + if self.engine != 'pytables': + if (res.op in _cmp_ops_syms + and getattr(lhs, 'is_datetime', False) + or getattr(rhs, 'is_datetime', False)): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._possibly_eval(res, self.binary_ops) + + if res.op in eval_in_python: + # "in"/"not in" ops are always evaluated in python + return self._possibly_eval(res, eval_in_python) + elif self.engine != 'pytables': + if (getattr(lhs, 'return_type', None) == object + or getattr(rhs, 'return_type', None) == object): + # evaluate "==" and "!=" in python if either of our operands + # has an object return type + return self._possibly_eval(res, eval_in_python + + maybe_eval_in_python) + return res + + def visit_BinOp(self, node, **kwargs): + op, op_class, left, right = self._possibly_transform_eq_ne(node) + return self._possibly_evaluate_binop(op, op_class, left, right) + + def visit_Div(self, node, **kwargs): + truediv = self.env.scope['truediv'] + return lambda lhs, rhs: Div(lhs, rhs, truediv) + + def visit_UnaryOp(self, node, **kwargs): + op = self.visit(node.op) + operand = self.visit(node.operand) + return op(operand) + + def visit_Name(self, node, **kwargs): + return self.term_type(node.id, self.env, **kwargs) + + def visit_NameConstant(self, node, **kwargs): + return self.const_type(node.value, self.env) + + def visit_Num(self, node, **kwargs): + return self.const_type(node.n, self.env) + + def visit_Str(self, node, **kwargs): + name = self.env.add_tmp(node.s) + return self.term_type(name, self.env) + + def visit_List(self, node, **kwargs): + name = self.env.add_tmp([self.visit(e).value for e in node.elts]) + return self.term_type(name, self.env) + + visit_Tuple = visit_List + + def visit_Index(self, node, **kwargs): + """ df.index[4] """ + return self.visit(node.value) + + def visit_Subscript(self, node, **kwargs): + value = self.visit(node.value) + slobj = self.visit(node.slice) + result = pd.eval(slobj, local_dict=self.env, engine=self.engine, + parser=self.parser) + try: + # a Term instance + v = value.value[result] + except AttributeError: + # an Op instance + lhs = pd.eval(value, local_dict=self.env, engine=self.engine, + parser=self.parser) + v = lhs[result] + name = self.env.add_tmp(v) + return self.term_type(name, env=self.env) + + def visit_Slice(self, node, **kwargs): + """ df.index[slice(4,6)] """ + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower, upper, step) + + def visit_Assign(self, node, **kwargs): + """ + support a single assignment node, like + + c = a + b + + set the assigner at the top level, must be a Name node which + might or might not exist in the resolvers + + """ + + if len(node.targets) != 1: + raise SyntaxError('can only assign a single 
expression') + if not isinstance(node.targets[0], ast.Name): + raise SyntaxError('left hand side of an assignment must be a ' + 'single name') + if self.env.target is None: + raise ValueError('cannot assign without a target object') + + try: + assigner = self.visit(node.targets[0], **kwargs) + except UndefinedVariableError: + assigner = node.targets[0].id + + self.assigner = getattr(assigner, 'name', assigner) + if self.assigner is None: + raise SyntaxError('left hand side of an assignment must be a ' + 'single resolvable name') + + return self.visit(node.value, **kwargs) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + # resolve the value + resolved = self.visit(value).value + try: + v = getattr(resolved, attr) + name = self.env.add_tmp(v) + return self.term_type(name, self.env) + except AttributeError: + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + + def visit_Call(self, node, side=None, **kwargs): + + # this can happen with: datetime.datetime + if isinstance(node.func, ast.Attribute): + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + else: + res = self.visit(node.func) + + if res is None: + raise ValueError("Invalid function call {0}".format(node.func.id)) + if hasattr(res, 'value'): + res = res.value + + args = [self.visit(targ).value for targ in node.args] + if node.starargs is not None: + args += self.visit(node.starargs).value + + keywords = {} + for key in node.keywords: + if not isinstance(key, ast.keyword): + raise ValueError("keyword error in function call " + "'{0}'".format(node.func.id)) + keywords[key.arg] = self.visit(key.value).value + if node.kwargs is not None: + keywords.update(self.visit(node.kwargs).value) + + return self.const_type(res(*args, **keywords), self.env) + + def translate_In(self, op): + return op + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + + # base case: we have something like a CMP b + if len(comps) == 1: + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. 
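+        # e.g. ``a < b <= c`` is expanded pairwise to ``(a < b) & (b <= c)``:
+        # each adjacent (left, op, right) triple below becomes its own
+        # comparison node and the pieces are combined with a BoolOp(And)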
+ left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = self.visit(ast.Compare(comparators=[comp], left=left, + ops=[self.translate_In(op)])) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + + def visit_BoolOp(self, node, **kwargs): + def visitor(x, y): + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) + + op, op_class, lhs, rhs = self._possibly_transform_eq_ne(node, lhs, + rhs) + return self._possibly_evaluate_binop(op, node.op, lhs, rhs) + + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(['Dict', 'Call', 'BoolOp', 'In', 'NotIn']) +_numexpr_supported_calls = frozenset(_reductions + _mathops) + + +@disallow((_unsupported_nodes | _python_not_supported) - + (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn', + 'Tuple']))) +class PandasExprVisitor(BaseExprVisitor): + + def __init__(self, env, engine, parser, + preparser=partial(_preparse, f=compose(_replace_locals, + _replace_booleans))): + super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) + + +@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) +class PythonExprVisitor(BaseExprVisitor): + + def __init__(self, env, engine, parser, preparser=lambda x: x): + super(PythonExprVisitor, self).__init__(env, engine, parser, + preparser=preparser) + + +class Expr(StringMixin): + + """Object encapsulating an expression. + + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + truediv : bool, optional, default True + level : int, optional, default 2 + """ + + def __init__(self, expr, engine='numexpr', parser='pandas', env=None, + truediv=True, level=0): + self.expr = expr + self.env = env or Scope(level=level + 1) + self.engine = engine + self.parser = parser + self.env.scope['truediv'] = truediv + self._visitor = _parsers[parser](self.env, self.engine, self.parser) + self.terms = self.parse() + + @property + def assigner(self): + return getattr(self._visitor, 'assigner', None) + + def __call__(self): + return self.terms(self.env) + + def __unicode__(self): + return com.pprint_thing(self.terms) + + def __len__(self): + return len(self.expr) + + def parse(self): + """Parse an expression""" + return self._visitor.visit(self.expr) + + @property + def names(self): + """Get the names in an expression""" + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + +_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/computation/expressions.py b/pandas/computation/expressions.py new file mode 100644 index 00000000..47d3fce6 --- /dev/null +++ b/pandas/computation/expressions.py @@ -0,0 +1,258 @@ +""" +Expressions +----------- + +Offer fast expression evaluation through numexpr + +""" + +import warnings +import numpy as np +from pandas.core.common import _values_from_object +from distutils.version import LooseVersion + +try: + import numexpr as ne + _NUMEXPR_INSTALLED = ne.__version__ >= LooseVersion('2.0') +except ImportError: # pragma: no cover + _NUMEXPR_INSTALLED = False + +_TEST_MODE = None +_TEST_RESULT = None +_USE_NUMEXPR = _NUMEXPR_INSTALLED +_evaluate = None +_where = None + +# the set of dtypes that we will allow pass to 
numexpr +_ALLOWED_DTYPES = { + 'evaluate': set(['int64', 'int32', 'float64', 'float32', 'bool']), + 'where': set(['int64', 'float64', 'bool']) +} + +# the minimum prod shape that we will use numexpr +_MIN_ELEMENTS = 10000 + + +def set_use_numexpr(v=True): + # set/unset to use numexpr + global _USE_NUMEXPR + if _NUMEXPR_INSTALLED: + _USE_NUMEXPR = v + + # choose what we are going to do + global _evaluate, _where + if not _USE_NUMEXPR: + _evaluate = _evaluate_standard + _where = _where_standard + else: + _evaluate = _evaluate_numexpr + _where = _where_numexpr + + +def set_numexpr_threads(n=None): + # if we are using numexpr, set the threads to n + # otherwise reset + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) + + +def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): + """ standard evaluation """ + if _TEST_MODE: + _store_test_result(False) + return op(a, b) + + +def _can_use_numexpr(op, op_str, a, b, dtype_check): + """ return a boolean if we WILL be using numexpr """ + if op_str is not None: + + # required min elements (otherwise we are adding overhead) + if np.prod(a.shape) > _MIN_ELEMENTS: + + # check for dtype compatiblity + dtypes = set() + for o in [a, b]: + if hasattr(o, 'get_dtype_counts'): + s = o.get_dtype_counts() + if len(s) > 1: + return False + dtypes |= set(s.index) + elif isinstance(o, np.ndarray): + dtypes |= set([o.dtype.name]) + + # allowed are a superset + if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes: + return True + + return False + + +def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False, truediv=True, reversed=False, + **eval_kwargs): + result = None + + if _can_use_numexpr(op, op_str, a, b, 'evaluate'): + try: + + # we were originally called by a reversed op + # method + if reversed: + a,b = b,a + + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) + result = ne.evaluate('a_value %s b_value' % op_str, + local_dict={'a_value': a_value, + 'b_value': b_value}, + casting='safe', truediv=truediv, + **eval_kwargs) + except ValueError as detail: + if 'unknown type object' in str(detail): + pass + except Exception as detail: + if raise_on_error: + raise + + if _TEST_MODE: + _store_test_result(result is not None) + + if result is None: + result = _evaluate_standard(op, op_str, a, b, raise_on_error) + + return result + + +def _where_standard(cond, a, b, raise_on_error=True): + return np.where(_values_from_object(cond), _values_from_object(a), + _values_from_object(b)) + + +def _where_numexpr(cond, a, b, raise_on_error=False): + result = None + + if _can_use_numexpr(None, 'where', a, b, 'where'): + + try: + cond_value = getattr(cond, 'values', cond) + a_value = getattr(a, 'values', a) + b_value = getattr(b, 'values', b) + result = ne.evaluate('where(cond_value, a_value, b_value)', + local_dict={'cond_value': cond_value, + 'a_value': a_value, + 'b_value': b_value}, + casting='safe') + except ValueError as detail: + if 'unknown type object' in str(detail): + pass + except Exception as detail: + if raise_on_error: + raise TypeError(str(detail)) + + if result is None: + result = _where_standard(cond, a, b, raise_on_error) + + return result + + +# turn myself on +set_use_numexpr(True) + + +def _has_bool_dtype(x): + try: + return x.dtype == bool + except AttributeError: + try: + return 'bool' in x.blocks + except AttributeError: + return isinstance(x, (bool, np.bool_)) + + +def _bool_arith_check(op_str, a, b, not_allowed=frozenset(('/', '//', 
'**')), + unsupported=None): + if unsupported is None: + unsupported = {'+': '|', '*': '&', '-': '^'} + + if _has_bool_dtype(a) and _has_bool_dtype(b): + if op_str in unsupported: + warnings.warn("evaluating in Python space because the %r operator" + " is not supported by numexpr for the bool " + "dtype, use %r instead" % (op_str, + unsupported[op_str])) + return False + + if op_str in not_allowed: + raise NotImplementedError("operator %r not implemented for bool " + "dtypes" % op_str) + return True + + +def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, + **eval_kwargs): + """ evaluate and return the expression of the op on a and b + + Parameters + ---------- + + op : the actual operand + op_str: the string version of the op + a : left operand + b : right operand + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results + use_numexpr : whether to try to use numexpr (default True) + """ + use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) + if use_numexpr: + return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, + **eval_kwargs) + return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) + + +def where(cond, a, b, raise_on_error=False, use_numexpr=True): + """ evaluate the where condition cond on a and b + + Parameters + ---------- + + cond : a boolean array + a : return if cond is True + b : return if cond is False + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results + use_numexpr : whether to try to use numexpr (default True) + """ + + if use_numexpr: + return _where(cond, a, b, raise_on_error=raise_on_error) + return _where_standard(cond, a, b, raise_on_error=raise_on_error) + + +def set_test_mode(v=True): + """ + Keeps track of whether numexpr was used. Stores an additional ``True`` + for every successful use of evaluate with numexpr since the last + ``get_test_result`` + """ + global _TEST_MODE, _TEST_RESULT + _TEST_MODE = v + _TEST_RESULT = [] + + +def _store_test_result(used_numexpr): + global _TEST_RESULT + if used_numexpr: + _TEST_RESULT.append(used_numexpr) + + +def get_test_result(): + """get test result and reset test_results""" + global _TEST_RESULT + res = _TEST_RESULT + _TEST_RESULT = [] + return res diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py new file mode 100644 index 00000000..81526b88 --- /dev/null +++ b/pandas/computation/ops.py @@ -0,0 +1,493 @@ +"""Operator classes for eval. 
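+
+These classes (``Term``, ``Constant``, ``Op`` and its ``BinOp``, ``Div`` and
+``UnaryOp`` subclasses) form the expression tree that :func:`~pandas.eval`
+builds from a parsed expression and later evaluates.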
+""" + +import re +import operator as op +from functools import partial +from itertools import product, islice, chain +from datetime import datetime + +import numpy as np + +import pandas as pd +from pandas.compat import PY3, string_types, text_type +import pandas.core.common as com +from pandas.core.base import StringMixin +from pandas.computation.common import _ensure_decoded, _result_type_many +from pandas.computation.scope import _DEFAULT_GLOBALS + + +_reductions = 'sum', 'prod' +_mathops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 'pow', 'div', 'sqrt', + 'inv', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan', + 'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs') + + +_LOCAL_TAG = '__pd_eval_local_' + + +class UndefinedVariableError(NameError): + + """NameError subclass for local variables.""" + + def __init__(self, name, is_local): + if is_local: + msg = 'local variable {0!r} is not defined' + else: + msg = 'name {0!r} is not defined' + super(UndefinedVariableError, self).__init__(msg.format(name)) + + +class Term(StringMixin): + + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, string_types) else cls + supr_new = super(Term, klass).__new__ + return supr_new(klass) + + def __init__(self, name, env, side=None, encoding=None): + self._name = name + self.env = env + self.side = side + tname = text_type(name) + self.is_local = (tname.startswith(_LOCAL_TAG) or + tname in _DEFAULT_GLOBALS) + self._value = self._resolve_name() + self.encoding = encoding + + @property + def local_name(self): + return self.name.replace(_LOCAL_TAG, '') + + def __unicode__(self): + return com.pprint_thing(self.name) + + def __call__(self, *args, **kwargs): + return self.value + + def evaluate(self, *args, **kwargs): + return self + + def _resolve_name(self): + key = self.name + res = self.env.resolve(self.local_name, is_local=self.is_local) + self.update(res) + + if hasattr(res, 'ndim') and res.ndim > 2: + raise NotImplementedError("N-dimensional objects, where N > 2," + " are not supported with eval") + return res + + def update(self, value): + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ + key = self.name + + # if it's a variable name (otherwise a constant) + if isinstance(key, string_types): + self.env.swapkey(self.local_name, key, new_value=value) + + self.value = value + + @property + def isscalar(self): + return np.isscalar(self._value) + + @property + def type(self): + try: + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype + except AttributeError: + try: + # ndarray + return self._value.dtype + except AttributeError: + # scalar + return type(self._value) + + return_type = type + + @property + def raw(self): + return com.pprint_thing('{0}(name={1!r}, type={2})' + ''.format(self.__class__.__name__, self.name, + self.type)) + + @property + def is_datetime(self): + try: + t = self.type.type + except AttributeError: + t = self.type + + return issubclass(t, (datetime, np.datetime64)) + + @property + def value(self): + return self._value + + @value.setter + def value(self, new_value): + self._value = new_value + + @property + def name(self): + return self._name + + @name.setter + def name(self, new_name): + self._name = new_name + + @property + def ndim(self): + return self._value.ndim + + +class Constant(Term): + + def __init__(self, value, env, side=None, encoding=None): 
+ super(Constant, self).__init__(value, env, side=side, + encoding=encoding) + + def _resolve_name(self): + return self._name + + @property + def name(self): + return self.value + + +_bool_op_map = {'not': '~', 'and': '&', 'or': '|'} + + +class Op(StringMixin): + + """Hold an operator of arbitrary arity + """ + + def __init__(self, op, operands, *args, **kwargs): + self.op = _bool_op_map.get(op, op) + self.operands = operands + self.encoding = kwargs.get('encoding', None) + + def __iter__(self): + return iter(self.operands) + + def __unicode__(self): + """Print a generic n-ary operator and its operands using infix + notation""" + # recurse over the operands + parened = ('({0})'.format(com.pprint_thing(opr)) + for opr in self.operands) + return com.pprint_thing(' {0} '.format(self.op).join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (_cmp_ops_syms + _bool_ops_syms): + return np.bool_ + return _result_type_many(*(term.type for term in com.flatten(self))) + + @property + def has_invalid_return_type(self): + types = self.operand_types + obj_dtype_set = frozenset([np.dtype('object')]) + return self.return_type == object and types - obj_dtype_set + + @property + def operand_types(self): + return frozenset(term.type for term in com.flatten(self)) + + @property + def isscalar(self): + return all(operand.isscalar for operand in self.operands) + + @property + def is_datetime(self): + try: + t = self.return_type.type + except AttributeError: + t = self.return_type + + return issubclass(t, (datetime, np.datetime64)) + + +def _in(x, y): + """Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. + """ + try: + return x.isin(y) + except AttributeError: + if com.is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass + return x in y + + +def _not_in(x, y): + """Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. + """ + try: + return ~x.isin(y) + except AttributeError: + if com.is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass + return x not in y + + +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in +_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) + +_bool_ops_syms = '&', '|', 'and', 'or' +_bool_ops_funcs = op.and_, op.or_, op.and_, op.or_ +_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) + +_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' +_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div, + op.pow, op.floordiv, op.mod) +_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) + +_special_case_arith_ops_syms = '**', '//', '%' +_special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod +_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms, + _special_case_arith_ops_funcs)) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def _cast_inplace(terms, dtype): + """Cast an expression inplace. + + Parameters + ---------- + terms : Op + The expression that should cast. + dtype : str or numpy.dtype + The dtype to cast to. 
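+
+    Notes
+    -----
+    The cast happens in place: each term's value is replaced via
+    ``term.update``; nothing is returned.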
+ """ + dt = np.dtype(dtype) + for term in terms: + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj): + return isinstance(obj, Term) + + +class BinOp(Op): + + """Hold a binary operator and its operands + + Parameters + ---------- + op : str + left : Term or Op + right : Term or Op + """ + + def __init__(self, op, lhs, rhs, **kwargs): + super(BinOp, self).__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + self._disallow_scalar_only_bool_ops() + + self.convert_values() + + try: + self.func = _binary_ops_dict[op] + except KeyError: + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) + raise ValueError('Invalid binary operator {0!r}, valid' + ' operators are {1}'.format(op, keys)) + + def __call__(self, env): + """Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. + """ + # handle truediv + if self.op == '/' and env.scope['truediv']: + self.func = op.truediv + + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) + + return self.func(left, right) + + def evaluate(self, env, engine, parser, term_type, eval_in_python): + """Evaluate a binary operation *before* being passed to the engine. + + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == 'python': + res = self(env) + else: + # recurse over the left/right nodes + left = self.lhs.evaluate(env, engine=engine, parser=parser, + term_type=term_type, + eval_in_python=eval_in_python) + right = self.rhs.evaluate(env, engine=engine, parser=parser, + term_type=term_type, + eval_in_python=eval_in_python) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + res = pd.eval(self, local_dict=env, engine=engine, + parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) + + def convert_values(self): + """Convert datetimes to a comparable value in an expression. + """ + def stringify(value): + if self.encoding is not None: + encoder = partial(com.pprint_thing_encoded, + encoding=self.encoding) + else: + encoder = com.pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.isscalar: + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = pd.Timestamp(_ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert('UTC') + self.rhs.update(v) + + if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.isscalar: + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = pd.Timestamp(_ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert('UTC') + self.lhs.update(v) + + def _disallow_scalar_only_bool_ops(self): + if ((self.lhs.isscalar or self.rhs.isscalar) and + self.op in _bool_ops_dict and + (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and + issubclass(self.lhs.return_type, (bool, np.bool_))))): + raise NotImplementedError("cannot evaluate scalar only bool ops") + + +def isnumeric(dtype): + return issubclass(np.dtype(dtype).type, np.number) + + +class Div(BinOp): + + """Div operator to special case casting. + + Parameters + ---------- + lhs, rhs : Term or Op + The Terms or Ops in the ``/`` expression. 
+ truediv : bool + Whether or not to use true division. With Python 3 this happens + regardless of the value of ``truediv``. + """ + + def __init__(self, lhs, rhs, truediv, *args, **kwargs): + super(Div, self).__init__('/', lhs, rhs, *args, **kwargs) + + if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): + raise TypeError("unsupported operand type(s) for {0}:" + " '{1}' and '{2}'".format(self.op, + lhs.return_type, + rhs.return_type)) + + if truediv or PY3: + _cast_inplace(com.flatten(self), np.float_) + + +_unary_ops_syms = '+', '-', '~', 'not' +_unary_ops_funcs = op.pos, op.neg, op.invert, op.invert +_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) + + +class UnaryOp(Op): + + """Hold a unary operator and its operands + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. + """ + + def __init__(self, op, operand): + super(UnaryOp, self).__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError: + raise ValueError('Invalid unary operator {0!r}, valid operators ' + 'are {1}'.format(op, _unary_ops_syms)) + + def __call__(self, env): + operand = self.operand(env) + return self.func(operand) + + def __unicode__(self): + return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py new file mode 100644 index 00000000..9a1e61ad --- /dev/null +++ b/pandas/computation/pytables.py @@ -0,0 +1,604 @@ +""" manage PyTables query interface via Expressions """ + +import ast +import time +import warnings +from functools import partial +from datetime import datetime, timedelta +import numpy as np +import pandas as pd +from pandas.compat import u, string_types, PY3, DeepChainMap +from pandas.core.base import StringMixin +import pandas.core.common as com +from pandas.computation import expr, ops +from pandas.computation.ops import is_term, UndefinedVariableError +from pandas.computation.scope import _ensure_scope +from pandas.computation.expr import BaseExprVisitor +from pandas.computation.common import _ensure_decoded +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + + +class Scope(expr.Scope): + __slots__ = 'queryables', + + def __init__(self, level, global_dict=None, local_dict=None, + queryables=None): + super(Scope, self).__init__(level + 1, global_dict=global_dict, + local_dict=local_dict) + self.queryables = queryables or dict() + + +class Term(ops.Term): + + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, string_types) else cls + supr_new = StringMixin.__new__ + return supr_new(klass) + + def __init__(self, name, env, side=None, encoding=None): + super(Term, self).__init__(name, env, side=side, encoding=encoding) + + def _resolve_name(self): + # must be a queryables + if self.side == 'left': + if self.name not in self.env.queryables: + raise NameError('name {0!r} is not defined'.format(self.name)) + return self.name + + # resolve the rhs (and allow it to be None) + try: + return self.env.resolve(self.name, is_local=False) + except UndefinedVariableError: + return self.name + + @property + def value(self): + return self._value + + +class Constant(Term): + + def __init__(self, value, env, side=None, encoding=None): + super(Constant, self).__init__(value, env, side=side, + 
encoding=encoding) + + def _resolve_name(self): + return self._name + + +class BinOp(ops.BinOp): + + _max_selectors = 31 + + def __init__(self, op, lhs, rhs, queryables, encoding): + super(BinOp, self).__init__(op, lhs, rhs) + self.queryables = queryables + self.encoding = encoding + self.filter = None + self.condition = None + + def _disallow_scalar_only_bool_ops(self): + pass + + def prune(self, klass): + + def pr(left, right): + """ create and return a new specialized BinOp from myself """ + + if left is None: + return right + elif right is None: + return left + + k = klass + if isinstance(left, ConditionBinOp): + if (isinstance(left, ConditionBinOp) and + isinstance(right, ConditionBinOp)): + k = JointConditionBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + elif isinstance(left, FilterBinOp): + if (isinstance(left, FilterBinOp) and + isinstance(right, FilterBinOp)): + k = JointFilterBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + return k(self.op, left, right, queryables=self.queryables, + encoding=self.encoding).evaluate() + + left, right = self.lhs, self.rhs + + if is_term(left) and is_term(right): + res = pr(left.value, right.value) + elif not is_term(left) and is_term(right): + res = pr(left.prune(klass), right.value) + elif is_term(left) and not is_term(right): + res = pr(left.value, right.prune(klass)) + elif not (is_term(left) or is_term(right)): + res = pr(left.prune(klass), right.prune(klass)) + + return res + + def conform(self, rhs): + """ inplace conform rhs """ + if not com.is_list_like(rhs): + rhs = [rhs] + if hasattr(self.rhs, 'ravel'): + rhs = rhs.ravel() + return rhs + + @property + def is_valid(self): + """ return True if this is a valid field """ + return self.lhs in self.queryables + + @property + def is_in_table(self): + """ return True if this is a valid column name for generation (e.g. 
an + actual column in the table) """ + return self.queryables.get(self.lhs) is not None + + @property + def kind(self): + """ the kind of my field """ + return self.queryables.get(self.lhs) + + def generate(self, v): + """ create and return the op string for this TermValue """ + val = v.tostring(self.encoding) + return "(%s %s %s)" % (self.lhs, self.op, val) + + def convert_value(self, v): + """ convert the expression that is in the term to something that is + accepted by pytables """ + + def stringify(value): + if self.encoding is not None: + encoder = partial(com.pprint_thing_encoded, + encoding=self.encoding) + else: + encoder = com.pprint_thing + return encoder(value) + + kind = _ensure_decoded(self.kind) + if kind == u('datetime64') or kind == u('datetime'): + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = pd.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + return TermValue(v, v.value, kind) + elif (isinstance(v, datetime) or hasattr(v, 'timetuple') or + kind == u('date')): + v = time.mktime(v.timetuple()) + return TermValue(v, pd.Timestamp(v), kind) + elif kind == u('timedelta64') or kind == u('timedelta'): + v = _coerce_scalar_to_timedelta_type(v, unit='s').item() + return TermValue(int(v), v, kind) + elif kind == u('integer'): + v = int(float(v)) + return TermValue(v, v, kind) + elif kind == u('float'): + v = float(v) + return TermValue(v, v, kind) + elif kind == u('bool'): + if isinstance(v, string_types): + v = not v.strip().lower() in [u('false'), u('f'), u('no'), + u('n'), u('none'), u('0'), + u('[]'), u('{}'), u('')] + else: + v = bool(v) + return TermValue(v, v, kind) + elif not isinstance(v, string_types): + v = stringify(v) + return TermValue(v, stringify(v), u('string')) + + # string quoting + return TermValue(v, stringify(v), u('string')) + + def convert_values(self): + pass + + +class FilterBinOp(BinOp): + + def __unicode__(self): + return com.pprint_thing("[Filter : [{0}] -> " + "[{1}]".format(self.filter[0], self.filter[1])) + + def invert(self): + """ invert the filter """ + if self.filter is not None: + f = list(self.filter) + f[1] = self.generate_filter_op(invert=True) + self.filter = tuple(f) + return self + + def format(self): + """ return the actual filter format """ + return [self.filter] + + def evaluate(self): + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + rhs = self.conform(self.rhs) + values = [TermValue(v, v, self.kind) for v in rhs] + + if self.is_in_table: + + # if too many values to create the expression, use a filter instead + if self.op in ['==', '!='] and len(values) > self._max_selectors: + + filter_op = self.generate_filter_op() + self.filter = ( + self.lhs, + filter_op, + pd.Index([v.value for v in values])) + + return self + return None + + # equality conditions + if self.op in ['==', '!=']: + + filter_op = self.generate_filter_op() + self.filter = ( + self.lhs, + filter_op, + pd.Index([v.value for v in values])) + + else: + raise TypeError( + "passing a filterable condition to a non-table indexer [%s]" % + self) + + return self + + def generate_filter_op(self, invert=False): + if (self.op == '!=' and not invert) or (self.op == '==' and invert): + return lambda axis, vals: ~axis.isin(vals) + else: + return lambda axis, vals: axis.isin(vals) + + +class JointFilterBinOp(FilterBinOp): + + def format(self): + raise NotImplementedError("unable to collapse Joint Filters") + + def evaluate(self): + return self + + +class ConditionBinOp(BinOp): + + def 
__unicode__(self): + return com.pprint_thing("[Condition : [{0}]]".format(self.condition)) + + def invert(self): + """ invert the condition """ + # if self.condition is not None: + # self.condition = "~(%s)" % self.condition + # return self + raise NotImplementedError("cannot use an invert condition when " + "passing to numexpr") + + def format(self): + """ return the actual ne format """ + return self.condition + + def evaluate(self): + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + # convert values if we are in the table + if not self.is_in_table: + return None + + rhs = self.conform(self.rhs) + values = [self.convert_value(v) for v in rhs] + + # equality conditions + if self.op in ['==', '!=']: + + # too many values to create the expression? + if len(values) <= self._max_selectors: + vs = [self.generate(v) for v in values] + self.condition = "(%s)" % ' | '.join(vs) + + # use a filter after reading + else: + return None + else: + self.condition = self.generate(values[0]) + + return self + + +class JointConditionBinOp(ConditionBinOp): + + def evaluate(self): + self.condition = "(%s %s %s)" % ( + self.lhs.condition, + self.op, + self.rhs.condition) + return self + + +class UnaryOp(ops.UnaryOp): + + def prune(self, klass): + + if self.op != '~': + raise NotImplementedError("UnaryOp only support invert type ops") + + operand = self.operand + operand = operand.prune(klass) + + if operand is not None: + if issubclass(klass, ConditionBinOp): + if operand.condition is not None: + return operand.invert() + elif issubclass(klass, FilterBinOp): + if operand.filter is not None: + return operand.invert() + + return None + + +_op_classes = {'unary': UnaryOp} + + +class ExprVisitor(BaseExprVisitor): + const_type = Constant + term_type = Term + + def __init__(self, env, engine, parser, **kwargs): + super(ExprVisitor, self).__init__(env, engine, parser) + for bin_op in self.binary_ops: + setattr(self, 'visit_{0}'.format(self.binary_op_nodes_map[bin_op]), + lambda node, bin_op=bin_op: partial(BinOp, bin_op, + **kwargs)) + + def visit_UnaryOp(self, node, **kwargs): + if isinstance(node.op, (ast.Not, ast.Invert)): + return UnaryOp('~', self.visit(node.operand)) + elif isinstance(node.op, ast.USub): + return self.const_type(-self.visit(node.operand).value, self.env) + elif isinstance(node.op, ast.UAdd): + raise NotImplementedError('Unary addition not supported') + + def visit_Index(self, node, **kwargs): + return self.visit(node.value).value + + def visit_Assign(self, node, **kwargs): + cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0], + comparators=[node.value]) + return self.visit(cmpr) + + def visit_Subscript(self, node, **kwargs): + # only allow simple suscripts + + value = self.visit(node.value) + slobj = self.visit(node.slice) + try: + value = value.value + except: + pass + + try: + return self.const_type(value[slobj], self.env) + except TypeError: + raise ValueError("cannot subscript {0!r} with " + "{1!r}".format(value, slobj)) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx.__class__ + if ctx == ast.Load: + # resolve the value + resolved = self.visit(value) + + # try to get the value to see if we are another expression + try: + resolved = resolved.value + except (AttributeError): + pass + + try: + return self.term_type(getattr(resolved, attr), self.env) + except AttributeError: + + # something like datetime.datetime where scope is overriden + if isinstance(value, ast.Name) and value.id == attr: + 
return resolved + + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + + def translate_In(self, op): + return ast.Eq() if isinstance(op, ast.In) else op + + def _rewrite_membership_op(self, node, left, right): + return self.visit(node.op), node.op, left, right + + +class Expr(expr.Expr): + + """ hold a pytables like expression, comprised of possibly multiple 'terms' + + Parameters + ---------- + where : string term expression, Expr, or list-like of Exprs + queryables : a "kinds" map (dict of column name -> kind), or None if column + is non-indexable + encoding : an encoding that will encode the query terms + + Returns + ------- + an Expr object + + Examples + -------- + + 'index>=date' + "columns=['A', 'D']" + 'columns=A' + 'columns==A' + "~(columns=['A','B'])" + 'index>df.index[3] & string="bar"' + '(index>df.index[3] & index<=df.index[6]) | string="bar"' + "ts>=Timestamp('2012-02-01')" + "major_axis>=20130101" + """ + + def __init__(self, where, op=None, value=None, queryables=None, + encoding=None, scope_level=0): + + # try to be back compat + where = self.parse_back_compat(where, op, value) + + self.encoding = encoding + self.condition = None + self.filter = None + self.terms = None + self._visitor = None + + # capture the environment if needed + local_dict = DeepChainMap() + + if isinstance(where, Expr): + local_dict = where.env.scope + where = where.expr + + elif isinstance(where, (list, tuple)): + for idx, w in enumerate(where): + if isinstance(w, Expr): + local_dict = w.env.scope + else: + w = self.parse_back_compat(w) + where[idx] = w + where = ' & ' .join(["(%s)" % w for w in where]) + + self.expr = where + self.env = Scope(scope_level + 1, local_dict=local_dict) + + if queryables is not None and isinstance(self.expr, string_types): + self.env.queryables.update(queryables) + self._visitor = ExprVisitor(self.env, queryables=queryables, + parser='pytables', engine='pytables', + encoding=encoding) + self.terms = self.parse() + + def parse_back_compat(self, w, op=None, value=None): + """ allow backward compatibility for passed arguments """ + + if isinstance(w, dict): + w, op, value = w.get('field'), w.get('op'), w.get('value') + if not isinstance(w, string_types): + raise TypeError( + "where must be passed as a string if op/value are passed") + warnings.warn("passing a dict to Expr is deprecated, " + "pass the where as a single string", + DeprecationWarning) + if isinstance(w, tuple): + if len(w) == 2: + w, value = w + op = '==' + elif len(w) == 3: + w, op, value = w + warnings.warn("passing a tuple into Expr is deprecated, " + "pass the where as a single string", + DeprecationWarning) + + if op is not None: + if not isinstance(w, string_types): + raise TypeError( + "where must be passed as a string if op/value are passed") + + if isinstance(op, Expr): + raise TypeError("invalid op passed, must be a string") + w = "{0}{1}".format(w, op) + if value is not None: + if isinstance(value, Expr): + raise TypeError("invalid value passed, must be a string") + + # stringify with quotes these values + def convert(v): + if isinstance(v, (datetime,np.datetime64,timedelta,np.timedelta64)) or hasattr(v, 'timetuple'): + return "'{0}'".format(v) + return v + + if isinstance(value, (list,tuple)): + value = [ convert(v) for v in value ] + else: + value = convert(value) + + w = "{0}{1}".format(w, value) + + warnings.warn("passing multiple values to Expr is deprecated, " + "pass the where as a single string", + DeprecationWarning) + + return w + + def __unicode__(self): + if 
self.terms is not None: + return com.pprint_thing(self.terms) + return com.pprint_thing(self.expr) + + def evaluate(self): + """ create and return the numexpr condition and filter """ + + try: + self.condition = self.terms.prune(ConditionBinOp) + except AttributeError: + raise ValueError("cannot process expression [{0}], [{1}] is not a " + "valid condition".format(self.expr, self)) + try: + self.filter = self.terms.prune(FilterBinOp) + except AttributeError: + raise ValueError("cannot process expression [{0}], [{1}] is not a " + "valid filter".format(self.expr, self)) + + return self.condition, self.filter + + +class TermValue(object): + + """ hold a term value the we use to construct a condition/filter """ + + def __init__(self, value, converted, kind): + self.value = value + self.converted = converted + self.kind = kind + + def tostring(self, encoding): + """ quote the string if not encoded + else encode and return """ + if self.kind == u('string'): + if encoding is not None: + return self.converted + return '"%s"' % self.converted + return self.converted + + +def maybe_expression(s): + """ loose checking if s is a pytables-acceptable expression """ + if not isinstance(s, string_types): + return False + ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',) + + # make sure we have an op at least + return any(op in s for op in ops) diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py new file mode 100644 index 00000000..875aaa95 --- /dev/null +++ b/pandas/computation/scope.py @@ -0,0 +1,297 @@ +"""Module for scope operations +""" + +import sys +import struct +import inspect +import datetime +import itertools +import pprint + +import numpy as np + +import pandas as pd +from pandas.compat import DeepChainMap, map, StringIO +from pandas.core.base import StringMixin +import pandas.computation as compu + + +def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(), + target=None, **kwargs): + """Ensure that we are grabbing the correct scope.""" + return Scope(level + 1, global_dict=global_dict, local_dict=local_dict, + resolvers=resolvers, target=target) + + +def _replacer(x): + """Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. + """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin) + + +def _raw_hex_id(obj): + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack('@P', id(obj)) + return ''.join(map(_replacer, packed)) + + +_DEFAULT_GLOBALS = { + 'Timestamp': pd.lib.Timestamp, + 'datetime': datetime.datetime, + 'True': True, + 'False': False, + 'list': list, + 'tuple': tuple, + 'inf': np.inf, + 'Inf': np.inf, +} + + +def _get_pretty_string(obj): + """Return a prettier version of obj + + Parameters + ---------- + obj : object + Object to pretty print + + Returns + ------- + s : str + Pretty print object repr + """ + sio = StringIO() + pprint.pprint(obj, stream=sio) + return sio.getvalue() + + +class Scope(StringMixin): + + """Object to hold scope, with a few bells to deal with some custom syntax + and contexts added by pandas. 
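+    Name lookups chain the calling frames' locals and globals with any resolvers and temporary variables via ``DeepChainMap`` (see ``resolve`` and ``full_scope`` below).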
+ + Parameters + ---------- + level : int + global_dict : dict or None, optional, default None + local_dict : dict or Scope or None, optional, default None + resolvers : list-like or None, optional, default None + target : object + + Attributes + ---------- + level : int + scope : DeepChainMap + target : object + temps : dict + """ + __slots__ = 'level', 'scope', 'target', 'temps' + + def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), + target=None): + self.level = level + 1 + + # shallow copy because we don't want to keep filling this up with what + # was there before if there are multiple calls to Scope/_ensure_scope + self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy()) + self.target = target + + if isinstance(local_dict, Scope): + self.scope.update(local_dict.scope) + if local_dict.target is not None: + self.target = local_dict.target + self.update(local_dict.level) + + frame = sys._getframe(self.level) + + try: + # shallow copy here because we don't want to replace what's in + # scope when we align terms (alignment accesses the underlying + # numpy array of pandas objects) + self.scope = self.scope.new_child((global_dict or + frame.f_globals).copy()) + if not isinstance(local_dict, Scope): + self.scope = self.scope.new_child((local_dict or + frame.f_locals).copy()) + finally: + del frame + + # assumes that resolvers are going from outermost scope to inner + if isinstance(local_dict, Scope): + resolvers += tuple(local_dict.resolvers.maps) + self.resolvers = DeepChainMap(*resolvers) + self.temps = {} + + def __unicode__(self): + scope_keys = _get_pretty_string(list(self.scope.keys())) + res_keys = _get_pretty_string(list(self.resolvers.keys())) + return '%s(scope=%s, resolvers=%s)' % (type(self).__name__, scope_keys, + res_keys) + + @property + def has_resolvers(self): + """Return whether we have any extra scope. + + For example, DataFrames pass Their columns as resolvers during calls to + ``DataFrame.eval()`` and ``DataFrame.query()``. + + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) + + def resolve(self, key, is_local): + """Resolve a variable name in a possibly local context + + Parameters + ---------- + key : text_type + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError: + raise compu.ops.UndefinedVariableError(key, is_local) + + def swapkey(self, old_key, new_key, new_value=None): + """Replace a variable name, with a potentially new value. 
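+        Only the first mapping in the chained scope (resolvers, then frame scope, then temporaries) that contains ``old_key`` receives the new key.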
+ + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + mapping[new_key] = new_value + return + + def _get_vars(self, stack, scopes): + """Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ + variables = itertools.product(scopes, stack) + for scope, (frame, _, _, _, _, _) in variables: + try: + d = getattr(frame, 'f_' + scope) + self.scope = self.scope.new_child(d) + finally: + # won't remove it, but DECREF it + # in Py3 this probably isn't necessary since frame won't be + # scope after the loop + del frame + + def update(self, level): + """Update the current scope by going back `level` levels. + + Parameters + ---------- + level : int or None, optional, default None + """ + sl = level + 1 + + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope + stack = inspect.stack() + + try: + self._get_vars(stack[:sl], scopes=['locals']) + finally: + del stack[:], stack + + def add_tmp(self, value): + """Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + + Returns + ------- + name : basestring + The name of the temporary variable created. + """ + name = '{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, + _raw_hex_id(self)) + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + @property + def ntemps(self): + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self): + """Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. 
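+        Temporary variables shadow resolvers, which in turn shadow the frame scope, since the chained maps are searched left to right.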
+ """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/pandas/computation/tests/__init__.py b/pandas/computation/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py new file mode 100644 index 00000000..5489893d --- /dev/null +++ b/pandas/computation/tests/test_eval.py @@ -0,0 +1,1631 @@ +#!/usr/bin/env python + +import warnings +import operator +from itertools import product +from distutils.version import LooseVersion + +import nose +from nose.tools import assert_raises + +from numpy.random import randn, rand, randint +import numpy as np +from numpy.testing import assert_array_equal, assert_allclose +from numpy.testing.decorators import slow + +import pandas as pd +from pandas.core import common as com +from pandas import DataFrame, Series, Panel, date_range +from pandas.util.testing import makeCustomDataframe as mkdf + +from pandas.computation import pytables +from pandas.computation.engines import _engines, NumExprClobberingError +from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor +from pandas.computation.ops import (_binary_ops_dict, + _special_case_arith_ops_syms, + _arith_ops_syms, _bool_ops_syms) + +import pandas.computation.expr as expr +import pandas.util.testing as tm +from pandas.util.testing import (assert_frame_equal, randbool, + assertRaisesRegexp, + assert_produces_warning, assert_series_equal) +from pandas.compat import PY3, u, reduce + +_series_frame_incompatible = _bool_ops_syms +_scalar_skip = 'in', 'not in' + + +def engine_has_neg_frac(engine): + return _engines[engine].has_neg_frac + + +def _eval_single_bin(lhs, cmp1, rhs, engine): + c = _binary_ops_dict[cmp1] + if engine_has_neg_frac(engine): + try: + return c(lhs, rhs) + except ValueError as e: + try: + msg = e.message + except AttributeError: + msg = e + msg = u(msg) + if msg == u('negative number cannot be raised to a fractional' + ' power'): + return np.nan + raise + return c(lhs, rhs) + + +def _series_and_2d_ndarray(lhs, rhs): + return ((isinstance(lhs, Series) and + isinstance(rhs, np.ndarray) and rhs.ndim > 1) + or (isinstance(rhs, Series) and + isinstance(lhs, np.ndarray) and lhs.ndim > 1)) + + +def _series_and_frame(lhs, rhs): + return ((isinstance(lhs, Series) and isinstance(rhs, DataFrame)) + or (isinstance(rhs, Series) and isinstance(lhs, DataFrame))) + + +def _bool_and_frame(lhs, rhs): + return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame) + + +def _is_py3_complex_incompat(result, expected): + return (PY3 and isinstance(expected, (complex, np.complexfloating)) and + np.isnan(result)) + + +_good_arith_ops = com.difference(_arith_ops_syms, _special_case_arith_ops_syms) + + +class TestEvalNumexprPandas(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestEvalNumexprPandas, cls).setUpClass() + tm.skip_if_no_ne() + import numexpr as ne + cls.ne = ne + cls.engine = 'numexpr' + cls.parser = 'pandas' + + @classmethod + def tearDownClass(cls): + super(TestEvalNumexprPandas, cls).tearDownClass() + del cls.engine, cls.parser + if hasattr(cls, 'ne'): + del cls.ne + + def setup_data(self): + nan_df1 = DataFrame(rand(10, 5)) + nan_df1[nan_df1 > 0.5] = np.nan + nan_df2 = DataFrame(rand(10, 5)) + nan_df2[nan_df2 > 0.5] = np.nan + + self.pandas_lhses = (DataFrame(randn(10, 5)), Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), nan_df1) + self.pandas_rhses = (DataFrame(randn(10, 5)), Series(randn(5)), + 
Series([1, 2, np.nan, np.nan, 5]), nan_df2) + self.scalar_lhses = randn(), + self.scalar_rhses = randn(), + + self.lhses = self.pandas_lhses + self.scalar_lhses + self.rhses = self.pandas_rhses + self.scalar_rhses + + def setup_ops(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = expr._bool_ops_syms + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = '-', '~', 'not ' + + def setUp(self): + self.setup_ops() + self.setup_data() + self.current_engines = filter(lambda x: x != self.engine, _engines) + + def tearDown(self): + del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses + del self.pandas_rhses, self.pandas_lhses, self.current_engines + + @slow + def test_complex_cmp_ops(self): + for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, self.cmp_ops, + self.rhses, self.bin_ops, + self.cmp2_ops): + self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2) + + def test_simple_cmp_ops(self): + bool_lhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + bool_rhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): + self.check_simple_cmp_op(lhs, cmp_op, rhs) + + @slow + def test_binary_arith_ops(self): + for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses): + self.check_binary_arith_op(lhs, op, rhs) + + def test_modulus(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_modulus(lhs, '%', rhs) + + def test_floor_division(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_floor_division(lhs, '//', rhs) + + def test_pow(self): + import platform + if platform.system() == 'Windows': + raise nose.SkipTest('not testing pow on Windows') + + # odd failure on win32 platform, so skip + for lhs, rhs in product(self.lhses, self.rhses): + self.check_pow(lhs, '**', rhs) + + @slow + def test_single_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_single_invert_op(lhs, op, rhs) + + @slow + def test_compound_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_compound_invert_op(lhs, op, rhs) + + @slow + def test_chained_cmp_op(self): + mids = self.lhses + cmp_ops = '<', '>' + for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, + mids, cmp_ops, self.rhses): + self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) + + def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): + skip_these = _scalar_skip + ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, + binop=binop, + cmp2=cmp2) + scalar_with_in_notin = (np.isscalar(rhs) and (cmp1 in skip_these or + cmp2 in skip_these)) + if scalar_with_in_notin: + with tm.assertRaises(TypeError): + pd.eval(ex, engine=self.engine, parser=self.parser) + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + else: + lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) + rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) + if (isinstance(lhs_new, Series) and isinstance(rhs_new, DataFrame) + and binop in _series_frame_incompatible): + pass + # TODO: the code below should be added back when left and right + # hand side bool ops are fixed. 
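+                # Editorial note (not in the original source): the case skipped
+                # here is a boolean binop whose operands reduce to a Series on
+                # one side and a DataFrame on the other (see
+                # _series_frame_incompatible / _bool_ops_syms at module level);
+                # such expressions currently fall through to this branch
+                # instead of being evaluated.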
+ + # try: + # self.assertRaises(Exception, pd.eval, ex, + #local_dict={'lhs': lhs, 'rhs': rhs}, + # engine=self.engine, parser=self.parser) + # except AssertionError: + #import ipdb; ipdb.set_trace() + # raise + else: + expected = _eval_single_bin( + lhs_new, binop, rhs_new, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(result, expected) + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + skip_these = _scalar_skip + + def check_operands(left, right, cmp_op): + return _eval_single_bin(left, cmp_op, right, self.engine) + + lhs_new = check_operands(lhs, mid, cmp1) + rhs_new = check_operands(mid, rhs, cmp2) + + if lhs_new is not None and rhs_new is not None: + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) + expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) + + for ex in (ex1, ex2, ex3): + result = pd.eval(ex, engine=self.engine, + parser=self.parser) + assert_array_equal(result, expected) + + def check_simple_cmp_op(self, lhs, cmp1, rhs): + ex = 'lhs {0} rhs'.format(cmp1) + if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + else: + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(result, expected) + + def check_binary_arith_op(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = _eval_single_bin(lhs, arith1, rhs, self.engine) + assert_array_equal(result, expected) + ex = 'lhs {0} rhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + nlhs = _eval_single_bin(lhs, arith1, rhs, + self.engine) + self.check_alignment(result, nlhs, rhs, arith1) + + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar align + pass + else: + expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) + assert_array_equal(result, expected) + + # modulus, pow, and floor division require special casing + + def check_modulus(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs % rhs + assert_allclose(result, expected) + expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) + assert_allclose(result, expected) + + def check_floor_division(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + + if self.engine == 'python': + res = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs // rhs + assert_array_equal(res, expected) + else: + self.assertRaises(TypeError, pd.eval, ex, local_dict={'lhs': lhs, + 'rhs': rhs}, + engine=self.engine, parser=self.parser) + + def get_expected_pow_result(self, lhs, rhs): + try: + expected = _eval_single_bin(lhs, '**', rhs, self.engine) + except ValueError as e: + msg = 'negative number cannot be raised to a fractional power' + try: + emsg = e.message + except AttributeError: + emsg = e + + emsg = u(emsg) + + if emsg == msg: + if self.engine == 'python': + raise nose.SkipTest(emsg) + else: + expected = np.nan + else: + raise + return expected + + def 
check_pow(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + expected = self.get_expected_pow_result(lhs, rhs) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + + if (np.isscalar(lhs) and np.isscalar(rhs) and + _is_py3_complex_incompat(result, expected)): + self.assertRaises(AssertionError, assert_array_equal, result, + expected) + else: + assert_allclose(result, expected) + + ex = '(lhs {0} rhs) {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = self.get_expected_pow_result( + self.get_expected_pow_result(lhs, rhs), rhs) + assert_allclose(result, expected) + + def check_single_invert_op(self, lhs, cmp1, rhs): + # simple + for el in (lhs, rhs): + try: + elb = el.astype(bool) + except AttributeError: + elb = np.array([bool(el)]) + expected = ~elb + result = pd.eval('~elb', engine=self.engine, parser=self.parser) + assert_array_equal(expected, result) + + for engine in self.current_engines: + tm.skip_if_no_ne(engine) + assert_array_equal(result, pd.eval('~elb', engine=engine, + parser=self.parser)) + + def check_compound_invert_op(self, lhs, cmp1, rhs): + skip_these = 'in', 'not in' + ex = '~(lhs {0} rhs)'.format(cmp1) + + if np.isscalar(rhs) and cmp1 in skip_these: + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + else: + # compound + if np.isscalar(lhs) and np.isscalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + if np.isscalar(expected): + expected = not expected + else: + expected = ~expected + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(expected, result) + + # make sure the other engines work the same as this one + for engine in self.current_engines: + tm.skip_if_no_ne(engine) + ev = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(ev, result) + + def ex(self, op, var_name='lhs'): + return '{0}{1}'.format(op, var_name) + + def test_frame_invert(self): + expr = self.ex('~') + + # ~ ## + # frame + # float always raises + lhs = DataFrame(randn(5, 2)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + # int raises on numexpr + lhs = DataFrame(randint(5, size=(5, 2))) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # bool always works + lhs = DataFrame(rand(5, 2) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # object raises + lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5}) + if self.engine == 'numexpr': + with tm.assertRaises(ValueError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_series_invert(self): + # ~ #### + expr = self.ex('~') + + # series + # float raises + lhs = Series(randn(5)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with 
tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + # int raises on numexpr + lhs = Series(randint(5, size=5)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # bool + lhs = Series(rand(5) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # float + # int + # bool + + # object + lhs = Series(['a', 1, 2.0]) + if self.engine == 'numexpr': + with tm.assertRaises(ValueError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_frame_negate(self): + expr = self.ex('-') + + # float + lhs = DataFrame(randn(5, 2)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # int + lhs = DataFrame(randint(5, size=(5, 2))) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + def test_series_negate(self): + expr = self.ex('-') + + # float + lhs = Series(randn(5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + def test_frame_pos(self): + expr = self.ex('+') + + # float + lhs = DataFrame(randn(5, 2)) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # int + lhs = DataFrame(randint(5, size=(5, 2))) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + def test_series_pos(self): + expr = self.ex('+') + + # float + lhs = Series(randn(5)) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = 
pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + def test_scalar_unary(self): + with tm.assertRaises(TypeError): + pd.eval('~1.0', engine=self.engine, parser=self.parser) + + self.assertEqual( + pd.eval('-1.0', parser=self.parser, engine=self.engine), -1.0) + self.assertEqual( + pd.eval('+1.0', parser=self.parser, engine=self.engine), +1.0) + + self.assertEqual( + pd.eval('~1', parser=self.parser, engine=self.engine), ~1) + self.assertEqual( + pd.eval('-1', parser=self.parser, engine=self.engine), -1) + self.assertEqual( + pd.eval('+1', parser=self.parser, engine=self.engine), +1) + + self.assertEqual( + pd.eval('~True', parser=self.parser, engine=self.engine), ~True) + self.assertEqual( + pd.eval('~False', parser=self.parser, engine=self.engine), ~False) + self.assertEqual( + pd.eval('-True', parser=self.parser, engine=self.engine), -True) + self.assertEqual( + pd.eval('-False', parser=self.parser, engine=self.engine), -False) + self.assertEqual( + pd.eval('+True', parser=self.parser, engine=self.engine), +True) + self.assertEqual( + pd.eval('+False', parser=self.parser, engine=self.engine), +False) + + def test_disallow_scalar_bool_ops(self): + exprs = '1 or 2', '1 and 2' + exprs += 'a and b', 'a or b' + exprs += '1 or 2 and (3 + 2) > 3', + exprs += '2 * x > 2 or 1 and 2', + exprs += '2 * df > 3 and 1 or a', + + x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) + for ex in exprs: + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + + +class TestEvalNumexprPython(TestEvalNumexprPandas): + + @classmethod + def setUpClass(cls): + super(TestEvalNumexprPython, cls).setUpClass() + tm.skip_if_no_ne() + import numexpr as ne + cls.ne = ne + cls.engine = 'numexpr' + cls.parser = 'python' + + def setup_ops(self): + self.cmp_ops = list(filter(lambda x: x not in ('in', 'not in'), + expr._cmp_ops_syms)) + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = [s for s in expr._bool_ops_syms + if s not in ('and', 'or')] + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = '+', '-', '~' + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + with tm.assertRaises(NotImplementedError): + pd.eval(ex1, engine=self.engine, parser=self.parser) + + +class TestEvalPythonPython(TestEvalNumexprPython): + + @classmethod + def setUpClass(cls): + super(TestEvalPythonPython, cls).setUpClass() + cls.engine = 'python' + cls.parser = 'python' + + def check_modulus(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + + expected = lhs % rhs + assert_allclose(result, expected) + + expected = _eval_single_bin(expected, 
arith1, rhs, self.engine) + assert_allclose(result, expected) + + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar align + pass + else: + expected = eval('nlhs {0} ghs'.format(op)) + assert_array_equal(result, expected) + + +class TestEvalPythonPandas(TestEvalPythonPython): + + @classmethod + def setUpClass(cls): + super(TestEvalPythonPandas, cls).setUpClass() + cls.engine = 'python' + cls.parser = 'pandas' + + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, + rhs) + + +f = lambda *args, **kwargs: np.random.randn() + + +ENGINES_PARSERS = list(product(_engines, expr._parsers)) + + +#------------------------------------- +# basic and complex alignment + +def _is_datetime(x): + return issubclass(x.dtype.type, np.datetime64) + + +def should_warn(*args): + not_mono = not any(map(operator.attrgetter('is_monotonic'), args)) + only_one_dt = reduce(operator.xor, map(_is_datetime, args)) + return not_mono and only_one_dt + + +class TestAlignment(object): + + index_types = 'i', 'u', 'dt' + lhs_index_types = index_types + ('s',) # 'p' + + def check_align_nested_unary_op(self, engine, parser): + tm.skip_if_no_ne(engine) + s = 'df * ~2' + df = mkdf(5, 3, data_gen_f=f) + res = pd.eval(s, engine=engine, parser=parser) + assert_frame_equal(res, df * ~2) + + def test_align_nested_unary_op(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_align_nested_unary_op, engine, parser + + def check_basic_frame_alignment(self, engine, parser): + tm.skip_if_no_ne(engine) + args = product(self.lhs_index_types, self.index_types, + self.index_types) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for lr_idx_type, rr_idx_type, c_idx_type in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type, + c_idx_type=c_idx_type) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type, + c_idx_type=c_idx_type) + # only warns if not monotonic and not sortable + if should_warn(df.index, df2.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df + df2', engine=engine, parser=parser) + else: + res = pd.eval('df + df2', engine=engine, parser=parser) + assert_frame_equal(res, df + df2) + + def test_basic_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_frame_alignment, engine, parser + + def check_frame_comparison(self, engine, parser): + tm.skip_if_no_ne(engine) + args = product(self.lhs_index_types, repeat=2) + for r_idx_type, c_idx_type in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + res = pd.eval('df < 2', engine=engine, parser=parser) + assert_frame_equal(res, df < 2) + + df3 = DataFrame(randn(*df.shape), index=df.index, + columns=df.columns) + res = pd.eval('df < df3', engine=engine, parser=parser) + assert_frame_equal(res, df < df3) + + def test_frame_comparison(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_frame_comparison, engine, parser + + def check_medium_complex_frame_alignment(self, engine, parser): + tm.skip_if_no_ne(engine) + args = product(self.lhs_index_types, self.index_types, + self.index_types, self.index_types) + + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + + for r1, c1, r2, c2 in args: 
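+                # Editorial note (not in the original source): each combination
+                # of index types builds three small frames and checks that
+                # pd.eval('df + df2 + df3') matches the eagerly computed sum,
+                # expecting a RuntimeWarning only for the index combinations
+                # flagged by should_warn above.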
+ df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + if should_warn(df.index, df2.index, df3.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df + df2 + df3', engine=engine, + parser=parser) + else: + res = pd.eval('df + df2 + df3', engine=engine, parser=parser) + assert_frame_equal(res, df + df2 + df3) + + @slow + def test_medium_complex_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_medium_complex_frame_alignment, engine, parser + + def check_basic_frame_series_alignment(self, engine, parser): + tm.skip_if_no_ne(engine) + + def testit(r_idx_type, c_idx_type, index_name): + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if should_warn(df.index, s.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df + s', engine=engine, parser=parser) + else: + res = pd.eval('df + s', engine=engine, parser=parser) + + if r_idx_type == 'dt' or c_idx_type == 'dt': + expected = df.add(s) if engine == 'numexpr' else df + s + else: + expected = df + s + assert_frame_equal(res, expected) + + args = product(self.lhs_index_types, self.index_types, + ('index', 'columns')) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + def test_basic_frame_series_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_frame_series_alignment, engine, parser + + def check_basic_series_frame_alignment(self, engine, parser): + tm.skip_if_no_ne(engine) + + def testit(r_idx_type, c_idx_type, index_name): + df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + if should_warn(s.index, df.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('s + df', engine=engine, parser=parser) + else: + res = pd.eval('s + df', engine=engine, parser=parser) + + if r_idx_type == 'dt' or c_idx_type == 'dt': + expected = df.add(s) if engine == 'numexpr' else s + df + else: + expected = s + df + assert_frame_equal(res, expected) + + # only test dt with dt, otherwise weird joins result + args = product(['i', 'u', 's'], ['i', 'u', 's'], ('index', 'columns')) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + # dt with dt + args = product(['dt'], ['dt'], ('index', 'columns')) + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) + + def test_basic_series_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_series_frame_alignment, engine, parser + + def check_series_frame_commutativity(self, engine, parser): + tm.skip_if_no_ne(engine) + args = product(self.lhs_index_types, self.index_types, ('+', '*'), + ('index', 'columns')) + + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r_idx_type, c_idx_type, op, index_name in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = 
Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + if should_warn(df.index, s.index): + with tm.assert_produces_warning(RuntimeWarning): + a = pd.eval(lhs, engine=engine, parser=parser) + with tm.assert_produces_warning(RuntimeWarning): + b = pd.eval(rhs, engine=engine, parser=parser) + else: + a = pd.eval(lhs, engine=engine, parser=parser) + b = pd.eval(rhs, engine=engine, parser=parser) + + if r_idx_type != 'dt' and c_idx_type != 'dt': + if engine == 'numexpr': + assert_frame_equal(a, b) + + def test_series_frame_commutativity(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_series_frame_commutativity, engine, parser + + def check_complex_series_frame_alignment(self, engine, parser): + tm.skip_if_no_ne(engine) + + import random + args = product(self.lhs_index_types, self.index_types, + self.index_types, self.index_types) + n = 3 + m1 = 5 + m2 = 2 * m1 + + with warnings.catch_warnings(record=True): + warnings.simplefilter('always', RuntimeWarning) + for r1, r2, c1, c2 in args: + index_name = random.choice(['index', 'columns']) + obj_name = random.choice(['df', 'df2']) + + df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals().get(obj_name), index_name) + s = Series(np.random.randn(n), index[:n]) + + if r2 == 'dt' or c2 == 'dt': + if engine == 'numexpr': + expected2 = df2.add(s) + else: + expected2 = df2 + s + else: + expected2 = df2 + s + + if r1 == 'dt' or c1 == 'dt': + if engine == 'numexpr': + expected = expected2.add(df) + else: + expected = expected2 + df + else: + expected = expected2 + df + + if should_warn(df2.index, s.index, df.index): + with tm.assert_produces_warning(RuntimeWarning): + res = pd.eval('df2 + s + df', engine=engine, + parser=parser) + else: + res = pd.eval('df2 + s + df', engine=engine, parser=parser) + tm.assert_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) + + @slow + def test_complex_series_frame_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_complex_series_frame_alignment, engine, parser + + def check_performance_warning_for_poor_alignment(self, engine, parser): + tm.skip_if_no_ne(engine) + df = DataFrame(randn(1000, 10)) + s = Series(randn(10000)) + if engine == 'numexpr': + seen = pd.io.common.PerformanceWarning + else: + seen = False + + with assert_produces_warning(seen): + pd.eval('df + s', engine=engine, parser=parser) + + s = Series(randn(1000)) + with assert_produces_warning(False): + pd.eval('df + s', engine=engine, parser=parser) + + df = DataFrame(randn(10, 10000)) + s = Series(randn(10000)) + with assert_produces_warning(False): + pd.eval('df + s', engine=engine, parser=parser) + + df = DataFrame(randn(10, 10)) + s = Series(randn(10000)) + + is_python_engine = engine == 'python' + + if not is_python_engine: + wrn = pd.io.common.PerformanceWarning + else: + wrn = False + + with assert_produces_warning(wrn) as w: + pd.eval('df + s', engine=engine, parser=parser) + + if not is_python_engine: + tm.assert_equal(len(w), 1) + msg = str(w[0].message) + expected = ("Alignment difference on axis {0} is larger" + " than an order of magnitude on term {1!r}, " + "by more than {2:.4g}; performance may suffer" + "".format(1, 'df', np.log10(s.size - df.shape[1]))) + tm.assert_equal(msg, expected) + + def test_performance_warning_for_poor_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield 
(self.check_performance_warning_for_poor_alignment, engine, + parser) + + +#------------------------------------ +# slightly more complex ops + +class TestOperationsNumExprPandas(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestOperationsNumExprPandas, cls).setUpClass() + tm.skip_if_no_ne() + cls.engine = 'numexpr' + cls.parser = 'pandas' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + + @classmethod + def tearDownClass(cls): + super(TestOperationsNumExprPandas, cls).tearDownClass() + del cls.engine, cls.parser + + def eval(self, *args, **kwargs): + kwargs['engine'] = self.engine + kwargs['parser'] = self.parser + kwargs['level'] = kwargs.pop('level', 0) + 1 + return pd.eval(*args, **kwargs) + + def test_simple_arith_ops(self): + ops = self.arith_ops + + for op in filter(lambda x: x != '//', ops): + ex = '1 {0} 1'.format(op) + ex2 = 'x {0} 1'.format(op) + ex3 = '1 {0} (x + 1)'.format(op) + + if op in ('in', 'not in'): + self.assertRaises(TypeError, pd.eval, ex, + engine=self.engine, parser=self.parser) + else: + expec = _eval_single_bin(1, op, 1, self.engine) + x = self.eval(ex, engine=self.engine, parser=self.parser) + tm.assert_equal(x, expec) + + expec = _eval_single_bin(x, op, 1, self.engine) + y = self.eval(ex2, local_dict={'x': x}, engine=self.engine, + parser=self.parser) + tm.assert_equal(y, expec) + + expec = _eval_single_bin(1, op, x + 1, self.engine) + y = self.eval(ex3, local_dict={'x': x}, + engine=self.engine, parser=self.parser) + tm.assert_equal(y, expec) + + def test_simple_bool_ops(self): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), + (True, False)): + ex = '{0} {1} {2}'.format(lhs, op, rhs) + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) + + def test_bool_ops_with_constants(self): + for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), + ('True', 'False')): + ex = '{0} {1} {2}'.format(lhs, op, rhs) + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) + + def test_panel_fails(self): + x = Panel(randn(3, 4, 5)) + y = Series(randn(10)) + assert_raises(NotImplementedError, self.eval, 'x + y', + local_dict={'x': x, 'y': y}) + + def test_4d_ndarray_fails(self): + x = randn(3, 4, 5, 6) + y = Series(randn(10)) + assert_raises(NotImplementedError, self.eval, 'x + y', + local_dict={'x': x, 'y': y}) + + def test_constant(self): + x = self.eval('1') + tm.assert_equal(x, 1) + + def test_single_variable(self): + df = DataFrame(randn(10, 2)) + df2 = self.eval('df', local_dict={'df': df}) + assert_frame_equal(df, df2) + + def test_truediv(self): + s = np.array([1]) + ex = 's / 1' + d = {'s': s} + + if PY3: + res = self.eval(ex, truediv=False) + assert_array_equal(res, np.array([1.0])) + + res = self.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + + res = self.eval('1 / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('1 / 2', truediv=False) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=False) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) + else: + res = self.eval(ex, truediv=False) + assert_array_equal(res, np.array([1])) + + res = self.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + + res = self.eval('1 / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('1 / 2', truediv=False) + expec = 0 + self.assertEqual(res, expec) + + res = self.eval('s / 2', 
truediv=False) + expec = 0 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) + + def test_failing_subscript_with_name_error(self): + df = DataFrame(np.random.randn(5, 3)) + with tm.assertRaises(NameError): + self.eval('df[x > 2] > 2') + + def test_lhs_expression_subscript(self): + df = DataFrame(np.random.randn(5, 3)) + result = self.eval('(df + 1)[df > 2]', local_dict={'df': df}) + expected = (df + 1)[df > 2] + assert_frame_equal(result, expected) + + def test_attr_expression(self): + df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + expr1 = 'df.a < df.b' + expec1 = df.a < df.b + expr2 = 'df.a + df.b + df.c' + expec2 = df.a + df.b + df.c + expr3 = 'df.a + df.b + df.c[df.b < 0]' + expec3 = df.a + df.b + df.c[df.b < 0] + exprs = expr1, expr2, expr3 + expecs = expec1, expec2, expec3 + for e, expec in zip(exprs, expecs): + assert_series_equal(expec, self.eval(e, local_dict={'df': df})) + + def test_assignment_fails(self): + df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + df2 = DataFrame(np.random.randn(5, 3)) + expr1 = 'df = df2' + self.assertRaises(ValueError, self.eval, expr1, + local_dict={'df': df, 'df2': df2}) + + def test_assignment_column(self): + tm.skip_if_no_ne('numexpr') + df = DataFrame(np.random.randn(5, 2), columns=list('ab')) + orig_df = df.copy() + + # multiple assignees + self.assertRaises(SyntaxError, df.eval, 'd c = a + b') + + # invalid assignees + self.assertRaises(SyntaxError, df.eval, 'd,c = a + b') + self.assertRaises( + SyntaxError, df.eval, 'Timestamp("20131001") = a + b') + + # single assignment - existing variable + expected = orig_df.copy() + expected['a'] = expected['a'] + expected['b'] + df = orig_df.copy() + df.eval('a = a + b') + assert_frame_equal(df, expected) + + # single assignment - new variable + expected = orig_df.copy() + expected['c'] = expected['a'] + expected['b'] + df = orig_df.copy() + df.eval('c = a + b') + assert_frame_equal(df, expected) + + # with a local name overlap + def f(): + df = orig_df.copy() + a = 1 + df.eval('a = 1 + b') + return df + + df = f() + expected = orig_df.copy() + expected['a'] = 1 + expected['b'] + assert_frame_equal(df, expected) + + df = orig_df.copy() + + def f(): + a = 1 + old_a = df.a.copy() + df.eval('a = a + b') + assert_series_equal(old_a + df.b, df.a) + + f() + + # multiple assignment + df = orig_df.copy() + df.eval('c = a + b') + self.assertRaises(SyntaxError, df.eval, 'c = a = b') + + # explicit targets + df = orig_df.copy() + self.eval('c = df.a + df.b', local_dict={'df': df}, target=df) + expected = orig_df.copy() + expected['c'] = expected['a'] + expected['b'] + assert_frame_equal(df, expected) + + def test_basic_period_index_boolean_expression(self): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + + e = df < 2 + r = self.eval('df < 2', local_dict={'df': df}) + x = df < 2 + + assert_frame_equal(r, e) + assert_frame_equal(x, e) + + def test_basic_period_index_subscript_expression(self): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + r = self.eval('df[df < 2 + 3]', local_dict={'df': df}) + e = df[df < 2 + 3] + assert_frame_equal(r, e) + + def test_nested_period_index_subscript_expression(self): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + r = self.eval('df[df[df < 2] < 2] + df * 2', local_dict={'df': df}) + e = df[df[df < 2] < 2] + df * 2 + assert_frame_equal(r, e) + + def test_date_boolean(self): + df = DataFrame(randn(5, 3)) + df['dates1'] = 
date_range('1/1/2012', periods=5) + res = self.eval('df.dates1 < 20130101', local_dict={'df': df}, + engine=self.engine, parser=self.parser) + expec = df.dates1 < '20130101' + assert_series_equal(res, expec) + + def test_simple_in_ops(self): + if self.parser != 'python': + res = pd.eval('1 in [1, 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('2 in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('3 in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertFalse(res) + + res = pd.eval('3 not in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[3] not in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[3] in ([3], 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[[3]] in [[[3]], 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('(3,) in [(3,), 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('(3,) not in [(3,), 2]', engine=self.engine, + parser=self.parser) + self.assertFalse(res) + + res = pd.eval('[(3,)] in [[(3,)], 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + else: + with tm.assertRaises(NotImplementedError): + pd.eval('1 in [1, 2]', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('2 in (1, 2)', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('3 in (1, 2)', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('3 not in (1, 2)', engine=self.engine, + parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('[(3,)] in (1, 2, [(3,)])', engine=self.engine, + parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('[3] not in (1, 2, [[3]])', engine=self.engine, + parser=self.parser) + + +class TestOperationsNumExprPython(TestOperationsNumExprPandas): + + @classmethod + def setUpClass(cls): + super(TestOperationsNumExprPython, cls).setUpClass() + cls.engine = 'numexpr' + cls.parser = 'python' + tm.skip_if_no_ne(cls.engine) + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), + cls.arith_ops) + + def test_check_many_exprs(self): + a = 1 + expr = ' * '.join('a' * 33) + expected = 1 + res = pd.eval(expr, engine=self.engine, parser=self.parser) + tm.assert_equal(res, expected) + + def test_fails_and(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 and df > 3', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_or(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 or df > 3', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_not(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'not df > 2', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_ampersand(self): + df = DataFrame(np.random.randn(5, 3)) + ex = '(df + 2)[df > 1] > 0 & (df > 0)' + with tm.assertRaises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) + + def test_fails_pipe(self): + df = DataFrame(np.random.randn(5, 3)) + ex = '(df + 2)[df > 1] > 0 | (df > 0)' + with 
tm.assertRaises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) + + def test_bool_ops_with_constants(self): + for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), + ('True', 'False')): + ex = '{0} {1} {2}'.format(lhs, op, rhs) + if op in ('and', 'or'): + with tm.assertRaises(NotImplementedError): + self.eval(ex) + else: + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) + + def test_simple_bool_ops(self): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), + (True, False)): + ex = 'lhs {0} rhs'.format(op) + if op in ('and', 'or'): + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + else: + res = pd.eval(ex, engine=self.engine, parser=self.parser) + exp = eval(ex) + self.assertEqual(res, exp) + + +class TestOperationsPythonPython(TestOperationsNumExprPython): + + @classmethod + def setUpClass(cls): + super(TestOperationsPythonPython, cls).setUpClass() + cls.engine = cls.parser = 'python' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), + cls.arith_ops) + + +class TestOperationsPythonPandas(TestOperationsNumExprPandas): + + @classmethod + def setUpClass(cls): + super(TestOperationsPythonPandas, cls).setUpClass() + cls.engine = 'python' + cls.parser = 'pandas' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + + +_var_s = randn(10) + + +class TestScope(object): + + def check_global_scope(self, e, engine, parser): + tm.skip_if_no_ne(engine) + assert_array_equal(_var_s * 2, pd.eval(e, engine=engine, + parser=parser)) + + def test_global_scope(self): + e = '_var_s * 2' + for engine, parser in product(_engines, expr._parsers): + yield self.check_global_scope, e, engine, parser + + def check_no_new_locals(self, engine, parser): + tm.skip_if_no_ne(engine) + x = 1 + lcls = locals().copy() + pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser) + lcls2 = locals().copy() + lcls2.pop('lcls') + tm.assert_equal(lcls, lcls2) + + def test_no_new_locals(self): + for engine, parser in product(_engines, expr._parsers): + yield self.check_no_new_locals, engine, parser + + def check_no_new_globals(self, engine, parser): + tm.skip_if_no_ne(engine) + x = 1 + gbls = globals().copy() + pd.eval('x + 1', engine=engine, parser=parser) + gbls2 = globals().copy() + tm.assert_equal(gbls, gbls2) + + def test_no_new_globals(self): + for engine, parser in product(_engines, expr._parsers): + yield self.check_no_new_globals, engine, parser + + +def test_invalid_engine(): + tm.skip_if_no_ne() + assertRaisesRegexp(KeyError, 'Invalid engine \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + engine='asdf') + + +def test_invalid_parser(): + tm.skip_if_no_ne() + assertRaisesRegexp(KeyError, 'Invalid parser \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + parser='asdf') + + +_parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, + 'pandas': PandasExprVisitor} + + +def check_disallowed_nodes(engine, parser): + tm.skip_if_no_ne(engine) + VisitorClass = _parsers[parser] + uns_ops = VisitorClass.unsupported_nodes + inst = VisitorClass('x + 1', engine, parser) + + for ops in uns_ops: + assert_raises(NotImplementedError, getattr(inst, ops)) + + +def test_disallowed_nodes(): + for engine, visitor in product(_parsers, repeat=2): + yield check_disallowed_nodes, engine, visitor + + +def check_syntax_error_exprs(engine, parser): + tm.skip_if_no_ne(engine) + e = 's +' + 
assert_raises(SyntaxError, pd.eval, e, engine=engine, parser=parser) + + +def test_syntax_error_exprs(): + for engine, parser in ENGINES_PARSERS: + yield check_syntax_error_exprs, engine, parser + + +def check_name_error_exprs(engine, parser): + tm.skip_if_no_ne(engine) + e = 's + t' + with tm.assertRaises(NameError): + pd.eval(e, engine=engine, parser=parser) + + +def test_name_error_exprs(): + for engine, parser in ENGINES_PARSERS: + yield check_name_error_exprs, engine, parser + + +def check_invalid_numexpr_version(engine, parser): + def testit(): + a, b = 1, 2 + res = pd.eval('a + b', engine=engine, parser=parser) + tm.assert_equal(res, 3) + + if engine == 'numexpr': + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("no numexpr") + else: + if ne.__version__ < LooseVersion('2.0'): + with tm.assertRaisesRegexp(ImportError, "'numexpr' version is " + ".+, must be >= 2.0"): + testit() + else: + testit() + else: + testit() + + +def test_invalid_numexpr_version(): + for engine, parser in ENGINES_PARSERS: + yield check_invalid_numexpr_version, engine, parser + + +def check_invalid_local_variable_reference(engine, parser): + tm.skip_if_no_ne(engine) + + a, b = 1, 2 + exprs = 'a + @b', '@a + b', '@a + @b' + for expr in exprs: + if parser != 'pandas': + with tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is only"): + pd.eval(exprs, engine=engine, parser=parser) + else: + with tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is not"): + pd.eval(exprs, engine=engine, parser=parser) + + +def test_invalid_local_variable_reference(): + for engine, parser in ENGINES_PARSERS: + yield check_invalid_local_variable_reference, engine, parser + + +def check_numexpr_builtin_raises(engine, parser): + tm.skip_if_no_ne(engine) + sin, dotted_line = 1, 2 + if engine == 'numexpr': + with tm.assertRaisesRegexp(NumExprClobberingError, + 'Variables in expression .+'): + pd.eval('sin + dotted_line', engine=engine, parser=parser) + else: + res = pd.eval('sin + dotted_line', engine=engine, parser=parser) + tm.assert_equal(res, sin + dotted_line) + + +def test_numexpr_builtin_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_numexpr_builtin_raises, engine, parser + + +def check_bad_resolver_raises(engine, parser): + tm.skip_if_no_ne(engine) + cannot_resolve = 42, 3.0 + with tm.assertRaisesRegexp(TypeError, 'Resolver of type .+'): + pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine, + parser=parser) + + +def test_bad_resolver_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_bad_resolver_raises, engine, parser + + +def check_more_than_one_expression_raises(engine, parser): + tm.skip_if_no_ne(engine) + with tm.assertRaisesRegexp(SyntaxError, + 'only a single expression is allowed'): + pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser) + + +def test_more_than_one_expression_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_more_than_one_expression_raises, engine, parser + + +def check_bool_ops_fails_on_scalars(gen, lhs, cmp, rhs, engine, parser): + tm.skip_if_no_ne(engine) + mid = gen[type(lhs)]() + ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp) + for ex in (ex1, ex2, ex3): + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=engine, parser=parser) + + +def test_bool_ops_fails_on_scalars(): + _bool_ops_syms = 'and', 'or' + dtypes = int, float + gen = {int: lambda: np.random.randint(10), float: np.random.randn} + for 
engine, parser, dtype1, cmp, dtype2 in product(_engines, expr._parsers, + dtypes, _bool_ops_syms, + dtypes): + yield (check_bool_ops_fails_on_scalars, gen, gen[dtype1](), cmp, + gen[dtype2](), engine, parser) + + +def check_inf(engine, parser): + tm.skip_if_no_ne(engine) + s = 'inf + 1' + expected = np.inf + result = pd.eval(s, engine=engine, parser=parser) + tm.assert_equal(result, expected) + + +def test_inf(): + for engine, parser in ENGINES_PARSERS: + yield check_inf, engine, parser + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/core/__init__.py b/pandas/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py new file mode 100644 index 00000000..c45256c4 --- /dev/null +++ b/pandas/core/algorithms.py @@ -0,0 +1,522 @@ +""" +Generic data algorithms. This module is experimental at the moment and not +intended for public consumption +""" +from __future__ import division +from warnings import warn +import numpy as np + +import pandas.core.common as com +import pandas.algos as algos +import pandas.hashtable as htable +from pandas.compat import string_types + +def match(to_match, values, na_sentinel=-1): + """ + Compute locations of to_match into values + + Parameters + ---------- + to_match : array-like + values to find positions of + values : array-like + Unique set of values + na_sentinel : int, default -1 + Value to mark "not found" + + Examples + -------- + + Returns + ------- + match : ndarray of integers + """ + values = com._asarray_tuplesafe(values) + if issubclass(values.dtype.type, string_types): + values = np.array(values, dtype='O') + + f = lambda htype, caster: _match_generic(to_match, values, htype, caster) + result = _hashtable_algo(f, values.dtype) + + if na_sentinel != -1: + + # replace but return a numpy array + # use a Series because it handles dtype conversions properly + from pandas.core.series import Series + result = Series(result.ravel()).replace(-1,na_sentinel).values.reshape(result.shape) + + return result + + +def unique(values): + """ + Compute unique values (not necessarily sorted) efficiently from input array + of values + + Parameters + ---------- + values : array-like + + Returns + ------- + uniques + """ + values = com._asarray_tuplesafe(values) + f = lambda htype, caster: _unique_generic(values, htype, caster) + return _hashtable_algo(f, values.dtype) + + +def _hashtable_algo(f, dtype): + """ + f(HashTable, type_caster) -> result + """ + if com.is_float_dtype(dtype): + return f(htable.Float64HashTable, com._ensure_float64) + elif com.is_integer_dtype(dtype): + return f(htable.Int64HashTable, com._ensure_int64) + else: + return f(htable.PyObjectHashTable, com._ensure_object) + + +def _match_generic(values, index, table_type, type_caster): + values = type_caster(values) + index = type_caster(index) + table = table_type(min(len(index), 1000000)) + table.map_locations(index) + return table.lookup(values) + + +def _unique_generic(values, table_type, type_caster): + values = type_caster(values) + table = table_type(min(len(values), 1000000)) + uniques = table.unique(values) + return type_caster(uniques) + + +def factorize(values, sort=False, order=None, na_sentinel=-1): + """ + Encode input values as an enumerated type or categorical variable + + Parameters + ---------- + values : ndarray (1-d) + Sequence + sort : boolean, default False + Sort by values + order : deprecated + na_sentinel: int, 
default -1 + Value to mark "not found" + + Returns + ------- + labels : the indexer to the original array + uniques : ndarray (1-d) or Index + the unique values. Index is returned when passed values is Index or Series + + note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex + """ + if order is not None: + warn("order is deprecated." + "See https://github.com/pydata/pandas/issues/6926", FutureWarning) + + from pandas.core.index import Index + from pandas.core.series import Series + vals = np.asarray(values) + is_datetime = com.is_datetime64_dtype(vals) + (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) + + table = hash_klass(len(vals)) + uniques = vec_klass() + labels = table.get_labels(vals, uniques, 0, na_sentinel) + + labels = com._ensure_platform_int(labels) + + uniques = uniques.to_array() + + if sort and len(uniques) > 0: + try: + sorter = uniques.argsort() + except: + # unorderable in py3 if mixed str/int + t = hash_klass(len(uniques)) + t.map_locations(com._ensure_object(uniques)) + + # order ints before strings + ordered = np.concatenate([ + np.sort(np.array([ e for i, e in enumerate(uniques) if f(e) ],dtype=object)) for f in [ lambda x: not isinstance(x,string_types), + lambda x: isinstance(x,string_types) ] + ]) + sorter = com._ensure_platform_int(t.lookup(com._ensure_object(ordered))) + + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + labels = reverse_indexer.take(labels) + np.putmask(labels, mask, -1) + + uniques = uniques.take(sorter) + + if is_datetime: + uniques = uniques.astype('M8[ns]') + if isinstance(values, Index): + uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None), + tz=getattr(values, 'tz', None)) + elif isinstance(values, Series): + uniques = Index(uniques) + return labels, uniques + + +def value_counts(values, sort=True, ascending=False, normalize=False, + bins=None, dropna=True): + """ + Compute a histogram of the counts of non-null values. 
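An indicative interactive sketch of the factorize routine defined above; the exact array reprs are illustrative, and the missing-value behaviour assumes the default na_sentinel of -1.

>>> import numpy as np
>>> from pandas.core.algorithms import factorize
>>> labels, uniques = factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> labels                            # each value's position within the sorted uniques
array([1, 1, 0, 2, 1])
>>> uniques
array(['a', 'b', 'c'], dtype=object)
>>> factorize(['b', None, 'a'])[0]    # missing values are marked with na_sentinel
array([ 0, -1,  1])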
+ + Parameters + ---------- + values : ndarray (1-d) + sort : boolean, default True + Sort by values + ascending : boolean, default False + Sort in ascending order + normalize: boolean, default False + If True then compute a relative histogram + bins : integer, optional + Rather than count values, group them into half-open bins, + convenience for pd.cut, only works with numeric data + dropna : boolean, default True + Don't include counts of NaN + + Returns + ------- + value_counts : Series + + """ + from pandas.core.series import Series + from pandas.tools.tile import cut + + values = Series(values).values + + if bins is not None: + try: + cat, bins = cut(values, bins, retbins=True) + except TypeError: + raise TypeError("bins argument only works with numeric data.") + values = cat.labels + + dtype = values.dtype + if com.is_integer_dtype(dtype): + values = com._ensure_int64(values) + keys, counts = htable.value_count_int64(values) + + elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): + values = values.view(np.int64) + keys, counts = htable.value_count_int64(values) + + if dropna: + from pandas.tslib import iNaT + msk = keys != iNaT + keys, counts = keys[msk], counts[msk] + # convert the keys back to the dtype we came in + keys = keys.astype(dtype) + + else: + values = com._ensure_object(values) + mask = com.isnull(values) + keys, counts = htable.value_count_object(values, mask) + if not dropna: + keys = np.insert(keys, 0, np.NaN) + counts = np.insert(counts, 0, mask.sum()) + + result = Series(counts, index=com._values_from_object(keys)) + if bins is not None: + # TODO: This next line should be more efficient + result = result.reindex(np.arange(len(cat.levels)), fill_value=0) + result.index = bins[:-1] + + if sort: + result.sort() + if not ascending: + result = result[::-1] + + if normalize: + result = result / float(values.size) + + return result + + +def mode(values): + """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" + # must sort because hash order isn't necessarily defined. + from pandas.core.series import Series + + if isinstance(values, Series): + constructor = values._constructor + values = values.values + else: + values = np.asanyarray(values) + constructor = Series + + dtype = values.dtype + if com.is_integer_dtype(values.dtype): + values = com._ensure_int64(values) + result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) + + elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): + dtype = values.dtype + values = values.view(np.int64) + result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) + + else: + mask = com.isnull(values) + values = com._ensure_object(values) + res = htable.mode_object(values, mask) + try: + res = sorted(res) + except TypeError as e: + warn("Unable to sort modes: %s" % e) + result = constructor(res, dtype=dtype) + + return result + + +def rank(values, axis=0, method='average', na_option='keep', + ascending=True, pct=False): + """ + + """ + if values.ndim == 1: + f, values = _get_data_algo(values, _rank1d_functions) + ranks = f(values, ties_method=method, ascending=ascending, + na_option=na_option, pct=pct) + elif values.ndim == 2: + f, values = _get_data_algo(values, _rank2d_functions) + ranks = f(values, axis=axis, ties_method=method, + ascending=ascending, na_option=na_option, pct=pct) + + return ranks + + +def quantile(x, q, interpolation_method='fraction'): + """ + Compute sample quantile or quantiles of the input array. For example, q=0.5 + computes the median. 
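A short indicative session with the value_counts function just defined; output reprs are illustrative. sort=True orders by descending count and normalize=True rescales the counts to relative frequencies.

>>> import numpy as np
>>> from pandas.core.algorithms import value_counts
>>> value_counts(np.array([3, 1, 3, 3, 2, 1]))
3    3
1    2
2    1
dtype: int64
>>> value_counts(np.array([3, 1, 3, 3, 2, 1]), normalize=True)
3    0.500000
1    0.333333
2    0.166667
dtype: float64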
+ + The `interpolation_method` parameter supports three values, namely + `fraction` (default), `lower` and `higher`. Interpolation is done only, + if the desired quantile lies between two data points `i` and `j`. For + `fraction`, the result is an interpolated value between `i` and `j`; + for `lower`, the result is `i`, for `higher` the result is `j`. + + Parameters + ---------- + x : ndarray + Values from which to extract score. + q : scalar or array + Percentile at which to extract score. + interpolation_method : {'fraction', 'lower', 'higher'}, optional + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + - fraction: `i + (j - i)*fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + -lower: `i`. + - higher: `j`. + + Returns + ------- + score : float + Score at percentile. + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(100) + >>> stats.scoreatpercentile(a, 50) + 49.5 + + """ + x = np.asarray(x) + mask = com.isnull(x) + + x = x[~mask] + + values = np.sort(x) + + def _get_score(at): + if len(values) == 0: + return np.nan + + idx = at * (len(values) - 1) + if idx % 1 == 0: + score = values[int(idx)] + else: + if interpolation_method == 'fraction': + score = _interpolate(values[int(idx)], values[int(idx) + 1], + idx % 1) + elif interpolation_method == 'lower': + score = values[np.floor(idx)] + elif interpolation_method == 'higher': + score = values[np.ceil(idx)] + else: + raise ValueError("interpolation_method can only be 'fraction' " + ", 'lower' or 'higher'") + + return score + + if np.isscalar(q): + return _get_score(q) + else: + q = np.asarray(q, np.float64) + return algos.arrmap_float64(q, _get_score) + + +def _interpolate(a, b, fraction): + """Returns the point at the given fraction between a and b, where + 'fraction' must be between 0 and 1. + """ + return a + (b - a) * fraction + + +def _get_data_algo(values, func_map): + mask = None + if com.is_float_dtype(values): + f = func_map['float64'] + values = com._ensure_float64(values) + elif com.is_datetime64_dtype(values): + + # if we have NaT, punt to object dtype + mask = com.isnull(values) + if mask.ravel().any(): + f = func_map['generic'] + values = com._ensure_object(values) + values[mask] = np.nan + else: + f = func_map['int64'] + values = values.view('i8') + + elif com.is_integer_dtype(values): + f = func_map['int64'] + values = com._ensure_int64(values) + else: + f = func_map['generic'] + values = com._ensure_object(values) + return f, values + + +def group_position(*args): + """ + Get group position + """ + from collections import defaultdict + table = defaultdict(int) + + result = [] + for tup in zip(*args): + result.append(table[tup]) + table[tup] += 1 + + return result + + +_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'} + + +def _finalize_nsmallest(arr, kth_val, n, take_last, narr): + ns, = np.nonzero(arr <= kth_val) + inds = ns[arr[ns].argsort(kind='mergesort')][:n] + + if take_last: + # reverse indices + return narr - 1 - inds + return inds + + +def nsmallest(arr, n, take_last=False): + ''' + Find the indices of the n smallest values of a numpy array. + + Note: Fails silently with NaN. 
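A minimal usage sketch of the quantile helper above, mirroring the scoreatpercentile example in its docstring; a scalar q returns a scalar and an array of q values returns an array.

>>> import numpy as np
>>> from pandas.core.algorithms import quantile
>>> a = np.arange(100)
>>> quantile(a, 0.5)                  # 'fraction' interpolation between 49 and 50
49.5
>>> quantile(a, [0.25, 0.75])
array([ 24.75,  74.25])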
+ + ''' + if take_last: + arr = arr[::-1] + + narr = len(arr) + n = min(n, narr) + + sdtype = str(arr.dtype) + arr = arr.view(_dtype_map.get(sdtype, sdtype)) + + kth_val = algos.kth_smallest(arr.copy(), n - 1) + return _finalize_nsmallest(arr, kth_val, n, take_last, narr) + + +def nlargest(arr, n, take_last=False): + """ + Find the indices of the n largest values of a numpy array. + + Note: Fails silently with NaN. + """ + sdtype = str(arr.dtype) + arr = arr.view(_dtype_map.get(sdtype, sdtype)) + return nsmallest(-arr, n, take_last=take_last) + + +def select_n_slow(dropped, n, take_last, method): + reverse_it = take_last or method == 'nlargest' + ascending = method == 'nsmallest' + slc = np.s_[::-1] if reverse_it else np.s_[:] + return dropped[slc].order(ascending=ascending).head(n) + + +_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} + + +def select_n(series, n, take_last, method): + """Implement n largest/smallest. + + Parameters + ---------- + n : int + take_last : bool + method : str, {'nlargest', 'nsmallest'} + + Returns + ------- + nordered : Series + """ + dtype = series.dtype + if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64, + np.timedelta64)): + raise TypeError("Cannot use method %r with dtype %s" % (method, dtype)) + + if n <= 0: + return series[[]] + + dropped = series.dropna() + + if n >= len(series): + return select_n_slow(dropped, n, take_last, method) + + inds = _select_methods[method](dropped.values, n, take_last) + return dropped.iloc[inds] + + +_rank1d_functions = { + 'float64': algos.rank_1d_float64, + 'int64': algos.rank_1d_int64, + 'generic': algos.rank_1d_generic +} + +_rank2d_functions = { + 'float64': algos.rank_2d_float64, + 'int64': algos.rank_2d_int64, + 'generic': algos.rank_2d_generic +} + +_hashtables = { + 'float64': (htable.Float64HashTable, htable.Float64Vector), + 'int64': (htable.Int64HashTable, htable.Int64Vector), + 'generic': (htable.PyObjectHashTable, htable.ObjectVector) +} diff --git a/pandas/core/api.py b/pandas/core/api.py new file mode 100644 index 00000000..b7e02917 --- /dev/null +++ b/pandas/core/api.py @@ -0,0 +1,35 @@ + +# pylint: disable=W0614,W0401,W0611 + +import numpy as np + +from pandas.core.algorithms import factorize, match, unique, value_counts +from pandas.core.common import isnull, notnull +from pandas.core.categorical import Categorical +from pandas.core.groupby import Grouper +from pandas.core.format import set_eng_float_format +from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex + +from pandas.core.series import Series, TimeSeries +from pandas.core.frame import DataFrame +from pandas.core.panel import Panel +from pandas.core.panel4d import Panel4D +from pandas.core.groupby import groupby +from pandas.core.reshape import (pivot_simple as pivot, get_dummies, + lreshape, wide_to_long) + +WidePanel = Panel + +from pandas.core.indexing import IndexSlice +from pandas.tseries.offsets import DateOffset +from pandas.tseries.tools import to_datetime +from pandas.tseries.index import (DatetimeIndex, Timestamp, + date_range, bdate_range) +from pandas.tseries.period import Period, PeriodIndex + +# legacy +from pandas.core.common import save, load # deprecated, remove in 0.13 +import pandas.core.datetools as datetools + +from pandas.core.config import (get_option, set_option, reset_option, + describe_option, option_context, options) diff --git a/pandas/core/array.py b/pandas/core/array.py new file mode 100644 index 00000000..495f2319 --- /dev/null +++ b/pandas/core/array.py @@ -0,0 
+1,37 @@ +""" +Isolate pandas's exposure to NumPy +""" + +import numpy as np + +Array = np.ndarray + +bool = np.bool_ + +_dtypes = { + 'int': [8, 16, 32, 64], + 'uint': [8, 16, 32, 64], + 'float': [16, 32, 64] +} + +_lift_types = [] + +for _k, _v in _dtypes.items(): + for _i in _v: + _lift_types.append(_k + str(_i)) + +for _t in _lift_types: + globals()[_t] = getattr(np, _t) + +_lift_function = ['empty', 'arange', 'array', 'putmask', 'where'] + +for _f in _lift_function: + globals()[_f] = getattr(np, _f) + +_lift_random = ['randn', 'rand'] + +for _f in _lift_random: + globals()[_f] = getattr(np.random, _f) + +NA = np.nan + diff --git a/pandas/core/base.py b/pandas/core/base.py new file mode 100644 index 00000000..ce078eb9 --- /dev/null +++ b/pandas/core/base.py @@ -0,0 +1,494 @@ +""" +Base and utility classes for pandas objects. +""" +from pandas import compat +import numpy as np +from pandas.core import common as com +import pandas.core.nanops as nanops +import pandas.tslib as tslib +from pandas.util.decorators import cache_readonly + +class StringMixin(object): + + """implements string methods so long as object defines a `__unicode__` + method. + + Handles Python2/3 compatibility transparently. + """ + # side note - this could be made into a metaclass if more than one + # object needs + + #---------------------------------------------------------------------- + # Formatting + + def __unicode__(self): + raise NotImplementedError + + def __str__(self): + """ + Return a string representation for a particular Object + + Invoked by str(df) in both py2/py3. + Yields Bytestring in Py2, Unicode String in py3. + """ + + if compat.PY3: + return self.__unicode__() + return self.__bytes__() + + def __bytes__(self): + """ + Return a string representation for a particular object. + + Invoked by bytes(obj) in py3 only. + Yields a bytestring in both py2/py3. + """ + from pandas.core.config import get_option + + encoding = get_option("display.encoding") + return self.__unicode__().encode(encoding, 'replace') + + def __repr__(self): + """ + Return a string representation for a particular object. + + Yields Bytestring in Py2, Unicode String in py3. + """ + return str(self) + + +class PandasObject(StringMixin): + + """baseclass for various pandas objects""" + + @property + def _constructor(self): + """class constructor (for this class it's just `__class__`""" + return self.__class__ + + def __unicode__(self): + """ + Return a string representation for a particular object. + + Invoked by unicode(obj) in py2 only. Yields a Unicode String in both + py2/py3. + """ + # Should be overwritten by base classes + return object.__repr__(self) + + def _local_dir(self): + """ provide addtional __dir__ for this object """ + return [] + + def __dir__(self): + """ + Provide method name lookup and completion + Only provide 'public' methods + """ + return list(sorted(list(set(dir(type(self)) + self._local_dir())))) + + def _reset_cache(self, key=None): + """ + Reset cached properties. If ``key`` is passed, only clears that key. + """ + if getattr(self, '_cache', None) is None: + return + if key is None: + self._cache.clear() + else: + self._cache.pop(key, None) + + +class FrozenList(PandasObject, list): + + """ + Container that doesn't allow setting item *but* + because it's technically non-hashable, will be used + for lookups, appropriately, etc. 
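pandas/core/array.py above does nothing more than re-export a handful of NumPy names at module level (the dtype names built in the _dtypes loop, a few constructors, and the random helpers), giving the rest of the codebase a single indirection point. A small indicative sketch of what ends up exposed:

>>> import numpy as np
>>> from pandas.core import array as pa
>>> pa.Array is np.ndarray
True
>>> pa.float64 is np.float64          # lifted by the _dtypes loop
True
>>> pa.NA                             # NA is simply np.nan
nan
>>> pa.arange(3)                      # empty/arange/array/putmask/where are lifted too
array([0, 1, 2])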
+ """ + # Sidenote: This has to be of type list, otherwise it messes up PyTables + # typechecks + + def __add__(self, other): + if isinstance(other, tuple): + other = list(other) + return self.__class__(super(FrozenList, self).__add__(other)) + + __iadd__ = __add__ + + # Python 2 compat + def __getslice__(self, i, j): + return self.__class__(super(FrozenList, self).__getslice__(i, j)) + + def __getitem__(self, n): + # Python 3 compat + if isinstance(n, slice): + return self.__class__(super(FrozenList, self).__getitem__(n)) + return super(FrozenList, self).__getitem__(n) + + def __radd__(self, other): + if isinstance(other, tuple): + other = list(other) + return self.__class__(other + list(self)) + + def __eq__(self, other): + if isinstance(other, (tuple, FrozenList)): + other = list(other) + return super(FrozenList, self).__eq__(other) + + __req__ = __eq__ + + def __mul__(self, other): + return self.__class__(super(FrozenList, self).__mul__(other)) + + __imul__ = __mul__ + + def __reduce__(self): + return self.__class__, (list(self),) + + def __hash__(self): + return hash(tuple(self)) + + def _disabled(self, *args, **kwargs): + """This method will not function because object is immutable.""" + raise TypeError("'%s' does not support mutable operations." % + self.__class__.__name__) + + def __unicode__(self): + from pandas.core.common import pprint_thing + return pprint_thing(self, quote_strings=True, + escape_chars=('\t', '\r', '\n')) + + def __repr__(self): + return "%s(%s)" % (self.__class__.__name__, + str(self)) + + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + pop = append = extend = remove = sort = insert = _disabled + + +class FrozenNDArray(PandasObject, np.ndarray): + + # no __array_finalize__ for now because no metadata + def __new__(cls, data, dtype=None, copy=False): + if copy is None: + copy = not isinstance(data, FrozenNDArray) + res = np.array(data, dtype=dtype, copy=copy).view(cls) + return res + + def _disabled(self, *args, **kwargs): + """This method will not function because object is immutable.""" + raise TypeError("'%s' does not support mutable operations." % + self.__class__) + + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + put = itemset = fill = _disabled + + def _shallow_copy(self): + return self.view() + + def values(self): + """returns *copy* of underlying array""" + arr = self.view(np.ndarray).copy() + return arr + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. 
+ """ + prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'), + quote_strings=True) + return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + + +class IndexOpsMixin(object): + """ common ops mixin to support a unified inteface / docs for Series / Index """ + + def _is_allowed_index_op(self, name): + if not self._allow_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _ops_compat(self, name, op_accessor): + + obj = self._get_access_object() + try: + return self._wrap_access_object(getattr(obj,op_accessor)) + except AttributeError: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(obj))) + + def _get_access_object(self): + if isinstance(self, com.ABCSeries): + return self.index + return self + + def _wrap_access_object(self, obj): + # we may need to coerce the input as we don't want non int64 if + # we have an integer result + if hasattr(obj,'dtype') and com.is_integer_dtype(obj): + obj = obj.astype(np.int64) + + if isinstance(self, com.ABCSeries): + return self._constructor(obj,index=self.index).__finalize__(self) + + return obj + + def max(self): + """ The maximum value of the object """ + return nanops.nanmax(self.values) + + def min(self): + """ The minimum value of the object """ + return nanops.nanmin(self.values) + + def value_counts(self, normalize=False, sort=True, ascending=False, + bins=None, dropna=True): + """ + Returns object containing counts of unique values. + + The resulting object will be in descending order so that the + first element is the most frequently-occurring element. + Excludes NA values by default. + + Parameters + ---------- + normalize : boolean, default False + If True then the object returned will contain the relative + frequencies of the unique values. + sort : boolean, default True + Sort by values + ascending : boolean, default False + Sort in ascending order + bins : integer, optional + Rather than count values, group them into half-open bins, + a convenience for pd.cut, only works with numeric data + dropna : boolean, default True + Don't include counts of NaN. + + Returns + ------- + counts : Series + """ + from pandas.core.algorithms import value_counts + return value_counts(self.values, sort=sort, ascending=ascending, + normalize=normalize, bins=bins, dropna=dropna) + + def unique(self): + """ + Return array of unique values in the object. Significantly faster than + numpy.unique. Includes NA values. + + Returns + ------- + uniques : ndarray + """ + from pandas.core.nanops import unique1d + return unique1d(self.values) + + def nunique(self, dropna=True): + """ + Return number of unique elements in the object. + + Excludes NA values by default. + + Parameters + ---------- + dropna : boolean, default True + Don't include NaN in the count. 
+ + Returns + ------- + nunique : int + """ + return len(self.value_counts(dropna=dropna)) + + def factorize(self, sort=False, na_sentinel=-1): + """ + Encode the object as an enumerated type or categorical variable + + Parameters + ---------- + sort : boolean, default False + Sort by values + na_sentinel: int, default -1 + Value to mark "not found" + + Returns + ------- + labels : the indexer to the original array + uniques : the unique Index + """ + from pandas.core.algorithms import factorize + return factorize(self, sort=sort, na_sentinel=na_sentinel) + +# facilitate the properties on the wrapped ops +def _field_accessor(name, docstring=None): + op_accessor = '_{0}'.format(name) + def f(self): + return self._ops_compat(name,op_accessor) + + f.__name__ = name + f.__doc__ = docstring + return property(f) + +class DatetimeIndexOpsMixin(object): + """ common ops mixin to support a unified inteface datetimelike Index """ + + def _is_allowed_datetime_index_op(self, name): + if not self._allow_datetime_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _is_allowed_period_index_op(self, name): + if not self._allow_period_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _ops_compat(self, name, op_accessor): + + from pandas.tseries.index import DatetimeIndex + from pandas.tseries.period import PeriodIndex + obj = self._get_access_object() + if isinstance(obj, DatetimeIndex): + self._is_allowed_datetime_index_op(name) + elif isinstance(obj, PeriodIndex): + self._is_allowed_period_index_op(name) + try: + return self._wrap_access_object(getattr(obj,op_accessor)) + except AttributeError: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(obj))) + + date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps') + time = _field_accessor('time','Returns numpy array of datetime.time. 
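Series mixes in IndexOpsMixin, so the value_counts/unique/nunique methods above are what sit behind the familiar Series calls; an indicative session (reprs are illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series(['a', 'b', 'a', np.nan])
>>> s.value_counts()                  # NaN excluded by default (dropna=True)
a    2
b    1
dtype: int64
>>> s.nunique()                       # distinct non-null values
2
>>> s.unique()                        # unique, by contrast, keeps the NaN
array(['a', 'b', nan], dtype=object)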
The time part of the Timestamps') + year = _field_accessor('year', "The year of the datetime") + month = _field_accessor('month', "The month as January=1, December=12") + day = _field_accessor('day', "The days of the datetime") + hour = _field_accessor('hour', "The hours of the datetime") + minute = _field_accessor('minute', "The minutes of the datetime") + second = _field_accessor('second', "The seconds of the datetime") + microsecond = _field_accessor('microsecond', "The microseconds of the datetime") + nanosecond = _field_accessor('nanosecond', "The nanoseconds of the datetime") + weekofyear = _field_accessor('weekofyear', "The week ordinal of the year") + week = weekofyear + dayofweek = _field_accessor('dayofweek', "The day of the week with Monday=0, Sunday=6") + weekday = dayofweek + dayofyear = _field_accessor('dayofyear', "The ordinal day of the year") + quarter = _field_accessor('quarter', "The quarter of the date") + qyear = _field_accessor('qyear') + is_month_start = _field_accessor('is_month_start', "Logical indicating if first day of month (defined by frequency)") + is_month_end = _field_accessor('is_month_end', "Logical indicating if last day of month (defined by frequency)") + is_quarter_start = _field_accessor('is_quarter_start', "Logical indicating if first day of quarter (defined by frequency)") + is_quarter_end = _field_accessor('is_quarter_end', "Logical indicating if last day of quarter (defined by frequency)") + is_year_start = _field_accessor('is_year_start', "Logical indicating if first day of year (defined by frequency)") + is_year_end = _field_accessor('is_year_end', "Logical indicating if last day of year (defined by frequency)") + + @property + def _box_func(self): + """ + box function to get object from internal representation + """ + raise NotImplementedError + + def _box_values(self, values): + """ + apply box func to passed values + """ + import pandas.lib as lib + return lib.map_infer(values, self._box_func) + + @cache_readonly + def hasnans(self): + """ return if I have any nans; enables various perf speedups """ + return (self.asi8 == tslib.iNaT).any() + + @property + def asobject(self): + from pandas.core.index import Index + return Index(self._box_values(self.asi8), name=self.name, dtype=object) + + def tolist(self): + """ + See ndarray.tolist + """ + return list(self.asobject) + + def min(self, axis=None): + """ + Overridden ndarray.min to return an object + """ + try: + i8 = self.asi8 + + # quick check + if len(i8) and self.is_monotonic: + if i8[0] != tslib.iNaT: + return self._box_func(i8[0]) + + if self.hasnans: + mask = i8 == tslib.iNaT + min_stamp = self[~mask].asi8.min() + else: + min_stamp = i8.min() + return self._box_func(min_stamp) + except ValueError: + return self._na_value + + def max(self, axis=None): + """ + Overridden ndarray.max to return an object + """ + try: + i8 = self.asi8 + + # quick check + if len(i8) and self.is_monotonic: + if i8[-1] != tslib.iNaT: + return self._box_func(i8[-1]) + + if self.hasnans: + mask = i8 == tslib.iNaT + max_stamp = self[~mask].asi8.max() + else: + max_stamp = i8.max() + return self._box_func(max_stamp) + except ValueError: + return self._na_value + + @property + def _formatter_func(self): + """ + Format function to convert value to representation + """ + return str + + def _format_footer(self): + tagline = 'Length: %d, Freq: %s, Timezone: %s' + return tagline % (len(self), self.freqstr, self.tz) + + def __unicode__(self): + formatter = self._formatter_func + summary = str(self.__class__) + '\n' + + n = 
len(self) + if n == 0: + pass + elif n == 1: + first = formatter(self[0]) + summary += '[%s]\n' % first + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary += '[%s, %s]\n' % (first, last) + else: + first = formatter(self[0]) + last = formatter(self[-1]) + summary += '[%s, ..., %s]\n' % (first, last) + + summary += self._format_footer() + return summary + diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py new file mode 100644 index 00000000..dfadd34e --- /dev/null +++ b/pandas/core/categorical.py @@ -0,0 +1,226 @@ +# pylint: disable=E1101,W0232 + +import numpy as np + +from pandas import compat +from pandas.compat import u + +from pandas.core.algorithms import factorize +from pandas.core.base import PandasObject +from pandas.core.index import Index +import pandas.core.common as com +from pandas.util.terminal import get_terminal_size +from pandas.core.config import get_option +from pandas.core import format as fmt + + +def _cat_compare_op(op): + def f(self, other): + if isinstance(other, (Categorical, np.ndarray)): + values = np.asarray(self) + f = getattr(values, op) + return f(np.asarray(other)) + else: + if other in self.levels: + i = self.levels.get_loc(other) + return getattr(self.labels, op)(i) + else: + return np.repeat(False, len(self)) + + f.__name__ = op + + return f + + +class Categorical(PandasObject): + + """ + Represents a categorical variable in classic R / S-plus fashion + + Parameters + ---------- + labels : ndarray of integers + If levels is given, the integer at label `i` is the index of the level + for that label. I.e., the level at labels[i] is levels[labels[i]]. + Otherwise, if levels is None, these are just the labels and the levels + are assumed to be the unique labels. See from_array. + levels : Index-like (unique), optional + The unique levels for each label. If not given, the levels are assumed + to be the unique values of labels. + name : str, optional + Name for the Categorical variable. If levels is None, will attempt + to infer from labels. + + Returns + ------- + **Attributes** + * labels : ndarray + * levels : ndarray + + Examples + -------- + >>> from pandas import Categorical + >>> Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) + Categorical: + array([1, 2, 3, 1, 2, 3]) + Levels (3): Int64Index([1, 2, 3]) + + >>> Categorical([0,1,2,0,1,2], ['a', 'b', 'c']) + Categorical: + array(['a', 'b', 'c', 'a', 'b', 'c'], dtype=object) + Levels (3): Index(['a', 'b', 'c'], dtype=object) + + >>> Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + Categorical: + array(['a', 'b', 'c', 'a', 'b', 'c'], dtype=object) + Levels (3): Index(['a', 'b', 'c'], dtype=object) + """ + + def __init__(self, labels, levels=None, name=None): + if levels is None: + if name is None: + name = getattr(labels, 'name', None) + try: + labels, levels = factorize(labels, sort=True) + except TypeError: + labels, levels = factorize(labels, sort=False) + + self.labels = labels + self.levels = levels + self.name = name + + @classmethod + def from_array(cls, data): + """ + Make a Categorical type from a single array-like object. + + Parameters + ---------- + data : array-like + Can be an Index or array-like. The levels are assumed to be + the unique values of `data`. 
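To make the Categorical machinery above concrete, an indicative session showing how labels and levels are derived from raw values and how the comparison ops generated by _cat_compare_op behave against a scalar level:

>>> from pandas import Categorical
>>> c = Categorical.from_array(['a', 'b', 'c', 'a'])
>>> c.labels
array([0, 1, 2, 0])
>>> list(c.levels)
['a', 'b', 'c']
>>> c == 'a'                          # compares through the integer labels
array([ True, False, False,  True], dtype=bool)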
+ """ + return Categorical(data) + + _levels = None + + def _set_levels(self, levels): + from pandas.core.index import _ensure_index + + levels = _ensure_index(levels) + if not levels.is_unique: + raise ValueError('Categorical levels must be unique') + self._levels = levels + + def _get_levels(self): + return self._levels + + levels = property(fget=_get_levels, fset=_set_levels) + + __eq__ = _cat_compare_op('__eq__') + __ne__ = _cat_compare_op('__ne__') + __lt__ = _cat_compare_op('__lt__') + __gt__ = _cat_compare_op('__gt__') + __le__ = _cat_compare_op('__le__') + __ge__ = _cat_compare_op('__ge__') + + def __array__(self, dtype=None): + return com.take_1d(self.levels.values, self.labels) + + def __len__(self): + return len(self.labels) + + def _tidy_repr(self, max_vals=20): + num = max_vals // 2 + head = self[:num]._get_repr(length=False, name=False, footer=False) + tail = self[-(max_vals - num):]._get_repr(length=False, + name=False, + footer=False) + + result = '%s\n...\n%s' % (head, tail) + # TODO: tidy_repr for footer since there may be a ton of levels? + result = '%s\n%s' % (result, self._repr_footer()) + + return compat.text_type(result) + + def _repr_footer(self): + levheader = 'Levels (%d): ' % len(self.levels) + # TODO: should max_line_width respect a setting? + levstring = np.array_repr(self.levels, max_line_width=60) + indent = ' ' * (levstring.find('[') + len(levheader) + 1) + lines = levstring.split('\n') + levstring = '\n'.join([lines[0]] + + [indent + x.lstrip() for x in lines[1:]]) + + namestr = "Name: %s, " % self.name if self.name is not None else "" + return u('%s\n%sLength: %d' % (levheader + levstring, namestr, + len(self))) + + def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): + formatter = fmt.CategoricalFormatter(self, name=name, + length=length, na_rep=na_rep, + footer=footer) + result = formatter.to_string() + return compat.text_type(result) + + def __unicode__(self): + width, height = get_terminal_size() + max_rows = (height if get_option("display.max_rows") == 0 + else get_option("display.max_rows")) + if len(self.labels) > (max_rows or 1000): + result = self._tidy_repr(min(30, max_rows) - 4) + elif len(self.labels) > 0: + result = self._get_repr(length=len(self) > 50, + name=True) + else: + result = 'Categorical([], %s' % self._get_repr(name=True, + length=False, + footer=True, + ) + + return result + + def __getitem__(self, key): + if isinstance(key, (int, np.integer)): + i = self.labels[key] + if i == -1: + return np.nan + else: + return self.levels[i] + else: + return Categorical(self.labels[key], self.levels) + + def equals(self, other): + """ + Returns True if categorical arrays are equal + + Parameters + ---------- + other : Categorical + + Returns + ------- + are_equal : boolean + """ + if not isinstance(other, Categorical): + return False + + return (self.levels.equals(other.levels) and + np.array_equal(self.labels, other.labels)) + + def describe(self): + """ + Returns a dataframe with frequency and counts by level. + """ + # Hack? 
+ from pandas.core.frame import DataFrame + counts = DataFrame({ + 'labels' : self.labels, + 'values' : self.labels } + ).groupby('labels').count().squeeze().values + freqs = counts / float(counts.sum()) + return DataFrame({ + 'counts': counts, + 'freqs': freqs, + 'levels': self.levels + }).set_index('levels') diff --git a/pandas/core/common.py b/pandas/core/common.py new file mode 100644 index 00000000..bb7f4351 --- /dev/null +++ b/pandas/core/common.py @@ -0,0 +1,2931 @@ +""" +Misc tools for implementing data structures +""" + +import re +import collections +import numbers +import codecs +import csv +import types +from datetime import datetime, timedelta + +from numpy.lib.format import read_array, write_array +import numpy as np + +import pandas as pd +import pandas.algos as algos +import pandas.lib as lib +import pandas.tslib as tslib +from pandas import compat +from pandas.compat import StringIO, BytesIO, range, long, u, zip, map + +from pandas.core.config import get_option +from pandas.core import array as pa + + +class PandasError(Exception): + pass + + +class SettingWithCopyError(ValueError): + pass + + +class SettingWithCopyWarning(Warning): + pass + + +class AmbiguousIndexError(PandasError, KeyError): + pass + + +_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name + for t in ['O', 'int8', + 'uint8', 'int16', 'uint16', 'int32', + 'uint32', 'int64', 'uint64']]) + +_NS_DTYPE = np.dtype('M8[ns]') +_TD_DTYPE = np.dtype('m8[ns]') +_INT64_DTYPE = np.dtype(np.int64) +_DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'M8[ns]', + 'm8[ns]', 'm8[ns]']]) + + +# define abstract base classes to enable isinstance type checking on our +# objects +def create_pandas_abc_type(name, attr, comp): + @classmethod + def _check(cls, inst): + return getattr(inst, attr, None) in comp + dct = dict(__instancecheck__=_check, + __subclasscheck__=_check) + meta = type("ABCBase", (type,), dct) + return meta(name, tuple(), dct) + + +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) +ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", + ('sparse_series', + 'sparse_time_series')) +ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", + ('sparse_array', 'sparse_series')) + + +class _ABCGeneric(type): + + def __instancecheck__(cls, inst): + return hasattr(inst, "_data") + + +ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {}) + + +def bind_method(cls, name, func): + """Bind a method to class, python 2 and python 3 compatible. + + Parameters + ---------- + + cls : type + class to receive bound method + name : basestring + name of method on class instance + func : function + function to be bound as method + + + Returns + ------- + None + """ + # only python 2 has bound/unbound method issue + if not compat.PY3: + setattr(cls, name, types.MethodType(func, None, cls)) + else: + setattr(cls, name, func) + + +def isnull(obj): + """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) + + Parameters + ---------- + arr : ndarray or object value + Object to check for null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is null or if an array is + given which of the element is null. 
+ + See also + -------- + pandas.notnull: boolean inverse of pandas.isnull + """ + return _isnull(obj) + + +def _isnull_new(obj): + if lib.isscalar(obj): + return lib.checknull(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, pd.MultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray)): + return _isnull_ndarraylike(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=isnull)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike(np.asarray(obj)) + else: + return obj is None + + +def _isnull_old(obj): + """Detect missing values. Treat None, NaN, INF, -INF as null. + + Parameters + ---------- + arr: ndarray or object value + + Returns + ------- + boolean ndarray or boolean + """ + if lib.isscalar(obj): + return lib.checknull_old(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, pd.MultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray)): + return _isnull_ndarraylike_old(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=_isnull_old)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike_old(np.asarray(obj)) + else: + return obj is None + +_isnull = _isnull_new + + +def _use_inf_as_null(key): + """Option change callback for null/inf behaviour + Choose which replacement for numpy.isnan / -numpy.isfinite is used. + + Parameters + ---------- + flag: bool + True means treat None, NaN, INF, -INF as null (old way), + False means None and NaN are null, but INF, -INF are not null + (new way). + + Notes + ----- + This approach to setting global module values is discussed and + approved here: + + * http://stackoverflow.com/questions/4859217/ + programmatically-creating-variables-in-python/4859312#4859312 + """ + flag = get_option(key) + if flag: + globals()['_isnull'] = _isnull_old + else: + globals()['_isnull'] = _isnull_new + + +def _isnull_ndarraylike(obj): + + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if dtype.kind in ('O', 'S', 'U'): + # Working around NumPy ticket 1542 + shape = values.shape + + if dtype.kind in ('S', 'U'): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[...] 
= vec.reshape(shape) + + elif dtype in _DATELIKE_DTYPES: + # this is the NaT pattern + result = values.view('i8') == tslib.iNaT + else: + result = np.isnan(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def _isnull_ndarraylike_old(obj): + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if dtype.kind in ('O', 'S', 'U'): + # Working around NumPy ticket 1542 + shape = values.shape + + if values.dtype.kind in ('S', 'U'): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj_old(values.ravel()) + result[:] = vec.reshape(shape) + + elif dtype in _DATELIKE_DTYPES: + # this is the NaT pattern + result = values.view('i8') == tslib.iNaT + else: + result = ~np.isfinite(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def notnull(obj): + """Replacement for numpy.isfinite / -numpy.isnan which is suitable for use + on object arrays. + + Parameters + ---------- + arr : ndarray or object value + Object to check for *not*-null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is *not* null or if an array + is given which of the element is *not* null. + + See also + -------- + pandas.isnull : boolean inverse of pandas.notnull + """ + res = isnull(obj) + if np.isscalar(res): + return not res + return ~res + +def _is_null_datelike_scalar(other): + """ test whether the object is a null datelike, e.g. Nat + but guard against passing a non-scalar """ + if other is pd.NaT or other is None: + return True + elif np.isscalar(other): + + # a timedelta + if hasattr(other,'dtype'): + return other.view('i8') == tslib.iNaT + elif is_integer(other) and other == tslib.iNaT: + return True + return isnull(other) + return False + +def array_equivalent(left, right): + """ + True if two arrays, left and right, have equal non-NaN elements, and NaNs in + corresponding locations. False otherwise. It is assumed that left and right + are NumPy arrays of the same dtype. The behavior of this function + (particularly with respect to NaNs) is not defined if the dtypes are + different. + + Parameters + ---------- + left, right : ndarrays + + Returns + ------- + b : bool + Returns True if the arrays are equivalent. + + Examples + -------- + >>> array_equivalent(np.array([1, 2, nan]), np.array([1, 2, nan])) + True + >>> array_equivalent(np.array([1, nan, 2]), np.array([1, 2, nan])) + False + """ + left, right = np.asarray(left), np.asarray(right) + if left.shape != right.shape: return False + # NaNs occur only in object arrays, float or complex arrays. + if issubclass(left.dtype.type, np.object_): + return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all() + if issubclass(left.dtype.type, (np.floating, np.complexfloating)): + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + return np.array_equal(left, right) + +def _iterable_not_string(x): + return (isinstance(x, collections.Iterable) and + not isinstance(x, compat.string_types)) + + +def flatten(l): + """Flatten an arbitrarily nested sequence. + + Parameters + ---------- + l : sequence + The non string sequence to flatten + + Notes + ----- + This doesn't consider strings sequences. 
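A compact indicative sketch of the isnull/notnull pair defined above, on float and object data (notnull is just the boolean inverse); reprs are illustrative.

>>> import numpy as np
>>> import pandas as pd
>>> pd.isnull(np.array([1.0, np.nan, 3.0]))
array([False,  True, False], dtype=bool)
>>> pd.isnull(np.array(['a', None], dtype=object))
array([False,  True], dtype=bool)
>>> pd.notnull(np.nan), pd.notnull(3)
(False, True)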
+ + Returns + ------- + flattened : generator + """ + for el in l: + if _iterable_not_string(el): + for s in flatten(el): + yield s + else: + yield el + + +def mask_missing(arr, values_to_mask): + """ + Return a masking array of same size/shape as arr + with entries equaling any member of values_to_mask set to True + """ + if not isinstance(values_to_mask, (list, np.ndarray)): + values_to_mask = [values_to_mask] + + try: + values_to_mask = np.array(values_to_mask, dtype=arr.dtype) + except Exception: + values_to_mask = np.array(values_to_mask, dtype=object) + + na_mask = isnull(values_to_mask) + nonna = values_to_mask[~na_mask] + + mask = None + for x in nonna: + if mask is None: + mask = arr == x + + # if x is a string and arr is not, then we get False and we must + # expand the mask to size arr.shape + if np.isscalar(mask): + mask = np.zeros(arr.shape, dtype=bool) + else: + mask |= arr == x + + if na_mask.any(): + if mask is None: + mask = isnull(arr) + else: + mask |= isnull(arr) + + return mask + + +def _pickle_array(arr): + arr = arr.view(np.ndarray) + + buf = BytesIO() + write_array(buf, arr) + + return buf.getvalue() + + +def _unpickle_array(bytes): + arr = read_array(BytesIO(bytes)) + + # All datetimes should be stored as M8[ns]. When unpickling with + # numpy1.6, it will read these as M8[us]. So this ensures all + # datetime64 types are read as MS[ns] + if is_datetime64_dtype(arr): + arr = arr.view(_NS_DTYPE) + + return arr + + +def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None): + def wrapper(arr, indexer, out, fill_value=np.nan): + if arr_dtype is not None: + arr = arr.view(arr_dtype) + if out_dtype is not None: + out = out.view(out_dtype) + if fill_wrap is not None: + fill_value = fill_wrap(fill_value) + f(arr, indexer, out, fill_value=fill_value) + return wrapper + + +def _convert_wrapper(f, conv_dtype): + def wrapper(arr, indexer, out, fill_value=np.nan): + arr = arr.astype(conv_dtype) + f(arr, indexer, out, fill_value=fill_value) + return wrapper + + +def _take_2d_multi_generic(arr, indexer, out, fill_value, mask_info): + # this is not ideal, performance-wise, but it's better than raising + # an exception (best to optimize in Cython to avoid getting here) + row_idx, col_idx = indexer + if mask_info is not None: + (row_mask, col_mask), (row_needs, col_needs) = mask_info + else: + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + if fill_value is not None: + if row_needs: + out[row_mask, :] = fill_value + if col_needs: + out[:, col_mask] = fill_value + for i in range(len(row_idx)): + u_ = row_idx[i] + for j in range(len(col_idx)): + v = col_idx[j] + out[i, j] = arr[u_, v] + + +def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): + if mask_info is not None: + mask, needs_masking = mask_info + else: + mask = indexer == -1 + needs_masking = mask.any() + if arr.dtype != out.dtype: + arr = arr.astype(out.dtype) + if arr.shape[axis] > 0: + arr.take(_ensure_platform_int(indexer), axis=axis, out=out) + if needs_masking: + outindexer = [slice(None)] * arr.ndim + outindexer[axis] = mask + out[tuple(outindexer)] = fill_value + + +_take_1d_dict = { + ('int8', 'int8'): algos.take_1d_int8_int8, + ('int8', 'int32'): algos.take_1d_int8_int32, + ('int8', 'int64'): algos.take_1d_int8_int64, + ('int8', 'float64'): algos.take_1d_int8_float64, + ('int16', 'int16'): algos.take_1d_int16_int16, + ('int16', 'int32'): algos.take_1d_int16_int32, + ('int16', 'int64'): algos.take_1d_int16_int64, + ('int16', 
'float64'): algos.take_1d_int16_float64, + ('int32', 'int32'): algos.take_1d_int32_int32, + ('int32', 'int64'): algos.take_1d_int32_int64, + ('int32', 'float64'): algos.take_1d_int32_float64, + ('int64', 'int64'): algos.take_1d_int64_int64, + ('int64', 'float64'): algos.take_1d_int64_float64, + ('float32', 'float32'): algos.take_1d_float32_float32, + ('float32', 'float64'): algos.take_1d_float32_float64, + ('float64', 'float64'): algos.take_1d_float64_float64, + ('object', 'object'): algos.take_1d_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_1d_bool_object, np.uint8, None), + ('datetime64[ns]', 'datetime64[ns]'): + _view_wrapper(algos.take_1d_int64_int64, np.int64, np.int64, np.int64) +} + + +_take_2d_axis0_dict = { + ('int8', 'int8'): algos.take_2d_axis0_int8_int8, + ('int8', 'int32'): algos.take_2d_axis0_int8_int32, + ('int8', 'int64'): algos.take_2d_axis0_int8_int64, + ('int8', 'float64'): algos.take_2d_axis0_int8_float64, + ('int16', 'int16'): algos.take_2d_axis0_int16_int16, + ('int16', 'int32'): algos.take_2d_axis0_int16_int32, + ('int16', 'int64'): algos.take_2d_axis0_int16_int64, + ('int16', 'float64'): algos.take_2d_axis0_int16_float64, + ('int32', 'int32'): algos.take_2d_axis0_int32_int32, + ('int32', 'int64'): algos.take_2d_axis0_int32_int64, + ('int32', 'float64'): algos.take_2d_axis0_int32_float64, + ('int64', 'int64'): algos.take_2d_axis0_int64_int64, + ('int64', 'float64'): algos.take_2d_axis0_int64_float64, + ('float32', 'float32'): algos.take_2d_axis0_float32_float32, + ('float32', 'float64'): algos.take_2d_axis0_float32_float64, + ('float64', 'float64'): algos.take_2d_axis0_float64_float64, + ('object', 'object'): algos.take_2d_axis0_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), + ('datetime64[ns]', 'datetime64[ns]'): + _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64, + fill_wrap=np.int64) +} + + +_take_2d_axis1_dict = { + ('int8', 'int8'): algos.take_2d_axis1_int8_int8, + ('int8', 'int32'): algos.take_2d_axis1_int8_int32, + ('int8', 'int64'): algos.take_2d_axis1_int8_int64, + ('int8', 'float64'): algos.take_2d_axis1_int8_float64, + ('int16', 'int16'): algos.take_2d_axis1_int16_int16, + ('int16', 'int32'): algos.take_2d_axis1_int16_int32, + ('int16', 'int64'): algos.take_2d_axis1_int16_int64, + ('int16', 'float64'): algos.take_2d_axis1_int16_float64, + ('int32', 'int32'): algos.take_2d_axis1_int32_int32, + ('int32', 'int64'): algos.take_2d_axis1_int32_int64, + ('int32', 'float64'): algos.take_2d_axis1_int32_float64, + ('int64', 'int64'): algos.take_2d_axis1_int64_int64, + ('int64', 'float64'): algos.take_2d_axis1_int64_float64, + ('float32', 'float32'): algos.take_2d_axis1_float32_float32, + ('float32', 'float64'): algos.take_2d_axis1_float32_float64, + ('float64', 'float64'): algos.take_2d_axis1_float64_float64, + ('object', 'object'): algos.take_2d_axis1_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), + ('datetime64[ns]', 'datetime64[ns]'): + _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64, + fill_wrap=np.int64) +} + + +_take_2d_multi_dict = { + ('int8', 'int8'): algos.take_2d_multi_int8_int8, + ('int8', 'int32'): algos.take_2d_multi_int8_int32, + 
('int8', 'int64'): algos.take_2d_multi_int8_int64, + ('int8', 'float64'): algos.take_2d_multi_int8_float64, + ('int16', 'int16'): algos.take_2d_multi_int16_int16, + ('int16', 'int32'): algos.take_2d_multi_int16_int32, + ('int16', 'int64'): algos.take_2d_multi_int16_int64, + ('int16', 'float64'): algos.take_2d_multi_int16_float64, + ('int32', 'int32'): algos.take_2d_multi_int32_int32, + ('int32', 'int64'): algos.take_2d_multi_int32_int64, + ('int32', 'float64'): algos.take_2d_multi_int32_float64, + ('int64', 'int64'): algos.take_2d_multi_int64_int64, + ('int64', 'float64'): algos.take_2d_multi_int64_float64, + ('float32', 'float32'): algos.take_2d_multi_float32_float32, + ('float32', 'float64'): algos.take_2d_multi_float32_float64, + ('float64', 'float64'): algos.take_2d_multi_float64_float64, + ('object', 'object'): algos.take_2d_multi_object_object, + ('bool', 'bool'): + _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), + ('bool', 'object'): + _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), + ('datetime64[ns]', 'datetime64[ns]'): + _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64, + fill_wrap=np.int64) +} + + +def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): + if ndim <= 2: + tup = (arr_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + return func + + tup = (out_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + func = _convert_wrapper(func, out_dtype) + return func + + def func(arr, indexer, out, fill_value=np.nan): + _take_nd_generic(arr, indexer, out, axis=axis, + fill_value=fill_value, mask_info=mask_info) + return func + + +def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, + mask_info=None, allow_fill=True): + """ + Specialized Cython take which sets NaN values in one pass + + Parameters + ---------- + arr : ndarray + Input array + indexer : ndarray + 1-D array of indices to take, subarrays corresponding to -1 value + indicies are filed with fill_value + axis : int, default 0 + Axis to take from + out : ndarray or None, default None + Optional output array, must be appropriate type to hold input and + fill_value together, if indexer has any -1 value entries; call + common._maybe_promote to determine this type for any fill_value + fill_value : any, default np.nan + Fill value to replace -1 values with + mask_info : tuple of (ndarray, boolean) + If provided, value should correspond to: + (indexer != -1, (indexer != -1).any()) + If not provided, it will be computed internally if necessary + allow_fill : boolean, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. 
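An indicative example of the fill and promotion behaviour described in the take_nd parameters above: a -1 entry in the indexer is filled with fill_value, and an integer input is promoted (via _maybe_promote) when the fill value does not fit its dtype.

>>> import numpy as np
>>> import pandas.core.common as com
>>> arr = np.array([10, 20, 30], dtype='int64')
>>> com.take_nd(arr, np.array([2, -1, 0]))               # default NaN fill forces float64
array([ 30.,  nan,  10.])
>>> com.take_nd(arr, np.array([2, -1, 0]), fill_value=-999)
array([  30, -999,   10])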
+ """ + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = _ensure_int64(indexer) + if not allow_fill: + dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False + else: + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + if mask_info is not None: + mask, needs_masking = mask_info + else: + mask = indexer == -1 + needs_masking = mask.any() + mask_info = mask, needs_masking + if needs_masking: + if out is not None and out.dtype != dtype: + raise TypeError('Incompatible type for fill_value') + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + flip_order = False + if arr.ndim == 2: + if arr.flags.f_contiguous: + flip_order = True + + if flip_order: + arr = arr.T + axis = arr.ndim - axis - 1 + if out is not None: + out = out.T + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + if out is None: + out_shape = list(arr.shape) + out_shape[axis] = len(indexer) + out_shape = tuple(out_shape) + if arr.flags.f_contiguous and axis == arr.ndim - 1: + # minor tweak that can make an order-of-magnitude difference + # for dataframes initialized directly from 2-d ndarrays + # (s.t. df.values is c-contiguous and df._data.blocks[0] is its + # f-contiguous transpose) + out = np.empty(out_shape, dtype=dtype, order='F') + else: + out = np.empty(out_shape, dtype=dtype) + + func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, + axis=axis, mask_info=mask_info) + + func(arr, indexer, out, fill_value) + + if flip_order: + out = out.T + return out + + +take_1d = take_nd + + +def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, + mask_info=None, allow_fill=True): + """ + Specialized Cython take which sets NaN values in one pass + """ + if indexer is None or (indexer[0] is None and indexer[1] is None): + row_idx = np.arange(arr.shape[0], dtype=np.int64) + col_idx = np.arange(arr.shape[1], dtype=np.int64) + indexer = row_idx, col_idx + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + row_idx, col_idx = indexer + if row_idx is None: + row_idx = np.arange(arr.shape[0], dtype=np.int64) + else: + row_idx = _ensure_int64(row_idx) + if col_idx is None: + col_idx = np.arange(arr.shape[1], dtype=np.int64) + else: + col_idx = _ensure_int64(col_idx) + indexer = row_idx, col_idx + if not allow_fill: + dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False + else: + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + if mask_info is not None: + (row_mask, col_mask), (row_needs, col_needs) = mask_info + else: + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + if row_needs or col_needs: + if out is not None and out.dtype != dtype: + raise TypeError('Incompatible type for 
fill_value') + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + if out is None: + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) + + func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) + if func is None and arr.dtype != out.dtype: + func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) + if func is not None: + func = _convert_wrapper(func, out.dtype) + if func is None: + def func(arr, indexer, out, fill_value=np.nan): + _take_2d_multi_generic(arr, indexer, out, + fill_value=fill_value, mask_info=mask_info) + func(arr, indexer, out=out, fill_value=fill_value) + return out + + +_diff_special = { + 'float64': algos.diff_2d_float64, + 'float32': algos.diff_2d_float32, + 'int64': algos.diff_2d_int64, + 'int32': algos.diff_2d_int32, + 'int16': algos.diff_2d_int16, + 'int8': algos.diff_2d_int8, +} + + +def diff(arr, n, axis=0): + """ difference of n between self, + analagoust to s-s.shift(n) """ + + n = int(n) + dtype = arr.dtype + na = np.nan + + if is_timedelta64_dtype(arr) or is_datetime64_dtype(arr): + dtype = 'timedelta64[ns]' + arr = arr.view('i8') + na = tslib.iNaT + elif issubclass(dtype.type, np.integer): + dtype = np.float64 + elif issubclass(dtype.type, np.bool_): + dtype = np.object_ + + out_arr = np.empty(arr.shape, dtype=dtype) + + na_indexer = [slice(None)] * arr.ndim + na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None) + out_arr[tuple(na_indexer)] = na + + if arr.ndim == 2 and arr.dtype.name in _diff_special: + f = _diff_special[arr.dtype.name] + f(arr, out_arr, n, axis) + else: + res_indexer = [slice(None)] * arr.ndim + res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n) + res_indexer = tuple(res_indexer) + + lag_indexer = [slice(None)] * arr.ndim + lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None) + lag_indexer = tuple(lag_indexer) + + # need to make sure that we account for na for datelike/timedelta + # we don't actually want to subtract these i8 numbers + if dtype == 'timedelta64[ns]': + res = arr[res_indexer] + lag = arr[lag_indexer] + + mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na) + if mask.any(): + res = res.copy() + res[mask] = 0 + lag = lag.copy() + lag[mask] = 0 + + result = res - lag + result[mask] = na + out_arr[res_indexer] = result + else: + out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] + + return out_arr + + +def _coerce_to_dtypes(result, dtypes): + """ given a dtypes and a result set, coerce the result elements to the + dtypes + """ + if len(result) != len(dtypes): + raise AssertionError("_coerce_to_dtypes requires equal len arrays") + + from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + + def conv(r, dtype): + try: + if isnull(r): + pass + elif dtype == _NS_DTYPE: + r = lib.Timestamp(r) + elif dtype == _TD_DTYPE: + r = _coerce_scalar_to_timedelta_type(r) + elif dtype == np.bool_: + # messy. non 0/1 integers do not get converted. 
+ if is_integer(r) and r not in [0,1]: + return int(r) + r = bool(r) + elif dtype.kind == 'f': + r = float(r) + elif dtype.kind == 'i': + r = int(r) + except: + pass + + return r + + return [conv(r, dtype) for r, dtype in zip(result, dtypes)] + + +def _infer_dtype_from_scalar(val): + """ interpret the dtype from a scalar, upcast floats and ints + return the new value and the dtype """ + + dtype = np.object_ + + # a 1-element ndarray + if isinstance(val, pa.Array): + if val.ndim != 0: + raise ValueError( + "invalid ndarray passed to _infer_dtype_from_scalar") + + dtype = val.dtype + val = val.item() + + elif isinstance(val, compat.string_types): + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + + dtype = np.object_ + + elif isinstance(val, (np.datetime64, datetime)) and getattr(val,'tz',None) is None: + val = lib.Timestamp(val).value + dtype = np.dtype('M8[ns]') + + elif isinstance(val, (np.timedelta64, timedelta)): + val = tslib.convert_to_timedelta(val,'ns') + dtype = np.dtype('m8[ns]') + + elif is_bool(val): + dtype = np.bool_ + + # provide implicity upcast on scalars + elif is_integer(val): + dtype = np.int64 + + elif is_float(val): + dtype = np.float64 + + elif is_complex(val): + dtype = np.complex_ + + return dtype, val + + +def _maybe_cast_scalar(dtype, value): + """ if we a scalar value and are casting to a dtype that needs nan -> NaT + conversion + """ + if np.isscalar(value) and dtype in _DATELIKE_DTYPES and isnull(value): + return tslib.iNaT + return value + + +def _maybe_promote(dtype, fill_value=np.nan): + + # if we passed an array here, determine the fill value by dtype + if isinstance(fill_value, np.ndarray): + if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): + fill_value = tslib.iNaT + else: + + # we need to change to object type as our + # fill_value is of object type + if fill_value.dtype == np.object_: + dtype = np.dtype(np.object_) + fill_value = np.nan + + # returns tuple of (dtype, fill_value) + if issubclass(dtype.type, (np.datetime64, np.timedelta64)): + # for now: refuse to upcast datetime64 + # (this is because datetime64 will not implicitly upconvert + # to object correctly as of numpy 1.6.1) + if isnull(fill_value): + fill_value = tslib.iNaT + else: + if issubclass(dtype.type, np.datetime64): + try: + fill_value = lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast + # to object (but numpy 1.6.1 doesn't do this properly) + fill_value = tslib.iNaT + else: + fill_value = tslib.iNaT + elif is_float(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, np.integer): + dtype = np.float64 + elif is_bool(fill_value): + if not issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif is_integer(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, np.integer): + # upcast to prevent overflow + arr = np.asarray(fill_value) + if arr != arr.astype(dtype): + dtype = arr.dtype + elif is_complex(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, (np.integer, np.floating)): + dtype = np.complex128 + else: + dtype = np.object_ + + # in case we have a string that looked like a number + if issubclass(np.dtype(dtype).type, 
compat.string_types): + dtype = np.object_ + + return dtype, fill_value + + +def _maybe_upcast_putmask(result, mask, other, dtype=None, change=None): + """ a safe version of put mask that (potentially upcasts the result + return the result + if change is not None, then MUTATE the change (and change the dtype) + return a changed flag + """ + + if mask.any(): + + other = _maybe_cast_scalar(result.dtype, other) + + def changeit(): + + # try to directly set by expanding our array to full + # length of the boolean + try: + om = other[mask] + om_at = om.astype(result.dtype) + if (om == om_at).all(): + new_other = result.values.copy() + new_other[mask] = om_at + result[:] = new_other + return result, False + except: + pass + + # we are forced to change the dtype of the result as the input + # isn't compatible + r, fill_value = _maybe_upcast( + result, fill_value=other, dtype=dtype, copy=True) + np.putmask(r, mask, other) + + # we need to actually change the dtype here + if change is not None: + + # if we are trying to do something unsafe + # like put a bigger dtype in a smaller one, use the smaller one + # pragma: no cover + if change.dtype.itemsize < r.dtype.itemsize: + raise AssertionError( + "cannot change dtype of input to smaller size") + change.dtype = r.dtype + change[:] = r + + return r, True + + # we want to decide whether putmask will work + # if we have nans in the False portion of our mask then we need to + # upcast (possibily) otherwise we DON't want to upcast (e.g. if we are + # have values, say integers in the success portion then its ok to not + # upcast) + new_dtype, fill_value = _maybe_promote(result.dtype, other) + if new_dtype != result.dtype: + + # we have a scalar or len 0 ndarray + # and its nan and we are changing some values + if (np.isscalar(other) or + (isinstance(other, np.ndarray) and other.ndim < 1)): + if isnull(other): + return changeit() + + # we have an ndarray and the masking has nans in it + else: + + if isnull(other[mask]).any(): + return changeit() + + try: + np.putmask(result, mask, other) + except: + return changeit() + + return result, False + + +def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): + """ provide explict type promotion and coercion + + Parameters + ---------- + values : the ndarray that we want to maybe upcast + fill_value : what we want to fill with + dtype : if None, then use the dtype of the values, else coerce to this type + copy : if True always make a copy even if no upcast is required + """ + + if dtype is None: + dtype = values.dtype + new_dtype, fill_value = _maybe_promote(dtype, fill_value) + if new_dtype != values.dtype: + values = values.astype(new_dtype) + elif copy: + values = values.copy() + return values, fill_value + + +def _possibly_cast_item(obj, item, dtype): + chunk = obj[item] + + if chunk.values.dtype != dtype: + if dtype in (np.object_, np.bool_): + obj[item] = chunk.astype(np.object_) + elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover + raise ValueError("Unexpected dtype encountered: %s" % dtype) + + +def _possibly_downcast_to_dtype(result, dtype): + """ try to cast to the specified dtype (e.g. 
convert back to bool/int + or could be an astype of float64->float32 + """ + + if np.isscalar(result): + return result + + trans = lambda x: x + if isinstance(dtype, compat.string_types): + if dtype == 'infer': + inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) + if inferred_type == 'boolean': + dtype = 'bool' + elif inferred_type == 'integer': + dtype = 'int64' + elif inferred_type == 'datetime64': + dtype = 'datetime64[ns]' + elif inferred_type == 'timedelta64': + dtype = 'timedelta64[ns]' + + # try to upcast here + elif inferred_type == 'floating': + dtype = 'int64' + if issubclass(result.dtype.type, np.number): + trans = lambda x: x.round() + + else: + dtype = 'object' + + if isinstance(dtype, compat.string_types): + dtype = np.dtype(dtype) + + try: + + # don't allow upcasts here (except if empty) + if dtype.kind == result.dtype.kind: + if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): + return result + + if issubclass(dtype.type, np.floating): + return result.astype(dtype) + elif dtype == np.bool_ or issubclass(dtype.type, np.integer): + + # if we don't have any elements, just astype it + if not np.prod(result.shape): + return trans(result).astype(dtype) + + # do a test on the first element, if it fails then we are done + r = result.ravel() + arr = np.array([r[0]]) + if not np.allclose(arr, trans(arr).astype(dtype)): + return result + + # a comparable, e.g. a Decimal may slip in here + elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, + float, bool)): + return result + + if (issubclass(result.dtype.type, (np.object_, np.number)) and + notnull(result).all()): + new_result = trans(result).astype(dtype) + try: + if np.allclose(new_result, result): + return new_result + except: + + # comparison of an object dtype with a number type could + # hit here + if (new_result == result).all(): + return new_result + + # a datetimelike + elif dtype.kind in ['M','m'] and result.dtype.kind in ['i']: + try: + result = result.astype(dtype) + except: + pass + + except: + pass + + return result + + +def _lcd_dtypes(a_dtype, b_dtype): + """ return the lcd dtype to hold these types """ + + if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): + return _NS_DTYPE + elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): + return _TD_DTYPE + elif is_complex_dtype(a_dtype): + if is_complex_dtype(b_dtype): + return a_dtype + return np.float64 + elif is_integer_dtype(a_dtype): + if is_integer_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + return np.int64 + return np.float64 + elif is_float_dtype(a_dtype): + if is_float_dtype(b_dtype): + if a_dtype.itemsize == b_dtype.itemsize: + return a_dtype + else: + return np.float64 + elif is_integer(b_dtype): + return np.float64 + return np.object + + +def _fill_zeros(result, x, y, name, fill): + """ + if this is a reversed op, then flip x,y + + if we have an integer value (or array in y) + and we have 0's, fill them with the fill, + return the result + + mask the nan's from x + """ + + if fill is not None: + + if name.startswith('r'): + x,y = y,x + + + if not isinstance(y, np.ndarray): + dtype, value = _infer_dtype_from_scalar(y) + y = pa.empty(result.shape, dtype=dtype) + y.fill(value) + + if is_integer_dtype(y): + + if (y.ravel() == 0).any(): + shape = result.shape + result = result.ravel().astype('float64') + + # GH 7325, mask and nans must be broadcastable + signs = np.sign(result) + mask = ((y == 0) & ~np.isnan(x)).ravel() + + np.putmask(result, mask, fill) + + # 
if we have a fill of inf, then sign it + # correctly + # GH 6178 + if np.isinf(fill): + np.putmask(result,signs<0 & mask, -fill) + + result = result.reshape(shape) + + return result + + +def _interp_wrapper(f, wrap_dtype, na_override=None): + def wrapper(arr, mask, limit=None): + view = arr.view(wrap_dtype) + f(view, mask, limit=limit) + return wrapper + + +_pad_1d_datetime = _interp_wrapper(algos.pad_inplace_int64, np.int64) +_pad_2d_datetime = _interp_wrapper(algos.pad_2d_inplace_int64, np.int64) +_backfill_1d_datetime = _interp_wrapper(algos.backfill_inplace_int64, + np.int64) +_backfill_2d_datetime = _interp_wrapper(algos.backfill_2d_inplace_int64, + np.int64) + + +def pad_1d(values, limit=None, mask=None, dtype=None): + + if dtype is None: + dtype = values.dtype + _method = None + if is_float_dtype(values): + _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + _method = _pad_1d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.pad_inplace_float64 + elif values.dtype == np.object_: + _method = algos.pad_inplace_object + + if _method is None: + raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) + + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + _method(values, mask, limit=limit) + return values + + +def backfill_1d(values, limit=None, mask=None, dtype=None): + + if dtype is None: + dtype = values.dtype + _method = None + if is_float_dtype(values): + _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + _method = _backfill_1d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.backfill_inplace_float64 + elif values.dtype == np.object_: + _method = algos.backfill_inplace_object + + if _method is None: + raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype.name) + + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + + _method(values, mask, limit=limit) + return values + + +def pad_2d(values, limit=None, mask=None, dtype=None): + + if dtype is None: + dtype = values.dtype + _method = None + if is_float_dtype(values): + _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + _method = _pad_2d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.pad_2d_inplace_float64 + elif values.dtype == np.object_: + _method = algos.pad_2d_inplace_object + + if _method is None: + raise ValueError('Invalid dtype for pad_2d [%s]' % dtype.name) + + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + + if np.all(values.shape): + _method(values, mask, limit=limit) + else: + # for test coverage + pass + return values + + +def backfill_2d(values, limit=None, mask=None, dtype=None): + + if dtype is None: + dtype = values.dtype + _method = None + if is_float_dtype(values): + _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + _method = _backfill_2d_datetime + elif is_integer_dtype(values): + values = _ensure_float64(values) + _method = algos.backfill_2d_inplace_float64 + elif values.dtype == np.object_: + _method = algos.backfill_2d_inplace_object + + if _method is None: + raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) + + if mask is None: + mask = isnull(values) + 
mask = mask.view(np.uint8) + + if np.all(values.shape): + _method(values, mask, limit=limit) + else: + # for test coverage + pass + return values + + +def _clean_interp_method(method, order=None, **kwargs): + valid = ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear', + 'quadratic', 'cubic', 'barycentric', 'polynomial', + 'krogh', 'piecewise_polynomial', + 'pchip', 'spline'] + if method in ('spline', 'polynomial') and order is None: + raise ValueError("You must specify the order of the spline or " + "polynomial.") + if method not in valid: + raise ValueError("method must be one of {0}." + "Got '{1}' instead.".format(valid, method)) + return method + + +def interpolate_1d(xvalues, yvalues, method='linear', limit=None, + fill_value=None, bounds_error=False, **kwargs): + """ + Logic for the 1-d interpolation. The result should be 1-d, inputs + xvalues and yvalues will each be 1-d arrays of the same length. + + Bounds_error is currently hardcoded to False since non-scipy ones don't + take it as an argumnet. + """ + # Treat the original, non-scipy methods first. + + invalid = isnull(yvalues) + valid = ~invalid + + valid_y = yvalues[valid] + valid_x = xvalues[valid] + new_x = xvalues[invalid] + + if method == 'time': + if not getattr(xvalues, 'is_all_dates', None): + # if not issubclass(xvalues.dtype.type, np.datetime64): + raise ValueError('time-weighted interpolation only works ' + 'on Series or DataFrames with a ' + 'DatetimeIndex') + method = 'values' + + def _interp_limit(invalid, limit): + """mask off values that won't be filled since they exceed the limit""" + all_nans = np.where(invalid)[0] + violate = [invalid[x:x + limit + 1] for x in all_nans] + violate = np.array([x.all() & (x.size > limit) for x in violate]) + return all_nans[violate] + limit + + xvalues = getattr(xvalues, 'values', xvalues) + yvalues = getattr(yvalues, 'values', yvalues) + + if limit: + violate_limit = _interp_limit(invalid, limit) + if valid.any(): + firstIndex = valid.argmax() + valid = valid[firstIndex:] + invalid = invalid[firstIndex:] + result = yvalues.copy() + if valid.all(): + return yvalues + else: + # have to call np.array(xvalues) since xvalues could be an Index + # which cant be mutated + result = np.empty_like(np.array(xvalues), dtype=np.float64) + result.fill(np.nan) + return result + + if method in ['linear', 'time', 'index', 'values']: + if method in ('values', 'index'): + inds = np.asarray(xvalues) + # hack for DatetimeIndex, #1646 + if issubclass(inds.dtype.type, np.datetime64): + inds = inds.view(pa.int64) + + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + else: + inds = xvalues + + inds = inds[firstIndex:] + + result[firstIndex:][invalid] = np.interp(inds[invalid], inds[valid], + yvalues[firstIndex:][valid]) + + if limit: + result[violate_limit] = np.nan + return result + + sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'krogh', 'spline', 'polynomial', + 'piecewise_polynomial', 'pchip'] + if method in sp_methods: + new_x = new_x[firstIndex:] + xvalues = xvalues[firstIndex:] + + result[firstIndex:][invalid] = _interpolate_scipy_wrapper( + valid_x, valid_y, new_x, method=method, fill_value=fill_value, + bounds_error=bounds_error, **kwargs) + if limit: + result[violate_limit] = np.nan + return result + + +def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, + bounds_error=False, order=None, **kwargs): + """ + passed off to scipy.interpolate.interp1d. method is scipy's kind. 
+ Returns an array interpolated at new_x. Add any new methods to + the list in _clean_interp_method + """ + try: + from scipy import interpolate + from pandas import DatetimeIndex + except ImportError: + raise ImportError('{0} interpolation requires Scipy'.format(method)) + + new_x = np.asarray(new_x) + + # ignores some kwargs that could be passed along. + alt_methods = { + 'barycentric': interpolate.barycentric_interpolate, + 'krogh': interpolate.krogh_interpolate, + 'piecewise_polynomial': interpolate.piecewise_polynomial_interpolate, + } + + if getattr(x, 'is_all_dates', False): + # GH 5975, scipy.interp1d can't hande datetime64s + x, new_x = x.values.astype('i8'), new_x.astype('i8') + + try: + alt_methods['pchip'] = interpolate.pchip_interpolate + except AttributeError: + if method == 'pchip': + raise ImportError("Your version of scipy does not support " + "PCHIP interpolation.") + + interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'polynomial'] + if method in interp1d_methods: + if method == 'polynomial': + method = order + terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value, + bounds_error=bounds_error) + new_y = terp(new_x) + elif method == 'spline': + terp = interpolate.UnivariateSpline(x, y, k=order) + new_y = terp(new_x) + else: + # GH 7295: need to be able to write for some reason + # in some circumstances: check all three + if not x.flags.writeable: + x = x.copy() + if not y.flags.writeable: + y = y.copy() + if not new_x.flags.writeable: + new_x = new_x.copy() + method = alt_methods[method] + new_y = method(x, y, new_x) + return new_y + + +def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, dtype=None): + """ perform an actual interpolation of values, values will be make 2-d if + needed fills inplace, returns the result + """ + + transf = (lambda x: x) if axis == 0 else (lambda x: x.T) + + # reshape a 1 dim if needed + ndim = values.ndim + if values.ndim == 1: + if axis != 0: # pragma: no cover + raise AssertionError("cannot interpolate on a ndim == 1 with " + "axis != 0") + values = values.reshape(tuple((1,) + values.shape)) + + if fill_value is None: + mask = None + else: # todo create faster fill func without masking + mask = mask_missing(transf(values), fill_value) + + method = _clean_fill_method(method) + if method == 'pad': + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + else: + values = transf(backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + + # reshape back + if ndim == 1: + values = values[0] + + return values + + +def _consensus_name_attr(objs): + name = objs[0].name + for obj in objs[1:]: + if obj.name != name: + return None + return name + + +_fill_methods = {'pad': pad_1d, 'backfill': backfill_1d} + + +def _get_fill_func(method): + method = _clean_fill_method(method) + return _fill_methods[method] + + +#---------------------------------------------------------------------- +# Lots of little utilities + +def _validate_date_like_dtype(dtype): + try: + typ = np.datetime_data(dtype)[0] + except ValueError as e: + raise TypeError('%s' % e) + if typ != 'generic' and typ != 'ns': + raise ValueError('%r is too specific of a frequency, try passing %r' + % (dtype.name, dtype.type.__name__)) + + +def _invalidate_string_dtypes(dtype_set): + """Change string like dtypes to object for ``DataFrame.select_dtypes()``.""" + non_string_dtypes = dtype_set - _string_dtypes + if non_string_dtypes != dtype_set: + raise TypeError("string dtypes are not allowed, use 
'object' instead") + + +def _get_dtype_from_object(dtype): + """Get a numpy dtype.type-style object. + + Notes + ----- + If nothing can be found, returns ``object``. + """ + # type object from a dtype + if isinstance(dtype, type) and issubclass(dtype, np.generic): + return dtype + elif isinstance(dtype, np.dtype): # dtype object + try: + _validate_date_like_dtype(dtype) + except TypeError: + # should still pass if we don't have a datelike + pass + return dtype.type + elif isinstance(dtype, compat.string_types): + if dtype == 'datetime' or dtype == 'timedelta': + dtype += '64' + try: + return _get_dtype_from_object(getattr(np, dtype)) + except AttributeError: + # handles cases like _get_dtype(int) + # i.e., python objects that are valid dtypes (unlike user-defined + # types, in general) + pass + return _get_dtype_from_object(np.dtype(dtype)) + + +_string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, + compat.text_type))) + + +def _get_info_slice(obj, indexer): + """Slice the info axis of `obj` with `indexer`.""" + if not hasattr(obj, '_info_axis_number'): + raise TypeError('object of type %r has no info axis' % + type(obj).__name__) + slices = [slice(None)] * obj.ndim + slices[obj._info_axis_number] = indexer + return tuple(slices) + + +def _maybe_box(indexer, values, obj, key): + + # if we have multiples coming back, box em + if isinstance(values, np.ndarray): + return obj[indexer.get_loc(key)] + + # return the value + return values + + +def _maybe_box_datetimelike(value): + # turn a datetime like into a Timestamp/timedelta as needed + + if isinstance(value, np.datetime64): + value = tslib.Timestamp(value) + elif isinstance(value, np.timedelta64): + pass + + return value + +_values_from_object = lib.values_from_object + +def _possibly_convert_objects(values, convert_dates=True, + convert_numeric=True, + convert_timedeltas=True): + """ if we have an object dtype, try to coerce dates and/or numbers """ + + # if we have passed in a list or scalar + if isinstance(values, (list, tuple)): + values = np.array(values, dtype=np.object_) + if not hasattr(values, 'dtype'): + values = np.array([values], dtype=np.object_) + + # convert dates + if convert_dates and values.dtype == np.object_: + + # we take an aggressive stance and convert to datetime64[ns] + if convert_dates == 'coerce': + new_values = _possibly_cast_to_datetime( + values, 'M8[ns]', coerce=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects( + values, convert_datetime=convert_dates) + + # convert timedeltas + if convert_timedeltas and values.dtype == np.object_: + + if convert_timedeltas == 'coerce': + from pandas.tseries.timedeltas import \ + _possibly_cast_to_timedelta + values = _possibly_cast_to_timedelta(values, coerce=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects( + values, convert_timedelta=convert_timedeltas) + + # convert to numeric + if values.dtype == np.object_: + if convert_numeric: + try: + new_values = lib.maybe_convert_numeric( + values, set(), coerce_numeric=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + except: + pass + else: + + # soft-conversion + values = lib.maybe_convert_objects(values) + + return values + + +def _possibly_castable(arr): + # return False to force a non-fastpath + + # check datetime64[ns]/timedelta64[ns] 
are valid + # otherwise try to coerce + kind = arr.dtype.kind + if kind == 'M' or kind == 'm': + return arr.dtype in _DATELIKE_DTYPES + + return arr.dtype.name not in _POSSIBLY_CAST_DTYPES + + +def _possibly_convert_platform(values): + """ try to do platform conversion, allow ndarray or list here """ + + if isinstance(values, (list, tuple)): + values = lib.list_to_object_array(values) + if getattr(values, 'dtype', None) == np.object_: + if hasattr(values, 'values'): + values = values.values + values = lib.maybe_convert_objects(values) + + return values + + +def _possibly_cast_to_datetime(value, dtype, coerce=False): + """ try to cast the array/value to a datetimelike dtype, converting float + nan to iNaT + """ + + if dtype is not None: + if isinstance(dtype, compat.string_types): + dtype = np.dtype(dtype) + + is_datetime64 = is_datetime64_dtype(dtype) + is_timedelta64 = is_timedelta64_dtype(dtype) + + if is_datetime64 or is_timedelta64: + + # force the dtype if needed + if is_datetime64 and dtype != _NS_DTYPE: + if dtype.name == 'datetime64[ns]': + dtype = _NS_DTYPE + else: + raise TypeError( + "cannot convert datetimelike to dtype [%s]" % dtype) + elif is_timedelta64 and dtype != _TD_DTYPE: + if dtype.name == 'timedelta64[ns]': + dtype = _TD_DTYPE + else: + raise TypeError( + "cannot convert timedeltalike to dtype [%s]" % dtype) + + if np.isscalar(value): + if value == tslib.iNaT or isnull(value): + value = tslib.iNaT + else: + value = np.array(value,copy=False) + + # have a scalar array-like (e.g. NaT) + if value.ndim == 0: + value = tslib.iNaT + + # we have an array of datetime or timedeltas & nulls + elif np.prod(value.shape) and value.dtype != dtype: + try: + if is_datetime64: + from pandas.tseries.tools import to_datetime + value = to_datetime(value, coerce=coerce).values + elif is_timedelta64: + from pandas.tseries.timedeltas import \ + _possibly_cast_to_timedelta + value = _possibly_cast_to_timedelta(value, coerce='compat', dtype=dtype) + except: + pass + + else: + + is_array = isinstance(value, np.ndarray) + + # catch a datetime/timedelta that is not of ns variety + # and no coercion specified + if (is_array and value.dtype.kind in ['M','m']): + dtype = value.dtype + + if dtype.kind == 'M' and dtype != _NS_DTYPE: + value = value.astype(_NS_DTYPE) + + elif dtype.kind == 'm' and dtype != _TD_DTYPE: + from pandas.tseries.timedeltas import \ + _possibly_cast_to_timedelta + value = _possibly_cast_to_timedelta(value, coerce='compat') + + # only do this if we have an array and the dtype of the array is not + # setup already we are not an integer/object, so don't bother with this + # conversion + elif (is_array and not ( + issubclass(value.dtype.type, np.integer) or + value.dtype == np.object_)): + pass + + # try to infer if we have a datetimelike here + # otherwise pass thru + else: + value = _possibly_infer_to_datetimelike(value) + + return value + + +def _possibly_infer_to_datetimelike(value): + # we might have a array (or single object) that is datetime like, + # and no dtype is passed don't change the value unless we find a + # datetime/timedelta set + + # this is pretty strict in that a datetime/timedelta is REQUIRED + # in addition to possible nulls/string likes + + # ONLY strings are NOT datetimelike + + v = value + if not is_list_like(v): + v = [v] + v = np.array(v,copy=False) + shape = v.shape + if not v.ndim == 1: + v = v.ravel() + + if len(v): + + def _try_datetime(v): + # safe coerce to datetime64 + try: + return tslib.array_to_datetime(v, raise_=True).reshape(shape) + 
except: + return v + + def _try_timedelta(v): + # safe coerce to timedelta64 + + # will try first with a string & object conversion + from pandas.tseries.timedeltas import to_timedelta + try: + return to_timedelta(v).values.reshape(shape) + except: + + # this is for compat with numpy < 1.7 + # but string-likes will fail here + + from pandas.tseries.timedeltas import \ + _possibly_cast_to_timedelta + try: + return _possibly_cast_to_timedelta(v, coerce='compat').reshape(shape) + except: + return v + + # do a quick inference for perf + sample = v[:min(3,len(v))] + inferred_type = lib.infer_dtype(sample) + + if inferred_type in ['datetime', 'datetime64']: + value = _try_datetime(v) + elif inferred_type in ['timedelta', 'timedelta64']: + value = _try_timedelta(v) + + # its possible to have nulls intermixed within the datetime or timedelta + # these will in general have an inferred_type of 'mixed', so have to try + # both datetime and timedelta + + # try timedelta first to avoid spurious datetime conversions + # e.g. '00:00:01' is a timedelta but technically is also a datetime + elif inferred_type in ['mixed']: + + if lib.is_possible_datetimelike_array(_ensure_object(v)): + value = _try_timedelta(v) + if lib.infer_dtype(value) in ['mixed']: + value = _try_datetime(v) + + return value + + +def _is_bool_indexer(key): + if isinstance(key, (ABCSeries, np.ndarray)): + if key.dtype == np.object_: + key = np.asarray(_values_from_object(key)) + + if not lib.is_bool_array(key): + if isnull(key).any(): + raise ValueError('cannot index with vector containing ' + 'NA / NaN values') + return False + return True + elif key.dtype == np.bool_: + return True + elif isinstance(key, list): + try: + arr = np.asarray(key) + return arr.dtype == np.bool_ and len(arr) == len(key) + except TypeError: # pragma: no cover + return False + + return False + + +def _default_index(n): + from pandas.core.index import Int64Index + values = np.arange(n, dtype=np.int64) + result = values.view(Int64Index) + result.name = None + result.is_unique = True + return result + + +def ensure_float(arr): + if issubclass(arr.dtype.type, (np.integer, np.bool_)): + arr = arr.astype(float) + return arr + + +def _mut_exclusive(**kwargs): + item1, item2 = kwargs.items() + label1, val1 = item1 + label2, val2 = item2 + if val1 is not None and val2 is not None: + raise TypeError('mutually exclusive arguments: %r and %r' % + (label1, label2)) + elif val1 is not None: + return val1 + else: + return val2 + + +def _any_none(*args): + for arg in args: + if arg is None: + return True + return False + + +def _all_not_none(*args): + for arg in args: + if arg is None: + return False + return True + + +def _try_sort(iterable): + listed = list(iterable) + try: + return sorted(listed) + except Exception: + return listed + + +def _count_not_none(*args): + return sum(x is not None for x in args) + +#------------------------------------------------------------------------------ +# miscellaneous python tools + + +def rands(n): + """Generates a random alphanumeric string of length *n*""" + from random import Random + import string + return ''.join(Random().sample(string.ascii_letters + string.digits, n)) + + +def adjoin(space, *lists): + """ + Glues together two sets of strings using the amount of space requested. + The idea is to prettify. 
+ """ + out_lines = [] + newLists = [] + lengths = [max(map(len, x)) + space for x in lists[:-1]] + + # not the last one + lengths.append(max(map(len, lists[-1]))) + + maxLen = max(map(len, lists)) + for i, lst in enumerate(lists): + nl = [x.ljust(lengths[i]) for x in lst] + nl.extend([' ' * lengths[i]] * (maxLen - len(lst))) + newLists.append(nl) + toJoin = zip(*newLists) + for lines in toJoin: + out_lines.append(_join_unicode(lines)) + return _join_unicode(out_lines, sep='\n') + + +def _join_unicode(lines, sep=''): + try: + return sep.join(lines) + except UnicodeDecodeError: + sep = compat.text_type(sep) + return sep.join([x.decode('utf-8') if isinstance(x, str) else x + for x in lines]) + + +def iterpairs(seq): + """ + Parameters + ---------- + seq: sequence + + Returns + ------- + iterator returning overlapping pairs of elements + + Examples + -------- + >>> iterpairs([1, 2, 3, 4]) + [(1, 2), (2, 3), (3, 4) + """ + # input may not be sliceable + seq_it = iter(seq) + seq_it_next = iter(seq) + next(seq_it_next) + + return zip(seq_it, seq_it_next) + + +def split_ranges(mask): + """ Generates tuples of ranges which cover all True value in mask + + >>> list(split_ranges([1,0,0,1,0])) + [(0, 1), (3, 4)] + """ + ranges = [(0, len(mask))] + + for pos, val in enumerate(mask): + if not val: # this pos should be ommited, split off the prefix range + r = ranges.pop() + if pos > r[0]: # yield non-zero range + yield (r[0], pos) + if pos + 1 < len(mask): # save the rest for processing + ranges.append((pos + 1, len(mask))) + if ranges: + yield ranges[-1] + + +def indent(string, spaces=4): + dent = ' ' * spaces + return '\n'.join([dent + x for x in string.split('\n')]) + + +def banner(message): + """ + Return 80-char width message declaration with = bars on top and bottom. + """ + bar = '=' * 80 + return '%s\n%s\n%s' % (bar, message, bar) + + +def _long_prod(vals): + result = long(1) + for x in vals: + result *= x + return result + + +class groupby(dict): + + """ + A simple groupby different from the one in itertools. + + Does not require the sequence elements to be sorted by keys, + however it is slower. 
+ """ + + def __init__(self, seq, key=lambda x: x): + for value in seq: + k = key(value) + self.setdefault(k, []).append(value) + try: + __iter__ = dict.iteritems + except AttributeError: # pragma: no cover + # Python 3 + def __iter__(self): + return iter(dict.items(self)) + + +def map_indices_py(arr): + """ + Returns a dictionary with (element, index) pairs for each element in the + given array/list + """ + return dict([(x, i) for i, x in enumerate(arr)]) + + +def union(*seqs): + result = set([]) + for seq in seqs: + if not isinstance(seq, set): + seq = set(seq) + result |= seq + return type(seqs[0])(list(result)) + + +def difference(a, b): + return type(a)(list(set(a) - set(b))) + + +def intersection(*seqs): + result = set(seqs[0]) + for seq in seqs: + if not isinstance(seq, set): + seq = set(seq) + result &= seq + return type(seqs[0])(list(result)) + + +def _asarray_tuplesafe(values, dtype=None): + from pandas.core.index import Index + + if not (isinstance(values, (list, tuple)) + or hasattr(values, '__array__')): + values = list(values) + elif isinstance(values, Index): + return values.values + + if isinstance(values, list) and dtype in [np.object_, object]: + return lib.list_to_object_array(values) + + result = np.asarray(values, dtype=dtype) + + if issubclass(result.dtype.type, compat.string_types): + result = np.asarray(values, dtype=object) + + if result.ndim == 2: + if isinstance(values, list): + return lib.list_to_object_array(values) + else: + # Making a 1D array that safely contains tuples is a bit tricky + # in numpy, leading to the following + try: + result = np.empty(len(values), dtype=object) + result[:] = values + except ValueError: + # we have a list-of-list + result[:] = [tuple(x) for x in values] + + return result + + +def _index_labels_to_array(labels): + if isinstance(labels, (compat.string_types, tuple)): + labels = [labels] + + if not isinstance(labels, (list, np.ndarray)): + try: + labels = list(labels) + except TypeError: # non-iterable + labels = [labels] + + labels = _asarray_tuplesafe(labels) + + return labels + + +def _maybe_make_list(obj): + if obj is not None and not isinstance(obj, (tuple, list)): + return [obj] + return obj + + +is_bool = lib.is_bool + + +is_integer = lib.is_integer + + +is_float = lib.is_float + + +is_complex = lib.is_complex + + +def is_iterator(obj): + # python 3 generators have __next__ instead of next + return hasattr(obj, 'next') or hasattr(obj, '__next__') + + +def is_number(obj): + return isinstance(obj, (numbers.Number, np.number)) + + +def _get_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype + if isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype) + return arr_or_dtype.dtype + + +def _get_dtype_type(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype.type + if isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype).type + return arr_or_dtype.dtype.type + + +def _is_any_int_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.integer) + + +def is_integer_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def _is_int_or_datetime_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) or + issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_datetime64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.datetime64) + 
+ +def is_datetime64_ns_dtype(arr_or_dtype): + tipo = _get_dtype(arr_or_dtype) + return tipo == _NS_DTYPE + + +def is_timedelta64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.timedelta64) + + +def is_timedelta64_ns_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return tipo == _TD_DTYPE + + +def _is_datetime_or_timedelta_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, (np.datetime64, np.timedelta64)) + + +needs_i8_conversion = _is_datetime_or_timedelta_dtype + + +def is_numeric_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, (np.number, np.bool_)) + and not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_float_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.floating) + + +def _is_floating_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return isinstance(tipo, np.floating) + + +def is_bool_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.bool_) + + +def is_complex_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.complexfloating) + + +def is_object_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.object_) + + +def is_re(obj): + return isinstance(obj, re._pattern_type) + + +def is_re_compilable(obj): + try: + re.compile(obj) + except TypeError: + return False + else: + return True + + +def is_list_like(arg): + return (hasattr(arg, '__iter__') and + not isinstance(arg, compat.string_and_binary_types)) + + +def _is_sequence(x): + try: + iter(x) + len(x) # it has a length + return not isinstance(x, compat.string_and_binary_types) + except (TypeError, AttributeError): + return False + + +_ensure_float64 = algos.ensure_float64 +_ensure_float32 = algos.ensure_float32 +_ensure_int64 = algos.ensure_int64 +_ensure_int32 = algos.ensure_int32 +_ensure_int16 = algos.ensure_int16 +_ensure_int8 = algos.ensure_int8 +_ensure_platform_int = algos.ensure_platform_int +_ensure_object = algos.ensure_object + + +def _astype_nansafe(arr, dtype, copy=True): + """ return a view if copy is False, but + need to be very careful as the result shape could change! 
""" + if not isinstance(dtype, np.dtype): + dtype = np.dtype(dtype) + + if is_datetime64_dtype(arr): + if dtype == object: + return tslib.ints_to_pydatetime(arr.view(np.int64)) + elif dtype == np.int64: + return arr.view(dtype) + elif dtype != _NS_DTYPE: + raise TypeError("cannot astype a datetimelike from [%s] to [%s]" % + (arr.dtype, dtype)) + return arr.astype(_NS_DTYPE) + elif is_timedelta64_dtype(arr): + if dtype == np.int64: + return arr.view(dtype) + elif dtype == object: + return arr.astype(object) + + # in py3, timedelta64[ns] are int64 + elif ((compat.PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or + (not compat.PY3 and dtype != _TD_DTYPE)): + + # allow frequency conversions + if dtype.kind == 'm': + mask = isnull(arr) + result = arr.astype(dtype).astype(np.float64) + result[mask] = np.nan + return result + + raise TypeError("cannot astype a timedelta from [%s] to [%s]" % + (arr.dtype, dtype)) + + return arr.astype(_TD_DTYPE) + elif (np.issubdtype(arr.dtype, np.floating) and + np.issubdtype(dtype, np.integer)): + + if np.isnan(arr).any(): + raise ValueError('Cannot convert NA to integer') + elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): + # work around NumPy brokenness, #1987 + return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) + elif issubclass(dtype.type, compat.string_types): + return lib.astype_str(arr.ravel()).reshape(arr.shape) + + if copy: + return arr.astype(dtype) + return arr.view(dtype) + + +def _clean_fill_method(method): + if method is None: + return None + method = method.lower() + if method == 'ffill': + method = 'pad' + if method == 'bfill': + method = 'backfill' + if method not in ['pad', 'backfill']: + msg = ('Invalid fill method. Expecting pad (ffill) or backfill ' + '(bfill). Got %s' % method) + raise ValueError(msg) + return method + + +def _all_none(*args): + for arg in args: + if arg is not None: + return False + return True + + +class UTF8Recoder: + + """ + Iterator that reads an encoded stream and reencodes the input to UTF-8 + """ + + def __init__(self, f, encoding): + self.reader = codecs.getreader(encoding)(f) + + def __iter__(self): + return self + + def read(self, bytes=-1): + return self.reader.read(bytes).encode('utf-8') + + def readline(self): + return self.reader.readline().encode('utf-8') + + def next(self): + return next(self.reader).encode("utf-8") + + # Python 3 iterator + __next__ = next + + +def _get_handle(path, mode, encoding=None, compression=None): + """Gets file handle for given path and mode. + NOTE: Under Python 3.2, getting a compressed file handle means reading in + the entire file, decompressing it and decoding it to ``str`` all at once + and then wrapping it in a StringIO. 
+ """ + if compression is not None: + if encoding is not None and not compat.PY3: + msg = 'encoding + compression not yet supported in Python 2' + raise ValueError(msg) + + if compression == 'gzip': + import gzip + f = gzip.GzipFile(path, 'rb') + elif compression == 'bz2': + import bz2 + + f = bz2.BZ2File(path, 'rb') + else: + raise ValueError('Unrecognized compression type: %s' % + compression) + if compat.PY3_2: + # gzip and bz2 don't work with TextIOWrapper in 3.2 + encoding = encoding or get_option('display.encoding') + f = StringIO(f.read().decode(encoding)) + elif compat.PY3: + from io import TextIOWrapper + f = TextIOWrapper(f, encoding=encoding) + return f + else: + if compat.PY3: + if encoding: + f = open(path, mode, encoding=encoding) + else: + f = open(path, mode, errors='replace') + else: + f = open(path, mode) + + return f + + +if compat.PY3: # pragma: no cover + def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): + # ignore encoding + return csv.reader(f, dialect=dialect, **kwds) + + def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): + return csv.writer(f, dialect=dialect, **kwds) +else: + class UnicodeReader: + + """ + A CSV reader which will iterate over lines in the CSV file "f", + which is encoded in the given encoding. + + On Python 3, this is replaced (below) by csv.reader, which handles + unicode. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + f = UTF8Recoder(f, encoding) + self.reader = csv.reader(f, dialect=dialect, **kwds) + + def next(self): + row = next(self.reader) + return [compat.text_type(s, "utf-8") for s in row] + + # python 3 iterator + __next__ = next + + def __iter__(self): # pragma: no cover + return self + + class UnicodeWriter: + + """ + A CSV writer which will write rows to CSV file "f", + which is encoded in the given encoding. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + # Redirect output to a queue + self.queue = StringIO() + self.writer = csv.writer(self.queue, dialect=dialect, **kwds) + self.stream = f + self.encoder = codecs.getincrementalencoder(encoding)() + self.quoting = kwds.get("quoting", None) + + def writerow(self, row): + def _check_as_is(x): + return (self.quoting == csv.QUOTE_NONNUMERIC and + is_number(x)) or isinstance(x, str) + + row = [x if _check_as_is(x) + else pprint_thing(x).encode('utf-8') for x in row] + + self.writer.writerow([s for s in row]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + + def writerows(self, rows): + def _check_as_is(x): + return (self.quoting == csv.QUOTE_NONNUMERIC and + is_number(x)) or isinstance(x, str) + + for i, row in enumerate(rows): + rows[i] = [x if _check_as_is(x) + else pprint_thing(x).encode('utf-8') for x in row] + + self.writer.writerows([[s for s in row] for row in rows]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... 
and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + + +def _concat_compat(to_concat, axis=0): + # filter empty arrays + nonempty = [x for x in to_concat if x.shape[axis] > 0] + + # If all arrays are empty, there's nothing to convert, just short-cut to + # the concatenation, #3121. + # + # Creating an empty array directly is tempting, but the winnings would be + # marginal given that it would still require shape & dtype calculation and + # np.concatenate which has them both implemented is compiled. + if nonempty: + is_datetime64 = [x.dtype == _NS_DTYPE for x in nonempty] + if all(is_datetime64): + # work around NumPy 1.6 bug + new_values = np.concatenate([x.view(np.int64) for x in nonempty], + axis=axis) + return new_values.view(_NS_DTYPE) + elif any(is_datetime64): + to_concat = [_to_pydatetime(x) for x in nonempty] + + return np.concatenate(to_concat, axis=axis) + + +def _to_pydatetime(x): + if x.dtype == _NS_DTYPE: + shape = x.shape + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) + x = x.reshape(shape) + + return x + + +def _where_compat(mask, arr1, arr2): + if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE: + new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8')) + return new_vals.view(_NS_DTYPE) + + import pandas.tslib as tslib + if arr1.dtype == _NS_DTYPE: + arr1 = tslib.ints_to_pydatetime(arr1.view('i8')) + if arr2.dtype == _NS_DTYPE: + arr2 = tslib.ints_to_pydatetime(arr2.view('i8')) + + return np.where(mask, arr1, arr2) + + +def sentinel_factory(): + class Sentinel(object): + pass + + return Sentinel() + + +def in_interactive_session(): + """ check if we're running in an interactive shell + + returns True if running under python/ipython interactive shell + """ + def check_main(): + import __main__ as main + return (not hasattr(main, '__file__') or + get_option('mode.sim_interactive')) + + try: + return __IPYTHON__ or check_main() + except: + return check_main() + + +def in_qtconsole(): + """ + check if we're inside an IPython qtconsole + + DEPRECATED: This is no longer needed, or working, in IPython 3 and above. + """ + try: + ip = get_ipython() + front_end = ( + ip.config.get('KernelApp', {}).get('parent_appname', "") or + ip.config.get('IPKernelApp', {}).get('parent_appname', "") + ) + if 'qtconsole' in front_end.lower(): + return True + except: + return False + return False + + +def in_ipnb(): + """ + check if we're inside an IPython Notebook + + DEPRECATED: This is no longer used in pandas, and won't work in IPython 3 + and above. + """ + try: + ip = get_ipython() + front_end = ( + ip.config.get('KernelApp', {}).get('parent_appname', "") or + ip.config.get('IPKernelApp', {}).get('parent_appname', "") + ) + if 'notebook' in front_end.lower(): + return True + except: + return False + return False + + +def in_ipython_frontend(): + """ + check if we're inside an an IPython zmq frontend + """ + try: + ip = get_ipython() + return 'zmq' in str(type(ip)).lower() + except: + pass + + return False + +# Unicode consolidation +# --------------------- +# +# pprinting utility functions for generating Unicode text or +# bytes(3.x)/str(2.x) representations of objects. +# Try to use these as much as possible rather then rolling your own. +# +# When to use +# ----------- +# +# 1) If you're writing code internal to pandas (no I/O directly involved), +# use pprint_thing(). 
+# +# It will always return unicode text which can handled by other +# parts of the package without breakage. +# +# 2) If you need to send something to the console, use console_encode(). +# +# console_encode() should (hopefully) choose the right encoding for you +# based on the encoding set in option "display.encoding" +# +# 3) if you need to write something out to file, use +# pprint_thing_encoded(encoding). +# +# If no encoding is specified, it defaults to utf-8. Since encoding pure +# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're +# working with straight ascii. + + +def _pprint_seq(seq, _nest_lvl=0, **kwds): + """ + internal. pprinter for iterables. you should probably use pprint_thing() + rather then calling this directly. + + bounds length of printed sequence, depending on options + """ + if isinstance(seq, set): + fmt = u("set([%s])") + else: + fmt = u("[%s]") if hasattr(seq, '__setitem__') else u("(%s)") + + nitems = get_option("max_seq_items") or len(seq) + + s = iter(seq) + r = [] + for i in range(min(nitems, len(seq))): # handle sets, no slicing + r.append(pprint_thing(next(s), _nest_lvl + 1, **kwds)) + body = ", ".join(r) + + if nitems < len(seq): + body += ", ..." + elif isinstance(seq, tuple) and len(seq) == 1: + body += ',' + + return fmt % body + + +def _pprint_dict(seq, _nest_lvl=0, **kwds): + """ + internal. pprinter for iterables. you should probably use pprint_thing() + rather then calling this directly. + """ + fmt = u("{%s}") + pairs = [] + + pfmt = u("%s: %s") + + nitems = get_option("max_seq_items") or len(seq) + + for k, v in list(seq.items())[:nitems]: + pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, **kwds), + pprint_thing(v, _nest_lvl + 1, **kwds))) + + if nitems < len(seq): + return fmt % (", ".join(pairs) + ", ...") + else: + return fmt % ", ".join(pairs) + + +def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, + quote_strings=False): + """ + This function is the sanctioned way of converting objects + to a unicode representation. + + properly handles nested sequences containing unicode strings + (unicode(object) does not) + + Parameters + ---------- + thing : anything to be formatted + _nest_lvl : internal use only. pprint_thing() is mutually-recursive + with pprint_sequence, this argument is used to keep track of the + current nesting level, and limit it. + escape_chars : list or dict, optional + Characters to escape. If a dict is passed the values are the + replacements + default_escapes : bool, default False + Whether the input escape characters replaces or adds to the defaults + + Returns + ------- + result - unicode object on py2, str on py3. Always Unicode. + + """ + def as_escaped_unicode(thing, escape_chars=escape_chars): + # Unicode is fine, else we try to decode using utf-8 and 'replace' + # if that's not it either, we have no way of knowing and the user + # should deal with it himself. 
+ + try: + result = compat.text_type(thing) # we should try this first + except UnicodeDecodeError: + # either utf-8 or we replace errors + result = str(thing).decode('utf-8', "replace") + + translate = {'\t': r'\t', + '\n': r'\n', + '\r': r'\r', + } + if isinstance(escape_chars, dict): + if default_escapes: + translate.update(escape_chars) + else: + translate = escape_chars + escape_chars = list(escape_chars.keys()) + else: + escape_chars = escape_chars or tuple() + for c in escape_chars: + result = result.replace(c, translate[c]) + + return compat.text_type(result) + + if (compat.PY3 and hasattr(thing, '__next__')) or hasattr(thing, 'next'): + return compat.text_type(thing) + elif (isinstance(thing, dict) and + _nest_lvl < get_option("display.pprint_nest_depth")): + result = _pprint_dict(thing, _nest_lvl, quote_strings=True) + elif _is_sequence(thing) and _nest_lvl < \ + get_option("display.pprint_nest_depth"): + result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, + quote_strings=quote_strings) + elif isinstance(thing, compat.string_types) and quote_strings: + if compat.PY3: + fmt = "'%s'" + else: + fmt = "u'%s'" + result = fmt % as_escaped_unicode(thing) + else: + result = as_escaped_unicode(thing) + + return compat.text_type(result) # always unicode + + +def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds): + value = pprint_thing(object) # get unicode representation of object + return value.encode(encoding, errors, **kwds) + + +def console_encode(object, **kwds): + """ + this is the sanctioned way to prepare something for + sending *to the console*, it delegates to pprint_thing() to get + a unicode representation of the object relies on the global encoding + set in display.encoding. Use this everywhere + where you output to the console. + """ + return pprint_thing_encoded(object, + get_option("display.encoding")) + + +def load(path): # TODO remove in 0.13 + """ + Load pickled pandas object (or any other pickled object) from the specified + file path + + Warning: Loading pickled data received from untrusted sources can be + unsafe. See: http://docs.python.org/2.7/library/pickle.html + + Parameters + ---------- + path : string + File path + + Returns + ------- + unpickled : type of object stored in file + """ + import warnings + warnings.warn("load is deprecated, use read_pickle", FutureWarning) + from pandas.io.pickle import read_pickle + return read_pickle(path) + + +def save(obj, path): # TODO remove in 0.13 + """ + Pickle (serialize) object to input file path + + Parameters + ---------- + obj : any object + path : string + File path + """ + import warnings + warnings.warn("save is deprecated, use obj.to_pickle", FutureWarning) + from pandas.io.pickle import to_pickle + return to_pickle(obj, path) + + +def _maybe_match_name(a, b): + a_name = getattr(a, 'name', None) + b_name = getattr(b, 'name', None) + if a_name == b_name: + return a_name + return None diff --git a/pandas/core/config.py b/pandas/core/config.py new file mode 100644 index 00000000..3e8d7650 --- /dev/null +++ b/pandas/core/config.py @@ -0,0 +1,805 @@ +""" +The config module holds package-wide configurables and provides +a uniform API for working with them. + +Overview +======== + +This module supports the following requirements: +- options are referenced using keys in dot.notation, e.g. "x.y.option - z". +- keys are case-insensitive. +- functions should accept partial/regex keys, when unambiguous. +- options can be registered by modules at import time. 
+- options can be registered at init-time (via core.config_init) +- options have a default value, and (optionally) a description and + validation function associated with them. +- options can be deprecated, in which case referencing them + should produce a warning. +- deprecated options can optionally be rerouted to a replacement + so that accessing a deprecated option reroutes to a differently + named option. +- options can be reset to their default value. +- all option can be reset to their default value at once. +- all options in a certain sub - namespace can be reset at once. +- the user can set / get / reset or ask for the description of an option. +- a developer can register and mark an option as deprecated. +- you can register a callback to be invoked when the the option value + is set or reset. Changing the stored value is considered misuse, but + is not verboten. + +Implementation +============== + +- Data is stored using nested dictionaries, and should be accessed + through the provided API. + +- "Registered options" and "Deprecated options" have metadata associcated + with them, which are stored in auxilary dictionaries keyed on the + fully-qualified key, e.g. "x.y.z.option". + +- the config_init module is imported by the package's __init__.py file. + placing any register_option() calls there will ensure those options + are available as soon as pandas is loaded. If you use register_option + in a module, it will only be available after that module is imported, + which you should be aware of. + +- `config_prefix` is a context_manager (for use with the `with` keyword) + which can save developers some typing, see the docstring. + +""" + +import re + +from collections import namedtuple +import warnings +from pandas.compat import map, lmap, u +import pandas.compat as compat + +DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver') +RegisteredOption = namedtuple( + 'RegisteredOption', 'key defval doc validator cb') + +_deprecated_options = {} # holds deprecated option metdata +_registered_options = {} # holds registered option metdata +_global_config = {} # holds the current values for registered options +_reserved_keys = ['all'] # keys which have a special meaning + + +class OptionError(AttributeError, KeyError): + + """Exception for pandas.options, backwards compatible with KeyError + checks""" + + +# +# User API + +def _get_single_key(pat, silent): + keys = _select_options(pat) + if len(keys) == 0: + if not silent: + _warn_if_deprecated(pat) + raise OptionError('No such keys(s): %r' % pat) + if len(keys) > 1: + raise OptionError('Pattern matched multiple keys') + key = keys[0] + + if not silent: + _warn_if_deprecated(key) + + key = _translate_key(key) + + return key + + +def _get_option(pat, silent=False): + key = _get_single_key(pat, silent) + + # walk the nested dict + root, k = _get_root(key) + return root[k] + + +def _set_option(*args, **kwargs): + # must at least 1 arg deal with constraints later + nargs = len(args) + if not nargs or nargs % 2 != 0: + raise ValueError("Must provide an even number of non-keyword " + "arguments") + + # default to false + silent = kwargs.get('silent', False) + + for k, v in zip(args[::2], args[1::2]): + key = _get_single_key(k, silent) + + o = _get_registered_option(key) + if o and o.validator: + o.validator(v) + + # walk the nested dict + root, k = _get_root(key) + root[k] = v + + if o.cb: + o.cb(key) + +def _describe_option(pat='', _print_desc=True): + + keys = _select_options(pat) + if len(keys) == 0: + raise 
OptionError('No such keys(s)') + + s = u('') + for k in keys: # filter by pat + s += _build_option_description(k) + + if _print_desc: + print(s) + else: + return s + + +def _reset_option(pat, silent=False): + + keys = _select_options(pat) + + if len(keys) == 0: + raise OptionError('No such keys(s)') + + if len(keys) > 1 and len(pat) < 4 and pat != 'all': + raise ValueError('You must specify at least 4 characters when ' + 'resetting multiple keys, use the special keyword ' + '"all" to reset all the options to their default ' + 'value') + + for k in keys: + _set_option(k, _registered_options[k].defval, silent=silent) + + +def get_default_val(pat): + key = _get_single_key(pat, silent=True) + return _get_registered_option(key).defval + + +class DictWrapper(object): + + """ provide attribute-style access to a nested dict + """ + + def __init__(self, d, prefix=""): + object.__setattr__(self, "d", d) + object.__setattr__(self, "prefix", prefix) + + def __setattr__(self, key, val): + prefix = object.__getattribute__(self, "prefix") + if prefix: + prefix += "." + prefix += key + # you can't set new keys + # can you can't overwrite subtrees + if key in self.d and not isinstance(self.d[key], dict): + _set_option(prefix, val) + else: + raise OptionError("You can only set the value of existing options") + + def __getattr__(self, key): + prefix = object.__getattribute__(self, "prefix") + if prefix: + prefix += "." + prefix += key + v = object.__getattribute__(self, "d")[key] + if isinstance(v, dict): + return DictWrapper(v, prefix) + else: + return _get_option(prefix) + + def __dir__(self): + return list(self.d.keys()) + + +# For user convenience, we'd like to have the available options described +# in the docstring. For dev convenience we'd like to generate the docstrings +# dynamically instead of maintaining them by hand. To this, we use the +# class below which wraps functions inside a callable, and converts +# __doc__ into a propery function. The doctsrings below are templates +# using the py2.6+ advanced formatting syntax to plug in a concise list +# of options, and option descriptions. + + +class CallableDynamicDoc(object): + + def __init__(self, func, doc_tmpl): + self.__doc_tmpl__ = doc_tmpl + self.__func__ = func + + def __call__(self, *args, **kwds): + return self.__func__(*args, **kwds) + + @property + def __doc__(self): + opts_desc = _describe_option('all', _print_desc=False) + opts_list = pp_options_list(list(_registered_options.keys())) + return self.__doc_tmpl__.format(opts_desc=opts_desc, + opts_list=opts_list) + +_get_option_tmpl = """ +get_option(pat) + +Retrieves the value of the specified option. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp which should match a single option. + Note: partial matches are supported for convenience, but unless you use the + full option name (e.g. x.y.z.option_name), your code may break in future + versions if new options with similar names are introduced. + +Returns +------- +result : the value of the option + +Raises +------ +OptionError : if no such option exists + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +_set_option_tmpl = """ +set_option(pat, value) + +Sets the value of the specified option. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp which should match a single option. + Note: partial matches are supported for convenience, but unless you use the + full option name (e.g. 
x.y.z.option_name), your code may break in future + versions if new options with similar names are introduced. +value : + new value of option. + +Returns +------- +None + +Raises +------ +OptionError if no such option exists + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +_describe_option_tmpl = """ +describe_option(pat, _print_desc=False) + +Prints the description for one or more registered options. + +Call with not arguments to get a listing for all registered options. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str + Regexp pattern. All matching keys will have their description displayed. +_print_desc : bool, default True + If True (default) the description(s) will be printed to stdout. + Otherwise, the description(s) will be returned as a unicode string + (for testing). + +Returns +------- +None by default, the description(s) as a unicode string if _print_desc +is False + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +_reset_option_tmpl = """ +reset_option(pat) + +Reset one or more options to their default value. + +Pass "all" as argument to reset all options. + +Available options: + +{opts_list} + +Parameters +---------- +pat : str/regex + If specified only options matching `prefix*` will be reset. + Note: partial matches are supported for convenience, but unless you + use the full option name (e.g. x.y.z.option_name), your code may break + in future versions if new options with similar names are introduced. + +Returns +------- +None + +Notes +----- +The available options with its descriptions: + +{opts_desc} +""" + +# bind the functions with their docstrings into a Callable +# and use that as the functions exposed in pd.api +get_option = CallableDynamicDoc(_get_option, _get_option_tmpl) +set_option = CallableDynamicDoc(_set_option, _set_option_tmpl) +reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl) +describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl) +options = DictWrapper(_global_config) + +# +# Functions for use by pandas developers, in addition to User - api + + +class option_context(object): + """ + Context manager to temporarily set options in the `with` statement context. + + You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + + Examples + -------- + + >>> with option_context('display.max_rows', 10, 'display.max_columns', 5): + ... + + """ + + def __init__(self, *args): + if not (len(args) % 2 == 0 and len(args) >= 2): + raise ValueError( + 'Need to invoke as' + 'option_context(pat, val, [(pat, val), ...)).' + ) + + ops = list(zip(args[::2], args[1::2])) + undo = [] + for pat, val in ops: + undo.append((pat, _get_option(pat, silent=True))) + + self.undo = undo + + for pat, val in ops: + _set_option(pat, val, silent=True) + + def __enter__(self): + pass + + def __exit__(self, *args): + if self.undo: + for pat, val in self.undo: + _set_option(pat, val, silent=True) + + +def register_option(key, defval, doc='', validator=None, cb=None): + """Register an option in the package-wide pandas config object + + Parameters + ---------- + key - a fully-qualified key, e.g. "x.y.option - z". + defval - the default value of the option + doc - a string description of the option + validator - a function of a single argument, should raise `ValueError` if + called with a value which is not a legal value for the option. + cb - a function of a single argument "key", which is called + immediately after an option value is set/reset. 
key is + the full name of the option. + + Returns + ------- + Nothing. + + Raises + ------ + ValueError if `validator` is specified and `defval` is not a valid value. + + """ + import tokenize + import keyword + key = key.lower() + + if key in _registered_options: + raise OptionError("Option '%s' has already been registered" % key) + if key in _reserved_keys: + raise OptionError("Option '%s' is a reserved key" % key) + + # the default value should be legal + if validator: + validator(defval) + + # walk the nested dict, creating dicts as needed along the path + path = key.split('.') + + for k in path: + if not bool(re.match('^' + tokenize.Name + '$', k)): + raise ValueError("%s is not a valid identifier" % k) + if keyword.iskeyword(key): + raise ValueError("%s is a python keyword" % k) + + cursor = _global_config + for i, p in enumerate(path[:-1]): + if not isinstance(cursor, dict): + raise OptionError("Path prefix to option '%s' is already an option" + % '.'.join(path[:i])) + if p not in cursor: + cursor[p] = {} + cursor = cursor[p] + + if not isinstance(cursor, dict): + raise OptionError("Path prefix to option '%s' is already an option" + % '.'.join(path[:-1])) + + cursor[path[-1]] = defval # initialize + + # save the option metadata + _registered_options[key] = RegisteredOption(key=key, defval=defval, + doc=doc, validator=validator, + cb=cb) + + +def deprecate_option(key, msg=None, rkey=None, removal_ver=None): + """ + Mark option `key` as deprecated, if code attempts to access this option, + a warning will be produced, using `msg` if given, or a default message + if not. + if `rkey` is given, any access to the key will be re-routed to `rkey`. + + Neither the existence of `key` nor that if `rkey` is checked. If they + do not exist, any subsequence access will fail as usual, after the + deprecation warning is given. + + Parameters + ---------- + key - the name of the option to be deprecated. must be a fully-qualified + option name (e.g "x.y.z.rkey"). + + msg - (Optional) a warning message to output when the key is referenced. + if no message is given a default message will be emitted. + + rkey - (Optional) the name of an option to reroute access to. + If specified, any referenced `key` will be re-routed to `rkey` + including set/get/reset. + rkey must be a fully-qualified option name (e.g "x.y.z.rkey"). + used by the default message if no `msg` is specified. + + removal_ver - (Optional) specifies the version in which this option will + be removed. used by the default message if no `msg` + is specified. + + Returns + ------- + Nothing + + Raises + ------ + OptionError - if key has already been deprecated. + + """ + + key = key.lower() + + if key in _deprecated_options: + raise OptionError("Option '%s' has already been defined as deprecated." 
+ % key) + + _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver) + + +# +# functions internal to the module + +def _select_options(pat): + """returns a list of keys matching `pat` + + if pat=="all", returns all registered options + """ + + # short-circuit for exact key + if pat in _registered_options: + return [pat] + + # else look through all of them + keys = sorted(_registered_options.keys()) + if pat == 'all': # reserved key + return keys + + return [k for k in keys if re.search(pat, k, re.I)] + + +def _get_root(key): + path = key.split('.') + cursor = _global_config + for p in path[:-1]: + cursor = cursor[p] + return cursor, path[-1] + + +def _is_deprecated(key): + """ Returns True if the given option has been deprecated """ + + key = key.lower() + return key in _deprecated_options + + +def _get_deprecated_option(key): + """ + Retrieves the metadata for a deprecated option, if `key` is deprecated. + + Returns + ------- + DeprecatedOption (namedtuple) if key is deprecated, None otherwise + """ + + try: + d = _deprecated_options[key] + except KeyError: + return None + else: + return d + + +def _get_registered_option(key): + """ + Retrieves the option metadata if `key` is a registered option. + + Returns + ------- + RegisteredOption (namedtuple) if key is deprecated, None otherwise + """ + return _registered_options.get(key) + + +def _translate_key(key): + """ + if key id deprecated and a replacement key defined, will return the + replacement key, otherwise returns `key` as - is + """ + + d = _get_deprecated_option(key) + if d: + return d.rkey or key + else: + return key + + +def _warn_if_deprecated(key): + """ + Checks if `key` is a deprecated option and if so, prints a warning. + + Returns + ------- + bool - True if `key` is deprecated, False otherwise. + """ + + d = _get_deprecated_option(key) + if d: + if d.msg: + print(d.msg) + warnings.warn(d.msg, DeprecationWarning) + else: + msg = "'%s' is deprecated" % key + if d.removal_ver: + msg += ' and will be removed in %s' % d.removal_ver + if d.rkey: + msg += ", please use '%s' instead." % d.rkey + else: + msg += ', please refrain from using it.' + + warnings.warn(msg, DeprecationWarning) + return True + return False + + +def _build_option_description(k): + """ Builds a formatted description of a registered option and prints it """ + + o = _get_registered_option(k) + d = _get_deprecated_option(k) + + s = u('%s ') % k + + if o.doc: + s += '\n'.join(o.doc.strip().split('\n')) + else: + s += 'No description available.' 
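+ # --- Editor's note: illustrative sketch, not part of the imported source.
+ # How the pieces above fit together (the "demo.*" keys are made up for
+ # illustration): register two options, deprecate the old one with a
+ # reroute, then access it through the old name:
+ #
+ # >>> import pandas.core.config as cf
+ # >>> cf.register_option('demo.old_key', 10, 'superseded option')
+ # >>> cf.register_option('demo.new_key', 10, 'preferred option')
+ # >>> cf.deprecate_option('demo.old_key', rkey='demo.new_key')
+ # >>> cf.set_option('demo.new_key', 42)
+ # >>> cf.get_option('demo.old_key')   # warns, then reroutes to new_key
+ # 42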
+ + if o: + s += u('\n [default: %s] [currently: %s]') % (o.defval, + _get_option(k, True)) + + if d: + s += u('\n (Deprecated') + s += (u(', use `%s` instead.') % d.rkey if d.rkey else '') + s += u(')') + + s += '\n\n' + return s + + +def pp_options_list(keys, width=80, _print=False): + """ Builds a concise listing of available options, grouped by prefix """ + + from textwrap import wrap + from itertools import groupby + + def pp(name, ks): + pfx = ('- ' + name + '.[' if name else '') + ls = wrap(', '.join(ks), width, initial_indent=pfx, + subsequent_indent=' ', break_long_words=False) + if ls and ls[-1] and name: + ls[-1] = ls[-1] + ']' + return ls + + ls = [] + singles = [x for x in sorted(keys) if x.find('.') < 0] + if singles: + ls += pp('', singles) + keys = [x for x in keys if x.find('.') >= 0] + + for k, g in groupby(sorted(keys), lambda x: x[:x.rfind('.')]): + ks = [x[len(k) + 1:] for x in list(g)] + ls += pp(k, ks) + s = '\n'.join(ls) + if _print: + print(s) + else: + return s + + +# +# helpers + +from contextlib import contextmanager + + +@contextmanager +def config_prefix(prefix): + """contextmanager for multiple invocations of API with a common prefix + + supported API functions: (register / get / set )__option + + Warning: This is not thread - safe, and won't work properly if you import + the API functions into your module using the "from x import y" construct. + + Example: + + import pandas.core.config as cf + with cf.config_prefix("display.font"): + cf.register_option("color", "red") + cf.register_option("size", " 5 pt") + cf.set_option(size, " 6 pt") + cf.get_option(size) + ... + + etc' + + will register options "display.font.color", "display.font.size", set the + value of "display.font.size"... and so on. + """ + + # Note: reset_option relies on set_option, and on key directly + # it does not fit in to this monkey-patching scheme + + global register_option, get_option, set_option, reset_option + + def wrap(func): + + def inner(key, *args, **kwds): + pkey = '%s.%s' % (prefix, key) + return func(pkey, *args, **kwds) + + return inner + + _register_option = register_option + _get_option = get_option + _set_option = set_option + set_option = wrap(set_option) + get_option = wrap(get_option) + register_option = wrap(register_option) + yield None + set_option = _set_option + get_option = _get_option + register_option = _register_option + + +# These factories and methods are handy for use as the validator +# arg in register_option + +def is_type_factory(_type): + """ + + Parameters + ---------- + `_type` - a type to be compared against (e.g. 
type(x) == `_type`) + + Returns + ------- + validator - a function of a single argument x , which returns the + True if type(x) is equal to `_type` + + """ + + def inner(x): + if type(x) != _type: + raise ValueError("Value must have type '%s'" % str(_type)) + + return inner + + +def is_instance_factory(_type): + """ + + Parameters + ---------- + `_type` - the type to be checked against + + Returns + ------- + validator - a function of a single argument x , which returns the + True if x is an instance of `_type` + + """ + if isinstance(_type, (tuple, list)): + _type = tuple(_type) + from pandas.core.common import pprint_thing + type_repr = "|".join(map(pprint_thing, _type)) + else: + type_repr = "'%s'" % _type + + def inner(x): + if not isinstance(x, _type): + raise ValueError("Value must be an instance of %s" % type_repr) + + return inner + + +def is_one_of_factory(legal_values): + def inner(x): + from pandas.core.common import pprint_thing as pp + if not x in legal_values: + pp_values = lmap(pp, legal_values) + raise ValueError("Value must be one of %s" + % pp("|".join(pp_values))) + + return inner + +# common type validators, for convenience +# usage: register_option(... , validator = is_int) +is_int = is_type_factory(int) +is_bool = is_type_factory(bool) +is_float = is_type_factory(float) +is_str = is_type_factory(str) +is_unicode = is_type_factory(compat.text_type) +is_text = is_instance_factory((str, bytes)) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py new file mode 100644 index 00000000..f9f3b0da --- /dev/null +++ b/pandas/core/config_init.py @@ -0,0 +1,343 @@ +""" +This module is imported from the pandas package __init__.py file +in order to ensure that the core.config options registered here will +be available as soon as the user loads the package. if register_option +is invoked inside specific modules, they will not be registered until that +module is imported, which may or may not be a problem. + +If you need to make sure options are available even before a certain +module is imported, register them here rather then in the module. + +""" + +import pandas.core.config as cf +from pandas.core.config import (is_int, is_bool, is_text, is_float, + is_instance_factory, is_one_of_factory, + get_default_val) +from pandas.core.format import detect_console_encoding + + +# +# options from the "display" namespace + +pc_precision_doc = """ +: int + Floating point output precision (number of significant digits). This is + only a suggestion +""" + +pc_colspace_doc = """ +: int + Default space for DataFrame columns. +""" + +pc_max_rows_doc = """ +: int + This sets the maximum number of rows pandas should output when printing + out various output. For example, this value determines whether the repr() + for a dataframe prints out fully or just a summary repr. + 'None' value means unlimited. +""" + +pc_max_cols_doc = """ +: int + max_rows and max_columns are used in __repr__() methods to decide if + to_string() or info() is used to render an object to a string. In case + python/IPython is running in a terminal this can be set to 0 and pandas + will correctly auto-detect the width the terminal and swap to a smaller + format in case all columns would not fit vertically. The IPython notebook, + IPython qtconsole, or IDLE do not run in a terminal and hence it is not + possible to do correct auto-detection. + 'None' value means unlimited. 
+""" + +pc_max_info_cols_doc = """ +: int + max_info_columns is used in DataFrame.info method to decide if + per column information will be printed. +""" + +pc_nb_repr_h_doc = """ +: boolean + When True, IPython notebook will use html representation for + pandas objects (if it is available). +""" + +pc_date_dayfirst_doc = """ +: boolean + When True, prints and parses dates with the day first, eg 20/01/2005 +""" + +pc_date_yearfirst_doc = """ +: boolean + When True, prints and parses dates with the year first, eg 2005/01/20 +""" + +pc_pprint_nest_depth = """ +: int + Controls the number of nested levels to process when pretty-printing +""" + +pc_multi_sparse_doc = """ +: boolean + "sparsify" MultiIndex display (don't display repeated + elements in outer levels within groups) +""" + +pc_encoding_doc = """ +: str/unicode + Defaults to the detected encoding of the console. + Specifies the encoding to be used for strings returned by to_string, + these are generally strings meant to be displayed on the console. +""" + +float_format_doc = """ +: callable + The callable should accept a floating point number and return + a string with the desired format of the number. This is used + in some places like SeriesFormatter. + See core.format.EngFormatter for an example. +""" + +max_colwidth_doc = """ +: int + The maximum width in characters of a column in the repr of + a pandas data structure. When the column overflows, a "..." + placeholder is embedded in the output. +""" + +colheader_justify_doc = """ +: 'left'/'right' + Controls the justification of column headers. used by DataFrameFormatter. +""" + +pc_expand_repr_doc = """ +: boolean + Whether to print out the full DataFrame repr for wide DataFrames across + multiple lines, `max_columns` is still respected, but the output will + wrap-around across multiple "pages" if it's width exceeds `display.width`. +""" + +pc_show_dimensions_doc = """ +: boolean or 'truncate' + Whether to print out dimensions at the end of DataFrame repr. + If 'truncate' is specified, only print out the dimensions if the + frame is truncated (e.g. not display all rows and/or columns) +""" + +pc_line_width_doc = """ +: int + Deprecated. +""" + +pc_line_width_deprecation_warning = """\ +line_width has been deprecated, use display.width instead (currently both are +identical) +""" + +pc_height_deprecation_warning = """\ +height has been deprecated. +""" + +pc_width_doc = """ +: int + Width of the display in characters. In case python/IPython is running in + a terminal this can be set to None and pandas will correctly auto-detect + the width. + Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible to correctly detect the width. +""" + +pc_height_doc = """ +: int + Deprecated. +""" + +pc_chop_threshold_doc = """ +: float or None + if set to a float value, all float values smaller then the given threshold + will be displayed as exactly 0 by repr and friends. +""" + +pc_max_seq_items = """ +: int or None + when pretty-printing a long sequence, no more then `max_seq_items` + will be printed. If items are omitted, they will be denoted by the + addition of "..." to the resulting string. + + If set to None, the number of items to be printed is unlimited. +""" + +pc_max_info_rows_doc = """ +: int or None + df.info() will usually show null-counts for each column. + For large frames this can be quite slow. max_info_rows and max_info_cols + limit this null check only to frames with smaller dimensions then specified. 
+""" + +pc_large_repr_doc = """ +: 'truncate'/'info' + For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can + show a truncated table (the default from 0.13), or switch to the view from + df.info() (the behaviour in earlier versions of pandas). +""" + +pc_mpl_style_doc = """ +: bool + Setting this to 'default' will modify the rcParams used by matplotlib + to give plots a more pleasing visual style by default. + Setting this to None/False restores the values to their initial value. +""" + +style_backup = dict() + + +def mpl_style_cb(key): + import sys + from pandas.tools.plotting import mpl_stylesheet + global style_backup + + val = cf.get_option(key) + + if 'matplotlib' not in sys.modules.keys(): + if not(val): # starting up, we get reset to None + return val + raise Exception("matplotlib has not been imported. aborting") + + import matplotlib.pyplot as plt + + if val == 'default': + style_backup = dict([(k, plt.rcParams[k]) for k in mpl_stylesheet]) + plt.rcParams.update(mpl_stylesheet) + elif not val: + if style_backup: + plt.rcParams.update(style_backup) + + return val + +with cf.config_prefix('display'): + cf.register_option('precision', 7, pc_precision_doc, validator=is_int) + cf.register_option('float_format', None, float_format_doc) + cf.register_option('column_space', 12, validator=is_int) + cf.register_option('max_info_rows', 1690785, pc_max_info_rows_doc, + validator=is_instance_factory((int, type(None)))) + cf.register_option('max_rows', 60, pc_max_rows_doc, + validator=is_instance_factory([type(None), int])) + cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int) + cf.register_option('max_columns', 20, pc_max_cols_doc, + validator=is_instance_factory([type(None), int])) + cf.register_option('large_repr', 'truncate', pc_large_repr_doc, + validator=is_one_of_factory(['truncate', 'info'])) + cf.register_option('max_info_columns', 100, pc_max_info_cols_doc, + validator=is_int) + cf.register_option('colheader_justify', 'right', colheader_justify_doc, + validator=is_text) + cf.register_option('notebook_repr_html', True, pc_nb_repr_h_doc, + validator=is_bool) + cf.register_option('date_dayfirst', False, pc_date_dayfirst_doc, + validator=is_bool) + cf.register_option('date_yearfirst', False, pc_date_yearfirst_doc, + validator=is_bool) + cf.register_option('pprint_nest_depth', 3, pc_pprint_nest_depth, + validator=is_int) + cf.register_option('multi_sparse', True, pc_multi_sparse_doc, + validator=is_bool) + cf.register_option('encoding', detect_console_encoding(), pc_encoding_doc, + validator=is_text) + cf.register_option('expand_frame_repr', True, pc_expand_repr_doc) + cf.register_option('show_dimensions', 'truncate', pc_show_dimensions_doc, + validator=is_one_of_factory([True, False, 'truncate'])) + cf.register_option('chop_threshold', None, pc_chop_threshold_doc) + cf.register_option('max_seq_items', 100, pc_max_seq_items) + cf.register_option('mpl_style', None, pc_mpl_style_doc, + validator=is_one_of_factory([None, False, 'default']), + cb=mpl_style_cb) + cf.register_option('height', 60, pc_height_doc, + validator=is_instance_factory([type(None), int])) + cf.register_option('width', 80, pc_width_doc, + validator=is_instance_factory([type(None), int])) + # redirected to width, make defval identical + cf.register_option('line_width', get_default_val('display.width'), + pc_line_width_doc) + +cf.deprecate_option('display.line_width', + msg=pc_line_width_deprecation_warning, + rkey='display.width') + +cf.deprecate_option('display.height', + 
msg=pc_height_deprecation_warning, + rkey='display.max_rows') + +tc_sim_interactive_doc = """ +: boolean + Whether to simulate interactive mode for purposes of testing +""" +with cf.config_prefix('mode'): + cf.register_option('sim_interactive', False, tc_sim_interactive_doc) + +use_inf_as_null_doc = """ +: boolean + True means treat None, NaN, INF, -INF as null (old way), + False means None and NaN are null, but INF, -INF are not null + (new way). +""" + +# We don't want to start importing everything at the global context level +# or we'll hit circular deps. + + +def use_inf_as_null_cb(key): + from pandas.core.common import _use_inf_as_null + _use_inf_as_null(key) + +with cf.config_prefix('mode'): + cf.register_option('use_inf_as_null', False, use_inf_as_null_doc, + cb=use_inf_as_null_cb) + + +# user warnings +chained_assignment = """ +: string + Raise an exception, warn, or no action if trying to use chained assignment, + The default is warn +""" + +with cf.config_prefix('mode'): + cf.register_option('chained_assignment', 'warn', chained_assignment, + validator=is_one_of_factory([None, 'warn', 'raise'])) + + +# Set up the io.excel specific configuration. +writer_engine_doc = """ +: string + The default Excel writer engine for '{ext}' files. Available options: + '{default}' (the default){others}. +""" + +with cf.config_prefix('io.excel'): + # going forward, will be additional writers + for ext, options in [('xls', ['xlwt']), + ('xlsm', ['openpyxl'])]: + default = options.pop(0) + if options: + options = " " + ", ".join(options) + else: + options = "" + doc = writer_engine_doc.format(ext=ext, default=default, + others=options) + cf.register_option(ext + '.writer', default, doc, validator=str) + + def _register_xlsx(engine, other): + cf.register_option('xlsx.writer', engine, + writer_engine_doc.format(ext='xlsx', + default=engine, + others=", '%s'" % other), + validator=str) + + try: + # better memory footprint + import xlsxwriter + _register_xlsx('xlsxwriter', 'openpyxl') + except ImportError: + # fallback + _register_xlsx('openpyxl', 'xlsxwriter') diff --git a/pandas/core/datetools.py b/pandas/core/datetools.py new file mode 100644 index 00000000..6678baac --- /dev/null +++ b/pandas/core/datetools.py @@ -0,0 +1,63 @@ +"""A collection of random tools for dealing with dates in Python""" + +from pandas.tseries.tools import * +from pandas.tseries.offsets import * +from pandas.tseries.frequencies import * + +day = DateOffset() +bday = BDay() +businessDay = bday +try: + cday = CDay() + customBusinessDay = CustomBusinessDay() + customBusinessMonthEnd = CBMonthEnd() + customBusinessMonthBegin = CBMonthBegin() +except NotImplementedError: + cday = None + customBusinessDay = None + customBusinessMonthEnd = None + customBusinessMonthBegin = None +monthEnd = MonthEnd() +yearEnd = YearEnd() +yearBegin = YearBegin() +bmonthEnd = BMonthEnd() +bmonthBegin = BMonthBegin() +cbmonthEnd = customBusinessMonthEnd +cbmonthBegin = customBusinessMonthBegin +bquarterEnd = BQuarterEnd() +quarterEnd = QuarterEnd() +byearEnd = BYearEnd() +week = Week() + +# Functions/offsets to roll dates forward +thisMonthEnd = MonthEnd(0) +thisBMonthEnd = BMonthEnd(0) +thisYearEnd = YearEnd(0) +thisYearBegin = YearBegin(0) +thisBQuarterEnd = BQuarterEnd(0) +thisQuarterEnd = QuarterEnd(0) + +# Functions to check where a date lies +isBusinessDay = BDay().onOffset +isMonthEnd = MonthEnd().onOffset +isBMonthEnd = BMonthEnd().onOffset + + +def _resolve_offset(freq, kwds): + if 'timeRule' in kwds or 'offset' in kwds: + offset = 
kwds.get('offset', None) + offset = kwds.get('timeRule', offset) + if isinstance(offset, compat.string_types): + offset = getOffset(offset) + warn = True + else: + offset = freq + warn = False + + if warn: + import warnings + warnings.warn("'timeRule' and 'offset' parameters are deprecated," + " please use 'freq' instead", + FutureWarning) + + return offset diff --git a/pandas/core/format.py b/pandas/core/format.py new file mode 100644 index 00000000..b11b2e72 --- /dev/null +++ b/pandas/core/format.py @@ -0,0 +1,2298 @@ + +#coding: utf-8 +from __future__ import print_function +# pylint: disable=W0141 + +import sys +import re + +from pandas.core.base import PandasObject +from pandas.core.common import adjoin, isnull, notnull +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas import compat +from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u, + OrderedDict) +from pandas.util.terminal import get_terminal_size +from pandas.core.config import get_option, set_option, reset_option +import pandas.core.common as com +import pandas.lib as lib +from pandas.tslib import iNaT + +import numpy as np + +import itertools +import csv +from datetime import time + +from pandas.tseries.period import PeriodIndex, DatetimeIndex + +docstring_to_string = """ + Parameters + ---------- + frame : DataFrame + object to render + buf : StringIO-like, optional + buffer to write to + columns : sequence, optional + the subset of columns to write; default None writes all columns + col_space : int, optional + the minimum width of each column + header : bool, optional + whether to print column labels, default True + index : bool, optional + whether to print index (row) labels, default True + na_rep : string, optional + string representation of NAN to use, default 'NaN' + formatters : list or dict of one-parameter functions, optional + formatter functions to apply to columns' elements by position or name, + default None. The result of each function must be a unicode string. + List must be of length equal to the number of columns. + float_format : one-parameter function, optional + formatter function to apply to columns' elements if they are floats, + default None. The result of this function must be a unicode string. + sparsify : bool, optional + Set to False for a DataFrame with a hierarchical index to print every + multiindex key at each row, default True + justify : {'left', 'right'}, default None + Left or right-justify the column labels. If None uses the option from + the print configuration (controlled by set_option), 'right' out + of the box. + index_names : bool, optional + Prints the names of the indexes, default True + force_unicode : bool, default False + Always return a unicode result. Deprecated in v0.10.0 as string + formatting is now rendered to unicode by default. 
+ + Returns + ------- + formatted : string (or unicode, depending on data and options)""" + + +class CategoricalFormatter(object): + + def __init__(self, categorical, buf=None, length=True, + na_rep='NaN', name=False, footer=True): + self.categorical = categorical + self.buf = buf if buf is not None else StringIO(u("")) + self.name = name + self.na_rep = na_rep + self.length = length + self.footer = footer + + def _get_footer(self): + footer = '' + + if self.name: + name = com.pprint_thing(self.categorical.name, + escape_chars=('\t', '\r', '\n')) + footer += ('Name: %s' % name if self.categorical.name is not None + else '') + + if self.length: + if footer: + footer += ', ' + footer += "Length: %d" % len(self.categorical) + + levheader = 'Levels (%d): ' % len(self.categorical.levels) + + # TODO: should max_line_width respect a setting? + levstring = np.array_repr(self.categorical.levels, max_line_width=60) + indent = ' ' * (levstring.find('[') + len(levheader) + 1) + lines = levstring.split('\n') + levstring = '\n'.join([lines[0]] + + [indent + x.lstrip() for x in lines[1:]]) + if footer: + footer += ', ' + footer += levheader + levstring + + return compat.text_type(footer) + + def _get_formatted_values(self): + return format_array(np.asarray(self.categorical), None, + float_format=None, + na_rep=self.na_rep) + + def to_string(self): + categorical = self.categorical + + if len(categorical) == 0: + if self.footer: + return self._get_footer() + else: + return u('') + + fmt_values = self._get_formatted_values() + pad_space = 10 + + result = ['%s' % i for i in fmt_values] + if self.footer: + footer = self._get_footer() + if footer: + result.append(footer) + + return compat.text_type(u('\n').join(result)) + + +class SeriesFormatter(object): + + def __init__(self, series, buf=None, header=True, length=True, + na_rep='NaN', name=False, float_format=None, dtype=True): + self.series = series + self.buf = buf if buf is not None else StringIO() + self.name = name + self.na_rep = na_rep + self.length = length + self.header = header + + if float_format is None: + float_format = get_option("display.float_format") + self.float_format = float_format + self.dtype = dtype + + def _get_footer(self): + footer = u('') + + if self.name: + if getattr(self.series.index, 'freq', None): + footer += 'Freq: %s' % self.series.index.freqstr + + if footer and self.series.name is not None: + footer += ', ' + + series_name = com.pprint_thing(self.series.name, + escape_chars=('\t', '\r', '\n')) + footer += ("Name: %s" % + series_name) if self.series.name is not None else "" + + if self.length: + if footer: + footer += ', ' + footer += 'Length: %d' % len(self.series) + + if self.dtype: + name = getattr(self.series.dtype, 'name', None) + if name: + if footer: + footer += ', ' + footer += 'dtype: %s' % com.pprint_thing(name) + + return compat.text_type(footer) + + def _get_formatted_index(self): + index = self.series.index + is_multi = isinstance(index, MultiIndex) + + if is_multi: + have_header = any(name for name in index.names) + fmt_index = index.format(names=True) + else: + have_header = index.name is not None + fmt_index = index.format(name=True) + return fmt_index, have_header + + def _get_formatted_values(self): + return format_array(self.series.values, None, + float_format=self.float_format, + na_rep=self.na_rep) + + def to_string(self): + series = self.series + + if len(series) == 0: + return u('') + + fmt_index, have_header = self._get_formatted_index() + fmt_values = self._get_formatted_values() + + maxlen = 
max(len(x) for x in fmt_index) + pad_space = min(maxlen, 60) + + result = ['%s %s'] * len(fmt_values) + for i, (k, v) in enumerate(zip(fmt_index[1:], fmt_values)): + idx = k.ljust(pad_space) + result[i] = result[i] % (idx, v) + + if self.header and have_header: + result.insert(0, fmt_index[0]) + + footer = self._get_footer() + if footer: + result.append(footer) + + return compat.text_type(u('\n').join(result)) + + +def _strlen_func(): + if compat.PY3: # pragma: no cover + _strlen = len + else: + encoding = get_option("display.encoding") + + def _strlen(x): + try: + return len(x.decode(encoding)) + except UnicodeError: + return len(x) + + return _strlen + + +class TableFormatter(object): + is_truncated = False + show_dimensions = None + + @property + def should_show_dimensions(self): + return self.show_dimensions is True or (self.show_dimensions == 'truncate' and self.is_truncated) + + def _get_formatter(self, i): + if isinstance(self.formatters, (list, tuple)): + if com.is_integer(i): + return self.formatters[i] + else: + return None + else: + if com.is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) + + +class DataFrameFormatter(TableFormatter): + + """ + Render a DataFrame + + self.to_string() : console-friendly tabular output + self.to_html() : html table + self.to_latex() : LaTeX tabular environment table + + """ + + __doc__ = __doc__ if __doc__ else '' + __doc__ += docstring_to_string + + def __init__(self, frame, buf=None, columns=None, col_space=None, + header=True, index=True, na_rep='NaN', formatters=None, + justify=None, float_format=None, sparsify=None, + index_names=True, line_width=None, max_rows=None, + max_cols=None, show_dimensions=False, **kwds): + self.frame = frame + self.buf = buf if buf is not None else StringIO() + self.show_index_names = index_names + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + self.sparsify = sparsify + + self.float_format = float_format + self.formatters = formatters if formatters is not None else {} + self.na_rep = na_rep + self.col_space = col_space + self.header = header + self.index = index + self.line_width = line_width + self.max_rows = max_rows + self.max_cols = max_cols + self.max_rows_displayed = min(max_rows or len(self.frame), + len(self.frame)) + self.show_dimensions = show_dimensions + + if justify is None: + self.justify = get_option("display.colheader_justify") + else: + self.justify = justify + + self.kwds = kwds + + if columns is not None: + self.columns = _ensure_index(columns) + self.frame = self.frame[self.columns] + else: + self.columns = frame.columns + + self._chk_truncate() + + def _chk_truncate(self): + from pandas.tools.merge import concat + + truncate_h = self.max_cols and (len(self.columns) > self.max_cols) + truncate_v = self.max_rows and (len(self.frame) > self.max_rows) + + # Cut the data to the information actually printed + max_cols = self.max_cols + max_rows = self.max_rows + frame = self.frame + if truncate_h: + if max_cols > 1: + col_num = (max_cols // 2) + frame = concat( (frame.iloc[:,:col_num],frame.iloc[:,-col_num:]),axis=1 ) + else: + col_num = max_cols + frame = frame.iloc[:,:max_cols] + self.tr_col_num = col_num + if truncate_v: + if max_rows > 1: + row_num = max_rows // 2 + frame = concat( (frame.iloc[:row_num,:],frame.iloc[-row_num:,:]) ) + else: + row_num = max_rows + frame = frame.iloc[:max_rows,:] + self.tr_row_num = row_num + + self.tr_frame = frame + self.truncate_h = truncate_h + self.truncate_v = truncate_v + 
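+ # --- Editor's note: illustrative sketch, not part of the imported source.
+ # _chk_truncate() keeps only what will actually be rendered: with
+ # max_rows=4 a 100-row frame is cut to its first two and last two rows,
+ # and tr_row_num records where the '...' separator row is inserted later
+ # (columns are treated the same way via max_cols / tr_col_num):
+ #
+ # >>> import numpy as np; import pandas as pd
+ # >>> from pandas.core.format import DataFrameFormatter
+ # >>> df = pd.DataFrame(np.arange(300).reshape(100, 3))
+ # >>> f = DataFrameFormatter(df, max_rows=4)
+ # >>> len(f.tr_frame), f.tr_row_num, f.is_truncated
+ # (4, 2, True)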
self.is_truncated = self.truncate_h or self.truncate_v + + def _to_str_columns(self): + """ + Render a DataFrame to a list of columns (as lists of strings). + """ + _strlen = _strlen_func() + frame = self.tr_frame + + # may include levels names also + str_index = self._get_formatted_index(frame) + + str_columns = self._get_formatted_column_labels(frame) + + if self.header: + stringified = [] + col_headers = frame.columns + for i, c in enumerate(frame): + cheader = str_columns[i] + max_colwidth = max(self.col_space or 0, + *(_strlen(x) for x in cheader)) + + fmt_values = self._format_col(i) + + fmt_values = _make_fixed_width(fmt_values, self.justify, + minimum=max_colwidth) + + + max_len = max(np.max([_strlen(x) for x in fmt_values]), + max_colwidth) + if self.justify == 'left': + cheader = [x.ljust(max_len) for x in cheader] + else: + cheader = [x.rjust(max_len) for x in cheader] + + stringified.append(cheader + fmt_values) + else: + stringified = [] + for i, c in enumerate(frame): + formatter = self._get_formatter(i) + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width(fmt_values, self.justify) + + stringified.append(fmt_values) + + strcols = stringified + if self.index: + strcols.insert(0, str_index) + + # Add ... to signal truncated + truncate_h = self.truncate_h + truncate_v = self.truncate_v + + if truncate_h: + col_num = self.tr_col_num + col_width = len(strcols[col_num][0]) # infer from column header + strcols.insert(col_num + 1, ['...'.center(col_width)] * (len(str_index))) + if truncate_v: + n_header_rows = len(str_index) - len(frame) + row_num = self.tr_row_num + for ix,col in enumerate(strcols): + cwidth = len(strcols[ix][row_num]) # infer from above row + is_dot_col = False + if truncate_h: + is_dot_col = ix == col_num + 1 + if cwidth > 3 or is_dot_col: + my_str = '...' + else: + my_str = '..' + + if ix == 0: + dot_str = my_str.ljust(cwidth) + elif is_dot_col: + dot_str = my_str.center(cwidth) + else: + dot_str = my_str.rjust(cwidth) + + strcols[ix].insert(row_num + n_header_rows, dot_str) + + return strcols + + def to_string(self): + """ + Render a DataFrame to a console-friendly tabular output. 
+ """ + + frame = self.frame + + if len(frame.columns) == 0 or len(frame.index) == 0: + info_line = (u('Empty %s\nColumns: %s\nIndex: %s') + % (type(self.frame).__name__, + com.pprint_thing(frame.columns), + com.pprint_thing(frame.index))) + text = info_line + else: + strcols = self._to_str_columns() + if self.line_width is None: + text = adjoin(1, *strcols) + else: + text = self._join_multiline(*strcols) + + self.buf.writelines(text) + + if self.should_show_dimensions: + self.buf.write("\n\n[%d rows x %d columns]" + % (len(frame), len(frame.columns))) + + def _join_multiline(self, *strcols): + lwidth = self.line_width + adjoin_width = 1 + strcols = list(strcols) + if self.index: + idx = strcols.pop(0) + lwidth -= np.array([len(x) for x in idx]).max() + adjoin_width + + col_widths = [np.array([len(x) for x in col]).max() + if len(col) > 0 else 0 + for col in strcols] + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.max_rows and len(self.frame) > self.max_rows: + nrows = self.max_rows + 1 + else: + nrows = len(self.frame) + + str_lst = [] + st = 0 + for i, ed in enumerate(col_bins): + row = strcols[st:ed] + row.insert(0, idx) + if nbins > 1: + if ed <= len(strcols) and i < nbins - 1: + row.append([' \\'] + [' '] * (nrows - 1)) + else: + row.append([' '] * nrows) + + str_lst.append(adjoin(adjoin_width, *row)) + st = ed + return '\n\n'.join(str_lst) + + def to_latex(self, column_format=None, longtable=False): + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. + """ + self.escape = self.kwds.get('escape', True) + #TODO: column_format is not settable in df.to_latex + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return 'r' + else: + return 'l' + + frame = self.frame + + if len(frame.columns) == 0 or len(frame.index) == 0: + info_line = (u('Empty %s\nColumns: %s\nIndex: %s') + % (type(self.frame).__name__, + frame.columns, frame.index)) + strcols = [[info_line]] + else: + strcols = self._to_str_columns() + + if column_format is None: + dtypes = self.frame.dtypes.values + if self.index: + column_format = 'l%s' % ''.join(map(get_col_type, dtypes)) + else: + column_format = '%s' % ''.join(map(get_col_type, dtypes)) + elif not isinstance(column_format, + compat.string_types): # pragma: no cover + raise AssertionError('column_format must be str or unicode, not %s' + % type(column_format)) + + def write(buf, frame, column_format, strcols, longtable=False): + if not longtable: + buf.write('\\begin{tabular}{%s}\n' % column_format) + buf.write('\\toprule\n') + else: + buf.write('\\begin{longtable}{%s}\n' % column_format) + buf.write('\\toprule\n') + + nlevels = frame.index.nlevels + for i, row in enumerate(zip(*strcols)): + if i == nlevels: + buf.write('\\midrule\n') # End of header + if longtable: + buf.write('\\endhead\n') + buf.write('\\midrule\n') + buf.write('\\multicolumn{3}{r}{{Continued on next ' + 'page}} \\\\\n') + buf.write('\midrule\n') + buf.write('\endfoot\n\n') + buf.write('\\bottomrule\n') + buf.write('\\endlastfoot\n') + if self.escape: + crow = [(x.replace('\\', '\\textbackslash') # escape backslashes first + .replace('_', '\\_') + .replace('%', '\\%') + .replace('$', '\\$') + .replace('#', '\\#') + .replace('{', '\\{') + .replace('}', '\\}') + .replace('~', '\\textasciitilde') + .replace('^', '\\textasciicircum') + .replace('&', '\\&') if x else '{}') for x in row] + else: + crow = [x if x else '{}' for x in row] + buf.write(' & '.join(crow)) + buf.write(' \\\\\n') + + if not longtable: + 
buf.write('\\bottomrule\n') + buf.write('\\end{tabular}\n') + else: + buf.write('\\end{longtable}\n') + + if hasattr(self.buf, 'write'): + write(self.buf, frame, column_format, strcols, longtable) + elif isinstance(self.buf, compat.string_types): + with open(self.buf, 'w') as f: + write(f, frame, column_format, strcols, longtable) + else: + raise TypeError('buf is not a file name and it has no write ' + 'method') + + def _format_col(self, i): + frame = self.tr_frame + formatter = self._get_formatter(i) + return format_array( + (frame.iloc[:, i]).get_values(), + formatter, float_format=self.float_format, na_rep=self.na_rep, + space=self.col_space + ) + + def to_html(self, classes=None): + """ + Render a DataFrame to a html table. + """ + html_renderer = HTMLFormatter(self, classes=classes, + max_rows=self.max_rows, + max_cols=self.max_cols) + if hasattr(self.buf, 'write'): + html_renderer.write_result(self.buf) + elif isinstance(self.buf, compat.string_types): + with open(self.buf, 'w') as f: + html_renderer.write_result(f) + else: + raise TypeError('buf is not a file name and it has no write ' + ' method') + + def _get_formatted_column_labels(self,frame): + from pandas.core.index import _sparsify + + def is_numeric_dtype(dtype): + return issubclass(dtype.type, np.number) + + columns = frame.columns + + if isinstance(columns, MultiIndex): + fmt_columns = columns.format(sparsify=False, adjoin=False) + fmt_columns = lzip(*fmt_columns) + dtypes = self.frame.dtypes.values + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = list(zip(*[ + [' ' + y if y not in self.formatters and need_leadsp[x] + else y for y in x] for x in fmt_columns])) + if self.sparsify: + str_columns = _sparsify(str_columns) + + str_columns = [list(x) for x in zip(*str_columns)] + else: + fmt_columns = columns.format() + dtypes = self.frame.dtypes + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = [[' ' + x + if not self._get_formatter(i) and need_leadsp[x] + else x] + for i, (col, x) in + enumerate(zip(columns, fmt_columns))] + + if self.show_index_names and self.has_index_names: + for x in str_columns: + x.append('') + + return str_columns + + @property + def has_index_names(self): + return _has_names(self.frame.index) + + @property + def has_column_names(self): + return _has_names(self.frame.columns) + + def _get_formatted_index(self,frame): + # Note: this is only used by to_string(), not by to_html(). 
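+ # --- Editor's note: illustrative sketch, not part of the imported source.
+ # The formatter renders into self.buf rather than returning a string;
+ # the driver pattern used by DataFrame.to_string()/to_html() looks
+ # roughly like this (assuming some existing DataFrame df):
+ #
+ # >>> from pandas.core.format import DataFrameFormatter
+ # >>> f = DataFrameFormatter(df, max_rows=10, show_dimensions=True)
+ # >>> f.to_string()              # writes the tabular text into f.buf
+ # >>> text = f.buf.getvalue()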
+ index = frame.index + columns = frame.columns + + show_index_names = self.show_index_names and self.has_index_names + show_col_names = (self.show_index_names and self.has_column_names) + + fmt = self._get_formatter('__index__') + + if isinstance(index, MultiIndex): + fmt_index = index.format(sparsify=self.sparsify, adjoin=False, + names=show_index_names, + formatter=fmt) + else: + fmt_index = [index.format(name=show_index_names, formatter=fmt)] + + adjoined = adjoin(1, *fmt_index).split('\n') + + # empty space for columns + if show_col_names: + col_header = ['%s' % x for x in self._get_column_name_list()] + else: + col_header = [''] * columns.nlevels + + if self.header: + return col_header + adjoined + else: + return adjoined + + def _get_column_name_list(self): + names = [] + columns = self.frame.columns + if isinstance(columns, MultiIndex): + names.extend('' if name is None else name + for name in columns.names) + else: + names.append('' if columns.name is None else columns.name) + return names + + +class HTMLFormatter(TableFormatter): + + indent_delta = 2 + + def __init__(self, formatter, classes=None, max_rows=None, max_cols=None): + self.fmt = formatter + self.classes = classes + + self.frame = self.fmt.frame + self.columns = self.fmt.tr_frame.columns + self.elements = [] + self.bold_rows = self.fmt.kwds.get('bold_rows', False) + self.escape = self.fmt.kwds.get('escape', True) + + self.max_rows = max_rows or len(self.fmt.frame) + self.max_cols = max_cols or len(self.fmt.columns) + self.show_dimensions = self.fmt.show_dimensions + self.is_truncated = self.max_rows < len(self.fmt.frame) or self.max_cols < len(self.fmt.columns) + + def write(self, s, indent=0): + rs = com.pprint_thing(s) + self.elements.append(' ' * indent + rs) + + def write_th(self, s, indent=0, tags=None): + if (self.fmt.col_space is not None + and self.fmt.col_space > 0): + tags = (tags or "") + tags += 'style="min-width: %s;"' % self.fmt.col_space + + return self._write_cell(s, kind='th', indent=indent, tags=tags) + + def write_td(self, s, indent=0, tags=None): + return self._write_cell(s, kind='td', indent=indent, tags=tags) + + def _write_cell(self, s, kind='td', indent=0, tags=None): + if tags is not None: + start_tag = '<%s %s>' % (kind, tags) + else: + start_tag = '<%s>' % kind + + if self.escape: + # escape & first to prevent double escaping of & + esc = OrderedDict( + [('&', r'&'), ('<', r'<'), ('>', r'>')] + ) + else: + esc = {} + rs = com.pprint_thing(s, escape_chars=esc) + self.write( + '%s%s' % (start_tag, rs, kind), indent) + + def write_tr(self, line, indent=0, indent_delta=4, header=False, + align=None, tags=None, nindex_levels=0): + if tags is None: + tags = {} + + if align is None: + self.write('', indent) + else: + self.write('' % align, indent) + indent += indent_delta + + for i, s in enumerate(line): + val_tag = tags.get(i, None) + if header or (self.bold_rows and i < nindex_levels): + self.write_th(s, indent, tags=val_tag) + else: + self.write_td(s, indent, tags=val_tag) + + indent -= indent_delta + self.write('', indent) + + def write_result(self, buf): + indent = 0 + frame = self.frame + + _classes = ['dataframe'] # Default class. 
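+ # --- Editor's note: illustrative sketch, not part of the imported source.
+ # 'dataframe' is always emitted; user-supplied classes are appended, so
+ # df.to_html(classes='table table-striped') renders
+ # class="dataframe table table-striped" on the opening <table> tag,
+ # while cell text is HTML-escaped in _write_cell() unless escape=False.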
+ if self.classes is not None: + if isinstance(self.classes, str): + self.classes = self.classes.split() + if not isinstance(self.classes, (list, tuple)): + raise AssertionError(('classes must be list or tuple, ' + 'not %s') % type(self.classes)) + _classes.extend(self.classes) + + self.write('' % ' '.join(_classes), + indent) + + indent += self.indent_delta + indent = self._write_header(indent) + indent = self._write_body(indent) + + self.write('
</table>', indent)
+ if self.should_show_dimensions:
+ by = chr(215) if compat.PY3 else unichr(215) # ×
+ self.write(u('<p>%d rows %s %d columns</p>
') % + (len(frame), by, len(frame.columns))) + _put_lines(buf, self.elements) + + def _write_header(self, indent): + truncate_h = self.fmt.truncate_h + row_levels = self.frame.index.nlevels + if not self.fmt.header: + # write nothing + return indent + + def _column_header(): + if self.fmt.index: + row = [''] * (self.frame.index.nlevels - 1) + else: + row = [] + + if isinstance(self.columns, MultiIndex): + if self.fmt.has_column_names and self.fmt.index: + row.append(single_column_table(self.columns.names)) + else: + row.append('') + style = "text-align: %s;" % self.fmt.justify + row.extend([single_column_table(c, self.fmt.justify, style) for + c in self.columns]) + else: + if self.fmt.index: + row.append(self.columns.name or '') + row.extend(self.columns) + return row + + self.write('', indent) + row = [] + + indent += self.indent_delta + + if isinstance(self.columns, MultiIndex): + template = 'colspan="%d" halign="left"' + + if self.fmt.sparsify: + # GH3547 + sentinel = com.sentinel_factory() + else: + sentinel = None + levels = self.columns.format(sparsify=sentinel, + adjoin=False, names=False) + level_lengths = _get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + for lnum, (records, values) in enumerate(zip(level_lengths, + levels)): + if truncate_h: + # modify the header lines + ins_col = self.fmt.tr_col_num + if self.fmt.sparsify: + recs_new = {} + # Increment tags after ... col. + for tag,span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + elif tag + span > ins_col: + recs_new[tag] = span + 1 + if lnum == inner_lvl: + values = values[:ins_col] + (u('...'),) + \ + values[ins_col:] + else: # sparse col headers do not receive a ... + values = values[:ins_col] + \ + (values[ins_col - 1],) + values[ins_col:] + else: + recs_new[tag] = span + # if ins_col lies between tags, all col headers get ... 
+ if tag + span == ins_col: + recs_new[ins_col] = 1 + values = values[:ins_col] + (u('...'),) + \ + values[ins_col:] + records = recs_new + inner_lvl = len(level_lengths) - 1 + if lnum == inner_lvl: + records[ins_col] = 1 + else: + recs_new = {} + for tag,span in list(records.items()): + if tag >= ins_col: + recs_new[tag + 1] = span + else: + recs_new[tag] = span + recs_new[ins_col] = 1 + records = recs_new + values = values[:ins_col] + [u('...')] + values[ins_col:] + + name = self.columns.names[lnum] + row = [''] * (row_levels - 1) + ['' if name is None + else com.pprint_thing(name)] + tags = {} + j = len(row) + for i, v in enumerate(values): + if i in records: + if records[i] > 1: + tags[j] = template % records[i] + else: + continue + j += 1 + row.append(v) + self.write_tr(row, indent, self.indent_delta, tags=tags, + header=True) + else: + col_row = _column_header() + align = self.fmt.justify + + if truncate_h: + ins_col = row_levels + self.fmt.tr_col_num + col_row.insert(ins_col, '...') + + self.write_tr(col_row, indent, self.indent_delta, header=True, + align=align) + + if self.fmt.has_index_names: + row = [ + x if x is not None else '' for x in self.frame.index.names + ] + [''] * min(len(self.columns), self.max_cols) + if truncate_h: + ins_col = row_levels + self.fmt.tr_col_num + row.insert(ins_col, '') + self.write_tr(row, indent, self.indent_delta, header=True) + + indent -= self.indent_delta + self.write('', indent) + + return indent + + def _write_body(self, indent): + self.write('', indent) + indent += self.indent_delta + + fmt_values = {} + for i in range(min(len(self.columns), self.max_cols)): + fmt_values[i] = self.fmt._format_col(i) + + # write values + if self.fmt.index: + if isinstance(self.frame.index, MultiIndex): + self._write_hierarchical_rows(fmt_values, indent) + else: + self._write_regular_rows(fmt_values, indent) + else: + for i in range(len(self.frame)): + row = [fmt_values[j][i] for j in range(len(self.columns))] + self.write_tr(row, indent, self.indent_delta, tags=None) + + indent -= self.indent_delta + self.write('', indent) + indent -= self.indent_delta + + return indent + + def _write_regular_rows(self, fmt_values, indent): + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + + ncols = len(self.fmt.tr_frame.columns) + nrows = len(self.fmt.tr_frame) + fmt = self.fmt._get_formatter('__index__') + if fmt is not None: + index_values = self.fmt.tr_frame.index.map(fmt) + else: + index_values = self.fmt.tr_frame.index.format() + + for i in range(nrows): + + if truncate_v and i == (self.fmt.tr_row_num): + str_sep_row = [ '...' 
for ele in row ] + self.write_tr(str_sep_row, indent, self.indent_delta, tags=None, + nindex_levels=1) + + row = [] + row.append(index_values[i]) + row.extend(fmt_values[j][i] for j in range(ncols)) + + if truncate_h: + dot_col_ix = self.fmt.tr_col_num + 1 + row.insert(dot_col_ix, '...') + self.write_tr(row, indent, self.indent_delta, tags=None, + nindex_levels=1) + + def _write_hierarchical_rows(self, fmt_values, indent): + template = 'rowspan="%d" valign="top"' + + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v + frame = self.fmt.tr_frame + ncols = len(frame.columns) + nrows = len(frame) + row_levels = self.frame.index.nlevels + + idx_values = frame.index.format(sparsify=False, adjoin=False, + names=False) + idx_values = lzip(*idx_values) + + if self.fmt.sparsify: + # GH3547 + sentinel = com.sentinel_factory() + levels = frame.index.format(sparsify=sentinel, + adjoin=False, names=False) + + level_lengths = _get_level_lengths(levels, sentinel) + inner_lvl = len(level_lengths) - 1 + if truncate_v: + # Insert ... row and adjust idx_values and + # level_lengths to take this into account. + ins_row = self.fmt.tr_row_num + for lnum,records in enumerate(level_lengths): + rec_new = {} + for tag,span in list(records.items()): + if tag >= ins_row: + rec_new[tag + 1] = span + elif tag + span > ins_row: + rec_new[tag] = span + 1 + dot_row = list(idx_values[ins_row - 1]) + dot_row[-1] = u('...') + idx_values.insert(ins_row,tuple(dot_row)) + else: + rec_new[tag] = span + # If ins_row lies between tags, all cols idx cols receive ... + if tag + span == ins_row: + rec_new[ins_row] = 1 + if lnum == 0: + idx_values.insert(ins_row,tuple([u('...')]*len(level_lengths))) + level_lengths[lnum] = rec_new + + level_lengths[inner_lvl][ins_row] = 1 + for ix_col in range(len(fmt_values)): + fmt_values[ix_col].insert(ins_row,'...') + nrows += 1 + + for i in range(nrows): + row = [] + tags = {} + + sparse_offset = 0 + j = 0 + for records, v in zip(level_lengths, idx_values[i]): + if i in records: + if records[i] > 1: + tags[j] = template % records[i] + else: + sparse_offset += 1 + continue + + j += 1 + row.append(v) + + row.extend(fmt_values[j][i] for j in range(ncols)) + if truncate_h: + row.insert(row_levels - sparse_offset + self.fmt.tr_col_num, '...') + self.write_tr(row, indent, self.indent_delta, tags=tags, + nindex_levels=len(levels) - sparse_offset) + else: + for i in range(len(frame)): + idx_values = list(zip(*frame.index.format(sparsify=False, + adjoin=False, + names=False))) + row = [] + row.extend(idx_values[i]) + row.extend(fmt_values[j][i] for j in range(ncols)) + if truncate_h: + row.insert(row_levels + self.fmt.tr_col_num, '...') + self.write_tr(row, indent, self.indent_delta, tags=None, + nindex_levels=frame.index.nlevels) + +def _get_level_lengths(levels, sentinel=''): + from itertools import groupby + + def _make_grouper(): + record = {'count': 0} + + def grouper(x): + if x != sentinel: + record['count'] += 1 + return record['count'] + return grouper + + result = [] + for lev in levels: + i = 0 + f = _make_grouper() + recs = {} + for key, gpr in groupby(lev, f): + values = list(gpr) + recs[i] = len(values) + i += len(values) + + result.append(recs) + + return result + + +class CSVFormatter(object): + + def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, + cols=None, header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None, quoting=None, + line_terminator='\n', chunksize=None, engine=None, + tupleize_cols=False, quotechar='"', 
date_format=None, + doublequote=True, escapechar=None): + + self.engine = engine # remove for 0.13 + self.obj = obj + + if path_or_buf is None: + path_or_buf = StringIO() + + self.path_or_buf = path_or_buf + self.sep = sep + self.na_rep = na_rep + self.float_format = float_format + + self.header = header + self.index = index + self.index_label = index_label + self.mode = mode + self.encoding = encoding + + if quoting is None: + quoting = csv.QUOTE_MINIMAL + self.quoting = quoting + + if quoting == csv.QUOTE_NONE: + # prevents crash in _csv + quotechar = None + self.quotechar = quotechar + + self.doublequote = doublequote + self.escapechar = escapechar + + self.line_terminator = line_terminator + + self.date_format = date_format + + # GH3457 + if not self.obj.columns.is_unique and engine == 'python': + raise NotImplementedError("columns.is_unique == False not " + "supported with engine='python'") + + self.tupleize_cols = tupleize_cols + self.has_mi_columns = isinstance(obj.columns, MultiIndex + ) and not self.tupleize_cols + + # validate mi options + if self.has_mi_columns: + if cols is not None: + raise TypeError("cannot specify cols with a MultiIndex on the " + "columns") + + if cols is not None: + if isinstance(cols, Index): + cols = cols.to_native_types(na_rep=na_rep, + float_format=float_format, + date_format=date_format) + else: + cols = list(cols) + self.obj = self.obj.loc[:, cols] + + # update columns to include possible multiplicity of dupes + # and make sure sure cols is just a list of labels + cols = self.obj.columns + if isinstance(cols, Index): + cols = cols.to_native_types(na_rep=na_rep, + float_format=float_format, + date_format=date_format) + else: + cols = list(cols) + + # save it + self.cols = cols + + # preallocate data 2d list + self.blocks = self.obj._data.blocks + ncols = sum(b.shape[0] for b in self.blocks) + self.data = [None] * ncols + + if chunksize is None: + chunksize = (100000 / (len(self.cols) or 1)) or 1 + self.chunksize = int(chunksize) + + self.data_index = obj.index + if isinstance(obj.index, PeriodIndex): + self.data_index = obj.index.to_timestamp() + + if (isinstance(self.data_index, DatetimeIndex) and + date_format is not None): + self.data_index = Index([x.strftime(date_format) + if notnull(x) else '' + for x in self.data_index]) + + self.nlevels = getattr(self.data_index, 'nlevels', 1) + if not index: + self.nlevels = 0 + + # original python implem. 
of df.to_csv + # invoked by df.to_csv(engine=python) + def _helper_csv(self, writer, na_rep=None, cols=None, + header=True, index=True, + index_label=None, float_format=None, date_format=None): + if cols is None: + cols = self.columns + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: + if index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(self.obj.index, MultiIndex): + index_label = [] + for i, name in enumerate(self.obj.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = self.obj.index.name + if index_label is None: + index_label = [''] + else: + index_label = [index_label] + elif not isinstance(index_label, + (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + else: + encoded_labels = [] + + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) + + writer.writerow(encoded_labels + encoded_cols) + else: + encoded_cols = list(cols) + writer.writerow(encoded_cols) + + if date_format is None: + date_formatter = lambda x: lib.Timestamp(x)._repr_base + else: + def strftime_with_nulls(x): + x = lib.Timestamp(x) + if notnull(x): + return x.strftime(date_format) + + date_formatter = lambda x: strftime_with_nulls(x) + + data_index = self.obj.index + + if isinstance(self.obj.index, PeriodIndex): + data_index = self.obj.index.to_timestamp() + + if isinstance(data_index, DatetimeIndex) and date_format is not None: + data_index = Index([date_formatter(x) for x in data_index]) + + values = self.obj.copy() + values.index = data_index + values.columns = values.columns.to_native_types( + na_rep=na_rep, float_format=float_format, + date_format=date_format) + values = values[cols] + + series = {} + for k, v in compat.iteritems(values._series): + series[k] = v.values + + nlevels = getattr(data_index, 'nlevels', 1) + for j, idx in enumerate(data_index): + row_fields = [] + if index: + if nlevels == 1: + row_fields = [idx] + else: # handle MultiIndex + row_fields = list(idx) + for i, col in enumerate(cols): + val = series[col][j] + if lib.checknull(val): + val = na_rep + + if float_format is not None and com.is_float(val): + val = float_format % val + elif isinstance(val, (np.datetime64, lib.Timestamp)): + val = date_formatter(val) + + row_fields.append(val) + + writer.writerow(row_fields) + + def save(self): + # create the writer & save + if hasattr(self.path_or_buf, 'write'): + f = self.path_or_buf + close = False + else: + f = com._get_handle(self.path_or_buf, self.mode, + encoding=self.encoding) + close = True + + try: + writer_kwargs = dict(lineterminator=self.line_terminator, + delimiter=self.sep, quoting=self.quoting, + doublequote=self.doublequote, + escapechar=self.escapechar, + quotechar=self.quotechar) + if self.encoding is not None: + writer_kwargs['encoding'] = self.encoding + self.writer = com.UnicodeWriter(f, **writer_kwargs) + else: + self.writer = csv.writer(f, **writer_kwargs) + + if self.engine == 'python': + # to be removed in 0.13 + self._helper_csv(self.writer, na_rep=self.na_rep, + float_format=self.float_format, + cols=self.cols, header=self.header, + index=self.index, + index_label=self.index_label, + date_format=self.date_format) + + else: + self._save() + + finally: + if 
close: + f.close() + + def _save_header(self): + + writer = self.writer + obj = self.obj + index_label = self.index_label + cols = self.cols + has_mi_columns = self.has_mi_columns + header = self.header + encoded_labels = [] + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if not (has_aliases or self.header): + return + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + + if self.index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(obj.index, MultiIndex): + index_label = [] + for i, name in enumerate(obj.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = obj.index.name + if index_label is None: + index_label = [''] + else: + index_label = [index_label] + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + else: + encoded_labels = [] + + if not has_mi_columns: + encoded_labels += list(write_cols) + + # write out the mi + if has_mi_columns: + columns = obj.columns + + # write out the names for each level, then ALL of the values for + # each level + for i in range(columns.nlevels): + + # we need at least 1 index column to write our col names + col_line = [] + if self.index: + + # name is the first column + col_line.append(columns.names[i]) + + if isinstance(index_label, list) and len(index_label) > 1: + col_line.extend([''] * (len(index_label) - 1)) + + col_line.extend(columns.get_level_values(i)) + + writer.writerow(col_line) + + # add blanks for the columns, so that we + # have consistent seps + encoded_labels.extend([''] * len(columns)) + + # write out the index label line + writer.writerow(encoded_labels) + + def _save(self): + + self._save_header() + + nrows = len(self.data_index) + + # write in chunksize bites + chunksize = self.chunksize + chunks = int(nrows / chunksize) + 1 + + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + self._save_chunk(start_i, end_i) + + def _save_chunk(self, start_i, end_i): + + data_index = self.data_index + + # create the data for a chunk + slicer = slice(start_i, end_i) + for i in range(len(self.blocks)): + b = self.blocks[i] + d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, + float_format=self.float_format, + date_format=self.date_format) + + for col_loc, col in zip(b.mgr_locs, d): + # self.data is a preallocated list + self.data[col_loc] = col + + ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, + float_format=self.float_format, + date_format=self.date_format) + + lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) + +# from collections import namedtuple +# ExcelCell = namedtuple("ExcelCell", +# 'row, col, val, style, mergestart, mergeend') + + +class ExcelCell(object): + __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend') + __slots__ = __fields__ + + def __init__(self, row, col, val, + style=None, mergestart=None, mergeend=None): + self.row = row + self.col = col + self.val = val + self.style = style + self.mergestart = mergestart + self.mergeend = mergeend + + +header_style = {"font": {"bold": True}, + "borders": {"top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin"}, + "alignment": {"horizontal": "center", 
"vertical": "top"}} + + +class ExcelFormatter(object): + + """ + Class for formatting a DataFrame to a list of ExcelCells, + + Parameters + ---------- + df : dataframe + na_rep: na representation + float_format : string, default None + Format string for floating point numbers + cols : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : boolean, default True + output row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + merge_cells : boolean, default False + Format MultiIndex and Hierarchical Rows as merged cells. + inf_rep : string, default `'inf'` + representation for np.inf values (which aren't representable in Excel) + A `'-'` sign will be added in front of -inf. + """ + + def __init__(self, df, na_rep='', float_format=None, cols=None, + header=True, index=True, index_label=None, merge_cells=False, + inf_rep='inf'): + self.df = df + self.rowcounter = 0 + self.na_rep = na_rep + self.columns = cols + if cols is None: + self.columns = df.columns + self.float_format = float_format + self.index = index + self.index_label = index_label + self.header = header + self.merge_cells = merge_cells + self.inf_rep = inf_rep + + def _format_value(self, val): + if lib.checknull(val): + val = self.na_rep + elif com.is_float(val): + if np.isposinf(val): + val = '-%s' % self.inf_rep + elif np.isneginf(val): + val = self.inf_rep + elif self.float_format is not None: + val = float(self.float_format % val) + return val + + def _format_header_mi(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + if not(has_aliases or self.header): + return + + columns = self.columns + level_strs = columns.format(sparsify=True, adjoin=False, names=False) + level_lengths = _get_level_lengths(level_strs) + coloffset = 0 + lnum = 0 + + if self.index and isinstance(self.df.index, MultiIndex): + coloffset = len(self.df.index[0]) - 1 + + if self.merge_cells: + # Format multi-index as a merged cells. + for lnum in range(len(level_lengths)): + name = columns.names[lnum] + yield ExcelCell(lnum, coloffset, name, header_style) + + for lnum, (spans, levels, labels) in enumerate(zip(level_lengths, + columns.levels, + columns.labels) + ): + values = levels.take(labels) + for i in spans: + if spans[i] > 1: + yield ExcelCell(lnum, + coloffset + i + 1, + values[i], + header_style, + lnum, + coloffset + i + spans[i]) + else: + yield ExcelCell(lnum, + coloffset + i + 1, + values[i], + header_style) + else: + # Format in legacy format with dots to indicate levels. 
+ for i, values in enumerate(zip(*level_strs)): + v = ".".join(map(com.pprint_thing, values)) + yield ExcelCell(lnum, coloffset + i + 1, v, header_style) + + self.rowcounter = lnum + + def _format_header_regular(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + if has_aliases or self.header: + coloffset = 0 + + if self.index: + coloffset = 1 + if isinstance(self.df.index, MultiIndex): + coloffset = len(self.df.index[0]) + + colnames = self.columns + if has_aliases: + if len(self.header) != len(self.columns): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(self.columns), len(self.header)))) + else: + colnames = self.header + + for colindex, colname in enumerate(colnames): + yield ExcelCell(self.rowcounter, colindex + coloffset, colname, + header_style) + + def _format_header(self): + if isinstance(self.columns, MultiIndex): + gen = self._format_header_mi() + else: + gen = self._format_header_regular() + + gen2 = () + if self.df.index.names: + row = [x if x is not None else '' + for x in self.df.index.names] + [''] * len(self.columns) + if reduce(lambda x, y: x and y, map(lambda x: x != '', row)): + gen2 = (ExcelCell(self.rowcounter, colindex, val, header_style) + for colindex, val in enumerate(row)) + self.rowcounter += 1 + return itertools.chain(gen, gen2) + + def _format_body(self): + + if isinstance(self.df.index, MultiIndex): + return self._format_hierarchical_rows() + else: + return self._format_regular_rows() + + def _format_regular_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + if has_aliases or self.header: + self.rowcounter += 1 + + coloffset = 0 + # output index and index_label? + if self.index: + # chek aliases + # if list only take first as this is not a MultiIndex + if self.index_label and isinstance(self.index_label, + (list, tuple, np.ndarray)): + index_label = self.index_label[0] + # if string good to go + elif self.index_label and isinstance(self.index_label, str): + index_label = self.index_label + else: + index_label = self.df.index.names[0] + + if index_label and self.header is not False: + if self.merge_cells: + yield ExcelCell(self.rowcounter, + 0, + index_label, + header_style) + self.rowcounter += 1 + else: + yield ExcelCell(self.rowcounter - 1, + 0, + index_label, + header_style) + + # write index_values + index_values = self.df.index + if isinstance(self.df.index, PeriodIndex): + index_values = self.df.index.to_timestamp() + + coloffset = 1 + for idx, idxval in enumerate(index_values): + yield ExcelCell(self.rowcounter + idx, 0, idxval, header_style) + + # Get a frame that will account for any duplicates in the column names. + col_mapped_frame = self.df.loc[:, self.columns] + + # Write the body of the frame data series by series. 
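# NOTE: the .loc[:, self.columns] lookup above returns one column per entry of
# self.columns, even when labels repeat, so the positional iloc access below
# lines up one-to-one with self.columns.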
+ for colidx in range(len(self.columns)): + series = col_mapped_frame.iloc[:, colidx] + for i, val in enumerate(series): + yield ExcelCell(self.rowcounter + i, colidx + coloffset, val) + + def _format_hierarchical_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray)) + if has_aliases or self.header: + self.rowcounter += 1 + + gcolidx = 0 + + if self.index: + index_labels = self.df.index.names + # check for aliases + if self.index_label and isinstance(self.index_label, + (list, tuple, np.ndarray)): + index_labels = self.index_label + + # if index labels are not empty go ahead and dump + if (any(x is not None for x in index_labels) + and self.header is not False): + + if not self.merge_cells: + self.rowcounter -= 1 + + for cidx, name in enumerate(index_labels): + yield ExcelCell(self.rowcounter, + cidx, + name, + header_style) + self.rowcounter += 1 + + if self.merge_cells: + # Format hierarchical rows as merged cells. + level_strs = self.df.index.format(sparsify=True, adjoin=False, + names=False) + level_lengths = _get_level_lengths(level_strs) + + for spans, levels, labels in zip(level_lengths, + self.df.index.levels, + self.df.index.labels): + values = levels.take(labels) + for i in spans: + if spans[i] > 1: + yield ExcelCell(self.rowcounter + i, + gcolidx, + values[i], + header_style, + self.rowcounter + i + spans[i] - 1, + gcolidx) + else: + yield ExcelCell(self.rowcounter + i, + gcolidx, + values[i], + header_style) + gcolidx += 1 + + else: + # Format hierarchical rows with non-merged values. + for indexcolvals in zip(*self.df.index): + for idx, indexcolval in enumerate(indexcolvals): + yield ExcelCell(self.rowcounter + idx, + gcolidx, + indexcolval, + header_style) + gcolidx += 1 + + # Get a frame that will account for any duplicates in the column names. + col_mapped_frame = self.df.loc[:, self.columns] + + # Write the body of the frame data series by series. 
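# NOTE: in the merged-cell branches above, ExcelCell.mergestart/mergeend carry
# the last row and last column of the merged block; the writer engines merge
# the range (row, col)..(mergestart, mergeend) when merge_cells is enabled.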
+ for colidx in range(len(self.columns)): + series = col_mapped_frame.iloc[:, colidx] + for i, val in enumerate(series): + yield ExcelCell(self.rowcounter + i, gcolidx + colidx, val) + + def get_formatted_cells(self): + for cell in itertools.chain(self._format_header(), + self._format_body()): + cell.val = self._format_value(cell.val) + yield cell + +#---------------------------------------------------------------------- +# Array formatters + + +def format_array(values, formatter, float_format=None, na_rep='NaN', + digits=None, space=None, justify='right'): + if com.is_float_dtype(values.dtype): + fmt_klass = FloatArrayFormatter + elif com.is_integer_dtype(values.dtype): + fmt_klass = IntArrayFormatter + elif com.is_datetime64_dtype(values.dtype): + fmt_klass = Datetime64Formatter + elif com.is_timedelta64_dtype(values.dtype): + fmt_klass = Timedelta64Formatter + else: + fmt_klass = GenericArrayFormatter + + if space is None: + space = get_option("display.column_space") + + if float_format is None: + float_format = get_option("display.float_format") + + if digits is None: + digits = get_option("display.precision") + + fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, + float_format=float_format, + formatter=formatter, space=space, + justify=justify) + + return fmt_obj.get_result() + + +class GenericArrayFormatter(object): + + def __init__(self, values, digits=7, formatter=None, na_rep='NaN', + space=12, float_format=None, justify='right'): + self.values = values + self.digits = digits + self.na_rep = na_rep + self.space = space + self.formatter = formatter + self.float_format = float_format + self.justify = justify + + def get_result(self): + fmt_values = self._format_strings() + return _make_fixed_width(fmt_values, self.justify) + + def _format_strings(self): + if self.float_format is None: + float_format = get_option("display.float_format") + if float_format is None: + fmt_str = '%% .%dg' % get_option("display.precision") + float_format = lambda x: fmt_str % x + else: + float_format = self.float_format + + formatter = self.formatter if self.formatter is not None else \ + (lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))) + + def _format(x): + if self.na_rep is not None and lib.checknull(x): + if x is None: + return 'None' + return self.na_rep + elif isinstance(x, PandasObject): + return '%s' % x + else: + # object dtype + return '%s' % formatter(x) + + vals = self.values + + is_float = lib.map_infer(vals, com.is_float) & notnull(vals) + leading_space = is_float.any() + + fmt_values = [] + for i, v in enumerate(vals): + if not is_float[i] and leading_space: + fmt_values.append(' %s' % _format(v)) + elif is_float[i]: + fmt_values.append(float_format(v)) + else: + fmt_values.append(' %s' % _format(v)) + + return fmt_values + + +class FloatArrayFormatter(GenericArrayFormatter): + + """ + + """ + + def __init__(self, *args, **kwargs): + GenericArrayFormatter.__init__(self, *args, **kwargs) + + if self.float_format is not None and self.formatter is None: + self.formatter = self.float_format + + def _format_with(self, fmt_str): + def _val(x, threshold): + if notnull(x): + if (threshold is None or + abs(x) > get_option("display.chop_threshold")): + return fmt_str % x + else: + if fmt_str.endswith("e"): # engineering format + return "0" + else: + return fmt_str % 0 + else: + + return self.na_rep + + threshold = get_option("display.chop_threshold") + fmt_values = [_val(x, threshold) for x in self.values] + return _trim_zeros(fmt_values, self.na_rep) + + def 
_format_strings(self): + if self.formatter is not None: + fmt_values = [self.formatter(x) for x in self.values] + else: + fmt_str = '%% .%df' % (self.digits - 1) + fmt_values = self._format_with(fmt_str) + + if len(fmt_values) > 0: + maxlen = max(len(x) for x in fmt_values) + else: + maxlen = 0 + + too_long = maxlen > self.digits + 5 + + abs_vals = np.abs(self.values) + + # this is pretty arbitrary for now + has_large_values = (abs_vals > 1e8).any() + has_small_values = ((abs_vals < 10 ** (-self.digits)) & + (abs_vals > 0)).any() + + if too_long and has_large_values: + fmt_str = '%% .%de' % (self.digits - 1) + fmt_values = self._format_with(fmt_str) + elif has_small_values: + fmt_str = '%% .%de' % (self.digits - 1) + fmt_values = self._format_with(fmt_str) + + return fmt_values + + +class IntArrayFormatter(GenericArrayFormatter): + + def _format_strings(self): + formatter = self.formatter or (lambda x: '% d' % x) + + fmt_values = [formatter(x) for x in self.values] + + return fmt_values + + +class Datetime64Formatter(GenericArrayFormatter): + def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): + super(Datetime64Formatter, self).__init__(values, **kwargs) + self.nat_rep = nat_rep + self.date_format = date_format + + def _format_strings(self): + formatter = self.formatter or _get_format_datetime64_from_values( + self.values, + nat_rep=self.nat_rep, + date_format=self.date_format) + + fmt_values = [formatter(x) for x in self.values] + + return fmt_values + + +def _format_datetime64(x, tz=None, nat_rep='NaT'): + if x is None or lib.checknull(x): + return nat_rep + + if tz is not None or not isinstance(x, lib.Timestamp): + x = lib.Timestamp(x, tz=tz) + + return str(x) + + +def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): + if x is None or lib.checknull(x): + return nat_rep + + if not isinstance(x, lib.Timestamp): + x = lib.Timestamp(x) + + if date_format: + return x.strftime(date_format) + else: + return x._date_repr + + +def _is_dates_only(values): + for d in values: + if isinstance(d, np.datetime64): + d = lib.Timestamp(d) + + if d is not None and not lib.checknull(d) and d._has_time_component(): + return False + return True + + +def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None): + + if is_dates_only: + return lambda x, tz=None: _format_datetime64_dateonly(x, + nat_rep=nat_rep, + date_format=date_format) + else: + return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) + + +def _get_format_datetime64_from_values(values, + nat_rep='NaT', + date_format=None): + is_dates_only = _is_dates_only(values) + return _get_format_datetime64(is_dates_only=is_dates_only, + nat_rep=nat_rep, + date_format=date_format) + + +class Timedelta64Formatter(GenericArrayFormatter): + + def _format_strings(self): + formatter = self.formatter or _get_format_timedelta64(self.values) + + fmt_values = [formatter(x) for x in self.values] + + return fmt_values + + +def _get_format_timedelta64(values): + values_int = values.astype(np.int64) + + consider_values = values_int != iNaT + + one_day_in_nanos = (86400 * 1e9) + even_days = np.logical_and(consider_values, values_int % one_day_in_nanos != 0).sum() == 0 + all_sub_day = np.logical_and(consider_values, np.abs(values_int) >= one_day_in_nanos).sum() == 0 + + format_short = even_days or all_sub_day + format = "short" if format_short else "long" + + def impl(x): + if x is None or lib.checknull(x): + return 'NaT' + elif format_short and com.is_integer(x) and x.view('int64') == 0: + return "0 days" 
if even_days else "00:00:00" + else: + return lib.repr_timedelta64(x, format=format) + + return impl + + +def _make_fixed_width(strings, justify='right', minimum=None): + if len(strings) == 0 or justify == 'all': + return strings + + _strlen = _strlen_func() + + max_len = np.max([_strlen(x) for x in strings]) + + if minimum is not None: + max_len = max(minimum, max_len) + + conf_max = get_option("display.max_colwidth") + if conf_max is not None and max_len > conf_max: + max_len = conf_max + + if justify == 'left': + justfunc = lambda self, x: self.ljust(x) + else: + justfunc = lambda self, x: self.rjust(x) + + def just(x): + eff_len = max_len + + if conf_max is not None: + if (conf_max > 3) & (_strlen(x) > max_len): + x = x[:eff_len - 3] + '...' + + return justfunc(x, eff_len) + + result = [just(x) for x in strings] + + return result + + +def _trim_zeros(str_floats, na_rep='NaN'): + """ + Trims zeros and decimal points. + """ + trimmed = str_floats + + def _cond(values): + non_na = [x for x in values if x != na_rep] + return (len(non_na) > 0 and all([x.endswith('0') for x in non_na]) and + not(any([('e' in x) or ('E' in x) for x in non_na]))) + + while _cond(trimmed): + trimmed = [x[:-1] if x != na_rep else x for x in trimmed] + + # trim decimal points + return [x[:-1] if x.endswith('.') and x != na_rep else x for x in trimmed] + + +def single_column_table(column, align=None, style=None): + table = '%s' % str(i)) + table += '' + return table + + +def single_row_table(row): # pragma: no cover + table = '' + for i in row: + table += ('' % str(i)) + table += '
%s
' + return table + + +def _has_names(index): + if isinstance(index, MultiIndex): + return any([x is not None for x in index.names]) + else: + return index.name is not None + + +#------------------------------------------------------------------------------ +# Global formatting options + +_initial_defencoding = None + + +def detect_console_encoding(): + """ + Try to find the most capable encoding supported by the console. + slighly modified from the way IPython handles the same issue. + """ + import locale + global _initial_defencoding + + encoding = None + try: + encoding = sys.stdout.encoding or sys.stdin.encoding + except AttributeError: + pass + + # try again for something better + if not encoding or 'ascii' in encoding.lower(): + try: + encoding = locale.getpreferredencoding() + except Exception: + pass + + # when all else fails. this will usually be "ascii" + if not encoding or 'ascii' in encoding.lower(): + encoding = sys.getdefaultencoding() + + # GH3360, save the reported defencoding at import time + # MPL backends may change it. Make available for debugging. + if not _initial_defencoding: + _initial_defencoding = sys.getdefaultencoding() + + return encoding + + +def get_console_size(): + """Return console size as tuple = (width, height). + + Returns (None,None) in non-interactive session. + """ + display_width = get_option('display.width') + # deprecated. + display_height = get_option('display.height', silent=True) + + # Consider + # interactive shell terminal, can detect term size + # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term + # size non-interactive script, should disregard term size + + # in addition + # width,height have default values, but setting to 'None' signals + # should use Auto-Detection, But only in interactive shell-terminal. + # Simple. yeah. + + if com.in_interactive_session(): + if com.in_ipython_frontend(): + # sane defaults for interactive non-shell terminal + # match default for width,height in config_init + from pandas.core.config import get_default_val + terminal_width = get_default_val('display.width') + terminal_height = get_default_val('display.height') + else: + # pure terminal + terminal_width, terminal_height = get_terminal_size() + else: + terminal_width, terminal_height = None, None + + # Note if the User sets width/Height to None (auto-detection) + # and we're in a script (non-inter), this will return (None,None) + # caller needs to deal. + return (display_width or terminal_width, display_height or terminal_height) + + +class EngFormatter(object): + + """ + Formats float values according to engineering format. + + Based on matplotlib.ticker.EngFormatter + """ + + # The SI engineering prefixes + ENG_PREFIXES = { + -24: "y", + -21: "z", + -18: "a", + -15: "f", + -12: "p", + -9: "n", + -6: "u", + -3: "m", + 0: "", + 3: "k", + 6: "M", + 9: "G", + 12: "T", + 15: "P", + 18: "E", + 21: "Z", + 24: "Y" + } + + def __init__(self, accuracy=None, use_eng_prefix=False): + self.accuracy = accuracy + self.use_eng_prefix = use_eng_prefix + + def __call__(self, num): + """ Formats a number in engineering notation, appending a letter + representing the power of 1000 of the original number. 
Some examples: + + >>> format_eng(0) # for self.accuracy = 0 + ' 0' + + >>> format_eng(1000000) # for self.accuracy = 1, + # self.use_eng_prefix = True + ' 1.0M' + + >>> format_eng("-1e-6") # for self.accuracy = 2 + # self.use_eng_prefix = False + '-1.00E-06' + + @param num: the value to represent + @type num: either a numeric value or a string that can be converted to + a numeric value (as per decimal.Decimal constructor) + + @return: engineering formatted string + """ + import decimal + import math + dnum = decimal.Decimal(str(num)) + + sign = 1 + + if dnum < 0: # pragma: no cover + sign = -1 + dnum = -dnum + + if dnum != 0: + pow10 = decimal.Decimal(int(math.floor(dnum.log10() / 3) * 3)) + else: + pow10 = decimal.Decimal(0) + + pow10 = pow10.min(max(self.ENG_PREFIXES.keys())) + pow10 = pow10.max(min(self.ENG_PREFIXES.keys())) + int_pow10 = int(pow10) + + if self.use_eng_prefix: + prefix = self.ENG_PREFIXES[int_pow10] + else: + if int_pow10 < 0: + prefix = 'E-%02d' % (-int_pow10) + else: + prefix = 'E+%02d' % int_pow10 + + mant = sign * dnum / (10 ** pow10) + + if self.accuracy is None: # pragma: no cover + format_str = u("% g%s") + else: + format_str = (u("%% .%if%%s") % self.accuracy) + + formatted = format_str % (mant, prefix) + + return formatted # .strip() + + +def set_eng_float_format(accuracy=3, use_eng_prefix=False): + """ + Alter default behavior on how float is formatted in DataFrame. + Format float in engineering format. By accuracy, we mean the number of + decimal digits after the floating point. + + See also EngFormatter. + """ + + set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix)) + set_option("display.column_space", max(12, accuracy + 9)) + + +def _put_lines(buf, lines): + if any(isinstance(x, compat.text_type) for x in lines): + lines = [compat.text_type(x) for x in lines] + buf.write('\n'.join(lines)) + + +def _binify(cols, line_width): + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins + +if __name__ == '__main__': + arr = np.array([746.03, 0.00, 5620.00, 1592.36]) + # arr = np.array([11111111.1, 1.55]) + # arr = [314200.0034, 1.4125678] + arr = np.array([327763.3119, 345040.9076, 364460.9915, 398226.8688, + 383800.5172, 433442.9262, 539415.0568, 568590.4108, + 599502.4276, 620921.8593, 620898.5294, 552427.1093, + 555221.2193, 519639.7059, 388175.7, 379199.5854, + 614898.25, 504833.3333, 560600., 941214.2857, + 1134250., 1219550., 855736.85, 1042615.4286, + 722621.3043, 698167.1818, 803750.]) + fmt = FloatArrayFormatter(arr, digits=7) + print(fmt.get_result()) diff --git a/pandas/core/frame.py b/pandas/core/frame.py new file mode 100644 index 00000000..b97cb119 --- /dev/null +++ b/pandas/core/frame.py @@ -0,0 +1,4985 @@ +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. 
+ +Similar to its R counterpart, data.frame, except providing automatic data +alignment and a host of useful data manipulation methods having to do with the +labeling information +""" +from __future__ import division +# pylint: disable=E1101,E1103 +# pylint: disable=W0212,W0231,W0703,W0622 + +import functools +import collections +import itertools +import sys +import types +import warnings + +from numpy import nan as NA +import numpy as np +import numpy.ma as ma + +from pandas.core.common import (isnull, notnull, PandasError, _try_sort, + _default_index, _maybe_upcast, _is_sequence, + _infer_dtype_from_scalar, _values_from_object, + is_list_like, _get_dtype) +from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.indexing import (_maybe_droplevels, + _convert_to_index_sliceable, + _check_bool_indexer) +from pandas.core.internals import (BlockManager, + create_block_manager_from_arrays, + create_block_manager_from_blocks) +from pandas.core.series import Series +import pandas.computation.expressions as expressions +from pandas.computation.eval import eval as _eval +from numpy import percentile as _quantile +from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, + OrderedDict, raise_with_traceback) +from pandas import compat +from pandas.util.decorators import deprecate, Appender, Substitution, \ + deprecate_kwarg + +from pandas.tseries.period import PeriodIndex +from pandas.tseries.index import DatetimeIndex + +import pandas.core.algorithms as algos +import pandas.core.common as com +import pandas.core.format as fmt +import pandas.core.nanops as nanops +import pandas.core.ops as ops + +import pandas.lib as lib +import pandas.algos as _algos + +from pandas.core.config import get_option + +#---------------------------------------------------------------------- +# Docstring templates + +_shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame', + axes_single_arg="{0,1,'index','columns'}") + +_numeric_only_doc = """numeric_only : boolean, default None + Include only float, int, boolean data. If None, will attempt to use + everything, then use only numeric data +""" + +_merge_doc = """ +Merge DataFrame objects by performing a database-style join operation by +columns or indexes. + +If joining columns on columns, the DataFrame indexes *will be +ignored*. Otherwise if joining indexes on indexes or indexes on a column or +columns, the index will be passed on. + +Parameters +----------%s +right : DataFrame +how : {'left', 'right', 'outer', 'inner'}, default 'inner' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join) +on : label or list + Field names to join on. Must be found in both DataFrames. If on is + None and not merging on indexes, then it merges on the intersection of + the columns by default. +left_on : label or list, or array-like + Field names to join on in left DataFrame. Can be a vector or list of + vectors of the length of the DataFrame to use a particular vector as + the join key instead of columns +right_on : label or list, or array-like + Field names to join on in right DataFrame or vector/list of vectors per + left_on docs +left_index : boolean, default False + Use the index from the left DataFrame as the join key(s). 
If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels +right_index : boolean, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index +sort : boolean, default False + Sort the join keys lexicographically in the result DataFrame +suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively +copy : boolean, default True + If False, do not copy data unnecessarily + +Examples +-------- + +>>> A >>> B + lkey value rkey value +0 foo 1 0 foo 5 +1 bar 2 1 bar 6 +2 baz 3 2 qux 7 +3 foo 4 3 bar 8 + +>>> merge(A, B, left_on='lkey', right_on='rkey', how='outer') + lkey value_x rkey value_y +0 foo 1 foo 5 +1 foo 4 foo 5 +2 bar 2 bar 6 +3 bar 2 bar 8 +4 baz 3 NaN NaN +5 NaN NaN qux 7 + +Returns +------- +merged : DataFrame +""" + +#---------------------------------------------------------------------- +# DataFrame class + + +class DataFrame(NDFrame): + + """ Two-dimensional size-mutable, potentially heterogeneous tabular data + structure with labeled axes (rows and columns). Arithmetic operations + align on both row and column labels. Can be thought of as a dict-like + container for Series objects. The primary pandas data structure + + Parameters + ---------- + data : numpy ndarray (structured or homogeneous), dict, or DataFrame + Dict can contain Series, arrays, constants, or list-like objects + index : Index or array-like + Index to use for resulting frame. Will default to np.arange(n) if + no indexing information part of input data and no index provided + columns : Index or array-like + Column labels to use for resulting frame. Will default to + np.arange(n) if no column labels are provided + dtype : dtype, default None + Data type to force, otherwise infer + copy : boolean, default False + Copy data from inputs. Only affects DataFrame / 2d ndarray input + + Examples + -------- + >>> d = {'col1': ts1, 'col2': ts2} + >>> df = DataFrame(data=d, index=index) + >>> df2 = DataFrame(np.random.randn(10, 5)) + >>> df3 = DataFrame(np.random.randn(10, 5), + ... 
columns=['a', 'b', 'c', 'd', 'e']) + + See also + -------- + DataFrame.from_records : constructor from tuples, also record arrays + DataFrame.from_dict : from dicts of Series, arrays, or dicts + DataFrame.from_csv : from CSV files + DataFrame.from_items : from sequence of (key, value) pairs + pandas.read_csv, pandas.read_table, pandas.read_clipboard + """ + _auto_consolidate = True + + @property + def _constructor(self): + return DataFrame + + _constructor_sliced = Series + + def __init__(self, data=None, index=None, columns=None, dtype=None, + copy=False): + if data is None: + data = {} + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, DataFrame): + data = data._data + + if isinstance(data, BlockManager): + mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), + dtype=dtype, copy=copy) + elif isinstance(data, dict): + mgr = self._init_dict(data, index, columns, dtype=dtype) + elif isinstance(data, ma.MaskedArray): + import numpy.ma.mrecords as mrecords + # masked recarray + if isinstance(data, mrecords.MaskedRecords): + mgr = _masked_rec_array_to_mgr(data, index, columns, dtype, + copy) + + # a masked array + else: + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = _maybe_upcast(data, copy=True) + data[mask] = fill_value + else: + data = data.copy() + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + + elif isinstance(data, (np.ndarray, Series)): + if data.dtype.names: + data_columns = list(data.dtype.names) + data = dict((k, data[k]) for k in data_columns) + if columns is None: + columns = data_columns + mgr = self._init_dict(data, index, columns, dtype=dtype) + elif getattr(data, 'name', None): + mgr = self._init_dict({data.name: data}, index, columns, + dtype=dtype) + else: + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + elif isinstance(data, (list, types.GeneratorType)): + if isinstance(data, types.GeneratorType): + data = list(data) + if len(data) > 0: + if index is None and isinstance(data[0], Series): + index = _get_names_from_index(data) + + if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: + arrays, columns = _to_arrays(data, columns, dtype=dtype) + columns = _ensure_index(columns) + + if index is None: + index = _default_index(len(data)) + mgr = _arrays_to_mgr(arrays, columns, index, columns, + dtype=dtype) + else: + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + else: + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + elif isinstance(data, collections.Iterator): + raise TypeError("data argument can't be an iterator") + else: + try: + arr = np.array(data, dtype=dtype, copy=copy) + except (ValueError, TypeError) as e: + exc = TypeError('DataFrame constructor called with ' + 'incompatible data and dtype: %s' % e) + raise_with_traceback(exc) + + if arr.ndim == 0 and index is not None and columns is not None: + if isinstance(data, compat.string_types) and dtype is None: + dtype = np.object_ + if dtype is None: + dtype, data = _infer_dtype_from_scalar(data) + + values = np.empty((len(index), len(columns)), dtype=dtype) + values.fill(data) + mgr = self._init_ndarray(values, index, columns, dtype=dtype, + copy=False) + else: + raise PandasError('DataFrame constructor not properly called!') + + NDFrame.__init__(self, mgr, fastpath=True) + + def _init_dict(self, data, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. 
+ Needs to handle a lot of exceptional cases. + """ + if columns is not None: + columns = _ensure_index(columns) + + # prefilter if columns passed + + data = dict((k, v) for k, v in compat.iteritems(data) + if k in columns) + + if index is None: + index = extract_index(list(data.values())) + else: + index = _ensure_index(index) + + arrays = [] + data_names = [] + for k in columns: + if k not in data: + # no obvious "empty" int column + if dtype is not None and issubclass(dtype.type, + np.integer): + continue + + if dtype is None: + # 1783 + v = np.empty(len(index), dtype=object) + else: + v = np.empty(len(index), dtype=dtype) + + v.fill(NA) + else: + v = data[k] + data_names.append(k) + arrays.append(v) + else: + keys = list(data.keys()) + if not isinstance(data, OrderedDict): + keys = _try_sort(keys) + columns = data_names = Index(keys) + arrays = [data[k] for k in keys] + + return _arrays_to_mgr(arrays, data_names, index, columns, + dtype=dtype) + + def _init_ndarray(self, values, index, columns, dtype=None, + copy=False): + if isinstance(values, Series): + if columns is None: + if values.name is not None: + columns = [values.name] + if index is None: + index = values.index + else: + values = values.reindex(index) + + # zero len case (GH #2234) + if not len(values) and columns is not None and len(columns): + values = np.empty((0, 1), dtype=object) + + values = _prep_ndarray(values, copy=copy) + + if dtype is not None: + if values.dtype != dtype: + try: + values = values.astype(dtype) + except Exception as orig: + e = ValueError("failed to cast to '%s' (Exception was: %s)" + % (dtype, orig)) + raise_with_traceback(e) + + N, K = values.shape + + if index is None: + index = _default_index(N) + else: + index = _ensure_index(index) + + if columns is None: + columns = _default_index(K) + else: + columns = _ensure_index(columns) + + return create_block_manager_from_blocks([values.T], [columns, index]) + + @property + def axes(self): + return [self.index, self.columns] + + @property + def shape(self): + return (len(self.index), len(self.columns)) + + def _repr_fits_vertical_(self): + """ + Check length against max_rows. + """ + max_rows = get_option("display.max_rows") + return len(self) <= max_rows + + def _repr_fits_horizontal_(self, ignore_width=False): + """ + Check if full repr fits in horizontal boundaries imposed by the display + options width and max_columns. In case off non-interactive session, no + boundaries apply. + + ignore_width is here so ipnb+HTML output can behave the way + users expect. display.max_columns remains in effect. 
+ GH3541, GH3573 + """ + + width, height = fmt.get_console_size() + max_columns = get_option("display.max_columns") + nb_columns = len(self.columns) + + # exceed max columns + if ((max_columns and nb_columns > max_columns) or + ((not ignore_width) and width and nb_columns > (width // 2))): + return False + + if (ignore_width # used by repr_html under IPython notebook + # scripts ignore terminal dims + or not com.in_interactive_session()): + return True + + if (get_option('display.width') is not None or + com.in_ipython_frontend()): + # check at least the column row for excessive width + max_rows = 1 + else: + max_rows = get_option("display.max_rows") + + # when auto-detecting, so width=None and not in ipython front end + # check whether repr fits horizontal by actualy checking + # the width of the rendered repr + buf = StringIO() + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + + if not (max_rows is None): # unlimited rows + # min of two, where one may be None + d = d.iloc[:min(max_rows, len(d))] + else: + return True + + d.to_string(buf=buf) + value = buf.getvalue() + repr_width = max([len(l) for l in value.split('\n')]) + + return repr_width < width + + def _info_repr(self): + """True if the repr should show the info view.""" + info_repr_option = (get_option("display.large_repr") == "info") + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) + + def __unicode__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + buf = StringIO(u("")) + if self._info_repr(): + self.info(buf=buf) + return buf.getvalue() + + max_rows = get_option("display.max_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + if get_option("display.expand_frame_repr"): + width, _ = fmt.get_console_size() + else: + width = None + self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols, + line_width=width, show_dimensions=show_dimensions) + + return buf.getvalue() + + def _repr_html_(self): + """ + Return a html representation for a particular DataFrame. + Mainly for IPython notebook. + """ + # qtconsole doesn't report it's line width, and also + # behaves badly when outputting an HTML table + # that doesn't fit the window, so disable it. + # XXX: In IPython 3.x and above, the Qt console will not attempt to + # display HTML, so this check can be removed when support for IPython 2.x + # is no longer needed. + if com.in_qtconsole(): + # 'HTML output is disabled in QtConsole' + return None + + if self._info_repr(): + buf = StringIO(u("")) + self.info(buf=buf) + # need to escape the , should be the first line. + val = buf.getvalue().replace('<', r'<', 1).replace('>', + r'>', 1) + return '
<pre>' + val + '</pre>
' + + if get_option("display.notebook_repr_html"): + max_rows = get_option("display.max_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + + return ('
<div style="max-height:1000px;max-width:1500px;overflow:auto;">\n' + + self.to_html(max_rows=max_rows, max_cols=max_cols, + show_dimensions=show_dimensions) + '\n</div>
') + else: + return None + + def iteritems(self): + """Iterator over (column, series) pairs""" + if self.columns.is_unique and hasattr(self, '_item_cache'): + for k in self.columns: + yield k, self._get_item_cache(k) + else: + for i, k in enumerate(self.columns): + yield k, self.icol(i) + + def iterrows(self): + """ + Iterate over rows of DataFrame as (index, Series) pairs. + + Notes + ----- + + * ``iterrows`` does **not** preserve dtypes across the rows (dtypes + are preserved across columns for DataFrames). For example, + + >>> df = DataFrame([[1, 1.0]], columns=['x', 'y']) + >>> row = next(df.iterrows())[1] + >>> print(row['x'].dtype) + float64 + >>> print(df['x'].dtype) + int64 + + Returns + ------- + it : generator + A generator that iterates over the rows of the frame. + """ + columns = self.columns + for k, v in zip(self.index, self.values): + s = Series(v, index=columns, name=k) + yield k, s + + def itertuples(self, index=True): + """ + Iterate over rows of DataFrame as tuples, with index value + as first element of the tuple + """ + arrays = [] + if index: + arrays.append(self.index) + + # use integer indexing because of possible duplicate column names + arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) + return zip(*arrays) + + if compat.PY3: # pragma: no cover + items = iteritems + + def __len__(self): + """Returns length of info axis, but here we use the index """ + return len(self.index) + + def dot(self, other): + """ + Matrix multiplication with DataFrame or Series objects + + Parameters + ---------- + other : DataFrame or Series + + Returns + ------- + dot_product : DataFrame or Series + """ + if isinstance(other, (Series, DataFrame)): + common = self.columns.union(other.index) + if (len(common) > len(self.columns) or + len(common) > len(other.index)): + raise ValueError('matrices are not aligned') + + left = self.reindex(columns=common, copy=False) + right = other.reindex(index=common, copy=False) + lvals = left.values + rvals = right.values + else: + left = self + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[1] != rvals.shape[0]: + raise ValueError('Dot product shape mismatch, %s vs %s' % + (lvals.shape, rvals.shape)) + + if isinstance(other, DataFrame): + return self._constructor(np.dot(lvals, rvals), + index=left.index, + columns=other.columns) + elif isinstance(other, Series): + return Series(np.dot(lvals, rvals), index=left.index) + elif isinstance(rvals, np.ndarray): + result = np.dot(lvals, rvals) + if result.ndim == 2: + return self._constructor(result, index=left.index) + else: + return Series(result, index=left.index) + else: # pragma: no cover + raise TypeError('unsupported type: %s' % type(other)) + + #---------------------------------------------------------------------- + # IO methods (to / from other formats) + + @classmethod + def from_dict(cls, data, orient='columns', dtype=None): + """ + Construct DataFrame from dict of array-like or dicts + + Parameters + ---------- + data : dict + {field : array-like} or {field : dict} + orient : {'columns', 'index'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. 
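        For example (illustrative; key and column order may vary with dict
        ordering):

        >>> d = {'row1': {'a': 1, 'b': 2}, 'row2': {'a': 3, 'b': 4}}
        >>> DataFrame.from_dict(d, orient='index')    # 'row1'/'row2' as index
        >>> DataFrame.from_dict(d, orient='columns')  # 'row1'/'row2' as columns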
+ + Returns + ------- + DataFrame + """ + index, columns = None, None + orient = orient.lower() + if orient == 'index': + if len(data) > 0: + # TODO speed up Series case + if isinstance(list(data.values())[0], (Series, dict)): + data = _from_nested_dict(data) + else: + data, index = list(data.values()), list(data.keys()) + elif orient != 'columns': # pragma: no cover + raise ValueError('only recognize index or columns for orient') + + return cls(data, index=index, columns=columns, dtype=dtype) + + def to_dict(self, outtype='dict'): + """ + Convert DataFrame to dictionary. + + Parameters + ---------- + outtype : str {'dict', 'list', 'series', 'records'} + Determines the type of the values of the dictionary. The + default `dict` is a nested dictionary {column -> {index -> value}}. + `list` returns {column -> list(values)}. `series` returns + {column -> Series(values)}. `records` returns [{columns -> value}]. + Abbreviations are allowed. + + + Returns + ------- + result : dict like {column -> {index -> value}} + """ + if not self.columns.is_unique: + warnings.warn("DataFrame columns are not unique, some " + "columns will be omitted.", UserWarning) + if outtype.lower().startswith('d'): + return dict((k, v.to_dict()) for k, v in compat.iteritems(self)) + elif outtype.lower().startswith('l'): + return dict((k, v.tolist()) for k, v in compat.iteritems(self)) + elif outtype.lower().startswith('s'): + return dict((k, v) for k, v in compat.iteritems(self)) + elif outtype.lower().startswith('r'): + return [dict((k, v) for k, v in zip(self.columns, row)) + for row in self.values] + else: # pragma: no cover + raise ValueError("outtype %s not understood" % outtype) + + def to_gbq(self, destination_table, project_id=None, chunksize=10000, + verbose=True, reauth=False): + """Write a DataFrame to a Google BigQuery table. + + THIS IS AN EXPERIMENTAL LIBRARY + + If the table exists, the dataframe will be written to the table using + the defined table schema and column types. For simplicity, this method + uses the Google BigQuery streaming API. The to_gbq method chunks data + into a default chunk size of 10,000. Failures return the complete error + response which can be quite long depending on the size of the insert. + There are several important limitations of the Google streaming API + which are detailed at: + https://developers.google.com/bigquery/streaming-data-into-bigquery. + + Parameters + ---------- + dataframe : DataFrame + DataFrame to be written + destination_table : string + Name of table to be written, in the form 'dataset.tablename' + project_id : str + Google BigQuery Account project ID. + chunksize : int (default 10000) + Number of rows to be inserted in each chunk from the dataframe. + verbose : boolean (default True) + Show percentage complete + reauth : boolean (default False) + Force Google BigQuery to reauthenticate the user. This is useful + if multiple accounts are used. 
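        Example (illustrative; dataset, table and project names are
        placeholders):

        >>> df.to_gbq('my_dataset.my_table', project_id='my-project-id')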
+ + """ + + from pandas.io import gbq + return gbq.to_gbq(self, destination_table, project_id=project_id, + chunksize=chunksize, verbose=verbose, + reauth=reauth) + + @classmethod + def from_records(cls, data, index=None, exclude=None, columns=None, + coerce_float=False, nrows=None): + """ + Convert structured or record ndarray to DataFrame + + Parameters + ---------- + data : ndarray (structured dtype), list of tuples, dict, or DataFrame + index : string, list of fields, array-like + Field of array to use as the index, alternately a specific set of + input labels to use + exclude : sequence, default None + Columns or fields to exclude + columns : sequence, default None + Column names to use. If the passed data do not have names + associated with them, this argument provides names for the + columns. Otherwise this argument indicates the order of the columns + in the result (any names not found in the data will become all-NA + columns) + coerce_float : boolean, default False + Attempt to convert values to non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets + + Returns + ------- + df : DataFrame + """ + # Make a copy of the input columns so we can modify it + if columns is not None: + columns = _ensure_index(columns) + + if com.is_iterator(data): + if nrows == 0: + return cls() + + try: + if compat.PY3: + first_row = next(data) + else: + first_row = next(data) + except StopIteration: + return cls(index=index, columns=columns) + + dtype = None + if hasattr(first_row, 'dtype') and first_row.dtype.names: + dtype = first_row.dtype + + values = [first_row] + + if nrows is None: + values += data + else: + values.extend(itertools.islice(data, nrows - 1)) + + if dtype is not None: + data = np.array(values, dtype=dtype) + else: + data = values + + if isinstance(data, dict): + if columns is None: + columns = arr_columns = _ensure_index(sorted(data)) + arrays = [data[k] for k in columns] + else: + arrays = [] + arr_columns = [] + for k, v in compat.iteritems(data): + if k in columns: + arr_columns.append(k) + arrays.append(v) + + arrays, arr_columns = _reorder_arrays(arrays, arr_columns, + columns) + + elif isinstance(data, (np.ndarray, DataFrame)): + arrays, columns = _to_arrays(data, columns) + if columns is not None: + columns = _ensure_index(columns) + arr_columns = columns + else: + arrays, arr_columns = _to_arrays(data, columns, + coerce_float=coerce_float) + + arr_columns = _ensure_index(arr_columns) + if columns is not None: + columns = _ensure_index(columns) + else: + columns = arr_columns + + if exclude is None: + exclude = set() + else: + exclude = set(exclude) + + result_index = None + if index is not None: + if (isinstance(index, compat.string_types) or + not hasattr(index, "__iter__")): + i = columns.get_loc(index) + exclude.add(index) + if len(arrays) > 0: + result_index = Index(arrays[i], name=index) + else: + result_index = Index([], name=index) + else: + try: + to_remove = [arr_columns.get_loc(field) for field in index] + + result_index = MultiIndex.from_arrays( + [arrays[i] for i in to_remove], names=index) + + exclude.update(index) + except Exception: + result_index = index + + if any(exclude): + arr_exclude = [x for x in exclude if x in arr_columns] + to_remove = [arr_columns.get_loc(col) for col in arr_exclude] + arrays = [v for i, v in enumerate(arrays) if i not in to_remove] + + arr_columns = arr_columns.drop(arr_exclude) + columns = columns.drop(exclude) + + mgr = _arrays_to_mgr(arrays, arr_columns, result_index, + columns) + + 
return cls(mgr) + + def to_records(self, index=True, convert_datetime64=True): + """ + Convert DataFrame to record array. Index will be put in the + 'index' field of the record array if requested + + Parameters + ---------- + index : boolean, default True + Include index in resulting record array, stored in 'index' field + convert_datetime64 : boolean, default True + Whether to convert the index to datetime.datetime if it is a + DatetimeIndex + + Returns + ------- + y : recarray + """ + if index: + if com.is_datetime64_dtype(self.index) and convert_datetime64: + ix_vals = [self.index.to_pydatetime()] + else: + if isinstance(self.index, MultiIndex): + # array of tuples to numpy cols. copy copy copy + ix_vals = lmap(np.array, zip(*self.index.values)) + else: + ix_vals = [self.index.values] + + arrays = ix_vals + [self[c].values for c in self.columns] + + count = 0 + index_names = list(self.index.names) + if isinstance(self.index, MultiIndex): + for i, n in enumerate(index_names): + if n is None: + index_names[i] = 'level_%d' % count + count += 1 + elif index_names[0] is None: + index_names = ['index'] + names = index_names + lmap(str, self.columns) + else: + arrays = [self[c].values for c in self.columns] + names = lmap(str, self.columns) + + dtype = np.dtype([(x, v.dtype) for x, v in zip(names, arrays)]) + return np.rec.fromarrays(arrays, dtype=dtype, names=names) + + @classmethod + def from_items(cls, items, columns=None, orient='columns'): + """ + Convert (key, value) pairs to DataFrame. The keys will be the axis + index (usually the columns, but depends on the specified + orientation). The values should be arrays or Series. + + Parameters + ---------- + items : sequence of (key, value) pairs + Values should be arrays or Series. + columns : sequence of column labels, optional + Must be passed if orient='index'. + orient : {'columns', 'index'}, default 'columns' + The "orientation" of the data. If the keys of the + input correspond to column labels, pass 'columns' + (default). Otherwise if the keys correspond to the index, + pass 'index'. 
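# Illustrative sketch (editor's addition, not part of the patch): a structured-array
# round trip through from_records / to_records.  The record dtype and values are
# assumptions made up for this example.
import numpy as np
from pandas import DataFrame

rec = np.array([(1, 2.0, 'a'), (2, 4.0, 'b')],
               dtype=[('id', 'i8'), ('val', 'f8'), ('tag', 'O')])
df = DataFrame.from_records(rec, index='id')   # the 'id' field becomes the index
back = df.to_records()                          # index comes back as the first field
print(back.dtype.names)                         # ('id', 'val', 'tag')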
+ + Returns + ------- + frame : DataFrame + """ + keys, values = lzip(*items) + + if orient == 'columns': + if columns is not None: + columns = _ensure_index(columns) + + idict = dict(items) + if len(idict) < len(items): + if not columns.equals(_ensure_index(keys)): + raise ValueError('With non-unique item names, passed ' + 'columns must be identical') + arrays = values + else: + arrays = [idict[k] for k in columns if k in idict] + else: + columns = _ensure_index(keys) + arrays = values + + return cls._from_arrays(arrays, columns, None) + elif orient == 'index': + if columns is None: + raise TypeError("Must pass columns with orient='index'") + + keys = _ensure_index(keys) + + arr = np.array(values, dtype=object).T + data = [lib.maybe_convert_objects(v) for v in arr] + return cls._from_arrays(data, columns, keys) + else: # pragma: no cover + raise ValueError("'orient' must be either 'columns' or 'index'") + + @classmethod + def _from_arrays(cls, arrays, columns, index, dtype=None): + mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + return cls(mgr) + + @classmethod + def from_csv(cls, path, header=0, sep=',', index_col=0, + parse_dates=True, encoding=None, tupleize_cols=False, + infer_datetime_format=False): + """ + Read delimited file into DataFrame + + Parameters + ---------- + path : string file path or file handle / StringIO + header : int, default 0 + Row to use at header (skip prior rows) + sep : string, default ',' + Field delimiter + index_col : int or sequence, default 0 + Column to use for index. If a sequence is given, a MultiIndex + is used. Different default from read_table + parse_dates : boolean, default True + Parse dates. Different default from read_table + tupleize_cols : boolean, default False + write multi_index columns as a list of tuples (if True) + or new (expanded format) if False) + infer_datetime_format: boolean, default False + If True and `parse_dates` is True for a column, try to infer the + datetime format based on the first datetime string. If the format + can be inferred, there often will be a large parsing speed-up. + + Notes + ----- + Preferable to use read_table for most general purposes but from_csv + makes for an easy roundtrip to and from file, especially with a + DataFrame of time series data + + Returns + ------- + y : DataFrame + """ + from pandas.io.parsers import read_table + return read_table(path, header=header, sep=sep, + parse_dates=parse_dates, index_col=index_col, + encoding=encoding, tupleize_cols=tupleize_cols, + infer_datetime_format=infer_datetime_format) + + def to_sparse(self, fill_value=None, kind='block'): + """ + Convert to SparseDataFrame + + Parameters + ---------- + fill_value : float, default NaN + kind : {'block', 'integer'} + + Returns + ------- + y : SparseDataFrame + """ + from pandas.core.sparse import SparseDataFrame + return SparseDataFrame(self._series, index=self.index, + default_kind=kind, + default_fill_value=fill_value) + + def to_panel(self): + """ + Transform long (stacked) format (DataFrame) into wide (3D, Panel) + format. + + Currently the index of the DataFrame must be a 2-level MultiIndex. 
This + may be generalized later + + Returns + ------- + panel : Panel + """ + from pandas.core.panel import Panel + from pandas.core.reshape import block2d_to_blocknd + + # only support this kind for now + if (not isinstance(self.index, MultiIndex) or # pragma: no cover + len(self.index.levels) != 2): + raise NotImplementedError('Only 2-level MultiIndex are supported.') + + if not self.index.is_unique: + raise ValueError("Can't convert non-uniquely indexed " + "DataFrame to Panel") + + self._consolidate_inplace() + + # minor axis must be sorted + if self.index.lexsort_depth < 2: + selfsorted = self.sortlevel(0) + else: + selfsorted = self + + major_axis, minor_axis = selfsorted.index.levels + + major_labels, minor_labels = selfsorted.index.labels + + shape = len(major_axis), len(minor_axis) + + new_blocks = [] + for block in selfsorted._data.blocks: + newb = block2d_to_blocknd( + values=block.values.T, + placement=block.mgr_locs, shape=shape, + labels=[major_labels, minor_labels], + ref_items=selfsorted.columns) + new_blocks.append(newb) + + # preserve names, if any + major_axis = major_axis.copy() + major_axis.name = self.index.names[0] + + minor_axis = minor_axis.copy() + minor_axis.name = self.index.names[1] + + new_axes = [selfsorted.columns, major_axis, minor_axis] + new_mgr = create_block_manager_from_blocks(new_blocks, new_axes) + + return Panel(new_mgr) + + to_wide = deprecate('to_wide', to_panel) + + @deprecate_kwarg(old_arg_name='cols', new_arg_name='columns') + def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, + columns=None, header=True, index=True, index_label=None, + mode='w', encoding=None, quoting=None, + quotechar='"', line_terminator='\n', chunksize=None, + tupleize_cols=False, date_format=None, doublequote=True, + escapechar=None, **kwds): + r"""Write DataFrame to a comma-separated values (csv) file + + Parameters + ---------- + path_or_buf : string or file handle, default None + File path or object, if None is provided the result is returned as + a string. + sep : character, default "," + Field delimiter for the output file. + na_rep : string, default '' + Missing data representation + float_format : string, default None + Format string for floating point numbers + columns : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out column names. If a list of string is given it is assumed + to be aliases for the column names + index : boolean, default True + Write row names (index) + index_label : string or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. If + False do not print fields for index names. 
Use index_label=False + for easier importing in R + nanRep : None + deprecated, use na_rep + mode : str + Python write mode, default 'w' + encoding : string, optional + a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + line_terminator : string, default '\\n' + The newline character or character sequence to use in the output + file + quoting : optional constant from csv module + defaults to csv.QUOTE_MINIMAL + quotechar : string (length 1), default '"' + character used to quote fields + doublequote : boolean, default True + Control quoting of `quotechar` inside a field + escapechar : string (length 1), default None + character used to escape `sep` and `quotechar` when appropriate + chunksize : int or None + rows to write at a time + tupleize_cols : boolean, default False + write multi_index columns as a list of tuples (if True) + or new (expanded format) if False) + date_format : string, default None + Format string for datetime objects + cols : kwarg only alias of columns [deprecated] + """ + + formatter = fmt.CSVFormatter(self, path_or_buf, + line_terminator=line_terminator, + sep=sep, encoding=encoding, + quoting=quoting, na_rep=na_rep, + float_format=float_format, cols=columns, + header=header, index=index, + index_label=index_label, mode=mode, + chunksize=chunksize, quotechar=quotechar, + engine=kwds.get("engine"), + tupleize_cols=tupleize_cols, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar) + formatter.save() + + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + + @deprecate_kwarg(old_arg_name='cols', new_arg_name='columns') + def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', + float_format=None, columns=None, header=True, index=True, + index_label=None, startrow=0, startcol=0, engine=None, + merge_cells=True, encoding=None, inf_rep='inf'): + """ + Write DataFrame to a excel sheet + + Parameters + ---------- + excel_writer : string or ExcelWriter object + File path or existing ExcelWriter + sheet_name : string, default 'Sheet1' + Name of sheet which will contain DataFrame + na_rep : string, default '' + Missing data representation + float_format : string, default None + Format string for floating point numbers + columns : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : boolean, default True + Write row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + startrow : + upper left cell row to dump data frame + startcol : + upper left cell column to dump data frame + engine : string, default None + write engine to use - you can also set this via the options + ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and + ``io.excel.xlsm.writer``. + merge_cells : boolean, default True + Write MultiIndex and Hierarchical Rows as merged cells. + encoding: string, default None + encoding of the resulting excel file. Only necessary for xlwt, + other writers support unicode natively. 
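# Illustrative sketch (editor's addition, not part of the patch): to_csv writes to a
# path, or returns the text when path_or_buf is None; from_csv reads it back with
# index_col=0 by default.  The file name and frame are made up for this example.
from pandas import DataFrame

df = DataFrame({'a': [1, 2], 'b': [3.5, 4.5]})
df.to_csv('example.csv')                         # index written as the first column
text = df.to_csv(None, sep=';', na_rep='NA')     # path_or_buf=None: returned as a string
roundtrip = DataFrame.from_csv('example.csv')    # restores the index from column 0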
+ cols : kwarg only alias of columns [deprecated] + inf_rep : string, default 'inf' + Representation for infinity (there is no native representation for + infinity in Excel) + + Notes + ----- + If passing an existing ExcelWriter object, then the sheet will be added + to the existing workbook. This can be used to save different + DataFrames to one workbook: + + >>> writer = ExcelWriter('output.xlsx') + >>> df1.to_excel(writer,'Sheet1') + >>> df2.to_excel(writer,'Sheet2') + >>> writer.save() + """ + from pandas.io.excel import ExcelWriter + + need_save = False + if encoding == None: + encoding = 'ascii' + + if isinstance(excel_writer, compat.string_types): + excel_writer = ExcelWriter(excel_writer, engine=engine) + need_save = True + + formatter = fmt.ExcelFormatter(self, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep) + formatted_cells = formatter.get_formatted_cells() + excel_writer.write_cells(formatted_cells, sheet_name, + startrow=startrow, startcol=startcol) + if need_save: + excel_writer.save() + + def to_stata( + self, fname, convert_dates=None, write_index=True, encoding="latin-1", + byteorder=None, time_stamp=None, data_label=None): + """ + A class for writing Stata binary dta files from array-like objects + + Parameters + ---------- + fname : file path or buffer + Where to save the dta file. + convert_dates : dict + Dictionary mapping column of datetime types to the stata internal + format that you want to use for the dates. Options are + 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a + number or a name. + encoding : str + Default is latin-1. Note that Stata does not support unicode. + byteorder : str + Can be ">", "<", "little", or "big". The default is None which uses + `sys.byteorder` + + Examples + -------- + >>> writer = StataWriter('./data_file.dta', data) + >>> writer.write_file() + + Or with dates + + >>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'}) + >>> writer.write_file() + """ + from pandas.io.stata import StataWriter + writer = StataWriter(fname, self, convert_dates=convert_dates, + encoding=encoding, byteorder=byteorder, + time_stamp=time_stamp, data_label=data_label, + write_index=write_index) + writer.write_file() + + @Appender(fmt.docstring_to_string, indents=1) + def to_string(self, buf=None, columns=None, col_space=None, colSpace=None, + header=True, index=True, na_rep='NaN', formatters=None, + float_format=None, sparsify=None, index_names=True, + justify=None, line_width=None, max_rows=None, max_cols=None, + show_dimensions=False): + """ + Render a DataFrame to a console-friendly tabular output. 
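# Illustrative sketch (editor's addition, not part of the patch): writing a Stata
# .dta file with to_stata, mapping a datetime column to Stata's 'td' date format.
# The file name and the column 'when' are assumptions made up for this example.
from pandas import DataFrame, date_range

df = DataFrame({'x': [1.0, 2.0],
                'when': date_range('2014-01-01', periods=2)})
df.to_stata('example.dta', convert_dates={'when': 'td'}, write_index=False)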
+ """ + + if colSpace is not None: # pragma: no cover + warnings.warn("colSpace is deprecated, use col_space", + FutureWarning) + col_space = colSpace + + formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, index=index, + line_width=line_width, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions) + formatter.to_string() + + if buf is None: + result = formatter.buf.getvalue() + return result + + @Appender(fmt.docstring_to_string, indents=1) + def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, + header=True, index=True, na_rep='NaN', formatters=None, + float_format=None, sparsify=None, index_names=True, + justify=None, bold_rows=True, classes=None, escape=True, + max_rows=None, max_cols=None, show_dimensions=False): + """ + Render a DataFrame as an HTML table. + + `to_html`-specific options: + + bold_rows : boolean, default True + Make the row labels bold in the output + classes : str or list or tuple, default None + CSS class(es) to apply to the resulting html table + escape : boolean, default True + Convert the characters <, >, and & to HTML-safe sequences.= + max_rows : int, optional + Maximum number of rows to show before truncating. If None, show + all. + max_cols : int, optional + Maximum number of columns to show before truncating. If None, show + all. + + """ + + if colSpace is not None: # pragma: no cover + warnings.warn("colSpace is deprecated, use col_space", + FutureWarning) + col_space = colSpace + + formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, index=index, + bold_rows=bold_rows, + escape=escape, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions) + formatter.to_html(classes=classes) + + if buf is None: + return formatter.buf.getvalue() + + @Appender(fmt.docstring_to_string, indents=1) + def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, + header=True, index=True, na_rep='NaN', formatters=None, + float_format=None, sparsify=None, index_names=True, + bold_rows=True, longtable=False, escape=True): + """ + Render a DataFrame to a tabular environment table. You can splice + this into a LaTeX document. Requires \\usepackage{booktabs}. + + `to_latex`-specific options: + + bold_rows : boolean, default True + Make the row labels bold in the output + longtable : boolean, default False + Use a longtable environment instead of tabular. Requires adding + a \\usepackage{longtable} to your LaTeX preamble. + escape : boolean, default True + When set to False prevents from escaping latex special + characters in column names. 
+ + """ + + if colSpace is not None: # pragma: no cover + warnings.warn("colSpace is deprecated, use col_space", + FutureWarning) + col_space = colSpace + + formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + header=header, index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + index_names=index_names, + escape=escape) + formatter.to_latex(longtable=longtable) + + if buf is None: + return formatter.buf.getvalue() + + def info(self, verbose=None, buf=None, max_cols=None): + """ + Concise summary of a DataFrame. + + Parameters + ---------- + verbose : {None, True, False}, optional + Whether to print the full summary. + None follows the `display.max_info_columns` setting. + True or False overrides the `display.max_info_columns` setting. + buf : writable buffer, defaults to sys.stdout + max_cols : int, default None + Determines whether full summary or short summary is printed. + None follows the `display.max_info_columns` setting. + """ + from pandas.core.format import _put_lines + + if buf is None: # pragma: no cover + buf = sys.stdout + + lines = [] + + lines.append(str(type(self))) + lines.append(self.index.summary()) + + if len(self.columns) == 0: + lines.append('Empty %s' % type(self).__name__) + _put_lines(buf, lines) + return + + cols = self.columns + + # hack + if max_cols is None: + max_cols = get_option( + 'display.max_info_columns', len(self.columns) + 1) + + max_rows = get_option('display.max_info_rows', len(self) + 1) + + show_counts = ((len(self.columns) <= max_cols) and + (len(self) < max_rows)) + exceeds_info_cols = len(self.columns) > max_cols + + def _verbose_repr(): + lines.append('Data columns (total %d columns):' % + len(self.columns)) + space = max([len(com.pprint_thing(k)) for k in self.columns]) + 4 + counts = None + + tmpl = "%s%s" + if show_counts: + counts = self.count() + if len(cols) != len(counts): # pragma: no cover + raise AssertionError('Columns must equal counts (%d != %d)' % + (len(cols), len(counts))) + tmpl = "%s non-null %s" + + dtypes = self.dtypes + for i, col in enumerate(self.columns): + dtype = dtypes[col] + col = com.pprint_thing(col) + + count = "" + if show_counts: + count = counts.iloc[i] + + lines.append(_put_str(col, space) + + tmpl % (count, dtype)) + + def _non_verbose_repr(): + lines.append(self.columns.summary(name='Columns')) + + if verbose: + _verbose_repr() + elif verbose is False: # specifically set to False, not nesc None + _non_verbose_repr() + else: + if exceeds_info_cols: + _non_verbose_repr() + else: + _verbose_repr() + + counts = self.get_dtype_counts() + dtypes = ['%s(%d)' % k for k in sorted(compat.iteritems(counts))] + lines.append('dtypes: %s' % ', '.join(dtypes)) + _put_lines(buf, lines) + + def transpose(self): + """Transpose index and columns""" + return super(DataFrame, self).transpose(1, 0) + + T = property(transpose) + + #---------------------------------------------------------------------- + # Picklability + + # legacy pickle formats + def _unpickle_frame_compat(self, state): # pragma: no cover + from pandas.core.common import _unpickle_array + if len(state) == 2: # pragma: no cover + series, idx = state + columns = sorted(series) + else: + series, cols, idx = state + columns = _unpickle_array(cols) + + index = _unpickle_array(idx) + self._data = self._init_dict(series, index, columns, None) + + def _unpickle_matrix_compat(self, state): # pragma: no cover + from pandas.core.common import _unpickle_array + # 
old unpickling + (vals, idx, cols), object_state = state + + index = _unpickle_array(idx) + dm = DataFrame(vals, index=index, columns=_unpickle_array(cols), + copy=False) + + if object_state is not None: + ovals, _, ocols = object_state + objects = DataFrame(ovals, index=index, + columns=_unpickle_array(ocols), + copy=False) + + dm = dm.join(objects) + + self._data = dm._data + + #---------------------------------------------------------------------- + #---------------------------------------------------------------------- + # Getting and setting elements + + def get_value(self, index, col, takeable=False): + """ + Quickly retrieve single value at passed column and index + + Parameters + ---------- + index : row label + col : column label + takeable : interpret the index/col as indexers, default False + + Returns + ------- + value : scalar value + """ + + if takeable: + series = self._iget_item_cache(col) + return series.values[index] + + series = self._get_item_cache(col) + engine = self.index._engine + return engine.get_value(series.values, index) + + def set_value(self, index, col, value, takeable=False): + """ + Put single value at passed column and index + + Parameters + ---------- + index : row label + col : column label + value : scalar value + takeable : interpret the index/col as indexers, default False + + Returns + ------- + frame : DataFrame + If label pair is contained, will be reference to calling DataFrame, + otherwise a new object + """ + try: + if takeable is True: + series = self._iget_item_cache(col) + return series.set_value(index, value, takeable=True) + + series = self._get_item_cache(col) + engine = self.index._engine + engine.set_value(series.values, index, value) + return self + except KeyError: + + # set using a non-recursive method & reset the cache + self.loc[index, col] = value + self._item_cache.pop(col, None) + + return self + + def irow(self, i, copy=False): + return self._ixs(i, axis=0) + + def icol(self, i): + return self._ixs(i, axis=1) + + def _ixs(self, i, axis=0): + """ + i : int, slice, or sequence of integers + axis : int + """ + + # irow + if axis == 0: + + """ + Notes + ----- + If slice passed, the resulting data will be a view + """ + + if isinstance(i, slice): + return self[i] + else: + label = self.index[i] + if isinstance(label, Index): + # a location index by definition + result = self.take(i, axis=axis) + copy=True + else: + new_values = self._data.fast_xs(i) + + # if we are a copy, mark as such + copy = isinstance(new_values,np.ndarray) and new_values.base is None + result = Series(new_values, index=self.columns, + name=self.index[i], dtype=new_values.dtype) + result._set_is_copy(self, copy=copy) + return result + + # icol + else: + + """ + Notes + ----- + If slice passed, the resulting data will be a view + """ + + label = self.columns[i] + if isinstance(i, slice): + # need to return view + lab_slice = slice(label[0], label[-1]) + return self.ix[:, lab_slice] + else: + label = self.columns[i] + if isinstance(label, Index): + return self.take(i, axis=1, convert=True) + + # if the values returned are not the same length + # as the index (iow a not found value), iget returns + # a 0-len ndarray. 
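# Illustrative sketch (editor's addition, not part of the patch): get_value and
# set_value are the fast scalar accessors; with takeable=True the row/col arguments
# are positional indexers.  The frame `df` is made up for this example.
from pandas import DataFrame

df = DataFrame({'a': [1, 2], 'b': [3, 4]}, index=['r1', 'r2'])
print(df.get_value('r2', 'b'))             # 4
df.set_value('r1', 'a', 10)                # sets in place when the label pair exists
print(df.get_value(0, 1, takeable=True))   # positional lookup -> 3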
This is effectively catching + # a numpy error (as numpy should really raise) + values = self._data.iget(i) + if not len(values): + values = np.array([np.nan] * len(self.index), dtype=object) + result = self._constructor_sliced.from_array( + values, index=self.index, + name=label, fastpath=True) + + # this is a cached value, mark it so + result._set_as_cached(label, self) + + return result + + def iget_value(self, i, j): + return self.iat[i, j] + + def __getitem__(self, key): + + # shortcut if we are an actual column + is_mi_columns = isinstance(self.columns, MultiIndex) + try: + if key in self.columns and not is_mi_columns: + return self._getitem_column(key) + except: + pass + + # see if we can slice the rows + indexer = _convert_to_index_sliceable(self, key) + if indexer is not None: + return self._getitem_slice(indexer) + + if isinstance(key, (Series, np.ndarray, list)): + # either boolean or fancy integer index + return self._getitem_array(key) + elif isinstance(key, DataFrame): + return self._getitem_frame(key) + elif is_mi_columns: + return self._getitem_multilevel(key) + else: + return self._getitem_column(key) + + def _getitem_column(self, key): + """ return the actual column """ + + # get column + if self.columns.is_unique: + return self._get_item_cache(key) + + # duplicate columns & possible reduce dimensionaility + result = self._constructor(self._data.get(key)) + if result.columns.is_unique: + result = result[key] + + return result + + def _getitem_slice(self, key): + return self._slice(key, axis=0) + + def _getitem_array(self, key): + # also raises Exception if object array with NA values + if com._is_bool_indexer(key): + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + warnings.warn("Boolean Series key will be reindexed to match " + "DataFrame index.", UserWarning) + elif len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d.' 
% + (len(key), len(self.index))) + # _check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = _check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + return self.take(indexer, axis=0, convert=False) + else: + indexer = self.ix._convert_to_indexer(key, axis=1) + return self.take(indexer, axis=1, convert=True) + + def _getitem_multilevel(self, key): + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, Series, np.ndarray)): + new_columns = self.columns[loc] + result_columns = _maybe_droplevels(new_columns, key) + if self._is_mixed_type: + result = self.reindex(columns=new_columns) + result.columns = result_columns + else: + new_values = self.values[:, loc] + result = DataFrame(new_values, index=self.index, + columns=result_columns).__finalize__(self) + if len(result.columns) == 1: + top = result.columns[0] + if ((type(top) == str and top == '') or + (type(top) == tuple and top[0] == '')): + result = result[''] + if isinstance(result, Series): + result = Series(result, index=self.index, name=key) + + result._set_is_copy(self) + return result + else: + return self._get_item_cache(key) + + def _getitem_frame(self, key): + if key.values.dtype != np.bool_: + raise ValueError('Must pass DataFrame with boolean values only') + return self.where(key) + + def query(self, expr, **kwargs): + """Query the columns of a frame with a boolean expression. + + .. versionadded:: 0.13 + + Parameters + ---------- + expr : string + The query string to evaluate. You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + kwargs : dict + See the documentation for :func:`pandas.eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. + + Returns + ------- + q : DataFrame + + Notes + ----- + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`pandas.eval` function to + evaluate the passed query. + + The :meth:`~pandas.DataFrame.query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. + + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. + + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for the frame index; you can also + use the name of the index to identify it in a query. + + For further details and examples see the ``query`` documentation in + :ref:`indexing `. 
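# Illustrative sketch (editor's addition, not part of the patch): query evaluates a
# boolean expression against the columns; '@' pulls in variables from the calling
# scope, and engine='python' evaluates without numexpr.  `df` and `cutoff` are
# made up for this example.
from numpy.random import randn
from pandas import DataFrame

df = DataFrame(randn(10, 2), columns=list('ab'))
cutoff = 0.5
subset = df.query('a > b and b < @cutoff')
same = df[(df.a > df.b) & (df.b < cutoff)]     # equivalent boolean-mask spelling
subset2 = df.query('a > b', engine='python')   # plain Python evaluation backend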
+ + See Also + -------- + pandas.eval + DataFrame.eval + + Examples + -------- + >>> from numpy.random import randn + >>> from pandas import DataFrame + >>> df = DataFrame(randn(10, 2), columns=list('ab')) + >>> df.query('a > b') + >>> df[df.a > df.b] # same result as the previous expression + """ + kwargs['level'] = kwargs.pop('level', 0) + 1 + res = self.eval(expr, **kwargs) + + try: + return self.loc[res] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + return self[res] + + def eval(self, expr, **kwargs): + """Evaluate an expression in the context of the calling DataFrame + instance. + + Parameters + ---------- + expr : string + The expression string to evaluate. + kwargs : dict + See the documentation for :func:`~pandas.eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ret : ndarray, scalar, or pandas object + + See Also + -------- + pandas.DataFrame.query + pandas.eval + + Notes + ----- + For more details see the API documentation for :func:`~pandas.eval`. + For detailed examples see :ref:`enhancing performance with eval + `. + + Examples + -------- + >>> from numpy.random import randn + >>> from pandas import DataFrame + >>> df = DataFrame(randn(10, 2), columns=list('ab')) + >>> df.eval('a + b') + >>> df.eval('c = a + b') + """ + resolvers = kwargs.pop('resolvers', None) + kwargs['level'] = kwargs.pop('level', 0) + 1 + if resolvers is None: + index_resolvers = self._get_index_resolvers() + resolvers = dict(self.iteritems()), index_resolvers + kwargs['target'] = self + kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers + return _eval(expr, **kwargs) + + def select_dtypes(self, include=None, exclude=None): + """Return a subset of a DataFrame including/excluding columns based on + their ``dtype``. + + Parameters + ---------- + include, exclude : list-like + A list of dtypes or strings to be included/excluded. You must pass + in a non-empty sequence for at least one of these. + + Raises + ------ + ValueError + * If both of ``include`` and ``exclude`` are empty + * If ``include`` and ``exclude`` have overlapping elements + * If any kind of string dtype is passed in. + TypeError + * If either of ``include`` or ``exclude`` is not a sequence + + Returns + ------- + subset : DataFrame + The subset of the frame including the dtypes in ``include`` and + excluding the dtypes in ``exclude``. + + Notes + ----- + * To select all *numeric* types use the numpy dtype ``numpy.number`` + * To select strings you must use the ``object`` dtype, but note that + this will return *all* object dtype columns + * See the `numpy dtype hierarchy + `__ + + Examples + -------- + >>> df = pd.DataFrame({'a': np.random.randn(6).astype('f4'), + ... 'b': [True, False] * 3, + ... 
'c': [1.0, 2.0] * 3}) + >>> df + a b c + 0 0.3962 True 1 + 1 0.1459 False 2 + 2 0.2623 True 1 + 3 0.0764 False 2 + 4 -0.9703 True 1 + 5 -1.2094 False 2 + >>> df.select_dtypes(include=['float64']) + c + 0 1 + 1 2 + 2 1 + 3 2 + 4 1 + 5 2 + >>> df.select_dtypes(exclude=['floating']) + b + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + """ + include, exclude = include or (), exclude or () + if not (com.is_list_like(include) and com.is_list_like(exclude)): + raise TypeError('include and exclude must both be non-string' + ' sequences') + selection = tuple(map(frozenset, (include, exclude))) + + if not any(selection): + raise ValueError('at least one of include or exclude must be ' + 'nonempty') + + # convert the myriad valid dtypes object to a single representation + include, exclude = map(lambda x: + frozenset(map(com._get_dtype_from_object, x)), + selection) + for dtypes in (include, exclude): + com._invalidate_string_dtypes(dtypes) + + # can't both include AND exclude! + if not include.isdisjoint(exclude): + raise ValueError('include and exclude overlap on %s' + % (include & exclude)) + + # empty include/exclude -> defaults to True + # three cases (we've already raised if both are empty) + # case 1: empty include, nonempty exclude + # we have True, True, ... True for include, same for exclude + # in the loop below we get the excluded + # and when we call '&' below we get only the excluded + # case 2: nonempty include, empty exclude + # same as case 1, but with include + # case 3: both nonempty + # the "union" of the logic of case 1 and case 2: + # we get the included and excluded, and return their logical and + include_these = Series(not bool(include), index=self.columns) + exclude_these = Series(not bool(exclude), index=self.columns) + + def is_dtype_instance_mapper(column, dtype): + return column, functools.partial(issubclass, dtype.type) + + for column, f in itertools.starmap(is_dtype_instance_mapper, + self.dtypes.iteritems()): + if include: # checks for the case of empty include or exclude + include_these[column] = any(map(f, include)) + if exclude: + exclude_these[column] = not any(map(f, exclude)) + + dtype_indexer = include_these & exclude_these + return self.loc[com._get_info_slice(self, dtype_indexer)] + + def _box_item_values(self, key, values): + items = self.columns[self.columns.get_loc(key)] + if values.ndim == 2: + return self._constructor(values.T, columns=items, index=self.index) + else: + return self._box_col_values(values, items) + + def _box_col_values(self, values, items): + """ provide boxed values for a column """ + return self._constructor_sliced.from_array(values, index=self.index, + name=items, fastpath=True) + + def __setitem__(self, key, value): + + # see if we can slice the rows + indexer = _convert_to_index_sliceable(self, key) + if indexer is not None: + return self._setitem_slice(indexer, value) + + if isinstance(key, (Series, np.ndarray, list)): + self._setitem_array(key, value) + elif isinstance(key, DataFrame): + self._setitem_frame(key, value) + else: + # set column + self._set_item(key, value) + + def _setitem_slice(self, key, value): + self._check_setitem_copy() + self.ix._setitem_with_indexer(key, value) + + def _setitem_array(self, key, value): + # also raises Exception if object array with NA values + if com._is_bool_indexer(key): + if len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d!' 
% + (len(key), len(self.index))) + key = _check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + self._check_setitem_copy() + self.ix._setitem_with_indexer(indexer, value) + else: + if isinstance(value, DataFrame): + if len(value.columns) != len(key): + raise ValueError('Columns must be same length as key') + for k1, k2 in zip(key, value.columns): + self[k1] = value[k2] + else: + indexer = self.ix._convert_to_indexer(key, axis=1) + self._check_setitem_copy() + self.ix._setitem_with_indexer((slice(None), indexer), value) + + def _setitem_frame(self, key, value): + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 + if key.values.dtype != np.bool_: + raise TypeError('Must pass DataFrame with boolean values only') + + self._check_inplace_setting(value) + self._check_setitem_copy() + self.where(-key, value, inplace=True) + + def _ensure_valid_index(self, value): + """ + ensure that if we don't have an index, that we can create one from the + passed value + """ + if not len(self.index): + + # GH5632, make sure that we are a Series convertible + if is_list_like(value): + try: + value = Series(value) + except: + pass + + if not isinstance(value, Series): + raise ValueError('Cannot set a frame with no defined index ' + 'and a value that cannot be converted to a ' + 'Series') + + self._data = self._data.reindex_axis(value.index.copy(), axis=1, + fill_value=np.nan) + + # we are a scalar + # noop + else: + + pass + + def _set_item(self, key, value): + """ + Add series to DataFrame in specified column. + + If series is a numpy-array (not a Series/TimeSeries), it must be the + same length as the DataFrame's index or an error will be thrown. + + Series/TimeSeries will be conformed to the DataFrame's index to + ensure homogeneity. + """ + + is_existing = key in self.columns + self._ensure_valid_index(value) + value = self._sanitize_column(key, value) + NDFrame._set_item(self, key, value) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exeption to occur first + if is_existing: + self._check_setitem_copy() + + def insert(self, loc, column, value, allow_duplicates=False): + """ + Insert column into DataFrame at specified location. + + If `allow_duplicates` is False, raises Exception if column + is already contained in the DataFrame. 
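# Illustrative sketch (editor's addition, not part of the patch): assigning through
# a boolean DataFrame key routes through _setitem_frame, i.e. df[mask] = value is a
# where() on the inverted mask; plain column assignment goes through _set_item.
# The frame `df` is made up for this example.
from pandas import DataFrame

df = DataFrame({'a': [1, -2, 3], 'b': [-4, 5, -6]})
df[df < 0] = 0                  # boolean DataFrame key: masked cells are replaced
df['c'] = df['a'] + df['b']     # ordinary column assignment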
+ + Parameters + ---------- + loc : int + Must have 0 <= loc <= len(columns) + column : object + value : int, Series, or array-like + """ + self._ensure_valid_index(value) + value = self._sanitize_column(column, value) + self._data.insert( + loc, column, value, allow_duplicates=allow_duplicates) + + def _sanitize_column(self, key, value): + # Need to make sure new columns (which go into the BlockManager as new + # blocks) are always copied + + if isinstance(value, (Series, DataFrame)): + is_frame = isinstance(value, DataFrame) + if value.index.equals(self.index) or not len(self.index): + # copy the values + value = value.values.copy() + else: + + # GH 4107 + try: + value = value.reindex(self.index).values + except Exception as e: + + # duplicate axis + if not value.index.is_unique: + raise e + + # other + raise TypeError('incompatible index of inserted column ' + 'with frame index') + + if is_frame: + value = value.T + elif isinstance(value, Index) or _is_sequence(value): + if len(value) != len(self.index): + raise ValueError('Length of values does not match length of ' + 'index') + + if not isinstance(value, (np.ndarray, Index)): + if isinstance(value, list) and len(value) > 0: + value = com._possibly_convert_platform(value) + else: + value = com._asarray_tuplesafe(value) + elif isinstance(value, PeriodIndex): + value = value.asobject + elif isinstance(value, DatetimeIndex): + value = value._to_embed(keep_tz=True).copy() + elif value.ndim == 2: + value = value.copy().T + else: + value = value.copy() + else: + # upcast the scalar + dtype, value = _infer_dtype_from_scalar(value) + value = np.repeat(value, len(self.index)).astype(dtype) + value = com._possibly_cast_to_datetime(value, dtype) + + # broadcast across multiple columns if necessary + if key in self.columns and value.ndim == 1: + if not self.columns.is_unique or isinstance(self.columns, + MultiIndex): + existing_piece = self[key] + if isinstance(existing_piece, DataFrame): + value = np.tile(value, (len(existing_piece.columns), 1)) + + return np.atleast_2d(np.asarray(value)) + + @property + def _series(self): + result = {} + for idx, item in enumerate(self.columns): + result[item] = Series(self._data.iget(idx), index=self.index, + name=item) + return result + + def lookup(self, row_labels, col_labels): + """Label-based "fancy indexing" function for DataFrame. + Given equal-length arrays of row and column labels, return an + array of the values corresponding to each (row, col) pair. 
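# Illustrative sketch (editor's addition, not part of the patch): insert places a new
# column at a given position; a scalar value is broadcast, and a duplicate name is
# only allowed with allow_duplicates=True.  The frame `df` is made up for this example.
from pandas import DataFrame

df = DataFrame({'a': [1, 2], 'c': [5, 6]})
df.insert(1, 'b', [3, 4])                      # columns are now ['a', 'b', 'c']
df.insert(3, 'b', 0, allow_duplicates=True)    # scalar broadcast, duplicate name permitted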
+ + Parameters + ---------- + row_labels : sequence + The row labels to use for lookup + col_labels : sequence + The column labels to use for lookup + + Notes + ----- + Akin to:: + + result = [] + for row, col in zip(row_labels, col_labels): + result.append(df.get_value(row, col)) + + Examples + -------- + values : ndarray + The found values + + """ + n = len(row_labels) + if n != len(col_labels): + raise ValueError('Row labels must have same size as column labels') + + thresh = 1000 + if not self._is_mixed_type or n > thresh: + values = self.values + ridx = self.index.get_indexer(row_labels) + cidx = self.columns.get_indexer(col_labels) + if (ridx == -1).any(): + raise KeyError('One or more row labels was not found') + if (cidx == -1).any(): + raise KeyError('One or more column labels was not found') + flat_index = ridx * len(self.columns) + cidx + result = values.flat[flat_index] + else: + result = np.empty(n, dtype='O') + for i, (r, c) in enumerate(zip(row_labels, col_labels)): + result[i] = self.get_value(r, c) + + if result.dtype == 'O': + result = lib.maybe_convert_objects(result) + + return result + + #---------------------------------------------------------------------- + # Reindexing and alignment + + def _reindex_axes(self, axes, level, limit, method, fill_value, copy): + frame = self + + columns = axes['columns'] + if columns is not None: + frame = frame._reindex_columns(columns, copy, level, fill_value, + limit) + + index = axes['index'] + if index is not None: + frame = frame._reindex_index(index, method, copy, level, + fill_value, limit) + + return frame + + def _reindex_index(self, new_index, method, copy, level, fill_value=NA, + limit=None): + new_index, indexer = self.index.reindex(new_index, method, level, + limit=limit, + copy_if_needed=True) + return self._reindex_with_indexers({0: [new_index, indexer]}, + copy=copy, fill_value=fill_value, + allow_dups=False) + + def _reindex_columns(self, new_columns, copy, level, fill_value=NA, + limit=None): + new_columns, indexer = self.columns.reindex(new_columns, level=level, + limit=limit, + copy_if_needed=True) + return self._reindex_with_indexers({1: [new_columns, indexer]}, + copy=copy, fill_value=fill_value, + allow_dups=False) + + def _reindex_multi(self, axes, copy, fill_value): + """ we are guaranteed non-Nones in the axes! 
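# Illustrative sketch (editor's addition, not part of the patch): lookup pulls one
# value per (row label, column label) pair, equivalent to calling get_value in a
# loop.  The frame `df` is made up for this example.
from pandas import DataFrame

df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=['x', 'y', 'z'])
vals = df.lookup(['x', 'z', 'y'], ['b', 'a', 'b'])   # array([4, 3, 5])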
""" + + new_index, row_indexer = self.index.reindex(axes['index']) + new_columns, col_indexer = self.columns.reindex(axes['columns']) + + if row_indexer is not None and col_indexer is not None: + indexer = row_indexer, col_indexer + new_values = com.take_2d_multi(self.values, indexer, + fill_value=fill_value) + return self._constructor(new_values, index=new_index, + columns=new_columns) + else: + return self._reindex_with_indexers({0: [new_index, row_indexer], + 1: [new_columns, col_indexer]}, + copy=copy, + fill_value=fill_value) + + @Appender(_shared_docs['reindex'] % _shared_doc_kwargs) + def reindex(self, index=None, columns=None, **kwargs): + return super(DataFrame, self).reindex(index=index, columns=columns, + **kwargs) + + @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) + def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, + limit=None, fill_value=np.nan): + return super(DataFrame, self).reindex_axis(labels=labels, axis=axis, + method=method, level=level, + copy=copy, limit=limit, + fill_value=fill_value) + + @Appender(_shared_docs['rename'] % _shared_doc_kwargs) + def rename(self, index=None, columns=None, **kwargs): + return super(DataFrame, self).rename(index=index, columns=columns, + **kwargs) + + def set_index(self, keys, drop=True, append=False, inplace=False, + verify_integrity=False): + """ + Set the DataFrame index (row labels) using one or more existing + columns. By default yields a new object. + + Parameters + ---------- + keys : column label or list of column labels / arrays + drop : boolean, default True + Delete columns to be used as the new index + append : boolean, default False + Whether to append columns to existing index + inplace : boolean, default False + Modify the DataFrame in place (do not create a new object) + verify_integrity : boolean, default False + Check the new index for duplicates. Otherwise defer the check until + necessary. 
Setting to False will improve the performance of this + method + + Examples + -------- + >>> indexed_df = df.set_index(['A', 'B']) + >>> indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]]) + >>> indexed_df3 = df.set_index([[0, 1, 2, 0, 1, 2]]) + + Returns + ------- + dataframe : DataFrame + """ + if not isinstance(keys, list): + keys = [keys] + + if inplace: + frame = self + else: + frame = self.copy() + + arrays = [] + names = [] + if append: + names = [x for x in self.index.names] + if isinstance(self.index, MultiIndex): + for i in range(self.index.nlevels): + arrays.append(self.index.get_level_values(i)) + else: + arrays.append(self.index) + + to_remove = [] + for col in keys: + if isinstance(col, MultiIndex): + # append all but the last column so we don't have to modify + # the end of this loop + for n in range(col.nlevels - 1): + arrays.append(col.get_level_values(n)) + + level = col.get_level_values(col.nlevels - 1) + names.extend(col.names) + elif isinstance(col, Series): + level = col.values + names.append(col.name) + elif isinstance(col, Index): + level = col + names.append(col.name) + elif isinstance(col, (list, np.ndarray)): + level = col + names.append(None) + else: + level = frame[col].values + names.append(col) + if drop: + to_remove.append(col) + arrays.append(level) + + index = MultiIndex.from_arrays(arrays, names=names) + + if verify_integrity and not index.is_unique: + duplicates = index.get_duplicates() + raise ValueError('Index has duplicate keys: %s' % duplicates) + + for c in to_remove: + del frame[c] + + # clear up memory usage + index._cleanup() + + frame.index = index + + if not inplace: + return frame + + def reset_index(self, level=None, drop=False, inplace=False, col_level=0, + col_fill=''): + """ + For DataFrame with multi-level index, return new DataFrame with + labeling information in the columns under the index names, defaulting + to 'level_0', 'level_1', etc. if any are None. For a standard index, + the index name will be used (if set), otherwise a default 'index' or + 'level_0' (if 'index' is already taken) will be used. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. Removes all levels by + default + drop : boolean, default False + Do not try to insert index into dataframe columns. This resets + the index to the default integer index. + inplace : boolean, default False + Modify the DataFrame in place (do not create a new object) + col_level : int or str, default 0 + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill : object, default '' + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. 
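# Illustrative sketch (editor's addition, not part of the patch): set_index builds a
# (Multi)Index from existing columns and reset_index moves index levels back into
# the columns.  The frame `df` is made up for this example.
from pandas import DataFrame

df = DataFrame({'A': list('aab'), 'B': [1, 2, 3], 'C': [1., 2., 3.]})
indexed = df.set_index(['A', 'B'])                    # two-level MultiIndex; A and B dropped
kept = df.set_index('A', drop=False, append=True)     # keep the column, append a level
flat = indexed.reset_index()                          # A and B become columns again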
+ + Returns + ------- + resetted : DataFrame + """ + if inplace: + new_obj = self + else: + new_obj = self.copy() + + def _maybe_casted_values(index, labels=None): + if isinstance(index, PeriodIndex): + values = index.asobject + elif (isinstance(index, DatetimeIndex) and + index.tz is not None): + values = index.asobject + else: + values = index.values + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) + + # if we have the labels, extract the values with a mask + if labels is not None: + mask = labels == -1 + values = values.take(labels) + if mask.any(): + values, changed = com._maybe_upcast_putmask(values, + mask, np.nan) + return values + + new_index = np.arange(len(new_obj)) + if isinstance(self.index, MultiIndex): + if level is not None: + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if len(level) < len(self.index.levels): + new_index = self.index.droplevel(level) + + if not drop: + names = self.index.names + zipped = lzip(self.index.levels, self.index.labels) + + multi_col = isinstance(self.columns, MultiIndex) + for i, (lev, lab) in reversed(list(enumerate(zipped))): + col_name = names[i] + if col_name is None: + col_name = 'level_%d' % i + + if multi_col: + if col_fill is None: + col_name = tuple([col_name] * + self.columns.nlevels) + else: + name_lst = [col_fill] * self.columns.nlevels + lev_num = self.columns._get_level_number(col_level) + name_lst[lev_num] = col_name + col_name = tuple(name_lst) + + # to ndarray and maybe infer different dtype + level_values = _maybe_casted_values(lev, lab) + if level is None or i in level: + new_obj.insert(0, col_name, level_values) + + elif not drop: + name = self.index.name + if name is None or name == 'index': + name = 'index' if 'index' not in self else 'level_0' + if isinstance(self.columns, MultiIndex): + if col_fill is None: + name = tuple([name] * self.columns.nlevels) + else: + name_lst = [col_fill] * self.columns.nlevels + lev_num = self.columns._get_level_number(col_level) + name_lst[lev_num] = name + name = tuple(name_lst) + values = _maybe_casted_values(self.index) + new_obj.insert(0, name, values) + + new_obj.index = new_index + if not inplace: + return new_obj + + delevel = deprecate('delevel', reset_index) + + #---------------------------------------------------------------------- + # Reindex-based selection methods + + def dropna(self, axis=0, how='any', thresh=None, subset=None, + inplace=False): + """ + Return object with labels on given axis omitted where alternately any + or all of the data are missing + + Parameters + ---------- + axis : {0, 1}, or tuple/list thereof + Pass tuple or list to drop on multiple axes + how : {'any', 'all'} + * any : if any NA values are present, drop that label + * all : if all values are NA, drop that label + thresh : int, default None + int value : require that many non-NA values + subset : array-like + Labels along other axis to consider, e.g. if you are dropping rows + these would be a list of columns to include + inplace : boolean, defalt False + If True, do operation inplace and return None. 
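# Illustrative sketch (editor's addition, not part of the patch): how the `how`,
# `thresh` and `subset` arguments of dropna interact.  The frame `df` is made up
# for this example.
import numpy as np
from pandas import DataFrame

df = DataFrame({'a': [1., np.nan, 3.],
                'b': [np.nan, np.nan, 3.],
                'c': [1., 2., 3.]})
any_na = df.dropna()                             # how='any': drops rows 0 and 1
all_na = df.dropna(how='all', subset=['a', 'b'])  # drop only where both a and b are NaN
enough = df.dropna(thresh=2)                     # keep rows with at least 2 non-NA values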
+ + Returns + ------- + dropped : DataFrame + """ + if isinstance(axis, (tuple, list)): + result = self + for ax in axis: + result = result.dropna(how=how, thresh=thresh, + subset=subset, axis=ax) + else: + axis = self._get_axis_number(axis) + agg_axis = 1 - axis + + agg_obj = self + if subset is not None: + ax = self._get_axis(agg_axis) + agg_obj = self.take(ax.get_indexer_for(subset),axis=agg_axis) + + count = agg_obj.count(axis=agg_axis) + + if thresh is not None: + mask = count >= thresh + elif how == 'any': + mask = count == len(agg_obj._get_axis(agg_axis)) + elif how == 'all': + mask = count > 0 + else: + if how is not None: + raise ValueError('invalid how option: %s' % how) + else: + raise TypeError('must specify how or thresh') + + result = self.take(mask.nonzero()[0], axis=axis, convert=False) + + if inplace: + self._update_inplace(result) + else: + return result + + @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset') + def drop_duplicates(self, subset=None, take_last=False, inplace=False): + """ + Return DataFrame with duplicate rows removed, optionally only + considering certain columns + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns + take_last : boolean, default False + Take the last observed row in a row. Defaults to the first row + inplace : boolean, default False + Whether to drop duplicates in place or to return a copy + cols : kwargs only argument of subset [deprecated] + + Returns + ------- + deduplicated : DataFrame + """ + duplicated = self.duplicated(subset, take_last=take_last) + + if inplace: + inds, = (-duplicated).nonzero() + new_data = self._data.take(inds) + self._update_inplace(new_data) + else: + return self[-duplicated] + + @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset') + def duplicated(self, subset=None, take_last=False): + """ + Return boolean Series denoting duplicate rows, optionally only + considering certain columns + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns + take_last : boolean, default False + Take the last observed row in a row. Defaults to the first row + cols : kwargs only argument of subset [deprecated] + + Returns + ------- + duplicated : Series + """ + # kludge for #1833 + def _m8_to_i8(x): + if issubclass(x.dtype.type, np.datetime64): + return x.view(np.int64) + return x + + if subset is None: + values = list(_m8_to_i8(self.values.T)) + else: + if np.iterable(subset) and not isinstance(subset, compat.string_types): + if isinstance(subset, tuple): + if subset in self.columns: + values = [self[subset].values] + else: + values = [_m8_to_i8(self[x].values) for x in subset] + else: + values = [_m8_to_i8(self[x].values) for x in subset] + else: + values = [self[subset].values] + + keys = lib.fast_zip_fillna(values) + duplicated = lib.duplicated(keys, take_last=take_last) + return Series(duplicated, index=self.index) + + #---------------------------------------------------------------------- + # Sorting + + def sort(self, columns=None, axis=0, ascending=True, + inplace=False, kind='quicksort', na_position='last'): + """ + Sort DataFrame either by labels (along either axis) or by the values in + column(s) + + Parameters + ---------- + columns : object + Column name(s) in frame. Accepts a column name or a list + for a nested sort. 
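# Illustrative sketch (editor's addition, not part of the patch): duplicated flags
# repeated rows and drop_duplicates removes them; take_last keeps the last
# occurrence instead of the first.  The frame `df` is made up for this example.
from pandas import DataFrame

df = DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
print(df.duplicated())                      # False, True, False
print(df.drop_duplicates(take_last=True))   # keeps rows 1 and 2
print(df.drop_duplicates(subset=['a']))     # compare on column 'a' only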
A tuple will be interpreted as the + levels of a multi-index. + ascending : boolean or list, default True + Sort ascending vs. descending. Specify list for multiple sort + orders + axis : {0, 1} + Sort index/rows versus columns + inplace : boolean, default False + Sort the DataFrame without creating a new instance + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + This option is only applied when sorting on a single column or label. + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + + Examples + -------- + >>> result = df.sort(['A', 'B'], ascending=[1, 0]) + + Returns + ------- + sorted : DataFrame + """ + return self.sort_index(by=columns, axis=axis, ascending=ascending, + inplace=inplace, kind=kind, na_position=na_position) + + def sort_index(self, axis=0, by=None, ascending=True, inplace=False, + kind='quicksort', na_position='last'): + """ + Sort DataFrame either by labels (along either axis) or by the values in + a column + + Parameters + ---------- + axis : {0, 1} + Sort index/rows versus columns + by : object + Column name(s) in frame. Accepts a column name or a list + for a nested sort. A tuple will be interpreted as the + levels of a multi-index. + ascending : boolean or list, default True + Sort ascending vs. descending. Specify list for multiple sort + orders + inplace : boolean, default False + Sort the DataFrame without creating a new instance + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + kind : {'quicksort', 'mergesort', 'heapsort'}, optional + This option is only applied when sorting on a single column or label. + + Examples + -------- + >>> result = df.sort_index(by=['A', 'B'], ascending=[True, False]) + + Returns + ------- + sorted : DataFrame + """ + + from pandas.core.groupby import _lexsort_indexer, _nargsort + axis = self._get_axis_number(axis) + if axis not in [0, 1]: # pragma: no cover + raise AssertionError('Axis must be 0 or 1, got %s' % str(axis)) + + labels = self._get_axis(axis) + + if by is not None: + if axis != 0: + raise ValueError('When sorting by column, axis must be 0 ' + '(rows)') + if not isinstance(by, list): + by = [by] + if com._is_sequence(ascending) and len(by) != len(ascending): + raise ValueError('Length of ascending (%d) != length of by' + ' (%d)' % (len(ascending), len(by))) + if len(by) > 1: + def trans(v): + if com.needs_i8_conversion(v): + return v.view('i8') + return v + keys = [] + for x in by: + k = self[x].values + if k.ndim == 2: + raise ValueError('Cannot sort by duplicate column %s' % str(x)) + keys.append(trans(k)) + indexer = _lexsort_indexer(keys, orders=ascending, + na_position=na_position) + indexer = com._ensure_platform_int(indexer) + else: + by = by[0] + k = self[by].values + if k.ndim == 2: + + # try to be helpful + if isinstance(self.columns, MultiIndex): + raise ValueError('Cannot sort by column %s in a multi-index' + ' you need to explicity provide all the levels' + % str(by)) + + raise ValueError('Cannot sort by duplicate column %s' + % str(by)) + if isinstance(ascending, (tuple, list)): + ascending = ascending[0] + indexer = _nargsort(k, kind=kind, ascending=ascending, + na_position=na_position) + + elif isinstance(labels, MultiIndex): + indexer = _lexsort_indexer(labels.labels, orders=ascending, + na_position=na_position) + indexer = com._ensure_platform_int(indexer) + else: + indexer = _nargsort(labels, kind=kind, ascending=ascending, + 
na_position=na_position) + + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.take(indexer, axis=bm_axis, + convert=False, verify=False) + + if inplace: + return self._update_inplace(new_data) + else: + return self._constructor(new_data).__finalize__(self) + + def sortlevel(self, level=0, axis=0, ascending=True, + inplace=False, sort_remaining=True): + """ + Sort multilevel index by chosen axis and primary level. Data will be + lexicographically sorted by the chosen level followed by the other + levels (in order) + + Parameters + ---------- + level : int + axis : {0, 1} + ascending : boolean, default True + inplace : boolean, default False + Sort the DataFrame without creating a new instance + sort_remaining : boolean, default True + Sort by the other levels too. + + Returns + ------- + sorted : DataFrame + """ + axis = self._get_axis_number(axis) + the_axis = self._get_axis(axis) + if not isinstance(the_axis, MultiIndex): + raise TypeError('can only sort by level with a hierarchical index') + + new_axis, indexer = the_axis.sortlevel(level, ascending=ascending, + sort_remaining=sort_remaining) + + if self._is_mixed_type and not inplace: + ax = 'index' if axis == 0 else 'columns' + + if new_axis.is_unique: + return self.reindex(**{ax: new_axis}) + else: + return self.take(indexer, axis=axis, convert=False) + + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.take(indexer, axis=bm_axis, + convert=False, verify=False) + if inplace: + return self._update_inplace(new_data) + else: + return self._constructor(new_data).__finalize__(self) + + def swaplevel(self, i, j, axis=0): + """ + Swap levels i and j in a MultiIndex on a particular axis + + Parameters + ---------- + i, j : int, string (can be mixed) + Level of index to be swapped. Can pass level name as string. + + Returns + ------- + swapped : type of caller (new object) + """ + result = self.copy() + + axis = self._get_axis_number(axis) + if axis == 0: + result.index = result.index.swaplevel(i, j) + else: + result.columns = result.columns.swaplevel(i, j) + return result + + def reorder_levels(self, order, axis=0): + """ + Rearrange index levels using input order. + May not drop or duplicate levels + + Parameters + ---------- + order : list of int or list of str + List representing new level order. Reference level by number + (position) or by key (label). + axis : int + Where to reorder levels. 
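+
+        Examples
+        --------
+        A minimal sketch, assuming a frame with a two-level index (the level
+        names are illustrative only):
+
+        >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b')],
+        ...                                   names=['outer', 'inner'])
+        >>> df = DataFrame({'x': [1, 2]}, index=index)
+        >>> df.reorder_levels(['inner', 'outer'])  # same data, levels swapped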
+ + Returns + ------- + type of caller (new object) + """ + axis = self._get_axis_number(axis) + if not isinstance(self._get_axis(axis), + MultiIndex): # pragma: no cover + raise TypeError('Can only reorder levels on a hierarchical axis.') + + result = self.copy() + + if axis == 0: + result.index = result.index.reorder_levels(order) + else: + result.columns = result.columns.reorder_levels(order) + return result + + #---------------------------------------------------------------------- + # Arithmetic / combination related + + def _combine_frame(self, other, func, fill_value=None, level=None): + this, other = self.align(other, join='outer', level=level, copy=False) + new_index, new_columns = this.index, this.columns + + def _arith_op(left, right): + if fill_value is not None: + left_mask = isnull(left) + right_mask = isnull(right) + left = left.copy() + right = right.copy() + + # one but not both + mask = left_mask ^ right_mask + left[left_mask & mask] = fill_value + right[right_mask & mask] = fill_value + + return func(left, right) + + if this._is_mixed_type or other._is_mixed_type: + + # unique + if this.columns.is_unique: + + def f(col): + r = _arith_op(this[col].values, other[col].values) + return self._constructor_sliced(r, index=new_index, + dtype=r.dtype) + + result = dict([(col, f(col)) for col in this]) + + # non-unique + else: + + def f(i): + r = _arith_op(this.iloc[:, i].values, + other.iloc[:, i].values) + return self._constructor_sliced(r, index=new_index, + dtype=r.dtype) + + result = dict([ + (i, f(i)) for i, col in enumerate(this.columns) + ]) + result = self._constructor(result, index=new_index, copy=False) + result.columns = new_columns + return result + + else: + result = _arith_op(this.values, other.values) + + return self._constructor(result, index=new_index, + columns=new_columns, copy=False) + + def _combine_series(self, other, func, fill_value=None, axis=None, + level=None): + if axis is not None: + axis = self._get_axis_name(axis) + if axis == 'index': + return self._combine_match_index(other, func, level=level, fill_value=fill_value) + else: + return self._combine_match_columns(other, func, level=level, fill_value=fill_value) + return self._combine_series_infer(other, func, level=level, fill_value=fill_value) + + def _combine_series_infer(self, other, func, level=None, fill_value=None): + if len(other) == 0: + return self * NA + + if len(self) == 0: + # Ambiguous case, use _series so works with DataFrame + return self._constructor(data=self._series, index=self.index, + columns=self.columns) + + # teeny hack because one does DataFrame + TimeSeries all the time + if self.index.is_all_dates and other.index.is_all_dates: + warnings.warn(("TimeSeries broadcasting along DataFrame index " + "by default is deprecated. Please use " + "DataFrame. to explicitly broadcast arithmetic " + "operations along the index"), + FutureWarning) + return self._combine_match_index(other, func, level=level, fill_value=fill_value) + else: + return self._combine_match_columns(other, func, level=level, fill_value=fill_value) + + def _combine_match_index(self, other, func, level=None, fill_value=None): + left, right = self.align(other, join='outer', axis=0, level=level, copy=False) + if fill_value is not None: + raise NotImplementedError("fill_value %r not supported." 
% + fill_value) + return self._constructor(func(left.values.T, right.values).T, + index=left.index, + columns=self.columns, copy=False) + + def _combine_match_columns(self, other, func, level=None, fill_value=None): + left, right = self.align(other, join='outer', axis=1, level=level, copy=False) + if fill_value is not None: + raise NotImplementedError("fill_value %r not supported" % + fill_value) + + new_data = left._data.eval( + func=func, other=right, axes=[left.columns, self.index]) + return self._constructor(new_data) + + def _combine_const(self, other, func, raise_on_error=True): + if self.empty: + return self + + new_data = self._data.eval(func=func, other=other, raise_on_error=raise_on_error) + return self._constructor(new_data) + + def _compare_frame_evaluate(self, other, func, str_rep): + + # unique + if self.columns.is_unique: + def _compare(a, b): + return dict([(col, func(a[col], b[col])) for col in a.columns]) + new_data = expressions.evaluate(_compare, str_rep, self, other) + return self._constructor(data=new_data, index=self.index, + columns=self.columns, copy=False) + # non-unique + else: + def _compare(a, b): + return dict([(i, func(a.iloc[:, i], b.iloc[:, i])) + for i, col in enumerate(a.columns)]) + new_data = expressions.evaluate(_compare, str_rep, self, other) + result = self._constructor(data=new_data, index=self.index, + copy=False) + result.columns = self.columns + return result + + def _compare_frame(self, other, func, str_rep): + if not self._indexed_same(other): + raise ValueError('Can only compare identically-labeled ' + 'DataFrame objects') + return self._compare_frame_evaluate(other, func, str_rep) + + def _flex_compare_frame(self, other, func, str_rep, level): + if not self._indexed_same(other): + self, other = self.align(other, 'outer', level=level) + return self._compare_frame_evaluate(other, func, str_rep) + + def combine(self, other, func, fill_value=None, overwrite=True): + """ + Add two DataFrame objects and do not propagate NaN values, so if for a + (column, time) one frame is missing a value, it will default to the + other frame's value (which might be NaN as well) + + Parameters + ---------- + other : DataFrame + func : function + fill_value : scalar value + overwrite : boolean, default True + If True then overwrite values for common keys in the calling frame + + Returns + ------- + result : DataFrame + """ + + other_idxlen = len(other.index) # save for compare + + this, other = self.align(other, copy=False) + new_index = this.index + + if other.empty and len(new_index) == len(self.index): + return self.copy() + + if self.empty and len(other) == other_idxlen: + return other.copy() + + # sorts if possible + new_columns = this.columns.union(other.columns) + do_fill = fill_value is not None + + result = {} + for col in new_columns: + series = this[col] + otherSeries = other[col] + + this_dtype = series.dtype + other_dtype = otherSeries.dtype + + this_mask = isnull(series) + other_mask = isnull(otherSeries) + + # don't overwrite columns unecessarily + # DO propogate if this column is not in the intersection + if not overwrite and other_mask.all(): + result[col] = this[col].copy() + continue + + if do_fill: + series = series.copy() + otherSeries = otherSeries.copy() + series[this_mask] = fill_value + otherSeries[other_mask] = fill_value + + # if we have different dtypes, possibily promote + new_dtype = this_dtype + if this_dtype != other_dtype: + new_dtype = com._lcd_dtypes(this_dtype, other_dtype) + series = series.astype(new_dtype) + otherSeries = 
otherSeries.astype(new_dtype) + + # see if we need to be represented as i8 (datetimelike) + # try to keep us at this dtype + needs_i8_conversion = com.needs_i8_conversion(new_dtype) + if needs_i8_conversion: + this_dtype = new_dtype + arr = func(series, otherSeries, True) + else: + arr = func(series, otherSeries) + + if do_fill: + arr = com.ensure_float(arr) + arr[this_mask & other_mask] = NA + + # try to downcast back to the original dtype + if needs_i8_conversion: + arr = com._possibly_cast_to_datetime(arr, this_dtype) + else: + arr = com._possibly_downcast_to_dtype(arr, this_dtype) + + result[col] = arr + + # convert_objects just in case + return self._constructor(result, + index=new_index, + columns=new_columns).convert_objects( + convert_dates=True, + copy=False) + + def combine_first(self, other): + """ + Combine two DataFrame objects and default to non-null values in frame + calling the method. Result index columns will be the union of the + respective indexes and columns + + Parameters + ---------- + other : DataFrame + + Examples + -------- + a's values prioritized, use values from b to fill holes: + + >>> a.combine_first(b) + + + Returns + ------- + combined : DataFrame + """ + def combiner(x, y, needs_i8_conversion=False): + x_values = x.values if hasattr(x, 'values') else x + y_values = y.values if hasattr(y, 'values') else y + if needs_i8_conversion: + mask = isnull(x) + x_values = x_values.view('i8') + y_values = y_values.view('i8') + else: + mask = isnull(x_values) + + return expressions.where(mask, y_values, x_values, + raise_on_error=True) + + return self.combine(other, combiner, overwrite=False) + + def update(self, other, join='left', overwrite=True, filter_func=None, + raise_conflict=False): + """ + Modify DataFrame in place using non-NA values from passed + DataFrame. Aligns on indices + + Parameters + ---------- + other : DataFrame, or object coercible into a DataFrame + join : {'left', 'right', 'outer', 'inner'}, default 'left' + overwrite : boolean, default True + If True then overwrite values for common keys in the calling frame + filter_func : callable(1d-array) -> 1d-array, default None + Can choose to replace values other than NA. Return True for values + that should be updated + raise_conflict : boolean + If True, will raise an error if the DataFrame and other both + contain data in the same place. 
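+
+        Examples
+        --------
+        A minimal sketch, assuming two small frames with overlapping labels;
+        NA values in `other` never overwrite existing data:
+
+        >>> df = DataFrame({'A': [1., 2.], 'B': [3., 4.]})
+        >>> other = DataFrame({'B': [10., np.nan]})
+        >>> df.update(other)  # in place: df['B'] becomes [10., 4.]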
+ """ + # TODO: Support other joins + if join != 'left': # pragma: no cover + raise NotImplementedError("Only left join is supported") + + if not isinstance(other, DataFrame): + other = DataFrame(other) + + other = other.reindex_like(self) + + for col in self.columns: + this = self[col].values + that = other[col].values + if filter_func is not None: + mask = ~filter_func(this) | isnull(that) + else: + if raise_conflict: + mask_this = notnull(that) + mask_that = notnull(this) + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = isnull(that) + + # don't overwrite columns unecessarily + if mask.all(): + continue + else: + mask = notnull(this) + + self[col] = expressions.where( + mask, this, that, raise_on_error=True) + + #---------------------------------------------------------------------- + # Misc methods + + def first_valid_index(self): + """ + Return label for first non-NA/null value + """ + return self.index[self.count(1) > 0][0] + + def last_valid_index(self): + """ + Return label for last non-NA/null value + """ + return self.index[self.count(1) > 0][-1] + + #---------------------------------------------------------------------- + # Data reshaping + + def pivot(self, index=None, columns=None, values=None): + """ + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from index / columns to form axes and return either + DataFrame or Panel, depending on whether you request a single value + column (DataFrame) or all columns (Panel) + + Parameters + ---------- + index : string or object + Column name to use to make new frame's index + columns : string or object + Column name to use to make new frame's columns + values : string or object, optional + Column name to use for populating new frame's values + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods + + Examples + -------- + >>> df + foo bar baz + 0 one A 1. + 1 one B 2. + 2 one C 3. + 3 two A 4. + 4 two B 5. + 5 two C 6. + + >>> df.pivot('foo', 'bar', 'baz') + A B C + one 1 2 3 + two 4 5 6 + + >>> df.pivot('foo', 'bar')['baz'] + A B C + one 1 2 3 + two 4 5 6 + + Returns + ------- + pivoted : DataFrame + If no values column specified, will have hierarchically indexed + columns + """ + from pandas.core.reshape import pivot + return pivot(self, index=index, columns=columns, values=values) + + def stack(self, level=-1, dropna=True): + """ + Pivot a level of the (possibly hierarchical) column labels, returning a + DataFrame (or Series in the case of an object with a single level of + column labels) having a hierarchical index with a new inner-most level + of row labels. + + Parameters + ---------- + level : int, string, or list of these, default last level + Level(s) to stack, can pass level name + dropna : boolean, default True + Whether to drop rows in the resulting Frame/Series with no valid + values + + Examples + ---------- + >>> s + a b + one 1. 2. + two 3. 4. 
+ + >>> s.stack() + one a 1 + b 2 + two a 3 + b 4 + + Returns + ------- + stacked : DataFrame or Series + """ + from pandas.core.reshape import stack + + if isinstance(level, (tuple, list)): + result = self + for lev in level: + result = stack(result, lev, dropna=dropna) + return result + else: + return stack(self, level, dropna=dropna) + + def unstack(self, level=-1): + """ + Pivot a level of the (necessarily hierarchical) index labels, returning + a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. If the index is not a MultiIndex, + the output will be a Series (the analogue of stack when the columns are + not a MultiIndex) + + Parameters + ---------- + level : int, string, or list of these, default -1 (last level) + Level(s) of index to unstack, can pass level name + + See also + -------- + DataFrame.pivot : Pivot a table based on column values. + DataFrame.stack : Pivot a level of the column labels (inverse operation + from `unstack`). + + Examples + -------- + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... ('two', 'a'), ('two', 'b')]) + >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s + one a 1 + b 2 + two a 3 + b 4 + dtype: float64 + + >>> s.unstack(level=-1) + a b + one 1 2 + two 3 4 + + >>> s.unstack(level=0) + one two + a 1 3 + b 2 4 + + >>> df = s.unstack(level=0) + >>> df.unstack() + one a 1. + b 3. + two a 2. + b 4. + + Returns + ------- + unstacked : DataFrame or Series + """ + from pandas.core.reshape import unstack + return unstack(self, level) + + #---------------------------------------------------------------------- + # Time series-related + + def diff(self, periods=1): + """ + 1st discrete difference of object + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming difference + + Returns + ------- + diffed : DataFrame + """ + new_data = self._data.diff(n=periods) + return self._constructor(new_data) + + #---------------------------------------------------------------------- + # Function application + + def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, + args=(), **kwds): + """ + Applies function along input axis of DataFrame. + + Objects passed to functions are Series objects having index + either the DataFrame's index (axis=0) or the columns (axis=1). + Return type depends on whether passed function aggregates, or the + reduce argument if the DataFrame is empty. + + Parameters + ---------- + func : function + Function to apply to each column/row + axis : {0, 1} + * 0 : apply function to each column + * 1 : apply function to each row + broadcast : boolean, default False + For aggregation functions, return object of same size with values + propagated + reduce : boolean or None, default None + Try to apply reduction procedures. If the DataFrame is empty, + apply will use reduce to determine whether the result should be a + Series or a DataFrame. If reduce is None (the default), apply's + return value will be guessed by calling func an empty Series (note: + while guessing, exceptions raised by func will be ignored). If + reduce is True a Series will always be returned, and if False a + DataFrame will always be returned. + raw : boolean, default False + If False, convert each row or column into a Series. If raw=True the + passed function will receive ndarray objects instead. 
If you are + just applying a NumPy reduction function this will achieve much + better performance + args : tuple + Positional arguments to pass to function in addition to the + array/series + Additional keyword arguments will be passed as keywords to the function + + Notes + ----- + In the current implementation apply calls func twice on the + first column/row to decide whether it can take a fast or slow + code path. This can lead to unexpected behavior if func has + side-effects, as they will take effect twice for the first + column/row. + + Examples + -------- + >>> df.apply(numpy.sqrt) # returns DataFrame + >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0) + >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1) + + See also + -------- + DataFrame.applymap: For elementwise operations + + Returns + ------- + applied : Series or DataFrame + """ + axis = self._get_axis_number(axis) + if kwds or args and not isinstance(func, np.ufunc): + f = lambda x: func(x, *args, **kwds) + else: + f = func + + if len(self.columns) == 0 and len(self.index) == 0: + return self._apply_empty_result(func, axis, reduce, *args, **kwds) + + if isinstance(f, np.ufunc): + results = f(self.values) + return self._constructor(data=results, index=self.index, + columns=self.columns, copy=False) + else: + if not broadcast: + if not all(self.shape): + return self._apply_empty_result(func, axis, reduce, *args, + **kwds) + + if raw and not self._is_mixed_type: + return self._apply_raw(f, axis) + else: + if reduce is None: + reduce = True + return self._apply_standard(f, axis, reduce=reduce) + else: + return self._apply_broadcast(f, axis) + + def _apply_empty_result(self, func, axis, reduce, *args, **kwds): + if reduce is None: + reduce = False + try: + reduce = not isinstance(func(_EMPTY_SERIES, *args, **kwds), + Series) + except Exception: + pass + + if reduce: + return Series(NA, index=self._get_agg_axis(axis)) + else: + return self.copy() + + def _apply_raw(self, func, axis): + try: + result = lib.reduce(self.values, func, axis=axis) + except Exception: + result = np.apply_along_axis(func, axis, self.values) + + # TODO: mixed type case + if result.ndim == 2: + return DataFrame(result, index=self.index, + columns=self.columns) + else: + return Series(result, index=self._get_agg_axis(axis)) + + def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): + + # skip if we are mixed datelike and trying reduce across axes + # GH6125 + if reduce and axis==1 and self._is_mixed_type and self._is_datelike_mixed_type: + reduce=False + + # try to reduce first (by default) + # this only matters if the reduction in values is of different dtype + # e.g. 
if we want to apply to a SparseFrame, then can't directly reduce + if reduce: + + try: + + # the is the fast-path + values = self.values + dummy = Series(NA, index=self._get_axis(axis), + dtype=values.dtype) + + labels = self._get_agg_axis(axis) + result = lib.reduce(values, func, axis=axis, dummy=dummy, + labels=labels) + return Series(result, index=labels) + except Exception: + pass + + if axis == 0: + series_gen = (self.icol(i) for i in range(len(self.columns))) + res_index = self.columns + res_columns = self.index + elif axis == 1: + res_index = self.index + res_columns = self.columns + values = self.values + series_gen = (Series.from_array(arr, index=res_columns, name=name) + for i, (arr, name) in + enumerate(zip(values, res_index))) + else: # pragma : no cover + raise AssertionError('Axis must be 0 or 1, got %s' % str(axis)) + + i = None + keys = [] + results = {} + if ignore_failures: + successes = [] + for i, v in enumerate(series_gen): + try: + results[i] = func(v) + keys.append(v.name) + successes.append(i) + except Exception: + pass + # so will work with MultiIndex + if len(successes) < len(res_index): + res_index = res_index.take(successes) + else: + try: + for i, v in enumerate(series_gen): + results[i] = func(v) + keys.append(v.name) + except Exception as e: + if hasattr(e, 'args'): + # make sure i is defined + if i is not None: + k = res_index[i] + e.args = e.args + ('occurred at index %s' % + com.pprint_thing(k),) + raise + + if len(results) > 0 and _is_sequence(results[0]): + if not isinstance(results[0], Series): + index = res_columns + else: + index = None + + result = self._constructor(data=results, index=index) + result.columns = res_index + + if axis == 1: + result = result.T + result = result.convert_objects(copy=False) + + else: + + result = Series(results) + result.index = res_index + + return result + + def _apply_broadcast(self, func, axis): + if axis == 0: + target = self + elif axis == 1: + target = self.T + else: # pragma: no cover + raise AssertionError('Axis must be 0 or 1, got %s' % axis) + + result_values = np.empty_like(target.values) + columns = target.columns + for i, col in enumerate(columns): + result_values[:, i] = func(target[col]) + + result = self._constructor(result_values, index=target.index, + columns=target.columns) + + if axis == 1: + result = result.T + + return result + + def applymap(self, func): + """ + Apply a function to a DataFrame that is intended to operate + elementwise, i.e. like doing map(func, series) for each series in the + DataFrame + + Parameters + ---------- + func : function + Python function, returns a single value from a single value + + Returns + ------- + applied : DataFrame + + See also + -------- + DataFrame.apply : For operations on rows/columns + + """ + + # if we have a dtype == 'M8[ns]', provide boxed values + def infer(x): + if com.is_datetime64_dtype(x): + x = lib.map_infer(_values_from_object(x), lib.Timestamp) + return lib.map_infer(_values_from_object(x), func) + return self.apply(infer) + + #---------------------------------------------------------------------- + # Merging / joining methods + + def append(self, other, ignore_index=False, verify_integrity=False): + """ + Append columns of other to end of this frame's columns and index, + returning a new object. Columns not in this frame are added as new + columns. + + Parameters + ---------- + other : DataFrame or list of Series/dict-like objects + ignore_index : boolean, default False + If True do not use the index labels. 
Useful for gluing together + record arrays + verify_integrity : boolean, default False + If True, raise ValueError on creating index with duplicates + + Notes + ----- + If a list of dict is passed and the keys are all contained in the + DataFrame's index, the order of the columns in the resulting DataFrame + will be unchanged + + Returns + ------- + appended : DataFrame + """ + if isinstance(other, (Series, dict)): + if isinstance(other, dict): + other = Series(other) + if other.name is None and not ignore_index: + raise TypeError('Can only append a Series if ' + 'ignore_index=True') + + index = None if other.name is None else [other.name] + combined_columns = self.columns.tolist() + ((self.columns | other.index) - self.columns).tolist() + other = other.reindex(combined_columns, copy=False) + other = DataFrame(other.values.reshape((1, len(other))), + index=index, columns=combined_columns).convert_objects() + if not self.columns.equals(combined_columns): + self = self.reindex(columns=combined_columns) + elif isinstance(other, list) and not isinstance(other[0], DataFrame): + other = DataFrame(other) + if (self.columns.get_indexer(other.columns) >= 0).all(): + other = other.ix[:, self.columns] + + from pandas.tools.merge import concat + if isinstance(other, (list, tuple)): + to_concat = [self] + other + else: + to_concat = [self, other] + return concat(to_concat, ignore_index=ignore_index, + verify_integrity=verify_integrity) + + def join(self, other, on=None, how='left', lsuffix='', rsuffix='', + sort=False): + """ + Join columns with other DataFrame either on index or on a key + column. Efficiently Join multiple DataFrame objects by index at once by + passing a list. + + Parameters + ---------- + other : DataFrame, Series with name field set, or list of DataFrame + Index should be similar to one of the columns in this one. If a + Series is passed, its name attribute must be set, and that will be + used as the column name in the resulting joined DataFrame + on : column name, tuple/list of column names, or array-like + Column(s) to use for joining, otherwise join on index. If multiples + columns given, the passed DataFrame must have a MultiIndex. Can + pass an array as the join key if not already contained in the + calling DataFrame. Like an Excel VLOOKUP operation + how : {'left', 'right', 'outer', 'inner'} + How to handle indexes of the two objects. Default: 'left' + for joining on index, None otherwise + + * left: use calling frame's index + * right: use input frame's index + * outer: form union of indexes + * inner: use intersection of indexes + lsuffix : string + Suffix to use from left frame's overlapping columns + rsuffix : string + Suffix to use from right frame's overlapping columns + sort : boolean, default False + Order result DataFrame lexicographically by the join key. 
If False, + preserves the index order of the calling (left) DataFrame + + Notes + ----- + on, lsuffix, and rsuffix options are not supported when passing a list + of DataFrame objects + + Returns + ------- + joined : DataFrame + """ + # For SparseDataFrame's benefit + return self._join_compat(other, on=on, how=how, lsuffix=lsuffix, + rsuffix=rsuffix, sort=sort) + + def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', + sort=False): + from pandas.tools.merge import merge, concat + + if isinstance(other, Series): + if other.name is None: + raise ValueError('Other Series must have a name') + other = DataFrame({other.name: other}) + + if isinstance(other, DataFrame): + return merge(self, other, left_on=on, how=how, + left_index=on is None, right_index=True, + suffixes=(lsuffix, rsuffix), sort=sort) + else: + if on is not None: + raise ValueError('Joining multiple DataFrames only supported' + ' for joining on index') + + # join indexes only using concat + if how == 'left': + how = 'outer' + join_axes = [self.index] + else: + join_axes = None + + frames = [self] + list(other) + + can_concat = all(df.index.is_unique for df in frames) + + if can_concat: + return concat(frames, axis=1, join=how, join_axes=join_axes, + verify_integrity=True) + + joined = frames[0] + + for frame in frames[1:]: + joined = merge(joined, frame, how=how, + left_index=True, right_index=True) + + return joined + + @Substitution('') + @Appender(_merge_doc, indents=2) + def merge(self, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=False, + suffixes=('_x', '_y'), copy=True): + from pandas.tools.merge import merge + return merge(self, right, how=how, on=on, + left_on=left_on, right_on=right_on, + left_index=left_index, right_index=right_index, sort=sort, + suffixes=suffixes, copy=copy) + + #---------------------------------------------------------------------- + # Statistical methods, etc. + + def corr(self, method='pearson', min_periods=1): + """ + Compute pairwise correlation of columns, excluding NA/null values + + Parameters + ---------- + method : {'pearson', 'kendall', 'spearman'} + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. 
Currently only available for pearson + and spearman correlation + + Returns + ------- + y : DataFrame + """ + numeric_df = self._get_numeric_data() + cols = numeric_df.columns + mat = numeric_df.values + + if method == 'pearson': + correl = _algos.nancorr(com._ensure_float64(mat), + minp=min_periods) + elif method == 'spearman': + correl = _algos.nancorr_spearman(com._ensure_float64(mat), + minp=min_periods) + else: + if min_periods is None: + min_periods = 1 + mat = mat.T + corrf = nanops.get_corr_func(method) + K = len(cols) + correl = np.empty((K, K), dtype=float) + mask = np.isfinite(mat) + for i, ac in enumerate(mat): + for j, bc in enumerate(mat): + valid = mask[i] & mask[j] + if valid.sum() < min_periods: + c = NA + elif not valid.all(): + c = corrf(ac[valid], bc[valid]) + else: + c = corrf(ac, bc) + correl[i, j] = c + correl[j, i] = c + + return self._constructor(correl, index=cols, columns=cols) + + def cov(self, min_periods=None): + """ + Compute pairwise covariance of columns, excluding NA/null values + + Parameters + ---------- + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + Returns + ------- + y : DataFrame + + Notes + ----- + `y` contains the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-1 (unbiased estimator). + """ + numeric_df = self._get_numeric_data() + cols = numeric_df.columns + mat = numeric_df.values + + if notnull(mat).all(): + if min_periods is not None and min_periods > len(mat): + baseCov = np.empty((mat.shape[1], mat.shape[1])) + baseCov.fill(np.nan) + else: + baseCov = np.cov(mat.T) + baseCov = baseCov.reshape((len(cols), len(cols))) + else: + baseCov = _algos.nancorr(com._ensure_float64(mat), cov=True, + minp=min_periods) + + return self._constructor(baseCov, index=cols, columns=cols) + + def corrwith(self, other, axis=0, drop=False): + """ + Compute pairwise correlation between rows or columns of two DataFrame + objects. + + Parameters + ---------- + other : DataFrame + axis : {0, 1} + 0 to compute column-wise, 1 for row-wise + drop : boolean, default False + Drop missing indices from result, default returns union of all + + Returns + ------- + correls : Series + """ + axis = self._get_axis_number(axis) + if isinstance(other, Series): + return self.apply(other.corr, axis=axis) + + this = self._get_numeric_data() + other = other._get_numeric_data() + + left, right = this.align(other, join='inner', copy=False) + + # mask missing values + left = left + right * 0 + right = right + left * 0 + + if axis == 1: + left = left.T + right = right.T + + # demeaned data + ldem = left - left.mean() + rdem = right - right.mean() + + num = (ldem * rdem).sum() + dom = (left.count() - 1) * left.std() * right.std() + + correl = num / dom + + if not drop: + raxis = 1 if axis == 0 else 0 + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) + correl = correl.reindex(result_index) + + return correl + + #---------------------------------------------------------------------- + # ndarray-like stats methods + + def count(self, axis=0, level=None, numeric_only=False): + """ + Return Series with number of non-NA/null observations over requested + axis. 
Works with non-floating point data as well (detects NaN and None) + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a DataFrame + numeric_only : boolean, default False + Include only float, int, boolean data + + Returns + ------- + count : Series (or DataFrame if level specified) + """ + axis = self._get_axis_number(axis) + if level is not None: + return self._count_level(level, axis=axis, + numeric_only=numeric_only) + + if numeric_only: + frame = self._get_numeric_data() + else: + frame = self + + # GH #423 + if len(frame._get_axis(axis)) == 0: + result = Series(0, index=frame._get_agg_axis(axis)) + else: + if axis == 1: + counts = notnull(frame.values).sum(1) + result = Series(counts, index=frame._get_agg_axis(axis)) + else: + result = notnull(frame).sum(axis=axis) + + return result.astype('int64') + + def _count_level(self, level, axis=0, numeric_only=False): + if numeric_only: + frame = self._get_numeric_data() + else: + frame = self + + if axis == 1: + frame = frame.T + + if not isinstance(frame.index, MultiIndex): + raise TypeError("Can only count levels on hierarchical %s." % + self._get_axis_name(axis)) + + # python 2.5 + mask = notnull(frame.values).view(np.uint8) + + if isinstance(level, compat.string_types): + level = self.index._get_level_number(level) + + level_index = frame.index.levels[level] + labels = com._ensure_int64(frame.index.labels[level]) + counts = lib.count_level_2d(mask, labels, len(level_index)) + + result = DataFrame(counts, index=level_index, + columns=frame.columns) + + if axis == 1: + return result.T + else: + return result + + def any(self, axis=None, bool_only=None, skipna=True, level=None, + **kwargs): + """ + Return whether any element is True over requested axis. + %(na_action)s + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a DataFrame + bool_only : boolean, default None + Only include boolean data. + + Returns + ------- + any : Series (or DataFrame if level specified) + """ + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level('any', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanany, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') + + def all(self, axis=None, bool_only=None, skipna=True, level=None, + **kwargs): + """ + Return whether all elements are True over requested axis. + %(na_action)s + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a DataFrame + bool_only : boolean, default None + Only include boolean data. 
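+
+        Examples
+        --------
+        A minimal sketch, assuming a small boolean frame:
+
+        >>> df = DataFrame({'A': [True, True], 'B': [True, False]})
+        >>> df.all()         # column-wise: A is True, B is False
+        >>> df.all(axis=1)   # row-wise: True for the first row only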
+ + Returns + ------- + any : Series (or DataFrame if level specified) + """ + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level('all', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanall, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') + + def _reduce(self, op, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwds): + axis = self._get_axis_number(axis) + f = lambda x: op(x, axis=axis, skipna=skipna, **kwds) + labels = self._get_agg_axis(axis) + + # exclude timedelta/datetime unless we are uniform types + if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: + numeric_only = True + + if numeric_only is None: + try: + values = self.values + result = f(values) + except Exception as e: + + # try by-column first + if filter_type is None and axis == 0: + try: + + # this can end up with a non-reduction + # but not always. if the types are mixed + # with datelike then need to make sure a series + result = self.apply(f,reduce=False) + if result.ndim == self.ndim: + result = result.iloc[0] + return result + except: + pass + + if filter_type is None or filter_type == 'numeric': + data = self._get_numeric_data() + elif filter_type == 'bool': + data = self._get_bool_data() + else: # pragma: no cover + e = NotImplementedError("Handling exception with filter_" + "type %s not implemented." + % filter_type) + raise_with_traceback(e) + result = f(data.values) + labels = data._get_agg_axis(axis) + else: + if numeric_only: + if filter_type is None or filter_type == 'numeric': + data = self._get_numeric_data() + elif filter_type == 'bool': + data = self._get_bool_data() + else: # pragma: no cover + msg = ("Generating numeric_only data with filter_type %s" + "not supported." % filter_type) + raise NotImplementedError(msg) + values = data.values + labels = data._get_agg_axis(axis) + else: + values = self.values + result = f(values) + + if result.dtype == np.object_: + try: + if filter_type is None or filter_type == 'numeric': + result = result.astype(np.float64) + elif filter_type == 'bool' and notnull(result).all(): + result = result.astype(np.bool_) + except (ValueError, TypeError): + + # try to coerce to the original dtypes item by item if we can + if axis == 0: + result = com._coerce_to_dtypes(result, self.dtypes) + + return Series(result, index=labels) + + def idxmin(self, axis=0, skipna=True): + """ + Return index of first occurrence of minimum over requested axis. + NA/null values are excluded. + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + + Returns + ------- + idxmin : Series + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmin``. + + See Also + -------- + Series.idxmin + """ + axis = self._get_axis_number(axis) + indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) + index = self._get_axis(axis) + result = [index[i] if i >= 0 else NA for i in indices] + return Series(result, index=self._get_agg_axis(axis)) + + def idxmax(self, axis=0, skipna=True): + """ + Return index of first occurrence of maximum over requested axis. + NA/null values are excluded. + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be first index. 
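+
+        Examples
+        --------
+        A minimal sketch, assuming a small numeric frame (the row labels are
+        illustrative only):
+
+        >>> df = DataFrame({'A': [1, 5, 3], 'B': [9, 2, 4]},
+        ...                index=['x', 'y', 'z'])
+        >>> df.idxmax()  # label of the maximum per column: A -> 'y', B -> 'x'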
+ + Returns + ------- + idxmax : Series + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmax``. + + See Also + -------- + Series.idxmax + """ + axis = self._get_axis_number(axis) + indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) + index = self._get_axis(axis) + result = [index[i] if i >= 0 else NA for i in indices] + return Series(result, index=self._get_agg_axis(axis)) + + def _get_agg_axis(self, axis_num): + """ let's be explict about this """ + if axis_num == 0: + return self.columns + elif axis_num == 1: + return self.index + else: + raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num) + + def mode(self, axis=0, numeric_only=False): + """ + Gets the mode of each element along the axis selected. Empty if nothing + has 2+ occurrences. Adds a row for each mode per label, fills in gaps + with nan. + + Parameters + ---------- + axis : {0, 1, 'index', 'columns'} (default 0) + * 0/'index' : get mode of each column + * 1/'columns' : get mode of each row + numeric_only : boolean, default False + if True, only apply to numeric columns + + Returns + ------- + modes : DataFrame (sorted) + """ + data = self if not numeric_only else self._get_numeric_data() + f = lambda s: s.mode() + return data.apply(f, axis=axis) + + def quantile(self, q=0.5, axis=0, numeric_only=True): + """ + Return values at the given quantile over requested axis, a la + numpy.percentile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + 0 <= q <= 1, the quantile(s) to compute + axis : {0, 1} + 0 for row-wise, 1 for column-wise + + Returns + ------- + quantiles : Series or DataFrame + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + + Examples + -------- + + >>> df = DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + dtype: float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + """ + per = np.asarray(q) * 100 + + if not com.is_list_like(per): + per = [per] + q = [q] + squeeze = True + else: + squeeze = False + + def f(arr, per): + if arr._is_datelike_mixed_type: + values = _values_from_object(arr).view('i8') + else: + values = arr.astype(float) + values = values[notnull(values)] + if len(values) == 0: + return NA + else: + return _quantile(values, per) + + data = self._get_numeric_data() if numeric_only else self + if axis == 1: + data = data.T + + # need to know which cols are timestamp going in so that we can + # map timestamp over them after getting the quantile. + is_dt_col = data.dtypes.map(com.is_datetime64_dtype) + is_dt_col = is_dt_col[is_dt_col].index + + quantiles = [[f(vals, x) for x in per] + for (_, vals) in data.iteritems()] + result = DataFrame(quantiles, index=data._info_axis, columns=q).T + if len(is_dt_col) > 0: + result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp) + if squeeze: + if result.shape == (1, 1): + result = result.T.iloc[:, 0] # don't want scalar + else: + result = result.T.squeeze() + result.name = None # For groupby, so it can set an index name + return result + + def rank(self, axis=0, numeric_only=None, method='average', + na_option='keep', ascending=True, pct=False): + """ + Compute numerical data ranks (1 through n) along axis. 
Equal values are + assigned a rank that is the average of the ranks of those values + + Parameters + ---------- + axis : {0, 1}, default 0 + Ranks over columns (0) or rows (1) + numeric_only : boolean, default None + Include only float, int, boolean data + method : {'average', 'min', 'max', 'first', 'dense'} + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + na_option : {'keep', 'top', 'bottom'} + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean, default True + False for ranks by high (1) to low (N) + pct : boolean, default False + Computes percentage rank of data + + Returns + ------- + ranks : DataFrame + """ + axis = self._get_axis_number(axis) + if numeric_only is None: + try: + ranks = algos.rank(self.values, axis=axis, method=method, + ascending=ascending, na_option=na_option, + pct=pct) + return self._constructor(ranks, index=self.index, + columns=self.columns) + except TypeError: + numeric_only = True + if numeric_only: + data = self._get_numeric_data() + else: + data = self + ranks = algos.rank(data.values, axis=axis, method=method, + ascending=ascending, na_option=na_option, pct=pct) + return self._constructor(ranks, index=data.index, columns=data.columns) + + def to_timestamp(self, freq=None, how='start', axis=0, copy=True): + """ + Cast to DatetimeIndex of timestamps, at *beginning* of period + + Parameters + ---------- + freq : string, default frequency of PeriodIndex + Desired frequency + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end + axis : {0, 1} default 0 + The axis to convert (the index by default) + copy : boolean, default True + If false then underlying input data is not copied + + Returns + ------- + df : DataFrame with DatetimeIndex + """ + new_data = self._data + if copy: + new_data = new_data.copy() + + axis = self._get_axis_number(axis) + if axis == 0: + new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how)) + elif axis == 1: + new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) + else: # pragma: no cover + raise AssertionError('Axis must be 0 or 1. Got %s' % str(axis)) + + return self._constructor(new_data) + + def to_period(self, freq=None, axis=0, copy=True): + """ + Convert DataFrame from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed) + + Parameters + ---------- + freq : string, default + axis : {0, 1}, default 0 + The axis to convert (the index by default) + copy : boolean, default True + If False then underlying input data is not copied + + Returns + ------- + ts : TimeSeries with PeriodIndex + """ + new_data = self._data + if copy: + new_data = new_data.copy() + + axis = self._get_axis_number(axis) + if axis == 0: + new_data.set_axis(1, self.index.to_period(freq=freq)) + elif axis == 1: + new_data.set_axis(0, self.columns.to_period(freq=freq)) + else: # pragma: no cover + raise AssertionError('Axis must be 0 or 1. Got %s' % str(axis)) + + return self._constructor(new_data) + + def isin(self, values): + """ + Return boolean DataFrame showing whether each element in the + DataFrame is contained in values. + + Parameters + ---------- + values : iterable, Series, DataFrame or dictionary + The result will only be true at a location if all the + labels match. 
If `values` is a Series, that's the index. If + `values` is a dictionary, the keys must be the column names, + which must match. If `values` is a DataFrame, + then both the index and column labels must match. + + Returns + ------- + + DataFrame of booleans + + Examples + -------- + When ``values`` is a list: + + >>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) + >>> df.isin([1, 3, 12, 'a']) + A B + 0 True True + 1 False False + 2 True False + + When ``values`` is a dict: + + >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]}) + >>> df.isin({'A': [1, 3], 'B': [4, 7, 12]}) + A B + 0 True False # Note that B didn't match the 1 here. + 1 False True + 2 True True + + When ``values`` is a Series or DataFrame: + + >>> df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) + >>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']}) + >>> df.isin(other) + A B + 0 True False + 1 False False # Column A in `other` has a 3, but not at index 1. + 2 True True + """ + if isinstance(values, dict): + from collections import defaultdict + from pandas.tools.merge import concat + values = defaultdict(list, values) + return concat((self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns)), axis=1) + elif isinstance(values, Series): + if not values.index.is_unique: + raise ValueError("ValueError: cannot compute isin with" + " a duplicate axis.") + return self.eq(values.reindex_like(self), axis='index') + elif isinstance(values, DataFrame): + if not (values.columns.is_unique and values.index.is_unique): + raise ValueError("ValueError: cannot compute isin with" + " a duplicate axis.") + return self.eq(values.reindex_like(self)) + else: + if not is_list_like(values): + raise TypeError("only list-like or dict-like objects are" + " allowed to be passed to DataFrame.isin(), " + "you passed a " + "{0!r}".format(type(values).__name__)) + return DataFrame(lib.ismember(self.values.ravel(), + set(values)).reshape(self.shape), + self.index, + self.columns) + + #---------------------------------------------------------------------- + # Deprecated stuff + + def combineAdd(self, other): + """ + Add two DataFrame objects and do not propagate + NaN values, so if for a (column, time) one frame is missing a + value, it will default to the other frame's value (which might + be NaN as well) + + Parameters + ---------- + other : DataFrame + + Returns + ------- + DataFrame + """ + return self.add(other, fill_value=0.) + + def combineMult(self, other): + """ + Multiply two DataFrame objects and do not propagate NaN values, so if + for a (column, time) one frame is missing a value, it will default to + the other frame's value (which might be NaN as well) + + Parameters + ---------- + other : DataFrame + + Returns + ------- + DataFrame + """ + return self.mul(other, fill_value=1.) 
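+
+# A minimal usage sketch for the deprecated combiners above; the frames
+# ``a`` and ``b`` are assumed, hypothetical inputs:
+#
+#   >>> summed = a.combineAdd(b)    # same as a.add(b, fill_value=0.)
+#   >>> product = a.combineMult(b)  # same as a.mul(b, fill_value=1.)
+#
+# A value missing from only one of the frames is treated as the identity
+# element (0 for addition, 1 for multiplication) rather than producing NaN.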
+ + +DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0, + axes_are_reversed=True, aliases={'rows': 0}) +DataFrame._add_numeric_operations() + +_EMPTY_SERIES = Series([]) + + +def group_agg(values, bounds, f): + """ + R-style aggregator + + Parameters + ---------- + values : N-length or N x K ndarray + bounds : B-length ndarray + f : ndarray aggregation function + + Returns + ------- + ndarray with same length as bounds array + """ + if values.ndim == 1: + N = len(values) + result = np.empty(len(bounds), dtype=float) + elif values.ndim == 2: + N, K = values.shape + result = np.empty((len(bounds), K), dtype=float) + + testagg = f(values[:min(1, len(values))]) + if isinstance(testagg, np.ndarray) and testagg.ndim == 2: + raise AssertionError('Function must reduce') + + for i, left_bound in enumerate(bounds): + if i == len(bounds) - 1: + right_bound = N + else: + right_bound = bounds[i + 1] + + result[i] = f(values[left_bound:right_bound]) + + return result + + +def factor_agg(factor, vec, func): + """ + Aggregate array based on Categorical + + Parameters + ---------- + factor : Categorical + length n + vec : sequence + length n + func : function + 1D array aggregation function + + Returns + ------- + ndarray corresponding to factor levels + + See Also + -------- + pandas.Categorical + """ + indexer = np.argsort(factor.labels) + unique_labels = np.arange(len(factor.levels)) + + ordered_labels = factor.labels.take(indexer) + ordered_vec = np.asarray(vec).take(indexer) + bounds = ordered_labels.searchsorted(unique_labels) + + return group_agg(ordered_vec, bounds, func) + + +def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + Needs to handle a lot of exceptional cases. 
+ """ + # figure out the index, if necessary + if index is None: + index = extract_index(arrays) + else: + index = _ensure_index(index) + + # don't force copy because getting jammed in an ndarray anyway + arrays = _homogenize(arrays, index, dtype) + + # from BlockManager perspective + axes = [_ensure_index(columns), _ensure_index(index)] + + return create_block_manager_from_arrays(arrays, arr_names, axes) + + +def extract_index(data): + from pandas.core.index import _union_indexes + + index = None + if len(data) == 0: + index = Index([]) + elif len(data) > 0: + raw_lengths = [] + indexes = [] + + have_raw_arrays = False + have_series = False + have_dicts = False + + for v in data: + if isinstance(v, Series): + have_series = True + indexes.append(v.index) + elif isinstance(v, dict): + have_dicts = True + indexes.append(list(v.keys())) + elif is_list_like(v) and getattr(v, 'ndim', 1) == 1: + have_raw_arrays = True + raw_lengths.append(len(v)) + + if not indexes and not raw_lengths: + raise ValueError('If using all scalar values, you must must pass' + ' an index') + + if have_series or have_dicts: + index = _union_indexes(indexes) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError('arrays must all be same length') + + if have_dicts: + raise ValueError('Mixing dicts with non-Series may lead to ' + 'ambiguous ordering.') + + if have_series: + if lengths[0] != len(index): + msg = ('array length %d does not match index length %d' + % (lengths[0], len(index))) + raise ValueError(msg) + else: + index = Index(np.arange(lengths[0])) + + return _ensure_index(index) + + +def _prep_ndarray(values, copy=True): + if not isinstance(values, (np.ndarray, Series)): + if len(values) == 0: + return np.empty((0, 0), dtype=object) + + def convert(v): + return com._possibly_convert_platform(v) + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + try: + if com.is_list_like(values[0]) or hasattr(values[0], 'len'): + values = np.array([convert(v) for v in values]) + else: + values = convert(values) + except: + values = convert(values) + + else: + # drop subclass info, do not copy data + values = np.asarray(values) + if copy: + values = values.copy() + + if values.ndim == 1: + values = values.reshape((values.shape[0], 1)) + elif values.ndim != 2: + raise ValueError('Must pass 2-d input') + + return values + + +def _to_arrays(data, columns, coerce_float=False, dtype=None): + """ + Return list of arrays, columns + """ + if isinstance(data, DataFrame): + if columns is not None: + arrays = [data.icol(i).values for i, col in enumerate(data.columns) + if col in columns] + else: + columns = data.columns + arrays = [data.icol(i).values for i in range(len(columns))] + + return arrays, columns + + if not len(data): + if isinstance(data, np.ndarray): + columns = data.dtype.names + if columns is not None: + return [[]] * len(columns), columns + return [], [] # columns if columns is not None else [] + if isinstance(data[0], (list, tuple)): + return _list_to_arrays(data, columns, coerce_float=coerce_float, + dtype=dtype) + elif isinstance(data[0], collections.Mapping): + return _list_of_dict_to_arrays(data, columns, + coerce_float=coerce_float, + dtype=dtype) + elif isinstance(data[0], Series): + return _list_of_series_to_arrays(data, columns, + coerce_float=coerce_float, + dtype=dtype) + elif (isinstance(data, (np.ndarray, Series)) + and data.dtype.names is not None): + + columns = 
list(data.dtype.names) + arrays = [data[k] for k in columns] + return arrays, columns + else: + # last ditch effort + data = lmap(tuple, data) + return _list_to_arrays(data, columns, + coerce_float=coerce_float, + dtype=dtype) + + +def _masked_rec_array_to_mgr(data, index, columns, dtype, copy): + """ extract from a masked rec array and create the manager """ + + # essentially process a record array then fill it + fill_value = data.fill_value + fdata = ma.getdata(data) + if index is None: + index = _get_names_from_index(fdata) + if index is None: + index = _default_index(len(data)) + index = _ensure_index(index) + + if columns is not None: + columns = _ensure_index(columns) + arrays, arr_columns = _to_arrays(fdata, columns) + + # fill if needed + new_arrays = [] + for fv, arr, col in zip(fill_value, arrays, arr_columns): + mask = ma.getmaskarray(data[col]) + if mask.any(): + arr, fv = _maybe_upcast(arr, fill_value=fv, copy=True) + arr[mask] = fv + new_arrays.append(arr) + + # create the manager + arrays, arr_columns = _reorder_arrays(new_arrays, arr_columns, columns) + if columns is None: + columns = arr_columns + + mgr = _arrays_to_mgr(arrays, arr_columns, index, columns) + + if copy: + mgr = mgr.copy() + return mgr + + +def _reorder_arrays(arrays, arr_columns, columns): + # reorder according to the columns + if (columns is not None and len(columns) and arr_columns is not None and + len(arr_columns)): + indexer = _ensure_index( + arr_columns).get_indexer(columns) + arr_columns = _ensure_index( + [arr_columns[i] for i in indexer]) + arrays = [arrays[i] for i in indexer] + return arrays, arr_columns + + +def _list_to_arrays(data, columns, coerce_float=False, dtype=None): + if len(data) > 0 and isinstance(data[0], tuple): + content = list(lib.to_object_array_tuples(data).T) + else: + # list of lists + content = list(lib.to_object_array(data).T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + + +def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): + from pandas.core.index import _get_combined_index + + if columns is None: + columns = _get_combined_index([ + s.index for s in data if getattr(s, 'index', None) is not None + ]) + + indexer_cache = {} + + aligned_values = [] + for s in data: + index = getattr(s, 'index', None) + if index is None: + index = _default_index(len(s)) + + if id(index) in indexer_cache: + indexer = indexer_cache[id(index)] + else: + indexer = indexer_cache[id(index)] = index.get_indexer(columns) + + values = _values_from_object(s) + aligned_values.append(com.take_1d(values, indexer)) + + values = np.vstack(aligned_values) + + if values.dtype == np.object_: + content = list(values.T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + else: + return values.T, columns + + +def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): + if columns is None: + gen = (list(x.keys()) for x in data) + columns = lib.fast_unique_multiple_list_gen(gen) + + # assure that they are of the base dict class and not of derived + # classes + data = [(type(d) is dict) and d or dict(d) for d in data] + + content = list(lib.dicts_to_array(data, list(columns)).T) + return _convert_object_array(content, columns, dtype=dtype, + coerce_float=coerce_float) + + +def _convert_object_array(content, columns, coerce_float=False, dtype=None): + if columns is None: + columns = _default_index(len(content)) + else: + if len(columns) != len(content): # pragma: no cover + # caller's 
responsibility to check for this... + raise AssertionError('%d columns passed, passed data had %s ' + 'columns' % (len(columns), len(content))) + + # provide soft conversion of object dtypes + def convert(arr): + if dtype != object and dtype != np.object: + arr = lib.maybe_convert_objects(arr, try_float=coerce_float) + arr = com._possibly_cast_to_datetime(arr, dtype) + return arr + + arrays = [ convert(arr) for arr in content ] + + return arrays, columns + + +def _get_names_from_index(data): + index = lrange(len(data)) + has_some_name = any([getattr(s, 'name', None) is not None for s in data]) + if not has_some_name: + return index + + count = 0 + for i, s in enumerate(data): + n = getattr(s, 'name', None) + if n is not None: + index[i] = n + else: + index[i] = 'Unnamed %d' % count + count += 1 + + return index + + +def _homogenize(data, index, dtype=None): + from pandas.core.series import _sanitize_array + + oindex = None + homogenized = [] + + for v in data: + if isinstance(v, Series): + if dtype is not None: + v = v.astype(dtype) + if v.index is not index: + # Forces alignment. No need to copy data since we + # are putting it into an ndarray later + v = v.reindex(index, copy=False) + else: + if isinstance(v, dict): + if oindex is None: + oindex = index.astype('O') + if type(v) == dict: + # fast cython method + v = lib.fast_multiget(v, oindex, default=NA) + else: + v = lib.map_infer(oindex, v.get) + + v = _sanitize_array(v, index, dtype=dtype, copy=False, + raise_cast_failure=False) + + homogenized.append(v) + + return homogenized + + +def _from_nested_dict(data): + # TODO: this should be seriously cythonized + new_data = OrderedDict() + for index, s in compat.iteritems(data): + for col, v in compat.iteritems(s): + new_data[col] = new_data.get(col, OrderedDict()) + new_data[col][index] = v + return new_data + + +def _put_str(s, space): + return ('%s' % s)[:space].ljust(space) + + +#---------------------------------------------------------------------- +# Add plotting methods to DataFrame + +import pandas.tools.plotting as gfx + +DataFrame.plot = gfx.plot_frame +DataFrame.hist = gfx.hist_frame + + +@Appender(_shared_docs['boxplot'] % _shared_doc_kwargs) +def boxplot(self, column=None, by=None, ax=None, fontsize=None, + rot=0, grid=True, figsize=None, layout=None, return_type=None, + **kwds): + import pandas.tools.plotting as plots + import matplotlib.pyplot as plt + ax = plots.boxplot(self, column=column, by=by, ax=ax, + fontsize=fontsize, grid=grid, rot=rot, + figsize=figsize, layout=layout, return_type=return_type, + **kwds) + plt.draw_if_interactive() + return ax + +DataFrame.boxplot = boxplot + +ops.add_flex_arithmetic_methods(DataFrame, **ops.frame_flex_funcs) +ops.add_special_arithmetic_methods(DataFrame, **ops.frame_special_funcs) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py new file mode 100644 index 00000000..59a45722 --- /dev/null +++ b/pandas/core/generic.py @@ -0,0 +1,3949 @@ +# pylint: disable=W0231,E1101 +import warnings +import operator +import weakref +import gc +import numpy as np +import pandas.lib as lib + +import pandas as pd +from pandas.core.base import PandasObject +from pandas.core.index import (Index, MultiIndex, _ensure_index, + InvalidIndexError) +import pandas.core.indexing as indexing +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex +from pandas.core.internals import 
BlockManager +import pandas.core.common as com +import pandas.core.datetools as datetools +from pandas import compat, _np_version_under1p7 +from pandas.compat import map, zip, lrange, string_types, isidentifier, lmap +from pandas.core.common import (isnull, notnull, is_list_like, + _values_from_object, _maybe_promote, + _maybe_box_datetimelike, ABCSeries, + SettingWithCopyError, SettingWithCopyWarning) +import pandas.core.nanops as nanops +from pandas.util.decorators import Appender, Substitution +from pandas.core import config + +# goal is to be able to define the docs close to function, while still being +# able to share +_shared_docs = dict() +_shared_doc_kwargs = dict(axes='keywords for axes', + klass='NDFrame', + axes_single_arg='int or labels for object', + args_transpose='axes to permute (int or label for' + ' object)') + + +def is_dictlike(x): + return isinstance(x, (dict, com.ABCSeries)) + + +def _single_replace(self, to_replace, method, inplace, limit): + if self.ndim != 1: + raise TypeError('cannot replace {0} with method {1} on a {2}' + .format(to_replace, method, type(self).__name__)) + + orig_dtype = self.dtype + result = self if inplace else self.copy() + fill_f = com._get_fill_func(method) + + mask = com.mask_missing(result.values, to_replace) + values = fill_f(result.values, limit=limit, mask=mask) + + if values.dtype == orig_dtype and inplace: + return + + result = pd.Series(values, index=self.index, + dtype=self.dtype).__finalize__(self) + + if inplace: + self._update_inplace(result._data) + return + + return result + + +class NDFrame(PandasObject): + + """ + N-dimensional analogue of DataFrame. Store multi-dimensional in a + size-mutable, labeled data structure + + Parameters + ---------- + data : BlockManager + axes : list + copy : boolean, default False + """ + _internal_names = ['_data', '_cacher', '_item_cache', '_cache', + 'is_copy', 'str', '_subtyp', '_index', '_default_kind', + '_default_fill_value','__array_struct__','__array_interface__'] + _internal_names_set = set(_internal_names) + _metadata = [] + is_copy = None + + def __init__(self, data, axes=None, copy=False, dtype=None, + fastpath=False): + + if not fastpath: + if dtype is not None: + data = data.astype(dtype) + elif copy: + data = data.copy() + + if axes is not None: + for i, ax in enumerate(axes): + data = data.reindex_axis(ax, axis=i) + + object.__setattr__(self, 'is_copy', None) + object.__setattr__(self, '_data', data) + object.__setattr__(self, '_item_cache', {}) + + def _validate_dtype(self, dtype): + """ validate the passed dtype """ + + if dtype is not None: + dtype = np.dtype(dtype) + + # a compound dtype + if dtype.kind == 'V': + raise NotImplementedError("compound dtypes are not implemented" + "in the {0} constructor" + .format(self.__class__.__name__)) + return dtype + + def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): + """ passed a manager and a axes dict """ + for a, axe in axes.items(): + if axe is not None: + mgr = mgr.reindex_axis( + axe, axis=self._get_block_manager_axis(a), copy=False) + + # do not copy BlockManager unless explicitly done + if copy and dtype is None: + mgr = mgr.copy() + elif dtype is not None: + # avoid copy if we can + if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: + mgr = mgr.astype(dtype=dtype) + return mgr + + #---------------------------------------------------------------------- + # Construction + + @property + def _constructor(self): + raise NotImplementedError + + def __unicode__(self): + # unicode representation based upon 
iterating over self + # (since, by definition, `PandasContainers` are iterable) + prepr = '[%s]' % ','.join(map(com.pprint_thing, self)) + return '%s(%s)' % (self.__class__.__name__, prepr) + + def _local_dir(self): + """ add the string-like attributes from the info_axis """ + return [c for c in self._info_axis + if isinstance(c, string_types) and isidentifier(c)] + + @property + def _constructor_sliced(self): + raise NotImplementedError + + #---------------------------------------------------------------------- + # Axis + + @classmethod + def _setup_axes( + cls, axes, info_axis=None, stat_axis=None, aliases=None, slicers=None, + axes_are_reversed=False, build_axes=True, ns=None): + """ provide axes setup for the major PandasObjects + + axes : the names of the axes in order (lowest to highest) + info_axis_num : the axis of the selector dimension (int) + stat_axis_num : the number of axis for the default stats (int) + aliases : other names for a single axis (dict) + slicers : how axes slice to others (dict) + axes_are_reversed : boolean whether to treat passed axes as + reversed (DataFrame) + build_axes : setup the axis properties (default True) + """ + + cls._AXIS_ORDERS = axes + cls._AXIS_NUMBERS = dict((a, i) for i, a in enumerate(axes)) + cls._AXIS_LEN = len(axes) + cls._AXIS_ALIASES = aliases or dict() + cls._AXIS_IALIASES = dict((v, k) + for k, v in cls._AXIS_ALIASES.items()) + cls._AXIS_NAMES = dict(enumerate(axes)) + cls._AXIS_SLICEMAP = slicers or None + cls._AXIS_REVERSED = axes_are_reversed + + # typ + setattr(cls, '_typ', cls.__name__.lower()) + + # indexing support + cls._ix = None + + if info_axis is not None: + cls._info_axis_number = info_axis + cls._info_axis_name = axes[info_axis] + + if stat_axis is not None: + cls._stat_axis_number = stat_axis + cls._stat_axis_name = axes[stat_axis] + + # setup the actual axis + if build_axes: + + def set_axis(a, i): + setattr(cls, a, lib.AxisProperty(i)) + cls._internal_names_set.add(a) + + if axes_are_reversed: + m = cls._AXIS_LEN - 1 + for i, a in cls._AXIS_NAMES.items(): + set_axis(a, m - i) + else: + for i, a in cls._AXIS_NAMES.items(): + set_axis(a, i) + + # addtl parms + if isinstance(ns, dict): + for k, v in ns.items(): + setattr(cls, k, v) + + def _construct_axes_dict(self, axes=None, **kwargs): + """ return an axes dictionary for myself """ + d = dict([(a, self._get_axis(a)) for a in (axes or self._AXIS_ORDERS)]) + d.update(kwargs) + return d + + @staticmethod + def _construct_axes_dict_from(self, axes, **kwargs): + """ return an axes dictionary for the passed axes """ + d = dict([(a, ax) for a, ax in zip(self._AXIS_ORDERS, axes)]) + d.update(kwargs) + return d + + def _construct_axes_dict_for_slice(self, axes=None, **kwargs): + """ return an axes dictionary for myself """ + d = dict([(self._AXIS_SLICEMAP[a], self._get_axis(a)) + for a in (axes or self._AXIS_ORDERS)]) + d.update(kwargs) + return d + + def _construct_axes_from_arguments(self, args, kwargs, require_all=False): + """ construct and returns axes if supplied in args/kwargs + if require_all, raise if all axis arguments are not supplied + return a tuple of (axes, kwargs) """ + + # construct the args + args = list(args) + for a in self._AXIS_ORDERS: + + # if we have an alias for this axis + alias = self._AXIS_IALIASES.get(a) + if alias is not None: + if a in kwargs: + if alias in kwargs: + raise TypeError( + "arguments are mutually exclusive for [%s,%s]" % + (a, alias) + ) + continue + if alias in kwargs: + kwargs[a] = kwargs.pop(alias) + continue + + # look for a 
argument by position + if a not in kwargs: + try: + kwargs[a] = args.pop(0) + except (IndexError): + if require_all: + raise TypeError( + "not enough/duplicate arguments specified!") + + axes = dict([(a, kwargs.get(a)) for a in self._AXIS_ORDERS]) + return axes, kwargs + + @classmethod + def _from_axes(cls, data, axes): + # for construction from BlockManager + if isinstance(data, BlockManager): + return cls(data) + else: + if cls._AXIS_REVERSED: + axes = axes[::-1] + d = cls._construct_axes_dict_from(cls, axes, copy=False) + return cls(data, **d) + + def _get_axis_number(self, axis): + axis = self._AXIS_ALIASES.get(axis, axis) + if com.is_integer(axis): + if axis in self._AXIS_NAMES: + return axis + else: + try: + return self._AXIS_NUMBERS[axis] + except: + pass + raise ValueError('No axis named {0} for object type {1}' + .format(axis, type(self))) + + def _get_axis_name(self, axis): + axis = self._AXIS_ALIASES.get(axis, axis) + if isinstance(axis, string_types): + if axis in self._AXIS_NUMBERS: + return axis + else: + try: + return self._AXIS_NAMES[axis] + except: + pass + raise ValueError('No axis named {0} for object type {1}' + .format(axis, type(self))) + + def _get_axis(self, axis): + name = self._get_axis_name(axis) + return getattr(self, name) + + def _get_block_manager_axis(self, axis): + """ map the axis to the block_manager axis """ + axis = self._get_axis_number(axis) + if self._AXIS_REVERSED: + m = self._AXIS_LEN - 1 + return m - axis + return axis + + def _get_axis_resolvers(self, axis): + # index or columns + axis_index = getattr(self, axis) + d = dict() + prefix = axis[0] + + for i, name in enumerate(axis_index.names): + if name is not None: + key = level = name + else: + # prefix with 'i' or 'c' depending on the input axis + # e.g., you must do ilevel_0 for the 0th level of an unnamed + # multiiindex + key = '{prefix}level_{i}'.format(prefix=prefix, i=i) + level = i + + level_values = axis_index.get_level_values(level) + s = level_values.to_series() + s.index = axis_index + d[key] = s + + # put the index/columns itself in the dict + if isinstance(axis_index, MultiIndex): + dindex = axis_index + else: + dindex = axis_index.to_series() + + d[axis] = dindex + return d + + def _get_index_resolvers(self): + d = {} + for axis_name in self._AXIS_ORDERS: + d.update(self._get_axis_resolvers(axis_name)) + return d + + @property + def _info_axis(self): + return getattr(self, self._info_axis_name) + + @property + def _stat_axis(self): + return getattr(self, self._stat_axis_name) + + @property + def shape(self): + "tuple of axis dimensions" + return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS) + + @property + def axes(self): + "index(es) of the NDFrame" + # we do it this way because if we have reversed axes, then + # the block manager shows then reversed + return [self._get_axis(a) for a in self._AXIS_ORDERS] + + @property + def ndim(self): + "Number of axes / array dimensions" + return self._data.ndim + + def _expand_axes(self, key): + new_axes = [] + for k, ax in zip(key, self.axes): + if k not in ax: + if type(k) != ax.dtype.type: + ax = ax.astype('O') + new_axes.append(ax.insert(len(ax), k)) + else: + new_axes.append(ax) + + return new_axes + + def set_axis(self, axis, labels): + """ public verson of axis assignment """ + setattr(self,self._get_axis_name(axis),labels) + + def _set_axis(self, axis, labels): + self._data.set_axis(axis, labels) + self._clear_item_cache() + + _shared_docs['transpose'] = """ + Permute the dimensions of the %(klass)s + + Parameters + 
---------- + args : %(args_transpose)s + copy : boolean, default False + Make a copy of the underlying data. Mixed-dtype data will + always result in a copy + + Examples + -------- + >>> p.transpose(2, 0, 1) + >>> p.transpose(2, 0, 1, copy=True) + + Returns + ------- + y : same as input + """ + + @Appender(_shared_docs['transpose'] % _shared_doc_kwargs) + def transpose(self, *args, **kwargs): + + # construct the args + axes, kwargs = self._construct_axes_from_arguments( + args, kwargs, require_all=True) + axes_names = tuple([self._get_axis_name(axes[a]) + for a in self._AXIS_ORDERS]) + axes_numbers = tuple([self._get_axis_number(axes[a]) + for a in self._AXIS_ORDERS]) + + # we must have unique axes + if len(axes) != len(set(axes)): + raise ValueError('Must specify %s unique axes' % self._AXIS_LEN) + + new_axes = self._construct_axes_dict_from( + self, [self._get_axis(x) for x in axes_names]) + new_values = self.values.transpose(axes_numbers) + if kwargs.get('copy') or (len(args) and args[-1]): + new_values = new_values.copy() + return self._constructor(new_values, **new_axes).__finalize__(self) + + def swapaxes(self, axis1, axis2, copy=True): + """ + Interchange axes and swap values axes appropriately + + Returns + ------- + y : same as input + """ + i = self._get_axis_number(axis1) + j = self._get_axis_number(axis2) + + if i == j: + if copy: + return self.copy() + return self + + mapping = {i: j, j: i} + + new_axes = (self._get_axis(mapping.get(k, k)) + for k in range(self._AXIS_LEN)) + new_values = self.values.swapaxes(i, j) + if copy: + new_values = new_values.copy() + + return self._constructor(new_values, *new_axes).__finalize__(self) + + def pop(self, item): + """ + Return item and drop from frame. Raise KeyError if not found. + """ + result = self[item] + del self[item] + return result + + def squeeze(self): + """ squeeze length 1 dimensions """ + try: + return self.ix[tuple([slice(None) if len(a) > 1 else a[0] + for a in self.axes])] + except: + return self + + def swaplevel(self, i, j, axis=0): + """ + Swap levels i and j in a MultiIndex on a particular axis + + Parameters + ---------- + i, j : int, string (can be mixed) + Level of index to be swapped. Can pass level name as string. + + Returns + ------- + swapped : type of caller (new object) + """ + axis = self._get_axis_number(axis) + result = self.copy() + labels = result._data.axes[axis] + result._data.set_axis(axis, labels.swaplevel(i, j)) + return result + + #---------------------------------------------------------------------- + # Rename + + # TODO: define separate funcs for DataFrame, Series and Panel so you can + # get completion on keyword arguments. + _shared_docs['rename'] = """ + Alter axes input function or functions. Function / dict values must be + unique (1-to-1). Labels not contained in a dict / Series will be left + as-is. + + Parameters + ---------- + %(axes)s : dict-like or function, optional + Transformation to apply to that axis values + + copy : boolean, default True + Also copy underlying data + inplace : boolean, default False + Whether to return a new %(klass)s. If True then value of copy is + ignored. 
+ + Returns + ------- + renamed : %(klass)s (new object) + """ + + @Appender(_shared_docs['rename'] % dict(axes='axes keywords for this' + ' object', klass='NDFrame')) + def rename(self, *args, **kwargs): + + axes, kwargs = self._construct_axes_from_arguments(args, kwargs) + copy = kwargs.get('copy', True) + inplace = kwargs.get('inplace', False) + + if (com._count_not_none(*axes.values()) == 0): + raise TypeError('must pass an index to rename') + + # renamer function if passed a dict + def _get_rename_function(mapper): + if isinstance(mapper, (dict, ABCSeries)): + def f(x): + if x in mapper: + return mapper[x] + else: + return x + else: + f = mapper + + return f + + self._consolidate_inplace() + result = self if inplace else self.copy(deep=copy) + + # start in the axis order to eliminate too many copies + for axis in lrange(self._AXIS_LEN): + v = axes.get(self._AXIS_NAMES[axis]) + if v is None: + continue + f = _get_rename_function(v) + + baxis = self._get_block_manager_axis(axis) + result._data = result._data.rename_axis(f, axis=baxis, copy=copy) + result._clear_item_cache() + + if inplace: + self._update_inplace(result._data) + else: + return result.__finalize__(self) + + rename.__doc__ = _shared_docs['rename'] + + def rename_axis(self, mapper, axis=0, copy=True, inplace=False): + """ + Alter index and / or columns using input function or functions. + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. + + Parameters + ---------- + mapper : dict-like or function, optional + axis : int or string, default 0 + copy : boolean, default True + Also copy underlying data + inplace : boolean, default False + + Returns + ------- + renamed : type of caller + """ + axis = self._get_axis_name(axis) + d = {'copy': copy, 'inplace': inplace} + d[axis] = mapper + return self.rename(**d) + + #---------------------------------------------------------------------- + # Comparisons + + def _indexed_same(self, other): + return all([self._get_axis(a).equals(other._get_axis(a)) + for a in self._AXIS_ORDERS]) + + def __neg__(self): + values = _values_from_object(self) + if values.dtype == np.bool_: + arr = operator.inv(values) + else: + arr = operator.neg(values) + return self.__array_wrap__(arr) + + def __invert__(self): + try: + arr = operator.inv(_values_from_object(self)) + return self.__array_wrap__(arr) + except: + + # inv fails with 0 len + if not np.prod(self.shape): + return self + + raise + + def equals(self, other): + """ + Determines if two NDFrame objects contain the same elements. NaNs in the + same location are considered equal. + """ + if not isinstance(other, self._constructor): + return False + return self._data.equals(other._data) + + #---------------------------------------------------------------------- + # Iteration + + def __hash__(self): + raise TypeError('{0!r} objects are mutable, thus they cannot be' + ' hashed'.format(self.__class__.__name__)) + + def __iter__(self): + """ + Iterate over infor axis + """ + return iter(self._info_axis) + + # can we get a better explanation of this? + def keys(self): + """Get the 'info axis' (see Indexing for more) + + This is index for Series, columns for DataFrame and major_axis for + Panel.""" + return self._info_axis + + def iteritems(self): + """Iterate over (label, values) on info axis + + This is index for Series, columns for DataFrame, major_axis for Panel, + and so on. + """ + for h in self._info_axis: + yield h, self[h] + + # originally used to get around 2to3's changes to iteritems. 
+ # Now unnecessary. Sidenote: don't want to deprecate this for a while, + # otherwise libraries that use 2to3 will have issues. + def iterkv(self, *args, **kwargs): + "iteritems alias used to get around 2to3. Deprecated" + warnings.warn("iterkv is deprecated and will be removed in a future " + "release, use ``iteritems`` instead.", + DeprecationWarning) + return self.iteritems(*args, **kwargs) + + def __len__(self): + """Returns length of info axis """ + return len(self._info_axis) + + def __contains__(self, key): + """True if the key is in the info axis """ + return key in self._info_axis + + @property + def empty(self): + "True if NDFrame is entirely empty [no items]" + return not all(len(self._get_axis(a)) > 0 for a in self._AXIS_ORDERS) + + def __nonzero__(self): + raise ValueError("The truth value of a {0} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + .format(self.__class__.__name__)) + + __bool__ = __nonzero__ + + def bool(self): + """ Return the bool of a single element PandasObject + This must be a boolean scalar value, either True or False + + Raise a ValueError if the PandasObject does not have exactly + 1 element, or that element is not boolean """ + v = self.squeeze() + if isinstance(v, (bool, np.bool_)): + return bool(v) + elif np.isscalar(v): + raise ValueError("bool cannot act on a non-boolean single element " + "{0}".format(self.__class__.__name__)) + + self.__nonzero__() + + def __abs__(self): + return self.abs() + + #---------------------------------------------------------------------- + # Array Interface + + def __array__(self, dtype=None): + return _values_from_object(self) + + def __array_wrap__(self, result, context=None): + d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) + return self._constructor(result, **d).__finalize__(self) + + # ideally we would define this to avoid the getattr checks, but + # is slower + #@property + #def __array_interface__(self): + # """ provide numpy array interface method """ + # values = self.values + # return dict(typestr=values.dtype.str,shape=values.shape,data=values) + + def to_dense(self): + "Return dense representation of NDFrame (as opposed to sparse)" + # compat + return self + + #---------------------------------------------------------------------- + # Picklability + + def __getstate__(self): + return self._data + + def __setstate__(self, state): + + if isinstance(state, BlockManager): + self._data = state + elif isinstance(state, dict): + typ = state.get('_typ') + if typ is not None: + + # set in the order of internal names + # to avoid definitional recursion + # e.g. 
say fill_value needing _data to be + # defined + meta = set(self._internal_names + self._metadata) + for k in list(meta): + if k in state: + v = state[k] + object.__setattr__(self, k, v) + + for k, v in state.items(): + if k not in meta: + object.__setattr__(self, k, v) + + else: + self._unpickle_series_compat(state) + elif isinstance(state[0], dict): + if len(state) == 5: + self._unpickle_sparse_frame_compat(state) + else: + self._unpickle_frame_compat(state) + elif len(state) == 4: + self._unpickle_panel_compat(state) + elif len(state) == 2: + self._unpickle_series_compat(state) + else: # pragma: no cover + # old pickling format, for compatibility + self._unpickle_matrix_compat(state) + + self._item_cache = {} + + #---------------------------------------------------------------------- + # IO + + #---------------------------------------------------------------------- + # I/O Methods + + def to_json(self, path_or_buf=None, orient=None, date_format='epoch', + double_precision=10, force_ascii=True, date_unit='ms', + default_handler=None): + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + path_or_buf : the path or buffer to write the result string + if this is None, return a StringIO of the converted string + orient : string + + * Series + + - default is 'index' + - allowed values are: {'split','records','index'} + + * DataFrame + + - default is 'columns' + - allowed values are: + {'split','records','index','columns','values'} + + * The format of the JSON string + + - split : dict like + {index -> [index], columns -> [columns], data -> [values]} + - records : list like + [{column -> value}, ... , {column -> value}] + - index : dict like {index -> {column -> value}} + - columns : dict like {column -> {index -> value}} + - values : just the values array + + date_format : {'epoch', 'iso'} + Type of date conversion. `epoch` = epoch milliseconds, + `iso`` = ISO8601, default is epoch. + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + date_unit : string, default 'ms' (milliseconds) + The time unit to encode to, governs timestamp and ISO8601 + precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, + microsecond, and nanosecond respectively. + default_handler : callable, default None + Handler to call if object cannot otherwise be converted to a + suitable format for JSON. Should receive a single argument which is + the object to convert and return a serialisable object. + + Returns + ------- + same type as input object with filtered info axis + + """ + + from pandas.io import json + return json.to_json( + path_or_buf=path_or_buf, + obj=self, orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler) + + def to_hdf(self, path_or_buf, key, **kwargs): + """ activate the HDFStore + + Parameters + ---------- + path_or_buf : the path (string) or buffer to put the store + key : string + indentifier for the group in the store + mode : optional, {'a', 'w', 'r', 'r+'}, default 'a' + + ``'r'`` + Read-only; no data can be modified. + ``'w'`` + Write; a new file is created (an existing file with the same + name would be deleted). 
+ ``'a'`` + Append; an existing file is opened for reading and writing, + and if the file does not exist it is created. + ``'r+'`` + It is similar to ``'a'``, but the file must already exist. + format : 'fixed(f)|table(t)', default is 'fixed' + fixed(f) : Fixed format + Fast writing/reading. Not-appendable, nor searchable + table(t) : Table format + Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching + / selecting subsets of the data + append : boolean, default False + For Table formats, append the input data to the existing + complevel : int, 1-9, default 0 + If a complib is specified compression will be applied + where possible + complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None + If complevel is > 0 apply compression to objects written + in the store wherever possible + fletcher32 : bool, default False + If applying compression use the fletcher32 checksum + + """ + + from pandas.io import pytables + return pytables.to_hdf(path_or_buf, key, self, **kwargs) + + def to_msgpack(self, path_or_buf=None, **kwargs): + """ + msgpack (serialize) object to input file path + + THIS IS AN EXPERIMENTAL LIBRARY and the storage format + may not be stable until a future release. + + Parameters + ---------- + path : string File path, buffer-like, or None + if None, return generated string + append : boolean whether to append to an existing msgpack + (default is False) + compress : type of compressor (zlib or blosc), default to None (no + compression) + """ + + from pandas.io import packers + return packers.to_msgpack(path_or_buf, self, **kwargs) + + def to_sql(self, name, con, flavor='sqlite', if_exists='fail', index=True, + index_label=None): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + name : string + Name of SQL table + con : SQLAlchemy engine or DBAPI2 connection (legacy mode) + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + flavor : {'sqlite', 'mysql'}, default 'sqlite' + The flavor of SQL to use. Ignored when using SQLAlchemy engine. + 'mysql' is deprecated and will be removed in future versions, but it + will be further supported through SQLAlchemy engines. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + + """ + from pandas.io import sql + sql.to_sql( + self, name, con, flavor=flavor, if_exists=if_exists, index=index, + index_label=index_label) + + def to_pickle(self, path): + """ + Pickle (serialize) object to input file path + + Parameters + ---------- + path : string + File path + """ + from pandas.io.pickle import to_pickle + return to_pickle(self, path) + + def save(self, path): # TODO remove in 0.14 + "Deprecated. Use to_pickle instead" + import warnings + from pandas.io.pickle import to_pickle + warnings.warn("save is deprecated, use to_pickle", FutureWarning) + return to_pickle(self, path) + + def load(self, path): # TODO remove in 0.14 + "Deprecated. Use read_pickle instead." 
+ import warnings + from pandas.io.pickle import read_pickle + warnings.warn("load is deprecated, use pd.read_pickle", FutureWarning) + return read_pickle(path) + + def to_clipboard(self, excel=None, sep=None, **kwargs): + """ + Attempt to write text representation of object to the system clipboard + This can be pasted into Excel, for example. + + Parameters + ---------- + excel : boolean, defaults to True + if True, use the provided separator, writing in a csv + format for allowing easy pasting into excel. + if False, write a string representation of the object + to the clipboard + sep : optional, defaults to tab + other keywords are passed to to_csv + + Notes + ----- + Requirements for your platform + - Linux: xclip, or xsel (with gtk or PyQt4 modules) + - Windows: none + - OS X: none + """ + from pandas.io import clipboard + clipboard.to_clipboard(self, excel=excel, sep=sep, **kwargs) + + #---------------------------------------------------------------------- + # Fancy Indexing + + @classmethod + def _create_indexer(cls, name, indexer): + """ create an indexer like _name in the class """ + + if getattr(cls, name, None) is None: + iname = '_%s' % name + setattr(cls, iname, None) + + def _indexer(self): + i = getattr(self, iname) + if i is None: + i = indexer(self, name) + setattr(self, iname, i) + return i + + setattr(cls, name, property(_indexer)) + + # add to our internal names set + cls._internal_names_set.add(iname) + + def get(self, key, default=None): + """ + Get item from object for given key (DataFrame column, Panel slice, + etc.). Returns default value if not found + + Parameters + ---------- + key : object + + Returns + ------- + value : type of items contained in object + """ + try: + return self[key] + except (KeyError, ValueError): + return default + + def __getitem__(self, item): + return self._get_item_cache(item) + + def _get_item_cache(self, item): + """ return the cached item, item represents a label indexer """ + cache = self._item_cache + res = cache.get(item) + if res is None: + values = self._data.get(item) + res = self._box_item_values(item, values) + cache[item] = res + res._set_as_cached(item, self) + + # for a chain + res.is_copy = self.is_copy + return res + + def _set_as_cached(self, item, cacher): + """ set the _cacher attribute on the calling object with + a weakref to cacher """ + self._cacher = (item, weakref.ref(cacher)) + + def _iget_item_cache(self, item): + """ return the cached item, item represents a positional indexer """ + ax = self._info_axis + if ax.is_unique: + lower = self._get_item_cache(ax[item]) + else: + lower = self.take(item, axis=self._info_axis_number, convert=True) + return lower + + def _box_item_values(self, key, values): + raise NotImplementedError + + def _maybe_cache_changed(self, item, value): + """ + the object has called back to us saying + maybe it has changed + + numpy < 1.8 has an issue with object arrays and aliasing + GH6026 + """ + self._data.set(item, value, check=pd._np_version_under1p8) + + @property + def _is_cached(self): + """ boolean : return if I am cached """ + cacher = getattr(self, '_cacher', None) + return cacher is not None + + @property + def _is_view(self): + """ boolean : return if I am a view of another array """ + return self._data.is_view + + def _maybe_update_cacher(self, clear=False): + """ see if we need to update our parent cacher + if clear, then clear our cache """ + cacher = getattr(self, '_cacher', None) + if cacher is not None: + ref = cacher[1]() + + # we are trying to reference a dead 
referant, hence + # a copy + if ref is None: + del self._cacher + else: + try: + ref._maybe_cache_changed(cacher[0], self) + except: + pass + + # check if we are a copy + self._check_setitem_copy(stacklevel=5, t='referant') + + if clear: + self._clear_item_cache() + + def _clear_item_cache(self, i=None): + if i is not None: + self._item_cache.pop(i, None) + else: + self._item_cache.clear() + + def _slice(self, slobj, axis=0, typ=None): + """ + Construct a slice of this container. + + typ parameter is maintained for compatibility with Series slicing. + + """ + axis = self._get_block_manager_axis(axis) + return self._constructor(self._data.get_slice(slobj, axis=axis)) + + def _set_item(self, key, value): + self._data.set(key, value) + self._clear_item_cache() + + def _set_is_copy(self, ref=None, copy=True): + if not copy: + self.is_copy = None + else: + if ref is not None: + self.is_copy = weakref.ref(ref) + else: + self.is_copy = None + + def _check_setitem_copy(self, stacklevel=4, t='setting'): + """ validate if we are doing a settitem on a chained copy. + + If you call this function, be sure to set the stacklevel such that the + user will see the error *at the level of setting*""" + if self.is_copy: + + value = config.get_option('mode.chained_assignment') + if value is None: + return + + # see if the copy is not actually refererd; if so, then disolve + # the copy weakref + try: + gc.collect(2) + if not gc.get_referents(self.is_copy()): + self.is_copy = None + return + except: + pass + + if t == 'referant': + t = ("A value is trying to be set on a copy of a slice from a " + "DataFrame") + else: + t = ("A value is trying to be set on a copy of a slice from a " + "DataFrame.\nTry using .loc[row_index,col_indexer] = value " + "instead") + if value == 'raise': + raise SettingWithCopyError(t) + elif value == 'warn': + warnings.warn(t, SettingWithCopyWarning, stacklevel=stacklevel) + + def __delitem__(self, key): + """ + Delete item + """ + deleted = False + + maybe_shortcut = False + if hasattr(self, 'columns') and isinstance(self.columns, MultiIndex): + try: + maybe_shortcut = key not in self.columns._engine + except TypeError: + pass + + if maybe_shortcut: + # Allow shorthand to delete all columns whose first len(key) + # elements match key: + if not isinstance(key, tuple): + key = (key,) + for col in self.columns: + if isinstance(col, tuple) and col[:len(key)] == key: + del self[col] + deleted = True + if not deleted: + # If the above loop ran and didn't delete anything because + # there was no match, this call should raise the appropriate + # exception: + self._data.delete(key) + + try: + del self._item_cache[key] + except KeyError: + pass + + def take(self, indices, axis=0, convert=True, is_copy=True): + """ + Analogous to ndarray.take + + Parameters + ---------- + indices : list / array of ints + axis : int, default 0 + convert : translate neg to pos indices (default) + is_copy : mark the returned frame as a copy + + Returns + ------- + taken : type of caller + """ + + new_data = self._data.take(indices, + axis=self._get_block_manager_axis(axis), + convert=True, verify=True) + result = self._constructor(new_data).__finalize__(self) + + # maybe set copy if we didn't actually change the index + if is_copy and not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) + + return result + + def xs(self, key, axis=0, level=None, copy=None, drop_level=True): + """ + Returns a cross-section (row(s) or column(s)) from the Series/DataFrame. 
+ Defaults to cross-section on the rows (axis=0). + + Parameters + ---------- + key : object + Some label contained in the index, or partially in a MultiIndex + axis : int, default 0 + Axis to retrieve cross-section on + level : object, defaults to first n levels (n=1 or len(key)) + In case of a key partially contained in a MultiIndex, indicate + which levels are used. Levels can be referred by label or position. + copy : boolean [deprecated] + Whether to make a copy of the data + drop_level : boolean, default True + If False, returns object with same levels as self. + + Examples + -------- + >>> df + A B C + a 4 5 2 + b 4 0 9 + c 9 7 3 + >>> df.xs('a') + A 4 + B 5 + C 2 + Name: a + >>> df.xs('C', axis=1) + a 2 + b 9 + c 3 + Name: C + + >>> df + A B C D + first second third + bar one 1 4 1 8 9 + two 1 7 5 5 0 + baz one 1 6 6 8 0 + three 2 5 3 5 3 + >>> df.xs(('baz', 'three')) + A B C D + third + 2 5 3 5 3 + >>> df.xs('one', level=1) + A B C D + first third + bar 1 4 1 8 9 + baz 1 6 6 8 0 + >>> df.xs(('baz', 2), level=[0, 'third']) + A B C D + second + three 5 3 5 3 + + Returns + ------- + xs : Series or DataFrame + + Notes + ----- + xs is only for getting, not setting values. + + MultiIndex Slicers is a generic way to get/set values on any level or levels + it is a superset of xs functionality, see :ref:`MultiIndex Slicers ` + + """ + if copy is not None: + warnings.warn("copy keyword is deprecated, " + "default is to return a copy or a view if possible") + + axis = self._get_axis_number(axis) + labels = self._get_axis(axis) + if level is not None: + loc, new_ax = labels.get_loc_level(key, level=level, + drop_level=drop_level) + + # convert to a label indexer if needed + if isinstance(loc, slice): + lev_num = labels._get_level_number(level) + if labels.levels[lev_num].inferred_type == 'integer': + loc = labels[loc] + + # create the tuple of the indexer + indexer = [slice(None)] * self.ndim + indexer[axis] = loc + indexer = tuple(indexer) + + result = self.ix[indexer] + setattr(result, result._get_axis_name(axis), new_ax) + return result + + if axis == 1: + return self[key] + + self._consolidate_inplace() + + index = self.index + if isinstance(index, MultiIndex): + loc, new_index = self.index.get_loc_level(key, + drop_level=drop_level) + else: + loc = self.index.get_loc(key) + + if isinstance(loc, np.ndarray): + if loc.dtype == np.bool_: + inds, = loc.nonzero() + return self.take(inds, axis=axis, convert=False) + else: + return self.take(loc, axis=axis, convert=True) + + if not np.isscalar(loc): + new_index = self.index[loc] + + if np.isscalar(loc): + from pandas import Series + new_values = self._data.fast_xs(loc) + + # may need to box a datelike-scalar + # + # if we encounter an array-like and we only have 1 dim + # that means that their are list/ndarrays inside the Series! + # so just return them (GH 6394) + if not is_list_like(new_values) or self.ndim == 1: + return _maybe_box_datetimelike(new_values) + + result = Series(new_values, index=self.columns, + name=self.index[loc]) + + else: + result = self[loc] + result.index = new_index + + # this could be a view + # but only in a single-dtyped view slicable case + result._set_is_copy(self, copy=not result._is_view) + return result + + _xs = xs + + # TODO: Check if this was clearer in 0.12 + def select(self, crit, axis=0): + """ + Return data corresponding to axis labels matching criteria + + Parameters + ---------- + crit : function + To be called on each index (label). 
Should return True or False + axis : int + + Returns + ------- + selection : type of caller + """ + axis = self._get_axis_number(axis) + axis_name = self._get_axis_name(axis) + axis_values = self._get_axis(axis) + + if len(axis_values) > 0: + new_axis = axis_values[ + np.asarray([bool(crit(label)) for label in axis_values])] + else: + new_axis = axis_values + + return self.reindex(**{axis_name: new_axis}) + + def reindex_like(self, other, method=None, copy=True, limit=None): + """ return an object with matching indicies to myself + + Parameters + ---------- + other : Object + method : string or None + copy : boolean, default True + limit : int, default None + Maximum size gap to forward or backward fill + + Notes + ----- + Like calling s.reindex(index=other.index, columns=other.columns, + method=...) + + Returns + ------- + reindexed : same as input + """ + d = other._construct_axes_dict(method=method, copy=copy, limit=limit) + return self.reindex(**d) + + def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): + """ + Return new object with labels in requested axis removed + + Parameters + ---------- + labels : single label or list-like + axis : int or axis name + level : int or level name, default None + For MultiIndex + inplace : bool, default False + If True, do operation inplace and return None. + + Returns + ------- + dropped : type of caller + """ + axis = self._get_axis_number(axis) + axis_name = self._get_axis_name(axis) + axis, axis_ = self._get_axis(axis), axis + + if axis.is_unique: + if level is not None: + if not isinstance(axis, MultiIndex): + raise AssertionError('axis must be a MultiIndex') + new_axis = axis.drop(labels, level=level) + else: + new_axis = axis.drop(labels) + dropped = self.reindex(**{axis_name: new_axis}) + try: + dropped.axes[axis_].set_names(axis.names, inplace=True) + except AttributeError: + pass + result = dropped + + else: + labels = com._index_labels_to_array(labels) + if level is not None: + if not isinstance(axis, MultiIndex): + raise AssertionError('axis must be a MultiIndex') + indexer = ~lib.ismember(axis.get_level_values(level), + set(labels)) + else: + indexer = ~axis.isin(labels) + + slicer = [slice(None)] * self.ndim + slicer[self._get_axis_number(axis_name)] = indexer + + result = self.ix[tuple(slicer)] + + if inplace: + self._update_inplace(result) + else: + return result + + def _update_inplace(self, result): + "replace self internals with result." + # NOTE: This does *not* call __finalize__ and that's an explicit + # decision that we may revisit in the future. + self._reset_cache() + self._clear_item_cache() + self._data = getattr(result,'_data',result) + self._maybe_update_cacher() + + def add_prefix(self, prefix): + """ + Concatenate prefix string with panel items names. + + Parameters + ---------- + prefix : string + + Returns + ------- + with_prefix : type of caller + """ + new_data = self._data.add_prefix(prefix) + return self._constructor(new_data).__finalize__(self) + + def add_suffix(self, suffix): + """ + Concatenate suffix string with panel items names + + Parameters + ---------- + suffix : string + + Returns + ------- + with_suffix : type of caller + """ + new_data = self._data.add_suffix(suffix) + return self._constructor(new_data).__finalize__(self) + + def sort_index(self, axis=0, ascending=True): + """ + Sort object by labels (along an axis) + + Parameters + ---------- + axis : {0, 1} + Sort index/rows versus columns + ascending : boolean, default True + Sort ascending vs. 
descending + + Returns + ------- + sorted_obj : type of caller + """ + axis = self._get_axis_number(axis) + axis_name = self._get_axis_name(axis) + labels = self._get_axis(axis) + + sort_index = labels.argsort() + if not ascending: + sort_index = sort_index[::-1] + + new_axis = labels.take(sort_index) + return self.reindex(**{axis_name: new_axis}) + _shared_docs['reindex'] = """ + Conform %(klass)s to new index with optional filling logic, placing + NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + copy=False + + Parameters + ---------- + %(axes)s : array-like, optional (can be specified in order, or as + keywords) + New labels / index to conform to. Preferably an Index object to + avoid duplicating data + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed DataFrame + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value + limit : int, default None + Maximum size gap to forward or backward fill + + Examples + -------- + >>> df.reindex(index=[date1, date2, date3], columns=['A', 'B', 'C']) + + Returns + ------- + reindexed : %(klass)s + """ + # TODO: Decide if we care about having different examples for different + # kinds + + @Appender(_shared_docs['reindex'] % dict(axes="axes", klass="NDFrame")) + def reindex(self, *args, **kwargs): + + # construct the args + axes, kwargs = self._construct_axes_from_arguments(args, kwargs) + method = com._clean_fill_method(kwargs.get('method')) + level = kwargs.get('level') + copy = kwargs.get('copy', True) + limit = kwargs.get('limit') + fill_value = kwargs.get('fill_value', np.nan) + + self._consolidate_inplace() + + # if all axes that are requested to reindex are equal, then only copy + # if indicated must have index names equal here as well as values + if all([self._get_axis(axis).identical(ax) + for axis, ax in axes.items() if ax is not None]): + if copy: + return self.copy() + return self + + # check if we are a multi reindex + if self._needs_reindex_multi(axes, method, level): + try: + return self._reindex_multi(axes, copy, fill_value) + except: + pass + + # perform the reindex on the axes + return self._reindex_axes(axes, level, limit, + method, fill_value, copy).__finalize__(self) + + def _reindex_axes(self, axes, level, limit, method, fill_value, copy): + """ perform the reinxed for all the axes """ + obj = self + for a in self._AXIS_ORDERS: + labels = axes[a] + if labels is None: + continue + + # convert to an index if we are not a multi-selection + ax = self._get_axis(a) + if level is None: + labels = _ensure_index(labels) + + axis = self._get_axis_number(a) + new_index, indexer = ax.reindex( + labels, level=level, limit=limit, method=method) + + obj = obj._reindex_with_indexers( + {axis: [new_index, indexer]}, method=method, + fill_value=fill_value, limit=limit, copy=copy, + allow_dups=False) + + return obj + + def _needs_reindex_multi(self, axes, method, level): + """ check if we do need a multi reindex """ + return ((com._count_not_none(*axes.values()) == self._AXIS_LEN) and + method is None and 
level is None and not self._is_mixed_type) + + def _reindex_multi(self, axes, copy, fill_value): + return NotImplemented + + _shared_docs['reindex_axis'] = ( + """Conform input object to new index with optional filling logic, + placing NA/NaN in locations having no value in the previous index. A + new object is produced unless the new index is equivalent to the + current one and copy=False + + Parameters + ---------- + labels : array-like + New labels / index to conform to. Preferably an Index object to + avoid duplicating data + axis : %(axes_single_arg)s + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed object. + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + limit : int, default None + Maximum size gap to forward or backward fill + + Examples + -------- + >>> df.reindex_axis(['A', 'B', 'C'], axis=1) + + See also + -------- + reindex, reindex_like + + Returns + ------- + reindexed : %(klass)s + """) + + @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) + def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, + limit=None, fill_value=np.nan): + self._consolidate_inplace() + + axis_name = self._get_axis_name(axis) + axis_values = self._get_axis(axis_name) + method = com._clean_fill_method(method) + new_index, indexer = axis_values.reindex( + labels, method, level, limit=limit, copy_if_needed=True) + return self._reindex_with_indexers( + {axis: [new_index, indexer]}, method=method, fill_value=fill_value, + limit=limit, copy=copy) + + def _reindex_with_indexers(self, reindexers, method=None, + fill_value=np.nan, limit=None, copy=False, + allow_dups=False): + """ allow_dups indicates an internal call here """ + + # reindex doing multiple operations on different axes if indiciated + new_data = self._data + for axis in sorted(reindexers.keys()): + index, indexer = reindexers[axis] + baxis = self._get_block_manager_axis(axis) + + if index is None: + continue + + index = _ensure_index(index) + if indexer is not None: + indexer = com._ensure_int64(indexer) + + # TODO: speed up on homogeneous DataFrame objects + new_data = new_data.reindex_indexer(index, indexer, axis=baxis, + fill_value=fill_value, + allow_dups=allow_dups, + copy=copy) + + if copy and new_data is self._data: + new_data = new_data.copy() + + return self._constructor(new_data).__finalize__(self) + + def _reindex_axis(self, new_index, fill_method, axis, copy): + new_data = self._data.reindex_axis(new_index, axis=axis, + method=fill_method, copy=copy) + + if new_data is self._data and not copy: + return self + else: + return self._constructor(new_data).__finalize__(self) + + def filter(self, items=None, like=None, regex=None, axis=None): + """ + Restrict the info axis to set of items or wildcard + + Parameters + ---------- + items : list-like + List of info axis to restrict to (must not all be present) + like : string + Keep info axis where "arg in col == True" + regex : string (regular expression) + Keep info axis with re.search(regex, col) == True + axis : int or None + The axis to filter on. By default this is the info axis. The "info + axis" is the axis that is used when indexing with ``[]``. For + example, ``df = DataFrame({'a': [1, 2, 3, 4]]}); df['a']``. 
So, + the ``DataFrame`` columns are the info axis. + + Notes + ----- + Arguments are mutually exclusive, but this is not checked for + + """ + import re + + if axis is None: + axis = self._info_axis_name + axis_name = self._get_axis_name(axis) + axis_values = self._get_axis(axis_name) + + if items is not None: + return self.reindex(**{axis_name: [r for r in items + if r in axis_values]}) + elif like: + matchf = lambda x: (like in x if isinstance(x, string_types) + else like in str(x)) + return self.select(matchf, axis=axis_name) + elif regex: + matcher = re.compile(regex) + return self.select(lambda x: matcher.search(x) is not None, + axis=axis_name) + else: + raise TypeError('Must pass either `items`, `like`, or `regex`') + + def head(self, n=5): + """ + Returns first n rows + """ + l = len(self) + if l == 0 or n==0: + return self + return self.iloc[:n] + + def tail(self, n=5): + """ + Returns last n rows + """ + l = len(self) + if l == 0 or n == 0: + return self + return self.iloc[-n:] + + #---------------------------------------------------------------------- + # Attribute access + + def __finalize__(self, other, method=None, **kwargs): + """ + propagate metadata from other to self + + Parameters + ---------- + other : the object from which to get the attributes that we are going + to propagate + method : optional, a passed method name ; possibly to take different + types of propagation actions based on this + + """ + if isinstance(other, NDFrame): + for name in self._metadata: + object.__setattr__(self, name, getattr(other, name, None)) + return self + + def __getattr__(self, name): + """After regular attribute access, try looking up the name of a the + info. + + This allows simpler access to columns for interactive use. + """ + if name in self._internal_names_set: + return object.__getattribute__(self, name) + elif name in self._metadata: + return object.__getattribute__(self, name) + else: + if name in self._info_axis: + return self[name] + raise AttributeError("'%s' object has no attribute '%s'" % + (type(self).__name__, name)) + + def __setattr__(self, name, value): + """After regular attribute access, try looking up the name of the info + This allows simpler access to columns for interactive use.""" + if name in self._internal_names_set: + object.__setattr__(self, name, value) + elif name in self._metadata: + return object.__setattr__(self, name, value) + else: + try: + existing = getattr(self, name) + if isinstance(existing, Index): + object.__setattr__(self, name, value) + elif name in self._info_axis: + self[name] = value + else: + object.__setattr__(self, name, value) + except (AttributeError, TypeError): + object.__setattr__(self, name, value) + + #---------------------------------------------------------------------- + # Getting and setting elements + + #---------------------------------------------------------------------- + # Consolidation of internals + + def _consolidate_inplace(self): + f = lambda: self._data.consolidate() + self._data = self._protect_consolidate(f) + + def consolidate(self, inplace=False): + """ + Compute NDFrame with "consolidated" internals (data of each dtype + grouped together in a single ndarray). 
Mainly an internal API function, + but available here to the savvy user + + Parameters + ---------- + inplace : boolean, default False + If False return new object, otherwise modify existing object + + Returns + ------- + consolidated : type of caller + """ + if inplace: + self._consolidate_inplace() + else: + f = lambda: self._data.consolidate() + cons_data = self._protect_consolidate(f) + if cons_data is self._data: + cons_data = cons_data.copy() + return self._constructor(cons_data).__finalize__(self) + + @property + def _is_mixed_type(self): + f = lambda: self._data.is_mixed_type + return self._protect_consolidate(f) + + @property + def _is_numeric_mixed_type(self): + f = lambda: self._data.is_numeric_mixed_type + return self._protect_consolidate(f) + + @property + def _is_datelike_mixed_type(self): + f = lambda: self._data.is_datelike_mixed_type + return self._protect_consolidate(f) + + def _check_inplace_setting(self, value): + """ check whether we allow in-place setting with this type of value """ + + if self._is_mixed_type: + if not self._is_numeric_mixed_type: + + # allow an actual np.nan thru + try: + if np.isnan(value): + return True + except: + pass + + raise TypeError( + 'Cannot do inplace boolean setting on mixed-types with a non np.nan value') + + return True + + def _protect_consolidate(self, f): + blocks_before = len(self._data.blocks) + result = f() + if len(self._data.blocks) != blocks_before: + self._clear_item_cache() + return result + + def _get_numeric_data(self): + return self._constructor( + self._data.get_numeric_data()).__finalize__(self) + + def _get_bool_data(self): + return self._constructor(self._data.get_bool_data()).__finalize__(self) + + #---------------------------------------------------------------------- + # Internal Interface Methods + + def as_matrix(self, columns=None): + """ + Convert the frame to its Numpy-array representation. + + Parameters + ---------- + columns : list, optional, default None + If None, return all columns, otherwise, returns specified columns. + + Returns + ------- + values : ndarray + If the caller is heterogeneous and contains booleans or objects, + the result will be of dtype=object. See Notes. + + + Notes + ----- + Return is NOT a Numpy-matrix, rather, a Numpy-array. + + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. + + This method is provided for backwards compatibility. Generally, + it is recommended to use '.values'. + + See Also + -------- + pandas.DataFrame.values + """ + self._consolidate_inplace() + if self._AXIS_REVERSED: + return self._data.as_matrix(columns).T + return self._data.as_matrix(columns) + + @property + def values(self): + """Numpy representation of NDFrame + + Notes + ----- + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32.
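+
+ For illustration (a minimal, hypothetical frame), mixing integer and
+ float columns upcasts the returned array to float, while mixing in
+ strings falls back to object:
+
+ >>> DataFrame({'a': [1, 2], 'b': [1.5, 2.5]}).values.dtype
+ dtype('float64')
+ >>> DataFrame({'a': [1, 2], 'b': ['x', 'y']}).values.dtype
+ dtype('O')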
+ """ + return self.as_matrix() + + @property + def _get_values(self): + # compat + return self.as_matrix() + + def get_values(self): + """ same as values (but handles sparseness conversions) """ + return self.as_matrix() + + def get_dtype_counts(self): + """ Return the counts of dtypes in this object """ + from pandas import Series + return Series(self._data.get_dtype_counts()) + + def get_ftype_counts(self): + """ Return the counts of ftypes in this object """ + from pandas import Series + return Series(self._data.get_ftype_counts()) + + @property + def dtypes(self): + """ Return the dtypes in this object """ + from pandas import Series + return Series(self._data.get_dtypes(), index=self._info_axis, + dtype=np.object_) + + @property + def ftypes(self): + """ + Return the ftypes (indication of sparse/dense and dtype) + in this object. + """ + from pandas import Series + return Series(self._data.get_ftypes(), index=self._info_axis, + dtype=np.object_) + + def as_blocks(self): + """ + Convert the frame to a dict of dtype -> Constructor Types that each has + a homogeneous dtype. + + are presented in sorted order unless a specific list of columns is + provided. + + NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in + as_matrix) + + Parameters + ---------- + columns : array-like + Specific column order + + Returns + ------- + values : a list of Object + """ + self._consolidate_inplace() + + bd = {} + for b in self._data.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + result = {} + for dtype, blocks in bd.items(): + # Must combine even after consolidation, because there may be + # sparse items which are never consolidated into one block. + combined = self._data.combine(blocks, copy=True) + result[dtype] = self._constructor(combined).__finalize__(self) + + return result + + @property + def blocks(self): + "Internal property, property synonym for as_blocks()" + return self.as_blocks() + + def astype(self, dtype, copy=True, raise_on_error=True): + """ + Cast object to input numpy.dtype + Return a copy when copy = True (be really careful with this!) + + Parameters + ---------- + dtype : numpy.dtype or Python type + raise_on_error : raise on invalid input + + Returns + ------- + casted : type of caller + """ + + mgr = self._data.astype( + dtype=dtype, copy=copy, raise_on_error=raise_on_error) + return self._constructor(mgr).__finalize__(self) + + def copy(self, deep=True): + """ + Make a copy of this object + + Parameters + ---------- + deep : boolean, default True + Make a deep copy, i.e. also copy data + + Returns + ------- + copy : type of caller + """ + data = self._data + if deep: + data = data.copy() + return self._constructor(data).__finalize__(self) + + def convert_objects(self, convert_dates=True, convert_numeric=False, + convert_timedeltas=True, copy=True): + """ + Attempt to infer better dtype for object columns + + Parameters + ---------- + convert_dates : if True, attempt to soft convert dates, if 'coerce', + force conversion (and non-convertibles get NaT) + convert_numeric : if True attempt to coerce to numbers (including + strings), non-convertibles get NaN + convert_timedeltas : if True, attempt to soft convert timedeltas, if 'coerce', + force conversion (and non-convertibles get NaT) + copy : Boolean, if True, return copy even if no copy is necessary + (e.g. no conversion was done), default is True. + It is meant for internal use, not to be confused with `inplace` kw. 
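+
+ For illustration (a hypothetical object column holding numeric strings),
+ ``convert_numeric=True`` coerces it to a numeric dtype and turns
+ non-convertible entries into NaN:
+
+ >>> Series(['1', '2', 'x']).convert_objects(convert_numeric=True).dtype
+ dtype('float64')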
+ + Returns + ------- + converted : same as input object + """ + return self._constructor( + self._data.convert(convert_dates=convert_dates, + convert_numeric=convert_numeric, + convert_timedeltas=convert_timedeltas, + copy=copy)).__finalize__(self) + + #---------------------------------------------------------------------- + # Filling NA's + + def fillna(self, value=None, method=None, axis=0, inplace=False, + limit=None, downcast=None): + """ + Fill NA/NaN values using the specified method + + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + value : scalar, dict, or Series + Value to use to fill holes (e.g. 0), alternately a dict/Series of + values specifying which value to use for each index (for a Series) or + column (for a DataFrame). (values not in the dict/Series will not be + filled). This value cannot be a list. + axis : {0, 1}, default 0 + * 0: fill column-by-column + * 1: fill row-by-row + inplace : boolean, default False + If True, fill in place. Note: this will modify any + other views on this object (e.g. a no-copy slice for a column in a + DataFrame). + limit : int, default None + Maximum size gap to forward or backward fill + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible) + + See also + -------- + reindex, asfreq + + Returns + ------- + filled : same type as caller + """ + if isinstance(value, (list, tuple)): + raise TypeError('"value" parameter must be a scalar or dict, but ' + 'you passed a "{0}"'.format(type(value).__name__)) + self._consolidate_inplace() + + axis = self._get_axis_number(axis) + method = com._clean_fill_method(method) + + if value is None: + if method is None: + raise ValueError('must specify a fill method or value') + if self._is_mixed_type and axis == 1: + if inplace: + raise NotImplementedError() + result = self.T.fillna(method=method, limit=limit).T + + # need to downcast here because of all of the transposes + result._data = result._data.downcast() + + return result + + # > 3d + if self.ndim > 3: + raise NotImplementedError( + 'Cannot fillna with a method for > 3dims' + ) + + # 3d + elif self.ndim == 3: + + # fill in 2d chunks + result = dict([(col, s.fillna(method=method, value=value)) + for col, s in compat.iteritems(self)]) + return self._constructor.from_dict(result).__finalize__(self) + + # 2d or less + method = com._clean_fill_method(method) + new_data = self._data.interpolate(method=method, + axis=axis, + limit=limit, + inplace=inplace, + coerce=True, + downcast=downcast) + else: + if method is not None: + raise ValueError('cannot specify both a fill method and value') + + if len(self._get_axis(axis)) == 0: + return self + + if self.ndim == 1 and value is not None: + if isinstance(value, (dict, com.ABCSeries)): + from pandas import Series + value = Series(value) + + new_data = self._data.fillna(value=value, + limit=limit, + inplace=inplace, + downcast=downcast) + + elif isinstance(value, (dict, com.ABCSeries)): + if axis == 1: + raise NotImplementedError('Currently only can fill ' + 'with dict/Series column ' + 'by column') + + result = self if inplace else self.copy() + for k, v in compat.iteritems(value): + if k not in result: + continue + obj = result[k] +
obj.fillna(v, limit=limit, inplace=True) + return result + else: + new_data = self._data.fillna(value=value, + limit=limit, + inplace=inplace, + downcast=downcast) + + if inplace: + self._update_inplace(new_data) + else: + return self._constructor(new_data).__finalize__(self) + + def ffill(self, axis=0, inplace=False, limit=None, downcast=None): + "Synonym for NDFrame.fillna(method='ffill')" + return self.fillna(method='ffill', axis=axis, inplace=inplace, + limit=limit, downcast=downcast) + + def bfill(self, axis=0, inplace=False, limit=None, downcast=None): + "Synonym for NDFrame.fillna(method='bfill')" + return self.fillna(method='bfill', axis=axis, inplace=inplace, + limit=limit, downcast=downcast) + + def replace(self, to_replace=None, value=None, inplace=False, limit=None, + regex=False, method='pad', axis=None): + """ + Replace values given in 'to_replace' with 'value'. + + Parameters + ---------- + to_replace : str, regex, list, dict, Series, numeric, or None + + * str or regex: + + - str: string exactly matching `to_replace` will be replaced + with `value` + - regex: regexs matching `to_replace` will be replaced with + `value` + + * list of str, regex, or numeric: + + - First, if `to_replace` and `value` are both lists, they + **must** be the same length. + - Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + - str and regex rules apply as above. + + * dict: + + - Nested dictionaries, e.g., {'a': {'b': nan}}, are read as + follows: look in column 'a' for the value 'b' and replace it + with nan. You can nest regular expressions as well. Note that + column names (the top-level dictionary keys in a nested + dictionary) **cannot** be regular expressions. + - Keys map to column names and values map to substitution + values. You can treat this as a special case of passing two + lists except that you are specifying the column to search in. + + * None: + + - This means that the ``regex`` argument must be a string, + compiled regular expression, or list, dict, ndarray or Series + of such elements. If `value` is also ``None`` then this + **must** be a nested dictionary or ``Series``. + + See the examples section for examples of each of these. + value : scalar, dict, list, str, regex, default None + Value to use to fill holes (e.g. 0), alternately a dict of values + specifying which value to use for each column (columns not in the + dict will not be filled). Regular expressions, strings and lists or + dicts of such objects are also allowed. + inplace : boolean, default False + If True, in place. Note: this will modify any + other views on this object (e.g. a column from a DataFrame). + Returns the caller if this is True. + limit : int, default None + Maximum size gap to forward or backward fill + regex : bool or same types as `to_replace`, default False + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. Otherwise, `to_replace` must be ``None`` because this + parameter will be interpreted as a regular expression or a list, + dict, or array of regular expressions. + method : string, optional, {'pad', 'ffill', 'bfill'} + The method to use for replacement when ``to_replace`` is a + ``list``.
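+
+ For illustration (a hypothetical frame ``df``), a nested dict confines
+ the replacement to one column, while a flat dict maps old values to new
+ values anywhere in the object:
+
+ >>> df.replace({'A': {'': np.nan}})      # only searches column 'A'
+ >>> df.replace({-999: np.nan, -1000: 0}) # applies to every column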
+ + See also + -------- + NDFrame.reindex + NDFrame.asfreq + NDFrame.fillna + + Returns + ------- + filled : NDFrame + + Raises + ------ + AssertionError + * If `regex` is not a ``bool`` and `to_replace` is not ``None``. + TypeError + * If `to_replace` is a ``dict`` and `value` is not a ``list``, + ``dict``, ``ndarray``, or ``Series`` + * If `to_replace` is ``None`` and `regex` is not compilable into a + regular expression or is a list, dict, ndarray, or Series. + ValueError + * If `to_replace` and `value` are ``list`` s or ``ndarray`` s, but + they are not the same length. + + Notes + ----- + * Regex substitution is performed under the hood with ``re.sub``. The + rules for substitution for ``re.sub`` are the same. + * Regular expressions will only substitute on strings, meaning you + cannot provide, for example, a regular expression matching floating + point numbers and expect the columns in your frame that have a + numeric dtype to be matched. However, if those floating point numbers + *are* strings, then you can do this. + * This method has *a lot* of options. You are encouraged to experiment + and play with this method to gain intuition about how it works. + + """ + if not com.is_bool(regex) and to_replace is not None: + raise AssertionError("'to_replace' must be 'None' if 'regex' is " + "not a bool") + if axis is not None: + from warnings import warn + warn('the "axis" argument is deprecated and will be removed in' + 'v0.13; this argument has no effect') + + self._consolidate_inplace() + + if value is None: + # passing a single value that is scalar like + # when value is None (GH5319), for compat + if not is_dictlike(to_replace) and not is_dictlike(regex): + to_replace = [to_replace] + + if isinstance(to_replace, (tuple, list)): + return _single_replace(self, to_replace, method, inplace, + limit) + + if not is_dictlike(to_replace): + if not is_dictlike(regex): + raise TypeError('If "to_replace" and "value" are both None' + ' and "to_replace" is not a list, then ' + 'regex must be a mapping') + to_replace = regex + regex = True + + items = list(compat.iteritems(to_replace)) + keys, values = zip(*items) + + are_mappings = [is_dictlike(v) for v in values] + + if any(are_mappings): + if not all(are_mappings): + raise TypeError("If a nested mapping is passed, all values" + " of the top level mapping must be " + "mappings") + # passed a nested dict/Series + to_rep_dict = {} + value_dict = {} + + for k, v in items: + keys, values = zip(*v.items()) + if set(keys) & set(values): + raise ValueError("Replacement not allowed with " + "overlapping keys and values") + to_rep_dict[k] = list(keys) + value_dict[k] = list(values) + + to_replace, value = to_rep_dict, value_dict + else: + to_replace, value = keys, values + + return self.replace(to_replace, value, inplace=inplace, + limit=limit, regex=regex) + else: + + # need a non-zero len on all axes + for a in self._AXIS_ORDERS: + if not len(self._get_axis(a)): + return self + + new_data = self._data + if is_dictlike(to_replace): + if is_dictlike(value): # {'A' : NA} -> {'A' : 0} + res = self if inplace else self.copy() + for c, src in compat.iteritems(to_replace): + if c in value and c in self: + res[c] = res[c].replace(to_replace=src, + value=value[c], + inplace=False, + regex=regex) + return None if inplace else res + + # {'A': NA} -> 0 + elif not com.is_list_like(value): + for k, src in compat.iteritems(to_replace): + if k in self: + new_data = new_data.replace(to_replace=src, + value=value, + filter=[k], + inplace=inplace, + regex=regex) + else: 
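+ # `value` is list-like here; a list replacement value is not
+ # supported together with a dict-like `to_replace`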
+ raise TypeError('value argument must be scalar, dict, or ' + 'Series') + + elif com.is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] + if com.is_list_like(value): + if len(to_replace) != len(value): + raise ValueError('Replacement lists must match ' + 'in length. Expecting %d got %d ' % + (len(to_replace), len(value))) + + new_data = self._data.replace_list(src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex) + + else: # [NA, ''] -> 0 + new_data = self._data.replace(to_replace=to_replace, + value=value, + inplace=inplace, + regex=regex) + elif to_replace is None: + if not (com.is_re_compilable(regex) or + com.is_list_like(regex) or + is_dictlike(regex)): + raise TypeError("'regex' must be a string or a compiled " + "regular expression or a list or dict of " + "strings or regular expressions, you " + "passed a" + " {0!r}".format(type(regex).__name__)) + return self.replace(regex, value, inplace=inplace, limit=limit, + regex=True) + else: + + # dest iterable dict-like + if is_dictlike(value): # NA -> {'A' : 0, 'B' : -1} + new_data = self._data + + for k, v in compat.iteritems(value): + if k in self: + new_data = new_data.replace(to_replace=to_replace, + value=v, + filter=[k], + inplace=inplace, + regex=regex) + + elif not com.is_list_like(value): # NA -> 0 + new_data = self._data.replace(to_replace=to_replace, value=value, + inplace=inplace, regex=regex) + else: + msg = ('Invalid "to_replace" type: ' + '{0!r}').format(type(to_replace).__name__) + raise TypeError(msg) # pragma: no cover + + new_data = new_data.convert(copy=not inplace, convert_numeric=False) + + if inplace: + self._update_inplace(new_data) + else: + return self._constructor(new_data).__finalize__(self) + + def interpolate(self, method='linear', axis=0, limit=None, inplace=False, + downcast=None, **kwargs): + """ + Interpolate values according to different methods. + + Parameters + ---------- + method : {'linear', 'time', 'index', 'values', 'nearest', 'zero', + 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', + 'polynomial', 'spline', 'piecewise_polynomial', 'pchip'} + + * 'linear': ignore the index and treat the values as equally + spaced. default + * 'time': interpolation works on daily and higher resolution + data to interpolate given length of interval + * 'index', 'values': use the actual numerical values of the index + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'polynomial' are passed to + `scipy.interpolate.interp1d` with the order given. Both + 'polynomial' and 'spline' require that you also specify an order + (int), e.g. df.interpolate(method='polynomial', order=4) + * 'krogh', 'piecewise_polynomial', 'spline', and 'pchip' are all + wrappers around the scipy interpolation methods of similar + names. See the scipy documentation for more on their behavior: + http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation + http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html + + axis : {0, 1}, default 0 + * 0: fill column-by-column + * 1: fill row-by-row + limit : int, default None. + Maximum number of consecutive NaNs to fill. + inplace : bool, default False + Update the NDFrame in place if possible. + downcast : optional, 'infer' or None, defaults to None + Downcast dtypes if possible.
+ + Returns + ------- + Series or DataFrame of same shape interpolated at the NaNs + + See Also + -------- + reindex, replace, fillna + + Examples + -------- + + # Filling in NaNs: + >>> s = pd.Series([0, 1, np.nan, 3]) + >>> s.interpolate() + 0 0 + 1 1 + 2 2 + 3 3 + dtype: float64 + + """ + if self.ndim > 2: + raise NotImplementedError("Interpolate has not been implemented " + "on Panel and Panel 4D objects.") + + if axis == 0: + ax = self._info_axis_name + elif axis == 1: + self = self.T + ax = 1 + ax = self._get_axis_number(ax) + + if self.ndim == 2: + alt_ax = 1 - ax + else: + alt_ax = ax + + if isinstance(self.index, MultiIndex) and method != 'linear': + raise ValueError("Only `method=linear` interpolation is supported " + "on MultiIndexes.") + + if self._data.get_dtype_counts().get('object') == len(self.T): + raise TypeError("Cannot interpolate with all NaNs.") + + # create/use the index + if method == 'linear': + index = np.arange(len(self._get_axis(alt_ax))) # prior default + else: + index = self._get_axis(alt_ax) + + if pd.isnull(index).any(): + raise NotImplementedError("Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating.") + new_data = self._data.interpolate(method=method, + axis=ax, + index=index, + values=self, + limit=limit, + inplace=inplace, + downcast=downcast, + **kwargs) + if inplace: + if axis == 1: + self._update_inplace(new_data) + self = self.T + else: + self._update_inplace(new_data) + else: + res = self._constructor(new_data).__finalize__(self) + if axis == 1: + res = res.T + return res + + #---------------------------------------------------------------------- + # Action Methods + + def isnull(self): + """ + Return a boolean same-sized object indicating if the values are null + + See also + -------- + notnull : boolean inverse of isnull + """ + return isnull(self).__finalize__(self) + + def notnull(self): + """Return a boolean same-sized object indicating if the values are + not null + + See also + -------- + isnull : boolean inverse of notnull + """ + return notnull(self).__finalize__(self) + + def clip(self, lower=None, upper=None, out=None): + """ + Trim values at input threshold(s) + + Parameters + ---------- + lower : float, default None + upper : float, default None + + Returns + ------- + clipped : Series + """ + if out is not None: # pragma: no cover + raise Exception('out argument is not supported yet') + + # GH 2747 (arguments were reversed) + if lower is not None and upper is not None: + lower, upper = min(lower, upper), max(lower, upper) + + result = self + if lower is not None: + result = result.clip_lower(lower) + if upper is not None: + result = result.clip_upper(upper) + + return result + + def clip_upper(self, threshold): + """ + Return copy of input with values above given value truncated + + See also + -------- + clip + + Returns + ------- + clipped : same type as input + """ + if isnull(threshold): + raise ValueError("Cannot use an NA value as a clip threshold") + + return self.where((self <= threshold) | isnull(self), threshold) + + def clip_lower(self, threshold): + """ + Return copy of the input with values below given value truncated + + See also + -------- + clip + + Returns + ------- + clipped : same type as input + """ + if isnull(threshold): + raise ValueError("Cannot use an NA value as a clip threshold") + + return self.where((self >= threshold) | isnull(self), threshold) + + def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, + group_keys=True, 
squeeze=False): + """ + Group series using mapper (dict or key function, apply given function + to group, return result as series) or by a series of columns + + Parameters + ---------- + by : mapping function / list of functions, dict, Series, or tuple / + list of column names. + Called on each element of the object index to determine the groups. + If a dict or Series is passed, the Series or dict VALUES will be + used to determine the groups + axis : int, default 0 + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels + as_index : boolean, default True + For aggregated output, return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output + sort : boolean, default True + Sort group keys. Get better performance by turning this off + group_keys : boolean, default True + When calling apply, add group keys to index to identify pieces + squeeze : boolean, default False + reduce the dimensionality of the return type if possible, + otherwise return a consistent type + + Examples + -------- + # DataFrame result + >>> data.groupby(func, axis=0).mean() + + # Series result (single column selected) + >>> data.groupby(['col1', 'col2'])['col3'].mean() + + # DataFrame with hierarchical index + >>> data.groupby(['col1', 'col2']).mean() + + Returns + ------- + GroupBy object + + """ + + from pandas.core.groupby import groupby + axis = self._get_axis_number(axis) + return groupby(self, by, axis=axis, level=level, as_index=as_index, + sort=sort, group_keys=group_keys, squeeze=squeeze) + + def asfreq(self, freq, method=None, how=None, normalize=False): + """ + Convert all TimeSeries inside to specified frequency using DateOffset + objects. Optionally provide fill method to pad/backfill missing values. + + Parameters + ---------- + freq : DateOffset object, or string + method : {'backfill', 'bfill', 'pad', 'ffill', None} + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + how : {'start', 'end'}, default end + For PeriodIndex only, see PeriodIndex.asfreq + normalize : bool, default False + Whether to reset output index to midnight + + Returns + ------- + converted : type of caller + """ + from pandas.tseries.resample import asfreq + return asfreq(self, freq, method=method, how=how, + normalize=normalize) + + def at_time(self, time, asof=False): + """ + Select values at particular time of day (e.g.
9:30AM) + + Parameters + ---------- + time : datetime.time or string + + Returns + ------- + values_at_time : type of caller + """ + try: + indexer = self.index.indexer_at_time(time, asof=asof) + return self.take(indexer, convert=False) + except AttributeError: + raise TypeError('Index must be DatetimeIndex') + + def between_time(self, start_time, end_time, include_start=True, + include_end=True): + """ + Select values between particular times of the day (e.g., 9:00-9:30 AM) + + Parameters + ---------- + start_time : datetime.time or string + end_time : datetime.time or string + include_start : boolean, default True + include_end : boolean, default True + + Returns + ------- + values_between_time : type of caller + """ + try: + indexer = self.index.indexer_between_time( + start_time, end_time, include_start=include_start, + include_end=include_end) + return self.take(indexer, convert=False) + except AttributeError: + raise TypeError('Index must be DatetimeIndex') + + def resample(self, rule, how=None, axis=0, fill_method=None, + closed=None, label=None, convention='start', + kind=None, loffset=None, limit=None, base=0): + """ + Convenience method for frequency conversion and resampling of regular + time-series data. + + Parameters + ---------- + rule : string + the offset string or object representing target conversion + how : string + method for down- or re-sampling, defaults to 'mean' for + downsampling + axis : int, optional, default 0 + fill_method : string, default None + fill_method for upsampling + closed : {'right', 'left'} + Which side of bin interval is closed + label : {'right', 'left'} + Which bin edge label to label bucket with + convention : {'start', 'end', 's', 'e'} + kind : "period"/"timestamp" + loffset : timedelta + Adjust the resampled time labels + limit : int, default None + Maximum size gap when reindexing with fill_method + base : int, default 0 + For frequencies that evenly subdivide 1 day, the "origin" of the + aggregated intervals. For example, for '5min' frequency, base could + range from 0 through 4. Defaults to 0 + """ + from pandas.tseries.resample import TimeGrouper + axis = self._get_axis_number(axis) + sampler = TimeGrouper(rule, label=label, closed=closed, how=how, + axis=axis, kind=kind, loffset=loffset, + fill_method=fill_method, convention=convention, + limit=limit, base=base) + return sampler.resample(self).__finalize__(self) + + def first(self, offset): + """ + Convenience method for subsetting initial periods of time series data + based on a date offset + + Parameters + ---------- + offset : string, DateOffset, dateutil.relativedelta + + Examples + -------- + ts.first('10D') -> First 10 days + + Returns + ------- + subset : type of caller + """ + from pandas.tseries.frequencies import to_offset + if not isinstance(self.index, DatetimeIndex): + raise NotImplementedError + + if len(self.index) == 0: + return self + + offset = to_offset(offset) + end_date = end = self.index[0] + offset + + # Tick-like, e.g.
3 weeks + if not offset.isAnchored() and hasattr(offset, '_inc'): + if end_date in self.index: + end = self.index.searchsorted(end_date, side='left') + + return self.ix[:end] + + def last(self, offset): + """ + Convenience method for subsetting final periods of time series data + based on a date offset + + Parameters + ---------- + offset : string, DateOffset, dateutil.relativedelta + + Examples + -------- + ts.last('5M') -> Last 5 months + + Returns + ------- + subset : type of caller + """ + from pandas.tseries.frequencies import to_offset + if not isinstance(self.index, DatetimeIndex): + raise NotImplementedError + + if len(self.index) == 0: + return self + + offset = to_offset(offset) + + start_date = start = self.index[-1] - offset + start = self.index.searchsorted(start_date, side='right') + return self.ix[start:] + + def align(self, other, join='outer', axis=None, level=None, copy=True, + fill_value=None, method=None, limit=None, fill_axis=0): + """ + Align two object on their axes with the + specified join method for each axis Index + + Parameters + ---------- + other : DataFrame or Series + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + axis : allowed axis of the other object, default None + Align on index (0), columns (1), or both (None) + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level + copy : boolean, default True + Always returns new objects. If copy=False and no reindexing is + required then original objects are returned. + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value + method : str, default None + limit : int, default None + fill_axis : {0, 1}, default 0 + Filling axis, method and limit + + Returns + ------- + (left, right) : (type of input, type of other) + Aligned objects + """ + from pandas import DataFrame, Series + method = com._clean_fill_method(method) + + if axis is not None: + axis = self._get_axis_number(axis) + if isinstance(other, DataFrame): + return self._align_frame(other, join=join, axis=axis, level=level, + copy=copy, fill_value=fill_value, + method=method, limit=limit, + fill_axis=fill_axis) + elif isinstance(other, Series): + return self._align_series(other, join=join, axis=axis, level=level, + copy=copy, fill_value=fill_value, + method=method, limit=limit, + fill_axis=fill_axis) + else: # pragma: no cover + raise TypeError('unsupported type: %s' % type(other)) + + def _align_frame(self, other, join='outer', axis=None, level=None, + copy=True, fill_value=np.nan, method=None, limit=None, + fill_axis=0): + # defaults + join_index, join_columns = None, None + ilidx, iridx = None, None + clidx, cridx = None, None + + if axis is None or axis == 0: + if not self.index.equals(other.index): + join_index, ilidx, iridx = \ + self.index.join(other.index, how=join, level=level, + return_indexers=True) + + if axis is None or axis == 1: + if not self.columns.equals(other.columns): + join_columns, clidx, cridx = \ + self.columns.join(other.columns, how=join, level=level, + return_indexers=True) + + left = self._reindex_with_indexers({0: [join_index, ilidx], + 1: [join_columns, clidx]}, + copy=copy, fill_value=fill_value, + allow_dups=True) + right = other._reindex_with_indexers({0: [join_index, iridx], + 1: [join_columns, cridx]}, + copy=copy, fill_value=fill_value, + allow_dups=True) + + if method is not None: + left = left.fillna(axis=fill_axis, method=method, limit=limit) + right = 
right.fillna(axis=fill_axis, method=method, limit=limit) + + return left.__finalize__(self), right.__finalize__(other) + + def _align_series(self, other, join='outer', axis=None, level=None, + copy=True, fill_value=None, method=None, limit=None, + fill_axis=0): + from pandas import DataFrame + + # series/series compat + if isinstance(self, ABCSeries) and isinstance(other, ABCSeries): + if axis: + raise ValueError('cannot align series to a series other than ' + 'axis 0') + + join_index, lidx, ridx = self.index.join(other.index, how=join, + level=level, + return_indexers=True) + + left_result = self._reindex_indexer(join_index, lidx, copy) + right_result = other._reindex_indexer(join_index, ridx, copy) + + else: + + # for join compat if we have an unnamed index, but + # are specifying a level join + other_index = other.index + if level is not None and other.index.name is None: + other_index = other_index.set_names([level]) + + # one has > 1 ndim + fdata = self._data + if axis == 0: + join_index = self.index + lidx, ridx = None, None + if not self.index.equals(other_index): + join_index, lidx, ridx = self.index.join( + other_index, how=join, return_indexers=True) + + if lidx is not None: + fdata = fdata.reindex_indexer(join_index, lidx, axis=1) + + elif axis == 1: + join_index = self.columns + lidx, ridx = None, None + if not self.columns.equals(other_index): + join_index, lidx, ridx = \ + self.columns.join(other_index, how=join, + return_indexers=True) + + if lidx is not None: + fdata = fdata.reindex_indexer(join_index, lidx, axis=0) + else: + raise ValueError('Must specify axis=0 or 1') + + if copy and fdata is self._data: + fdata = fdata.copy() + + left_result = DataFrame(fdata) + + if ridx is None: + right_result = other + else: + right_result = other.reindex(join_index, level=level) + + # fill + fill_na = notnull(fill_value) or (method is not None) + if fill_na: + return (left_result.fillna(fill_value, method=method, limit=limit, + axis=fill_axis), + right_result.fillna(fill_value, method=method, + limit=limit)) + else: + return (left_result.__finalize__(self), + right_result.__finalize__(other)) + + def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, + try_cast=False, raise_on_error=True): + """ + Return an object of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from other. + + Parameters + ---------- + cond : boolean NDFrame or array + other : scalar or NDFrame + inplace : boolean, default False + Whether to perform the operation in place on the data + axis : alignment axis if needed, default None + level : alignment level if needed, default None + try_cast : boolean, default False + try to cast the result back to the input type (if possible), + raise_on_error : boolean, default True + Whether to raise on invalid data types (e.g. 
trying to where on + strings) + + Returns + ------- + wh : same type as caller + """ + if isinstance(cond, NDFrame): + cond = cond.reindex(**self._construct_axes_dict()) + else: + if not hasattr(cond, 'shape'): + raise ValueError('where requires an ndarray like object for ' + 'its condition') + if cond.shape != self.shape: + raise ValueError( + 'Array conditional must be same shape as self') + cond = self._constructor(cond, **self._construct_axes_dict()) + + if inplace: + cond = -(cond.fillna(True).astype(bool)) + else: + cond = cond.fillna(False).astype(bool) + + # try to align + try_quick = True + if hasattr(other, 'align'): + + # align with me + if other.ndim <= self.ndim: + + _, other = self.align(other, join='left', + axis=axis, level=level, + fill_value=np.nan) + + # if we are NOT aligned, raise as we cannot where index + if (axis is None and + not all([other._get_axis(i).equals(ax) + for i, ax in enumerate(self.axes)])): + raise InvalidIndexError + + # slice me out of the other + else: + raise NotImplementedError( + "cannot align with a higher dimensional NDFrame" + ) + + elif is_list_like(other): + + if self.ndim == 1: + + # try to set the same dtype as ourselves + new_other = np.array(other, dtype=self.dtype) + if not (new_other == np.array(other)).all(): + other = np.array(other) + + # we can't use our existing dtype + # because of incompatibilities + try_quick = False + else: + other = new_other + else: + + other = np.array(other) + + if isinstance(other, np.ndarray): + + if other.shape != self.shape: + + if self.ndim == 1: + + icond = cond.values + + # GH 2745 / GH 4192 + # treat like a scalar + if len(other) == 1: + other = np.array(other[0]) + + # GH 3235 + # match True cond to other + elif len(cond[icond]) == len(other): + + # try to not change dtype at first (if try_quick) + if try_quick: + + try: + new_other = _values_from_object(self).copy() + new_other[icond] = other + other = new_other + except: + try_quick = False + + # let's create a new (if we failed at the above + # or not try_quick) + if not try_quick: + + dtype, fill_value = _maybe_promote(other.dtype) + new_other = np.empty(len(icond), dtype=dtype) + new_other.fill(fill_value) + com._maybe_upcast_putmask(new_other, icond, other) + other = new_other + + else: + raise ValueError( + 'Length of replacements must equal series length') + + else: + raise ValueError('other must be the same shape as self ' + 'when an ndarray') + + # we are the same shape, so create an actual object for alignment + else: + other = self._constructor(other, **self._construct_axes_dict()) + + if inplace: + # we may have different type blocks come out of putmask, so + # reconstruct the block manager + + self._check_inplace_setting(other) + new_data = self._data.putmask(mask=cond, new=other, align=axis is None, + inplace=True) + self._update_inplace(new_data) + + else: + new_data = self._data.where(other=other, cond=cond, align=axis is None, + raise_on_error=raise_on_error, + try_cast=try_cast) + + return self._constructor(new_data).__finalize__(self) + + def mask(self, cond): + """ + Returns copy whose values are replaced with nan if the + inverted condition is True + + Parameters + ---------- + cond : boolean NDFrame or array + + Returns + ------- + wh: same as input + """ + return self.where(~cond, np.nan) + + def shift(self, periods=1, freq=None, axis=0, **kwds): + """ + Shift index by desired number of periods with an optional time freq + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + freq :
DateOffset, timedelta, or time rule string, optional + Increment to use from datetools module or time rule (e.g. 'EOM'). + See Notes. + + Notes + ----- + If freq is specified then the index values are shifted but the data + is not realigned. That is, use freq if you would like to extend the + index when shifting and preserve the original data. + + Returns + ------- + shifted : same type as caller + """ + if periods == 0: + return self + + block_axis = self._get_block_manager_axis(axis) + if freq is None and not len(kwds): + new_data = self._data.shift(periods=periods, axis=block_axis) + else: + return self.tshift(periods, freq, **kwds) + + return self._constructor(new_data).__finalize__(self) + + def slice_shift(self, periods=1, axis=0, **kwds): + """ + Equivalent to `shift` without copying data. The shifted data will + not include the dropped periods and the shifted axis will be smaller + than the original. + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + + Notes + ----- + While the `slice_shift` is faster than `shift`, you may pay for it + later during alignment. + + Returns + ------- + shifted : same type as caller + """ + if periods == 0: + return self + + if periods > 0: + vslicer = slice(None, -periods) + islicer = slice(periods, None) + else: + vslicer = slice(-periods, None) + islicer = slice(None, periods) + + new_obj = self._slice(vslicer, axis=axis) + shifted_axis = self._get_axis(axis)[islicer] + new_obj.set_axis(axis, shifted_axis) + + return new_obj.__finalize__(self) + + def tshift(self, periods=1, freq=None, axis=0, **kwds): + """ + Shift the time index, using the index's frequency if available + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + freq : DateOffset, timedelta, or time rule string, default None + Increment to use from datetools module or time rule (e.g. 'EOM') + axis : int or basestring + Corresponds to the axis that contains the Index + + Notes + ----- + If freq is not specified then tries to use the freq or inferred_freq + attributes of the index. If neither of those attributes exist, a + ValueError is thrown + + Returns + ------- + shifted : NDFrame + """ + from pandas.core.datetools import _resolve_offset + + index = self._get_axis(axis) + if freq is None: + freq = getattr(index, 'freq', None) + + if freq is None: + freq = getattr(index, 'inferred_freq', None) + + if freq is None: + msg = 'Freq was not given and was not set in the index' + raise ValueError(msg) + + if periods == 0: + return self + + offset = _resolve_offset(freq, kwds) + + if isinstance(offset, string_types): + offset = datetools.to_offset(offset) + + block_axis = self._get_block_manager_axis(axis) + if isinstance(index, PeriodIndex): + orig_offset = datetools.to_offset(index.freq) + if offset == orig_offset: + new_data = self._data.copy() + new_data.axes[block_axis] = index.shift(periods) + else: + msg = ('Given freq %s does not match PeriodIndex freq %s' % + (offset.rule_code, orig_offset.rule_code)) + raise ValueError(msg) + else: + new_data = self._data.copy() + new_data.axes[block_axis] = index.shift(periods, offset) + + return self._constructor(new_data).__finalize__(self) + + def truncate(self, before=None, after=None, axis=None, copy=True): + """Truncates a sorted NDFrame before and/or after some particular + dates. 
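+
+ For example, assuming ``ts`` is indexed by a sorted ``DatetimeIndex``:
+
+ >>> ts.truncate(before='2011-01-01', after='2011-12-31')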
+ + Parameters + ---------- + before : date + Truncate before date + after : date + Truncate after date + axis : the truncation axis, defaults to the stat axis + copy : boolean, default is True, + return a copy of the truncated section + + Returns + ------- + truncated : type of caller + """ + + if axis is None: + axis = self._stat_axis_number + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + # if we have a date index, convert to dates, otherwise + # treat like a slice + if ax.is_all_dates: + from pandas.tseries.tools import to_datetime + before = to_datetime(before) + after = to_datetime(after) + + if before is not None and after is not None: + if before > after: + raise ValueError('Truncate: %s must be after %s' % + (after, before)) + + slicer = [slice(None, None)] * self._AXIS_LEN + slicer[axis] = slice(before, after) + result = self.ix[tuple(slicer)] + + if isinstance(ax, MultiIndex): + setattr(result, self._get_axis_name(axis), + ax.truncate(before, after)) + + if copy: + result = result.copy() + + return result + + def tz_convert(self, tz, axis=0, copy=True): + """ + Convert the axis to target time zone. If it is time zone naive, it + will be localized to the passed time zone. + + Parameters + ---------- + tz : string or pytz.timezone object + copy : boolean, default True + Also make a copy of the underlying data + + Returns + ------- + """ + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + if not hasattr(ax, 'tz_convert'): + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + else: + ax = DatetimeIndex([],tz=tz) + else: + ax = ax.tz_convert(tz) + + result = self._constructor(self._data, copy=copy) + result.set_axis(axis,ax) + return result.__finalize__(self) + + def tz_localize(self, tz, axis=0, copy=True, infer_dst=False): + """ + Localize tz-naive TimeSeries to target time zone + + Parameters + ---------- + tz : string or pytz.timezone object + copy : boolean, default True + Also make a copy of the underlying data + infer_dst : boolean, default False + Attempt to infer fall dst-transition times based on order + + Returns + ------- + """ + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + if not hasattr(ax, 'tz_localize'): + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + else: + ax = DatetimeIndex([],tz=tz) + else: + ax = ax.tz_localize(tz, infer_dst=infer_dst) + + result = self._constructor(self._data, copy=copy) + result.set_axis(axis,ax) + return result.__finalize__(self) + + #---------------------------------------------------------------------- + # Numeric Methods + def abs(self): + """ + Return an object with absolute value taken. Only applicable to objects + that are all numeric + + Returns + ------- + abs: type of caller + """ + + # suprimo numpy 1.6 hacking + # for timedeltas + if _np_version_under1p7: + + def _convert_timedeltas(x): + if x.dtype.kind == 'm': + return np.abs(x.view('i8')).astype(x.dtype) + return np.abs(x) + + if self.ndim == 1: + return _convert_timedeltas(self) + elif self.ndim == 2: + return self.apply(_convert_timedeltas) + + return np.abs(self) + + _shared_docs['describe'] = """ + Generate various summary statistics, excluding NaN values. + + Parameters + ---------- + percentile_width : float, deprecated + The ``percentile_width`` argument will be removed in a future + version. Use ``percentiles`` instead. 
+ width of the desired uncertainty interval, default is 50, + which corresponds to lower=25, upper=75 + percentiles : array-like, optional + The percentiles to include in the output. Should all + be in the interval [0, 1]. By default `percentiles` is + [.25, .5, .75], returning the 25th, 50th, and 75th percentiles. + + Returns + ------- + summary: %(klass)s of summary statistics + + Notes + ----- + For numeric dtypes the index includes: count, mean, std, min, + max, and lower, 50, and upper percentiles. + + If self is of object dtypes (e.g. timestamps or strings), the output + will include the count, unique, most common, and frequency of the + most common. Timestamps also include the first and last items. + + If multiple values have the highest count, then the + `count` and `most common` pair will be arbitrarily chosen from + among those with the highest count. + """ + + @Appender(_shared_docs['describe'] % _shared_doc_kwargs) + def describe(self, percentile_width=None, percentiles=None): + if self.ndim >= 3: + msg = "describe is not implemented on on Panel or PanelND objects." + raise NotImplementedError(msg) + + if percentile_width is not None and percentiles is not None: + msg = "Cannot specify both 'percentile_width' and 'percentiles.'" + raise ValueError(msg) + if percentiles is not None: + # get them all to be in [0, 1] + percentiles = np.asarray(percentiles) + if (percentiles > 1).any(): + percentiles = percentiles / 100.0 + msg = ("percentiles should all be in the interval [0, 1]. " + "Try {0} instead.") + raise ValueError(msg.format(list(percentiles))) + else: + # only warn if they change the default + if percentile_width is not None: + do_warn = True + else: + do_warn = False + percentile_width = percentile_width or 50 + lb = .5 * (1. - percentile_width / 100.) + ub = 1. - lb + percentiles = np.array([lb, 0.5, ub]) + if do_warn: + msg = ("The `percentile_width` keyword is deprecated. 
" + "Use percentiles={0} instead".format(list(percentiles))) + warnings.warn(msg, FutureWarning) + + # median should always be included + if (percentiles != 0.5).all(): # median isn't included + lh = percentiles[percentiles < .5] + uh = percentiles[percentiles > .5] + percentiles = np.hstack([lh, 0.5, uh]) + + # dtypes: numeric only, numeric mixed, objects only + data = self._get_numeric_data() + if self.ndim > 1: + if len(data._info_axis) == 0: + is_object = True + else: + is_object = False + else: + is_object = not self._is_numeric_mixed_type + + def pretty_name(x): + x *= 100 + if x == int(x): + return '%.0f%%' % x + else: + return '%.1f%%' % x + + def describe_numeric_1d(series, percentiles): + return ([series.count(), series.mean(), series.std(), + series.min()] + + [series.quantile(x) for x in percentiles] + + [series.max()]) + + def describe_categorical_1d(data): + names = ['count', 'unique'] + objcounts = data.value_counts() + result = [data.count(), len(objcounts)] + if result[1] > 0: + top, freq = objcounts.index[0], objcounts.iloc[0] + + if data.dtype == object: + names += ['top', 'freq'] + result += [top, freq] + + elif issubclass(data.dtype.type, np.datetime64): + asint = data.dropna().values.view('i8') + names += ['first', 'last', 'top', 'freq'] + result += [lib.Timestamp(asint.min()), + lib.Timestamp(asint.max()), + lib.Timestamp(top), freq] + + return pd.Series(result, index=names) + + if is_object: + if data.ndim == 1: + return describe_categorical_1d(self) + else: + result = pd.DataFrame(dict((k, describe_categorical_1d(v)) + for k, v in compat.iteritems(self)), + columns=self._info_axis, + index=['count', 'unique', 'first', 'last', + 'top', 'freq']) + # just objects, no datime + if pd.isnull(result.loc['first']).all(): + result = result.drop(['first', 'last'], axis=0) + return result + else: + stat_index = (['count', 'mean', 'std', 'min'] + + [pretty_name(x) for x in percentiles] + + ['max']) + if data.ndim == 1: + return pd.Series(describe_numeric_1d(data, percentiles), + index=stat_index) + else: + destat = [] + for i in range(len(data._info_axis)): # BAD + series = data.iloc[:, i] + destat.append(describe_numeric_1d(series, percentiles)) + + return self._constructor(lmap(list, zip(*destat)), + index=stat_index, + columns=data._info_axis) + + _shared_docs['pct_change'] = """ + Percent change over given number of periods. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change + fill_method : str, default 'pad' + How to handle NAs before computing percent changes + limit : int, default None + The number of consecutive NAs to fill before stopping + freq : DateOffset, timedelta, or offset alias string, optional + Increment to use from time series API (e.g. 'M' or BDay()) + + Returns + ------- + chg : %(klass)s + + Notes + ----- + + By default, the percentage change is calculated along the stat + axis: 0, or ``Index``, for ``DataFrame`` and 1, or ``minor`` for + ``Panel``. You can change this with the ``axis`` keyword argument. + """ + + @Appender(_shared_docs['pct_change'] % _shared_doc_kwargs) + def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, + **kwds): + # TODO: Not sure if above is correct - need someone to confirm. 
+ axis = self._get_axis_number(kwds.pop('axis', self._stat_axis_name)) + if fill_method is None: + data = self + else: + data = self.fillna(method=fill_method, limit=limit) + + rs = (data.div(data.shift(periods=periods, freq=freq, + axis=axis, **kwds)) - 1) + if freq is None: + mask = com.isnull(_values_from_object(self)) + np.putmask(rs.values, mask, np.nan) + return rs + + def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwds): + grouped = self.groupby(level=level, axis=axis) + if hasattr(grouped, name) and skipna: + return getattr(grouped, name)(**kwds) + axis = self._get_axis_number(axis) + method = getattr(type(self), name) + applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwds) + return grouped.aggregate(applyf) + + @classmethod + def _add_numeric_operations(cls): + """ add the operations to the cls; evaluate the doc strings again """ + + axis_descr = "{%s}" % ', '.join([ + "{0} ({1})".format(a, i) for i, a in enumerate(cls._AXIS_ORDERS) + ]) + name = (cls._constructor_sliced.__name__ + if cls._AXIS_LEN > 1 else 'scalar') + _num_doc = """ + +%(desc)s + +Parameters +---------- +axis : """ + axis_descr + """ +skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA +level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a """ + name + """ +numeric_only : boolean, default None + Include only float, int, boolean data. If None, will attempt to use + everything, then use only numeric data + +Returns +------- +%(outname)s : """ + name + " or " + cls.__name__ + " (if level specified)\n" + + _cnum_doc = """ + +Parameters +---------- +axis : """ + axis_descr + """ +skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + +Returns +------- +%(outname)s : """ + name + "\n" + + def _make_stat_function(name, desc, f): + + @Substitution(outname=name, desc=desc) + @Appender(_num_doc) + def stat_func(self, axis=None, skipna=None, level=None, + numeric_only=None, **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, + skipna=skipna) + return self._reduce(f, axis=axis, + skipna=skipna, numeric_only=numeric_only) + stat_func.__name__ = name + return stat_func + + cls.sum = _make_stat_function( + 'sum', 'Return the sum of the values for the requested axis', + nanops.nansum) + cls.mean = _make_stat_function( + 'mean', 'Return the mean of the values for the requested axis', + nanops.nanmean) + cls.skew = _make_stat_function( + 'skew', + 'Return unbiased skew over requested axis\nNormalized by N-1', + nanops.nanskew) + cls.kurt = _make_stat_function( + 'kurt', + 'Return unbiased kurtosis over requested axis\nNormalized by N-1', + nanops.nankurt) + cls.kurtosis = cls.kurt + cls.prod = _make_stat_function( + 'prod', 'Return the product of the values for the requested axis', + nanops.nanprod) + cls.product = cls.prod + cls.median = _make_stat_function( + 'median', 'Return the median of the values for the requested axis', + nanops.nanmedian) + cls.max = _make_stat_function('max', """ +This method returns the maximum of the values in the object. If you +want the *index* of the maximum, use ``idxmax``. 
This is the +equivalent of the ``numpy.ndarray`` method ``argmax``.""", nanops.nanmax) + cls.min = _make_stat_function('min', """ +This method returns the minimum of the values in the object. If you +want the *index* of the minimum, use ``idxmin``. This is the +equivalent of the ``numpy.ndarray`` method ``argmin``.""", nanops.nanmin) + + @Substitution(outname='mad', + desc="Return the mean absolute deviation of the values " + "for the requested axis") + @Appender(_num_doc) + def mad(self, axis=None, skipna=None, level=None, **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level('mad', axis=axis, level=level, + skipna=skipna) + + data = self._get_numeric_data() + if axis == 0: + demeaned = data - data.mean(axis=0) + else: + demeaned = data.sub(data.mean(axis=1), axis=0) + return np.abs(demeaned).mean(axis=axis, skipna=skipna) + cls.mad = mad + + @Substitution(outname='variance', + desc="Return unbiased variance over requested " + "axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument") + @Appender(_num_doc) + def var(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level('var', axis=axis, level=level, + skipna=skipna, ddof=ddof) + + return self._reduce(nanops.nanvar, axis=axis, skipna=skipna, + ddof=ddof) + cls.var = var + + @Substitution(outname='stdev', + desc="Return unbiased standard deviation over requested " + "axis.\n\nNormalized by N-1 by default. " + "This can be changed using the ddof argument") + @Appender(_num_doc) + def std(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level('std', axis=axis, level=level, + skipna=skipna, ddof=ddof) + result = self.var(axis=axis, skipna=skipna, ddof=ddof) + if getattr(result, 'ndim', 0) > 0: + return result.apply(np.sqrt) + return np.sqrt(result) + cls.std = std + + @Substitution(outname='standarderror', + desc="Return unbiased standard error of the mean over " + "requested axis.\n\nNormalized by N-1 by default. 
" + "This can be changed using the ddof argument") + @Appender(_num_doc) + def sem(self, axis=None, skipna=None, level=None, ddof=1, **kwargs): + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level('sem', axis=axis, level=level, + skipna=skipna, ddof=ddof) + + return self._reduce(nanops.nansem, axis=axis, skipna=skipna, + ddof=ddof) + cls.sem = sem + + @Substitution(outname='compounded', + desc="Return the compound percentage of the values for " + "the requested axis") + @Appender(_num_doc) + def compound(self, axis=None, skipna=None, level=None, **kwargs): + if skipna is None: + skipna = True + return (1 + self).prod(axis=axis, skipna=skipna, level=level) - 1 + cls.compound = compound + + def _make_cum_function(name, accum_func, mask_a, mask_b): + + @Substitution(outname=name) + @Appender("Return cumulative {0} over requested axis.".format(name) + + _cnum_doc) + def func(self, axis=None, dtype=None, out=None, skipna=True, + **kwargs): + if axis is None: + axis = self._stat_axis_number + else: + axis = self._get_axis_number(axis) + + y = _values_from_object(self).copy() + if not issubclass(y.dtype.type, (np.integer, np.bool_)): + mask = isnull(self) + if skipna: + np.putmask(y, mask, mask_a) + result = accum_func(y, axis) + if skipna: + np.putmask(result, mask, mask_b) + else: + result = accum_func(y, axis) + + d = self._construct_axes_dict() + d['copy'] = False + return self._constructor(result, **d).__finalize__(self) + + func.__name__ = name + return func + + cls.cummin = _make_cum_function( + 'min', lambda y, axis: np.minimum.accumulate(y, axis), + np.inf, np.nan) + cls.cumsum = _make_cum_function( + 'sum', lambda y, axis: y.cumsum(axis), 0., np.nan) + cls.cumprod = _make_cum_function( + 'prod', lambda y, axis: y.cumprod(axis), 1., np.nan) + cls.cummax = _make_cum_function( + 'max', lambda y, axis: np.maximum.accumulate(y, axis), + -np.inf, np.nan) + +# install the indexerse +for _name, _indexer in indexing.get_indexers_list(): + NDFrame._create_indexer(_name, _indexer) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py new file mode 100644 index 00000000..249aa0af --- /dev/null +++ b/pandas/core/groupby.py @@ -0,0 +1,3566 @@ +import types +from functools import wraps +import numpy as np +import datetime +import collections + +from pandas.compat import( + zip, builtins, range, long, lzip, + OrderedDict, callable +) +from pandas import compat + +from pandas.core.base import PandasObject +from pandas.core.categorical import Categorical +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame +from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes +from pandas.core.internals import BlockManager, make_block +from pandas.core.series import Series +from pandas.core.panel import Panel +from pandas.util.decorators import cache_readonly, Appender +import pandas.core.algorithms as algos +import pandas.core.common as com +from pandas.core.common import(_possibly_downcast_to_dtype, isnull, + notnull, _DATELIKE_DTYPES, is_numeric_dtype, + is_timedelta64_dtype, is_datetime64_dtype) + +from pandas import _np_version_under1p7 +import pandas.lib as lib +from pandas.lib import Timestamp +import pandas.tslib as tslib +import pandas.algos as _algos +import pandas.hashtable as _hash + +_agg_doc = """Aggregate using input function or dict of {column -> function} + +Parameters +---------- +arg : function or dict + Function to use for aggregating groups. 
If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. If + passed a dict, the keys must be DataFrame column names. + +Notes +----- +Numpy functions mean/median/prod/sum/std/var are special cased so the +default behavior is applying the function along axis=0 +(e.g., np.mean(arr_2d, axis=0)) as opposed to +mimicking the default Numpy behavior (e.g., np.mean(arr_2d)). + +Returns +------- +aggregated : DataFrame +""" + + +# special case to prevent duplicate plots when catching exceptions when +# forwarding methods from NDFrames +_plotting_methods = frozenset(['plot', 'boxplot', 'hist']) + +_common_apply_whitelist = frozenset([ + 'last', 'first', + 'head', 'tail', 'median', + 'mean', 'sum', 'min', 'max', + 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', + 'resample', + 'describe', + 'rank', 'quantile', 'count', + 'fillna', + 'mad', + 'any', 'all', + 'irow', 'take', + 'idxmax', 'idxmin', + 'shift', 'tshift', + 'ffill', 'bfill', + 'pct_change', 'skew', + 'corr', 'cov', 'diff', +]) | _plotting_methods + +_series_apply_whitelist = \ + (_common_apply_whitelist - set(['boxplot'])) | \ + frozenset(['dtype', 'value_counts', 'unique', 'nunique', + 'nlargest', 'nsmallest']) + +_dataframe_apply_whitelist = \ + _common_apply_whitelist | frozenset(['dtypes', 'corrwith']) + + +class GroupByError(Exception): + pass + + +class DataError(GroupByError): + pass + + +class SpecificationError(GroupByError): + pass + + +def _groupby_function(name, alias, npfunc, numeric_only=True, + _convert=False): + def f(self): + self._set_selection_from_grouper() + try: + return self._cython_agg_general(alias, numeric_only=numeric_only) + except AssertionError as e: + raise SpecificationError(str(e)) + except Exception: + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + if _convert: + result = result.convert_objects() + return result + + f.__doc__ = "Compute %s of group values" % name + f.__name__ = name + + return f + + +def _first_compat(x, axis=0): + def _first(x): + x = np.asarray(x) + x = x[notnull(x)] + if len(x) == 0: + return np.nan + return x[0] + + if isinstance(x, DataFrame): + return x.apply(_first, axis=axis) + else: + return _first(x) + + +def _last_compat(x, axis=0): + def _last(x): + x = np.asarray(x) + x = x[notnull(x)] + if len(x) == 0: + return np.nan + return x[-1] + + if isinstance(x, DataFrame): + return x.apply(_last, axis=axis) + else: + return _last(x) + + +def _count_compat(x, axis=0): + return x.size + + +class Grouper(object): + """ + A Grouper allows the user to specify a groupby instruction for a target object + + This specification will select a column via the key parameter, or if the level and/or + axis parameters are given, a level of the index of the target object. + + These are local specifications and will override 'global' settings, that is the parameters + axis and level which are passed to the groupby itself. 
+ + Parameters + ---------- + key : string, defaults to None + groupby key, which selects the grouping column of the target + level : name/number, defaults to None + the level for the target index + freq : string / frequency object, defaults to None + This will groupby the specified frequency if the target selection (via key or level) is + a datetime-like object + axis : number/name of the axis, defaults to None + sort : boolean, default False + whether to sort the resulting labels + + additional kwargs to control time-like groupers (when freq is passed) + + closed : closed end of interval; left or right + label : interval boundary to use for labeling; left or right + convention : {'start', 'end', 'e', 's'} + If grouper is PeriodIndex + + Returns + ------- + A specification for a groupby instruction + + Examples + -------- + >>> df.groupby(Grouper(key='A')) : syntactic sugar for df.groupby('A') + >>> df.groupby(Grouper(key='date',freq='60s')) : specify a resample on the column 'date' + >>> df.groupby(Grouper(level='date',freq='60s',axis=1)) : + specify a resample on the level 'date' on the columns axis with a frequency of 60s + + """ + + def __new__(cls, *args, **kwargs): + if kwargs.get('freq') is not None: + from pandas.tseries.resample import TimeGrouper + cls = TimeGrouper + return super(Grouper, cls).__new__(cls) + + def __init__(self, key=None, level=None, freq=None, axis=None, sort=False): + self.key=key + self.level=level + self.freq=freq + self.axis=axis + self.sort=sort + + self.grouper=None + self.obj=None + self.indexer=None + self.binner=None + self.grouper=None + + @property + def ax(self): + return self.grouper + + def _get_grouper(self, obj): + + """ + Parameters + ---------- + obj : the subject object + + Returns + ------- + a tuple of binner, grouper, obj (possibly sorted) + """ + + self._set_grouper(obj) + return self.binner, self.grouper, self.obj + + def _set_grouper(self, obj, sort=False): + """ + given an object and the specifications, set up the internal grouper for this particular specification + + Parameters + ---------- + obj : the subject object + + """ + + if self.key is not None and self.level is not None: + raise ValueError("The Grouper cannot specify both a key and a level!") + + # the key must be a valid info item + if self.key is not None: + key = self.key + if key not in obj._info_axis: + raise KeyError("The grouper name {0} is not found".format(key)) + ax = Index(obj[key],name=key) + + else: + ax = obj._get_axis(self.axis) + if self.level is not None: + level = self.level + + # if a level is given it must be a mi level or + # equivalent to the axis name + if isinstance(ax, MultiIndex): + + if isinstance(level, compat.string_types): + if obj.index.name != level: + raise ValueError('level name %s is not the name of the ' + 'index' % level) + elif level > 0: + raise ValueError('level > 0 only valid with MultiIndex') + ax = Index(ax.get_level_values(level), name=level) + + else: + if not (level == 0 or level == ax.name): + raise ValueError("The grouper level {0} is not valid".format(level)) + + # possibly sort + if (self.sort or sort) and not ax.is_monotonic: + indexer = self.indexer = ax.argsort(kind='quicksort') + ax = ax.take(indexer) + obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False) + + self.obj = obj + self.grouper = ax + return self.grouper + + def _get_binner_for_grouping(self, obj): + raise NotImplementedError + + @property + def groups(self): + return self.grouper.groups + +class GroupBy(PandasObject): + + """ + Class for
grouping and aggregating relational data. See aggregate, + transform, and apply functions on this object. + + It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: + + :: + + grouped = groupby(obj, ...) + + Parameters + ---------- + obj : pandas object + axis : int, default 0 + level : int, default None + Level of MultiIndex + groupings : list of Grouping objects + Most users should ignore this + exclusions : array-like, optional + List of columns to exclude + name : string + Most users should ignore this + + Notes + ----- + After grouping, see aggregate, apply, and transform functions. Here are + some other brief notes about usage. When grouping by multiple groups, the + result index will be a MultiIndex (hierarchical) by default. + + Iteration produces (key, group) tuples, i.e. chunking the data by group. So + you can write code like: + + :: + + grouped = obj.groupby(keys, axis=axis) + for key, group in grouped: + # do something with the data + + Function calls on GroupBy, if not specially implemented, "dispatch" to the + grouped data. So if you group a DataFrame and wish to invoke the std() + method on each group, you can simply do: + + :: + + df.groupby(mapper).std() + + rather than + + :: + + df.groupby(mapper).aggregate(np.std) + + You can pass arguments to these "wrapped" functions, too. + + See the online documentation for full exposition on these topics and much + more + + Returns + ------- + **Attributes** + groups : dict + {group name -> group labels} + len(grouped) : int + Number of groups + """ + _apply_whitelist = _common_apply_whitelist + _internal_names = ['_cache'] + _internal_names_set = set(_internal_names) + _group_selection = None + + def __init__(self, obj, keys=None, axis=0, level=None, + grouper=None, exclusions=None, selection=None, as_index=True, + sort=True, group_keys=True, squeeze=False): + self._selection = selection + + if isinstance(obj, NDFrame): + obj._consolidate_inplace() + + self.level = level + + if not as_index: + if not isinstance(obj, DataFrame): + raise TypeError('as_index=False only valid with DataFrame') + if axis != 0: + raise ValueError('as_index=False only valid for axis=0') + + self.as_index = as_index + self.keys = keys + self.sort = sort + self.group_keys = group_keys + self.squeeze = squeeze + + if grouper is None: + grouper, exclusions, obj = _get_grouper(obj, keys, axis=axis, + level=level, sort=sort) + + self.obj = obj + self.axis = obj._get_axis_number(axis) + self.grouper = grouper + self.exclusions = set(exclusions) if exclusions else set() + + def __len__(self): + return len(self.indices) + + def __unicode__(self): + # TODO: Better unicode/repr for GroupBy object + return object.__repr__(self) + + @property + def groups(self): + """ dict {group name -> group labels} """ + return self.grouper.groups + + @property + def ngroups(self): + return self.grouper.ngroups + + @property + def indices(self): + """ dict {group name -> group indices} """ + return self.grouper.indices + + def _get_index(self, name): + """ safe get index, translate keys for datelike to underlying repr """ + + def convert(key, s): + # possibly convert to they actual key types + # in the indices, could be a Timestamp or a np.datetime64 + + if isinstance(s, (Timestamp,datetime.datetime)): + return Timestamp(key) + elif isinstance(s, np.datetime64): + return Timestamp(key).asm8 + return key + + sample = next(iter(self.indices)) + if isinstance(sample, tuple): + if not isinstance(name, tuple): + raise ValueError("must supply a tuple to get_group with 
multiple grouping keys") + if not len(name) == len(sample): + raise ValueError("must supply a a same-length tuple to get_group with multiple grouping keys") + + name = tuple([ convert(n, k) for n, k in zip(name,sample) ]) + + else: + + name = convert(name, sample) + + return self.indices[name] + + @property + def name(self): + if self._selection is None: + return None # 'result' + else: + return self._selection + + @property + def _selection_list(self): + if not isinstance(self._selection, (list, tuple, Series, np.ndarray)): + return [self._selection] + return self._selection + + @cache_readonly + def _selected_obj(self): + + if self._selection is None or isinstance(self.obj, Series): + if self._group_selection is not None: + return self.obj[self._group_selection] + return self.obj + else: + return self.obj[self._selection] + + def _set_selection_from_grouper(self): + """ we may need create a selection if we have non-level groupers """ + grp = self.grouper + if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1: + ax = self.obj._info_axis + groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ] + if len(groupers): + self._group_selection = (ax-Index(groupers)).tolist() + + def _local_dir(self): + return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) + + def __getattr__(self, attr): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self.obj: + return self[attr] + if hasattr(self.obj, attr): + return self._make_wrapper(attr) + + raise AttributeError("%r object has no attribute %r" % + (type(self).__name__, attr)) + + def __getitem__(self, key): + raise NotImplementedError('Not implemented: %s' % key) + + def _make_wrapper(self, name): + if name not in self._apply_whitelist: + is_callable = callable(getattr(self._selected_obj, name, None)) + kind = ' callable ' if is_callable else ' ' + msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try " + "using the 'apply' method".format(kind, name, + type(self).__name__)) + raise AttributeError(msg) + + # need to setup the selection + # as are not passed directly but in the grouper + self._set_selection_from_grouper() + + f = getattr(self._selected_obj, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) + + f = getattr(type(self._selected_obj), name) + + def wrapper(*args, **kwargs): + # a little trickery for aggregation functions that need an axis + # argument + kwargs_with_axis = kwargs.copy() + if 'axis' not in kwargs_with_axis: + kwargs_with_axis['axis'] = self.axis + + def curried_with_axis(x): + return f(x, *args, **kwargs_with_axis) + + def curried(x): + return f(x, *args, **kwargs) + + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = curried_with_axis.__name__ = name + + # special case otherwise extra plots are created when catching the + # exception below + if name in _plotting_methods: + return self.apply(curried) + + try: + return self.apply(curried_with_axis) + except Exception: + try: + return self.apply(curried) + except Exception: + + # related to : GH3688 + # try item-by-item + # this can be called recursively, so need to raise ValueError if + # we don't have this method to indicated to aggregate to + # mark this column as an error + try: + return self._aggregate_item_by_item(name, *args, **kwargs) + except (AttributeError): + raise ValueError + + return wrapper + + def 
get_group(self, name, obj=None): + """ + Constructs NDFrame from group with provided name + + Parameters + ---------- + name : object + the name of the group to get as a DataFrame + obj : NDFrame, default None + the NDFrame to take the DataFrame out of. If + it is None, the object groupby was called on will + be used + + Returns + ------- + group : type of obj + """ + if obj is None: + obj = self._selected_obj + + inds = self._get_index(name) + return obj.take(inds, axis=self.axis, convert=False) + + def __iter__(self): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + return self.grouper.get_iterator(self.obj, axis=self.axis) + + def apply(self, func, *args, **kwargs): + """ + Apply function and combine results together in an intelligent way. The + split-apply-combine combination rules attempt to be as common sense + based as possible. For example: + + case 1: + group DataFrame + apply aggregation function (f(chunk) -> Series) + yield DataFrame, with group axis having group labels + + case 2: + group DataFrame + apply transform function ((f(chunk) -> DataFrame with same indexes) + yield DataFrame with resulting chunks glued together + + case 3: + group Series + apply function with f(chunk) -> DataFrame + yield DataFrame with result of chunks glued together + + Parameters + ---------- + func : function + + Notes + ----- + See online documentation for full exposition on how to use apply. + + In the current implementation apply calls func twice on the + first group to decide whether it can take a fast or slow code + path. This can lead to unexpected behavior if func has + side-effects, as they will take effect twice for the first + group. + + + See also + -------- + aggregate, transform + + Returns + ------- + applied : type depending on grouped object and function + """ + func = _intercept_function(func) + + @wraps(func) + def f(g): + return func(g, *args, **kwargs) + + return self._python_apply_general(f) + + def _python_apply_general(self, f): + keys, values, mutated = self.grouper.apply(f, self._selected_obj, + self.axis) + + return self._wrap_applied_output(keys, values, + not_indexed_same=mutated) + + def aggregate(self, func, *args, **kwargs): + raise NotImplementedError + + @Appender(_agg_doc) + def agg(self, func, *args, **kwargs): + return self.aggregate(func, *args, **kwargs) + + def _iterate_slices(self): + yield self.name, self._selected_obj + + def transform(self, func, *args, **kwargs): + raise NotImplementedError + + def mean(self): + """ + Compute mean of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + try: + return self._cython_agg_general('mean') + except GroupByError: + raise + except Exception: # pragma: no cover + self._set_selection_from_grouper() + f = lambda x: x.mean(axis=self.axis) + return self._python_agg_general(f) + + def median(self): + """ + Compute median of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + try: + return self._cython_agg_general('median') + except GroupByError: + raise + except Exception: # pragma: no cover + + self._set_selection_from_grouper() + def f(x): + if isinstance(x, np.ndarray): + x = Series(x) + return x.median(axis=self.axis) + return self._python_agg_general(f) + + def std(self, ddof=1): + """ + Compute standard deviation of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + # 
todo, implement at cython level? + return np.sqrt(self.var(ddof=ddof)) + + def var(self, ddof=1): + """ + Compute variance of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + if ddof == 1: + return self._cython_agg_general('var') + else: + self._set_selection_from_grouper() + f = lambda x: x.var(ddof=ddof) + return self._python_agg_general(f) + + def sem(self, ddof=1): + """ + Compute standard error of the mean of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + return self.std(ddof=ddof)/np.sqrt(self.count()) + + def size(self): + """ + Compute group sizes + + """ + return self.grouper.size() + + sum = _groupby_function('sum', 'add', np.sum) + prod = _groupby_function('prod', 'prod', np.prod) + min = _groupby_function('min', 'min', np.min, numeric_only=False) + max = _groupby_function('max', 'max', np.max, numeric_only=False) + first = _groupby_function('first', 'first', _first_compat, + numeric_only=False, _convert=True) + last = _groupby_function('last', 'last', _last_compat, numeric_only=False, + _convert=True) + _count = _groupby_function('_count', 'count', _count_compat, + numeric_only=False) + + def count(self, axis=0): + return self._count().astype('int64') + + def ohlc(self): + """ + Compute sum of values, excluding missing values + For multiple groupings, the result index will be a MultiIndex + """ + return self._apply_to_column_groupbys( + lambda x: x._cython_agg_general('ohlc')) + + def nth(self, n, dropna=None): + """ + Take the nth row from each group. + + If dropna, will not show nth non-null row, dropna is either + Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent + to calling dropna(how=dropna) before the groupby. + + Examples + -------- + >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + >>> g = df.groupby('A') + >>> g.nth(0) + A B + 0 1 NaN + 2 5 6 + >>> g.nth(1) + A B + 1 1 4 + >>> g.nth(-1) + A B + 1 1 4 + 2 5 6 + >>> g.nth(0, dropna='any') + B + A + 1 4 + 5 6 + >>> g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + B + A + 1 NaN + 5 NaN + + """ + + self._set_selection_from_grouper() + if not dropna: # good choice + m = self.grouper._max_groupsize + if n >= m or n < -m: + return self._selected_obj.loc[[]] + rng = np.zeros(m, dtype=bool) + if n >= 0: + rng[n] = True + is_nth = self._cumcount_array(rng) + else: + rng[- n - 1] = True + is_nth = self._cumcount_array(rng, ascending=False) + + result = self._selected_obj[is_nth] + + # the result index + if self.as_index: + ax = self.obj._info_axis + names = self.grouper.names + if self.obj.ndim == 1: + # this is a pass-thru + pass + elif all([ n in ax for n in names ]): + result.index = Index(self.obj[names][is_nth].values.ravel()).set_names(names) + elif self._group_selection is not None: + result.index = self.obj._get_axis(self.axis)[is_nth] + + result = result.sort_index() + + return result + + if (isinstance(self._selected_obj, DataFrame) + and dropna not in ['any', 'all']): + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed %s)." % (dropna),) + + # old behaviour, but with all and any support for DataFrames. 
+ # modified in GH 7559 to have better perf + max_len = n if n >= 0 else - 1 - n + dropped = self.obj.dropna(how=dropna, axis=self.axis) + + # get a new grouper for our dropped obj + if self.keys is None and self.level is None: + + # we don't have the grouper info available (e.g. we have selected out + # a column that is not in the current object) + axis = self.grouper.axis + grouper = axis[axis.isin(dropped.index)] + keys = self.grouper.names + else: + + # create a grouper with the original parameters, but on the dropped object + grouper, _, _ = _get_grouper(dropped, key=self.keys, axis=self.axis, + level=self.level, sort=self.sort) + + sizes = dropped.groupby(grouper).size() + result = dropped.groupby(grouper).nth(n) + mask = (sizes>> self.apply(lambda x: Series(np.arange(len(x)), x.index)) + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + + Example + ------- + + >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], + ... columns=['A']) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').cumcount() + 0 0 + 1 1 + 2 2 + 3 0 + 4 1 + 5 3 + dtype: int64 + >>> df.groupby('A').cumcount(ascending=False) + 0 3 + 1 2 + 2 1 + 3 1 + 4 0 + 5 0 + dtype: int64 + + """ + self._set_selection_from_grouper() + ascending = kwargs.pop('ascending', True) + + index = self._selected_obj.index + cumcounts = self._cumcount_array(ascending=ascending) + return Series(cumcounts, index) + + def head(self, n=5): + """ + Returns first n rows of each group. + + Essentially equivalent to ``.apply(lambda x: x.head(n))``, + except ignores as_index flag. + + Example + ------- + + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], + columns=['A', 'B']) + >>> df.groupby('A', as_index=False).head(1) + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(1) + A B + 0 1 2 + 2 5 6 + + """ + obj = self._selected_obj + in_head = self._cumcount_array() < n + head = obj[in_head] + return head + + def tail(self, n=5): + """ + Returns last n rows of each group + + Essentially equivalent to ``.apply(lambda x: x.tail(n))``, + except ignores as_index flag. + + Example + ------- + + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], + columns=['A', 'B']) + >>> df.groupby('A', as_index=False).tail(1) + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(1) + A B + 0 1 2 + 2 5 6 + + """ + obj = self._selected_obj + rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') + in_tail = self._cumcount_array(rng, ascending=False) > -n + tail = obj[in_tail] + return tail + + def _cumcount_array(self, arr=None, **kwargs): + """ + arr is where cumcount gets it's values from + + note: this is currently implementing sort=False (though the default is sort=True) + for groupby in general + """ + ascending = kwargs.pop('ascending', True) + + if arr is None: + arr = np.arange(self.grouper._max_groupsize, dtype='int64') + + len_index = len(self._selected_obj.index) + cumcounts = np.zeros(len_index, dtype=arr.dtype) + if not len_index: + return cumcounts + + indices, values = [], [] + for v in self.indices.values(): + indices.append(v) + + if ascending: + values.append(arr[:len(v)]) + else: + values.append(arr[len(v)-1::-1]) + + indices = np.concatenate(indices) + values = np.concatenate(values) + cumcounts[indices] = values + + return cumcounts + + def _index_with_as_index(self, b): + """ + Take boolean mask of index to be returned from apply, if as_index=True + + """ + # TODO perf, it feels like this should already be somewhere... 
+ from itertools import chain + original = self._selected_obj.index + gp = self.grouper + levels = chain((gp.levels[i][gp.labels[i][b]] + for i in range(len(gp.groupings))), + (original.get_level_values(i)[b] + for i in range(original.nlevels))) + new = MultiIndex.from_arrays(list(levels)) + new.names = gp.names + original.names + return new + + def _try_cast(self, result, obj): + """ + try to cast the result to our obj original type, + we may have roundtripped thru object in the mean-time + + """ + if obj.ndim > 1: + dtype = obj.values.dtype + else: + dtype = obj.dtype + + if not np.isscalar(result): + result = _possibly_downcast_to_dtype(result, dtype) + + return result + + def _cython_agg_general(self, how, numeric_only=True): + output = {} + for name, obj in self._iterate_slices(): + is_numeric = is_numeric_dtype(obj.dtype) + if numeric_only and not is_numeric: + continue + + try: + result, names = self.grouper.aggregate(obj.values, how) + except AssertionError as e: + raise GroupByError(str(e)) + output[name] = self._try_cast(result, obj) + + if len(output) == 0: + raise DataError('No numeric types to aggregate') + + return self._wrap_aggregated_output(output, names) + + def _python_agg_general(self, func, *args, **kwargs): + func = _intercept_function(func) + f = lambda x: func(x, *args, **kwargs) + + # iterate through "columns" ex exclusions to populate output dict + output = {} + for name, obj in self._iterate_slices(): + try: + result, counts = self.grouper.agg_series(obj, f) + output[name] = self._try_cast(result, obj) + except TypeError: + continue + + if len(output) == 0: + return self._python_apply_general(f) + + if self.grouper._filter_empty_groups: + + mask = counts.ravel() > 0 + for name, result in compat.iteritems(output): + + # since we are masking, make sure that we have a float object + values = result + if is_numeric_dtype(values.dtype): + values = com.ensure_float(values) + + output[name] = self._try_cast(values[mask], result) + + return self._wrap_aggregated_output(output) + + def _wrap_applied_output(self, *args, **kwargs): + raise NotImplementedError + + def _concat_objects(self, keys, values, not_indexed_same=False): + from pandas.tools.merge import concat + + if not not_indexed_same: + result = concat(values, axis=self.axis) + ax = self._selected_obj._get_axis(self.axis) + + if isinstance(result, Series): + result = result.reindex(ax) + else: + result = result.reindex_axis(ax, axis=self.axis) + + elif self.group_keys: + + if self.as_index: + + # possible MI return case + group_keys = keys + group_levels = self.grouper.levels + group_names = self.grouper.names + result = concat(values, axis=self.axis, keys=group_keys, + levels=group_levels, names=group_names) + else: + + # GH5610, returns a MI, with the first level being a + # range index + keys = list(range(len(values))) + result = concat(values, axis=self.axis, keys=keys) + else: + result = concat(values, axis=self.axis) + + return result + + def _apply_filter(self, indices, dropna): + if len(indices) == 0: + indices = [] + else: + indices = np.sort(np.concatenate(indices)) + if dropna: + filtered = self._selected_obj.take(indices) + else: + mask = np.empty(len(self._selected_obj.index), dtype=bool) + mask.fill(False) + mask[indices.astype(int)] = True + # mask fails to broadcast when passed to where; broadcast manually. + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + filtered = self._selected_obj.where(mask) # Fill with NaNs. 
+ return filtered + + +@Appender(GroupBy.__doc__) +def groupby(obj, by, **kwds): + if isinstance(obj, Series): + klass = SeriesGroupBy + elif isinstance(obj, DataFrame): + klass = DataFrameGroupBy + else: # pragma: no cover + raise TypeError('invalid type: %s' % type(obj)) + + return klass(obj, by, **kwds) + + +def _get_axes(group): + if isinstance(group, Series): + return [group.index] + else: + return group.axes + + +def _is_indexed_like(obj, axes): + if isinstance(obj, Series): + if len(axes) > 1: + return False + return obj.index.equals(axes[0]) + elif isinstance(obj, DataFrame): + return obj.index.equals(axes[0]) + + return False + + +class BaseGrouper(object): + """ + This is an internal Grouper class, which actually holds the generated groups + """ + + def __init__(self, axis, groupings, sort=True, group_keys=True): + self.axis = axis + self.groupings = groupings + self.sort = sort + self.group_keys = group_keys + self.compressed = True + + @property + def shape(self): + return tuple(ping.ngroups for ping in self.groupings) + + def __iter__(self): + return iter(self.indices) + + @property + def nkeys(self): + return len(self.groupings) + + def get_iterator(self, data, axis=0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + splitter = self._get_splitter(data, axis=axis) + keys = self._get_group_keys() + for key, (i, group) in zip(keys, splitter): + yield key, group + + def _get_splitter(self, data, axis=0): + comp_ids, _, ngroups = self.group_info + return get_splitter(data, comp_ids, ngroups, axis=axis) + + def _get_group_keys(self): + if len(self.groupings) == 1: + return self.levels[0] + else: + comp_ids, _, ngroups = self.group_info + # provide "flattened" iterator for multi-group setting + mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels) + return [mapper.get_key(i) for i in range(ngroups)] + + def apply(self, f, data, axis=0): + mutated = False + splitter = self._get_splitter(data, axis=axis) + group_keys = self._get_group_keys() + + # oh boy + if (f.__name__ not in _plotting_methods and + hasattr(splitter, 'fast_apply') and axis == 0): + try: + values, mutated = splitter.fast_apply(f, group_keys) + return group_keys, values, mutated + except (lib.InvalidApply): + # we detect a mutation of some kind + # so take slow path + pass + except (Exception) as e: + # raise this error to the caller + pass + + result_values = [] + for key, (i, group) in zip(group_keys, splitter): + object.__setattr__(group, 'name', key) + + # group might be modified + group_axes = _get_axes(group) + res = f(group) + if not _is_indexed_like(res, group_axes): + mutated = True + result_values.append(res) + + return group_keys, result_values, mutated + + @cache_readonly + def indices(self): + """ dict {group name -> group indices} """ + if len(self.groupings) == 1: + return self.groupings[0].indices + else: + label_list = [ping.labels for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return _get_indices_dict(label_list, keys) + + @property + def labels(self): + return [ping.labels for ping in self.groupings] + + @property + def levels(self): + return [ping.group_index for ping in self.groupings] + + @property + def names(self): + return [ping.name for ping in self.groupings] + + def size(self): + """ + Compute group sizes + + """ + # TODO: better impl + labels, _, ngroups = self.group_info + bin_counts = algos.value_counts(labels, sort=False) + bin_counts = 
bin_counts.reindex(np.arange(ngroups)) + bin_counts.index = self.result_index + return bin_counts + + @cache_readonly + def _max_groupsize(self): + ''' + Compute size of largest group + + ''' + # For many items in each group this is much faster than + # self.size().max(), in worst case marginally slower + if self.indices: + return max(len(v) for v in self.indices.values()) + else: + return 0 + + @cache_readonly + def groups(self): + """ dict {group name -> group labels} """ + if len(self.groupings) == 1: + return self.groupings[0].groups + else: + to_groupby = lzip(*(ping.grouper for ping in self.groupings)) + to_groupby = Index(to_groupby) + return self.axis.groupby(to_groupby.values) + + @cache_readonly + def group_info(self): + comp_ids, obs_group_ids = self._get_compressed_labels() + + ngroups = len(obs_group_ids) + comp_ids = com._ensure_int64(comp_ids) + return comp_ids, obs_group_ids, ngroups + + + def _get_compressed_labels(self): + all_labels = [ping.labels for ping in self.groupings] + if self._overflow_possible: + tups = lib.fast_zip(all_labels) + labs, uniques = algos.factorize(tups) + + if self.sort: + uniques, labs = _reorder_by_uniques(uniques, labs) + + return labs, uniques + else: + if len(all_labels) > 1: + group_index = get_group_index(all_labels, self.shape) + comp_ids, obs_group_ids = _compress_group_index(group_index) + else: + ping = self.groupings[0] + comp_ids = ping.labels + obs_group_ids = np.arange(len(ping.group_index)) + self.compressed = False + self._filter_empty_groups = False + + return comp_ids, obs_group_ids + + @cache_readonly + def _overflow_possible(self): + return _int64_overflow_possible(self.shape) + + @cache_readonly + def ngroups(self): + return len(self.result_index) + + @cache_readonly + def result_index(self): + recons = self.get_group_levels() + return MultiIndex.from_arrays(recons, names=self.names) + + def get_group_levels(self): + obs_ids = self.group_info[1] + + if not self.compressed and len(self.groupings) == 1: + return [self.groupings[0].group_index] + + if self._overflow_possible: + recons_labels = [np.array(x) for x in zip(*obs_ids)] + else: + recons_labels = decons_group_index(obs_ids, self.shape) + + name_list = [] + for ping, labels in zip(self.groupings, recons_labels): + labels = com._ensure_platform_int(labels) + name_list.append(ping.group_index.take(labels)) + + return name_list + + #------------------------------------------------------------ + # Aggregation functions + + _cython_functions = { + 'add': 'group_add', + 'prod': 'group_prod', + 'min': 'group_min', + 'max': 'group_max', + 'mean': 'group_mean', + 'median': { + 'name': 'group_median' + }, + 'var': 'group_var', + 'first': { + 'name': 'group_nth', + 'f': lambda func, a, b, c, d: func(a, b, c, d, 1) + }, + 'last': 'group_last', + 'count': 'group_count', + } + + _cython_arity = { + 'ohlc': 4, # OHLC + } + + _name_functions = {} + + _filter_empty_groups = True + + def _get_aggregate_function(self, how, values): + + dtype_str = values.dtype.name + + def get_func(fname): + # find the function, or use the object function, or return a + # generic + for dt in [dtype_str, 'object']: + f = getattr(_algos, "%s_%s" % (fname, dtype_str), None) + if f is not None: + return f + return getattr(_algos, fname, None) + + ftype = self._cython_functions[how] + + if isinstance(ftype, dict): + func = afunc = get_func(ftype['name']) + + # a sub-function + f = ftype.get('f') + if f is not None: + + def wrapper(*args, **kwargs): + return f(afunc, *args, **kwargs) + + # need to curry our 
sub-function + func = wrapper + + else: + func = get_func(ftype) + + if func is None: + raise NotImplementedError("function is not implemented for this" + "dtype: [how->%s,dtype->%s]" % + (how, dtype_str)) + return func, dtype_str + + def aggregate(self, values, how, axis=0): + + arity = self._cython_arity.get(how, 1) + + vdim = values.ndim + swapped = False + if vdim == 1: + values = values[:, None] + out_shape = (self.ngroups, arity) + else: + if axis > 0: + swapped = True + values = values.swapaxes(0, axis) + if arity > 1: + raise NotImplementedError + out_shape = (self.ngroups,) + values.shape[1:] + + if is_numeric_dtype(values.dtype): + values = com.ensure_float(values) + is_numeric = True + out_dtype = 'f%d' % values.dtype.itemsize + else: + is_numeric = issubclass(values.dtype.type, (np.datetime64, + np.timedelta64)) + if is_numeric: + out_dtype = 'float64' + values = values.view('int64') + else: + out_dtype = 'object' + values = values.astype(object) + + # will be filled in Cython function + result = np.empty(out_shape, dtype=out_dtype) + + result.fill(np.nan) + counts = np.zeros(self.ngroups, dtype=np.int64) + + result = self._aggregate(result, counts, values, how, is_numeric) + + if self._filter_empty_groups: + if result.ndim == 2: + try: + result = lib.row_bool_subset( + result, (counts > 0).view(np.uint8)) + except ValueError: + result = lib.row_bool_subset_object( + result, (counts > 0).view(np.uint8)) + else: + result = result[counts > 0] + + if vdim == 1 and arity == 1: + result = result[:, 0] + + if how in self._name_functions: + # TODO + names = self._name_functions[how]() + else: + names = None + + if swapped: + result = result.swapaxes(0, axis) + + return result, names + + def _aggregate(self, result, counts, values, how, is_numeric): + agg_func, dtype = self._get_aggregate_function(how, values) + + comp_ids, _, ngroups = self.group_info + if values.ndim > 3: + # punting for now + raise NotImplementedError + elif values.ndim > 2: + for i, chunk in enumerate(values.transpose(2, 0, 1)): + + chunk = chunk.squeeze() + agg_func(result[:, :, i], counts, chunk, comp_ids) + else: + agg_func(result, counts, values, comp_ids) + + return result + + def agg_series(self, obj, func): + try: + return self._aggregate_series_fast(obj, func) + except Exception: + return self._aggregate_series_pure_python(obj, func) + + def _aggregate_series_fast(self, obj, func): + func = _intercept_function(func) + + if obj.index._has_complex_internals: + raise TypeError('Incompatible index for Cython grouper') + + group_index, _, ngroups = self.group_info + + # avoids object / Series creation overhead + dummy = obj._get_values(slice(None, 0)).to_dense() + indexer = _algos.groupsort_indexer(group_index, ngroups)[0] + obj = obj.take(indexer, convert=False) + group_index = com.take_nd(group_index, indexer, allow_fill=False) + grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, + dummy) + result, counts = grouper.get_result() + return result, counts + + def _aggregate_series_pure_python(self, obj, func): + + group_index, _, ngroups = self.group_info + + counts = np.zeros(ngroups, dtype=int) + result = None + + splitter = get_splitter(obj, group_index, ngroups, axis=self.axis) + + for label, group in splitter: + res = func(group) + if result is None: + if (isinstance(res, (Series, np.ndarray)) or + isinstance(res, list)): + raise ValueError('Function does not reduce') + result = np.empty(ngroups, dtype='O') + + counts[label] = group.shape[0] + result[label] = res + + result = 
lib.maybe_convert_objects(result, try_float=0) + return result, counts + + +def generate_bins_generic(values, binner, closed): + """ + Generate bin edge offsets and bin labels for one array using another array + which has bin edge values. Both arrays must be sorted. + + Parameters + ---------- + values : array of values + binner : a comparable array of values representing bins into which to bin + the first array. Note, 'values' end-points must fall within 'binner' + end-points. + closed : which end of bin is closed; left (default), right + + Returns + ------- + bins : array of offsets (into 'values' argument) of bins. + Zero and last edge are excluded in result, so for instance the first + bin is values[0:bin[0]] and the last is values[bin[-1]:] + """ + lenidx = len(values) + lenbin = len(binner) + + if lenidx <= 0 or lenbin <= 0: + raise ValueError("Invalid length for values or for binner") + + # check binner fits data + if values[0] < binner[0]: + raise ValueError("Values falls before first bin") + + if values[lenidx - 1] > binner[lenbin - 1]: + raise ValueError("Values falls after last bin") + + bins = np.empty(lenbin - 1, dtype=np.int64) + + j = 0 # index into values + bc = 0 # bin count + + # linear scan, presume nothing about values/binner except that it fits ok + for i in range(0, lenbin - 1): + r_bin = binner[i + 1] + + # count values in current bin, advance to next bin + while j < lenidx and (values[j] < r_bin or + (closed == 'right' and values[j] == r_bin)): + j += 1 + + bins[bc] = j + bc += 1 + + return bins + +class BinGrouper(BaseGrouper): + + def __init__(self, bins, binlabels, filter_empty=False): + self.bins = com._ensure_int64(bins) + self.binlabels = _ensure_index(binlabels) + self._filter_empty_groups = filter_empty + + @cache_readonly + def groups(self): + """ dict {group name -> group labels} """ + + # this is mainly for compat + # GH 3881 + result = {} + for key, value in zip(self.binlabels, self.bins): + if key is not tslib.NaT: + result[key] = value + return result + + @property + def nkeys(self): + return 1 + + def get_iterator(self, data, axis=0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + if isinstance(data, NDFrame): + slicer = lambda start,edge: data._slice(slice(start,edge),axis=axis) + length = len(data.axes[axis]) + else: + slicer = lambda start,edge: data[slice(start,edge)] + length = len(data) + + start = 0 + for edge, label in zip(self.bins, self.binlabels): + if label is not tslib.NaT: + yield label, slicer(start,edge) + start = edge + + if start < length: + yield self.binlabels[-1], slicer(start,None) + + def apply(self, f, data, axis=0): + result_keys = [] + result_values = [] + mutated = False + for key, group in self.get_iterator(data, axis=axis): + object.__setattr__(group, 'name', key) + + # group might be modified + group_axes = _get_axes(group) + res = f(group) + + if not _is_indexed_like(res, group_axes): + mutated = True + + result_keys.append(key) + result_values.append(res) + + return result_keys, result_values, mutated + + @cache_readonly + def indices(self): + indices = collections.defaultdict(list) + + i = 0 + for label, bin in zip(self.binlabels, self.bins): + if i < bin: + if label is not tslib.NaT: + indices[label] = list(range(i, bin)) + i = bin + return indices + + @cache_readonly + def ngroups(self): + return len(self.binlabels) + + @cache_readonly + def result_index(self): + mask = self.binlabels.asi8 == tslib.iNaT + return self.binlabels[~mask] + 
+ @property + def levels(self): + return [self.binlabels] + + @property + def names(self): + return [self.binlabels.name] + + def size(self): + """ + Compute group sizes + + """ + base = Series(np.zeros(len(self.result_index), dtype=np.int64), + index=self.result_index) + indices = self.indices + for k, v in compat.iteritems(indices): + indices[k] = len(v) + bin_counts = Series(indices, dtype=np.int64) + result = base.add(bin_counts, fill_value=0) + # addition with fill_value changes dtype to float64 + result = result.astype(np.int64) + return result + + #---------------------------------------------------------------------- + # cython aggregation + + _cython_functions = { + 'add': 'group_add_bin', + 'prod': 'group_prod_bin', + 'mean': 'group_mean_bin', + 'min': 'group_min_bin', + 'max': 'group_max_bin', + 'var': 'group_var_bin', + 'ohlc': 'group_ohlc', + 'first': { + 'name': 'group_nth_bin', + 'f': lambda func, a, b, c, d: func(a, b, c, d, 1) + }, + 'last': 'group_last_bin', + 'count': 'group_count_bin', + } + + _name_functions = { + 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] + } + + _filter_empty_groups = True + + def _aggregate(self, result, counts, values, how, is_numeric=True): + + agg_func, dtype = self._get_aggregate_function(how, values) + + if values.ndim > 3: + # punting for now + raise NotImplementedError + elif values.ndim > 2: + for i, chunk in enumerate(values.transpose(2, 0, 1)): + agg_func(result[:, :, i], counts, chunk, self.bins) + else: + agg_func(result, counts, values, self.bins) + + return result + + def agg_series(self, obj, func): + dummy = obj[:0] + grouper = lib.SeriesBinGrouper(obj, func, self.bins, dummy) + return grouper.get_result() + + +class Grouping(object): + + """ + Holds the grouping information for a single key + + Parameters + ---------- + index : Index + grouper : + obj : + name : + level : + + Returns + ------- + **Attributes**: + * indices : dict of {group -> index_list} + * labels : ndarray, group labels + * ids : mapping of label -> group + * counts : array of group counts + * group_index : unique groups + * groups : dict of {group -> label_list} + """ + + def __init__(self, index, grouper=None, obj=None, name=None, level=None, + sort=True): + + self.name = name + self.level = level + self.grouper = _convert_grouper(index, grouper) + self.index = index + self.sort = sort + self.obj = obj + + # right place for this? 
+ if isinstance(grouper, (Series, Index)) and name is None: + self.name = grouper.name + + if isinstance(grouper, MultiIndex): + self.grouper = grouper.values + + # pre-computed + self._was_factor = False + self._should_compress = True + + # we have a single grouper which may be a myriad of things, some of which are + # dependent on the passing in level + # + + if level is not None: + if not isinstance(level, int): + if level not in index.names: + raise AssertionError('Level %s not in index' % str(level)) + level = index.names.index(level) + + inds = index.labels[level] + level_index = index.levels[level] + + if self.name is None: + self.name = index.names[level] + + # XXX complete hack + + if grouper is not None: + level_values = index.levels[level].take(inds) + self.grouper = level_values.map(self.grouper) + else: + self._was_factor = True + + # all levels may not be observed + labels, uniques = algos.factorize(inds, sort=True) + + if len(uniques) > 0 and uniques[0] == -1: + # handle NAs + mask = inds != -1 + ok_labels, uniques = algos.factorize(inds[mask], sort=True) + + labels = np.empty(len(inds), dtype=inds.dtype) + labels[mask] = ok_labels + labels[~mask] = -1 + + if len(uniques) < len(level_index): + level_index = level_index.take(uniques) + + self._labels = labels + self._group_index = level_index + self.grouper = level_index.take(labels) + else: + if isinstance(self.grouper, (list, tuple)): + self.grouper = com._asarray_tuplesafe(self.grouper) + + # a passed Categorical + elif isinstance(self.grouper, Categorical): + + factor = self.grouper + self._was_factor = True + + # Is there any way to avoid this? + self.grouper = np.asarray(factor) + + self._labels = factor.labels + self._group_index = factor.levels + if self.name is None: + self.name = factor.name + + # a passed Grouper like + elif isinstance(self.grouper, Grouper): + + # get the new grouper + grouper = self.grouper._get_binner_for_grouping(self.obj) + self.obj = self.grouper.obj + self.grouper = grouper + if self.name is None: + self.name = grouper.name + + # no level passed + if not isinstance(self.grouper, (Series, np.ndarray)): + self.grouper = self.index.map(self.grouper) + if not (hasattr(self.grouper, "__len__") and + len(self.grouper) == len(self.index)): + errmsg = ('Grouper result violates len(labels) == ' + 'len(data)\nresult: %s' % + com.pprint_thing(self.grouper)) + self.grouper = None # Try for sanity + raise AssertionError(errmsg) + + # if we have a date/time-like grouper, make sure that we have Timestamps like + if getattr(self.grouper,'dtype',None) is not None: + if is_datetime64_dtype(self.grouper): + from pandas import to_datetime + self.grouper = to_datetime(self.grouper) + elif is_timedelta64_dtype(self.grouper): + from pandas import to_timedelta + self.grouper = to_timedelta(self.grouper) + + def __repr__(self): + return 'Grouping(%s)' % self.name + + def __iter__(self): + return iter(self.indices) + + _labels = None + _group_index = None + + @property + def ngroups(self): + return len(self.group_index) + + @cache_readonly + def indices(self): + return _groupby_indices(self.grouper) + + @property + def labels(self): + if self._labels is None: + self._make_labels() + return self._labels + + @property + def group_index(self): + if self._group_index is None: + self._make_labels() + return self._group_index + + def _make_labels(self): + if self._was_factor: # pragma: no cover + raise Exception('Should not call this method grouping by level') + else: + labels, uniques = algos.factorize(self.grouper, 
sort=self.sort) + uniques = Index(uniques, name=self.name) + self._labels = labels + self._group_index = uniques + + _groups = None + + @property + def groups(self): + if self._groups is None: + self._groups = self.index.groupby(self.grouper) + return self._groups + +def _get_grouper(obj, key=None, axis=0, level=None, sort=True): + """ + create and return a BaseGrouper, which is an internal + mapping of how to create the grouper indexers. + This may be composed of multiple Grouping objects, indicating + multiple groupers + + Groupers are ultimately index mappings. They can originate as: + index mappings, keys to columns, functions, or Groupers + + Groupers enable local references to axis, level, and sort, while + the passed in axis, level, and sort are 'global'. + + This routine tries to figure out what the passed-in references + are and then creates a Grouping for each one, combined into + a BaseGrouper. + + """ + + group_axis = obj._get_axis(axis) + + # validate that the passed level is compatible with the passed + # axis of the object + if level is not None: + if not isinstance(group_axis, MultiIndex): + if isinstance(level, compat.string_types): + if obj.index.name != level: + raise ValueError('level name %s is not the name of the ' + 'index' % level) + elif level > 0: + raise ValueError('level > 0 only valid with MultiIndex') + + level = None + key = group_axis + + # a passed in Grouper, directly convert + if isinstance(key, Grouper): + binner, grouper, obj = key._get_grouper(obj) + if key.key is None: + return grouper, [], obj + else: + return grouper, set([key.key]), obj + + # already have a BaseGrouper, just return it + elif isinstance(key, BaseGrouper): + return key, [], obj + + if not isinstance(key, (tuple, list)): + keys = [key] + else: + keys = key + + # what are we after, exactly?
+ match_axis_length = len(keys) == len(group_axis) + any_callable = any(callable(g) or isinstance(g, dict) for g in keys) + any_arraylike = any(isinstance(g, (list, tuple, Series, np.ndarray)) + for g in keys) + + try: + if isinstance(obj, DataFrame): + all_in_columns = all(g in obj.columns for g in keys) + else: + all_in_columns = False + except Exception: + all_in_columns = False + + if (not any_callable and not all_in_columns + and not any_arraylike and match_axis_length + and level is None): + keys = [com._asarray_tuplesafe(keys)] + + if isinstance(level, (tuple, list)): + if key is None: + keys = [None] * len(level) + levels = level + else: + levels = [level] * len(keys) + + groupings = [] + exclusions = [] + for i, (gpr, level) in enumerate(zip(keys, levels)): + name = None + try: + obj._data.items.get_loc(gpr) + in_axis = True + except Exception: + in_axis = False + + if _is_label_like(gpr) or in_axis: + exclusions.append(gpr) + name = gpr + gpr = obj[gpr] + + if isinstance(gpr, Categorical) and len(gpr) != len(obj): + errmsg = "Categorical grouper must have len(grouper) == len(data)" + raise AssertionError(errmsg) + + ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort) + groupings.append(ping) + + if len(groupings) == 0: + raise ValueError('No group keys passed!') + + # create the internals grouper + grouper = BaseGrouper(group_axis, groupings, sort=sort) + + return grouper, exclusions, obj + + +def _is_label_like(val): + return isinstance(val, compat.string_types) or np.isscalar(val) + + +def _convert_grouper(axis, grouper): + if isinstance(grouper, dict): + return grouper.get + elif isinstance(grouper, Series): + if grouper.index.equals(axis): + return grouper.values + else: + return grouper.reindex(axis).values + elif isinstance(grouper, (list, Series, np.ndarray)): + if len(grouper) != len(axis): + raise AssertionError('Grouper and axis must be same length') + return grouper + else: + return grouper + + +class SeriesGroupBy(GroupBy): + _apply_whitelist = _series_apply_whitelist + + def aggregate(self, func_or_funcs, *args, **kwargs): + """ + Apply aggregation function or functions to groups, yielding most likely + Series but in some cases DataFrame depending on the output of the + aggregation function + + Parameters + ---------- + func_or_funcs : function or list / dict of functions + List/dict of functions will produce DataFrame with column names + determined by the function names themselves (list) or the keys in + the dict + + Notes + ----- + agg is an alias for aggregate. Use it. + + Examples + -------- + >>> series + bar 1.0 + baz 2.0 + qot 3.0 + qux 4.0 + + >>> mapper = lambda x: x[0] # first letter + >>> grouped = series.groupby(mapper) + + >>> grouped.aggregate(np.sum) + b 3.0 + q 7.0 + + >>> grouped.aggregate([np.sum, np.mean, np.std]) + mean std sum + b 1.5 0.5 3 + q 3.5 0.5 7 + + >>> grouped.agg({'result' : lambda x: x.mean() / x.std(), + ... 
'total' : np.sum}) + result total + b 2.121 3 + q 4.95 7 + + See also + -------- + apply, transform + + Returns + ------- + Series or DataFrame + """ + if isinstance(func_or_funcs, compat.string_types): + return getattr(self, func_or_funcs)(*args, **kwargs) + + if hasattr(func_or_funcs, '__iter__'): + ret = self._aggregate_multiple_funcs(func_or_funcs) + else: + cyfunc = _intercept_cython(func_or_funcs) + if cyfunc and not args and not kwargs: + return getattr(self, cyfunc)() + + if self.grouper.nkeys > 1: + return self._python_agg_general(func_or_funcs, *args, **kwargs) + + try: + return self._python_agg_general(func_or_funcs, *args, **kwargs) + except Exception: + result = self._aggregate_named(func_or_funcs, *args, **kwargs) + + index = Index(sorted(result), name=self.grouper.names[0]) + ret = Series(result, index=index) + + if not self.as_index: # pragma: no cover + print('Warning, ignoring as_index=True') + + return ret + + def _aggregate_multiple_funcs(self, arg): + if isinstance(arg, dict): + columns = list(arg.keys()) + arg = list(arg.items()) + elif any(isinstance(x, (tuple, list)) for x in arg): + arg = [(x, x) if not isinstance(x, (tuple, list)) else x + for x in arg] + + # indicated column order + columns = lzip(*arg)[0] + else: + # list of functions / function names + columns = [] + for f in arg: + if isinstance(f, compat.string_types): + columns.append(f) + else: + columns.append(f.__name__) + arg = lzip(columns, arg) + + results = {} + + for name, func in arg: + if name in results: + raise SpecificationError('Function names must be unique, ' + 'found multiple named %s' % name) + + results[name] = self.aggregate(func) + + return DataFrame(results, columns=columns) + + def _wrap_aggregated_output(self, output, names=None): + # sort of a kludge + output = output[self.name] + index = self.grouper.result_index + + if names is not None: + return DataFrame(output, index=index, columns=names) + else: + name = self.name + if name is None: + name = self._selected_obj.name + return Series(output, index=index, name=name) + + def _wrap_applied_output(self, keys, values, not_indexed_same=False): + if len(keys) == 0: + # GH #6265 + return Series([], name=self.name) + + def _get_index(): + if self.grouper.nkeys > 1: + index = MultiIndex.from_tuples(keys, names=self.grouper.names) + else: + index = Index(keys, name=self.grouper.names[0]) + return index + + if isinstance(values[0], dict): + # GH #823 + index = _get_index() + return DataFrame(values, index=index).stack() + + if isinstance(values[0], (Series, dict)): + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + elif isinstance(values[0], DataFrame): + # possible that Series -> DataFrame by applied function + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + else: + # GH #6265 + return Series(values, index=_get_index(), name=self.name) + + def _aggregate_named(self, func, *args, **kwargs): + result = {} + + for name, group in self: + group.name = name + output = func(group, *args, **kwargs) + if isinstance(output, (Series, np.ndarray)): + raise Exception('Must produce aggregated value') + result[name] = self._try_cast(output, group) + + return result + + def transform(self, func, *args, **kwargs): + """ + Call function producing a like-indexed Series on each group and return + a Series with the transformed values + + Parameters + ---------- + func : function + To apply to each group. 
Should return a Series with the same index + + Examples + -------- + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + + Returns + ------- + transformed : Series + """ + dtype = self._selected_obj.dtype + + if isinstance(func, compat.string_types): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + result = self._selected_obj.values.copy() + for i, (name, group) in enumerate(self): + + object.__setattr__(group, 'name', name) + res = wrapper(group) + + if hasattr(res, 'values'): + res = res.values + + # may need to astype + try: + common_type = np.common_type(np.array(res), result) + if common_type != result.dtype: + result = result.astype(common_type) + except: + pass + + indexer = self._get_index(name) + result[indexer] = res + + result = _possibly_downcast_to_dtype(result, dtype) + return self._selected_obj.__class__(result, + index=self._selected_obj.index, + name=self._selected_obj.name) + + def filter(self, func, dropna=True, *args, **kwargs): + """ + Return a copy of a Series excluding elements from groups that + do not satisfy the boolean criterion specified by func. + + Parameters + ---------- + func : function + To apply to each group. Should return True or False. + dropna : Drop groups that do not pass the filter. True by default; + if False, groups that evaluate False are filled with NaNs. + + Example + ------- + >>> grouped.filter(lambda x: x.mean() > 0) + + Returns + ------- + filtered : Series + """ + if isinstance(func, compat.string_types): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + # Interpret np.nan as False. + def true_and_notnull(x, *args, **kwargs): + b = wrapper(x, *args, **kwargs) + return b and notnull(b) + + try: + indices = [self._get_index(name) if true_and_notnull(group) else [] + for name, group in self] + except ValueError: + raise TypeError("the filter must return a boolean result") + except TypeError: + raise TypeError("the filter must return a boolean result") + + filtered = self._apply_filter(indices, dropna) + return filtered + + def _apply_to_column_groupbys(self, func): + """ return a pass thru """ + return func(self) + +class NDFrameGroupBy(GroupBy): + + def _iterate_slices(self): + if self.axis == 0: + # kludge + if self._selection is None: + slice_axis = self.obj.columns + else: + slice_axis = self._selection_list + slicer = lambda x: self.obj[x] + else: + slice_axis = self.obj.index + slicer = self.obj.xs + + for val in slice_axis: + if val in self.exclusions: + continue + yield val, slicer(val) + + def _cython_agg_general(self, how, numeric_only=True): + new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) + return self._wrap_agged_blocks(new_items, new_blocks) + + def _wrap_agged_blocks(self, items, blocks): + obj = self._obj_with_exclusions + + new_axes = list(obj._data.axes) + + # more kludge + if self.axis == 0: + new_axes[0], new_axes[1] = new_axes[1], self.grouper.result_index + else: + new_axes[self.axis] = self.grouper.result_index + + # Make sure block manager integrity check passes. 
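+        # Descriptive note (added, based on the check below): BlockManager
+        # validates that axes[0] matches the items referenced by the blocks,
+        # so verify the aggregated items line up and substitute them in
+        # explicitly before constructing the manager.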
+ assert new_axes[0].equals(items) + new_axes[0] = items + + mgr = BlockManager(blocks, new_axes) + + new_obj = type(obj)(mgr) + + return self._post_process_cython_aggregate(new_obj) + + _block_agg_axis = 0 + + def _cython_agg_blocks(self, how, numeric_only=True): + data, agg_axis = self._get_data_to_aggregate() + + new_blocks = [] + + if numeric_only: + data = data.get_numeric_data(copy=False) + + for block in data.blocks: + + values = block._try_operate(block.values) + + if block.is_numeric: + values = com.ensure_float(values) + + result, _ = self.grouper.aggregate(values, how, axis=agg_axis) + + # see if we can cast the block back to the original dtype + result = block._try_coerce_and_cast_result(result) + + newb = make_block(result, placement=block.mgr_locs) + new_blocks.append(newb) + + if len(new_blocks) == 0: + raise DataError('No numeric types to aggregate') + + return data.items, new_blocks + + def _get_data_to_aggregate(self): + obj = self._obj_with_exclusions + if self.axis == 0: + return obj.swapaxes(0, 1)._data, 1 + else: + return obj._data, self.axis + + def _post_process_cython_aggregate(self, obj): + # undoing kludge from below + if self.axis == 0: + obj = obj.swapaxes(0, 1) + return obj + + @cache_readonly + def _obj_with_exclusions(self): + if self._selection is not None: + return self.obj.reindex(columns=self._selection_list) + + if len(self.exclusions) > 0: + return self.obj.drop(self.exclusions, axis=1) + else: + return self.obj + + @Appender(_agg_doc) + def aggregate(self, arg, *args, **kwargs): + if isinstance(arg, compat.string_types): + return getattr(self, arg)(*args, **kwargs) + + result = OrderedDict() + if isinstance(arg, dict): + if self.axis != 0: # pragma: no cover + raise ValueError('Can only pass dict with axis=0') + + obj = self._selected_obj + + if any(isinstance(x, (list, tuple, dict)) for x in arg.values()): + new_arg = OrderedDict() + for k, v in compat.iteritems(arg): + if not isinstance(v, (tuple, list, dict)): + new_arg[k] = [v] + else: + new_arg[k] = v + arg = new_arg + + keys = [] + if self._selection is not None: + subset = obj + if isinstance(subset, DataFrame): + raise NotImplementedError + + for fname, agg_how in compat.iteritems(arg): + colg = SeriesGroupBy(subset, selection=self._selection, + grouper=self.grouper) + result[fname] = colg.aggregate(agg_how) + keys.append(fname) + else: + for col, agg_how in compat.iteritems(arg): + colg = SeriesGroupBy(obj[col], selection=col, + grouper=self.grouper) + result[col] = colg.aggregate(agg_how) + keys.append(col) + + if isinstance(list(result.values())[0], DataFrame): + from pandas.tools.merge import concat + result = concat([result[k] for k in keys], keys=keys, axis=1) + else: + result = DataFrame(result) + elif isinstance(arg, list): + return self._aggregate_multiple_funcs(arg) + else: + cyfunc = _intercept_cython(arg) + if cyfunc and not args and not kwargs: + return getattr(self, cyfunc)() + + if self.grouper.nkeys > 1: + return self._python_agg_general(arg, *args, **kwargs) + else: + + # try to treat as if we are passing a list + try: + assert not args and not kwargs + result = self._aggregate_multiple_funcs([arg]) + result.columns = Index(result.columns.levels[0], + name=self._selected_obj.columns.name) + except: + result = self._aggregate_generic(arg, *args, **kwargs) + + if not self.as_index: + if isinstance(result.index, MultiIndex): + zipped = zip(result.index.levels, result.index.labels, + result.index.names) + for i, (lev, lab, name) in enumerate(zipped): + result.insert(i, name, + 
com.take_nd(lev.values, lab, + allow_fill=False)) + result = result.consolidate() + else: + values = result.index.values + name = self.grouper.groupings[0].name + result.insert(0, name, values) + result.index = np.arange(len(result)) + + return result.convert_objects() + + def _aggregate_multiple_funcs(self, arg): + from pandas.tools.merge import concat + + if self.axis != 0: + raise NotImplementedError + + obj = self._obj_with_exclusions + + results = [] + keys = [] + for col in obj: + try: + colg = SeriesGroupBy(obj[col], selection=col, + grouper=self.grouper) + results.append(colg.aggregate(arg)) + keys.append(col) + except (TypeError, DataError): + pass + except SpecificationError: + raise + result = concat(results, keys=keys, axis=1) + + return result + + def _aggregate_generic(self, func, *args, **kwargs): + if self.grouper.nkeys != 1: + raise AssertionError('Number of keys must be 1') + + axis = self.axis + obj = self._obj_with_exclusions + + result = {} + if axis != obj._info_axis_number: + try: + for name, data in self: + # for name in self.indices: + # data = self.get_group(name, obj=obj) + result[name] = self._try_cast(func(data, *args, **kwargs), + data) + except Exception: + return self._aggregate_item_by_item(func, *args, **kwargs) + else: + for name in self.indices: + try: + data = self.get_group(name, obj=obj) + result[name] = self._try_cast(func(data, *args, **kwargs), + data) + except Exception: + wrapper = lambda x: func(x, *args, **kwargs) + result[name] = data.apply(wrapper, axis=axis) + + return self._wrap_generic_output(result, obj) + + def _wrap_aggregated_output(self, output, names=None): + raise NotImplementedError + + def _aggregate_item_by_item(self, func, *args, **kwargs): + # only for axis==0 + + obj = self._obj_with_exclusions + result = {} + cannot_agg = [] + errors=None + for item in obj: + try: + data = obj[item] + colg = SeriesGroupBy(data, selection=item, + grouper=self.grouper) + result[item] = self._try_cast( + colg.aggregate(func, *args, **kwargs), data) + except ValueError: + cannot_agg.append(item) + continue + except TypeError as e: + cannot_agg.append(item) + errors=e + continue + + result_columns = obj.columns + if cannot_agg: + result_columns = result_columns.drop(cannot_agg) + + # GH6337 + if not len(result_columns) and errors is not None: + raise errors + + return DataFrame(result, columns=result_columns) + + def _decide_output_index(self, output, labels): + if len(output) == len(labels): + output_keys = labels + else: + output_keys = sorted(output) + try: + output_keys.sort() + except Exception: # pragma: no cover + pass + + if isinstance(labels, MultiIndex): + output_keys = MultiIndex.from_tuples(output_keys, + names=labels.names) + + return output_keys + + def _wrap_applied_output(self, keys, values, not_indexed_same=False): + from pandas.core.index import _all_indexes_same + + if len(keys) == 0: + # XXX + return DataFrame({}) + + key_names = self.grouper.names + + if isinstance(values[0], DataFrame): + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + elif hasattr(self.grouper, 'groupings'): + if len(self.grouper.groupings) > 1: + key_index = MultiIndex.from_tuples(keys, names=key_names) + + else: + ping = self.grouper.groupings[0] + if len(keys) == ping.ngroups: + key_index = ping.group_index + key_index.name = key_names[0] + + key_lookup = Index(keys) + indexer = key_lookup.get_indexer(key_index) + + # reorder the values + values = [values[i] for i in indexer] + else: + + key_index = Index(keys, 
name=key_names[0]) + + # don't use the key indexer + if not self.as_index: + key_index = None + + # make Nones an empty object + if com._count_not_none(*values) != len(values): + v = next(v for v in values if v is not None) + if v is None: + return DataFrame() + elif isinstance(v, NDFrame): + values = [ + x if x is not None else + v._constructor(**v._construct_axes_dict()) + for x in values + ] + + v = values[0] + + if isinstance(v, (np.ndarray, Series)): + if isinstance(v, Series): + applied_index = self._selected_obj._get_axis(self.axis) + all_indexed_same = _all_indexes_same([ + x.index for x in values + ]) + singular_series = (len(values) == 1 and + applied_index.nlevels == 1) + + # GH3596 + # provide a reduction (Frame -> Series) if groups are + # unique + if self.squeeze: + + # assign the name to this series + if singular_series: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to + # produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a + # single values + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) + + # still a series + # path added as of GH 5545 + elif all_indexed_same: + from pandas.tools.merge import concat + return concat(values) + + if not all_indexed_same: + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) + + try: + if self.axis == 0: + # GH6124 if the list of Series have a consistent name, + # then propagate that name to the result. + index = v.index.copy() + if index.name is None: + # Only propagate the series name to the result + # if all series have a consistent name. If the + # series do not have a consistent name, do + # nothing. + names = set(v.name for v in values) + if len(names) == 1: + index.name = list(names)[0] + + # normally use vstack as its faster than concat + # and if we have mi-columns + if not _np_version_under1p7 or isinstance(v.index,MultiIndex) or key_index is None: + stacked_values = np.vstack([np.asarray(x) for x in values]) + result = DataFrame(stacked_values,index=key_index,columns=index) + else: + # GH5788 instead of stacking; concat gets the dtypes correct + from pandas.tools.merge import concat + result = concat(values,keys=key_index,names=key_index.names, + axis=self.axis).unstack() + result.columns = index + else: + stacked_values = np.vstack([np.asarray(x) for x in values]) + result = DataFrame(stacked_values.T,index=v.index,columns=key_index) + + except (ValueError, AttributeError): + # GH1738: values is list of arrays of unequal lengths fall + # through to the outer else caluse + return Series(values, index=key_index) + + # if we have date/time like in the original, then coerce dates + # as we are stacking can easily have object dtypes here + if (self._selected_obj.ndim == 2 + and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): + cd = 'coerce' + else: + cd = True + return result.convert_objects(convert_dates=cd) + + else: + # only coerce dates if we find at least 1 datetime + cd = 'coerce' if any([ isinstance(v,Timestamp) for v in values ]) else False + return Series(values, index=key_index).convert_objects(convert_dates=cd) + + else: + # Handle cases like BinGrouper + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + + def _transform_general(self, func, *args, **kwargs): + from pandas.tools.merge import concat + + applied = [] + + obj = self._obj_with_exclusions + gen = self.grouper.get_iterator(obj, axis=self.axis) + 
fast_path, slow_path = self._define_paths(func, *args, **kwargs) + + path = None + for name, group in gen: + object.__setattr__(group, 'name', name) + + if path is None: + # Try slow path and fast path. + try: + path, res = self._choose_path(fast_path, slow_path, group) + except TypeError: + return self._transform_item_by_item(obj, fast_path) + except Exception: # pragma: no cover + res = fast_path(group) + path = fast_path + else: + res = path(group) + + # broadcasting + if isinstance(res, Series): + if res.index.is_(obj.index): + group.T.values[:] = res + else: + group.values[:] = res + + applied.append(group) + else: + applied.append(res) + + concat_index = obj.columns if self.axis == 0 else obj.index + concatenated = concat(applied, join_axes=[concat_index], + axis=self.axis, verify_integrity=False) + concatenated.sort_index(inplace=True) + return concatenated + + def transform(self, func, *args, **kwargs): + """ + Call function producing a like-indexed DataFrame on each group and + return a DataFrame having the same indexes as the original object + filled with the transformed values + + Parameters + ---------- + f : function + Function to apply to each subframe + + Notes + ----- + Each subframe is endowed the attribute 'name' in case you need to know + which group you are working on. + + Examples + -------- + >>> grouped = df.groupby(lambda x: mapping[x]) + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + """ + + # try to do a fast transform via merge if possible + try: + obj = self._obj_with_exclusions + if isinstance(func, compat.string_types): + result = getattr(self, func)(*args, **kwargs) + else: + cyfunc = _intercept_cython(func) + if cyfunc and not args and not kwargs: + result = getattr(self, cyfunc)() + else: + return self._transform_general(func, *args, **kwargs) + except: + return self._transform_general(func, *args, **kwargs) + + # a reduction transform + if not isinstance(result, DataFrame): + return self._transform_general(func, *args, **kwargs) + + # nuiscance columns + if not result.columns.equals(obj.columns): + return self._transform_general(func, *args, **kwargs) + + # a grouped that doesn't preserve the index, remap index based on the grouper + # and broadcast it + if not isinstance(obj.index,MultiIndex) and type(result.index) != type(obj.index): + results = obj.values.copy() + for (name, group), (i, row) in zip(self, result.iterrows()): + indexer = self._get_index(name) + results[indexer] = np.tile(row.values,len(indexer)).reshape(len(indexer),-1) + return DataFrame(results,columns=result.columns,index=obj.index).convert_objects() + + # we can merge the result in + # GH 7383 + names = result.columns + result = obj.merge(result, how='outer', left_index=True, right_index=True).ix[:,-result.shape[1]:] + result.columns = names + return result + + def _define_paths(self, func, *args, **kwargs): + if isinstance(func, compat.string_types): + fast_path = lambda group: getattr(group, func)(*args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis) + else: + fast_path = lambda group: func(group, *args, **kwargs) + slow_path = lambda group: group.apply( + lambda x: func(x, *args, **kwargs), axis=self.axis) + return fast_path, slow_path + + def _choose_path(self, fast_path, slow_path, group): + path = slow_path + res = slow_path(group) + + # if we make it here, test if we can use the fast path + try: + res_fast = fast_path(group) + + # compare that we get the same results + if res.shape == 
res_fast.shape: + res_r = res.values.ravel() + res_fast_r = res_fast.values.ravel() + mask = notnull(res_r) + if (res_r[mask] == res_fast_r[mask]).all(): + path = fast_path + + except: + pass + return path, res + + def _transform_item_by_item(self, obj, wrapper): + # iterate through columns + output = {} + inds = [] + for i, col in enumerate(obj): + try: + output[col] = self[col].transform(wrapper) + inds.append(i) + except Exception: + pass + + if len(output) == 0: # pragma: no cover + raise TypeError('Transform function invalid for data types') + + columns = obj.columns + if len(output) < len(obj.columns): + columns = columns.take(inds) + + return DataFrame(output, index=obj.index, columns=columns) + + def filter(self, func, dropna=True, *args, **kwargs): + """ + Return a copy of a DataFrame excluding elements from groups that + do not satisfy the boolean criterion specified by func. + + Parameters + ---------- + f : function + Function to apply to each subframe. Should return True or False. + dropna : Drop groups that do not pass the filter. True by default; + if False, groups that evaluate False are filled with NaNs. + + Notes + ----- + Each subframe is endowed the attribute 'name' in case you need to know + which group you are working on. + + Example + -------- + >>> grouped = df.groupby(lambda x: mapping[x]) + >>> grouped.filter(lambda x: x['A'].sum() + x['B'].sum() > 0) + """ + from pandas.tools.merge import concat + + indices = [] + + obj = self._selected_obj + gen = self.grouper.get_iterator(obj, axis=self.axis) + + fast_path, slow_path = self._define_paths(func, *args, **kwargs) + + path = None + for name, group in gen: + object.__setattr__(group, 'name', name) + + if path is None: + # Try slow path and fast path. + try: + path, res = self._choose_path(fast_path, slow_path, group) + except Exception: # pragma: no cover + res = fast_path(group) + path = fast_path + else: + res = path(group) + + def add_indices(): + indices.append(self._get_index(name)) + + # interpret the result of the filter + if isinstance(res, (bool, np.bool_)): + if res: + add_indices() + else: + if getattr(res, 'ndim', None) == 1: + val = res.ravel()[0] + if val and notnull(val): + add_indices() + else: + + # in theory you could do .all() on the boolean result ? 
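+                    # Descriptive note (added): only a scalar boolean (or a
+                    # 1-d result, whose first raveled element is used above)
+                    # selects a group; any other shape ends up here and is
+                    # rejected rather than being reduced implicitly.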
+ raise TypeError("the filter must return a boolean result") + + filtered = self._apply_filter(indices, dropna) + return filtered + + +class DataFrameGroupBy(NDFrameGroupBy): + _apply_whitelist = _dataframe_apply_whitelist + + _block_agg_axis = 1 + + def __getitem__(self, key): + if self._selection is not None: + raise Exception('Column(s) %s already selected' % self._selection) + + if isinstance(key, (list, tuple, Series, np.ndarray)): + if len(self.obj.columns.intersection(key)) != len(key): + bad_keys = list(set(key).difference(self.obj.columns)) + raise KeyError("Columns not found: %s" + % str(bad_keys)[1:-1]) + return DataFrameGroupBy(self.obj, self.grouper, selection=key, + grouper=self.grouper, + exclusions=self.exclusions, + as_index=self.as_index) + + elif not self.as_index: + if key not in self.obj.columns: + raise KeyError("Column not found: %s" % key) + return DataFrameGroupBy(self.obj, self.grouper, selection=key, + grouper=self.grouper, + exclusions=self.exclusions, + as_index=self.as_index) + + else: + if key not in self.obj: + raise KeyError("Column not found: %s" % key) + # kind of a kludge + return SeriesGroupBy(self.obj[key], selection=key, + grouper=self.grouper, + exclusions=self.exclusions) + + def _wrap_generic_output(self, result, obj): + result_index = self.grouper.levels[0] + + if result: + if self.axis == 0: + result = DataFrame(result, index=obj.columns, + columns=result_index).T + else: + result = DataFrame(result, index=obj.index, + columns=result_index) + else: + result = DataFrame(result) + + return result + + def _get_data_to_aggregate(self): + obj = self._obj_with_exclusions + if self.axis == 1: + return obj.T._data, 1 + else: + return obj._data, 1 + + def _wrap_aggregated_output(self, output, names=None): + agg_axis = 0 if self.axis == 1 else 1 + agg_labels = self._obj_with_exclusions._get_axis(agg_axis) + + output_keys = self._decide_output_index(output, agg_labels) + + if not self.as_index: + result = DataFrame(output, columns=output_keys) + group_levels = self.grouper.get_group_levels() + zipped = zip(self.grouper.names, group_levels) + + for i, (name, labels) in enumerate(zipped): + result.insert(i, name, labels) + result = result.consolidate() + else: + index = self.grouper.result_index + result = DataFrame(output, index=index, columns=output_keys) + + if self.axis == 1: + result = result.T + + return result.convert_objects() + + def _wrap_agged_blocks(self, items, blocks): + if not self.as_index: + index = np.arange(blocks[0].values.shape[1]) + mgr = BlockManager(blocks, [items, index]) + result = DataFrame(mgr) + + group_levels = self.grouper.get_group_levels() + zipped = zip(self.grouper.names, group_levels) + + for i, (name, labels) in enumerate(zipped): + result.insert(i, name, labels) + result = result.consolidate() + else: + index = self.grouper.result_index + mgr = BlockManager(blocks, [items, index]) + result = DataFrame(mgr) + + if self.axis == 1: + result = result.T + + return result.convert_objects() + + def _iterate_column_groupbys(self): + for i, colname in enumerate(self._selected_obj.columns): + yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], + selection=colname, + grouper=self.grouper, + exclusions=self.exclusions) + + def _apply_to_column_groupbys(self, func): + from pandas.tools.merge import concat + return concat( + (func(col_groupby) for _, col_groupby + in self._iterate_column_groupbys()), + keys=self._selected_obj.columns, axis=1) + +from pandas.tools.plotting import boxplot_frame_groupby +DataFrameGroupBy.boxplot = 
boxplot_frame_groupby + + +class PanelGroupBy(NDFrameGroupBy): + + def _iterate_slices(self): + if self.axis == 0: + # kludge + if self._selection is None: + slice_axis = self._selected_obj.items + else: + slice_axis = self._selection_list + slicer = lambda x: self._selected_obj[x] + else: + raise NotImplementedError + + for val in slice_axis: + if val in self.exclusions: + continue + + yield val, slicer(val) + + def aggregate(self, arg, *args, **kwargs): + """ + Aggregate using input function or dict of {column -> function} + + Parameters + ---------- + arg : function or dict + Function to use for aggregating groups. If a function, must either + work when passed a Panel or when passed to Panel.apply. If + pass a dict, the keys must be DataFrame column names + + Returns + ------- + aggregated : Panel + """ + if isinstance(arg, compat.string_types): + return getattr(self, arg)(*args, **kwargs) + + return self._aggregate_generic(arg, *args, **kwargs) + + def _wrap_generic_output(self, result, obj): + if self.axis == 0: + new_axes = list(obj.axes) + new_axes[0] = self.grouper.result_index + elif self.axis == 1: + x, y, z = obj.axes + new_axes = [self.grouper.result_index, z, x] + else: + x, y, z = obj.axes + new_axes = [self.grouper.result_index, y, x] + + result = Panel._from_axes(result, new_axes) + + if self.axis == 1: + result = result.swapaxes(0, 1).swapaxes(0, 2) + elif self.axis == 2: + result = result.swapaxes(0, 2) + + return result + + def _aggregate_item_by_item(self, func, *args, **kwargs): + obj = self._obj_with_exclusions + result = {} + + if self.axis > 0: + for item in obj: + try: + itemg = DataFrameGroupBy(obj[item], + axis=self.axis - 1, + grouper=self.grouper) + result[item] = itemg.aggregate(func, *args, **kwargs) + except (ValueError, TypeError): + raise + new_axes = list(obj.axes) + new_axes[self.axis] = self.grouper.result_index + return Panel._from_axes(result, new_axes) + else: + raise NotImplementedError + + def _wrap_aggregated_output(self, output, names=None): + raise NotImplementedError + + +class NDArrayGroupBy(GroupBy): + pass + + +#---------------------------------------------------------------------- +# Splitting / application + + +class DataSplitter(object): + + def __init__(self, data, labels, ngroups, axis=0): + self.data = data + self.labels = com._ensure_int64(labels) + self.ngroups = ngroups + + self.axis = axis + + @cache_readonly + def slabels(self): + # Sorted labels + return com.take_nd(self.labels, self.sort_idx, allow_fill=False) + + @cache_readonly + def sort_idx(self): + # Counting sort indexer + return _algos.groupsort_indexer(self.labels, self.ngroups)[0] + + def __iter__(self): + sdata = self._get_sorted_data() + + if self.ngroups == 0: + raise StopIteration + + starts, ends = lib.generate_slices(self.slabels, self.ngroups) + + for i, (start, end) in enumerate(zip(starts, ends)): + # Since I'm now compressing the group ids, it's now not "possible" + # to produce empty slices because such groups would not be observed + # in the data + # if start >= end: + # raise AssertionError('Start %s must be less than end %s' + # % (str(start), str(end))) + yield i, self._chop(sdata, slice(start, end)) + + def _get_sorted_data(self): + return self.data.take(self.sort_idx, axis=self.axis, convert=False) + + def _chop(self, sdata, slice_obj): + return sdata.iloc[slice_obj] + + def apply(self, f): + raise NotImplementedError + + +class ArraySplitter(DataSplitter): + pass + + +class SeriesSplitter(DataSplitter): + + def _chop(self, sdata, slice_obj): + return 
sdata._get_values(slice_obj).to_dense() + + +class FrameSplitter(DataSplitter): + + def __init__(self, data, labels, ngroups, axis=0): + super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis) + + def fast_apply(self, f, names): + # must return keys::list, values::list, mutated::bool + try: + starts, ends = lib.generate_slices(self.slabels, self.ngroups) + except: + # fails when all -1 + return [], True + + sdata = self._get_sorted_data() + results, mutated = lib.apply_frame_axis0(sdata, f, names, starts, ends) + + return results, mutated + + def _chop(self, sdata, slice_obj): + if self.axis == 0: + return sdata.iloc[slice_obj] + else: + return sdata._slice(slice_obj, axis=1) # ix[:, slice_obj] + + +class NDFrameSplitter(DataSplitter): + + def __init__(self, data, labels, ngroups, axis=0): + super(NDFrameSplitter, self).__init__(data, labels, ngroups, axis=axis) + + self.factory = data._constructor + + def _get_sorted_data(self): + # this is the BlockManager + data = self.data._data + + # this is sort of wasteful but... + sorted_axis = data.axes[self.axis].take(self.sort_idx) + sorted_data = data.reindex_axis(sorted_axis, axis=self.axis) + + return sorted_data + + def _chop(self, sdata, slice_obj): + return self.factory(sdata.get_slice(slice_obj, axis=self.axis)) + + +def get_splitter(data, *args, **kwargs): + if isinstance(data, Series): + klass = SeriesSplitter + elif isinstance(data, DataFrame): + klass = FrameSplitter + else: + klass = NDFrameSplitter + + return klass(data, *args, **kwargs) + + +#---------------------------------------------------------------------- +# Misc utilities + + +def get_group_index(label_list, shape): + """ + For the particular label_list, gets the offsets into the hypothetical list + representing the totally ordered cartesian product of all possible label + combinations. 
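+
+    For example (illustrative): with shape (2, 3) the per-level strides are
+    (3, 1), so label pairs (0, 2), (1, 0) and (1, 1) map to offsets 2, 3 and
+    4; any label of -1 (a missing value) forces the corresponding offset to -1.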
+ """ + if len(label_list) == 1: + return label_list[0] + + n = len(label_list[0]) + group_index = np.zeros(n, dtype=np.int64) + mask = np.zeros(n, dtype=bool) + for i in range(len(shape)): + stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64) + group_index += com._ensure_int64(label_list[i]) * stride + mask |= label_list[i] < 0 + + np.putmask(group_index, mask, -1) + return group_index + +_INT64_MAX = np.iinfo(np.int64).max + + +def _int64_overflow_possible(shape): + the_prod = long(1) + for x in shape: + the_prod *= long(x) + + return the_prod >= _INT64_MAX + + +def decons_group_index(comp_labels, shape): + # reconstruct labels + label_list = [] + factor = 1 + y = 0 + x = comp_labels + for i in reversed(range(len(shape))): + labels = (x - y) % (factor * shape[i]) // factor + np.putmask(labels, comp_labels < 0, -1) + label_list.append(labels) + y = labels * factor + factor *= shape[i] + return label_list[::-1] + + +def _indexer_from_factorized(labels, shape, compress=True): + if _int64_overflow_possible(shape): + indexer = np.lexsort(np.array(labels[::-1])) + return indexer + + group_index = get_group_index(labels, shape) + + if compress: + comp_ids, obs_ids = _compress_group_index(group_index) + max_group = len(obs_ids) + else: + comp_ids = group_index + max_group = com._long_prod(shape) + + if max_group > 1e6: + # Use mergesort to avoid memory errors in counting sort + indexer = comp_ids.argsort(kind='mergesort') + else: + indexer, _ = _algos.groupsort_indexer(comp_ids.astype(np.int64), + max_group) + + return indexer + + +def _lexsort_indexer(keys, orders=None, na_position='last'): + labels = [] + shape = [] + if isinstance(orders, bool): + orders = [orders] * len(keys) + elif orders is None: + orders = [True] * len(keys) + + for key, order in zip(keys, orders): + key = np.asanyarray(key) + rizer = _hash.Factorizer(len(key)) + + if not key.dtype == np.object_: + key = key.astype('O') + + # factorize maps nans to na_sentinel=-1 + ids = rizer.factorize(key, sort=True) + n = len(rizer.uniques) + mask = (ids == -1) + if order: # ascending + if na_position == 'last': + ids = np.where(mask, n, ids) + elif na_position == 'first': + ids += 1 + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + else: # not order means descending + if na_position == 'last': + ids = np.where(mask, n, n-ids-1) + elif na_position == 'first': + ids = np.where(mask, 0, n-ids) + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + if mask.any(): + n += 1 + shape.append(n) + labels.append(ids) + return _indexer_from_factorized(labels, shape) + +def _nargsort(items, kind='quicksort', ascending=True, na_position='last'): + """ + This is intended to be a drop-in replacement for np.argsort which handles NaNs + It adds ascending and na_position parameters. 
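+    For example (illustrative), _nargsort([3, np.nan, 1]) gives [2, 0, 1]
+    (the NaN placed last), while na_position='first' gives [1, 2, 0].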
+ GH #6399, #5231 + """ + items = np.asanyarray(items) + idx = np.arange(len(items)) + mask = isnull(items) + non_nans = items[~mask] + non_nan_idx = idx[~mask] + nan_idx = np.nonzero(mask)[0] + if not ascending: + non_nans = non_nans[::-1] + non_nan_idx = non_nan_idx[::-1] + indexer = non_nan_idx[non_nans.argsort(kind=kind)] + if not ascending: + indexer = indexer[::-1] + # Finally, place the NaNs at the end or the beginning according to na_position + if na_position == 'last': + indexer = np.concatenate([indexer, nan_idx]) + elif na_position == 'first': + indexer = np.concatenate([nan_idx, indexer]) + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + return indexer + + +class _KeyMapper(object): + + """ + Ease my suffering. Map compressed group id -> key tuple + """ + + def __init__(self, comp_ids, ngroups, labels, levels): + self.levels = levels + self.labels = labels + self.comp_ids = comp_ids.astype(np.int64) + + self.k = len(labels) + self.tables = [_hash.Int64HashTable(ngroups) for _ in range(self.k)] + + self._populate_tables() + + def _populate_tables(self): + for labs, table in zip(self.labels, self.tables): + table.map(self.comp_ids, labs.astype(np.int64)) + + def get_key(self, comp_id): + return tuple(level[table.get_item(comp_id)] + for table, level in zip(self.tables, self.levels)) + + +def _get_indices_dict(label_list, keys): + shape = [len(x) for x in keys] + group_index = get_group_index(label_list, shape) + + sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index), + np.prod(shape)) + + sorter_int = com._ensure_platform_int(sorter) + + sorted_labels = [lab.take(sorter_int) for lab in label_list] + group_index = group_index.take(sorter_int) + + return lib.indices_fast(sorter, group_index, keys, sorted_labels) + + +#---------------------------------------------------------------------- +# sorting levels...cleverly? + + +def _compress_group_index(group_index, sort=True): + """ + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). 
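+
+    For example (illustrative), group_index [5, 3, 5, 7] compresses (with
+    sort=True) to comp_ids [1, 0, 1, 2] and obs_group_ids [3, 5, 7].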
+ """ + + table = _hash.Int64HashTable(min(1000000, len(group_index))) + + group_index = com._ensure_int64(group_index) + + # note, group labels come out ascending (ie, 1,2,3 etc) + comp_ids, obs_group_ids = table.get_labels_groupby(group_index) + + if sort and len(obs_group_ids) > 0: + obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) + + return comp_ids, obs_group_ids + + +def _reorder_by_uniques(uniques, labels): + # sorter is index where elements ought to go + sorter = uniques.argsort() + + # reverse_indexer is where elements came from + reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + + # move labels to right locations (ie, unsort ascending labels) + labels = com.take_nd(reverse_indexer, labels, allow_fill=False) + np.putmask(labels, mask, -1) + + # sort observed ids + uniques = com.take_nd(uniques, sorter, allow_fill=False) + + return uniques, labels + + +_func_table = { + builtins.sum: np.sum +} + + +_cython_table = { + builtins.sum: 'sum', + np.sum: 'sum', + np.mean: 'mean', + np.prod: 'prod', + np.std: 'std', + np.var: 'var', + np.median: 'median', + np.max: 'max', + np.min: 'min' +} + + +def _intercept_function(func): + return _func_table.get(func, func) + + +def _intercept_cython(func): + return _cython_table.get(func) + + +def _groupby_indices(values): + return _algos.groupby_indices(com._ensure_object(values)) + + +def numpy_groupby(data, labels, axis=0): + s = np.argsort(labels) + keys, inv = np.unique(labels, return_inverse=True) + i = inv.take(s) + groups_at = np.where(i != np.concatenate(([-1], i[:-1])))[0] + ordered_data = data.take(s, axis=axis) + group_sums = np.add.reduceat(ordered_data, groups_at, axis=axis) + + return group_sums diff --git a/pandas/core/index.py b/pandas/core/index.py new file mode 100644 index 00000000..262305a3 --- /dev/null +++ b/pandas/core/index.py @@ -0,0 +1,4093 @@ +# pylint: disable=E1101,E1103,W0232 +import datetime +import warnings +from functools import partial +from pandas.compat import range, zip, lrange, lzip, u, reduce +from pandas import compat +import numpy as np + +import pandas.tslib as tslib +import pandas.lib as lib +import pandas.algos as _algos +import pandas.index as _index +from pandas.lib import Timestamp, is_datetime_array +from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin +from pandas.util.decorators import cache_readonly, deprecate +from pandas.core.common import isnull, array_equivalent +import pandas.core.common as com +from pandas.core.common import (_values_from_object, is_float, is_integer, + ABCSeries) +from pandas.core.config import get_option + +# simplify +default_pprint = lambda x: com.pprint_thing(x, escape_chars=('\t', '\r', '\n'), + quote_strings=True) + + +__all__ = ['Index'] + + +_unsortable_types = frozenset(('mixed', 'mixed-integer')) + + +def _try_get_item(x): + try: + return x.item() + except AttributeError: + return x + + +def _indexOp(opname): + """ + Wrapper function for index comparison operations, to avoid + code duplication. 
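+
+    The generated wrapper operates on the underlying ndarray view, so a
+    comparison such as __eq__ returns a plain boolean ndarray rather than
+    an Index.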
+ """ + + def wrapper(self, other): + func = getattr(self.view(np.ndarray), opname) + result = func(other) + try: + return result.view(np.ndarray) + except: # pragma: no cover + return result + return wrapper + + +class InvalidIndexError(Exception): + pass + + +_o_dtype = np.dtype(object) + + +def _shouldbe_timestamp(obj): + return (tslib.is_datetime_array(obj) + or tslib.is_datetime64_array(obj) + or tslib.is_timestamp_array(obj)) + +_Identity = object + + +class Index(IndexOpsMixin, FrozenNDArray): + + """ + Immutable ndarray implementing an ordered, sliceable set. The basic object + storing axis labels for all pandas objects + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: object) + copy : bool + Make a copy of input ndarray + name : object + Name to be stored in the index + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible + + Notes + ----- + An Index instance can **only** contain hashable objects + """ + # To hand over control to subclasses + _join_precedence = 1 + + # Cython methods + _groupby = _algos.groupby_object + _arrmap = _algos.arrmap_object + _left_indexer_unique = _algos.left_join_indexer_unique_object + _left_indexer = _algos.left_join_indexer_object + _inner_indexer = _algos.inner_join_indexer_object + _outer_indexer = _algos.outer_join_indexer_object + + _box_scalars = False + + name = None + asi8 = None + _comparables = ['name'] + _allow_index_ops = True + _allow_datetime_index_ops = False + _allow_period_index_ops = False + + _engine_type = _index.ObjectEngine + + def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, + tupleize_cols=True, **kwargs): + + # no class inference! + if fastpath: + return cls._simple_new(data, name) + + from pandas.tseries.period import PeriodIndex + if isinstance(data, (np.ndarray, ABCSeries)): + if issubclass(data.dtype.type, np.datetime64): + from pandas.tseries.index import DatetimeIndex + result = DatetimeIndex(data, copy=copy, name=name, **kwargs) + if dtype is not None and _o_dtype == dtype: + return Index(result.to_pydatetime(), dtype=_o_dtype) + else: + return result + elif issubclass(data.dtype.type, np.timedelta64): + return Int64Index(data, copy=copy, name=name) + + if dtype is not None: + try: + data = np.array(data, dtype=dtype, copy=copy) + except TypeError: + pass + elif isinstance(data, PeriodIndex): + return PeriodIndex(data, copy=copy, name=name, **kwargs) + + if issubclass(data.dtype.type, np.integer): + return Int64Index(data, copy=copy, dtype=dtype, name=name) + if issubclass(data.dtype.type, np.floating): + return Float64Index(data, copy=copy, dtype=dtype, name=name) + + subarr = com._asarray_tuplesafe(data, dtype=object) + + # _asarray_tuplesafe does not always copy underlying data, + # so need to make sure that this happens + if copy: + subarr = subarr.copy() + + elif hasattr(data, '__array__'): + return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, + **kwargs) + elif np.isscalar(data): + cls._scalar_data_error(data) + else: + if tupleize_cols and isinstance(data, list) and data: + try: + sorted(data) + has_mixed_types = False + except (TypeError, UnicodeDecodeError): + has_mixed_types = True # python3 only + if isinstance(data[0], tuple) and not has_mixed_types: + try: + return MultiIndex.from_tuples( + data, names=name or kwargs.get('names')) + except (TypeError, KeyError): + pass # python2 - MultiIndex fails on mixed types + # other iterable of some kind + subarr = com._asarray_tuplesafe(data, 
dtype=object) + + if dtype is None: + inferred = lib.infer_dtype(subarr) + if inferred == 'integer': + return Int64Index(subarr.astype('i8'), copy=copy, name=name) + elif inferred in ['floating', 'mixed-integer-float']: + return Float64Index(subarr, copy=copy, name=name) + elif inferred != 'string': + if (inferred.startswith('datetime') or + tslib.is_timestamp_array(subarr)): + from pandas.tseries.index import DatetimeIndex + return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) + elif inferred == 'period': + return PeriodIndex(subarr, name=name, **kwargs) + + subarr = subarr.view(cls) + # could also have a _set_name, but I don't think it's really necessary + subarr._set_names([name]) + return subarr + + @classmethod + def _simple_new(cls, values, name, **kwargs): + result = values.view(cls) + result.name = name + return result + + def is_(self, other): + """ + More flexible, faster check like ``is`` but that works through views + + Note: this is *not* the same as ``Index.identical()``, which checks + that metadata is also the same. + + Parameters + ---------- + other : object + other object to compare against. + + Returns + ------- + True if both have same underlying data, False otherwise : bool + """ + # use something other than None to be clearer + return self._id is getattr(other, '_id', Ellipsis) + + def _reset_identity(self): + """Initializes or resets ``_id`` attribute with new object""" + self._id = _Identity() + + def view(self, *args, **kwargs): + result = super(Index, self).view(*args, **kwargs) + if isinstance(result, Index): + result._id = self._id + return result + + # construction helpers + @classmethod + def _scalar_data_error(cls, data): + raise TypeError( + '{0}(...) must be called with a collection of some kind, {1} was ' + 'passed'.format(cls.__name__, repr(data)) + ) + + @classmethod + def _string_data_error(cls, data): + raise TypeError('String dtype not supported, you may need ' + 'to explicitly cast to a numeric type') + + @classmethod + def _coerce_to_ndarray(cls, data): + """coerces data to ndarray, raises on scalar data. Converts other + iterables to list first and then to array. Does not touch ndarrays.""" + + if not isinstance(data, np.ndarray): + if np.isscalar(data): + cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + data = np.asarray(data) + return data + + def __array_finalize__(self, obj): + self._reset_identity() + if not isinstance(obj, type(self)): + # Only relevant if array being created from an Index instance + return + + self.name = getattr(obj, 'name', None) + + def _shallow_copy(self): + return self.view() + + def copy(self, names=None, name=None, dtype=None, deep=False): + """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. + + Parameters + ---------- + name : string, optional + dtype : numpy dtype or pandas type + + Returns + ------- + copy : Index + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. 
+ """ + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + if deep: + from copy import deepcopy + new_index = np.ndarray.__deepcopy__(self, {}).view(self.__class__) + name = name or deepcopy(self.name) + else: + new_index = super(Index, self).copy() + if name is not None: + names = [name] + if names: + new_index = new_index.set_names(names) + if dtype: + new_index = new_index.astype(dtype) + return new_index + + def to_series(self, keep_tz=False): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False. + applies only to a DatetimeIndex + + Returns + ------- + Series : dtype will be based on the type of the Index values. + """ + + import pandas as pd + values = self._to_embed(keep_tz) + return pd.Series(values, index=self, name=self.name) + + def _to_embed(self, keep_tz=False): + """ return an array repr of this object, potentially casting to object """ + return self.values + + def astype(self, dtype): + return Index(self.values.astype(dtype), name=self.name, + dtype=dtype) + + def to_datetime(self, dayfirst=False): + """ + For an Index containing strings or datetime.datetime objects, attempt + conversion to DatetimeIndex + """ + from pandas.tseries.index import DatetimeIndex + if self.inferred_type == 'string': + from dateutil.parser import parse + parser = lambda x: parse(x, dayfirst=dayfirst) + parsed = lib.try_parse_dates(self.values, parser=parser) + return DatetimeIndex(parsed) + else: + return DatetimeIndex(self.values) + + def _assert_can_do_setop(self, other): + return True + + def tolist(self): + """ + Overridden version of ndarray.tolist + """ + return list(self.values) + + @cache_readonly + def dtype(self): + return self.values.dtype + + @property + def nlevels(self): + return 1 + + # for compat with multindex code + + def _get_names(self): + return FrozenList((self.name,)) + + def _set_names(self, values): + if len(values) != 1: + raise ValueError('Length of new names must be 1, got %d' + % len(values)) + self.name = values[0] + + names = property(fset=_set_names, fget=_get_names) + + def set_names(self, names, inplace=False): + """ + Set new names on index. Defaults to returning new index. + + Parameters + ---------- + names : sequence + names to set + inplace : bool + if True, mutates in place + + Returns + ------- + new index (of same type and class...etc) [if inplace, returns None] + """ + if not com.is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._set_names(names) + if not inplace: + return idx + + def rename(self, name, inplace=False): + """ + Set new names on index. Defaults to returning new index. 
+ + Parameters + ---------- + name : str or list + name to set + inplace : bool + if True, mutates in place + + Returns + ------- + new index (of same type and class...etc) [if inplace, returns None] + """ + return self.set_names([name], inplace=inplace) + + @property + def _has_complex_internals(self): + # to disable groupby tricks in MultiIndex + return False + + def summary(self, name=None): + if len(self) > 0: + head = self[0] + if hasattr(head, 'format') and\ + not isinstance(head, compat.string_types): + head = head.format() + tail = self[-1] + if hasattr(tail, 'format') and\ + not isinstance(tail, compat.string_types): + tail = tail.format() + index_summary = ', %s to %s' % (com.pprint_thing(head), + com.pprint_thing(tail)) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + return '%s: %s entries%s' % (name, len(self), index_summary) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values + + @property + def values(self): + return np.asarray(self) + + def get_values(self): + return self.values + + _na_value = np.nan + """The expected NA value to use with this index.""" + + @property + def is_monotonic(self): + return self._engine.is_monotonic + + def is_lexsorted_for_tuple(self, tup): + return True + + @cache_readonly(allow_setting=True) + def is_unique(self): + return self._engine.is_unique + + def is_integer(self): + return self.inferred_type in ['integer'] + + def is_floating(self): + return self.inferred_type in ['floating', 'mixed-integer-float'] + + def is_numeric(self): + return self.inferred_type in ['integer', 'floating'] + + def is_mixed(self): + return 'mixed' in self.inferred_type + + def holds_integer(self): + return self.inferred_type in ['integer', 'mixed-integer'] + + def _convert_scalar_indexer(self, key, typ=None): + """ convert a scalar indexer, right now we are converting + floats -> ints if the index supports it + """ + + def to_int(): + ikey = int(key) + if ikey != key: + return self._convert_indexer_error(key, 'label') + return ikey + + if typ == 'iloc': + if is_integer(key): + return key + elif is_float(key): + key = to_int() + warnings.warn("scalar indexers for index type {0} should be integers and not floating point".format( + type(self).__name__),FutureWarning) + return key + return self._convert_indexer_error(key, 'label') + + if is_float(key): + if not self.is_floating(): + warnings.warn("scalar indexers for index type {0} should be integers and not floating point".format( + type(self).__name__),FutureWarning) + return to_int() + + return key + + def _validate_slicer(self, key, f): + """ validate and raise if needed on a slice indexers according to the + passed in function """ + + for c in ['start','stop','step']: + if not f(getattr(key,c)): + self._convert_indexer_error(key.start, 'slice {0} value'.format(c)) + + def _convert_slice_indexer_getitem(self, key, is_index_slice=False): + """ called from the getitem slicers, determine how to treat the key + whether positional or not """ + if self.is_integer() or is_index_slice: + return key + return self._convert_slice_indexer(key) + + def _convert_slice_indexer(self, key, typ=None): + """ convert a slice indexer. 
disallow floats in the start/stop/step """ + + # if we are not a slice, then we are done + if not isinstance(key, slice): + return key + + # validate iloc + if typ == 'iloc': + + # need to coerce to_int if needed + def f(c): + v = getattr(key,c) + if v is None or is_integer(v): + return v + + # warn if its a convertible float + if v == int(v): + warnings.warn("slice indexers when using iloc should be integers " + "and not floating point",FutureWarning) + return int(v) + + self._convert_indexer_error(v, 'slice {0} value'.format(c)) + + return slice(*[ f(c) for c in ['start','stop','step']]) + + # validate slicers + def validate(v): + if v is None or is_integer(v): + return True + + # dissallow floats + elif is_float(v): + return False + + return True + self._validate_slicer(key, validate) + + # figure out if this is a positional indexer + start, stop, step = key.start, key.stop, key.step + + def is_int(v): + return v is None or is_integer(v) + + is_null_slice = start is None and stop is None + is_index_slice = is_int(start) and is_int(stop) + is_positional = is_index_slice and not self.is_integer() + + if typ == 'getitem': + return self._convert_slice_indexer_getitem( + key, is_index_slice=is_index_slice) + + # convert the slice to an indexer here + + # if we are mixed and have integers + try: + if is_positional and self.is_mixed(): + if start is not None: + i = self.get_loc(start) + if stop is not None: + j = self.get_loc(stop) + is_positional = False + except KeyError: + if self.inferred_type == 'mixed-integer-float': + raise + + if is_null_slice: + indexer = key + elif is_positional: + indexer = key + else: + try: + indexer = self.slice_indexer(start, stop, step) + except Exception: + if is_index_slice: + if self.is_integer(): + raise + else: + indexer = key + else: + raise + + return indexer + + def _convert_list_indexer(self, key, typ=None): + """ convert a list indexer. these should be locations """ + return key + + def _convert_list_indexer_for_mixed(self, keyarr, typ=None): + """ passed a key that is tuplesafe that is integer based + and we have a mixed index (e.g. number/labels). figure out + the indexer. 
return None if we can't help + """ + if (typ is None or typ in ['iloc','ix']) and (com.is_integer_dtype(keyarr) and not self.is_floating()): + if self.inferred_type != 'integer': + keyarr = np.where(keyarr < 0, + len(self) + keyarr, keyarr) + + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + + from pandas.core.indexing import _maybe_convert_indices + return _maybe_convert_indices(indexer, len(self)) + + elif not self.inferred_type == 'integer': + return keyarr + + return None + + def _convert_indexer_error(self, key, msg=None): + if msg is None: + msg = 'label' + raise TypeError("the {0} [{1}] is not a proper indexer for this index " + "type ({2})".format(msg, key, self.__class__.__name__)) + + def get_duplicates(self): + from collections import defaultdict + counter = defaultdict(lambda: 0) + for k in self.values: + counter[k] += 1 + return sorted(k for k, v in compat.iteritems(counter) if v > 1) + + _get_duplicates = get_duplicates + + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self.values, len(self)) + + def _get_level_number(self, level): + if not isinstance(level, int): + if level != self.name: + raise AssertionError('Level %s must be same as name (%s)' + % (level, self.name)) + level = 0 + return level + + @cache_readonly + def inferred_type(self): + return lib.infer_dtype(self) + + def is_type_compatible(self, typ): + return typ == self.inferred_type + + @cache_readonly + def is_all_dates(self): + return is_datetime_array(self.values) + + def __iter__(self): + return iter(self.values) + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(np.ndarray.__reduce__(self)) + subclass_state = self.name, + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if len(state) == 2: + nd_state, own_state = state + np.ndarray.__setstate__(self, nd_state) + self.name = own_state[0] + else: # pragma: no cover + np.ndarray.__setstate__(self, state) + + def __deepcopy__(self, memo={}): + return self.copy(deep=True) + + def __contains__(self, key): + hash(key) + # work around some kind of odd cython bug + try: + return key in self._engine + except TypeError: + return False + + def __hash__(self): + raise TypeError("unhashable type: %r" % type(self).__name__) + + def __getitem__(self, key): + """ + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. + + """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + __getitem__ = super(Index, self).__getitem__ + if np.isscalar(key): + return __getitem__(key) + + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. 
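+            # Descriptive note (added): slicing a 1-d ndarray subclass yields
+            # a 1-d view of the same class, so the result can be returned
+            # directly without the ndim check used for fancy indexing below.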
+ return __getitem__(key) + + if com._is_bool_indexer(key): + return __getitem__(np.asarray(key)) + + result = __getitem__(key) + if result.ndim > 1: + return result.view(np.ndarray) + else: + return result + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + name = self.name + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) + + for obj in to_concat: + if isinstance(obj, Index) and obj.name != name: + name = None + break + + to_concat = self._ensure_compat_concat(to_concat) + to_concat = [x.values if isinstance(x, Index) else x + for x in to_concat] + + return Index(np.concatenate(to_concat), name=name) + + @staticmethod + def _ensure_compat_concat(indexes): + from pandas.tseries.api import DatetimeIndex, PeriodIndex + klasses = DatetimeIndex, PeriodIndex + + is_ts = [isinstance(idx, klasses) for idx in indexes] + + if any(is_ts) and not all(is_ts): + return [_maybe_box(idx) for idx in indexes] + + return indexes + + def take(self, indexer, axis=0): + """ + Analogous to ndarray.take + """ + indexer = com._ensure_platform_int(indexer) + taken = self.view(np.ndarray).take(indexer) + return self._simple_new(taken, name=self.name, freq=None, + tz=getattr(self, 'tz', None)) + + def format(self, name=False, formatter=None, **kwargs): + """ + Render a string representation of the Index + """ + header = [] + if name: + header.append(com.pprint_thing(self.name, + escape_chars=('\t', '\r', '\n')) + if self.name is not None else '') + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header, **kwargs) + + def _format_with_header(self, header, na_rep='NaN', **kwargs): + values = self.values + + from pandas.core.format import format_array + + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values, safe=1) + + if values.dtype == np.object_: + result = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n')) + for x in values] + + # could have nans + mask = isnull(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() + + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result + + def to_native_types(self, slicer=None, **kwargs): + """ slice and dice then format """ + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) + + def _format_native_types(self, na_rep='', **kwargs): + """ actually format my specific types """ + mask = isnull(self) + values = np.array(self, dtype=object, copy=True) + values[mask] = na_rep + return values.tolist() + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + if not isinstance(other, Index): + return False + + if type(other) != Index: + return other.equals(self) + + return array_equivalent(self, other) + + def identical(self, other): + """Similar to equals, but check that other comparable attributes are + also equal + """ + return (self.equals(other) and + all((getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables)) and + type(self) == type(other)) + + def asof(self, label): + """ + For a sorted index, return the most recent label up to and including + the passed label. 
Return NaN if not found + """ + if isinstance(label, (Index, ABCSeries, np.ndarray)): + raise TypeError('%s' % type(label)) + + if label not in self: + loc = self.searchsorted(label, side='left') + if loc > 0: + return self[loc - 1] + else: + return np.nan + + if not isinstance(label, Timestamp): + label = Timestamp(label) + return label + + def asof_locs(self, where, mask): + """ + where : array of timestamps + mask : array of booleans where data is not NA + + """ + locs = self.values[mask].searchsorted(where.values, side='right') + + locs = np.where(locs > 0, locs - 1, 0) + result = np.arange(len(self))[mask].take(locs) + + first = mask.argmax() + result[(locs == 0) & (where < self.values[first])] = -1 + + return result + + def order(self, return_indexer=False, ascending=True): + """ + Return sorted copy of Index + """ + _as = self.argsort() + if not ascending: + _as = _as[::-1] + + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + + def sort(self, *args, **kwargs): + raise TypeError('Cannot sort an %r object' % self.__class__.__name__) + + def shift(self, periods=1, freq=None): + """ + Shift Index containing datetime objects by input number of periods and + DateOffset + + Returns + ------- + shifted : Index + """ + if periods == 0: + # OK because immutable + return self + + offset = periods * freq + return Index([idx + offset for idx in self], name=self.name) + + def argsort(self, *args, **kwargs): + """ + See docstring for ndarray.argsort + """ + result = self.asi8 + if result is None: + result = self.view(np.ndarray) + return result.argsort(*args, **kwargs) + + def __add__(self, other): + if isinstance(other, Index): + return self.union(other) + else: + return Index(self.view(np.ndarray) + other) + + __iadd__ = __add__ + __eq__ = _indexOp('__eq__') + __ne__ = _indexOp('__ne__') + __lt__ = _indexOp('__lt__') + __gt__ = _indexOp('__gt__') + __le__ = _indexOp('__le__') + __ge__ = _indexOp('__ge__') + + def __sub__(self, other): + return self.diff(other) + + def __and__(self, other): + return self.intersection(other) + + def __or__(self, other): + return self.union(other) + + def __xor__(self, other): + return self.sym_diff(other) + + def union(self, other): + """ + Form the union of two Index objects and sorts if possible + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + union : Index + """ + if not hasattr(other, '__iter__'): + raise TypeError('Input must be iterable.') + + if len(other) == 0 or self.equals(other): + return self + + if len(self) == 0: + return _ensure_index(other) + + self._assert_can_do_setop(other) + + if self.dtype != other.dtype: + this = self.astype('O') + other = other.astype('O') + return this.union(other) + + if self.is_monotonic and other.is_monotonic: + try: + result = self._outer_indexer(self, other.values)[0] + except TypeError: + # incomparable objects + result = list(self.values) + + # worth making this faster? 
a very unusual case + value_set = set(self.values) + result.extend([x for x in other.values if x not in value_set]) + else: + indexer = self.get_indexer(other) + indexer, = (indexer == -1).nonzero() + + if len(indexer) > 0: + other_diff = com.take_nd(other.values, indexer, + allow_fill=False) + result = com._concat_compat((self.values, other_diff)) + + try: + self.values[0] < other_diff[0] + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning) + else: + types = frozenset((self.inferred_type, + other.inferred_type)) + if not types & _unsortable_types: + result.sort() + + else: + result = self.values + + try: + result = np.sort(result) + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning) + + # for subclasses + return self._wrap_union_result(other, result) + + def _wrap_union_result(self, other, result): + name = self.name if self.name == other.name else None + return self.__class__(data=result, name=name) + + def intersection(self, other): + """ + Form the intersection of two Index objects. Sortedness of the result is + not guaranteed + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + intersection : Index + """ + if not hasattr(other, '__iter__'): + raise TypeError('Input must be iterable!') + + self._assert_can_do_setop(other) + + other = _ensure_index(other) + + if self.equals(other): + return self + + if self.dtype != other.dtype: + this = self.astype('O') + other = other.astype('O') + return this.intersection(other) + + if self.is_monotonic and other.is_monotonic: + try: + result = self._inner_indexer(self, other.values)[0] + return self._wrap_union_result(other, result) + except TypeError: + pass + + try: + indexer = self.get_indexer(other.values) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except: + # duplicates + indexer = self.get_indexer_non_unique(other.values)[0].unique() + + taken = self.take(indexer) + if self.name != other.name: + taken.name = None + return taken + + def diff(self, other): + """ + Compute sorted set difference of two Index objects + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + diff : Index + + Notes + ----- + One can do either of these and achieve the same result + + >>> index - index2 + >>> index.diff(index2) + """ + + if not hasattr(other, '__iter__'): + raise TypeError('Input must be iterable!') + + if self.equals(other): + return Index([], name=self.name) + + if not isinstance(other, Index): + other = np.asarray(other) + result_name = self.name + else: + result_name = self.name if self.name == other.name else None + + theDiff = sorted(set(self) - set(other)) + return Index(theDiff, name=result_name) + + def sym_diff(self, other, result_name=None): + """ + Compute the sorted symmetric difference of two Index objects. + + Parameters + ---------- + + other : array-like + result_name : str + + Returns + ------- + sym_diff : Index + + Notes + ----- + ``sym_diff`` contains elements that appear in either ``idx1`` or + ``idx2`` but not both. Equivalent to the Index created by + ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. + + The sorting of a result containing ``NaN`` values is not guaranteed + across Python versions. See GitHub issue #6444. 
+ + Examples + -------- + >>> idx1 = Index([1, 2, 3, 4]) + >>> idx2 = Index([2, 3, 4, 5]) + >>> idx1.sym_diff(idx2) + Int64Index([1, 5], dtype='int64') + + You can also use the ``^`` operator: + + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + if not hasattr(other, '__iter__'): + raise TypeError('Input must be iterable!') + + if not isinstance(other, Index): + other = Index(other) + result_name = result_name or self.name + + the_diff = sorted(set((self - other) + (other - self))) + return Index(the_diff, name=result_name) + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int if unique index, possibly slice or mask if not + """ + return self._engine.get_loc(_values_from_object(key)) + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + s = _values_from_object(series) + k = _values_from_object(key) + + # prevent integer truncation bug in indexing + if is_float(k) and not self.is_floating(): + raise KeyError + + try: + return self._engine.get_value(s, k) + except KeyError as e1: + if len(self) > 0 and self.inferred_type in ['integer','boolean']: + raise + + try: + return tslib.get_value_box(s, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if com.is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + # python 3 + if np.isscalar(key): # pragma: no cover + raise IndexError(key) + raise InvalidIndexError(key) + + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + self._engine.set_value( + _values_from_object(arr), _values_from_object(key), value) + + def get_level_values(self, level): + """ + Return vector of label values for requested level, equal to the length + of the index + + Parameters + ---------- + level : int + + Returns + ------- + values : ndarray + """ + # checks that level number is actually just 1 + self._get_level_number(level) + return self + + def get_indexer(self, target, method=None, limit=None): + """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. 
The mask determines whether labels are + found or not in the current index + + Parameters + ---------- + target : Index + method : {'pad', 'ffill', 'backfill', 'bfill'} + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Notes + ----- + This is a low-level method and probably should be used at your own risk + + Examples + -------- + >>> indexer = index.get_indexer(new_index) + >>> new_values = cur_values.take(indexer) + + Returns + ------- + indexer : ndarray + """ + method = self._get_method(method) + target = _ensure_index(target) + + pself, ptarget = self._possibly_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer(ptarget, method=method, limit=limit) + + if self.dtype != target.dtype: + this = self.astype(object) + target = target.astype(object) + return this.get_indexer(target, method=method, limit=limit) + + if not self.is_unique: + raise InvalidIndexError('Reindexing only valid with uniquely' + ' valued Index objects') + + if method == 'pad': + if not self.is_monotonic or not target.is_monotonic: + raise ValueError('Must be monotonic for forward fill') + indexer = self._engine.get_pad_indexer(target.values, limit) + elif method == 'backfill': + if not self.is_monotonic or not target.is_monotonic: + raise ValueError('Must be monotonic for backward fill') + indexer = self._engine.get_backfill_indexer(target.values, limit) + elif method is None: + indexer = self._engine.get_indexer(target.values) + else: + raise ValueError('unrecognized method: %s' % method) + + return com._ensure_platform_int(indexer) + + def get_indexer_non_unique(self, target, **kwargs): + """ return an indexer suitable for taking from a non unique index + return the labels in the same order as the target, and + return a missing indexer into the target (missing are marked as -1 + in the indexer); target must be an iterable """ + target = _ensure_index(target) + pself, ptarget = self._possibly_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if self.is_all_dates: + self = Index(self.asi8) + tgt_values = target.asi8 + else: + tgt_values = target.values + + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return Index(indexer), missing + + def get_indexer_for(self, target, **kwargs): + """ guaranteed return of an indexer even when non-unique """ + if self.is_unique: + return self.get_indexer(target, **kwargs) + return self.get_indexer_non_unique(target, **kwargs)[0] + + def _possibly_promote(self, other): + # A hack, but it works + from pandas.tseries.index import DatetimeIndex + if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): + return DatetimeIndex(self), other + return self, other + + def groupby(self, to_groupby): + return self._groupby(self.values, to_groupby) + + def map(self, mapper): + return self._arrmap(self.values, mapper) + + def isin(self, values): + """ + Compute boolean array of whether each index value is found in the + passed set of values + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + value_set = set(values) + return lib.ismember(self._array_values(), value_set) + + def _array_values(self): + return self + + def _get_method(self, method): + if method: + method = method.lower() + + aliases = { + 'ffill': 'pad', + 'bfill': 'backfill' + } + return aliases.get(method, method) + + def 
reindex(self, target, method=None, level=None, limit=None, + copy_if_needed=False): + """ + For Index, simply returns the new index and the results of + get_indexer. Provided here to enable an interface that is amenable for + subclasses of Index whose internals are different (like MultiIndex) + + Returns + ------- + (new_index, indexer, mask) : tuple + """ + target = _ensure_index(target) + if level is not None: + if method is not None: + raise TypeError('Fill method not supported if level passed') + _, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) + else: + + if self.equals(target): + indexer = None + + # to avoid aliasing an existing index + if (copy_if_needed and target.name != self.name and + self.name is not None): + if target.name is None: + target = self.copy() + + else: + if self.is_unique: + indexer = self.get_indexer(target, method=method, + limit=limit) + else: + if method is not None or limit is not None: + raise ValueError("cannot reindex a non-unique index " + "with a method or limit") + indexer, missing = self.get_indexer_non_unique(target) + + return target, indexer + + def join(self, other, how='left', level=None, return_indexers=False): + """ + Internal API method. Compute join_index and indexers to conform data + structures to the new index. + + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + # try to figure out the join level + # GH3662 + if (level is None and (self_is_mi or other_is_mi)): + + # have the same levels/names so a simple join + if self.names == other.names: + pass + else: + return self._join_multi(other, how=how, return_indexers=return_indexers) + + # join on the level + if (level is not None and (self_is_mi or other_is_mi)): + return self._join_level(other, level, how=how, + return_indexers=return_indexers) + + other = _ensure_index(other) + + if len(other) == 0 and how in ('left', 'outer'): + join_index = self._shallow_copy() + if return_indexers: + rindexer = np.repeat(-1, len(join_index)) + return join_index, None, rindexer + else: + return join_index + + if len(self) == 0 and how in ('right', 'outer'): + join_index = other._shallow_copy() + if return_indexers: + lindexer = np.repeat(-1, len(join_index)) + return join_index, lindexer, None + else: + return join_index + + if self._join_precedence < other._join_precedence: + how = {'right': 'left', 'left': 'right'}.get(how, how) + result = other.join(self, how=how, level=level, + return_indexers=return_indexers) + if return_indexers: + x, y, z = result + result = x, z, y + return result + + if self.dtype != other.dtype: + this = self.astype('O') + other = other.astype('O') + return this.join(other, how=how, + return_indexers=return_indexers) + + _validate_join_method(how) + + if not self.is_unique and not other.is_unique: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + else: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif self.is_monotonic and other.is_monotonic: + try: + return self._join_monotonic(other, how=how, + 
return_indexers=return_indexers) + except TypeError: + pass + + if how == 'left': + join_index = self + elif how == 'right': + join_index = other + elif how == 'inner': + join_index = self.intersection(other) + elif how == 'outer': + join_index = self.union(other) + + if return_indexers: + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer(join_index) + return join_index, lindexer, rindexer + else: + return join_index + + def _join_multi(self, other, how, return_indexers=True): + + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + # figure out join names + self_names = [ n for n in self.names if n is not None ] + other_names = [ n for n in other.names if n is not None ] + overlap = list(set(self_names) & set(other_names)) + + # need at least 1 in common, but not more than 1 + if not len(overlap): + raise ValueError("cannot join with no level specified and no overlapping names") + if len(overlap) > 1: + raise NotImplementedError("merging with more than one level overlap on a multi-index is not implemented") + jl = overlap[0] + + # make the indices into mi's that match + if not (self_is_mi and other_is_mi): + + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) + + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result + + # 2 multi-indexes + raise NotImplementedError("merging with both multi-indexes is not implemented") + + def _join_non_unique(self, other, how='left', return_indexers=False): + from pandas.tools.merge import _get_join_indexers + + left_idx, right_idx = _get_join_indexers([self.values], [other.values], + how=how, sort=True) + + left_idx = com._ensure_platform_int(left_idx) + right_idx = com._ensure_platform_int(right_idx) + + join_index = self.values.take(left_idx) + mask = left_idx == -1 + np.putmask(join_index, mask, other.values.take(right_idx)) + + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + return join_index, left_idx, right_idx + else: + return join_index + + def _join_level(self, other, level, how='left', return_indexers=False): + """ + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. 
The order of the data indexed by + the MultiIndex will not be changed (currently) + """ + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise TypeError('Join on level between two MultiIndex objects ' + 'is ambiguous') + + left, right = self, other + + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + how = {'right': 'left', 'left': 'right'}.get(how, how) + + level = left._get_level_number(level) + old_level = left.levels[level] + + new_level, left_lev_indexer, right_lev_indexer = \ + old_level.join(right, how=how, return_indexers=True) + + if left_lev_indexer is not None: + left_lev_indexer = com._ensure_int64(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, + len(old_level)) + + new_lev_labels = com.take_nd(rev_indexer, left.labels[level], + allow_fill=False) + omit_mask = new_lev_labels != -1 + + new_labels = list(left.labels) + new_labels[level] = new_lev_labels + + if not omit_mask.all(): + new_labels = [lab[omit_mask] for lab in new_labels] + + new_levels = list(left.levels) + new_levels[level] = new_level + + join_index = MultiIndex(levels=new_levels, labels=new_labels, + names=left.names, verify_integrity=False) + left_indexer = np.arange(len(left))[new_lev_labels != -1] + else: + join_index = left + left_indexer = None + + if right_lev_indexer is not None: + right_indexer = com.take_nd(right_lev_indexer, + join_index.labels[level], + allow_fill=False) + else: + right_indexer = join_index.labels[level] + + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer + + if return_indexers: + return join_index, left_indexer, right_indexer + else: + return join_index + + def _join_monotonic(self, other, how='left', return_indexers=False): + if self.equals(other): + ret_index = other if how == 'right' else self + if return_indexers: + return ret_index, None, None + else: + return ret_index + + sv = self.values + ov = other.values + + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == 'left': + join_index = self + lidx = None + ridx = self._left_indexer_unique(sv, ov) + elif how == 'right': + join_index = other + lidx = self._left_indexer_unique(ov, sv) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + else: + if how == 'left': + join_index, lidx, ridx = self._left_indexer(sv, ov) + elif how == 'right': + join_index, ridx, lidx = self._left_indexer(other, self) + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + return join_index, lidx, ridx + else: + return join_index + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + return Index(joined, name=name) + + def slice_indexer(self, start=None, end=None, step=None): + """ + For an ordered Index, compute the slice indexer for input labels and + step + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning + end : label, default None + If None, defaults to the end + step : int, default None + + Returns + ------- + indexer : ndarray or slice + + Notes + ----- + This function 
assumes that the data is sorted, so use at your own peril + """ + start_slice, end_slice = self.slice_locs(start, end) + + # return a slice + if np.isscalar(start_slice) and np.isscalar(end_slice): + + # degenerate cases + if start is None and end is None: + return slice(None, None, step) + + return slice(start_slice, end_slice, step) + + # loc indexers + return Index(start_slice) & Index(end_slice) + + def slice_locs(self, start=None, end=None): + """ + For an ordered Index, compute the slice locations for input labels + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning + end : label, default None + If None, defaults to the end + + Returns + ------- + (start, end) : (int, int) + + Notes + ----- + This function assumes that the data is sorted, so use at your own peril + """ + + is_unique = self.is_unique + + def _get_slice(starting_value, offset, search_side, slice_property, + search_value): + if search_value is None: + return starting_value + + try: + slc = self.get_loc(search_value) + + if not is_unique: + + # get_loc will return a boolean array for non_uniques + # if we are not monotonic + if isinstance(slc, np.ndarray): + raise KeyError("cannot peform a slice operation " + "on a non-unique non-monotonic index") + + if isinstance(slc, slice): + slc = getattr(slc, slice_property) + else: + slc += offset + + except KeyError: + if self.is_monotonic: + + # we are duplicated but non-unique + # so if we have an indexer then we are done + # else search for it (GH 7523) + if not is_unique and is_integer(search_value): + slc = search_value + else: + slc = self.searchsorted(search_value, + side=search_side) + else: + raise + return slc + + start_slice = _get_slice(0, offset=0, search_side='left', + slice_property='start', search_value=start) + end_slice = _get_slice(len(self), offset=1, search_side='right', + slice_property='stop', search_value=end) + + return start_slice, end_slice + + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted + + Returns + ------- + new_index : Index + """ + return np.delete(self, loc) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location. Follows + Python list.append semantics for negative values + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + """ + _self = np.asarray(self) + item_idx = Index([item], dtype=self.dtype).values + idx = np.concatenate( + (_self[:loc], item_idx, _self[loc:])) + return Index(idx, name=self.name) + + def drop(self, labels): + """ + Make new Index with passed list of labels deleted + + Parameters + ---------- + labels : array-like + + Returns + ------- + dropped : Index + """ + labels = com._index_labels_to_array(labels) + indexer = self.get_indexer(labels) + mask = indexer == -1 + if mask.any(): + raise ValueError('labels %s not contained in axis' % labels[mask]) + return self.delete(indexer) + + +class Int64Index(Index): + + """ + Immutable ndarray implementing an ordered, sliceable set. The basic object + storing axis labels for all pandas objects. Int64Index is a special case + of `Index` with purely integer labels. This is the default index type used + by the DataFrame and Series ctors when no explicit index is provided by the + user. 
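+
+ For illustration, ``Int64Index([1, 2, 3])`` constructs one directly, and
+ ``Series([0, 1, 2]).index`` is an Int64Index when no explicit index is
+ supplied.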
+ + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: int64) + copy : bool + Make a copy of input ndarray + name : object + Name to be stored in the index + + Notes + ----- + An Index instance can **only** contain hashable objects + """ + + _groupby = _algos.groupby_int64 + _arrmap = _algos.arrmap_int64 + _left_indexer_unique = _algos.left_join_indexer_unique_int64 + _left_indexer = _algos.left_join_indexer_int64 + _inner_indexer = _algos.inner_join_indexer_int64 + _outer_indexer = _algos.outer_join_indexer_int64 + + _engine_type = _index.Int64Engine + + def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): + + if fastpath: + subarr = data.view(cls) + subarr.name = name + return subarr + + # isscalar, generators handled in coerce_to_ndarray + data = cls._coerce_to_ndarray(data) + + if issubclass(data.dtype.type, compat.string_types): + cls._string_data_error(data) + + elif issubclass(data.dtype.type, np.integer): + # don't force the upcast as we may be dealing + # with a platform int + if dtype is None or not issubclass(np.dtype(dtype).type, + np.integer): + dtype = np.int64 + + subarr = np.array(data, dtype=dtype, copy=copy) + else: + subarr = np.array(data, dtype=np.int64, copy=copy) + if len(data) > 0: + if (subarr != data).any(): + raise TypeError('Unsafe NumPy casting to integer, you must' + ' explicitly cast') + + subarr = subarr.view(cls) + subarr.name = name + return subarr + + @property + def inferred_type(self): + return 'integer' + + @property + def asi8(self): + # do not cache or you'll create a memory leak + return self.values.view('i8') + + @property + def is_all_dates(self): + """ + Checks that all the labels are datetime objects + """ + return False + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + # if not isinstance(other, Int64Index): + # return False + + try: + return array_equivalent(self, other) + except TypeError: + # e.g. fails in numpy 1.6 with DatetimeIndex #1681 + return False + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + return Int64Index(joined, name=name) + + +class Float64Index(Index): + + """ + Immutable ndarray implementing an ordered, sliceable set. The basic object + storing axis labels for all pandas objects. Float64Index is a special case + of `Index` with purely floating point labels. 
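+
+ For illustration, ``Float64Index([1.5, 2.0, 3.5])`` constructs one directly,
+ and ``Index([1.5, 2.0, 3.5])`` infers the same type from floating point data.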
+ + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: object) + copy : bool + Make a copy of input ndarray + name : object + Name to be stored in the index + + Notes + ----- + An Float64Index instance can **only** contain hashable objects + """ + + # when this is not longer object dtype this can be changed + _engine_type = _index.Float64Engine + _groupby = _algos.groupby_float64 + _arrmap = _algos.arrmap_float64 + _left_indexer_unique = _algos.left_join_indexer_unique_float64 + _left_indexer = _algos.left_join_indexer_float64 + _inner_indexer = _algos.inner_join_indexer_float64 + _outer_indexer = _algos.outer_join_indexer_float64 + + def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False): + + if fastpath: + subarr = data.view(cls) + subarr.name = name + return subarr + + data = cls._coerce_to_ndarray(data) + + if issubclass(data.dtype.type, compat.string_types): + cls._string_data_error(data) + + if dtype is None: + dtype = np.float64 + + try: + subarr = np.array(data, dtype=dtype, copy=copy) + except: + raise TypeError('Unsafe NumPy casting, you must ' + 'explicitly cast') + + # coerce to float64 for storage + if subarr.dtype != np.float64: + subarr = subarr.astype(np.float64) + + subarr = subarr.view(cls) + subarr.name = name + return subarr + + @property + def inferred_type(self): + return 'floating' + + def astype(self, dtype): + if np.dtype(dtype) not in (np.object, np.float64): + raise TypeError('Setting %s dtype to anything other than ' + 'float64 or object is not supported' % + self.__class__) + return Index(self.values, name=self.name, dtype=dtype) + + def _convert_scalar_indexer(self, key, typ=None): + if typ == 'iloc': + return super(Float64Index, self)._convert_scalar_indexer(key, + typ=typ) + return key + + def _convert_slice_indexer(self, key, typ=None): + """ convert a slice indexer, by definition these are labels + unless we are iloc """ + + # if we are not a slice, then we are done + if not isinstance(key, slice): + return key + + if typ == 'iloc': + return super(Float64Index, self)._convert_slice_indexer(key, + typ=typ) + + # allow floats here + validator = lambda v: v is None or is_integer(v) or is_float(v) + self._validate_slicer(key, validator) + + # translate to locations + return self.slice_indexer(key.start, key.stop, key.step) + + def get_value(self, series, key): + """ we always want to get an index value, never a value """ + if not np.isscalar(key): + raise InvalidIndexError + + from pandas.core.indexing import _maybe_droplevels + from pandas.core.series import Series + + k = _values_from_object(key) + loc = self.get_loc(k) + new_values = series.values[loc] + if np.isscalar(new_values) or new_values is None: + return new_values + + new_index = self[loc] + new_index = _maybe_droplevels(new_index, k) + return Series(new_values, index=new_index, name=series.name) + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self is other: + return True + + # need to compare nans locations and make sure that they are the same + # since nans don't compare equal this is a bit tricky + try: + if not isinstance(other, Float64Index): + other = self._constructor(other) + if self.dtype != other.dtype or self.shape != other.shape: + return False + left, right = self.values, other.values + return ((left == right) | (self._isnan & other._isnan)).all() + except TypeError: + # e.g. 
fails in numpy 1.6 with DatetimeIndex #1681 + return False + + def __contains__(self, other): + if super(Float64Index, self).__contains__(other): + return True + + try: + # if other is a sequence this throws a ValueError + return np.isnan(other) and self.hasnans + except ValueError: + try: + return len(other) <= 1 and _try_get_item(other) in self + except TypeError: + return False + except: + return False + + def get_loc(self, key): + try: + if np.all(np.isnan(key)): + try: + return self._nan_idxs.item() + except ValueError: + return self._nan_idxs + except (TypeError, NotImplementedError): + pass + return super(Float64Index, self).get_loc(key) + + @property + def is_all_dates(self): + """ + Checks that all the labels are datetime objects + """ + return False + + @cache_readonly + def _nan_idxs(self): + w, = self._isnan.nonzero() + return w + + @cache_readonly + def _isnan(self): + return np.isnan(self.values) + + @cache_readonly + def hasnans(self): + return self._isnan.any() + + @cache_readonly + def is_unique(self): + return super(Float64Index, self).is_unique and self._nan_idxs.size < 2 + + def isin(self, values): + """ + Compute boolean array of whether each index value is found in the + passed set of values + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + value_set = set(values) + return lib.ismember_nans(self._array_values(), value_set, + isnull(list(value_set)).any()) + + +class MultiIndex(Index): + + """ + Implements multi-level, a.k.a. hierarchical, index object for pandas + objects + + Parameters + ---------- + levels : sequence of arrays + The unique labels for each level + labels : sequence of arrays + Integers for each level designating which label at each location + sortorder : optional int + Level of sortedness (must be lexicographically sorted by that + level) + names : optional sequence of objects + Names for each of the index levels. + """ + # initialize to zero-length tuples to make everything work + _names = FrozenList() + _levels = FrozenList() + _labels = FrozenList() + _comparables = ['names'] + rename = Index.set_names + + def __new__(cls, levels=None, labels=None, sortorder=None, names=None, + copy=False, verify_integrity=True): + if levels is None or labels is None: + raise TypeError("Must pass both levels and labels") + if len(levels) != len(labels): + raise ValueError('Length of levels and labels must be the same.') + if len(levels) == 0: + raise ValueError('Must pass non-zero number of levels/labels') + if len(levels) == 1: + if names: + name = names[0] + else: + name = None + + return Index(levels[0], name=name, copy=True).take(labels[0]) + + # v3, 0.8.0 + subarr = np.empty(0, dtype=object).view(cls) + # we've already validated levels and labels, so shortcut here + subarr._set_levels(levels, copy=copy, validate=False) + subarr._set_labels(labels, copy=copy, validate=False) + + if names is not None: + # handles name validation + subarr._set_names(names) + + if sortorder is not None: + subarr.sortorder = int(sortorder) + else: + subarr.sortorder = sortorder + + if verify_integrity: + subarr._verify_integrity() + + return subarr + + def _verify_integrity(self): + """Raises ValueError if length of levels and labels don't match or any + label would exceed level bounds""" + # NOTE: Currently does not check, among other things, that cached + # nlevels matches nor that sortorder matches actually sortorder. 
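+ # Illustrative sketch of what does get caught: hypothetical inputs
+ # levels=[['a', 'b']], labels=[[0, 2, 1]] raise below, because the label
+ # value 2 is out of bounds for a two-element level.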
+ labels, levels = self.labels, self.levels + if len(levels) != len(labels): + raise ValueError("Length of levels and labels must match. NOTE:" + " this index is in an inconsistent state.") + label_length = len(self.labels[0]) + for i, (level, label) in enumerate(zip(levels, labels)): + if len(label) != label_length: + raise ValueError("Unequal label lengths: %s" % ( + [len(lab) for lab in labels])) + if len(label) and label.max() >= len(level): + raise ValueError("On level %d, label max (%d) >= length of" + " level (%d). NOTE: this index is in an" + " inconsistent state" % (i, label.max(), + len(level))) + + def _get_levels(self): + return self._levels + + def _set_levels(self, levels, copy=False, validate=True, + verify_integrity=False): + # This is NOT part of the levels property because it should be + # externally not allowed to set levels. User beware if you change + # _levels directly + if validate and len(levels) == 0: + raise ValueError('Must set non-zero number of levels.') + if validate and len(levels) != len(self._labels): + raise ValueError('Length of levels must match length of labels.') + levels = FrozenList(_ensure_index(lev, copy=copy)._shallow_copy() + for lev in levels) + names = self.names + self._levels = levels + if any(names): + self._set_names(names) + + self._tuples = None + self._reset_cache() + + if verify_integrity: + self._verify_integrity() + + def set_levels(self, levels, inplace=False, verify_integrity=True): + """ + Set new levels on MultiIndex. Defaults to returning + new index. + + Parameters + ---------- + levels : sequence + new levels to apply + inplace : bool + if True, mutates in place + verify_integrity : bool (default True) + if True, checks that levels and labels are compatible + + Returns + ------- + new index (of same type and class...etc) + """ + if not com.is_list_like(levels) or not com.is_list_like(levels[0]): + raise TypeError("Levels must be list of lists-like") + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._reset_identity() + idx._set_levels(levels, validate=True, + verify_integrity=verify_integrity) + if not inplace: + return idx + + # remove me in 0.14 and change to read only property + __set_levels = deprecate("setting `levels` directly", + partial(set_levels, inplace=True, + verify_integrity=True), + alt_name="set_levels") + levels = property(fget=_get_levels, fset=__set_levels) + + def _get_labels(self): + return self._labels + + def _set_labels(self, labels, copy=False, validate=True, + verify_integrity=False): + if validate and len(labels) != self.nlevels: + raise ValueError("Length of labels must match length of levels") + self._labels = FrozenList( + _ensure_frozen(labs, copy=copy)._shallow_copy() for labs in labels) + self._tuples = None + self._reset_cache() + + if verify_integrity: + self._verify_integrity() + + def set_labels(self, labels, inplace=False, verify_integrity=True): + """ + Set new labels on MultiIndex. Defaults to returning + new index. 
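+
+ For illustration, given a hypothetical two-level MultiIndex ``mi`` of
+ length 4, ``mi.set_labels([[1, 0, 0, 1], [0, 1, 0, 1]])`` returns a new
+ index with the same levels but reassigned integer positions.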
+ + Parameters + ---------- + labels : sequence of arrays + new labels to apply + inplace : bool + if True, mutates in place + verify_integrity : bool (default True) + if True, checks that levels and labels are compatible + + Returns + ------- + new index (of same type and class...etc) + """ + if not com.is_list_like(labels) or not com.is_list_like(labels[0]): + raise TypeError("Labels must be list of lists-like") + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._reset_identity() + idx._set_labels(labels, verify_integrity=verify_integrity) + if not inplace: + return idx + + # remove me in 0.14 and change to readonly property + __set_labels = deprecate("setting labels directly", + partial(set_labels, inplace=True, + verify_integrity=True), + alt_name="set_labels") + labels = property(fget=_get_labels, fset=__set_labels) + + def copy(self, names=None, dtype=None, levels=None, labels=None, + deep=False): + """ + Make a copy of this object. Names, dtype, levels and labels can be + passed and will be set on new copy. + + Parameters + ---------- + names : sequence, optional + dtype : numpy dtype or pandas type, optional + levels : sequence, optional + labels : sequence, optional + + Returns + ------- + copy : MultiIndex + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + This could be potentially expensive on large MultiIndex objects. + """ + new_index = np.ndarray.copy(self) + if deep: + from copy import deepcopy + levels = levels if levels is not None else deepcopy(self.levels) + labels = labels if labels is not None else deepcopy(self.labels) + names = names if names is not None else deepcopy(self.names) + if levels is not None: + new_index = new_index.set_levels(levels) + if labels is not None: + new_index = new_index.set_labels(labels) + if names is not None: + new_index = new_index.set_names(names) + if dtype: + new_index = new_index.astype(dtype) + return new_index + + def __array_finalize__(self, obj): + """ + Update custom MultiIndex attributes when a new array is created by + numpy, e.g. when calling ndarray.view() + """ + # overriden if a view + self._reset_identity() + if not isinstance(obj, type(self)): + # Only relevant if this array is being created from an Index + # instance. + return + + # skip the validation on first, rest will catch the errors + self._set_levels(getattr(obj, 'levels', []), validate=False) + self._set_labels(getattr(obj, 'labels', [])) + self._set_names(getattr(obj, 'names', [])) + self.sortorder = getattr(obj, 'sortorder', None) + + def _array_values(self): + # hack for various methods + return self.values + + @cache_readonly + def dtype(self): + return np.dtype('O') + + def __repr__(self): + encoding = get_option('display.encoding') + attrs = [('levels', default_pprint(self.levels)), + ('labels', default_pprint(self.labels))] + if not all(name is None for name in self.names): + attrs.append(('names', default_pprint(self.names))) + if self.sortorder is not None: + attrs.append(('sortorder', default_pprint(self.sortorder))) + + space = ' ' * (len(self.__class__.__name__) + 1) + prepr = (u(",\n%s") % space).join([u("%s=%s") % (k, v) + for k, v in attrs]) + res = u("%s(%s)") % (self.__class__.__name__, prepr) + + if not compat.PY3: + # needs to be str in Python 2 + res = res.encode(encoding) + return res + + def __unicode__(self): + """ + Return a string representation for a particular Index + + Invoked by unicode(df) in py2 only. 
Yields a Unicode String in both + py2/py3. + """ + rows = self.format(names=True) + max_rows = get_option('display.max_rows') + if len(rows) > max_rows: + spaces = (len(rows[0]) - 3) // 2 + centered = ' ' * spaces + half = max_rows // 2 + rows = rows[:half] + [centered + '...' + centered] + rows[-half:] + return "\n".join(rows) + + def __len__(self): + return len(self.labels[0]) + + def _get_names(self): + return FrozenList(level.name for level in self.levels) + + def _set_names(self, values, validate=True): + """ + sets names on levels. WARNING: mutates! + + Note that you generally want to set this *after* changing levels, so + that it only acts on copies""" + values = list(values) + if validate and len(values) != self.nlevels: + raise ValueError('Length of names must match length of levels') + # set the name + for name, level in zip(values, self.levels): + level.rename(name, inplace=True) + + names = property( + fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") + + def _reference_duplicate_name(self, name): + """ + Returns True if the name refered to in self.names is duplicated. + """ + # count the times name equals an element in self.names. + return np.sum(name == np.asarray(self.names)) > 1 + + def _format_native_types(self, **kwargs): + return self.tolist() + + @property + def _constructor(self): + return MultiIndex.from_tuples + + @cache_readonly + def inferred_type(self): + return 'mixed' + + @staticmethod + def _from_elements(values, labels=None, levels=None, names=None, + sortorder=None): + index = values.view(MultiIndex) + index._set_levels(levels) + index._set_labels(labels) + index._set_names(names) + index.sortorder = sortorder + return index + + def _get_level_number(self, level): + try: + count = self.names.count(level) + if count > 1: + raise ValueError('The name %s occurs multiple times, use a ' + 'level number' % level) + level = self.names.index(level) + except ValueError: + if not isinstance(level, int): + raise KeyError('Level %s not found' % str(level)) + elif level < 0: + level += self.nlevels + # Note: levels are zero-based + elif level >= self.nlevels: + raise IndexError('Too many levels: Index has only %d levels, ' + 'not %d' % (self.nlevels, level + 1)) + return level + + _tuples = None + + @property + def values(self): + if self._is_v2: + return self.view(np.ndarray) + else: + if self._tuples is not None: + return self._tuples + + values = [] + for lev, lab in zip(self.levels, self.labels): + taken = com.take_1d(lev.values, lab) + # Need to box timestamps, etc. 
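+ # Illustrative note: a DatetimeIndex level boxes the raw datetime64
+ # values taken above back into Timestamp objects, so the tuples built
+ # below hold scalar objects rather than raw numpy values.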
+ if hasattr(lev, '_box_values'): + taken = lev._box_values(taken) + values.append(taken) + + self._tuples = lib.fast_zip(values) + return self._tuples + + # fml + @property + def _is_v1(self): + contents = self.view(np.ndarray) + return len(contents) > 0 and not isinstance(contents[0], tuple) + + @property + def _is_v2(self): + contents = self.view(np.ndarray) + return len(contents) > 0 and isinstance(contents[0], tuple) + + @property + def _has_complex_internals(self): + # to disable groupby tricks + return True + + @property + def has_duplicates(self): + """ + Return True if there are no unique groups + """ + # has duplicates + shape = [len(lev) for lev in self.levels] + group_index = np.zeros(len(self), dtype='i8') + for i in range(len(shape)): + stride = np.prod([x for x in shape[i + 1:]], dtype='i8') + group_index += self.labels[i] * stride + + if len(np.unique(group_index)) < len(group_index): + return True + + return False + + def get_value(self, series, key): + # somewhat broken encapsulation + from pandas.core.indexing import _maybe_droplevels + from pandas.core.series import Series + + # Label-based + s = _values_from_object(series) + k = _values_from_object(key) + + def _try_mi(k): + # TODO: what if a level contains tuples?? + loc = self.get_loc(k) + new_values = series.values[loc] + new_index = self[loc] + new_index = _maybe_droplevels(new_index, k) + return Series(new_values, index=new_index, name=series.name) + + try: + return self._engine.get_value(s, k) + except KeyError as e1: + try: + return _try_mi(key) + except KeyError: + pass + + try: + return _index.get_value_at(s, k) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if com.is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + + # a Timestamp will raise a TypeError in a multi-index + # rather than a KeyError, try it here + # note that a string that 'looks' like a Timestamp will raise + # a KeyError! 
(GH5725) + if isinstance(key, (datetime.datetime, np.datetime64)) or ( + compat.PY3 and isinstance(key, compat.string_types)): + try: + return _try_mi(key) + except (KeyError): + raise + except: + pass + + try: + return _try_mi(Timestamp(key)) + except: + pass + + raise InvalidIndexError(key) + + def get_level_values(self, level): + """ + Return vector of label values for requested level, equal to the length + of the index + + Parameters + ---------- + level : int or level name + + Returns + ------- + values : ndarray + """ + num = self._get_level_number(level) + unique = self.levels[num] # .values + labels = self.labels[num] + filled = com.take_1d(unique.values, labels, fill_value=unique._na_value) + values = unique._simple_new(filled, self.names[num], + freq=getattr(unique, 'freq', None), + tz=getattr(unique, 'tz', None)) + return values + + def format(self, space=2, sparsify=None, adjoin=True, names=False, + na_rep=None, formatter=None): + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, lab in zip(self.levels, self.labels): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) + + if len(lev) > 0: + + formatted = lev.take(lab).format(formatter=formatter) + + # we have some NA + mask = lab == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [com.pprint_thing(na if isnull(x) else x, + escape_chars=('\t', '\r', '\n')) + for x in com.take_1d(lev.values, lab)] + stringified_levels.append(formatted) + + result_levels = [] + for lev, name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append(com.pprint_thing(name, + escape_chars=('\t', '\r', '\n')) + if name is not None else '') + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel = '' + # GH3547 + # use value of sparsify as sentinel, unless it's an obvious + # "Truthey" value + if sparsify not in [True, 1]: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = _sparsify(result_levels, + start=int(names), + sentinel=sentinel) + + if adjoin: + return com.adjoin(space, *result_levels).split('\n') + else: + return result_levels + + def to_hierarchical(self, n_repeat, n_shuffle=1): + """ + Return a MultiIndex reshaped to conform to the + shapes given by n_repeat and n_shuffle. + + Useful to replicate and rearrange a MultiIndex for combination + with another Index with n_repeat items. + + Parameters + ---------- + n_repeat : int + Number of times to repeat the labels on self + n_shuffle : int + Controls the reordering of the labels. If the result is going + to be an inner level in a MultiIndex, n_shuffle will need to be + greater than one. The size of each label must divisible by + n_shuffle. 
+ + Returns + ------- + MultiIndex + + Examples + -------- + >>> idx = MultiIndex.from_tuples([(1, u'one'), (1, u'two'), + (2, u'one'), (2, u'two')]) + >>> idx.to_hierarchical(3) + MultiIndex(levels=[[1, 2], [u'one', u'two']], + labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + """ + levels = self.levels + labels = [np.repeat(x, n_repeat) for x in self.labels] + # Assumes that each label is divisible by n_shuffle + labels = [x.reshape(n_shuffle, -1).ravel(1) for x in labels] + names = self.names + return MultiIndex(levels=levels, labels=labels, names=names) + + @property + def is_all_dates(self): + return False + + def is_lexsorted(self): + """ + Return True if the labels are lexicographically sorted + """ + return self.lexsort_depth == self.nlevels + + def is_lexsorted_for_tuple(self, tup): + """ + Return True if we are correctly lexsorted given the passed tuple + """ + return len(tup) <= self.lexsort_depth + + @cache_readonly + def lexsort_depth(self): + if self.sortorder is not None: + if self.sortorder == 0: + return self.nlevels + else: + return 0 + + int64_labels = [com._ensure_int64(lab) for lab in self.labels] + for k in range(self.nlevels, 0, -1): + if lib.is_lexsorted(int64_labels[:k]): + return k + + return 0 + + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=None): + """ + Convert arrays to MultiIndex + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> MultiIndex.from_arrays(arrays, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables + """ + from pandas.core.categorical import Categorical + + if len(arrays) == 1: + name = None if names is None else names[0] + return Index(arrays[0], name=name) + + cats = [Categorical.from_array(arr) for arr in arrays] + levels = [c.levels for c in cats] + labels = [c.labels for c in cats] + if names is None: + names = [c.name for c in cats] + + return MultiIndex(levels=levels, labels=labels, + sortorder=sortorder, names=names, + verify_integrity=False) + + @classmethod + def from_tuples(cls, tuples, sortorder=None, names=None): + """ + Convert list of tuples to MultiIndex + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> tuples = [(1, u'red'), (1, u'blue'), + (2, u'red'), (2, u'blue')] + >>> MultiIndex.from_tuples(tuples, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables + """ + if len(tuples) == 0: + # I think this is right? Not quite sure... 
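+ # An empty sequence of tuples carries no information about how many
+ # levels the index should have, hence the TypeError below rather than
+ # an empty MultiIndex.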
+ raise TypeError('Cannot infer number of levels from empty list') + + if isinstance(tuples, np.ndarray): + if isinstance(tuples, Index): + tuples = tuples.values + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrays = lzip(*tuples) + + return MultiIndex.from_arrays(arrays, sortorder=sortorder, + names=names) + + @classmethod + def from_product(cls, iterables, sortorder=None, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of strings or None + Names for the levels in the index. + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = [u'green', u'purple'] + >>> MultiIndex.from_product([numbers, colors], + names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=[u'number', u'color']) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_tuples : Convert list of tuples to MultiIndex + """ + from pandas.core.categorical import Categorical + from pandas.tools.util import cartesian_product + + categoricals = [Categorical.from_array(it) for it in iterables] + labels = cartesian_product([c.labels for c in categoricals]) + + return MultiIndex(levels=[c.levels for c in categoricals], + labels=labels, sortorder=sortorder, names=names) + + @property + def nlevels(self): + return len(self.levels) + + @property + def levshape(self): + return tuple(len(x) for x in self.levels) + + def __contains__(self, key): + hash(key) + # work around some kind of odd cython bug + try: + self.get_loc(key) + return True + except KeyError: + return False + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(np.ndarray.__reduce__(self)) + subclass_state = ([lev.view(np.ndarray) for lev in self.levels], + [label.view(np.ndarray) for label in self.labels], + self.sortorder, list(self.names)) + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + nd_state, own_state = state + np.ndarray.__setstate__(self, nd_state) + levels, labels, sortorder, names = own_state + + self._set_levels([Index(x) for x in levels], validate=False) + self._set_labels(labels) + self._set_names(names) + self.sortorder = sortorder + self._verify_integrity() + + def __getitem__(self, key): + if np.isscalar(key): + retval = [] + for lev, lab in zip(self.levels, self.labels): + if lab[key] == -1: + retval.append(np.nan) + else: + retval.append(lev[lab[key]]) + + return tuple(retval) + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + sortorder = self.sortorder + else: + # cannot be sure whether the result will be sorted + sortorder = None + + result = np.empty(0, dtype=object).view(type(self)) + new_labels = [lab[key] for lab in self.labels] + + # an optimization + result._set_levels(self.levels, validate=False) + result._set_labels(new_labels) + result.sortorder = sortorder + result._set_names(self.names) + + return result + + def take(self, indexer, axis=None): + """ + Analogous to ndarray.take + 
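+ Illustrative sketch: for a hypothetical MultiIndex ``mi``, ``mi.take([1, 0])``
+ returns a new MultiIndex holding rows 1 and 0, in that order; the levels are
+ reused and only the label arrays are re-taken.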
""" + indexer = com._ensure_platform_int(indexer) + new_labels = [lab.take(indexer) for lab in self.labels] + return MultiIndex(levels=self.levels, labels=new_labels, + names=self.names, verify_integrity=False) + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + if not isinstance(other, (list, tuple)): + other = [other] + + if all((isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other): + arrays = [] + for i in range(self.nlevels): + label = self.get_level_values(i) + appended = [o.get_level_values(i) for o in other] + arrays.append(label.append(appended)) + return MultiIndex.from_arrays(arrays, names=self.names) + + to_concat = (self.values,) + tuple(k.values for k in other) + new_tuples = np.concatenate(to_concat) + + # if all(isinstance(x, MultiIndex) for x in other): + try: + return MultiIndex.from_tuples(new_tuples, names=self.names) + except: + return Index(new_tuples) + + def argsort(self, *args, **kwargs): + return self.values.argsort() + + def drop(self, labels, level=None): + """ + Make new MultiIndex with passed list of labels deleted + + Parameters + ---------- + labels : array-like + Must be a list of tuples + level : int or level name, default None + + Returns + ------- + dropped : MultiIndex + """ + if level is not None: + return self._drop_from_level(labels, level) + + try: + if not isinstance(labels, np.ndarray): + labels = com._index_labels_to_array(labels) + indexer = self.get_indexer(labels) + mask = indexer == -1 + if mask.any(): + raise ValueError('labels %s not contained in axis' + % labels[mask]) + return self.delete(indexer) + except Exception: + pass + + inds = [] + for label in labels: + loc = self.get_loc(label) + if isinstance(loc, int): + inds.append(loc) + else: + inds.extend(lrange(loc.start, loc.stop)) + + return self.delete(inds) + + def _drop_from_level(self, labels, level): + labels = com._index_labels_to_array(labels) + i = self._get_level_number(level) + index = self.levels[i] + values = index.get_indexer(labels) + + mask = ~lib.ismember(self.labels[i], set(values)) + + return self[mask] + + def droplevel(self, level=0): + """ + Return Index with requested level removed. If MultiIndex has only 2 + levels, the result will be of Index type not MultiIndex. + + Parameters + ---------- + level : int/level name or list thereof + + Notes + ----- + Does not check if result index is unique or not + + Returns + ------- + index : Index or MultiIndex + """ + levels = level + if not isinstance(levels, (tuple, list)): + levels = [level] + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + levnums = sorted(self._get_level_number(lev) for lev in levels)[::-1] + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + np.putmask(result, mask, np.nan) + + result.name = new_names[0] + return result + else: + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + def swaplevel(self, i, j): + """ + Swap level i with level j. Do not change the ordering of anything + + Parameters + ---------- + i, j : int, string (can be mixed) + Level of index to be swapped. Can pass level name as string. 
+ + Returns + ------- + swapped : MultiIndex + """ + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + i = self._get_level_number(i) + j = self._get_level_number(j) + + new_levels[i], new_levels[j] = new_levels[j], new_levels[i] + new_labels[i], new_labels[j] = new_labels[j], new_labels[i] + new_names[i], new_names[j] = new_names[j], new_names[i] + + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + def reorder_levels(self, order): + """ + Rearrange levels using input order. May not drop or duplicate levels + + Parameters + ---------- + """ + order = [self._get_level_number(i) for i in order] + if len(order) != self.nlevels: + raise AssertionError(('Length of order must be same as ' + 'number of levels (%d), got %d') + % (self.nlevels, len(order))) + new_levels = [self.levels[i] for i in order] + new_labels = [self.labels[i] for i in order] + new_names = [self.names[i] for i in order] + + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + def __getslice__(self, i, j): + return self.__getitem__(slice(i, j)) + + def sortlevel(self, level=0, ascending=True, sort_remaining=True): + """ + Sort MultiIndex at the requested level. The result will respect the + original ordering of the associated factor at that level. + + Parameters + ---------- + level : list-like, int or str, default 0 + If a string is given, must be a name of the level + If list-like must be names or ints of levels. + ascending : boolean, default True + False to sort in descending order + sort_remaining : sort by the remaining levels after level. + + Returns + ------- + sorted_index : MultiIndex + """ + from pandas.core.groupby import _indexer_from_factorized + + labels = list(self.labels) + shape = list(self.levshape) + + if isinstance(level, (str, int)): + level = [level] + level = [self._get_level_number(lev) for lev in level] + + # partition labels and shape + primary = tuple(labels.pop(lev - i) for i, lev in enumerate(level)) + primshp = tuple(shape.pop(lev - i) for i, lev in enumerate(level)) + + if sort_remaining: + primary += primary + tuple(labels) + primshp += primshp + tuple(shape) + sortorder = None + else: + sortorder = level[0] + + indexer = _indexer_from_factorized(primary, + primshp, + compress=False) + + if not ascending: + indexer = indexer[::-1] + + indexer = com._ensure_platform_int(indexer) + new_labels = [lab.take(indexer) for lab in self.labels] + + new_index = MultiIndex(labels=new_labels, levels=self.levels, + names=self.names, sortorder=sortorder, + verify_integrity=False) + + return new_index, indexer + + def get_indexer(self, target, method=None, limit=None): + """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. 
The mask determines whether labels are + found or not in the current index + + Parameters + ---------- + target : MultiIndex or Index (of tuples) + method : {'pad', 'ffill', 'backfill', 'bfill'} + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Notes + ----- + This is a low-level method and probably should be used at your own risk + + Examples + -------- + >>> indexer, mask = index.get_indexer(new_index) + >>> new_values = cur_values.take(indexer) + >>> new_values[-mask] = np.nan + + Returns + ------- + (indexer, mask) : (ndarray, ndarray) + """ + method = self._get_method(method) + + target = _ensure_index(target) + + target_index = target + if isinstance(target, MultiIndex): + target_index = target._tuple_index + + if target_index.dtype != object: + return np.ones(len(target_index)) * -1 + + if not self.is_unique: + raise Exception('Reindexing only valid with uniquely valued Index ' + 'objects') + + self_index = self._tuple_index + + if method == 'pad': + if not self.is_unique or not self.is_monotonic: + raise AssertionError(('Must be unique and monotonic to ' + 'use forward fill getting the indexer')) + indexer = self_index._engine.get_pad_indexer(target_index, + limit=limit) + elif method == 'backfill': + if not self.is_unique or not self.is_monotonic: + raise AssertionError(('Must be unique and monotonic to ' + 'use backward fill getting the indexer')) + indexer = self_index._engine.get_backfill_indexer(target_index, + limit=limit) + else: + indexer = self_index._engine.get_indexer(target_index) + + return com._ensure_platform_int(indexer) + + def reindex(self, target, method=None, level=None, limit=None, + copy_if_needed=False): + """ + Performs any necessary conversion on the input index and calls + get_indexer. This method is here so MultiIndex and an Index of + like-labeled tuples can play nice together + + Returns + ------- + (new_index, indexer, mask) : (MultiIndex, ndarray, ndarray) + """ + + if level is not None: + if method is not None: + raise TypeError('Fill method not supported if level passed') + target = _ensure_index(target) + target, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) + else: + if self.equals(target): + indexer = None + else: + if self.is_unique: + indexer = self.get_indexer(target, method=method, + limit=limit) + else: + raise Exception( + "cannot handle a non-unique multi-index!") + + if not isinstance(target, MultiIndex): + if indexer is None: + target = self + elif (indexer >= 0).all(): + target = self.take(indexer) + else: + # hopefully? + target = MultiIndex.from_tuples(target) + + return target, indexer + + @cache_readonly + def _tuple_index(self): + """ + Convert MultiIndex to an Index of tuples + + Returns + ------- + index : Index + """ + return Index(self.values) + + def slice_locs(self, start=None, end=None, strict=False): + """ + For an ordered MultiIndex, compute the slice locations for input + labels. They can be tuples representing partial levels, e.g. for a + MultiIndex with 3 levels, you can pass a single value (corresponding to + the first level), or a 1-, 2-, or 3-tuple. 
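+
+ For example, a minimal sketch (hypothetical values, index lexsorted on
+ both levels):
+
+ >>> from pandas import MultiIndex
+ >>> mi = MultiIndex.from_arrays([[1, 1, 2, 2], ['a', 'b', 'a', 'b']])  # hypothetical data
+ >>> mi.slice_locs(start=(1, 'b'), end=(2, 'a'))
+ (1, 3)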
+ + Parameters + ---------- + start : label or tuple, default None + If None, defaults to the beginning + end : label or tuple + If None, defaults to the end + strict : boolean, + + Returns + ------- + (start, end) : (int, int) + + Notes + ----- + This function assumes that the data is sorted by the first level + """ + if start is None: + start_slice = 0 + else: + if not isinstance(start, tuple): + start = start, + start_slice = self._partial_tup_index(start, side='left') + + if end is None: + end_slice = len(self) + else: + if not isinstance(end, tuple): + end = end, + end_slice = self._partial_tup_index(end, side='right') + + return start_slice, end_slice + + def _partial_tup_index(self, tup, side='left'): + if len(tup) > self.lexsort_depth: + raise KeyError('Key length (%d) was greater than MultiIndex' + ' lexsort depth (%d)' % + (len(tup), self.lexsort_depth)) + + n = len(tup) + start, end = 0, len(self) + zipped = zip(tup, self.levels, self.labels) + for k, (lab, lev, labs) in enumerate(zipped): + section = labs[start:end] + + if lab not in lev: + if not lev.is_type_compatible(lib.infer_dtype([lab])): + raise TypeError('Level type mismatch: %s' % lab) + + # short circuit + loc = lev.searchsorted(lab, side=side) + if side == 'right' and loc >= 0: + loc -= 1 + return start + section.searchsorted(loc, side=side) + + idx = lev.get_loc(lab) + if k < n - 1: + end = start + section.searchsorted(idx, side='right') + start = start + section.searchsorted(idx, side='left') + else: + return start + section.searchsorted(idx, side=side) + + def get_loc(self, key): + """ + Get integer location slice for requested label or tuple + + Parameters + ---------- + key : label or tuple + + Returns + ------- + loc : int or slice object + """ + if isinstance(key, tuple): + if len(key) == self.nlevels: + if self.is_unique: + return self._engine.get_loc(_values_from_object(key)) + else: + return slice(*self.slice_locs(key, key)) + else: + # partial selection + result = slice(*self.slice_locs(key, key)) + if result.start == result.stop: + raise KeyError(key) + return result + else: + return self._get_level_indexer(key, level=0) + + def get_loc_level(self, key, level=0, drop_level=True): + """ + Get integer location slice for requested label or tuple + + Parameters + ---------- + key : label or tuple + level : int/level name or list thereof + + Returns + ------- + loc : int or slice object + """ + def _maybe_drop_levels(indexer, levels, drop_level): + if not drop_level: + return self[indexer] + # kludgearound + orig_index = new_index = self[indexer] + levels = [self._get_level_number(i) for i in levels] + for i in sorted(levels, reverse=True): + try: + new_index = new_index.droplevel(i) + except: + + # no dropping here + return orig_index + return new_index + + if isinstance(level, (tuple, list)): + if len(key) != len(level): + raise AssertionError('Key for location must have same ' + 'length as number of levels') + result = None + for lev, k in zip(level, key): + loc, new_index = self.get_loc_level(k, level=lev) + if isinstance(loc, slice): + mask = np.zeros(len(self), dtype=bool) + mask[loc] = True + loc = mask + + result = loc if result is None else result & loc + + return result, _maybe_drop_levels(result, level, drop_level) + + level = self._get_level_number(level) + + # kludge for #1796 + if isinstance(key, list): + key = tuple(key) + + if isinstance(key, tuple) and level == 0: + + try: + if key in self.levels[0]: + indexer = self._get_level_indexer(key, level=level) + new_index = 
_maybe_drop_levels(indexer, [0], drop_level) + return indexer, new_index + except TypeError: + pass + + if not any(isinstance(k, slice) for k in key): + + # partial selection + def partial_selection(key): + indexer = slice(*self.slice_locs(key, key)) + if indexer.start == indexer.stop: + raise KeyError(key) + ilevels = [i for i in range(len(key)) + if key[i] != slice(None, None)] + return indexer, _maybe_drop_levels(indexer, ilevels, + drop_level) + + if len(key) == self.nlevels: + + if self.is_unique: + + # here we have a completely specified key, but are + # using some partial string matching here + # GH4758 + can_index_exactly = any([ + (l.is_all_dates and + not isinstance(k, compat.string_types)) + for k, l in zip(key, self.levels) + ]) + if any([ + l.is_all_dates for k, l in zip(key, self.levels) + ]) and not can_index_exactly: + indexer = slice(*self.slice_locs(key, key)) + + # we have a multiple selection here + if not indexer.stop - indexer.start == 1: + return partial_selection(key) + + key = tuple(self[indexer].tolist()[0]) + + return (self._engine.get_loc(_values_from_object(key)), + None) + else: + return partial_selection(key) + else: + return partial_selection(key) + else: + indexer = None + for i, k in enumerate(key): + if not isinstance(k, slice): + k = self._get_level_indexer(k, level=i) + if isinstance(k, slice): + # everything + if k.start == 0 and k.stop == len(self): + k = slice(None, None) + else: + k_index = k + + if isinstance(k, slice): + if k == slice(None, None): + continue + else: + raise TypeError(key) + + if indexer is None: + indexer = k_index + else: # pragma: no cover + indexer &= k_index + if indexer is None: + indexer = slice(None, None) + ilevels = [i for i in range(len(key)) + if key[i] != slice(None, None)] + return indexer, _maybe_drop_levels(indexer, ilevels, + drop_level) + else: + indexer = self._get_level_indexer(key, level=level) + return indexer, _maybe_drop_levels(indexer, [level], drop_level) + + def _get_level_indexer(self, key, level=0): + # return a boolean indexer or a slice showing where the key is + # in the totality of values + + level_index = self.levels[level] + labels = self.labels[level] + + if isinstance(key, slice): + # handle a slice, returnig a slice if we can + # otherwise a boolean indexer + + start = level_index.get_loc(key.start or 0) + stop = level_index.get_loc(key.stop or len(level_index)-1) + step = key.step + + if isinstance(start,slice) or isinstance(stop,slice): + # we have a slice for start and/or stop + # a partial date slicer on a DatetimeIndex generates a slice + # note that the stop ALREADY includes the stopped point (if + # it was a string sliced) + m = np.zeros(len(labels),dtype=bool) + m[np.in1d(labels,np.arange(start.start,stop.stop,step))] = True + return m + + elif level > 0 or self.lexsort_depth == 0 or step is not None: + # need to have like semantics here to right + # searching as when we are using a slice + # so include the stop+1 (so we include stop) + m = np.zeros(len(labels),dtype=bool) + m[np.in1d(labels,np.arange(start,stop+1,step))] = True + return m + else: + # sorted, so can return slice object -> view + i = labels.searchsorted(start, side='left') + j = labels.searchsorted(stop, side='right') + return slice(i, j, step) + + else: + + loc = level_index.get_loc(key) + if level > 0 or self.lexsort_depth == 0: + return np.array(labels == loc,dtype=bool) + else: + # sorted, so can return slice object -> view + i = labels.searchsorted(loc, side='left') + j = labels.searchsorted(loc, side='right') + 
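+ # on a lexsorted level the label codes are monotonically non-decreasing,
+ # so every occurrence of `loc` sits in one contiguous run; the two
+ # searchsorted calls bound that run (e.g. codes [0, 0, 1, 1, 2] with
+ # loc == 1 give slice(2, 4)), avoiding a full boolean mask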
return slice(i, j) + + def get_locs(self, tup): + """ + Given a tuple of slices/lists/labels/boolean indexer to a level-wise spec + produce an indexer to extract those locations + + Parameters + ---------- + key : tuple of (slices/list/labels) + + Returns + ------- + locs : integer list of locations or boolean indexer suitable + for passing to iloc + """ + + from pandas.core.indexing import _is_null_slice + + # must be lexsorted to at least as many levels + if not self.is_lexsorted_for_tuple(tup): + raise KeyError('MultiIndex Slicing requires the index to be fully lexsorted' + ' tuple len ({0}), lexsort depth ({1})'.format(len(tup), self.lexsort_depth)) + + def _convert_indexer(r): + if isinstance(r, slice): + m = np.zeros(len(self),dtype=bool) + m[r] = True + return m + return r + + ranges = [] + for i,k in enumerate(tup): + + if com._is_bool_indexer(k): + # a boolean indexer, must be the same length! + k = np.asarray(k) + if len(k) != len(self): + raise ValueError("cannot index with a boolean indexer that is" + " not the same length as the index") + ranges.append(k) + elif com.is_list_like(k): + # a collection of labels to include from this level (these are or'd) + ranges.append(reduce( + np.logical_or,[ _convert_indexer(self._get_level_indexer(x, level=i) + ) for x in k ])) + elif _is_null_slice(k): + # empty slice + pass + + elif isinstance(k,slice): + + # a slice, include BOTH of the labels + ranges.append(self._get_level_indexer(k,level=i)) + else: + # a single label + ranges.append(self.get_loc_level(k,level=i,drop_level=False)[0]) + + # identity + if len(ranges) == 0: + return slice(0,len(self)) + + elif len(ranges) == 1: + return ranges[0] + + # construct a boolean indexer if we have a slice or boolean indexer + return reduce(np.logical_and,[ _convert_indexer(r) for r in ranges ]) + + def truncate(self, before=None, after=None): + """ + Slice index between two labels / tuples, return new MultiIndex + + Parameters + ---------- + before : label or tuple, can be partial. Default None + None defaults to start + after : label or tuple, can be partial. 
Default None + None defaults to end + + Returns + ------- + truncated : MultiIndex + """ + if after and before and after < before: + raise ValueError('after < before') + + i, j = self.levels[0].slice_locs(before, after) + left, right = self.slice_locs(before, after) + + new_levels = list(self.levels) + new_levels[0] = new_levels[0][i:j] + + new_labels = [lab[left:right] for lab in self.labels] + new_labels[0] = new_labels[0] - i + + return MultiIndex(levels=new_levels, labels=new_labels, + verify_integrity=False) + + def equals(self, other): + """ + Determines if two MultiIndex objects have the same labeling information + (the levels themselves do not necessarily have to be the same) + + See also + -------- + equal_levels + """ + if self.is_(other): + return True + + if not isinstance(other, MultiIndex): + return array_equivalent(self.values, _ensure_index(other)) + + if self.nlevels != other.nlevels: + return False + + if len(self) != len(other): + return False + + for i in range(self.nlevels): + svalues = com.take_nd(self.levels[i].values, self.labels[i], + allow_fill=False) + ovalues = com.take_nd(other.levels[i].values, other.labels[i], + allow_fill=False) + if not array_equivalent(svalues, ovalues): + return False + + return True + + def equal_levels(self, other): + """ + Return True if the levels of both MultiIndex objects are the same + + """ + if self.nlevels != other.nlevels: + return False + + for i in range(self.nlevels): + if not self.levels[i].equals(other.levels[i]): + return False + return True + + def union(self, other): + """ + Form the union of two MultiIndex objects, sorting if possible + + Parameters + ---------- + other : MultiIndex or array / Index of tuples + + Returns + ------- + Index + """ + self._assert_can_do_setop(other) + + if len(other) == 0 or self.equals(other): + return self + + result_names = self.names if self.names == other.names else None + + uniq_tuples = lib.fast_unique_multiple([self.values, other.values]) + return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, + names=result_names) + + def intersection(self, other): + """ + Form the intersection of two MultiIndex objects, sorting if possible + + Parameters + ---------- + other : MultiIndex or array / Index of tuples + + Returns + ------- + Index + """ + self._assert_can_do_setop(other) + + if self.equals(other): + return self + + result_names = self.names if self.names == other.names else None + + self_tuples = self.values + other_tuples = other.values + uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) + if len(uniq_tuples) == 0: + return MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + names=result_names, verify_integrity=False) + else: + return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, + names=result_names) + + def diff(self, other): + """ + Compute sorted set difference of two MultiIndex objects + + Returns + ------- + diff : MultiIndex + """ + self._assert_can_do_setop(other) + + if not isinstance(other, MultiIndex): + if len(other) == 0: + return self + try: + other = MultiIndex.from_tuples(other) + except: + raise TypeError('other must be a MultiIndex or a list of' + ' tuples') + result_names = self.names + else: + result_names = self.names if self.names == other.names else None + + if self.equals(other): + return MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + names=result_names, verify_integrity=False) + + difference = sorted(set(self.values) - set(other.values)) + + if len(difference) == 0: + return 
MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + names=result_names, verify_integrity=False) + else: + return MultiIndex.from_tuples(difference, sortorder=0, + names=result_names) + + def _assert_can_do_setop(self, other): + pass + + def astype(self, dtype): + if np.dtype(dtype) != np.object_: + raise TypeError('Setting %s dtype to anything other than object ' + 'is not supported' % self.__class__) + return self._shallow_copy() + + def insert(self, loc, item): + """ + Make new MultiIndex inserting new item at location + + Parameters + ---------- + loc : int + item : tuple + Must be same length as number of levels in the MultiIndex + + Returns + ------- + new_index : Index + """ + # Pad the key with empty strings if lower levels of the key + # aren't specified: + if not isinstance(item, tuple): + item = (item,) + ('',) * (self.nlevels - 1) + elif len(item) != self.nlevels: + raise ValueError( + 'Item must have length equal to number of levels.') + + new_levels = [] + new_labels = [] + for k, level, labels in zip(item, self.levels, self.labels): + if k not in level: + # have to insert into level + # must insert at end otherwise you have to recompute all the + # other labels + lev_loc = len(level) + level = level.insert(lev_loc, k) + else: + lev_loc = level.get_loc(k) + + new_levels.append(level) + new_labels.append(np.insert(labels, loc, lev_loc)) + + return MultiIndex(levels=new_levels, labels=new_labels, + names=self.names, verify_integrity=False) + + def delete(self, loc): + """ + Make new index with passed location deleted + + Returns + ------- + new_index : MultiIndex + """ + new_labels = [np.delete(lab, loc) for lab in self.labels] + return MultiIndex(levels=self.levels, labels=new_labels, + names=self.names, verify_integrity=False) + + get_major_bounds = slice_locs + + __bounds = None + + @property + def _bounds(self): + """ + Return or compute and return slice points for level 0, assuming + sortedness + """ + if self.__bounds is None: + inds = np.arange(len(self.levels[0])) + self.__bounds = self.labels[0].searchsorted(inds) + + return self.__bounds + + def _wrap_joined_index(self, joined, other): + names = self.names if self.names == other.names else None + return MultiIndex.from_tuples(joined, names=names) + + +# For utility purposes + +def _sparsify(label_list, start=0, sentinel=''): + pivoted = lzip(*label_list) + k = len(label_list) + + result = pivoted[:start + 1] + prev = pivoted[start] + + for cur in pivoted[start + 1:]: + sparse_cur = [] + + for i, (p, t) in enumerate(zip(prev, cur)): + if i == k - 1: + sparse_cur.append(t) + result.append(sparse_cur) + break + + if p == t: + sparse_cur.append(sentinel) + else: + sparse_cur.extend(cur[i:]) + result.append(sparse_cur) + break + + prev = cur + + return lzip(*result) + + +def _ensure_index(index_like, copy=False): + if isinstance(index_like, Index): + if copy: + index_like = index_like.copy() + return index_like + if hasattr(index_like, 'name'): + return Index(index_like, name=index_like.name, copy=copy) + + # must check for exactly list here because of strict type + # check in clean_index_list + if isinstance(index_like, list): + if type(index_like) != list: + index_like = list(index_like) + # 2200 ? 
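+ # judging from how the result is used below, clean_index_list also
+ # reports whether every element of the list is itself array-like; if so,
+ # the list is treated as one array per level and becomes a MultiIndex,
+ # otherwise the converted values fall through to a flat Index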
+ converted, all_arrays = lib.clean_index_list(index_like) + + if len(converted) > 0 and all_arrays: + return MultiIndex.from_arrays(converted) + else: + index_like = converted + else: + # clean_index_list does the equivalent of copying + # so only need to do this if not list instance + if copy: + from copy import copy + index_like = copy(index_like) + + return Index(index_like) + + +def _ensure_frozen(array_like, copy=False): + array_like = np.asanyarray(array_like, dtype=np.int_) + array_like = array_like.view(FrozenNDArray) + if copy: + array_like = array_like.copy() + return array_like + + +def _validate_join_method(method): + if method not in ['left', 'right', 'inner', 'outer']: + raise ValueError('do not recognize join method %s' % method) + + +# TODO: handle index names! +def _get_combined_index(indexes, intersect=False): + indexes = _get_distinct_indexes(indexes) + if len(indexes) == 0: + return Index([]) + if len(indexes) == 1: + return indexes[0] + if intersect: + index = indexes[0] + for other in indexes[1:]: + index = index.intersection(other) + return index + union = _union_indexes(indexes) + return _ensure_index(union) + + +def _get_distinct_indexes(indexes): + return list(dict((id(x), x) for x in indexes).values()) + + +def _union_indexes(indexes): + if len(indexes) == 0: + raise AssertionError('Must have at least 1 Index to union') + if len(indexes) == 1: + result = indexes[0] + if isinstance(result, list): + result = Index(sorted(result)) + return result + + indexes, kind = _sanitize_and_check(indexes) + + if kind == 'special': + result = indexes[0] + + if hasattr(result, 'union_many'): + return result.union_many(indexes[1:]) + else: + for other in indexes[1:]: + result = result.union(other) + return result + elif kind == 'array': + index = indexes[0] + for other in indexes[1:]: + if not index.equals(other): + return Index(lib.fast_unique_multiple(indexes)) + + return index + else: + return Index(lib.fast_unique_multiple_list(indexes)) + + +def _trim_front(strings): + """ + Trims zeros and decimal points + """ + trimmed = strings + while len(strings) > 0 and all([x[0] == ' ' for x in trimmed]): + trimmed = [x[1:] for x in trimmed] + return trimmed + + +def _sanitize_and_check(indexes): + kinds = list(set([type(index) for index in indexes])) + + if list in kinds: + if len(kinds) > 1: + indexes = [Index(com._try_sort(x)) + if not isinstance(x, Index) else x + for x in indexes] + kinds.remove(list) + else: + return indexes, 'list' + + if len(kinds) > 1 or Index not in kinds: + return indexes, 'special' + else: + return indexes, 'array' + + +def _get_consensus_names(indexes): + + # find the non-none names, need to tupleify to make + # the set hashable, then reverse on return + consensus_names = set([ + tuple(i.names) for i in indexes if all(n is not None for n in i.names) + ]) + if len(consensus_names) == 1: + return list(list(consensus_names)[0]) + return [None] * indexes[0].nlevels + + +def _maybe_box(idx): + from pandas.tseries.api import DatetimeIndex, PeriodIndex + klasses = DatetimeIndex, PeriodIndex + + if isinstance(idx, klasses): + return idx.asobject + return idx + + +def _all_indexes_same(indexes): + first = indexes[0] + for index in indexes[1:]: + if not first.equals(index): + return False + return True + + +def _get_na_rep(dtype): + return {np.datetime64: 'NaT', np.timedelta64: 'NaT'}.get(dtype, 'NaN') + + +def _get_na_value(dtype): + return {np.datetime64: tslib.NaT, np.timedelta64: tslib.NaT}.get(dtype, + np.nan) diff --git a/pandas/core/indexing.py 
b/pandas/core/indexing.py new file mode 100644 index 00000000..d387cb64 --- /dev/null +++ b/pandas/core/indexing.py @@ -0,0 +1,1706 @@ +# pylint: disable=W0223 + +from datetime import datetime +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.compat import range, zip +import pandas.compat as compat +import pandas.core.common as com +from pandas.core.common import (_is_bool_indexer, is_integer_dtype, + _asarray_tuplesafe, is_list_like, isnull, + ABCSeries, ABCDataFrame, ABCPanel, is_float) +import pandas.lib as lib + +import numpy as np + +# the supported indexers +def get_indexers_list(): + + return [ + ('ix', _IXIndexer), + ('iloc', _iLocIndexer), + ('loc', _LocIndexer), + ('at', _AtIndexer), + ('iat', _iAtIndexer), + ] + +# "null slice" +_NS = slice(None, None) + +# the public IndexSlicerMaker +class _IndexSlice(object): + def __getitem__(self, arg): + return arg +IndexSlice = _IndexSlice() + +class IndexingError(Exception): + pass + +class _NDFrameIndexer(object): + _valid_types = None + _exception = KeyError + + def __init__(self, obj, name): + self.obj = obj + self.ndim = obj.ndim + self.name = name + self.axis = None + + def __call__(self, *args, **kwargs): + # we need to return a copy of ourselves + self = self.__class__(self.obj, self.name) + + # set the passed in values + for k, v in compat.iteritems(kwargs): + setattr(self,k,v) + return self + + def __iter__(self): + raise NotImplementedError('ix is not iterable') + + def __getitem__(self, key): + if type(key) is tuple: + try: + values = self.obj.get_value(*key) + if np.isscalar(values): + return values + except Exception: + pass + + return self._getitem_tuple(key) + else: + return self._getitem_axis(key, axis=0) + + def _get_label(self, label, axis=0): + if self.ndim == 1: + # for perf reasons we want to try _xs first + # as its basically direct indexing + # but will fail when the index is not present + # see GH5667 + try: + return self.obj._xs(label, axis=axis) + except: + return self.obj[label] + elif (isinstance(label, tuple) and + isinstance(label[axis], slice)): + raise IndexingError('no slices here, handle elsewhere') + + return self.obj._xs(label, axis=axis) + + def _get_loc(self, key, axis=0): + return self.obj._ixs(key, axis=axis) + + def _slice(self, obj, axis=0, typ=None): + return self.obj._slice(obj, axis=axis, typ=typ) + + def __setitem__(self, key, value): + + if self.axis is not None: + indexer = self._convert_tuple(key, is_setter=True) + + else: + + # kludgetastic + ax = self.obj._get_axis(0) + if isinstance(ax, MultiIndex): + try: + indexer = ax.get_loc(key) + self._setitem_with_indexer(indexer, value) + return + except Exception: + pass + + if isinstance(key, tuple): + if len(key) > self.ndim: + raise IndexingError('only tuples of length <= %d supported' % + self.ndim) + indexer = self._convert_tuple(key, is_setter=True) + else: + indexer = self._convert_to_indexer(key, is_setter=True) + + self._setitem_with_indexer(indexer, value) + + def _has_valid_type(self, k, axis): + raise NotImplementedError() + + def _has_valid_tuple(self, key): + """ check the key for valid keys across my indexer """ + for i, k in enumerate(key): + if i >= self.obj.ndim: + raise IndexingError('Too many indexers') + if not self._has_valid_type(k, i): + raise ValueError("Location based indexing can only have [%s] " + "types" % self._valid_types) + + def _is_nested_tuple_indexer(self, tup): + if any([ isinstance(ax, MultiIndex) for ax in self.obj.axes ]): + return any([ _is_nested_tuple(tup,ax) for ax in 
self.obj.axes ]) + return False + + def _convert_tuple(self, key, is_setter=False): + keyidx = [] + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + for i in range(self.ndim): + if i == axis: + keyidx.append(self._convert_to_indexer(key, axis=axis, is_setter=is_setter)) + else: + keyidx.append(slice(None)) + else: + for i, k in enumerate(key): + idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) + keyidx.append(idx) + return tuple(keyidx) + + def _convert_scalar_indexer(self, key, axis): + # if we are accessing via lowered dim, use the last dim + ax = self.obj._get_axis(min(axis, self.ndim - 1)) + # a scalar + return ax._convert_scalar_indexer(key, typ=self.name) + + def _convert_slice_indexer(self, key, axis): + # if we are accessing via lowered dim, use the last dim + ax = self.obj._get_axis(min(axis, self.ndim - 1)) + return ax._convert_slice_indexer(key, typ=self.name) + + def _has_valid_setitem_indexer(self, indexer): + return True + + def _has_valid_positional_setitem_indexer(self, indexer): + """ validate that an positional indexer cannot enlarge its target + will raise if needed, does not modify the indexer externally """ + if isinstance(indexer, dict): + raise IndexError("{0} cannot enlarge its target object" + .format(self.name)) + else: + if not isinstance(indexer, tuple): + indexer = self._tuplify(indexer) + for ax, i in zip(self.obj.axes, indexer): + if isinstance(i, slice): + # should check the stop slice? + pass + elif is_list_like(i): + # should check the elements? + pass + elif com.is_integer(i): + if i >= len(ax): + raise IndexError("{0} cannot enlarge its target object" + .format(self.name)) + elif isinstance(i, dict): + raise IndexError("{0} cannot enlarge its target object" + .format(self.name)) + + return True + + def _setitem_with_indexer(self, indexer, value): + + self._has_valid_setitem_indexer(indexer) + + # also has the side effect of consolidating in-place + from pandas import Panel, DataFrame, Series + + # maybe partial set + take_split_path = self.obj._is_mixed_type + if isinstance(indexer, tuple): + nindexer = [] + for i, idx in enumerate(indexer): + if isinstance(idx, dict): + + # reindex the axis to the new value + # and set inplace + key, _ = _convert_missing_indexer(idx) + + # if this is the items axes, then take the main missing + # path first + # this correctly sets the dtype and avoids cache issues + # essentially this separates out the block that is needed + # to possibly be modified + if self.ndim > 1 and i == self.obj._info_axis_number: + + # add the new item, and set the value + # must have all defined axes if we have a scalar + # or a list-like on the non-info axes if we have a + # list-like + len_non_info_axes = [ + len(_ax) for _i, _ax in enumerate(self.obj.axes) + if _i != i + ] + if any([not l for l in len_non_info_axes]): + if not is_list_like(value): + raise ValueError("cannot set a frame with no " + "defined index and a scalar") + self.obj[key] = value + return self.obj + + self.obj[key] = np.nan + + new_indexer = _convert_from_missing_indexer_tuple( + indexer, self.obj.axes) + self._setitem_with_indexer(new_indexer, value) + return self.obj + + # reindex the axis + # make sure to clear the cache because we are + # just replacing the block manager here + # so the object is the same + index = self.obj._get_axis(i) + labels = _safe_append_to_index(index, key) + self.obj._data = self.obj.reindex_axis(labels, i)._data + self.obj._maybe_update_cacher(clear=True) + self.obj.is_copy=None + + if 
isinstance(labels, MultiIndex): + self.obj.sortlevel(inplace=True) + labels = self.obj._get_axis(i) + + nindexer.append(labels.get_loc(key)) + + else: + nindexer.append(idx) + + indexer = tuple(nindexer) + else: + + indexer, missing = _convert_missing_indexer(indexer) + + if missing: + + # reindex the axis to the new value + # and set inplace + if self.ndim == 1: + index = self.obj.index + if len(index) == 0: + new_index = Index([indexer]) + else: + new_index = _safe_append_to_index(index, indexer) + + # this preserves dtype of the value + new_values = Series([value]).values + if len(self.obj.values): + new_values = np.concatenate([self.obj.values, + new_values]) + + self.obj._data = self.obj._constructor( + new_values, index=new_index, name=self.obj.name)._data + self.obj._maybe_update_cacher(clear=True) + return self.obj + + elif self.ndim == 2: + + # no columns and scalar + if not len(self.obj.columns): + raise ValueError( + "cannot set a frame with no defined columns" + ) + + index = self.obj._get_axis(0) + labels = _safe_append_to_index(index, indexer) + self.obj._data = self.obj.reindex_axis(labels, 0)._data + self.obj._maybe_update_cacher(clear=True) + return getattr(self.obj, self.name).__setitem__(indexer, + value) + + # set using setitem (Panel and > dims) + elif self.ndim >= 3: + return self.obj.__setitem__(indexer, value) + + # set + info_axis = self.obj._info_axis_number + item_labels = self.obj._get_axis(info_axis) + + # if we have a complicated setup, take the split path + if (isinstance(indexer, tuple) and + any([isinstance(ax, MultiIndex) for ax in self.obj.axes])): + take_split_path = True + + # align and set the values + if take_split_path: + + if not isinstance(indexer, tuple): + indexer = self._tuplify(indexer) + + if isinstance(value, ABCSeries): + value = self._align_series(indexer, value) + + info_idx = indexer[info_axis] + if com.is_integer(info_idx): + info_idx = [info_idx] + labels = item_labels[info_idx] + + # if we have a partial multiindex, then need to adjust the plane + # indexer here + if (len(labels) == 1 and + isinstance(self.obj[labels[0]].index, MultiIndex)): + item = labels[0] + obj = self.obj[item] + index = obj.index + idx = indexer[:info_axis][0] + + plane_indexer = tuple([idx]) + indexer[info_axis + 1:] + lplane_indexer = _length_of_indexer(plane_indexer[0], index) + + # require that we are setting the right number of values that + # we are indexing + if is_list_like(value) and np.iterable(value) and lplane_indexer != len(value): + + if len(obj[idx]) != len(value): + raise ValueError( + "cannot set using a multi-index selection indexer " + "with a different length than the value" + ) + + # make sure we have an ndarray + value = getattr(value,'values',value).ravel() + + # we can directly set the series here + # as we select a slice indexer on the mi + idx = index._convert_slice_indexer(idx) + obj = obj.copy() + obj._data = obj._data.setitem(indexer=tuple([idx]), value=value) + self.obj[item] = obj + return + + # non-mi + else: + plane_indexer = indexer[:info_axis] + indexer[info_axis + 1:] + if info_axis > 0: + plane_axis = self.obj.axes[:info_axis][0] + lplane_indexer = _length_of_indexer(plane_indexer[0], + plane_axis) + else: + lplane_indexer = 0 + + def setter(item, v): + s = self.obj[item] + pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer + + # perform the equivalent of a setitem on the info axis + # as we have a null slice which means essentially reassign to the columns + # of a multi-dim object + # GH6149 + if isinstance(pi, 
tuple) and all(_is_null_slice(idx) for idx in pi): + s = v + else: + # set the item, possibly having a dtype change + s = s.copy() + s._data = s._data.setitem(indexer=pi, value=v) + s._maybe_update_cacher(clear=True) + + # reset the sliced object if unique + self.obj[item] = s + + def can_do_equal_len(): + """ return True if we have an equal len settable """ + if not len(labels) == 1 or not np.iterable(value): + return False + + l = len(value) + item = labels[0] + index = self.obj[item].index + + # equal len list/ndarray + if len(index) == l: + return True + elif lplane_indexer == l: + return True + + return False + + if _is_list_like(value): + + # we have an equal len Frame + if isinstance(value, ABCDataFrame) and value.ndim > 1: + + for item in labels: + + # align to + if item in value: + v = value[item] + i = self.obj[item].index + v = v.reindex(i & v.index) + + setter(item, v.values) + else: + setter(item, np.nan) + + # we have an equal len ndarray/convertible to our labels + elif np.array(value).ndim == 2: + + # note that this coerces the dtype if we are mixed + # GH 7551 + value = np.array(value,dtype=object) + if len(labels) != value.shape[1]: + raise ValueError('Must have equal len keys and value ' + 'when setting with an ndarray') + + for i, item in enumerate(labels): + + # setting with a list, recoerces + setter(item, value[:, i].tolist()) + + # we have an equal len list/ndarray + elif can_do_equal_len(): + setter(labels[0], value) + + # per label values + else: + + if len(labels) != len(value): + raise ValueError('Must have equal len keys and value ' + 'when setting with an iterable') + + for item, v in zip(labels, value): + setter(item, v) + else: + + # scalar + for item in labels: + setter(item, value) + + else: + if isinstance(indexer, tuple): + indexer = _maybe_convert_ix(*indexer) + + if isinstance(value, ABCSeries): + value = self._align_series(indexer, value) + + elif isinstance(value, ABCDataFrame): + value = self._align_frame(indexer, value) + + if isinstance(value, ABCPanel): + value = self._align_panel(indexer, value) + + # actually do the set + self.obj._data = self.obj._data.setitem(indexer=indexer, value=value) + self.obj._maybe_update_cacher(clear=True) + + def _align_series(self, indexer, ser): + # indexer to assign Series can be tuple, slice, scalar + if isinstance(indexer, (slice, np.ndarray, list)): + indexer = tuple([indexer]) + + if isinstance(indexer, tuple): + + aligners = [not _is_null_slice(idx) for idx in indexer] + sum_aligners = sum(aligners) + single_aligner = sum_aligners == 1 + is_frame = self.obj.ndim == 2 + is_panel = self.obj.ndim >= 3 + obj = self.obj + + # are we a single alignable value on a non-primary + # dim (e.g. panel: 1,2, or frame: 0) ? 
+ # hence need to align to a single axis dimension + # rather that find all valid dims + + # frame + if is_frame: + single_aligner = single_aligner and aligners[0] + + # panel + elif is_panel: + single_aligner = (single_aligner and + (aligners[1] or aligners[2])) + + # we have a frame, with multiple indexers on both axes; and a + # series, so need to broadcast (see GH5206) + if (sum_aligners == self.ndim and + all([com._is_sequence(_) for _ in indexer])): + ser = ser.reindex(obj.axes[0][indexer[0].ravel()], + copy=True).values + + # single indexer + if len(indexer) > 1: + l = len(indexer[1].ravel()) + ser = np.tile(ser, l).reshape(l, -1).T + + return ser + + for i, idx in enumerate(indexer): + ax = obj.axes[i] + + # multiple aligners (or null slices) + if com._is_sequence(idx) or isinstance(idx, slice): + if single_aligner and _is_null_slice(idx): + continue + new_ix = ax[idx] + if not is_list_like(new_ix): + new_ix = Index([new_ix]) + else: + new_ix = Index(new_ix.ravel()) + if ser.index.equals(new_ix) or not len(new_ix): + return ser.values.copy() + + return ser.reindex(new_ix).values + + # 2 dims + elif single_aligner and is_frame: + + # reindex along index + ax = self.obj.axes[1] + if ser.index.equals(ax) or not len(ax): + return ser.values.copy() + return ser.reindex(ax).values + + # >2 dims + elif single_aligner: + + broadcast = [] + for n, labels in enumerate(self.obj._get_plane_axes(i)): + + # reindex along the matching dimensions + if len(labels & ser.index): + ser = ser.reindex(labels) + else: + broadcast.append((n, len(labels))) + + # broadcast along other dims + ser = ser.values.copy() + for (axis, l) in broadcast: + shape = [-1] * (len(broadcast) + 1) + shape[axis] = l + ser = np.tile(ser, l).reshape(shape) + + if self.obj.ndim == 3: + ser = ser.T + + return ser + + elif np.isscalar(indexer): + ax = self.obj._get_axis(1) + + if ser.index.equals(ax): + return ser.values.copy() + + return ser.reindex(ax).values + + raise ValueError('Incompatible indexer with Series') + + def _align_frame(self, indexer, df): + is_frame = self.obj.ndim == 2 + is_panel = self.obj.ndim >= 3 + if isinstance(indexer, tuple): + idx, cols = None, None + sindexers = [] + for i, ix in enumerate(indexer): + ax = self.obj.axes[i] + if com._is_sequence(ix) or isinstance(ix, slice): + if idx is None: + idx = ax[ix].ravel() + elif cols is None: + cols = ax[ix].ravel() + else: + break + else: + sindexers.append(i) + + # panel + if is_panel: + if len(sindexers) == 1 and idx is None and cols is None: + if sindexers[0] == 0: + df = df.T + return self.obj.conform(df, axis=sindexers[0]) + df = df.T + + if idx is not None and cols is not None: + if df.index.equals(idx) and df.columns.equals(cols): + val = df.copy().values + else: + val = df.reindex(idx, columns=cols).values + return val + + elif ((isinstance(indexer, slice) or com.is_list_like(indexer)) + and is_frame): + ax = self.obj.index[indexer] + if df.index.equals(ax): + val = df.copy().values + else: + + # we have a multi-index and are trying to align + # with a particular, level GH3738 + if isinstance(ax, MultiIndex) and isinstance( + df.index, MultiIndex) and ax.nlevels != df.index.nlevels: + raise TypeError("cannot align on a multi-index with out specifying the join levels") + + val = df.reindex(index=ax).values + return val + + elif np.isscalar(indexer) and not is_frame: + idx = self.obj.axes[1] + cols = self.obj.axes[2] + + # by definition we are indexing on the 0th axis + if is_panel: + df = df.T + + if idx.equals(df.index) and cols.equals(df.columns): 
+ return df.copy().values + + # a passed in dataframe which is actually a transpose + # of what is needed + elif idx.equals(df.columns) and cols.equals(df.index): + return df.T.copy().values + + return df.reindex(idx, columns=cols).values + + raise ValueError('Incompatible indexer with DataFrame') + + def _align_panel(self, indexer, df): + is_frame = self.obj.ndim == 2 + is_panel = self.obj.ndim >= 3 + raise NotImplementedError("cannot set using an indexer with a Panel " + "yet!") + + def _getitem_tuple(self, tup): + try: + return self._getitem_lowerdim(tup) + except IndexingError: + pass + + # no multi-index, so validate all of the indexers + self._has_valid_tuple(tup) + + # ugly hack for GH #836 + if self._multi_take_opportunity(tup): + return self._multi_take(tup) + + # no shortcut needed + retval = self.obj + for i, key in enumerate(tup): + if i >= self.obj.ndim: + raise IndexingError('Too many indexers') + + if _is_null_slice(key): + continue + + retval = getattr(retval, self.name)._getitem_axis(key, axis=i) + + return retval + + def _multi_take_opportunity(self, tup): + from pandas.core.generic import NDFrame + + # ugly hack for GH #836 + if not isinstance(self.obj, NDFrame): + return False + + if not all(_is_list_like(x) for x in tup): + return False + + # just too complicated + for indexer, ax in zip(tup, self.obj._data.axes): + if isinstance(ax, MultiIndex): + return False + elif com._is_bool_indexer(indexer): + return False + elif not ax.is_unique: + return False + + return True + + def _multi_take(self, tup): + """ create the reindex map for our objects, raise the _exception if we + can't create the indexer + """ + try: + o = self.obj + d = dict([ + (a, self._convert_for_reindex(t, axis=o._get_axis_number(a))) + for t, a in zip(tup, o._AXIS_ORDERS) + ]) + return o.reindex(**d) + except: + raise self._exception + + def _convert_for_reindex(self, key, axis=0): + labels = self.obj._get_axis(axis) + + if com._is_bool_indexer(key): + key = _check_bool_indexer(labels, key) + return labels[key] + else: + if isinstance(key, Index): + # want Index objects to pass through untouched + keyarr = key + else: + # asarray can be unsafe, NumPy strings are weird + keyarr = _asarray_tuplesafe(key) + + if is_integer_dtype(keyarr) and not labels.is_integer(): + keyarr = com._ensure_platform_int(keyarr) + return labels.take(keyarr) + + return keyarr + + def _handle_lowerdim_multi_index_axis0(self, tup): + # we have an axis0 multi-index, handle or raise + + try: + # fast path for series or for tup devoid of slices + return self._get_label(tup, axis=0) + except TypeError: + # slices are unhashable + pass + except Exception as e1: + if isinstance(tup[0], (slice, Index)): + raise IndexingError("Handle elsewhere") + + # raise the error if we are not sorted + ax0 = self.obj._get_axis(0) + if not ax0.is_lexsorted_for_tuple(tup): + raise e1 + + return None + + def _getitem_lowerdim(self, tup): + + # we can directly get the axis result since the axis is specified + if self.axis is not None: + axis = self.obj._get_axis_number(self.axis) + return self._getitem_axis(tup, axis=axis, validate_iterable=True) + + # we may have a nested tuples indexer here + if self._is_nested_tuple_indexer(tup): + return self._getitem_nested_tuple(tup) + + # we maybe be using a tuple to represent multiple dimensions here + ax0 = self.obj._get_axis(0) + if isinstance(ax0, MultiIndex): + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result + + if len(tup) > self.obj.ndim: + raise 
IndexingError("Too many indexers. handle elsewhere") + + # to avoid wasted computation + # df.ix[d1:d2, 0] -> columns first (True) + # df.ix[0, ['C', 'B', A']] -> rows first (False) + for i, key in enumerate(tup): + if _is_label_like(key) or isinstance(key, tuple): + section = self._getitem_axis(key, axis=i) + + # we have yielded a scalar ? + if not _is_list_like(section): + return section + + elif section.ndim == self.ndim: + # we're in the middle of slicing through a MultiIndex + # revise the key wrt to `section` by inserting an _NS + new_key = tup[:i] + (_NS,) + tup[i + 1:] + + else: + new_key = tup[:i] + tup[i + 1:] + + # unfortunately need an odious kludge here because of + # DataFrame transposing convention + if (isinstance(section, ABCDataFrame) and i > 0 + and len(new_key) == 2): + a, b = new_key + new_key = b, a + + if len(new_key) == 1: + new_key, = new_key + + # This is an elided recursive call to iloc/loc/etc' + return getattr(section, self.name)[new_key] + + raise IndexingError('not applicable') + + def _getitem_nested_tuple(self, tup): + # we have a nested tuple so have at least 1 multi-index level + # we should be able to match up the dimensionaility here + + # we have too many indexers for our dim, but have at least 1 + # multi-index dimension, try to see if we have something like + # a tuple passed to a series with a multi-index + if len(tup) > self.ndim: + result = self._handle_lowerdim_multi_index_axis0(tup) + if result is not None: + return result + + # this is a series with a multi-index specified a tuple of selectors + return self._getitem_axis(tup, axis=0, validate_iterable=True) + + # handle the multi-axis by taking sections and reducing + # this is iterative + obj = self.obj + axis = 0 + for i, key in enumerate(tup): + + if _is_null_slice(key): + axis += 1 + continue + + current_ndim = obj.ndim + obj = getattr(obj, self.name)._getitem_axis(key, axis=axis, validate_iterable=True) + axis += 1 + + # if we have a scalar, we are done + if np.isscalar(obj): + break + + # has the dim of the obj changed? + # GH 7199 + if obj.ndim < current_ndim: + + # GH 7516 + # if had a 3 dim and are going to a 2d + # axes are reversed on a DataFrame + if i >= 1 and current_ndim == 3 and obj.ndim == 2: + obj = obj.T + + axis -= 1 + + return obj + + def _getitem_axis(self, key, axis=0, validate_iterable=False): + + self._has_valid_type(key, axis) + labels = self.obj._get_axis(axis) + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + elif _is_list_like(key) and not (isinstance(key, tuple) and + isinstance(labels, MultiIndex)): + + if hasattr(key, 'ndim') and key.ndim > 1: + raise ValueError('Cannot index with multidimensional key') + + return self._getitem_iterable(key, axis=axis) + else: + if com.is_integer(key): + if axis == 0 and isinstance(labels, MultiIndex): + try: + return self._get_label(key, axis=axis) + except (KeyError, TypeError): + if self.obj.index.levels[0].is_integer(): + raise + + # this is the fallback! 
(for a non-float, non-integer index) + if not labels.is_floating() and not labels.is_integer(): + return self._get_loc(key, axis=axis) + + return self._get_label(key, axis=axis) + + def _getitem_iterable(self, key, axis=0): + labels = self.obj._get_axis(axis) + + def _reindex(keys, level=None): + try: + return self.obj.reindex_axis(keys, axis=axis, level=level) + except AttributeError: + # Series + if axis != 0: + raise AssertionError('axis must be 0') + return self.obj.reindex(keys, level=level) + + if com._is_bool_indexer(key): + key = _check_bool_indexer(labels, key) + inds, = key.nonzero() + return self.obj.take(inds, axis=axis, convert=False) + else: + if isinstance(key, Index): + # want Index objects to pass through untouched + keyarr = key + else: + # asarray can be unsafe, NumPy strings are weird + keyarr = _asarray_tuplesafe(key) + + # handle a mixed integer scenario + indexer = labels._convert_list_indexer_for_mixed(keyarr, typ=self.name) + if indexer is not None: + return self.obj.take(indexer, axis=axis) + + # this is not the most robust, but... + if (isinstance(labels, MultiIndex) and len(keyarr) and + not isinstance(keyarr[0], tuple)): + level = 0 + else: + level = None + + keyarr_is_unique = Index(keyarr).is_unique + + # existing labels are unique and indexer is unique + if labels.is_unique and keyarr_is_unique: + return _reindex(keyarr, level=level) + + else: + indexer, missing = labels.get_indexer_non_unique(keyarr) + check = indexer != -1 + result = self.obj.take(indexer[check], axis=axis, + convert=False) + + # need to merge the result labels and the missing labels + if len(missing): + l = np.arange(len(indexer)) + + missing = com._ensure_platform_int(missing) + missing_labels = keyarr.take(missing) + missing_indexer = com._ensure_int64(l[~check]) + cur_labels = result._get_axis(axis).values + cur_indexer = com._ensure_int64(l[check]) + + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels + + # reindex with the specified axis + ndim = self.obj.ndim + if axis + 1 > ndim: + raise AssertionError("invalid indexing error with " + "non-unique index") + + # a unique indexer + if keyarr_is_unique: + + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange( + len(result._get_axis(axis)) + ) + new_indexer[missing_indexer] = -1 + + # we have a non_unique selector, need to use the original + # indexer here + else: + + # need to retake to have the same size as the indexer + rindexer = indexer.values + rindexer[~check] = 0 + result = self.obj.take(rindexer, axis=axis, + convert=False) + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(result)) + new_indexer[~check] = -1 + + result = result._reindex_with_indexers({ + axis: [new_labels, new_indexer] + }, copy=True, allow_dups=True) + + return result + + def _convert_to_indexer(self, obj, axis=0, is_setter=False): + """ + Convert indexing key into something we can use to do actual fancy + indexing on an ndarray + + Examples + ix[:5] -> slice(0, 5) + ix[[1,2,3]] -> [1,2,3] + ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz) + + Going by Zen of Python? + "In the face of ambiguity, refuse the temptation to guess." + raise AmbiguousIndexError with integer labels? 
+ - No, prefer label-based indexing + """ + labels = self.obj._get_axis(axis) + + # if we are a scalar indexer and not type correct raise + obj = self._convert_scalar_indexer(obj, axis) + + # see if we are positional in nature + is_int_index = labels.is_integer() + is_int_positional = com.is_integer(obj) and not is_int_index + + # if we are a label return me + try: + return labels.get_loc(obj) + except (KeyError, TypeError): + pass + except (ValueError): + if not is_int_positional: + raise + + # a positional + if is_int_positional: + + # if we are setting and its not a valid location + # its an insert which fails by definition + if is_setter: + + # always valid + if self.name == 'loc': + return {'key': obj} + + # a positional + if (obj >= self.obj.shape[axis] and + not isinstance(labels, MultiIndex)): + raise ValueError("cannot set by positional indexing with " + "enlargement") + + return obj + + if isinstance(obj, slice): + return self._convert_slice_indexer(obj, axis) + + elif _is_nested_tuple(obj, labels): + return labels.get_locs(obj) + elif _is_list_like(obj): + if com._is_bool_indexer(obj): + obj = _check_bool_indexer(labels, obj) + inds, = obj.nonzero() + return inds + else: + if isinstance(obj, Index): + objarr = obj.values + else: + objarr = _asarray_tuplesafe(obj) + + # If have integer labels, defer to label-based indexing + indexer = labels._convert_list_indexer_for_mixed(objarr, typ=self.name) + if indexer is not None: + return indexer + + # this is not the most robust, but... + if (isinstance(labels, MultiIndex) and + not isinstance(objarr[0], tuple)): + level = 0 + _, indexer = labels.reindex(objarr, level=level) + + # take all + if indexer is None: + indexer = np.arange(len(labels)) + + check = labels.levels[0].get_indexer(objarr) + else: + level = None + + # unique index + if labels.is_unique: + indexer = check = labels.get_indexer(objarr) + + # non-unique (dups) + else: + (indexer, + missing) = labels.get_indexer_non_unique(objarr) + check = indexer + + mask = check == -1 + if mask.any(): + + # mi here + if isinstance(obj, tuple) and is_setter: + return {'key': obj} + raise KeyError('%s not in index' % objarr[mask]) + + return indexer + + else: + try: + return labels.get_loc(obj) + except KeyError: + # allow a not found key only if we are a setter + if not is_list_like(obj) and is_setter: + return {'key': obj} + raise + + def _tuplify(self, loc): + tup = [slice(None, None) for _ in range(self.ndim)] + tup[0] = loc + return tuple(tup) + + def _get_slice_axis(self, slice_obj, axis=0): + obj = self.obj + + if not _need_slice(slice_obj): + return obj + indexer = self._convert_slice_indexer(slice_obj, axis) + + if isinstance(indexer, slice): + return self._slice(indexer, axis=axis, typ='iloc') + else: + return self.obj.take(indexer, axis=axis, convert=False) + + +class _IXIndexer(_NDFrameIndexer): + + """ A primarily location based indexer, with integer fallback """ + + def _has_valid_type(self, key, axis): + if isinstance(key, slice): + return True + + elif com._is_bool_indexer(key): + return True + + elif _is_list_like(key): + return True + + else: + + self._convert_scalar_indexer(key, axis) + + return True + + +class _LocationIndexer(_NDFrameIndexer): + _exception = Exception + + def __getitem__(self, key): + if type(key) is tuple: + return self._getitem_tuple(key) + else: + return self._getitem_axis(key, axis=0) + + def _getitem_axis(self, key, axis=0, validate_iterable=False): + raise NotImplementedError() + + def _getbool_axis(self, key, axis=0): + labels = 
self.obj._get_axis(axis) + key = _check_bool_indexer(labels, key) + inds, = key.nonzero() + try: + return self.obj.take(inds, axis=axis, convert=False) + except Exception as detail: + raise self._exception(detail) + + def _get_slice_axis(self, slice_obj, axis=0): + """ this is pretty simple as we just have to deal with labels """ + obj = self.obj + if not _need_slice(slice_obj): + return obj + + labels = obj._get_axis(axis) + indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, + slice_obj.step) + + if isinstance(indexer, slice): + return self._slice(indexer, axis=axis, typ='iloc') + else: + return self.obj.take(indexer, axis=axis, convert=False) + + +class _LocIndexer(_LocationIndexer): + + """ purely label based location based indexing """ + _valid_types = ("labels (MUST BE IN THE INDEX), slices of labels (BOTH " + "endpoints included! Can be slices of integers if the " + "index is integers), listlike of labels, boolean") + _exception = KeyError + + def _has_valid_type(self, key, axis): + ax = self.obj._get_axis(axis) + + # valid for a label where all labels are in the index + # slice of lables (where start-end in labels) + # slice of integers (only if in the lables) + # boolean + + if isinstance(key, slice): + + if ax.is_floating(): + + # allowing keys to be slicers with no fallback + pass + + else: + if key.start is not None: + if key.start not in ax: + raise KeyError( + "start bound [%s] is not the [%s]" % + (key.start, self.obj._get_axis_name(axis)) + ) + if key.stop is not None: + if key.stop not in ax: + raise KeyError( + "stop bound [%s] is not in the [%s]" % + (key.stop, self.obj._get_axis_name(axis)) + ) + + elif com._is_bool_indexer(key): + return True + + elif _is_list_like(key): + + # mi is just a passthru + if isinstance(key, tuple) and isinstance(ax, MultiIndex): + return True + + # require all elements in the index + idx = _ensure_index(key) + if not idx.isin(ax).all(): + + raise KeyError("[%s] are not in ALL in the [%s]" % + (key, self.obj._get_axis_name(axis))) + + return True + + else: + + def error(): + if isnull(key): + raise ValueError( + "cannot use label indexing with a null key") + raise KeyError("the label [%s] is not in the [%s]" % + (key, self.obj._get_axis_name(axis))) + + try: + key = self._convert_scalar_indexer(key, axis) + if not key in ax: + error() + except (TypeError) as e: + + # python 3 type errors should be raised + if 'unorderable' in str(e): # pragma: no cover + error() + raise + except: + error() + + return True + + def _getitem_axis(self, key, axis=0, validate_iterable=False): + labels = self.obj._get_axis(axis) + + if isinstance(key, slice): + self._has_valid_type(key, axis) + return self._get_slice_axis(key, axis=axis) + elif com._is_bool_indexer(key): + return self._getbool_axis(key, axis=axis) + elif _is_list_like(key): + + # GH 7349 + # possibly convert a list-like into a nested tuple + # but don't convert a list-like of tuples + if isinstance(labels, MultiIndex): + if not isinstance(key, tuple) and len(key) > 1 and not isinstance(key[0], tuple): + key = tuple([key]) + + # an iterable multi-selection + if not (isinstance(key, tuple) and + isinstance(labels, MultiIndex)): + + if hasattr(key, 'ndim') and key.ndim > 1: + raise ValueError('Cannot index with multidimensional key') + + if validate_iterable: + self._has_valid_type(key, axis) + + return self._getitem_iterable(key, axis=axis) + + # nested tuple slicing + if _is_nested_tuple(key, labels): + locs = labels.get_locs(key) + indexer = [ slice(None) ] * self.ndim + indexer[axis] 
= locs + return self.obj.iloc[tuple(indexer)] + + # fall thru to straight lookup + self._has_valid_type(key, axis) + return self._get_label(key, axis=axis) + + +class _iLocIndexer(_LocationIndexer): + + """ purely integer based location based indexing """ + _valid_types = ("integer, integer slice (START point is INCLUDED, END " + "point is EXCLUDED), listlike of integers, boolean array") + _exception = IndexError + + def _has_valid_type(self, key, axis): + if com._is_bool_indexer(key): + if hasattr(key, 'index') and isinstance(key.index, Index): + if key.index.inferred_type == 'integer': + raise NotImplementedError( + "iLocation based boolean indexing on an integer type " + "is not available" + ) + raise ValueError("iLocation based boolean indexing cannot use " + "an indexable as a mask") + return True + + if isinstance(key, slice): + return True + elif com.is_integer(key): + return self._is_valid_integer(key, axis) + elif (_is_list_like(key)): + return self._is_valid_list_like(key, axis) + return False + + def _has_valid_setitem_indexer(self, indexer): + self._has_valid_positional_setitem_indexer(indexer) + + def _is_valid_integer(self, key, axis): + # return a boolean if we have a valid integer indexer + + ax = self.obj._get_axis(axis) + if key > len(ax): + raise IndexError("single positional indexer is out-of-bounds") + return True + + + def _is_valid_list_like(self, key, axis): + # return a boolean if we are a valid list-like (e.g. that we dont' have out-of-bounds values) + + # coerce the key to not exceed the maximum size of the index + arr = np.array(key) + ax = self.obj._get_axis(axis) + l = len(ax) + if len(arr) and (arr.max() >= l or arr.min() <= -l): + raise IndexError("positional indexers are out-of-bounds") + + return True + + def _getitem_tuple(self, tup): + + self._has_valid_tuple(tup) + try: + return self._getitem_lowerdim(tup) + except: + pass + + retval = self.obj + axis=0 + for i, key in enumerate(tup): + if i >= self.obj.ndim: + raise IndexingError('Too many indexers') + + if _is_null_slice(key): + axis += 1 + continue + + retval = getattr(retval, self.name)._getitem_axis(key, axis=axis) + + # if the dim was reduced, then pass a lower-dim the next time + if retval.ndim l: + stop = l + elif stop < 0: + stop += l + if step is None: + step = 1 + elif step < 0: + step = abs(step) + return (stop - start) / step + elif isinstance(indexer, (ABCSeries, np.ndarray, list)): + return len(indexer) + elif not is_list_like(indexer): + return 1 + raise AssertionError("cannot find the length of the indexer") + + +def _convert_to_index_sliceable(obj, key): + """if we are index sliceable, then return my slicer, otherwise return None + """ + idx = obj.index + if isinstance(key, slice): + return idx._convert_slice_indexer(key, typ='getitem') + + elif isinstance(key, compat.string_types): + + # we are an actual column + if key in obj._data.items: + return None + + # we need a timelike key here + if idx.is_all_dates: + try: + return idx._get_string_slice(key) + except: + return None + + return None + + +def _is_index_slice(obj): + def _is_valid_index(x): + return (com.is_integer(x) or com.is_float(x) + and np.allclose(x, int(x), rtol=_eps, atol=0)) + + def _crit(v): + return v is None or _is_valid_index(v) + + both_none = obj.start is None and obj.stop is None + + return not both_none and (_crit(obj.start) and _crit(obj.stop)) + + +def _check_bool_indexer(ax, key): + # boolean indexing, need to check that the data are aligned, otherwise + # disallowed + + # this function assumes that 
com._is_bool_indexer(key) == True + + result = key + if isinstance(key, ABCSeries) and not key.index.equals(ax): + result = result.reindex(ax) + mask = com.isnull(result.values) + if mask.any(): + raise IndexingError('Unalignable boolean Series key provided') + + result = result.astype(bool).values + + else: + # com._is_bool_indexer has already checked for nulls in the case of an + # object array key, so no check needed here + result = np.asarray(result, dtype=bool) + + return result + + +def _convert_missing_indexer(indexer): + """ reverse convert a missing indexer, which is a dict + return the scalar indexer and a boolean indicating if we converted """ + + if isinstance(indexer, dict): + + # a missing key (but not a tuple indexer) + indexer = indexer['key'] + + if isinstance(indexer, bool): + raise KeyError("cannot use a single bool to index into setitem") + return indexer, True + + return indexer, False + + +def _convert_from_missing_indexer_tuple(indexer, axes): + """ create a filtered indexer that doesn't have any missing indexers """ + def get_indexer(_i, _idx): + return (axes[_i].get_loc(_idx['key']) + if isinstance(_idx, dict) else _idx) + return tuple([get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)]) + + +def _safe_append_to_index(index, key): + """ a safe append to an index, if incorrect type, then catch and recreate + """ + try: + return index.insert(len(index), key) + except: + + # raise here as this is basically an unsafe operation and we want + # it to be obvious that you are doing something wrong + raise ValueError("unsafe appending to index of type {0} with a key " + "{1}".format(index.__class__.__name__, key)) + + +def _maybe_convert_indices(indices, n): + """ if we have negative indicies, translate to postive here + if have indicies that are out-of-bounds, raise an IndexError + """ + if isinstance(indices, list): + indices = np.array(indices) + if len(indices) == 0: + # If list is empty, np.array will return float and cause indexing + # errors. 
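+        # Illustrative behaviour (hypothetical inputs, shown for clarity only):
+        #   _maybe_convert_indices([0, -2], n=5)  ->  array([0, 3])
+        #   _maybe_convert_indices([0, 7],  n=5)  ->  IndexError, out-of-bounds
+        # Negative positions are shifted by n; anything still outside [0, n)
+        # after the shift is rejected further below.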
+ return np.empty(0, dtype=np.int_) + + mask = indices < 0 + if mask.any(): + indices[mask] += n + mask = (indices >= n) | (indices < 0) + if mask.any(): + raise IndexError("indices are out-of-bounds") + return indices + + +def _maybe_convert_ix(*args): + """ + We likely want to take the cross-product + """ + + ixify = True + for arg in args: + if not isinstance(arg, (np.ndarray, list, ABCSeries)): + ixify = False + + if ixify: + return np.ix_(*args) + else: + return args + + +def _is_nested_tuple(tup, labels): + # check for a compatiable nested tuple and multiindexes among the axes + if not isinstance(tup, tuple): + return False + + # are we nested tuple of: tuple,list,slice + for i, k in enumerate(tup): + + if isinstance(k, (tuple, list, slice)): + return isinstance(labels, MultiIndex) + + return False + + +def _is_null_slice(obj): + return (isinstance(obj, slice) and obj.start is None and + obj.stop is None and obj.step is None) + + +def _is_label_like(key): + # select a label or row + return not isinstance(key, slice) and not _is_list_like(key) + + +def _is_list_like(obj): + # Consider namedtuples to be not list like as they are useful as indices + return (np.iterable(obj) + and not isinstance(obj, compat.string_types) + and not (isinstance(obj, tuple) and type(obj) is not tuple)) + + +def _need_slice(obj): + return (obj.start is not None or + obj.stop is not None or + (obj.step is not None and obj.step != 1)) + + +def _maybe_droplevels(index, key): + # drop levels + original_index = index + if isinstance(key, tuple): + for _ in key: + try: + index = index.droplevel(0) + except: + # we have dropped too much, so back out + return original_index + else: + try: + index = index.droplevel(0) + except: + pass + + return index + diff --git a/pandas/core/internals.py b/pandas/core/internals.py new file mode 100644 index 00000000..4f7f36dd --- /dev/null +++ b/pandas/core/internals.py @@ -0,0 +1,4090 @@ +import copy +import itertools +import re +import operator +from datetime import datetime, timedelta +from collections import defaultdict + +import numpy as np +from pandas.core.base import PandasObject + +from pandas.core.common import (_possibly_downcast_to_dtype, isnull, + _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like, + ABCSparseSeries, _infer_dtype_from_scalar, + _is_null_datelike_scalar, + is_timedelta64_dtype, is_datetime64_dtype, + _possibly_infer_to_datetimelike) +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) +import pandas.core.common as com +from pandas.sparse.array import _maybe_to_sparse, SparseArray +import pandas.lib as lib +import pandas.tslib as tslib +import pandas.computation.expressions as expressions +from pandas.util.decorators import cache_readonly + +from pandas.tslib import Timestamp +from pandas import compat +from pandas.compat import range, map, zip, u +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + + +from pandas.lib import BlockPlacement + + +class Block(PandasObject): + + """ + Canonical n-dimensional unit of homogeneous dtype contained in a pandas + data structure + + Index-ignorant; let the container take care of that + """ + __slots__ = ['_mgr_locs', 'values', 'ndim'] + is_numeric = False + is_float = False + is_integer = False + is_complex = False + is_datetime = False + is_timedelta = False + is_bool = False + is_object = False + is_sparse = False + _can_hold_na = False + _downcast_dtype = None + _can_consolidate = True + _verify_integrity = 
True + _ftype = 'dense' + + def __init__(self, values, placement, ndim=None, fastpath=False): + if ndim is None: + ndim = values.ndim + elif values.ndim != ndim: + raise ValueError('Wrong number of dimensions') + self.ndim = ndim + + self.mgr_locs = placement + self.values = values + + if len(self.mgr_locs) != len(self.values): + raise ValueError('Wrong number of items passed %d,' + ' placement implies %d' % ( + len(self.values), len(self.mgr_locs))) + + @property + def _consolidate_key(self): + return (self._can_consolidate, self.dtype.name) + + @property + def _is_single_block(self): + return self.ndim == 1 + + @property + def is_datelike(self): + """ return True if I am a non-datelike """ + return self.is_datetime or self.is_timedelta + + @property + def fill_value(self): + return np.nan + + @property + def mgr_locs(self): + return self._mgr_locs + + def make_block_same_class(self, values, placement, copy=False, + **kwargs): + """ + Wrap given values in a block of same type as self. + + `kwargs` are used in SparseBlock override. + + """ + if copy: + values = values.copy() + return make_block(values, placement, klass=self.__class__, + fastpath=True) + + @mgr_locs.setter + def mgr_locs(self, new_mgr_locs): + if not isinstance(new_mgr_locs, BlockPlacement): + new_mgr_locs = BlockPlacement(new_mgr_locs) + + self._mgr_locs = new_mgr_locs + + def __unicode__(self): + + # don't want to print out all of the items here + name = com.pprint_thing(self.__class__.__name__) + if self._is_single_block: + + result = '%s: %s dtype: %s' % ( + name, len(self), self.dtype) + + else: + + shape = ' x '.join([com.pprint_thing(s) for s in self.shape]) + result = '%s: %s, %s, dtype: %s' % ( + name, com.pprint_thing(self.mgr_locs.indexer), shape, + self.dtype) + + return result + + def __len__(self): + return len(self.values) + + def __getstate__(self): + return self.mgr_locs.indexer, self.values + + def __setstate__(self, state): + self.mgr_locs = BlockPlacement(state[0]) + self.values = state[1] + self.ndim = self.values.ndim + + def _slice(self, slicer): + """ return a slice of my values """ + return self.values[slicer] + + def getitem_block(self, slicer, new_mgr_locs=None): + """ + Perform __getitem__-like, return result as block. + + As of now, only supports slices that preserve dimensionality. 
+ + """ + if new_mgr_locs is None: + if isinstance(slicer, tuple): + axis0_slicer = slicer[0] + else: + axis0_slicer = slicer + new_mgr_locs = self.mgr_locs[axis0_slicer] + + new_values = self._slice(slicer) + + if new_values.ndim != self.ndim: + raise ValueError("Only same dim slicing is allowed") + + return self.make_block_same_class(new_values, new_mgr_locs) + + @property + def shape(self): + return self.values.shape + + @property + def itemsize(self): + return self.values.itemsize + + @property + def dtype(self): + return self.values.dtype + + @property + def ftype(self): + return "%s:%s" % (self.dtype, self._ftype) + + def merge(self, other): + return _merge_blocks([self, other]) + + def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, + limit=None, mask_info=None): + """ + Reindex using pre-computed indexer information + """ + if axis < 1: + raise AssertionError('axis must be at least 1, got %d' % axis) + if fill_value is None: + fill_value = self.fill_value + + new_values = com.take_nd(self.values, indexer, axis, + fill_value=fill_value, mask_info=mask_info) + return make_block(new_values, + ndim=self.ndim, fastpath=True, + placement=self.mgr_locs) + + def get(self, item): + loc = self.items.get_loc(item) + return self.values[loc] + + def iget(self, i): + return self.values[i] + + def set(self, locs, values, check=False): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + self.values[locs] = values + + def delete(self, loc): + """ + Delete given loc(-s) from block in-place. + """ + self.values = np.delete(self.values, loc, 0) + self.mgr_locs = self.mgr_locs.delete(loc) + + def apply(self, func, **kwargs): + """ apply the function to my values; return a block if we are not one """ + result = func(self.values) + if not isinstance(result, Block): + result = make_block(values=result, placement=self.mgr_locs,) + + return result + + def fillna(self, value, limit=None, inplace=False, downcast=None): + if not self._can_hold_na: + if inplace: + return [self] + else: + return [self.copy()] + + mask = isnull(self.values) + if limit is not None: + if self.ndim > 2: + raise NotImplementedError + mask[mask.cumsum(self.ndim-1)>limit]=False + + value = self._try_fill(value) + blocks = self.putmask(mask, value, inplace=inplace) + return self._maybe_downcast(blocks, downcast) + + def _maybe_downcast(self, blocks, downcast=None): + + # no need to downcast our float + # unless indicated + if downcast is None and self.is_float: + return blocks + elif downcast is None and (self.is_timedelta or self.is_datetime): + return blocks + + result_blocks = [] + for b in blocks: + result_blocks.extend(b.downcast(downcast)) + + return result_blocks + + def downcast(self, dtypes=None): + """ try to downcast each item to the dict of dtypes if present """ + + # turn it off completely + if dtypes is False: + return [self] + + values = self.values + + # single block handling + if self._is_single_block: + + # try to cast all non-floats here + if dtypes is None: + dtypes = 'infer' + + nv = _possibly_downcast_to_dtype(values, dtypes) + return [make_block(nv, ndim=self.ndim, + fastpath=True, placement=self.mgr_locs)] + + # ndim > 1 + if dtypes is None: + return [self] + + if not (dtypes == 'infer' or isinstance(dtypes, dict)): + raise ValueError("downcast must have a dictionary or 'infer' as " + "its argument") + + # item-by-item + # this is expensive as it splits the blocks items-by-item + blocks = [] + for i, rl in enumerate(self.mgr_locs): + + if dtypes == 'infer': + 
dtype = 'infer' + else: + raise AssertionError("dtypes as dict is not supported yet") + dtype = dtypes.get(item, self._downcast_dtype) + + if dtype is None: + nv = _block_shape(values[i], ndim=self.ndim) + else: + nv = _possibly_downcast_to_dtype(values[i], dtype) + nv = _block_shape(nv, ndim=self.ndim) + + blocks.append(make_block(nv, + ndim=self.ndim, fastpath=True, + placement=[rl])) + + return blocks + + def astype(self, dtype, copy=False, raise_on_error=True, values=None): + return self._astype(dtype, copy=copy, raise_on_error=raise_on_error, + values=values) + + def _astype(self, dtype, copy=False, raise_on_error=True, values=None, + klass=None): + """ + Coerce to the new type (if copy=True, return a new copy) + raise on an except if raise == True + """ + dtype = np.dtype(dtype) + if self.dtype == dtype: + if copy: + return self.copy() + return self + + try: + # force the copy here + if values is None: + # _astype_nansafe works fine with 1-d only + values = com._astype_nansafe(self.values.ravel(), dtype, copy=True) + values = values.reshape(self.values.shape) + newb = make_block(values, + ndim=self.ndim, placement=self.mgr_locs, + fastpath=True, dtype=dtype, klass=klass) + except: + if raise_on_error is True: + raise + newb = self.copy() if copy else self + + if newb.is_numeric and self.is_numeric: + if newb.shape != self.shape: + raise TypeError("cannot set astype for copy = [%s] for dtype " + "(%s [%s]) with smaller itemsize that current " + "(%s [%s])" % (copy, self.dtype.name, + self.itemsize, newb.dtype.name, + newb.itemsize)) + return newb + + def convert(self, copy=True, **kwargs): + """ attempt to coerce any object types to better types + return a copy of the block (if copy = True) + by definition we are not an ObjectBlock here! """ + + return [self.copy()] if copy else [self] + + def _can_hold_element(self, value): + raise NotImplementedError() + + def _try_cast(self, value): + raise NotImplementedError() + + def _try_cast_result(self, result, dtype=None): + """ try to cast the result to our original type, + we may have roundtripped thru object in the mean-time """ + if dtype is None: + dtype = self.dtype + + if self.is_integer or self.is_bool or self.is_datetime: + pass + elif self.is_float and result.dtype == self.dtype: + + # protect against a bool/object showing up here + if isinstance(dtype, compat.string_types) and dtype == 'infer': + return result + if not isinstance(dtype, type): + dtype = dtype.type + if issubclass(dtype, (np.bool_, np.object_)): + if issubclass(dtype, np.bool_): + if isnull(result).all(): + return result.astype(np.bool_) + else: + result = result.astype(np.object_) + result[result == 1] = True + result[result == 0] = False + return result + else: + return result.astype(np.object_) + + return result + + # may need to change the dtype here + return _possibly_downcast_to_dtype(result, dtype) + + def _try_operate(self, values): + """ return a version to operate on as the input """ + return values + + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments """ + return values, other + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + return result + + def _try_coerce_and_cast_result(self, result, dtype=None): + result = self._try_coerce_result(result) + result = self._try_cast_result(result, dtype=dtype) + return result + + def _try_fill(self, value): + return value + + def to_native_types(self, slicer=None, na_rep='', **kwargs): + """ convert to our native types format, slicing if desired 
""" + + values = self.values + if slicer is not None: + values = values[:, slicer] + values = np.array(values, dtype=object) + mask = isnull(values) + values[mask] = na_rep + return values.tolist() + + # block actions #### + def copy(self, deep=True): + values = self.values + if deep: + values = values.copy() + return make_block(values, ndim=self.ndim, + klass=self.__class__, fastpath=True, + placement=self.mgr_locs) + + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False): + """ replace the to_replace value with value, possible to create new + blocks here this is just a call to putmask. regex is not used here. + It is used in ObjectBlocks. It is here for API + compatibility.""" + mask = com.mask_missing(self.values, to_replace) + if filter is not None: + filtered_out = ~self.mgr_locs.isin(filter) + mask[filtered_out.nonzero()[0]] = False + + if not mask.any(): + if inplace: + return [self] + return [self.copy()] + return self.putmask(mask, value, inplace=inplace) + + def setitem(self, indexer, value): + """ set the value inplace; return a new block (of a possibly different + dtype) + + indexer is a direct slice/positional indexer; value must be a + compatible shape + """ + + # coerce args + values, value = self._try_coerce_args(self.values, value) + arr_value = np.array(value) + + # cast the values to a type that can hold nan (if necessary) + if not self._can_hold_element(value): + dtype, _ = com._maybe_promote(arr_value.dtype) + values = values.astype(dtype) + + transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x) + values = transf(values) + l = len(values) + + # length checking + # boolean with truth values == len of the value is ok too + if isinstance(indexer, (np.ndarray, list)): + if is_list_like(value) and len(indexer) != len(value): + if not (isinstance(indexer, np.ndarray) and + indexer.dtype == np.bool_ and + len(indexer[indexer]) == len(value)): + raise ValueError("cannot set using a list-like indexer " + "with a different length than the value") + + # slice + elif isinstance(indexer, slice): + + if is_list_like(value) and l: + if len(value) != _length_of_indexer(indexer, values): + raise ValueError("cannot set using a slice indexer with a " + "different length than the value") + + try: + # setting a single element for each dim and with a rhs that could be say a list + # GH 6043 + if arr_value.ndim == 1 and ( + np.isscalar(indexer) or (isinstance(indexer, tuple) and all([ np.isscalar(idx) for idx in indexer ]))): + values[indexer] = value + + # if we are an exact match (ex-broadcasting), + # then use the resultant dtype + elif len(arr_value.shape) and arr_value.shape[0] == values.shape[0] and np.prod(arr_value.shape) == np.prod(values.shape): + values[indexer] = value + values = values.astype(arr_value.dtype) + + # set + else: + values[indexer] = value + + # coerce and try to infer the dtypes of the result + if np.isscalar(value): + dtype, _ = _infer_dtype_from_scalar(value) + else: + dtype = 'infer' + values = self._try_coerce_and_cast_result(values, dtype) + return [make_block(transf(values), + ndim=self.ndim, placement=self.mgr_locs, + fastpath=True)] + except (ValueError, TypeError) as detail: + raise + except Exception as detail: + pass + + return [self] + + def putmask(self, mask, new, align=True, inplace=False): + """ putmask the data to the block; it is possible that we may create a + new dtype of block + + return the resulting block(s) + + Parameters + ---------- + mask : the condition to respect + new : a ndarray/object + align : 
boolean, perform alignment on other/cond, default is True + inplace : perform inplace modification, default is False + + Returns + ------- + a new block(s), the result of the putmask + """ + + new_values = self.values if inplace else self.values.copy() + + # may need to align the new + if hasattr(new, 'reindex_axis'): + new = new.values.T + + # may need to align the mask + if hasattr(mask, 'reindex_axis'): + mask = mask.values.T + + # if we are passed a scalar None, convert it here + if not is_list_like(new) and isnull(new): + new = self.fill_value + + if self._can_hold_element(new): + new = self._try_cast(new) + + # pseudo-broadcast + if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1: + new = np.repeat(new, self.shape[-1]).reshape(self.shape) + + np.putmask(new_values, mask, new) + + # maybe upcast me + elif mask.any(): + + # need to go column by column + new_blocks = [] + if self.ndim > 1: + for i, ref_loc in enumerate(self.mgr_locs): + m = mask[i] + v = new_values[i] + + # need a new block + if m.any(): + + n = new[i] if isinstance( + new, np.ndarray) else np.array(new) + + # type of the new block + dtype, _ = com._maybe_promote(n.dtype) + + # we need to exiplicty astype here to make a copy + n = n.astype(dtype) + + nv = _putmask_smart(v, m, n) + else: + nv = v if inplace else v.copy() + + # Put back the dimension that was taken from it and make + # a block out of the result. + block = make_block(values=nv[np.newaxis], + placement=[ref_loc], + fastpath=True) + + new_blocks.append(block) + + else: + nv = _putmask_smart(new_values, mask, new) + new_blocks.append(make_block(values=nv, + placement=self.mgr_locs, + fastpath=True)) + + return new_blocks + + if inplace: + return [self] + + return [make_block(new_values, + placement=self.mgr_locs, fastpath=True)] + + def interpolate(self, method='pad', axis=0, index=None, + values=None, inplace=False, limit=None, + fill_value=None, coerce=False, downcast=None, **kwargs): + + def check_int_bool(self, inplace): + # Only FloatBlocks will contain NaNs. 
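+            # Integer and bool blocks cannot hold NaN, so there is nothing to
+            # interpolate for them; they are handed back as-is (or as a copy
+            # when not operating inplace) and the fill/interpolation machinery
+            # below is skipped entirely.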
+ # timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + if inplace: + return self + else: + return self.copy() + + # a fill na type method + try: + m = com._clean_fill_method(method) + except: + m = None + + if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate_with_fill(method=m, + axis=axis, + inplace=inplace, + limit=limit, + fill_value=fill_value, + coerce=coerce, + downcast=downcast) + # try an interp method + try: + m = com._clean_interp_method(method, **kwargs) + except: + m = None + + if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate(method=m, + index=index, + values=values, + axis=axis, + limit=limit, + fill_value=fill_value, + inplace=inplace, + downcast=downcast, + **kwargs) + + raise ValueError("invalid method '{0}' to interpolate.".format(method)) + + def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, + limit=None, fill_value=None, coerce=False, + downcast=None): + """ fillna but using the interpolate machinery """ + + # if we are coercing, then don't force the conversion + # if the block can't hold the type + if coerce: + if not self._can_hold_na: + if inplace: + return [self] + else: + return [self.copy()] + + fill_value = self._try_fill(fill_value) + values = self.values if inplace else self.values.copy() + values = self._try_operate(values) + values = com.interpolate_2d(values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype) + values = self._try_coerce_result(values) + + blocks = [make_block(values, + ndim=self.ndim, klass=self.__class__, + fastpath=True, placement=self.mgr_locs)] + return self._maybe_downcast(blocks, downcast) + + def _interpolate(self, method=None, index=None, values=None, + fill_value=None, axis=0, limit=None, + inplace=False, downcast=None, **kwargs): + """ interpolate using scipy wrappers """ + + data = self.values if inplace else self.values.copy() + + # only deal with floats + if not self.is_float: + if not self.is_integer: + return self + data = data.astype(np.float64) + + if fill_value is None: + fill_value = self.fill_value + + if method in ('krogh', 'piecewise_polynomial', 'pchip'): + if not index.is_monotonic: + raise ValueError("{0} interpolation requires that the " + "index be monotonic.".format(method)) + # process 1-d slices in the axis direction + + def func(x): + + # process a 1-d slice, returning it + # should the axis argument be handled below in apply_along_axis? + # i.e. 
not an arg to com.interpolate_1d + return com.interpolate_1d(index, x, method=method, limit=limit, + fill_value=fill_value, + bounds_error=False, **kwargs) + + # interp each column independently + interp_values = np.apply_along_axis(func, axis, data) + + blocks = [make_block(interp_values, + ndim=self.ndim, klass=self.__class__, + fastpath=True, placement=self.mgr_locs)] + return self._maybe_downcast(blocks, downcast) + + def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block.bb + + """ + if fill_tuple is None: + fill_value = self.fill_value + new_values = com.take_nd(self.get_values(), indexer, axis=axis, + allow_fill=False) + else: + fill_value = fill_tuple[0] + new_values = com.take_nd(self.get_values(), indexer, axis=axis, + allow_fill=True, fill_value=fill_value) + + if new_mgr_locs is None: + if axis == 0: + slc = lib.indexer_as_slice(indexer) + if slc is not None: + new_mgr_locs = self.mgr_locs[slc] + else: + new_mgr_locs = self.mgr_locs[indexer] + else: + new_mgr_locs = self.mgr_locs + + if new_values.dtype != self.dtype: + return make_block(new_values, new_mgr_locs) + else: + return self.make_block_same_class(new_values, new_mgr_locs) + + def get_values(self, dtype=None): + return self.values + + def diff(self, n): + """ return block for the diff of the values """ + new_values = com.diff(self.values, n, axis=1) + return [make_block(values=new_values, + ndim=self.ndim, fastpath=True, + placement=self.mgr_locs)] + + def shift(self, periods, axis=0): + """ shift the block by periods, possibly upcast """ + # convert integer to float if necessary. need to do a lot more than + # that, handle boolean etc also + new_values, fill_value = com._maybe_upcast(self.values) + # make sure array sent to np.roll is c_contiguous + f_ordered = new_values.flags.f_contiguous + if f_ordered: + new_values = new_values.T + axis = new_values.ndim - axis - 1 + new_values = np.roll(new_values, periods, axis=axis) + axis_indexer = [ slice(None) ] * self.ndim + if periods > 0: + axis_indexer[axis] = slice(None,periods) + else: + axis_indexer[axis] = slice(periods,None) + new_values[tuple(axis_indexer)] = fill_value + + # restore original order + if f_ordered: + new_values = new_values.T + + return [make_block(new_values, + ndim=self.ndim, fastpath=True, + placement=self.mgr_locs)] + + def eval(self, func, other, raise_on_error=True, try_cast=False): + """ + evaluate the block; return result block from the result + + Parameters + ---------- + func : how to combine self, other + other : a ndarray/object + raise_on_error : if True, raise when I can't perform the function, + False by default (and just return the data that we had coming in) + + Returns + ------- + a new block, the result of the func + """ + values = self.values + + if hasattr(other, 'reindex_axis'): + other = other.values + + # make sure that we can broadcast + is_transposed = False + if hasattr(other, 'ndim') and hasattr(values, 'ndim'): + if values.ndim != other.ndim: + is_transposed = True + else: + if values.shape == other.shape[::-1]: + is_transposed = True + elif values.shape[0] == other.shape[-1]: + is_transposed = True + else: + # this is a broadcast error heree + raise ValueError("cannot broadcast shape [%s] with block " + "values [%s]" % (values.T.shape, + other.shape)) + + transf = (lambda x: x.T) if is_transposed else (lambda x: x) + + # coerce/transpose the args if needed + values, other = self._try_coerce_args(transf(values), other) + + # get the 
result, may need to transpose the other + def get_result(other): + return self._try_coerce_result(func(values, other)) + + # error handler if we have an issue operating with the function + def handle_error(): + + if raise_on_error: + raise TypeError('Could not operate %s with block values %s' + % (repr(other), str(detail))) + else: + # return the values + result = np.empty(values.shape, dtype='O') + result.fill(np.nan) + return result + + # get the result + try: + result = get_result(other) + + # if we have an invalid shape/broadcast error + # GH4576, so raise instead of allowing to pass through + except ValueError as detail: + raise + except Exception as detail: + result = handle_error() + + # technically a broadcast error in numpy can 'work' by returning a + # boolean False + if not isinstance(result, np.ndarray): + if not isinstance(result, np.ndarray): + + # differentiate between an invalid ndarray-ndarray comparison + # and an invalid type comparison + if isinstance(values, np.ndarray) and is_list_like(other): + raise ValueError('Invalid broadcasting comparison [%s] ' + 'with block values' % repr(other)) + + raise TypeError('Could not compare [%s] with block values' + % repr(other)) + + # transpose if needed + result = transf(result) + + # try to cast if requested + if try_cast: + result = self._try_cast_result(result) + + return [make_block(result, ndim=self.ndim, + fastpath=True, placement=self.mgr_locs)] + + def where(self, other, cond, align=True, raise_on_error=True, + try_cast=False): + """ + evaluate the block; return result block(s) from the result + + Parameters + ---------- + other : a ndarray/object + cond : the condition to respect + align : boolean, perform alignment on other/cond + raise_on_error : if True, raise when I can't perform the function, + False by default (and just return the data that we had coming in) + + Returns + ------- + a new block(s), the result of the func + """ + + values = self.values + + # see if we can align other + if hasattr(other, 'reindex_axis'): + other = other.values + + # make sure that we can broadcast + is_transposed = False + if hasattr(other, 'ndim') and hasattr(values, 'ndim'): + if values.ndim != other.ndim or values.shape == other.shape[::-1]: + + # if its symmetric are ok, no reshaping needed (GH 7506) + if (values.shape[0] == np.array(values.shape)).all(): + pass + + # pseodo broadcast (its a 2d vs 1d say and where needs it in a + # specific direction) + elif (other.ndim >= 1 and values.ndim - 1 == other.ndim and + values.shape[0] != other.shape[0]): + other = _block_shape(other).T + else: + values = values.T + is_transposed = True + + # see if we can align cond + if not hasattr(cond, 'shape'): + raise ValueError( + "where must have a condition that is ndarray like") + + if hasattr(cond, 'reindex_axis'): + cond = cond.values + + # may need to undo transpose of values + if hasattr(values, 'ndim'): + if values.ndim != cond.ndim or values.shape == cond.shape[::-1]: + + values = values.T + is_transposed = not is_transposed + + + # our where function + def func(c, v, o): + if c.ravel().all(): + return v + + v, o = self._try_coerce_args(v, o) + try: + return self._try_coerce_result( + expressions.where(c, v, o, raise_on_error=True) + ) + except Exception as detail: + if raise_on_error: + raise TypeError('Could not operate [%s] with block values ' + '[%s]' % (repr(o), str(detail))) + else: + # return the values + result = np.empty(v.shape, dtype='float64') + result.fill(np.nan) + return result + + # see if we can operate on the entire 
block, or need item-by-item + # or if we are a single block (ndim == 1) + result = func(cond, values, other) + if self._can_hold_na or self.ndim == 1: + + if not isinstance(result, np.ndarray): + raise TypeError('Could not compare [%s] with block values' + % repr(other)) + + if is_transposed: + result = result.T + + # try to cast if requested + if try_cast: + result = self._try_cast_result(result) + + return make_block(result, + ndim=self.ndim, placement=self.mgr_locs) + + # might need to separate out blocks + axis = cond.ndim - 1 + cond = cond.swapaxes(axis, 0) + mask = np.array([cond[i].all() for i in range(cond.shape[0])], + dtype=bool) + + result_blocks = [] + for m in [mask, ~mask]: + if m.any(): + r = self._try_cast_result( + result.take(m.nonzero()[0], axis=axis)) + result_blocks.append(make_block(r.T, + placement=self.mgr_locs[m])) + + return result_blocks + + def equals(self, other): + if self.dtype != other.dtype or self.shape != other.shape: return False + return np.array_equal(self.values, other.values) + + +class NumericBlock(Block): + __slots__ = () + is_numeric = True + _can_hold_na = True + + +class FloatOrComplexBlock(NumericBlock): + __slots__ = () + + def equals(self, other): + if self.dtype != other.dtype or self.shape != other.shape: return False + left, right = self.values, other.values + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + +class FloatBlock(FloatOrComplexBlock): + __slots__ = () + is_float = True + _downcast_dtype = 'int64' + + def _can_hold_element(self, element): + if is_list_like(element): + element = np.array(element) + tipo = element.dtype.type + return issubclass(tipo, (np.floating, np.integer)) and not issubclass( + tipo, (np.datetime64, np.timedelta64)) + return isinstance(element, (float, int, np.float_, np.int_)) and not isinstance( + element, (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64)) + + def _try_cast(self, element): + try: + return float(element) + except: # pragma: no cover + return element + + def to_native_types(self, slicer=None, na_rep='', float_format=None, + **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + values = np.array(values, dtype=object) + mask = isnull(values) + values[mask] = na_rep + if float_format: + imask = (~mask).ravel() + values.flat[imask] = np.array( + [float_format % val for val in values.ravel()[imask]]) + return values.tolist() + + def should_store(self, value): + # when inserting a column should not coerce integers to floats + # unnecessarily + return (issubclass(value.dtype.type, np.floating) and + value.dtype == self.dtype) + + +class ComplexBlock(FloatOrComplexBlock): + __slots__ = () + is_complex = True + + def _can_hold_element(self, element): + if is_list_like(element): + element = np.array(element) + return issubclass(element.dtype.type, (np.floating, np.integer, np.complexfloating)) + return (isinstance(element, (float, int, complex, np.float_, np.int_)) and + not isinstance(bool, np.bool_)) + + def _try_cast(self, element): + try: + return complex(element) + except: # pragma: no cover + return element + + def should_store(self, value): + return issubclass(value.dtype.type, np.complexfloating) + + +class IntBlock(NumericBlock): + __slots__ = () + is_integer = True + _can_hold_na = False + + def _can_hold_element(self, element): + if is_list_like(element): + element = np.array(element) + tipo = element.dtype.type + return issubclass(tipo, np.integer) 
and not issubclass(tipo, (np.datetime64, np.timedelta64)) + return com.is_integer(element) + + def _try_cast(self, element): + try: + return int(element) + except: # pragma: no cover + return element + + def should_store(self, value): + return com.is_integer_dtype(value) and value.dtype == self.dtype + + +class TimeDeltaBlock(IntBlock): + __slots__ = () + is_timedelta = True + _can_hold_na = True + is_numeric = False + + @property + def fill_value(self): + return tslib.iNaT + + def _try_fill(self, value): + """ if we are a NaT, return the actual fill value """ + if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all(): + value = tslib.iNaT + elif isinstance(value, np.timedelta64): + pass + elif com.is_integer(value): + # coerce to seconds of timedelta + value = np.timedelta64(int(value * 1e9)) + elif isinstance(value, timedelta): + value = np.timedelta64(value) + + return value + + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments + we are going to compare vs i8, so coerce to floats + repring NaT with np.nan so nans propagate + values is always ndarray like, other may not be """ + def masker(v): + mask = isnull(v) + v = v.view('i8').astype('float64') + v[mask] = np.nan + return v + + values = masker(values) + + if _is_null_datelike_scalar(other): + other = np.nan + elif isinstance(other, np.timedelta64): + other = _coerce_scalar_to_timedelta_type(other, unit='s').item() + if other == tslib.iNaT: + other = np.nan + else: + other = masker(other) + + return values, other + + def _try_operate(self, values): + """ return a version to operate on """ + return values.view('i8') + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args / try_operate """ + if isinstance(result, np.ndarray): + mask = isnull(result) + if result.dtype.kind in ['i', 'f', 'O']: + result = result.astype('m8[ns]') + result[mask] = tslib.iNaT + elif isinstance(result, np.integer): + result = np.timedelta64(result) + return result + + def should_store(self, value): + return issubclass(value.dtype.type, np.timedelta64) + + def to_native_types(self, slicer=None, na_rep=None, **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + mask = isnull(values) + + rvalues = np.empty(values.shape, dtype=object) + if na_rep is None: + na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (~mask).ravel() + rvalues.flat[imask] = np.array([lib.repr_timedelta64(val) + for val in values.ravel()[imask]], + dtype=object) + return rvalues.tolist() + + +class BoolBlock(NumericBlock): + __slots__ = () + is_bool = True + _can_hold_na = False + + def _can_hold_element(self, element): + if is_list_like(element): + element = np.array(element) + return issubclass(element.dtype.type, np.integer) + return isinstance(element, (int, bool)) + + def _try_cast(self, element): + try: + return bool(element) + except: # pragma: no cover + return element + + def should_store(self, value): + return issubclass(value.dtype.type, np.bool_) + + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False): + to_replace_values = np.atleast_1d(to_replace) + if not np.can_cast(to_replace_values, bool): + return self + return super(BoolBlock, self).replace(to_replace, value, + inplace=inplace, filter=filter, + regex=regex) + + +class ObjectBlock(Block): + __slots__ = () + is_object = True + _can_hold_na = True + + def __init__(self, values, ndim=2, fastpath=False, + placement=None): + if 
issubclass(values.dtype.type, compat.string_types): + values = np.array(values, dtype=object) + + super(ObjectBlock, self).__init__(values, ndim=ndim, + fastpath=fastpath, + placement=placement) + + @property + def is_bool(self): + """ we can be a bool if we have only bool values but are of type + object + """ + return lib.is_bool_array(self.values.ravel()) + + def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=True, + copy=True, by_item=True): + """ attempt to coerce any object types to better types + return a copy of the block (if copy = True) + by definition we ARE an ObjectBlock!!!!! + + can return multiple blocks! + """ + + # attempt to create new type blocks + blocks = [] + if by_item and not self._is_single_block: + + for i, rl in enumerate(self.mgr_locs): + values = self.iget(i) + + values = com._possibly_convert_objects( + values.ravel(), convert_dates=convert_dates, + convert_numeric=convert_numeric, + convert_timedeltas=convert_timedeltas, + ).reshape(values.shape) + values = _block_shape(values, ndim=self.ndim) + newb = make_block(values, + ndim=self.ndim, placement=[rl]) + blocks.append(newb) + + else: + + values = com._possibly_convert_objects( + self.values.ravel(), convert_dates=convert_dates, + convert_numeric=convert_numeric + ).reshape(self.values.shape) + blocks.append(make_block(values, + ndim=self.ndim, placement=self.mgr_locs)) + + return blocks + + def set(self, locs, values, check=False): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + + # GH6026 + if check: + try: + if (self.values[locs] == values).all(): + return + except: + pass + try: + self.values[locs] = values + except (ValueError): + + # broadcasting error + # see GH6171 + new_shape = list(values.shape) + new_shape[0] = len(self.items) + self.values = np.empty(tuple(new_shape),dtype=self.dtype) + self.values.fill(np.nan) + self.values[locs] = values + + + def _maybe_downcast(self, blocks, downcast=None): + + if downcast is not None: + return blocks + + # split and convert the blocks + result_blocks = [] + for blk in blocks: + result_blocks.extend(blk.convert(convert_dates=True, + convert_numeric=False)) + return result_blocks + + def _can_hold_element(self, element): + return True + + def _try_cast(self, element): + return element + + def should_store(self, value): + return not issubclass(value.dtype.type, + (np.integer, np.floating, np.complexfloating, + np.datetime64, np.bool_)) + + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False): + blk = [self] + to_rep_is_list = com.is_list_like(to_replace) + value_is_list = com.is_list_like(value) + both_lists = to_rep_is_list and value_is_list + either_list = to_rep_is_list or value_is_list + + if not either_list and com.is_re(to_replace): + blk[0], = blk[0]._replace_single(to_replace, value, + inplace=inplace, filter=filter, + regex=True) + elif not (either_list or regex): + blk = super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, regex=regex) + elif both_lists: + for to_rep, v in zip(to_replace, value): + blk[0], = blk[0]._replace_single(to_rep, v, inplace=inplace, + filter=filter, regex=regex) + elif to_rep_is_list and regex: + for to_rep in to_replace: + blk[0], = blk[0]._replace_single(to_rep, value, + inplace=inplace, + filter=filter, regex=regex) + else: + blk[0], = blk[0]._replace_single(to_replace, value, + inplace=inplace, filter=filter, + regex=regex) + return blk + + def _replace_single(self, to_replace, value, 
inplace=False, filter=None, + regex=False): + # to_replace is regex compilable + to_rep_re = regex and com.is_re_compilable(to_replace) + + # regex is regex compilable + regex_re = com.is_re_compilable(regex) + + # only one will survive + if to_rep_re and regex_re: + raise AssertionError('only one of to_replace and regex can be ' + 'regex compilable') + + # if regex was passed as something that can be a regex (rather than a + # boolean) + if regex_re: + to_replace = regex + + regex = regex_re or to_rep_re + + # try to get the pattern attribute (compiled re) or it's a string + try: + pattern = to_replace.pattern + except AttributeError: + pattern = to_replace + + # if the pattern is not empty and to_replace is either a string or a + # regex + if regex and pattern: + rx = re.compile(to_replace) + else: + # if the thing to replace is not a string or compiled regex call + # the superclass method -> to_replace is some kind of object + result = super(ObjectBlock, self).replace(to_replace, value, + inplace=inplace, + filter=filter, + regex=regex) + if not isinstance(result, list): + result = [result] + return result + + new_values = self.values if inplace else self.values.copy() + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isnull(value) or not isinstance(value, compat.string_types): + def re_replacer(s): + try: + return value if rx.search(s) is not None else s + except TypeError: + return s + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + try: + return rx.sub(value, s) + except TypeError: + return s + + f = np.vectorize(re_replacer, otypes=[self.dtype]) + + if filter is None: + filt = slice(None) + else: + filt = self.mgr_locs.isin(filter).nonzero()[0] + + new_values[filt] = f(new_values[filt]) + + return [self if inplace else + make_block(new_values, + fastpath=True, placement=self.mgr_locs)] + + +class DatetimeBlock(Block): + __slots__ = () + is_datetime = True + _can_hold_na = True + + def __init__(self, values, placement, + fastpath=False, **kwargs): + if values.dtype != _NS_DTYPE: + values = tslib.cast_to_nanoseconds(values) + + super(DatetimeBlock, self).__init__(values, + fastpath=True, placement=placement, + **kwargs) + + def _can_hold_element(self, element): + if is_list_like(element): + element = np.array(element) + return element.dtype == _NS_DTYPE or element.dtype == np.int64 + return (com.is_integer(element) or + isinstance(element, datetime) or + isnull(element)) + + def _try_cast(self, element): + try: + return int(element) + except: + return element + + def _try_operate(self, values): + """ return a version to operate on """ + return values.view('i8') + + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments + we are going to compare vs i8, so coerce to integer + values is always ndarra like, other may not be """ + values = values.view('i8') + if _is_null_datelike_scalar(other): + other = tslib.iNaT + elif isinstance(other, datetime): + other = lib.Timestamp(other).asm8.view('i8') + else: + other = other.view('i8') + + return values, other + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + if isinstance(result, np.ndarray): + if result.dtype == 'i8': + result = tslib.array_to_datetime( + result.astype(object).ravel()).reshape(result.shape) + elif result.dtype.kind in ['i', 'f', 'O']: + result = result.astype('M8[ns]') + elif 
isinstance(result, (np.integer, np.datetime64)): + result = lib.Timestamp(result) + return result + + @property + def fill_value(self): + return tslib.iNaT + + def _try_fill(self, value): + """ if we are a NaT, return the actual fill value """ + if isinstance(value, type(tslib.NaT)) or np.array(isnull(value)).all(): + value = tslib.iNaT + return value + + def fillna(self, value, limit=None, + inplace=False, downcast=None): + + # straight putmask here + values = self.values if inplace else self.values.copy() + mask = isnull(self.values) + value = self._try_fill(value) + if limit is not None: + if self.ndim > 2: + raise NotImplementedError + mask[mask.cumsum(self.ndim-1)>limit]=False + + np.putmask(values, mask, value) + return [self if inplace else + make_block(values, + fastpath=True, placement=self.mgr_locs)] + + def to_native_types(self, slicer=None, na_rep=None, date_format=None, + **kwargs): + """ convert to our native types format, slicing if desired """ + + values = self.values + if slicer is not None: + values = values[:, slicer] + mask = isnull(values) + + rvalues = np.empty(values.shape, dtype=object) + if na_rep is None: + na_rep = 'NaT' + rvalues[mask] = na_rep + imask = (~mask).ravel() + + if date_format is None: + date_formatter = lambda x: Timestamp(x)._repr_base + else: + date_formatter = lambda x: Timestamp(x).strftime(date_format) + + rvalues.flat[imask] = np.array([date_formatter(val) for val in + values.ravel()[imask]], dtype=object) + + return rvalues.tolist() + + def should_store(self, value): + return issubclass(value.dtype.type, np.datetime64) + + def astype(self, dtype, copy=False, raise_on_error=True): + """ + handle convert to object as a special case + """ + klass = None + if np.dtype(dtype).type == np.object_: + klass = ObjectBlock + return self._astype(dtype, copy=copy, raise_on_error=raise_on_error, + klass=klass) + + def set(self, locs, values, check=False): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + if values.dtype != _NS_DTYPE: + # Workaround for numpy 1.6 bug + values = tslib.cast_to_nanoseconds(values) + + self.values[locs] = values + + def get_values(self, dtype=None): + # return object dtype as Timestamps + if dtype == object: + return lib.map_infer(self.values.ravel(), lib.Timestamp)\ + .reshape(self.values.shape) + return self.values + + +class SparseBlock(Block): + """ implement as a list of sparse arrays of the same dtype """ + __slots__ = () + is_sparse = True + is_numeric = True + _can_hold_na = True + _can_consolidate = False + _verify_integrity = False + _ftype = 'sparse' + + def __init__(self, values, placement, + ndim=None, fastpath=False,): + + # Placement must be converted to BlockPlacement via property setter + # before ndim logic, because placement may be a slice which doesn't + # have a length. 
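+        # For example (hypothetical input): a placement of slice(3, 4) has no
+        # len(); assigning through the mgr_locs property setter wraps it in a
+        # BlockPlacement, which does expose a length, so the ndim inference
+        # just below can rely on len(self.mgr_locs).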
+ self.mgr_locs = placement + + # kludgetastic + if ndim is None: + if len(self.mgr_locs) != 1: + ndim = 1 + else: + ndim = 2 + self.ndim = ndim + + if not isinstance(values, SparseArray): + raise TypeError("values must be SparseArray") + + self.values = values + + @property + def shape(self): + return (len(self.mgr_locs), self.sp_index.length) + + @property + def itemsize(self): + return self.dtype.itemsize + + @property + def fill_value(self): + #return np.nan + return self.values.fill_value + + @fill_value.setter + def fill_value(self, v): + # we may need to upcast our fill to match our dtype + if issubclass(self.dtype.type, np.floating): + v = float(v) + self.values.fill_value = v + + @property + def sp_values(self): + return self.values.sp_values + + @sp_values.setter + def sp_values(self, v): + # reset the sparse values + self.values = SparseArray(v, sparse_index=self.sp_index, + kind=self.kind, dtype=v.dtype, + fill_value=self.values.fill_value, + copy=False) + + def iget(self, col): + if col != 0: + raise IndexError("SparseBlock only contains one item") + return self.values + + @property + def sp_index(self): + return self.values.sp_index + + @property + def kind(self): + return self.values.kind + + def __len__(self): + try: + return self.sp_index.length + except: + return 0 + + def should_store(self, value): + return isinstance(value, SparseArray) + + def set(self, locs, values, check=False): + assert locs.tolist() == [0] + self.values = values + + def get(self, item): + if self.ndim == 1: + loc = self.items.get_loc(item) + return self.values[loc] + else: + return self.values + + def _slice(self, slicer): + """ return a slice of my values (but densify first) """ + return self.get_values()[slicer] + + def get_values(self, dtype=None): + """ need to to_dense myself (and always return a ndim sized object) """ + values = self.values.to_dense() + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def copy(self, deep=True): + return self.make_block_same_class(values=self.values, + sparse_index=self.sp_index, + kind=self.kind, copy=deep, + placement=self.mgr_locs) + + def make_block_same_class(self, values, placement, + sparse_index=None, kind=None, dtype=None, + fill_value=None, copy=False, fastpath=True): + """ return a new block """ + if dtype is None: + dtype = self.dtype + if fill_value is None: + fill_value = self.values.fill_value + + # if not isinstance(values, SparseArray) and values.ndim != self.ndim: + # raise ValueError("ndim mismatch") + + if values.ndim == 2: + nitems = values.shape[0] + + if nitems == 0: + # kludgy, but SparseBlocks cannot handle slices, where the + # output is 0-item, so let's convert it to a dense block: it + # won't take space since there's 0 items, plus it will preserve + # the dtype. 
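+                # Shape handling sketch (hypothetical shapes, for orientation only):
+                #   values.shape == (0, 5) -> empty dense block of the same dtype (here)
+                #   values.shape == (1, 5) -> reshaped to (5,) and wrapped in a SparseArray
+                #   values.shape == (2, 5) -> ValueError, sparse blocks hold a single item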
+ return make_block(np.empty(values.shape, dtype=dtype), + placement, fastpath=True,) + elif nitems > 1: + raise ValueError("Only 1-item 2d sparse blocks are supported") + else: + values = values.reshape(values.shape[1]) + + new_values = SparseArray(values, sparse_index=sparse_index, + kind=kind or self.kind, dtype=dtype, + fill_value=fill_value, copy=copy) + return make_block(new_values, ndim=self.ndim, + fastpath=fastpath, placement=placement) + + def interpolate(self, method='pad', axis=0, inplace=False, + limit=None, fill_value=None, **kwargs): + + values = com.interpolate_2d( + self.values.to_dense(), method, axis, limit, fill_value) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) + + def fillna(self, value, limit=None, inplace=False, downcast=None): + # we may need to upcast our fill to match our dtype + if limit is not None: + raise NotImplementedError + if issubclass(self.dtype.type, np.floating): + value = float(value) + values = self.values if inplace else self.values.copy() + return [self.make_block_same_class(values=values.get_values(value), + fill_value=value, + placement=self.mgr_locs)] + + def shift(self, periods, axis=0): + """ shift the block by periods """ + N = len(self.values.T) + indexer = np.zeros(N, dtype=int) + if periods > 0: + indexer[periods:] = np.arange(N - periods) + else: + indexer[:periods] = np.arange(-periods, N) + new_values = self.values.to_dense().take(indexer) + # convert integer to float if necessary. need to do a lot more than + # that, handle boolean etc also + new_values, fill_value = com._maybe_upcast(new_values) + if periods > 0: + new_values[:periods] = fill_value + else: + new_values[periods:] = fill_value + return [self.make_block_same_class(new_values, placement=self.mgr_locs)] + + def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, + limit=None, mask_info=None): + """ + Reindex using pre-computed indexer information + """ + if axis < 1: + raise AssertionError('axis must be at least 1, got %d' % axis) + + # taking on the 0th axis always here + if fill_value is None: + fill_value = self.fill_value + return self.make_block_same_class(self.values.take(indexer), + fill_value=fill_value, + placement=self.mgr_locs) + + def sparse_reindex(self, new_index): + """ sparse reindex and return a new block + current reindex only works for float64 dtype! 
""" + values = self.values + values = values.sp_index.to_int_index().reindex( + values.sp_values.astype('float64'), values.fill_value, new_index) + return self.make_block_same_class(values, sparse_index=new_index, + placement=self.mgr_locs) + + def _try_cast_result(self, result, dtype=None): + return result + + +def make_block(values, placement, klass=None, ndim=None, + dtype=None, fastpath=False): + if klass is None: + dtype = dtype or values.dtype + vtype = dtype.type + + if isinstance(values, SparseArray): + klass = SparseBlock + elif issubclass(vtype, np.floating): + klass = FloatBlock + elif (issubclass(vtype, np.integer) and + issubclass(vtype, np.timedelta64)): + klass = TimeDeltaBlock + elif (issubclass(vtype, np.integer) and + not issubclass(vtype, np.datetime64)): + klass = IntBlock + elif dtype == np.bool_: + klass = BoolBlock + elif issubclass(vtype, np.datetime64): + klass = DatetimeBlock + elif issubclass(vtype, np.complexfloating): + klass = ComplexBlock + + else: + + # we want to infer here if its a datetimelike if its object type + # this is pretty strict in that it requires a datetime/timedelta + # value IN addition to possible nulls/strings + # an array of ONLY strings will not be inferred + if np.prod(values.shape): + result = _possibly_infer_to_datetimelike(values) + vtype = result.dtype.type + if issubclass(vtype, np.datetime64): + klass = DatetimeBlock + values = result + elif (issubclass(vtype, np.timedelta64)): + klass = TimeDeltaBlock + values = result + + if klass is None: + klass = ObjectBlock + + return klass(values, ndim=ndim, fastpath=fastpath, + placement=placement) + + +# TODO: flexible with index=None and/or items=None + + +class BlockManager(PandasObject): + + """ + Core internal data structure to implement DataFrame + + Manage a bunch of labeled 2D mixed-type ndarrays. 
Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) + + get_dtype_counts + get_ftype_counts + get_dtypes + get_ftypes + + apply(func, axes, block_filter_fn) + + get_bool_data + get_numeric_data + + get_slice(slice_like, axis) + get(label) + iget(loc) + get_scalar(label_tup) + + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) + + Parameters + ---------- + + + Notes + ----- + This is *not* a public API class + """ + __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', + '_is_consolidated', '_blknos', '_blklocs'] + + def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True): + self.axes = [_ensure_index(ax) for ax in axes] + self.blocks = tuple(blocks) + + for block in blocks: + if block.is_sparse: + if len(block.mgr_locs) != 1: + raise AssertionError("Sparse block refers to multiple items") + else: + if self.ndim != block.ndim: + raise AssertionError(('Number of Block dimensions (%d) must ' + 'equal number of axes (%d)') + % (block.ndim, self.ndim)) + + if do_integrity_check: + self._verify_integrity() + + self._consolidate_check() + + self._rebuild_blknos_and_blklocs() + + def make_empty(self, axes=None): + """ return an empty BlockManager with the items axis of len 0 """ + if axes is None: + axes = [_ensure_index([])] + [ + _ensure_index(a) for a in self.axes[1:] + ] + + # preserve dtype if possible + if self.ndim == 1: + blocks = np.array([], dtype=self.dtype) + else: + blocks = [] + return self.__class__(blocks, axes) + + def __nonzero__(self): + return True + + # Python3 compat + __bool__ = __nonzero__ + + @property + def shape(self): + return tuple(len(ax) for ax in self.axes) + + @property + def ndim(self): + return len(self.axes) + + def set_axis(self, axis, new_labels): + new_labels = _ensure_index(new_labels) + old_len = len(self.axes[axis]) + new_len = len(new_labels) + + if new_len != old_len: + raise ValueError('Length mismatch: Expected axis has %d elements, ' + 'new values have %d elements' % (old_len, new_len)) + + self.axes[axis] = new_labels + + def rename_axis(self, mapper, axis, copy=True): + """ + Rename one of axes. + + Parameters + ---------- + mapper : unary callable + axis : int + copy : boolean, default True + + """ + obj = self.copy(deep=copy) + obj.set_axis(axis, _transform_index(self.axes[axis], mapper)) + return obj + + def add_prefix(self, prefix): + f = (str(prefix) + '%s').__mod__ + return self.rename_axis(f, axis=0) + + def add_suffix(self, suffix): + f = ('%s' + str(suffix)).__mod__ + return self.rename_axis(f, axis=0) + + @property + def _is_single_block(self): + if self.ndim == 1: + return True + + if len(self.blocks) != 1: + return False + + blk = self.blocks[0] + return (blk.mgr_locs.is_slice_like and + blk.mgr_locs.as_slice == slice(0, len(self), 1)) + + def _rebuild_blknos_and_blklocs(self): + """ + Update mgr._blknos / mgr._blklocs. 
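+
+        Editor's note (an illustrative aside, not part of the original
+        source): _blknos[i] is the position in self.blocks of the block
+        holding item i, and _blklocs[i] is that item's row within the
+        block's values. For example, with two blocks whose mgr_locs are
+        [0, 2] and [1], the loop below yields _blknos == [0, 1, 0] and
+        _blklocs == [0, 0, 1].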
+ """ + new_blknos = np.empty(self.shape[0], dtype=np.int64) + new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos.fill(-1) + new_blklocs.fill(-1) + + for blkno, blk in enumerate(self.blocks): + rl = blk.mgr_locs + new_blknos[rl.indexer] = blkno + new_blklocs[rl.indexer] = np.arange(len(rl)) + + if (new_blknos == -1).any(): + raise AssertionError("Gaps in blk ref_locs") + + self._blknos = new_blknos + self._blklocs = new_blklocs + + # make items read only for now + def _get_items(self): + return self.axes[0] + items = property(fget=_get_items) + + def _get_counts(self, f): + """ return a dict of the counts of the function in BlockManager """ + self._consolidate_inplace() + counts = dict() + for b in self.blocks: + v = f(b) + counts[v] = counts.get(v, 0) + b.shape[0] + return counts + + def get_dtype_counts(self): + return self._get_counts(lambda b: b.dtype.name) + + def get_ftype_counts(self): + return self._get_counts(lambda b: b.ftype) + + def get_dtypes(self): + dtypes = np.array([blk.dtype for blk in self.blocks]) + return com.take_1d(dtypes, self._blknos, allow_fill=False) + + def get_ftypes(self): + ftypes = np.array([blk.ftype for blk in self.blocks]) + return com.take_1d(ftypes, self._blknos, allow_fill=False) + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] + axes_array = [ax for ax in self.axes] + + extra_state = { + '0.14.1': { + 'axes': axes_array, + 'blocks': [dict(values=b.values, + mgr_locs=b.mgr_locs.indexer) + for b in self.blocks] + } + } + + # First three elements of the state are to maintain forward + # compatibility with 0.13.1. + return axes_array, block_values, block_items, extra_state + + def __setstate__(self, state): + def unpickle_block(values, mgr_locs): + # numpy < 1.7 pickle compat + if values.dtype == 'M8[us]': + values = values.astype('M8[ns]') + return make_block(values, placement=mgr_locs) + + if (isinstance(state, tuple) and len(state) >= 4 + and '0.14.1' in state[3]): + state = state[3]['0.14.1'] + self.axes = [_ensure_index(ax) for ax in state['axes']] + self.blocks = tuple( + unpickle_block(b['values'], b['mgr_locs']) + for b in state['blocks']) + else: + # discard anything after 3rd, support beta pickling format for a + # little while longer + ax_arrays, bvalues, bitems = state[:3] + + self.axes = [_ensure_index(ax) for ax in ax_arrays] + self.blocks = tuple( + unpickle_block(values, + self.axes[0].get_indexer(items)) + for values, items in zip(bvalues, bitems)) + + self._post_setstate() + + def _post_setstate(self): + self._is_consolidated = False + self._known_consolidated = False + self._rebuild_blknos_and_blklocs() + + def __len__(self): + return len(self.items) + + def __unicode__(self): + output = com.pprint_thing(self.__class__.__name__) + for i, ax in enumerate(self.axes): + if i == 0: + output += u('\nItems: %s') % ax + else: + output += u('\nAxis %d: %s') % (i, ax) + + for block in self.blocks: + output += u('\n%s') % com.pprint_thing(block) + return output + + def _verify_integrity(self): + mgr_shape = self.shape + tot_items = sum(len(x.mgr_locs) for x in self.blocks) + for block in self.blocks: + if not block.is_sparse and block.shape[1:] != mgr_shape[1:]: + construction_error(tot_items, block.shape[1:], self.axes) + if len(self.items) != tot_items: + raise AssertionError('Number of manager items must equal union of ' + 'block items\n# manager items: {0}, # ' + 'tot_items: {1}'.format(len(self.items), + tot_items)) + + def 
apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs): + """ + iterate over the blocks, collect and create a new block manager + + Parameters + ---------- + f : the callable or function name to operate on at the block level + axes : optional (if not supplied, use self.axes) + filter : list, if supplied, only call the block if the filter is in + the block + do_integrity_check : boolean, default False. Do the block manager integrity check + + Returns + ------- + Block Manager (new object) + + """ + + result_blocks = [] + + # filter kwarg is used in replace-* family of methods + if filter is not None: + filter_locs = set(self.items.get_indexer_for(filter)) + if len(filter_locs) == len(self.items): + # All items are included, as if there were no filtering + filter = None + else: + kwargs['filter'] = filter_locs + + if f == 'where' and kwargs.get('align', True): + align_copy = True + align_keys = ['other', 'cond'] + elif f == 'putmask' and kwargs.get('align', True): + align_copy = False + align_keys = ['new', 'mask'] + elif f == 'eval': + align_copy = False + align_keys = ['other'] + elif f == 'fillna': + # fillna internally does putmask, maybe it's better to do this + # at mgr, not block level? + align_copy = False + align_keys = ['value'] + else: + align_keys = [] + + aligned_args = dict((k, kwargs[k]) for k in align_keys + if hasattr(kwargs[k], 'reindex_axis')) + + for b in self.blocks: + if filter is not None: + if not b.mgr_locs.isin(filter_locs).any(): + result_blocks.append(b) + continue + + if aligned_args: + b_items = self.items[b.mgr_locs.indexer] + + for k, obj in aligned_args.items(): + axis = getattr(obj, '_info_axis_number', 0) + kwargs[k] = obj.reindex_axis(b_items, axis=axis, + copy=align_copy) + + applied = getattr(b, f)(**kwargs) + + if isinstance(applied, list): + result_blocks.extend(applied) + else: + result_blocks.append(applied) + + if len(result_blocks) == 0: + return self.make_empty(axes or self.axes) + bm = self.__class__(result_blocks, axes or self.axes, + do_integrity_check=do_integrity_check) + bm._consolidate_inplace() + return bm + + def isnull(self, **kwargs): + return self.apply('apply', **kwargs) + + def where(self, **kwargs): + return self.apply('where', **kwargs) + + def eval(self, **kwargs): + return self.apply('eval', **kwargs) + + def setitem(self, **kwargs): + return self.apply('setitem', **kwargs) + + def putmask(self, **kwargs): + return self.apply('putmask', **kwargs) + + def diff(self, **kwargs): + return self.apply('diff', **kwargs) + + def interpolate(self, **kwargs): + return self.apply('interpolate', **kwargs) + + def shift(self, **kwargs): + return self.apply('shift', **kwargs) + + def fillna(self, **kwargs): + return self.apply('fillna', **kwargs) + + def downcast(self, **kwargs): + return self.apply('downcast', **kwargs) + + def astype(self, dtype, **kwargs): + return self.apply('astype', dtype=dtype, **kwargs) + + def convert(self, **kwargs): + return self.apply('convert', **kwargs) + + def replace(self, **kwargs): + return self.apply('replace', **kwargs) + + def replace_list(self, src_list, dest_list, inplace=False, regex=False): + """ do a list replace """ + + # figure out our mask a-priori to avoid repeated replacements + values = self.as_matrix() + + def comp(s): + if isnull(s): + return isnull(values) + return _possibly_compare(values, getattr(s, 'asm8', s), + operator.eq) + masks = [comp(s) for i, s in enumerate(src_list)] + + result_blocks = [] + for blk in self.blocks: + + # its possible to get multiple result 
blocks here + # replace ALWAYS will return a list + rb = [blk if inplace else blk.copy()] + for i, (s, d) in enumerate(zip(src_list, dest_list)): + new_rb = [] + for b in rb: + if b.dtype == np.object_: + result = b.replace(s, d, inplace=inplace, + regex=regex) + if isinstance(result, list): + new_rb.extend(result) + else: + new_rb.append(result) + else: + # get our mask for this element, sized to this + # particular block + m = masks[i][b.mgr_locs.indexer] + if m.any(): + new_rb.extend(b.putmask(m, d, inplace=True)) + else: + new_rb.append(b) + rb = new_rb + result_blocks.extend(rb) + + bm = self.__class__(result_blocks, self.axes) + bm._consolidate_inplace() + return bm + + def is_consolidated(self): + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self): + ftypes = [blk.ftype for blk in self.blocks] + self._is_consolidated = len(ftypes) == len(set(ftypes)) + self._known_consolidated = True + + @property + def is_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return len(self.blocks) > 1 + + @property + def is_numeric_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return all([block.is_numeric for block in self.blocks]) + + @property + def is_datelike_mixed_type(self): + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return any([block.is_datelike for block in self.blocks]) + + @property + def is_view(self): + """ return a boolean if we are a single block and are a view """ + if len(self.blocks) == 1: + return self.blocks[0].values.base is not None + return False + + def get_bool_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_bool], copy) + + def get_numeric_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_numeric], copy) + + def combine(self, blocks, copy=True): + """ return a new manager with the blocks """ + if len(blocks) == 0: + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) + new_items = self.items.take(indexer) + + new_blocks = [] + for b in blocks: + b = b.copy(deep=copy) + b.mgr_locs = com.take_1d(inv_indexer, b.mgr_locs.as_array, axis=0, + allow_fill=False) + new_blocks.append(b) + + new_axes = list(self.axes) + new_axes[0] = new_items + return self.__class__(new_blocks, new_axes, do_integrity_check=False) + + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0(slobj) + else: + slicer = [slice(None)] * (axis + 1) + slicer[axis] = slobj + slicer = tuple(slicer) + new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] + + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis][slobj] + + bm = self.__class__(new_blocks, new_axes, do_integrity_check=False, + fastpath=True) + bm._consolidate_inplace() + return bm + + def __contains__(self, item): + return item in self.items + + @property + 
def nblocks(self): + return len(self.blocks) + + def copy(self, deep=True): + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : boolean, default True + If False, return shallow copy (do not copy data) + + Returns + ------- + copy : BlockManager + """ + if deep: + new_axes = [ax.view() for ax in self.axes] + else: + new_axes = list(self.axes) + return self.apply('copy', axes=new_axes, deep=deep, + do_integrity_check=False) + + def as_matrix(self, items=None): + if len(self.blocks) == 0: + return np.empty(self.shape, dtype=float) + + if items is not None: + mgr = self.reindex_axis(items, axis=0) + else: + mgr = self + + if self._is_single_block: + return mgr.blocks[0].get_values() + else: + return mgr._interleave() + + def _interleave(self): + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + dtype = _interleaved_dtype(self.blocks) + + result = np.empty(self.shape, dtype=dtype) + + if result.shape[0] == 0: + # Workaround for numpy 1.7 bug: + # + # >>> a = np.empty((0,10)) + # >>> a[slice(0,0)] + # array([], shape=(0, 10), dtype=float64) + # >>> a[[]] + # Traceback (most recent call last): + # File "", line 1, in + # IndexError: index 0 is out of bounds for axis 0 with size 0 + return result + + itemmask = np.zeros(self.shape[0]) + + for blk in self.blocks: + rl = blk.mgr_locs + result[rl.indexer] = blk.get_values(dtype) + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError('Some items were not contained in blocks') + + return result + + def xs(self, key, axis=1, copy=True, takeable=False): + if axis < 1: + raise AssertionError('Can only take xs across axis >= 1, got %d' + % axis) + + # take by position + if takeable: + loc = key + else: + loc = self.axes[axis].get_loc(key) + + slicer = [slice(None, None) for _ in range(self.ndim)] + slicer[axis] = loc + slicer = tuple(slicer) + + new_axes = list(self.axes) + + # could be an array indexer! 
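+        # Editor's note (illustrative, not in the original source): when
+        # the looked-up label is duplicated, Index.get_loc returns a slice
+        # or boolean array rather than an int, e.g. for labels
+        # ['a', 'a', 'b'], get_loc('a') selects both positions, so the
+        # branch below keeps a shortened axis instead of popping it.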
+ if isinstance(loc, (slice, np.ndarray)): + new_axes[axis] = new_axes[axis][loc] + else: + new_axes.pop(axis) + + new_blocks = [] + if len(self.blocks) > 1: + # we must copy here as we are mixed type + for blk in self.blocks: + newb = make_block(values=blk.values[slicer], + klass=blk.__class__, fastpath=True, + placement=blk.mgr_locs) + new_blocks.append(newb) + elif len(self.blocks) == 1: + block = self.blocks[0] + vals = block.values[slicer] + if copy: + vals = vals.copy() + new_blocks = [make_block(values=vals, placement=block.mgr_locs, + klass=block.__class__, fastpath=True,)] + + return self.__class__(new_blocks, new_axes) + + def fast_xs(self, loc): + """ + get a cross sectional for a given location in the + items ; handle dups + + return the result, is *could* be a view in the case of a + single block + """ + if len(self.blocks) == 1: + return self.blocks[0].values[:, loc] + + items = self.items + + # non-unique (GH4726) + if not items.is_unique: + result = self._interleave() + if self.ndim == 2: + result = result.T + return result[loc] + + # unique + dtype = _interleaved_dtype(self.blocks) + n = len(items) + result = np.empty(n, dtype=dtype) + for blk in self.blocks: + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk._try_coerce_result(blk.iget((i, loc))) + + return result + + def consolidate(self): + """ + Join together blocks having same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + bm = self.__class__(self.blocks, self.axes) + bm._consolidate_inplace() + return bm + + def _consolidate_inplace(self): + if not self.is_consolidated(): + self.blocks = tuple(_consolidate(self.blocks)) + + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + + def get(self, item, fastpath=True): + """ + Return values for selected item (ndarray or BlockManager). + """ + if self.items.is_unique: + + if not isnull(item): + loc = self.items.get_loc(item) + else: + indexer = np.arange(len(self.items))[isnull(self.items)] + + # allow a single nan location indexer + if not np.isscalar(indexer): + if len(indexer) == 1: + loc = indexer.item() + else: + raise ValueError("cannot label index with a null key") + + return self.iget(loc, fastpath=fastpath) + else: + + if isnull(item): + raise ValueError("cannot label index with a null key") + + indexer = self.items.get_indexer_for([item]) + return self.reindex_indexer(new_axis=self.items[indexer], + indexer=indexer, axis=0, allow_dups=True) + + def iget(self, i, fastpath=True): + """ + Return the data as a SingleBlockManager if fastpath=True and possible + + Otherwise return as a ndarray + + """ + + block = self.blocks[self._blknos[i]] + values = block.iget(self._blklocs[i]) + if not fastpath or block.is_sparse or values.ndim != 1: + return values + + # fastpath shortcut for select a single-dim from a 2-dim BM + return SingleBlockManager([ block.make_block_same_class(values, + placement=slice(0, len(values)), + fastpath=True) ], + self.axes[1]) + + + def get_scalar(self, tup): + """ + Retrieve single item + """ + full_loc = list(ax.get_loc(x) + for ax, x in zip(self.axes, tup)) + blk = self.blocks[self._blknos[full_loc[0]]] + full_loc[0] = self._blklocs[full_loc[0]] + + # FIXME: this may return non-upcasted types? + return blk.values[tuple(full_loc)] + + def delete(self, item): + """ + Delete selected item (items if non-unique) in-place. 
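+
+        Editor's note (an illustrative aside, not part of the original
+        source): blocks that lose all of their items are dropped outright,
+        and the mgr_locs of the surviving blocks are shifted left by the
+        number of deleted positions before them (ref_loc_offset below).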
+ """ + indexer = self.items.get_loc(item) + + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + ref_loc_offset = -is_deleted.cumsum() + + is_blk_deleted = [False] * len(self.blocks) + + if isinstance(indexer, int): + affected_start = indexer + else: + affected_start = is_deleted.nonzero()[0][0] + + for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + blk = self.blocks[blkno] + bml = blk.mgr_locs + blk_del = is_deleted[bml.indexer].nonzero()[0] + + if len(blk_del) == len(bml): + is_blk_deleted[blkno] = True + continue + elif len(blk_del) != 0: + blk.delete(blk_del) + bml = blk.mgr_locs + + blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) + + # FIXME: use Index.delete as soon as it uses fastpath=True + self.axes[0] = self.items[~is_deleted] + self.blocks = tuple(b for blkno, b in enumerate(self.blocks) + if not is_blk_deleted[blkno]) + self._shape = None + self._rebuild_blknos_and_blklocs() + + def set(self, item, value, check=False): + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + if check, then validate that we are not setting the same data in-place + """ + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + value_is_sparse = isinstance(value, SparseArray) + + if value_is_sparse: + assert self.ndim == 2 + + def value_getitem(placement): + return value + else: + if value.ndim == self.ndim - 1: + value = value.reshape((1,) + value.shape) + + def value_getitem(placement): + return value + else: + def value_getitem(placement): + return value[placement.indexer] + if value.shape[1:] != self.shape[1:]: + raise AssertionError('Shape of new values must be compatible ' + 'with manager shape') + + try: + loc = self.items.get_loc(item) + except KeyError: + # This item wasn't present, just insert at end + self.insert(len(self.items), item, value) + return + + if isinstance(loc, int): + loc = [loc] + + blknos = self._blknos[loc] + blklocs = self._blklocs[loc] + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + blk = self.blocks[blkno] + blk_locs = blklocs[val_locs.indexer] + if blk.should_store(value): + blk.set(blk_locs, value_getitem(val_locs), check=check) + else: + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) + + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) + else: + self._blklocs[blk.mgr_locs.indexer] = -1 + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.int64) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - + len(removed_blknos)) + self._blknos = com.take_1d(new_blknos, self._blknos, axis=0, + allow_fill=False) + self.blocks = tuple(blk for i, blk in enumerate(self.blocks) + if i not in set(removed_blknos)) + + if unfit_val_locs: + unfit_mgr_locs = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_mgr_locs) + + new_blocks = [] + if value_is_sparse: + # This code (ab-)uses the fact that sparse blocks contain only + # one item. 
+ new_blocks.extend( + make_block(values=value.copy(), ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1)) + for mgr_loc in unfit_mgr_locs) + + self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + + len(self.blocks)) + self._blklocs[unfit_mgr_locs] = 0 + + else: + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) + + new_blocks.append( + make_block(values=value_getitem(unfit_val_items), + ndim=self.ndim, placement=unfit_mgr_locs)) + + self._blknos[unfit_mgr_locs] = len(self.blocks) + self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) + + self.blocks += tuple(new_blocks) + + # Newly created block's dtype may already be present. + self._known_consolidated = False + + def insert(self, loc, item, value, allow_duplicates=False): + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ + if not allow_duplicates and item in self.items: + # Should this be a different kind of error?? + raise ValueError('cannot insert %s, already exists' % item) + + if not isinstance(loc, int): + raise TypeError("loc must be int") + + block = make_block(values=value, + ndim=self.ndim, + placement=slice(loc, loc+1)) + + for blkno, count in _fast_count_smallints(self._blknos[loc:]): + blk = self.blocks[blkno] + if count == len(blk.mgr_locs): + blk.mgr_locs = blk.mgr_locs.add(1) + else: + new_mgr_locs = blk.mgr_locs.as_array.copy() + new_mgr_locs[new_mgr_locs >= loc] += 1 + blk.mgr_locs = new_mgr_locs + + if loc == self._blklocs.shape[0]: + # np.append is a lot faster (at least in numpy 1.7.1), let's use it + # if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + else: + self._blklocs = np.insert(self._blklocs, loc, 0) + self._blknos = np.insert(self._blknos, loc, len(self.blocks)) + + self.axes[0] = self.items.insert(loc, item) + + self.blocks += (block,) + self._shape = None + + self._known_consolidated = False + + if len(self.blocks) > 100: + self._consolidate_inplace() + + def reindex_axis(self, new_index, axis, method=None, limit=None, + fill_value=None, copy=True): + """ + Conform block manager to new index. + """ + new_index = _ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit, copy_if_needed=True) + + return self.reindex_indexer(new_index, indexer, axis=axis, + fill_value=fill_value, copy=copy) + + def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, + allow_dups=False, copy=True): + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object + allow_dups : bool + + pandas-indexer with -1's only. 
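+
+        Editor's note (an illustrative aside, not part of the original
+        source): the indexer is positional, and entries equal to -1 mark
+        locations to be filled rather than taken. E.g. indexer [1, -1, 0]
+        along axis=0 takes items 1 and 0 and creates an all-NA block (or
+        one filled with fill_value) for the middle position via
+        _slice_take_blocks_ax0 / _make_na_block.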
+ """ + + if indexer is None: + if new_axis is self.axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + self._consolidate_inplace() + + # trying to reindex on an axis with duplicates + if (not allow_dups and not self.axes[axis].is_unique + and len(indexer)): + raise ValueError("cannot reindex from a duplicate axis") + + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + if axis == 0: + new_blocks = self._slice_take_blocks_ax0( + indexer, fill_tuple=(fill_value,)) + else: + new_blocks = [blk.take_nd(indexer, axis=axis, + fill_tuple=(fill_value if fill_value is not None else + blk.fill_value,)) + for blk in self.blocks] + + new_axes = list(self.axes) + new_axes[axis] = new_axis + return self.__class__(new_blocks, new_axes) + + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): + """ + Slice/take blocks along axis=0. + + Overloaded for SingleBlock + + Returns + ------- + new_blocks : list of Block + + """ + + allow_fill = fill_tuple is not None + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill) + + if self._is_single_block: + blk = self.blocks[0] + + if sl_type in ('slice', 'mask'): + return [blk.getitem_block(slobj, + new_mgr_locs=slice(0, sllen))] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_tuple[0] is None: + _, fill_value = com._maybe_promote(blk.dtype) + fill_tuple = (fill_value,) + + return [blk.take_nd(slobj, axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple)] + + if sl_type in ('slice', 'mask'): + blknos = self._blknos[slobj] + blklocs = self._blklocs[slobj] + else: + blknos = com.take_1d(self._blknos, slobj, fill_value=-1, + allow_fill=allow_fill) + blklocs = com.take_1d(self._blklocs, slobj, fill_value=-1, + allow_fill=allow_fill) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). + # + # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, + # pytables serialization will break otherwise. + blocks = [] + for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + if blkno == -1: + # If we've got here, fill_tuple was not None. + fill_value = fill_tuple[0] + + blocks.append(self._make_na_block( + placement=mgr_locs, fill_value=fill_value)) + else: + blk = self.blocks[blkno] + + # Otherwise, slicing along items axis is necessary. + if blk.is_sparse: + # A sparse block, it's easy, because there's only one item + # and each mgr loc is a copy of that single item. + for mgr_loc in mgr_locs: + newblk = blk.copy(deep=True) + newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) + blocks.append(newblk) + + else: + blocks.append(blk.take_nd( + blklocs[mgr_locs.indexer], axis=0, + new_mgr_locs=mgr_locs, fill_tuple=None)) + + return blocks + + def _make_na_block(self, placement, fill_value=None): + # TODO: infer dtypes other than float64 from fill_value + + if fill_value is None: + fill_value = np.nan + block_shape = list(self.shape) + block_shape[0] = len(placement) + + dtype, fill_value = com._infer_dtype_from_scalar(fill_value) + block_values = np.empty(block_shape, dtype=dtype) + block_values.fill(fill_value) + return make_block(block_values, placement=placement) + + def take(self, indexer, axis=1, verify=True, convert=True): + """ + Take items along any axis. 
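+
+        Editor's note (an illustrative aside, not part of the original
+        source): unlike reindex_indexer, the indexer here must name real
+        positions; with convert=True negative positions are handled by
+        _maybe_convert_indices, verify=True rejects out-of-bounds entries,
+        and the work is then delegated to reindex_indexer with
+        allow_dups=True.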
+ """ + self._consolidate_inplace() + indexer = np.asanyarray(indexer, dtype=np.int_) + + n = self.shape[axis] + if convert: + indexer = _maybe_convert_indices(indexer, n) + + if verify: + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception('Indices must be nonzero and less than ' + 'the axis length') + + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer(new_axis=new_labels, indexer=indexer, + axis=axis, allow_dups=True) + + def merge(self, other, lsuffix='', rsuffix=''): + if not self._is_indexed_like(other): + raise AssertionError('Must have same axes to merge managers') + + l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix, + right=other.items, rsuffix=rsuffix) + new_items = _concat_indexes([l, r]) + + new_blocks = [blk.copy(deep=False) + for blk in self.blocks] + + offset = self.shape[0] + for blk in other.blocks: + blk = blk.copy(deep=False) + blk.mgr_locs = blk.mgr_locs.add(offset) + new_blocks.append(blk) + + new_axes = list(self.axes) + new_axes[0] = new_items + + return self.__class__(_consolidate(new_blocks), new_axes) + + def _is_indexed_like(self, other): + """ + Check all axes except items + """ + if self.ndim != other.ndim: + raise AssertionError(('Number of dimensions must agree ' + 'got %d and %d') % (self.ndim, other.ndim)) + for ax, oax in zip(self.axes[1:], other.axes[1:]): + if not ax.equals(oax): + return False + return True + + def equals(self, other): + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all (ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + self._consolidate_inplace() + other._consolidate_inplace() + return all(block.equals(oblock) for block, oblock in + zip(self.blocks, other.blocks)) + + +class SingleBlockManager(BlockManager): + """ manage a single block with """ + + ndim = 1 + _is_consolidated = True + _known_consolidated = True + __slots__ = () + + def __init__(self, block, axis, do_integrity_check=False, fastpath=False): + + if isinstance(axis, list): + if len(axis) != 1: + raise ValueError( + "cannot create SingleBlockManager with more than 1 axis") + axis = axis[0] + + # passed from constructor, single block, single axis + if fastpath: + self.axes = [axis] + if isinstance(block, list): + + # empty block + if len(block) == 0: + block = [np.array([])] + elif len(block) != 1: + raise ValueError('Cannot create SingleBlockManager with ' + 'more than 1 block') + block = block[0] + else: + self.axes = [_ensure_index(axis)] + + # create the block here + if isinstance(block, list): + + # provide consolidation to the interleaved_dtype + if len(block) > 1: + dtype = _interleaved_dtype(block) + block = [b.astype(dtype) for b in block] + block = _consolidate(block) + + if len(block) != 1: + raise ValueError('Cannot create SingleBlockManager with ' + 'more than 1 block') + block = block[0] + + if not isinstance(block, Block): + block = make_block(block, + placement=slice(0, len(axis)), + ndim=1, fastpath=True) + + self.blocks = [block] + + def _post_setstate(self): + pass + + @property + def _block(self): + return self.blocks[0] + + @property + def _values(self): + return self._block.values + + def reindex(self, new_axis, indexer=None, method=None, fill_value=None, + limit=None, copy=True): + # if we are the same and don't copy, just return + if self.index.equals(new_axis): + if copy: + return self.copy(deep=True) + else: + return self + + values = self._block.get_values() + + if indexer is None: + indexer = 
self.items.get_indexer_for(new_axis) + + if fill_value is None: + # FIXME: is fill_value used correctly in sparse blocks? + if not self._block.is_sparse: + fill_value = self._block.fill_value + else: + fill_value = np.nan + + new_values = com.take_1d(values, indexer, + fill_value=fill_value) + + # fill if needed + if method is not None or limit is not None: + new_values = com.interpolate_2d(new_values, method=method, + limit=limit, fill_value=fill_value) + + if self._block.is_sparse: + make_block = self._block.make_block_same_class + + block = make_block(new_values, copy=copy, + placement=slice(0, len(new_axis))) + + mgr = SingleBlockManager(block, new_axis) + mgr._consolidate_inplace() + return mgr + + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + return self.__class__(self._block._slice(slobj), + self.index[slobj], fastpath=True) + + @property + def index(self): + return self.axes[0] + + def convert(self, **kwargs): + """ convert the whole block as one """ + kwargs['by_item'] = False + return self.apply('convert', **kwargs) + + @property + def dtype(self): + return self._values.dtype + + @property + def ftype(self): + return self._block.ftype + + def get_dtype_counts(self): + return {self.dtype.name: 1} + + def get_ftype_counts(self): + return {self.ftype: 1} + + def get_dtypes(self): + return np.array([self._block.dtype]) + + def get_ftypes(self): + return np.array([self._block.ftype]) + + @property + def values(self): + return self._values.view() + + @property + def itemsize(self): + return self._values.itemsize + + @property + def _can_hold_na(self): + return self._block._can_hold_na + + def is_consolidated(self): + return True + + def _consolidate_check(self): + pass + + def _consolidate_inplace(self): + pass + + def delete(self, item): + """ + Delete single item from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + loc = self.items.get_loc(item) + self._block.delete(loc) + self.axes[0] = self.axes[0].delete(loc) + + def fast_xs(self, loc): + """ + fast path for getting a cross-section + return a view of the data + """ + return self._block.values[loc] + + +def construction_error(tot_items, block_shape, axes, e=None): + """ raise a helpful message about our construction """ + passed = tuple(map(int, [tot_items] + list(block_shape))) + implied = tuple(map(int, [len(ax) for ax in axes])) + if passed == implied and e is not None: + raise e + raise ValueError("Shape of passed values is {0}, indices imply {1}".format( + passed,implied)) + + +def create_block_manager_from_blocks(blocks, axes): + try: + if len(blocks) == 1 and not isinstance(blocks[0], Block): + # It's OK if a single block is passed as values, its placement is + # basically "all items", but if there're many, don't bother + # converting, it's an error anyway. 
+ blocks = [make_block(values=blocks[0], + placement=slice(0, len(axes[0])))] + + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + + except (ValueError) as e: + blocks = [getattr(b, 'values', b) for b in blocks] + tot_items = sum(b.shape[0] for b in blocks) + construction_error(tot_items, blocks[0].shape[1:], axes, e) + + +def create_block_manager_from_arrays(arrays, names, axes): + try: + blocks = form_blocks(arrays, names, axes) + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + except (ValueError) as e: + construction_error(len(arrays), arrays[0].shape[1:], axes, e) + + +def form_blocks(arrays, names, axes): + # put "leftover" items in float bucket, where else? + # generalize? + float_items = [] + complex_items = [] + int_items = [] + bool_items = [] + object_items = [] + sparse_items = [] + datetime_items = [] + extra_locs = [] + + names_idx = Index(names) + if names_idx.equals(axes[0]): + names_indexer = np.arange(len(names_idx)) + else: + assert names_idx.intersection(axes[0]).is_unique + names_indexer = names_idx.get_indexer_for(axes[0]) + + for i, name_idx in enumerate(names_indexer): + if name_idx == -1: + extra_locs.append(i) + continue + + k = names[name_idx] + v = arrays[name_idx] + + if isinstance(v, (SparseArray, ABCSparseSeries)): + sparse_items.append((i, k, v)) + elif issubclass(v.dtype.type, np.floating): + float_items.append((i, k, v)) + elif issubclass(v.dtype.type, np.complexfloating): + complex_items.append((i, k, v)) + elif issubclass(v.dtype.type, np.datetime64): + if v.dtype != _NS_DTYPE: + v = tslib.cast_to_nanoseconds(v) + + if hasattr(v, 'tz') and v.tz is not None: + object_items.append((i, k, v)) + else: + datetime_items.append((i, k, v)) + elif issubclass(v.dtype.type, np.integer): + if v.dtype == np.uint64: + # HACK #2355 definite overflow + if (v > 2 ** 63 - 1).any(): + object_items.append((i, k, v)) + continue + int_items.append((i, k, v)) + elif v.dtype == np.bool_: + bool_items.append((i, k, v)) + else: + object_items.append((i, k, v)) + + blocks = [] + if len(float_items): + float_blocks = _multi_blockify(float_items) + blocks.extend(float_blocks) + + if len(complex_items): + complex_blocks = _simple_blockify( + complex_items, np.complex128) + blocks.extend(complex_blocks) + + if len(int_items): + int_blocks = _multi_blockify(int_items) + blocks.extend(int_blocks) + + if len(datetime_items): + datetime_blocks = _simple_blockify( + datetime_items, _NS_DTYPE) + blocks.extend(datetime_blocks) + + if len(bool_items): + bool_blocks = _simple_blockify( + bool_items, np.bool_) + blocks.extend(bool_blocks) + + if len(object_items) > 0: + object_blocks = _simple_blockify( + object_items, np.object_) + blocks.extend(object_blocks) + + if len(sparse_items) > 0: + sparse_blocks = _sparse_blockify(sparse_items) + blocks.extend(sparse_blocks) + + if len(extra_locs): + shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) + + # empty items -> dtype object + block_values = np.empty(shape, dtype=object) + block_values.fill(np.nan) + + na_block = make_block(block_values, placement=extra_locs) + blocks.append(na_block) + + return blocks + + +def _simple_blockify(tuples, dtype): + """ return a single array of a block that has a single dtype; if dtype is + not None, coerce to this dtype + """ + values, placement = _stack_arrays(tuples, dtype) + + # CHECK DTYPE? 
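+    # Editor's note (illustrative, not in the original source; arr_a and
+    # arr_c are hypothetical 1-d arrays): e.g.
+    # _simple_blockify([(0, 'a', arr_a), (2, 'c', arr_c)], np.bool_)
+    # stacks the two arrays into one 2-d block placed at positions (0, 2);
+    # the check below then coerces stray dtypes if needed.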
+ if dtype is not None and values.dtype != dtype: # pragma: no cover + values = values.astype(dtype) + + block = make_block(values, placement=placement) + return [block] + + +def _multi_blockify(tuples, dtype=None): + """ return an array of blocks that potentially have different dtypes """ + + # group by dtype + grouper = itertools.groupby(tuples, lambda x: x[2].dtype) + + new_blocks = [] + for dtype, tup_block in grouper: + + values, placement = _stack_arrays( + list(tup_block), dtype) + + block = make_block(values, placement=placement) + new_blocks.append(block) + + return new_blocks + + +def _sparse_blockify(tuples, dtype=None): + """ return an array of blocks that potentially have different dtypes (and + are sparse) + """ + + new_blocks = [] + for i, names, array in tuples: + array = _maybe_to_sparse(array) + block = make_block( + array, klass=SparseBlock, fastpath=True, + placement=[i]) + new_blocks.append(block) + + return new_blocks + + +def _stack_arrays(tuples, dtype): + + # fml + def _asarray_compat(x): + if isinstance(x, ABCSeries): + return x.values + else: + return np.asarray(x) + + def _shape_compat(x): + if isinstance(x, ABCSeries): + return len(x), + else: + return x.shape + + placement, names, arrays = zip(*tuples) + + first = arrays[0] + shape = (len(arrays),) + _shape_compat(first) + + stacked = np.empty(shape, dtype=dtype) + for i, arr in enumerate(arrays): + stacked[i] = _asarray_compat(arr) + + return stacked, placement + + +def _interleaved_dtype(blocks): + if not len(blocks): + return None + + counts = defaultdict(lambda: []) + for x in blocks: + counts[type(x)].append(x) + + def _lcd_dtype(l): + """ find the lowest dtype that can accomodate the given types """ + m = l[0].dtype + for x in l[1:]: + if x.dtype.itemsize > m.itemsize: + m = x.dtype + return m + + have_int = len(counts[IntBlock]) > 0 + have_bool = len(counts[BoolBlock]) > 0 + have_object = len(counts[ObjectBlock]) > 0 + have_float = len(counts[FloatBlock]) > 0 + have_complex = len(counts[ComplexBlock]) > 0 + have_dt64 = len(counts[DatetimeBlock]) > 0 + have_td64 = len(counts[TimeDeltaBlock]) > 0 + have_sparse = len(counts[SparseBlock]) > 0 + have_numeric = have_float or have_complex or have_int + + if (have_object or + (have_bool and have_numeric) or + (have_numeric and (have_dt64 or have_td64))): + return np.dtype(object) + elif have_bool: + return np.dtype(bool) + elif have_int and not have_float and not have_complex: + + # if we are mixing unsigned and signed, then return + # the next biggest int type (if we can) + lcd = _lcd_dtype(counts[IntBlock]) + kinds = set([i.dtype.kind for i in counts[IntBlock]]) + if len(kinds) == 1: + return lcd + + if lcd == 'uint64' or lcd == 'int64': + return np.dtype('int64') + + # return 1 bigger on the itemsize if unsinged + if lcd.kind == 'u': + return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) + return lcd + + elif have_dt64 and not have_float and not have_complex: + return np.dtype('M8[ns]') + elif have_td64 and not have_float and not have_complex: + return np.dtype('m8[ns]') + elif have_complex: + return np.dtype('c16') + else: + return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) + + +def _consolidate(blocks): + """ + Merge blocks having same dtype, exclude non-consolidating blocks + """ + + # sort by _can_consolidate, dtype + gkey = lambda x: x._consolidate_key + grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) + + new_blocks = [] + for (_can_consolidate, dtype), group_blocks in grouper: + merged_blocks = _merge_blocks(list(group_blocks), 
dtype=dtype, + _can_consolidate=_can_consolidate) + if isinstance(merged_blocks, list): + new_blocks.extend(merged_blocks) + else: + new_blocks.append(merged_blocks) + + return new_blocks + + +def _merge_blocks(blocks, dtype=None, _can_consolidate=True): + if len(blocks) == 1: + return blocks[0] + + if _can_consolidate: + + if dtype is None: + if len(set([b.dtype for b in blocks])) != 1: + raise AssertionError("_merge_blocks are invalid!") + dtype = blocks[0].dtype + + # FIXME: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. + new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) + new_values = _vstack([b.values for b in blocks], dtype) + + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] + + return make_block(new_values, + fastpath=True, placement=new_mgr_locs) + + # no merge + return blocks + + +def _block_shape(values, ndim=1, shape=None): + """ guarantee the shape of the values to be at least 1 d """ + if values.ndim <= ndim: + if shape is None: + shape = values.shape + values = values.reshape(tuple((1,) + shape)) + return values + + +def _vstack(to_stack, dtype): + + # work around NumPy 1.6 bug + if dtype == _NS_DTYPE or dtype == _TD_DTYPE: + new_values = np.vstack([x.view('i8') for x in to_stack]) + return new_values.view(dtype) + + else: + return np.vstack(to_stack) + + +def _possibly_compare(a, b, op): + res = op(a, b) + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + if np.isscalar(res) and (is_a_array or is_b_array): + type_names = [type(a).__name__, type(b).__name__] + + if is_a_array: + type_names[0] = 'ndarray(dtype=%s)' % a.dtype + + if is_b_array: + type_names[1] = 'ndarray(dtype=%s)' % b.dtype + + raise TypeError("Cannot compare types %r and %r" % tuple(type_names)) + return res + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + + +def _get_blkno_placements(blknos, blk_count, group=True): + """ + + Parameters + ---------- + blknos : array of int64 + blk_count : int + group : bool + + Returns + ------- + iterator + yield (BlockPlacement, blkno) + + """ + + blknos = com._ensure_int64(blknos) + + # FIXME: blk_count is unused, but it may avoid the use of dicts in cython + for blkno, indexer in lib.get_blkno_indexers(blknos, group): + yield blkno, BlockPlacement(indexer) + + +def items_overlap_with_suffix(left, lsuffix, right, rsuffix): + """ + If two indices overlap, add suffixes to overlapping entries. + + If corresponding suffix is empty, the entry is simply converted to string. + + """ + to_rename = left.intersection(right) + if len(to_rename) == 0: + return left, right + else: + if not lsuffix and not rsuffix: + raise ValueError('columns overlap but no suffix specified: %s' % + to_rename) + + def lrenamer(x): + if x in to_rename: + return '%s%s' % (x, lsuffix) + return x + + def rrenamer(x): + if x in to_rename: + return '%s%s' % (x, rsuffix) + return x + + return (_transform_index(left, lrenamer), + _transform_index(right, rrenamer)) + + +def _transform_index(index, func): + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. 
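+
+    Editor's note (an illustrative aside, not part of the original
+    source): e.g. _transform_index(Index(['a', 'b']), str.upper) returns
+    Index(['A', 'B']); for a MultiIndex the function is applied to every
+    element of every tuple, as below.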
+ + """ + if isinstance(index, MultiIndex): + items = [tuple(func(y) for y in x) for x in index] + return MultiIndex.from_tuples(items, names=index.names) + else: + items = [func(x) for x in index] + return Index(items, name=index.name) + + +def _putmask_smart(v, m, n): + """ + Return a new block, try to preserve dtype if possible. + + Parameters + ---------- + v : array_like + m : array_like + n : array_like + """ + + # n should be the length of the mask or a scalar here + if not is_list_like(n): + n = np.array([n] * len(m)) + + # see if we are only masking values that if putted + # will work in the current dtype + try: + nn = n[m] + nn_at = nn.astype(v.dtype) + if (nn == nn_at).all(): + nv = v.copy() + nv[m] = nn_at + return nv + except (ValueError, IndexError, TypeError): + pass + + # change the dtype + dtype, _ = com._maybe_promote(n.dtype) + nv = v.astype(dtype) + try: + nv[m] = n + except ValueError: + idx, = np.where(np.squeeze(m)) + for mask_index, new_val in zip(idx, n): + nv[mask_index] = new_val + return nv + + +def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): + """ + Concatenate block managers into one. + + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + """ + concat_plan = combine_concat_plans([get_mgr_concatenation_plan(mgr, indexers) + for mgr, indexers in mgrs_indexers], + concat_axis) + + blocks = [make_block(concatenate_join_units(join_units, concat_axis, + copy=copy), + placement=placement) + for placement, join_units in concat_plan] + + return BlockManager(blocks, axes) + + +def get_empty_dtype_and_na(join_units): + """ + Return dtype and N/A values to use when concatenating specified units. + + Returned N/A value may be None which means there was no casting involved. + + Returns + ------- + dtype + na + """ + + if len(join_units) == 1: + blk = join_units[0].block + if blk is None: + return np.float64, np.nan + else: + return blk.dtype, None + + has_none_blocks = False + dtypes = [None] * len(join_units) + + for i, unit in enumerate(join_units): + if unit.block is None: + has_none_blocks = True + else: + dtypes[i] = unit.dtype + + if not has_none_blocks and len(set(dtypes)) == 1: + # Unanimous decision, nothing to upcast. + return dtypes[0], None + + # dtypes = set() + upcast_classes = set() + null_upcast_classes = set() + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + if issubclass(dtype.type, (np.object_, np.bool_)): + upcast_cls = 'object' + elif is_datetime64_dtype(dtype): + upcast_cls = 'datetime' + elif is_timedelta64_dtype(dtype): + upcast_cls = 'timedelta' + else: + upcast_cls = 'float' + + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. 
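+        # Editor's note (illustrative, not in the original source): the
+        # resolution further below prefers object, then float64, then
+        # datetime64[ns] / timedelta64[ns], with np.nan or iNaT as the
+        # matching fill. E.g. concatenating a bool unit with an int64 unit
+        # yields (object, np.nan), since bool maps to the 'object' class
+        # above.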
+ if unit.is_null: + null_upcast_classes.add(upcast_cls) + else: + upcast_classes.add(upcast_cls) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + # create the result + if 'object' in upcast_classes: + return np.dtype(np.object_), np.nan + elif 'float' in upcast_classes: + return np.dtype(np.float64), np.nan + elif 'datetime' in upcast_classes: + return np.dtype('M8[ns]'), tslib.iNaT + elif 'timedelta' in upcast_classes: + return np.dtype('m8[ns]'), tslib.iNaT + else: # pragma + raise AssertionError("invalid dtype determination in get_concat_dtype") + + +def concatenate_join_units(join_units, concat_axis, copy): + """ + Concatenate values from several join units along selected axis. + """ + if concat_axis == 0 and len(join_units) > 1: + # Concatenating join units along ax0 is handled in _merge_blocks. + raise AssertionError("Concatenating join units along axis0") + + empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) + + to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype, + upcasted_na=upcasted_na) + for ju in join_units] + + if len(to_concat) == 1: + # Only one block, nothing to concatenate. + concat_values = to_concat[0] + if copy and concat_values.base is not None: + concat_values = concat_values.copy() + else: + concat_values = com._concat_compat(to_concat, axis=concat_axis) + + # FIXME: optimization potential: if len(join_units) == 1, single join unit + # is densified and sparsified back. + if any(unit.is_sparse for unit in join_units): + # If one of the units was sparse, concat_values are 2d and there's only + # one item. + return SparseArray(concat_values[0]) + else: + return concat_values + + +def get_mgr_concatenation_plan(mgr, indexers): + """ + Construct concatenation plan for given block manager and indexers. + + Parameters + ---------- + mgr : BlockManager + indexers : dict of {axis: indexer} + + Returns + ------- + plan : list of (BlockPlacement, JoinUnit) tuples + + """ + # Calculate post-reindex shape , save for item axis which will be separate + # for each block anyway. + mgr_shape = list(mgr.shape) + for ax, indexer in indexers.items(): + mgr_shape[ax] = len(indexer) + mgr_shape = tuple(mgr_shape) + + if 0 in indexers: + ax0_indexer = indexers.pop(0) + blknos = com.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) + blklocs = com.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) + else: + + if mgr._is_single_block: + blk = mgr.blocks[0] + return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] + + ax0_indexer = None + blknos = mgr._blknos + blklocs = mgr._blklocs + + plan = [] + for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks), + group=False): + assert placements.is_slice_like + + join_unit_indexers = indexers.copy() + + shape = list(mgr_shape) + shape[0] = len(placements) + shape = tuple(shape) + + if blkno == -1: + unit = JoinUnit(None, shape) + else: + blk = mgr.blocks[blkno] + ax0_blk_indexer = blklocs[placements.indexer] + + unit_no_ax0_reindexing = ( + len(placements) == len(blk.mgr_locs) and + # Fastpath detection of join unit not needing to reindex its + # block: no ax0 reindexing took place and block placement was + # sequential before. + ((ax0_indexer is None + and blk.mgr_locs.is_slice_like + and blk.mgr_locs.as_slice.step == 1) or + # Slow-ish detection: all indexer locs are sequential (and + # length match is checked above). + (np.diff(ax0_blk_indexer) == 1).all())) + + # Omit indexer if no item reindexing is required. 
+ if unit_no_ax0_reindexing: + join_unit_indexers.pop(0, None) + else: + join_unit_indexers[0] = ax0_blk_indexer + + unit = JoinUnit(blk, shape, join_unit_indexers) + + plan.append((placements, unit)) + + return plan + + +def combine_concat_plans(plans, concat_axis): + """ + Combine multiple concatenation plans into one. + + existing_plan is updated in-place. + """ + if len(plans) == 1: + for p in plans[0]: + yield p[0], [p[1]] + + elif concat_axis == 0: + offset = 0 + for plan in plans: + last_plc = None + + for plc, unit in plan: + yield plc.add(offset), [unit] + last_plc = plc + + if last_plc is not None: + offset += last_plc.as_slice.stop + + else: + num_ended = [0] + def _next_or_none(seq): + retval = next(seq, None) + if retval is None: + num_ended[0] += 1 + return retval + + plans = list(map(iter, plans)) + next_items = list(map(_next_or_none, plans)) + + while num_ended[0] != len(next_items): + if num_ended[0] > 0: + raise ValueError("Plan shapes are not aligned") + + placements, units = zip(*next_items) + + lengths = list(map(len, placements)) + min_len, max_len = min(lengths), max(lengths) + + if min_len == max_len: + yield placements[0], units + next_items[:] = map(_next_or_none, plans) + else: + yielded_placement = None + yielded_units = [None] * len(next_items) + for i, (plc, unit) in enumerate(next_items): + yielded_units[i] = unit + if len(plc) > min_len: + # trim_join_unit updates unit in place, so only + # placement needs to be sliced to skip min_len. + next_items[i] = (plc[min_len:], + trim_join_unit(unit, min_len)) + else: + yielded_placement = plc + next_items[i] = _next_or_none(plans[i]) + + yield yielded_placement, yielded_units + + +def trim_join_unit(join_unit, length): + """ + Reduce join_unit's shape along item axis to length. + + Extra items that didn't fit are returned as a separate block. + """ + + if 0 not in join_unit.indexers: + extra_indexers = join_unit.indexers + + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block + + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] + + extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] + join_unit.shape = (length,) + join_unit.shape[1:] + + return JoinUnit(block=extra_block, indexers=extra_indexers, + shape=extra_shape) + + +class JoinUnit(object): + def __init__(self, block, shape, indexers={}): + # Passing shape explicitly is required for cases when block is None. + self.block = block + self.indexers = indexers + self.shape = shape + + def __repr__(self): + return '%s(%r, %s)' % (self.__class__.__name__, + self.block, self.indexers) + + @cache_readonly + def needs_filling(self): + for indexer in self.indexers.values(): + # FIXME: cache results of indexer == -1 checks. 
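+            # Editor's note (illustrative, not in the original source): an
+            # indexer such as np.array([0, -1, 2]) marks the middle
+            # position as missing, so this unit reports needs_filling and
+            # its dtype is promoted via com._maybe_promote in the dtype
+            # property below.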
+ if (indexer == -1).any(): + return True + + return False + + @cache_readonly + def dtype(self): + if self.block is None: + raise AssertionError("Block is None, no dtype") + + if not self.needs_filling: + return self.block.dtype + else: + return np.dtype(com._maybe_promote(self.block.dtype, + self.block.fill_value)[0]) + return self._dtype + + @cache_readonly + def is_null(self): + if self.block is None: + return True + + if not self.block._can_hold_na: + return False + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. 1000 value + # was chosen rather arbitrarily. + values_flat = self.block.values.ravel() + total_len = values_flat.shape[0] + chunk_len = max(total_len // 40, 1000) + for i in range(0, total_len, chunk_len): + if not isnull(values_flat[i: i + chunk_len]).all(): + return False + + return True + + @cache_readonly + def is_sparse(self): + return self.block is not None and self.block.is_sparse + + def get_reindexed_values(self, empty_dtype, upcasted_na): + if upcasted_na is None: + # No upcasting is necessary + fill_value = self.block.fill_value + values = self.block.get_values() + else: + fill_value = upcasted_na + + if self.is_null: + missing_arr = np.empty(self.shape, dtype=empty_dtype) + if np.prod(self.shape): + # NumPy 1.6 workaround: this statement gets strange if all + # blocks are of same dtype and some of them are empty: + # empty one are considered "null" so they must be filled, + # but no dtype upcasting happens and the dtype may not + # allow NaNs. + # + # In general, no one should get hurt when one tries to put + # incorrect values into empty array, but numpy 1.6 is + # strict about that. + missing_arr.fill(fill_value) + return missing_arr + + if self.block.is_bool: + # External code requested filling/upcasting, bool values must + # be upcasted to object to avoid being upcasted to numeric. + values = self.block.astype(np.object_).values + else: + # No dtype upcasting is done here, it will be performed during + # concatenation itself. + values = self.block.get_values() + + if not self.indexers: + # If there's no indexing to be done, we want to signal outside + # code that this array must be copied explicitly. This is done + # by returning a view and checking `retval.base`. + return values.view() + else: + for ax, indexer in self.indexers.items(): + values = com.take_nd(values, indexer, axis=ax, + fill_value=fill_value) + + return values + + +def _fast_count_smallints(arr): + """Faster version of set(arr) for sequences of small numbers.""" + if len(arr) == 0: + # Handle empty arr case separately: numpy 1.6 chokes on that. 
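+        # Editor's note (illustrative, not in the original source): for
+        # non-empty input the else-branch below returns (value, count)
+        # pairs, e.g. _fast_count_smallints(np.array([0, 0, 3])) gives
+        # array([[0, 2], [3, 1]]).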
+ return np.empty((0, 2), dtype=arr.dtype) + else: + counts = np.bincount(arr.astype(np.int_)) + nz = counts.nonzero()[0] + return np.c_[nz, counts[nz]] + + +def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): + if isinstance(slice_or_indexer, slice): + return 'slice', slice_or_indexer, lib.slice_len(slice_or_indexer, + length) + elif (isinstance(slice_or_indexer, np.ndarray) and + slice_or_indexer.dtype == np.bool_): + return 'mask', slice_or_indexer, slice_or_indexer.sum() + else: + indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) + if not allow_fill: + indexer = _maybe_convert_indices(indexer, length) + return 'fancy', indexer, len(indexer) diff --git a/pandas/core/matrix.py b/pandas/core/matrix.py new file mode 100644 index 00000000..3d42fd93 --- /dev/null +++ b/pandas/core/matrix.py @@ -0,0 +1 @@ +from pandas.core.frame import DataFrame as DataMatrix diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py new file mode 100644 index 00000000..aa614038 --- /dev/null +++ b/pandas/core/nanops.py @@ -0,0 +1,718 @@ +import sys +import itertools +import functools + +import numpy as np + +try: + import bottleneck as bn + _USE_BOTTLENECK = True +except ImportError: # pragma: no cover + _USE_BOTTLENECK = False + +import pandas.core.common as com +import pandas.hashtable as _hash +from pandas import compat, lib, algos, tslib +from pandas.compat import builtins +from pandas.core.common import (isnull, notnull, _values_from_object, + _maybe_upcast_putmask, + ensure_float, _ensure_float64, + _ensure_int64, _ensure_object, + is_float, is_integer, is_complex, + is_float_dtype, _is_floating_dtype, + is_complex_dtype, is_integer_dtype, + is_bool_dtype, is_object_dtype, + is_datetime64_dtype, is_timedelta64_dtype, + _is_datetime_or_timedelta_dtype, + _is_int_or_datetime_dtype, _is_any_int_dtype) + + +class disallow(object): + + def __init__(self, *dtypes): + super(disallow, self).__init__() + self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes) + + def check(self, obj): + return hasattr(obj, 'dtype') and issubclass(obj.dtype.type, + self.dtypes) + + def __call__(self, f): + @functools.wraps(f) + def _f(*args, **kwargs): + obj_iter = itertools.chain(args, compat.itervalues(kwargs)) + if any(self.check(obj) for obj in obj_iter): + raise TypeError('reduction operation {0!r} not allowed for ' + 'this dtype'.format(f.__name__.replace('nan', + ''))) + return f(*args, **kwargs) + return _f + + +class bottleneck_switch(object): + + def __init__(self, zero_value=None, **kwargs): + self.zero_value = zero_value + self.kwargs = kwargs + + def __call__(self, alt): + bn_name = alt.__name__ + + try: + bn_func = getattr(bn, bn_name) + except (AttributeError, NameError): # pragma: no cover + bn_func = None + + @functools.wraps(alt) + def f(values, axis=None, skipna=True, **kwds): + if len(self.kwargs) > 0: + for k, v in compat.iteritems(self.kwargs): + if k not in kwds: + kwds[k] = v + try: + if self.zero_value is not None and values.size == 0: + if values.ndim == 1: + return 0 + else: + result_shape = (values.shape[:axis] + + values.shape[axis + 1:]) + result = np.empty(result_shape) + result.fill(0) + return result + + if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, + bn_name): + result = bn_func(values, axis=axis, **kwds) + + # prefer to treat inf/-inf as NA, but must compute the func + # twice :( + if _has_infs(result): + result = alt(values, axis=axis, skipna=skipna, **kwds) + else: + result = alt(values, axis=axis, skipna=skipna, **kwds) + except Exception: + 
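+ # bottleneck may raise on dtypes or keywords it does not handle;
+ # fall back to the plain numpy implementation in that case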
result = alt(values, axis=axis, skipna=skipna, **kwds) + + return result + + return f + + +def _bn_ok_dtype(dt, name): + # Bottleneck chokes on datetime64 + if (not is_object_dtype(dt) and + not _is_datetime_or_timedelta_dtype(dt)): + + # bottleneck does not properly upcast during the sum + # so can overflow + if name == 'nansum': + if dt.itemsize < 8: + return False + + return True + return False + + +def _has_infs(result): + if isinstance(result, np.ndarray): + if result.dtype == 'f8': + return lib.has_infs_f8(result.ravel()) + elif result.dtype == 'f4': + return lib.has_infs_f4(result.ravel()) + try: + return np.isinf(result).any() + except (TypeError, NotImplementedError) as e: + # if it doesn't support infs, then it can't have infs + return False + + +def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): + """ return the correct fill value for the dtype of the values """ + if fill_value is not None: + return fill_value + if _na_ok_dtype(dtype): + if fill_value_typ is None: + return np.nan + else: + if fill_value_typ == '+inf': + return np.inf + else: + return -np.inf + else: + if fill_value_typ is None: + return tslib.iNaT + else: + if fill_value_typ == '+inf': + # need the max int here + return np.iinfo(np.int64).max + else: + return tslib.iNaT + + +def _get_values(values, skipna, fill_value=None, fill_value_typ=None, + isfinite=False, copy=True): + """ utility to get the values view, mask, dtype + if necessary copy and mask using the specified fill_value + copy = True will force the copy """ + values = _values_from_object(values) + if isfinite: + mask = _isfinite(values) + else: + mask = isnull(values) + + dtype = values.dtype + dtype_ok = _na_ok_dtype(dtype) + + # get our fill value (in case we need to provide an alternative + # dtype for it) + fill_value = _get_fill_value(dtype, fill_value=fill_value, + fill_value_typ=fill_value_typ) + + if skipna: + if copy: + values = values.copy() + if dtype_ok: + np.putmask(values, mask, fill_value) + + # promote if needed + else: + values, changed = _maybe_upcast_putmask(values, mask, fill_value) + + elif copy: + values = values.copy() + + values = _view_if_needed(values) + + # return a platform independent precision dtype + dtype_max = dtype + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + dtype_max = np.int64 + elif is_float_dtype(dtype): + dtype_max = np.float64 + + return values, mask, dtype, dtype_max + + +def _isfinite(values): + if _is_datetime_or_timedelta_dtype(values): + return isnull(values) + if (is_complex_dtype(values) or is_float_dtype(values) or + is_integer_dtype(values) or is_bool_dtype(values)): + return ~np.isfinite(values) + return ~np.isfinite(values.astype('float64')) + + +def _na_ok_dtype(dtype): + return not _is_int_or_datetime_dtype(dtype) + + +def _view_if_needed(values): + if _is_datetime_or_timedelta_dtype(values): + return values.view(np.int64) + return values + + +def _wrap_results(result, dtype): + """ wrap our results if needed """ + + if is_datetime64_dtype(dtype): + if not isinstance(result, np.ndarray): + result = lib.Timestamp(result) + else: + result = result.view(dtype) + elif is_timedelta64_dtype(dtype): + if not isinstance(result, np.ndarray): + + # this is a scalar timedelta result! 
+ # we have series convert then take the element (scalar) + # as series will do the right thing in py3 (and deal with numpy + # 1.6.2 bug in that it results dtype of timedelta64[us] + from pandas import Series + + # coerce float to results + if is_float(result): + result = int(result) + result = Series([result], dtype='timedelta64[ns]') + else: + result = result.view(dtype) + + return result + + +def nanany(values, axis=None, skipna=True): + values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna) + return values.any(axis) + + +def nanall(values, axis=None, skipna=True): + values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna) + return values.all(axis) + + +@disallow('M8') +@bottleneck_switch(zero_value=0) +def nansum(values, axis=None, skipna=True): + values, mask, dtype, dtype_max = _get_values(values, skipna, 0) + the_sum = values.sum(axis, dtype=dtype_max) + the_sum = _maybe_null_out(the_sum, axis, mask) + + return _wrap_results(the_sum, dtype) + + +@disallow('M8') +@bottleneck_switch() +def nanmean(values, axis=None, skipna=True): + values, mask, dtype, dtype_max = _get_values(values, skipna, 0) + the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_max)) + count = _get_counts(mask, axis) + + if axis is not None and getattr(the_sum, 'ndim', False): + the_mean = the_sum / count + ct_mask = count == 0 + if ct_mask.any(): + the_mean[ct_mask] = np.nan + else: + the_mean = the_sum / count if count > 0 else np.nan + + return _wrap_results(the_mean, dtype) + + +@disallow('M8') +@bottleneck_switch() +def nanmedian(values, axis=None, skipna=True): + + values, mask, dtype, dtype_max = _get_values(values, skipna) + + def get_median(x): + mask = notnull(x) + if not skipna and not mask.all(): + return np.nan + return algos.median(_values_from_object(x[mask])) + + if values.dtype != np.float64: + values = values.astype('f8') + + if axis is None: + values = values.ravel() + + notempty = values.size + + # an array from a frame + if values.ndim > 1: + # there's a non-empty array to apply over otherwise numpy raises + if notempty: + return np.apply_along_axis(get_median, axis, values) + + # must return the correct shape, but median is not defined for the + # empty set so return nans of shape "everything but the passed axis" + # since "axis" is where the reduction would occur if we had a nonempty + # array + shp = np.array(values.shape) + dims = np.arange(values.ndim) + ret = np.empty(shp[dims != axis]) + ret.fill(np.nan) + return ret + + # otherwise return a scalar value + return _wrap_results(get_median(values), dtype) if notempty else np.nan + + +def _get_counts_nanvar(mask, axis, ddof): + count = _get_counts(mask, axis) + + d = count-ddof + + # always return NaN, never inf + if np.isscalar(count): + if count <= ddof: + count = np.nan + d = np.nan + else: + mask2 = count <= ddof + if mask2.any(): + np.putmask(d, mask2, np.nan) + np.putmask(count, mask2, np.nan) + return count, d + + +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanvar(values, axis=None, skipna=True, ddof=1): + if not _is_floating_dtype(values): + values = values.astype('f8') + + mask = isnull(values) + + count, d = _get_counts_nanvar(mask, axis, ddof) + + if skipna: + values = values.copy() + np.putmask(values, mask, 0) + + X = _ensure_numeric(values.sum(axis)) + XX = _ensure_numeric((values ** 2).sum(axis)) + return np.fabs((XX - X ** 2 / count) / d) + + +def nansem(values, axis=None, skipna=True, ddof=1): + var = nanvar(values, axis, skipna, ddof=ddof) + + if not _is_floating_dtype(values): + 
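+ # sem = sqrt(var) / sqrt(count of valid observations); redo the
+ # float cast and null mask here so the count matches what nanvar used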
values = values.astype('f8') + mask = isnull(values) + count, _ = _get_counts_nanvar(mask, axis, ddof) + + return np.sqrt(var)/np.sqrt(count) + + +@bottleneck_switch() +def nanmin(values, axis=None, skipna=True): + values, mask, dtype, dtype_max = _get_values(values, skipna, + fill_value_typ='+inf') + + # numpy 1.6.1 workaround in Python 3.x + if is_object_dtype(values) and compat.PY3: + if values.ndim > 1: + apply_ax = axis if axis is not None else 0 + result = np.apply_along_axis(builtins.min, apply_ax, values) + else: + try: + result = builtins.min(values) + except: + result = np.nan + else: + if ((axis is not None and values.shape[axis] == 0) + or values.size == 0): + try: + result = ensure_float(values.sum(axis, dtype=dtype_max)) + result.fill(np.nan) + except: + result = np.nan + else: + result = values.min(axis) + + result = _wrap_results(result, dtype) + return _maybe_null_out(result, axis, mask) + + +@bottleneck_switch() +def nanmax(values, axis=None, skipna=True): + values, mask, dtype, dtype_max = _get_values(values, skipna, + fill_value_typ='-inf') + + # numpy 1.6.1 workaround in Python 3.x + if is_object_dtype(values) and compat.PY3: + + if values.ndim > 1: + apply_ax = axis if axis is not None else 0 + result = np.apply_along_axis(builtins.max, apply_ax, values) + else: + try: + result = builtins.max(values) + except: + result = np.nan + else: + if ((axis is not None and values.shape[axis] == 0) + or values.size == 0): + try: + result = ensure_float(values.sum(axis, dtype=dtype_max)) + result.fill(np.nan) + except: + result = np.nan + else: + result = values.max(axis) + + result = _wrap_results(result, dtype) + return _maybe_null_out(result, axis, mask) + + +def nanargmax(values, axis=None, skipna=True): + """ + Returns -1 in the NA case + """ + values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf', + isfinite=True) + result = values.argmax(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + + +def nanargmin(values, axis=None, skipna=True): + """ + Returns -1 in the NA case + """ + values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf', + isfinite=True) + result = values.argmin(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + + +@disallow('M8') +def nanskew(values, axis=None, skipna=True): + if not _is_floating_dtype(values): + values = values.astype('f8') + + mask = isnull(values) + count = _get_counts(mask, axis) + + if skipna: + values = values.copy() + np.putmask(values, mask, 0) + + A = values.sum(axis) / count + B = (values ** 2).sum(axis) / count - A ** 2 + C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B + + # floating point error + B = _zero_out_fperr(B) + C = _zero_out_fperr(C) + + result = ((np.sqrt((count ** 2 - count)) * C) / + ((count - 2) * np.sqrt(B) ** 3)) + + if isinstance(result, np.ndarray): + result = np.where(B == 0, 0, result) + result[count < 3] = np.nan + return result + else: + result = 0 if B == 0 else result + if count < 3: + return np.nan + return result + + +@disallow('M8') +def nankurt(values, axis=None, skipna=True): + if not _is_floating_dtype(values): + values = values.astype('f8') + + mask = isnull(values) + count = _get_counts(mask, axis) + + if skipna: + values = values.copy() + np.putmask(values, mask, 0) + + A = values.sum(axis) / count + B = (values ** 2).sum(axis) / count - A ** 2 + C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B + D = (values ** 4).sum(axis) / count - A ** 4 - 6 * B * A * A - 4 * C * A + + B = 
_zero_out_fperr(B) + C = _zero_out_fperr(C) + D = _zero_out_fperr(D) + + result = (((count * count - 1.) * D / (B * B) - 3 * ((count - 1.) ** 2)) / + ((count - 2.) * (count - 3.))) + if isinstance(result, np.ndarray): + result = np.where(B == 0, 0, result) + result[count < 4] = np.nan + return result + else: + result = 0 if B == 0 else result + if count < 4: + return np.nan + return result + + +@disallow('M8') +def nanprod(values, axis=None, skipna=True): + mask = isnull(values) + if skipna and not _is_any_int_dtype(values): + values = values.copy() + values[mask] = 1 + result = values.prod(axis) + return _maybe_null_out(result, axis, mask) + + +def _maybe_arg_null_out(result, axis, mask, skipna): + # helper function for nanargmin/nanargmax + if axis is None or not getattr(result, 'ndim', False): + if skipna: + if mask.all(): + result = -1 + else: + if mask.any(): + result = -1 + else: + if skipna: + na_mask = mask.all(axis) + else: + na_mask = mask.any(axis) + if na_mask.any(): + result[na_mask] = -1 + return result + + +def _get_counts(mask, axis): + if axis is None: + return float(mask.size - mask.sum()) + + count = mask.shape[axis] - mask.sum(axis) + try: + return count.astype(float) + except AttributeError: + return np.array(count, dtype=float) + + +def _maybe_null_out(result, axis, mask): + if axis is not None and getattr(result, 'ndim', False): + null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 + if np.any(null_mask): + if np.iscomplexobj(result): + result = result.astype('c16') + else: + result = result.astype('f8') + result[null_mask] = np.nan + else: + null_mask = mask.size - mask.sum() + if null_mask == 0: + result = np.nan + + return result + + +def _zero_out_fperr(arg): + if isinstance(arg, np.ndarray): + return np.where(np.abs(arg) < 1e-14, 0, arg) + else: + return 0 if np.abs(arg) < 1e-14 else arg + + +@disallow('M8') +def nancorr(a, b, method='pearson', min_periods=None): + """ + a, b: ndarrays + """ + if len(a) != len(b): + raise AssertionError('Operands to nancorr must have same size') + + if min_periods is None: + min_periods = 1 + + valid = notnull(a) & notnull(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) < min_periods: + return np.nan + + f = get_corr_func(method) + return f(a, b) + + +def get_corr_func(method): + if method in ['kendall', 'spearman']: + from scipy.stats import kendalltau, spearmanr + + def _pearson(a, b): + return np.corrcoef(a, b)[0, 1] + + def _kendall(a, b): + rs = kendalltau(a, b) + if isinstance(rs, tuple): + return rs[0] + return rs + + def _spearman(a, b): + return spearmanr(a, b)[0] + + _cor_methods = { + 'pearson': _pearson, + 'kendall': _kendall, + 'spearman': _spearman + } + return _cor_methods[method] + + +@disallow('M8') +def nancov(a, b, min_periods=None): + if len(a) != len(b): + raise AssertionError('Operands to nancov must have same size') + + if min_periods is None: + min_periods = 1 + + valid = notnull(a) & notnull(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) < min_periods: + return np.nan + + return np.cov(a, b)[0, 1] + + +def _ensure_numeric(x): + if isinstance(x, np.ndarray): + if is_integer_dtype(x) or is_bool_dtype(x): + x = x.astype(np.float64) + elif is_object_dtype(x): + try: + x = x.astype(np.complex128) + except: + x = x.astype(np.float64) + else: + if not np.any(x.imag): + x = x.real + elif not (is_float(x) or is_integer(x) or is_complex(x)): + try: + x = float(x) + except Exception: + try: + x = complex(x) + except Exception: + raise TypeError('Could not convert %s to 
numeric' % str(x)) + return x + +# NA-friendly array comparisons + +import operator + + +def make_nancomp(op): + def f(x, y): + xmask = isnull(x) + ymask = isnull(y) + mask = xmask | ymask + + result = op(x, y) + + if mask.any(): + if is_bool_dtype(result): + result = result.astype('O') + np.putmask(result, mask, np.nan) + + return result + return f + +nangt = make_nancomp(operator.gt) +nange = make_nancomp(operator.ge) +nanlt = make_nancomp(operator.lt) +nanle = make_nancomp(operator.le) +naneq = make_nancomp(operator.eq) +nanne = make_nancomp(operator.ne) + + +def unique1d(values): + """ + Hash table-based unique + """ + if np.issubdtype(values.dtype, np.floating): + table = _hash.Float64HashTable(len(values)) + uniques = np.array(table.unique(_ensure_float64(values)), + dtype=np.float64) + elif np.issubdtype(values.dtype, np.datetime64): + table = _hash.Int64HashTable(len(values)) + uniques = table.unique(_ensure_int64(values)) + uniques = uniques.view('M8[ns]') + elif np.issubdtype(values.dtype, np.integer): + table = _hash.Int64HashTable(len(values)) + uniques = table.unique(_ensure_int64(values)) + else: + table = _hash.PyObjectHashTable(len(values)) + uniques = table.unique(_ensure_object(values)) + return uniques diff --git a/pandas/core/ops.py b/pandas/core/ops.py new file mode 100644 index 00000000..780edec6 --- /dev/null +++ b/pandas/core/ops.py @@ -0,0 +1,985 @@ +""" +Arithmetic operations for PandasObjects + +This is not a public API. +""" +# necessary to enforce truediv in Python 2.X +from __future__ import division +import operator +import numpy as np +import pandas as pd +from pandas import compat, lib, tslib +import pandas.index as _index +from pandas.util.decorators import Appender +import pandas.core.common as com +import pandas.core.array as pa +import pandas.computation.expressions as expressions +from pandas.core.common import(bind_method, is_list_like, notnull, isnull, + _values_from_object, _maybe_match_name) + +# ----------------------------------------------------------------------------- +# Functions that add arithmetic methods to objects, given arithmetic factory +# methods + + +def _create_methods(arith_method, radd_func, comp_method, bool_method, + use_numexpr, special=False, default_axis='columns'): + # creates actual methods based upon arithmetic, comp and bool method + # constructors. 
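+ # Each entry below is produced by the passed-in factory; e.g. for
+ # Series with special=True, arith_method(operator.add, '__add__', '+')
+ # yields the function that is later bound as Series.__add__.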
+ + # NOTE: Only frame cares about default_axis, specifically: special methods + # have default axis None, whereas flex methods have default axis 'columns' + # if we're not using numexpr, then don't pass a str_rep + if use_numexpr: + op = lambda x: x + else: + op = lambda x: None + if special: + def names(x): + if x[-1] == "_": + return "__%s_" % x + else: + return "__%s__" % x + else: + names = lambda x: x + radd_func = radd_func or operator.add + # Inframe, all special methods have default_axis=None, flex methods have + # default_axis set to the default (columns) + new_methods = dict( + add=arith_method(operator.add, names('add'), op('+'), + default_axis=default_axis), + radd=arith_method(radd_func, names('radd'), op('+'), + default_axis=default_axis), + sub=arith_method(operator.sub, names('sub'), op('-'), + default_axis=default_axis), + mul=arith_method(operator.mul, names('mul'), op('*'), + default_axis=default_axis), + truediv=arith_method(operator.truediv, names('truediv'), op('/'), + truediv=True, fill_zeros=np.inf, + default_axis=default_axis), + floordiv=arith_method(operator.floordiv, names('floordiv'), op('//'), + default_axis=default_axis, fill_zeros=np.inf), + # Causes a floating point exception in the tests when numexpr + # enabled, so for now no speedup + mod=arith_method(operator.mod, names('mod'), None, + default_axis=default_axis, fill_zeros=np.nan), + pow=arith_method(operator.pow, names('pow'), op('**'), + default_axis=default_axis), + # not entirely sure why this is necessary, but previously was included + # so it's here to maintain compatibility + rmul=arith_method(operator.mul, names('rmul'), op('*'), + default_axis=default_axis, reversed=True), + rsub=arith_method(lambda x, y: y - x, names('rsub'), op('-'), + default_axis=default_axis, reversed=True), + rtruediv=arith_method(lambda x, y: operator.truediv(y, x), + names('rtruediv'), op('/'), truediv=True, + fill_zeros=np.inf, default_axis=default_axis, + reversed=True), + rfloordiv=arith_method(lambda x, y: operator.floordiv(y, x), + names('rfloordiv'), op('//'), + default_axis=default_axis, fill_zeros=np.inf, + reversed=True), + rpow=arith_method(lambda x, y: y ** x, names('rpow'), op('**'), + default_axis=default_axis, reversed=True), + rmod=arith_method(lambda x, y: y % x, names('rmod'), op('%'), + default_axis=default_axis, reversed=True), + ) + new_methods['div'] = new_methods['truediv'] + new_methods['rdiv'] = new_methods['rtruediv'] + + # Comp methods never had a default axis set + if comp_method: + new_methods.update(dict( + eq=comp_method(operator.eq, names('eq'), op('==')), + ne=comp_method(operator.ne, names('ne'), op('!='), masker=True), + lt=comp_method(operator.lt, names('lt'), op('<')), + gt=comp_method(operator.gt, names('gt'), op('>')), + le=comp_method(operator.le, names('le'), op('<=')), + ge=comp_method(operator.ge, names('ge'), op('>=')), + )) + if bool_method: + new_methods.update(dict( + and_=bool_method(operator.and_, names('and_'), op('&')), + or_=bool_method(operator.or_, names('or_'), op('|')), + # For some reason ``^`` wasn't used in original. 
+ xor=bool_method(operator.xor, names('xor'), op('^')), + rand_=bool_method(lambda x, y: operator.and_(y, x), + names('rand_'), op('&')), + ror_=bool_method(lambda x, y: operator.or_(y, x), names('ror_'), op('|')), + rxor=bool_method(lambda x, y: operator.xor(y, x), names('rxor'), op('^')) + )) + + new_methods = dict((names(k), v) for k, v in new_methods.items()) + return new_methods + + +def add_methods(cls, new_methods, force, select, exclude): + if select and exclude: + raise TypeError("May only pass either select or exclude") + methods = new_methods + if select: + select = set(select) + methods = {} + for key, method in new_methods.items(): + if key in select: + methods[key] = method + if exclude: + for k in exclude: + new_methods.pop(k, None) + + for name, method in new_methods.items(): + if force or name not in cls.__dict__: + bind_method(cls, name, method) + + +#---------------------------------------------------------------------- +# Arithmetic +def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, + comp_method=None, bool_method=None, + use_numexpr=True, force=False, select=None, + exclude=None): + """ + Adds the full suite of special arithmetic methods (``__add__``, + ``__sub__``, etc.) to the class. + + Parameters + ---------- + arith_method : function (optional) + factory for special arithmetic methods, with op string: + f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) + radd_func : function (optional) + Possible replacement for ``operator.add`` for compatibility + comp_method : function, optional, + factory for rich comparison - signature: f(op, name, str_rep) + use_numexpr : bool, default True + whether to accelerate with numexpr, defaults to True + force : bool, default False + if False, checks whether function is defined **on ``cls.__dict__``** + before defining if True, always defines functions on class base + select : iterable of strings (optional) + if passed, only sets functions with names in select + exclude : iterable of strings (optional) + if passed, will not set functions with names in exclude + """ + radd_func = radd_func or operator.add + # in frame, special methods have default_axis = None, comp methods use + # 'columns' + new_methods = _create_methods(arith_method, radd_func, comp_method, + bool_method, use_numexpr, default_axis=None, + special=True) + + # inplace operators (I feel like these should get passed an `inplace=True` + # or just be removed + new_methods.update(dict( + __iadd__=new_methods["__add__"], + __isub__=new_methods["__sub__"], + __imul__=new_methods["__mul__"], + __itruediv__=new_methods["__truediv__"], + __ipow__=new_methods["__pow__"] + )) + if not compat.PY3: + new_methods["__idiv__"] = new_methods["__div__"] + + add_methods(cls, new_methods=new_methods, force=force, select=select, + exclude=exclude) + + +def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, + flex_comp_method=None, flex_bool_method=None, + use_numexpr=True, force=False, select=None, + exclude=None): + """ + Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) + to the class. 
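+ Flex methods are the named variants (e.g. ``obj.add(other)``) that can
+ accept extra arguments such as ``fill_value`` and ``level``, in contrast
+ to the ``__add__``-style special methods added above.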
+ + Parameters + ---------- + flex_arith_method : function (optional) + factory for special arithmetic methods, with op string: + f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) + radd_func : function (optional) + Possible replacement for ``lambda x, y: operator.add(y, x)`` for + compatibility + flex_comp_method : function, optional, + factory for rich comparison - signature: f(op, name, str_rep) + use_numexpr : bool, default True + whether to accelerate with numexpr, defaults to True + force : bool, default False + if False, checks whether function is defined **on ``cls.__dict__``** + before defining if True, always defines functions on class base + select : iterable of strings (optional) + if passed, only sets functions with names in select + exclude : iterable of strings (optional) + if passed, will not set functions with names in exclude + """ + radd_func = radd_func or (lambda x, y: operator.add(y, x)) + # in frame, default axis is 'columns', doesn't matter for series and panel + new_methods = _create_methods( + flex_arith_method, radd_func, flex_comp_method, flex_bool_method, + use_numexpr, default_axis='columns', special=False) + new_methods.update(dict( + multiply=new_methods['mul'], + subtract=new_methods['sub'], + divide=new_methods['div'] + )) + # opt out of bool flex methods for now + for k in ('ror_', 'rxor', 'rand_'): + if k in new_methods: + new_methods.pop(k) + + add_methods(cls, new_methods=new_methods, force=force, select=select, + exclude=exclude) + + +class _TimeOp(object): + + """ + Wrapper around Series datetime/time/timedelta arithmetic operations. + Generally, you should use classmethod ``maybe_convert_for_time_op`` as an + entry point. + """ + fill_value = tslib.iNaT + wrap_results = staticmethod(lambda x: x) + dtype = None + + def __init__(self, left, right, name): + self.name = name + + # need to make sure that we are aligning the data + if isinstance(left, pd.Series) and isinstance(right, pd.Series): + left, right = left.align(right) + + self.left = left + self.right = right + lvalues = self._convert_to_array(left, name=name) + rvalues = self._convert_to_array(right, name=name, other=lvalues) + + self.is_timedelta_lhs = com.is_timedelta64_dtype(left) + self.is_datetime_lhs = com.is_datetime64_dtype(left) + self.is_integer_lhs = left.dtype.kind in ['i', 'u'] + self.is_datetime_rhs = com.is_datetime64_dtype(rvalues) + self.is_timedelta_rhs = (com.is_timedelta64_dtype(rvalues) + or (not self.is_datetime_rhs + and pd._np_version_under1p7)) + self.is_integer_rhs = rvalues.dtype.kind in ('i', 'u') + + self._validate() + + self._convert_for_datetime(lvalues, rvalues) + + def _validate(self): + # timedelta and integer mul/div + + if (self.is_timedelta_lhs and self.is_integer_rhs) or\ + (self.is_integer_lhs and self.is_timedelta_rhs): + + if self.name not in ('__truediv__', '__div__', '__mul__'): + raise TypeError("can only operate on a timedelta and an " + "integer for division, but the operator [%s]" + "was passed" % self.name) + + # 2 datetimes + elif self.is_datetime_lhs and self.is_datetime_rhs: + if self.name != '__sub__': + raise TypeError("can only operate on a datetimes for" + " subtraction, but the operator [%s] was" + " passed" % self.name) + + # 2 timedeltas + elif self.is_timedelta_lhs and self.is_timedelta_rhs: + + if self.name not in ('__div__', '__truediv__', '__add__', + '__sub__'): + raise TypeError("can only operate on a timedeltas for " + "addition, subtraction, and division, but the" + " operator [%s] was passed" % self.name) 
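+ # In short, the accepted combinations are:
+ #   timedelta <op> integer   -> mul / div only (above)
+ #   datetime  <op> datetime  -> sub only (above)
+ #   timedelta <op> timedelta -> add / sub / div (above)
+ #   datetime  <op> timedelta -> add / sub (below)
+ #   timedelta <op> datetime  -> add only (below)
+ # anything else raises TypeError.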
+ + # datetime and timedelta + elif self.is_datetime_lhs and self.is_timedelta_rhs: + + if self.name not in ('__add__', '__sub__'): + raise TypeError("can only operate on a datetime with a rhs of" + " a timedelta for addition and subtraction, " + " but the operator [%s] was passed" % + self.name) + + elif self.is_timedelta_lhs and self.is_datetime_rhs: + + if self.name != '__add__': + raise TypeError("can only operate on a timedelta and" + " a datetime for addition, but the operator" + " [%s] was passed" % self.name) + else: + raise TypeError('cannot operate on a series with out a rhs ' + 'of a series/ndarray of type datetime64[ns] ' + 'or a timedelta') + + def _convert_to_array(self, values, name=None, other=None): + """converts values to ndarray""" + from pandas.tseries.timedeltas import _possibly_cast_to_timedelta + + coerce = 'compat' if pd._np_version_under1p7 else True + if not is_list_like(values): + values = np.array([values]) + inferred_type = lib.infer_dtype(values) + + if inferred_type in ('datetime64', 'datetime', 'date', 'time'): + # if we have a other of timedelta, but use pd.NaT here we + # we are in the wrong path + if (other is not None and other.dtype == 'timedelta64[ns]' and + all(isnull(v) for v in values)): + values = np.empty(values.shape, dtype=other.dtype) + values[:] = tslib.iNaT + + # a datetlike + elif not (isinstance(values, (pa.Array, pd.Series)) and + com.is_datetime64_dtype(values)): + values = tslib.array_to_datetime(values) + elif isinstance(values, pd.DatetimeIndex): + values = values.to_series() + elif inferred_type in ('timedelta', 'timedelta64'): + # have a timedelta, convert to to ns here + values = _possibly_cast_to_timedelta(values, coerce=coerce, dtype='timedelta64[ns]') + elif inferred_type == 'integer': + # py3 compat where dtype is 'm' but is an integer + if values.dtype.kind == 'm': + values = values.astype('timedelta64[ns]') + elif isinstance(values, pd.PeriodIndex): + values = values.to_timestamp().to_series() + elif name not in ('__truediv__', '__div__', '__mul__'): + raise TypeError("incompatible type for a datetime/timedelta " + "operation [{0}]".format(name)) + elif isinstance(values[0], pd.DateOffset): + # handle DateOffsets + os = pa.array([getattr(v, 'delta', None) for v in values]) + mask = isnull(os) + if mask.any(): + raise TypeError("cannot use a non-absolute DateOffset in " + "datetime/timedelta operations [{0}]".format( + ', '.join([com.pprint_thing(v) + for v in values[mask]]))) + values = _possibly_cast_to_timedelta(os, coerce=coerce) + elif inferred_type == 'floating': + + # all nan, so ok, use the other dtype (e.g. 
timedelta or datetime) + if isnull(values).all(): + values = np.empty(values.shape, dtype=other.dtype) + values[:] = tslib.iNaT + else: + raise TypeError( + 'incompatible type [{0}] for a datetime/timedelta ' + 'operation'.format(pa.array(values).dtype)) + else: + raise TypeError("incompatible type [{0}] for a datetime/timedelta" + " operation".format(pa.array(values).dtype)) + + return values + + def _convert_for_datetime(self, lvalues, rvalues): + mask = None + # datetimes require views + if self.is_datetime_lhs or self.is_datetime_rhs: + # datetime subtraction means timedelta + if self.is_datetime_lhs and self.is_datetime_rhs: + self.dtype = 'timedelta64[ns]' + else: + self.dtype = 'datetime64[ns]' + mask = isnull(lvalues) | isnull(rvalues) + lvalues = lvalues.view(np.int64) + rvalues = rvalues.view(np.int64) + + # otherwise it's a timedelta + else: + self.dtype = 'timedelta64[ns]' + mask = isnull(lvalues) | isnull(rvalues) + lvalues = lvalues.astype(np.int64) + rvalues = rvalues.astype(np.int64) + + # time delta division -> unit less + # integer gets converted to timedelta in np < 1.6 + if (self.is_timedelta_lhs and self.is_timedelta_rhs) and\ + not self.is_integer_rhs and\ + not self.is_integer_lhs and\ + self.name in ('__div__', '__truediv__'): + self.dtype = 'float64' + self.fill_value = np.nan + lvalues = lvalues.astype(np.float64) + rvalues = rvalues.astype(np.float64) + + # if we need to mask the results + if mask is not None: + if mask.any(): + def f(x): + x = pa.array(x, dtype=self.dtype) + np.putmask(x, mask, self.fill_value) + return x + self.wrap_results = f + self.lvalues = lvalues + self.rvalues = rvalues + + @classmethod + def maybe_convert_for_time_op(cls, left, right, name): + """ + if ``left`` and ``right`` are appropriate for datetime arithmetic with + operation ``name``, processes them and returns a ``_TimeOp`` object + that stores all the required values. Otherwise, it will generate + either a ``NotImplementedError`` or ``None``, indicating that the + operation is unsupported for datetimes (e.g., an unsupported r_op) or + that the data is not the right type for time ops. + """ + # decide if we can do it + is_timedelta_lhs = com.is_timedelta64_dtype(left) + is_datetime_lhs = com.is_datetime64_dtype(left) + if not (is_datetime_lhs or is_timedelta_lhs): + return None + + # rops are allowed. No need for special checks, just strip off + # r part. + if name.startswith('__r'): + name = "__" + name[3:] + return cls(left, right, name) + + +def _arith_method_SERIES(op, name, str_rep, fill_zeros=None, + default_axis=None, **eval_kwargs): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. 
+ """ + def na_op(x, y): + try: + result = expressions.evaluate(op, str_rep, x, y, + raise_on_error=True, **eval_kwargs) + except TypeError: + if isinstance(y, (pa.Array, pd.Series)): + dtype = np.find_common_type([x.dtype, y.dtype], []) + result = np.empty(x.size, dtype=dtype) + mask = notnull(x) & notnull(y) + result[mask] = op(x[mask], y[mask]) + else: + result = pa.empty(len(x), dtype=x.dtype) + mask = notnull(x) + result[mask] = op(x[mask], y) + + result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA) + + result = com._fill_zeros(result, x, y, name, fill_zeros) + return result + + def wrapper(left, right, name=name): + + if isinstance(right, pd.DataFrame): + return NotImplemented + + time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name) + + if time_converted is None: + lvalues, rvalues = left, right + dtype = None + wrap_results = lambda x: x + elif time_converted == NotImplemented: + return NotImplemented + else: + left, right = time_converted.left, time_converted.right + lvalues, rvalues = time_converted.lvalues, time_converted.rvalues + dtype = time_converted.dtype + wrap_results = time_converted.wrap_results + + if isinstance(rvalues, pd.Series): + rindex = getattr(rvalues,'index',rvalues) + name = _maybe_match_name(left, rvalues) + lvalues = getattr(lvalues, 'values', lvalues) + rvalues = getattr(rvalues, 'values', rvalues) + if left.index.equals(rindex): + index = left.index + else: + index, lidx, ridx = left.index.join(rindex, how='outer', + return_indexers=True) + + if lidx is not None: + lvalues = com.take_1d(lvalues, lidx) + + if ridx is not None: + rvalues = com.take_1d(rvalues, ridx) + + arr = na_op(lvalues, rvalues) + + return left._constructor(wrap_results(arr), index=index, + name=name, dtype=dtype) + else: + # scalars + if hasattr(lvalues, 'values'): + lvalues = lvalues.values + return left._constructor(wrap_results(na_op(lvalues, rvalues)), + index=left.index, name=left.name, + dtype=dtype) + return wrapper + + +def _comp_method_SERIES(op, name, str_rep, masker=False): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. 
+ """ + def na_op(x, y): + if x.dtype == np.object_: + if isinstance(y, list): + y = lib.list_to_object_array(y) + + if isinstance(y, (pa.Array, pd.Series)): + if y.dtype != np.object_: + result = lib.vec_compare(x, y.astype(np.object_), op) + else: + result = lib.vec_compare(x, y, op) + else: + result = lib.scalar_compare(x, y, op) + else: + + try: + result = getattr(x, name)(y) + if result is NotImplemented: + raise TypeError("invalid type comparison") + except (AttributeError): + result = op(x, y) + + return result + + def wrapper(self, other): + if isinstance(other, pd.Series): + name = _maybe_match_name(self, other) + if len(self) != len(other): + raise ValueError('Series lengths must match to compare') + return self._constructor(na_op(self.values, other.values), + index=self.index, name=name) + elif isinstance(other, pd.DataFrame): # pragma: no cover + return NotImplemented + elif isinstance(other, (pa.Array, pd.Series)): + if len(self) != len(other): + raise ValueError('Lengths must match to compare') + return self._constructor(na_op(self.values, np.asarray(other)), + index=self.index).__finalize__(self) + else: + + mask = isnull(self) + + values = self.values + other = _index.convert_scalar(values, other) + + if issubclass(values.dtype.type, np.datetime64): + values = values.view('i8') + + # scalars + res = na_op(values, other) + if np.isscalar(res): + raise TypeError('Could not compare %s type with Series' + % type(other)) + + # always return a full value series here + res = _values_from_object(res) + + res = pd.Series(res, index=self.index, name=self.name, + dtype='bool') + + # mask out the invalids + if mask.any(): + res[mask] = masker + + return res + return wrapper + + +def _bool_method_SERIES(op, name, str_rep): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + def na_op(x, y): + try: + result = op(x, y) + except TypeError: + if isinstance(y, list): + y = lib.list_to_object_array(y) + + if isinstance(y, (pa.Array, pd.Series)): + if (x.dtype == np.bool_ and + y.dtype == np.bool_): # pragma: no cover + result = op(x, y) # when would this be hit? 
+ else: + x = com._ensure_object(x) + y = com._ensure_object(y) + result = lib.vec_binop(x, y, op) + else: + try: + + # let null fall thru + if not isnull(y): + y = bool(y) + result = lib.scalar_binop(x, y, op) + except: + raise TypeError("cannot compare a dtyped [{0}] array with " + "a scalar of type [{1}]".format( + x.dtype, type(y).__name__)) + + return result + + def wrapper(self, other): + if isinstance(other, pd.Series): + name = _maybe_match_name(self, other) + + other = other.reindex_like(self).fillna(False).astype(bool) + return self._constructor(na_op(self.values, other.values), + index=self.index, + name=name).fillna(False).astype(bool) + elif isinstance(other, pd.DataFrame): + return NotImplemented + else: + # scalars + res = self._constructor(na_op(self.values, other), + index=self.index).fillna(False) + return res.astype(bool).__finalize__(self) + return wrapper + + +def _radd_compat(left, right): + radd = lambda x, y: y + x + # GH #353, NumPy 1.5.1 workaround + try: + output = radd(left, right) + except TypeError: + cond = (pd._np_version_under1p6 and + left.dtype == np.object_) + if cond: # pragma: no cover + output = np.empty_like(left) + output.flat[:] = [radd(x, right) for x in left.flat] + else: + raise + + return output + + +def _flex_method_SERIES(op, name, str_rep, default_axis=None, + fill_zeros=None, **eval_kwargs): + doc = """ + Binary operator %s with support to substitute a fill_value for missing data + in one of the inputs + + Parameters + ---------- + other: Series or scalar value + fill_value : None or float value, default None (NaN) + Fill missing (NaN) values with this value. If both Series are + missing, the result will be missing + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + + Returns + ------- + result : Series + """ % name + + @Appender(doc) + def flex_wrapper(self, other, level=None, fill_value=None, axis=0): + # validate axis + self._get_axis_number(axis) + if isinstance(other, pd.Series): + return self._binop(other, op, level=level, fill_value=fill_value) + elif isinstance(other, (pa.Array, pd.Series, list, tuple)): + if len(other) != len(self): + raise ValueError('Lengths must be equal') + return self._binop(self._constructor(other, self.index), op, + level=level, fill_value=fill_value) + else: + return self._constructor(op(self.values, other), + self.index).__finalize__(self) + + flex_wrapper.__name__ = name + return flex_wrapper + +series_flex_funcs = dict(flex_arith_method=_flex_method_SERIES, + radd_func=_radd_compat, + flex_comp_method=_comp_method_SERIES) + +series_special_funcs = dict(arith_method=_arith_method_SERIES, + radd_func=_radd_compat, + comp_method=_comp_method_SERIES, + bool_method=_bool_method_SERIES) + + +_arith_doc_FRAME = """ +Binary operator %s with support to substitute a fill_value for missing data in +one of the inputs + +Parameters +---------- +other : Series, DataFrame, or constant +axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on +fill_value : None or float value, default None + Fill missing (NaN) values with this value. 
If both DataFrame locations are + missing, the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Notes +----- +Mismatched indices will be unioned together + +Returns +------- +result : DataFrame +""" + + +def _arith_method_FRAME(op, name, str_rep=None, default_axis='columns', + fill_zeros=None, **eval_kwargs): + def na_op(x, y): + try: + result = expressions.evaluate( + op, str_rep, x, y, raise_on_error=True, **eval_kwargs) + except TypeError: + xrav = x.ravel() + if isinstance(y, (np.ndarray, pd.Series)): + dtype = np.find_common_type([x.dtype, y.dtype], []) + result = np.empty(x.size, dtype=dtype) + yrav = y.ravel() + mask = notnull(xrav) & notnull(yrav) + xrav = xrav[mask] + yrav = yrav[mask] + if np.prod(xrav.shape) and np.prod(yrav.shape): + result[mask] = op(xrav, yrav) + else: + result = np.empty(x.size, dtype=x.dtype) + mask = notnull(xrav) + xrav = xrav[mask] + if np.prod(xrav.shape): + result[mask] = op(xrav, y) + + result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result = result.reshape(x.shape) + + result = com._fill_zeros(result, x, y, name, fill_zeros) + + return result + + @Appender(_arith_doc_FRAME % name) + def f(self, other, axis=default_axis, level=None, fill_value=None): + if isinstance(other, pd.DataFrame): # Another DataFrame + return self._combine_frame(other, na_op, fill_value, level) + elif isinstance(other, pd.Series): + return self._combine_series(other, na_op, fill_value, axis, level) + elif isinstance(other, (list, tuple)): + if axis is not None and self._get_axis_name(axis) == 'index': + # TODO: Get all of these to use _constructor_sliced + # casted = self._constructor_sliced(other, index=self.index) + casted = pd.Series(other, index=self.index) + else: + # casted = self._constructor_sliced(other, index=self.columns) + casted = pd.Series(other, index=self.columns) + return self._combine_series(casted, na_op, fill_value, axis, level) + elif isinstance(other, np.ndarray): + if other.ndim == 1: + if axis is not None and self._get_axis_name(axis) == 'index': + # casted = self._constructor_sliced(other, + # index=self.index) + casted = pd.Series(other, index=self.index) + else: + # casted = self._constructor_sliced(other, + # index=self.columns) + casted = pd.Series(other, index=self.columns) + return self._combine_series(casted, na_op, fill_value, + axis, level) + elif other.ndim == 2: + # casted = self._constructor(other, index=self.index, + # columns=self.columns) + casted = pd.DataFrame(other, index=self.index, + columns=self.columns) + return self._combine_frame(casted, na_op, fill_value, level) + else: + raise ValueError("Incompatible argument shape: %s" % + (other.shape, )) + else: + return self._combine_const(other, na_op) + + f.__name__ = name + + return f + + +# Masker unused for now +def _flex_comp_method_FRAME(op, name, str_rep=None, default_axis='columns', + masker=False): + + def na_op(x, y): + try: + result = op(x, y) + except TypeError: + xrav = x.ravel() + result = np.empty(x.size, dtype=x.dtype) + if isinstance(y, (np.ndarray, pd.Series)): + yrav = y.ravel() + mask = notnull(xrav) & notnull(yrav) + result[mask] = op(np.array(list(xrav[mask])), + np.array(list(yrav[mask]))) + else: + mask = notnull(xrav) + result[mask] = op(np.array(list(xrav[mask])), y) + + if op == operator.ne: # pragma: no cover + np.putmask(result, ~mask, True) + else: + np.putmask(result, ~mask, False) + result = result.reshape(x.shape) + + return result + + @Appender('Wrapper 
for flexible comparison methods %s' % name) + def f(self, other, axis=default_axis, level=None): + if isinstance(other, pd.DataFrame): # Another DataFrame + return self._flex_compare_frame(other, na_op, str_rep, level) + + elif isinstance(other, pd.Series): + return self._combine_series(other, na_op, None, axis, level) + + elif isinstance(other, (list, tuple)): + if axis is not None and self._get_axis_name(axis) == 'index': + casted = pd.Series(other, index=self.index) + else: + casted = pd.Series(other, index=self.columns) + + return self._combine_series(casted, na_op, None, axis, level) + + elif isinstance(other, np.ndarray): + if other.ndim == 1: + if axis is not None and self._get_axis_name(axis) == 'index': + casted = pd.Series(other, index=self.index) + else: + casted = pd.Series(other, index=self.columns) + + return self._combine_series(casted, na_op, None, axis, level) + + elif other.ndim == 2: + casted = pd.DataFrame(other, index=self.index, + columns=self.columns) + + return self._flex_compare_frame(casted, na_op, str_rep, level) + + else: + raise ValueError("Incompatible argument shape: %s" % + (other.shape, )) + + else: + return self._combine_const(other, na_op) + + f.__name__ = name + + return f + + +def _comp_method_FRAME(func, name, str_rep, masker=False): + @Appender('Wrapper for comparison method %s' % name) + def f(self, other): + if isinstance(other, pd.DataFrame): # Another DataFrame + return self._compare_frame(other, func, str_rep) + elif isinstance(other, pd.Series): + return self._combine_series_infer(other, func) + else: + + # straight boolean comparisions we want to allow all columns + # (regardless of dtype to pass thru) See #4537 for discussion. + res = self._combine_const(other, func, raise_on_error=False) + return res.fillna(True).astype(bool) + + f.__name__ = name + + return f + + +frame_flex_funcs = dict(flex_arith_method=_arith_method_FRAME, + radd_func=_radd_compat, + flex_comp_method=_flex_comp_method_FRAME) + + +frame_special_funcs = dict(arith_method=_arith_method_FRAME, + radd_func=_radd_compat, + comp_method=_comp_method_FRAME, + bool_method=_arith_method_FRAME) + + +def _arith_method_PANEL(op, name, str_rep=None, fill_zeros=None, + default_axis=None, **eval_kwargs): + # copied from Series na_op above, but without unnecessary branch for + # non-scalar + def na_op(x, y): + try: + result = expressions.evaluate(op, str_rep, x, y, + raise_on_error=True, **eval_kwargs) + except TypeError: + + # TODO: might need to find_common_type here? 
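+ # fall back to masked evaluation: operate only on the non-null
+ # positions and let the masked slots be upcast/filled with NA below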
+ result = pa.empty(len(x), dtype=x.dtype) + mask = notnull(x) + result[mask] = op(x[mask], y) + result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA) + + result = com._fill_zeros(result, x, y, name, fill_zeros) + return result + + # work only for scalars + def f(self, other): + if not np.isscalar(other): + raise ValueError('Simple arithmetic with %s can only be ' + 'done with scalar values' % + self._constructor.__name__) + + return self._combine(other, op) + f.__name__ = name + return f + + +def _comp_method_PANEL(op, name, str_rep=None, masker=False): + + def na_op(x, y): + try: + result = expressions.evaluate(op, str_rep, x, y, + raise_on_error=True) + except TypeError: + xrav = x.ravel() + result = np.empty(x.size, dtype=bool) + if isinstance(y, np.ndarray): + yrav = y.ravel() + mask = notnull(xrav) & notnull(yrav) + result[mask] = op(np.array(list(xrav[mask])), + np.array(list(yrav[mask]))) + else: + mask = notnull(xrav) + result[mask] = op(np.array(list(xrav[mask])), y) + + if op == operator.ne: # pragma: no cover + np.putmask(result, ~mask, True) + else: + np.putmask(result, ~mask, False) + result = result.reshape(x.shape) + + return result + + @Appender('Wrapper for comparison method %s' % name) + def f(self, other): + if isinstance(other, self._constructor): + return self._compare_constructor(other, na_op) + elif isinstance(other, (self._constructor_sliced, pd.DataFrame, + pd.Series)): + raise Exception("input needs alignment for this object [%s]" % + self._constructor) + else: + return self._combine_const(other, na_op) + + f.__name__ = name + + return f + + +panel_special_funcs = dict(arith_method=_arith_method_PANEL, + comp_method=_comp_method_PANEL, + bool_method=_arith_method_PANEL) diff --git a/pandas/core/panel.py b/pandas/core/panel.py new file mode 100644 index 00000000..e9f88933 --- /dev/null +++ b/pandas/core/panel.py @@ -0,0 +1,1451 @@ +""" +Contains data structures designed for manipulating panel (3-dimensional) data +""" +# pylint: disable=E1103,W0231,W0212,W0621 +from __future__ import division +from pandas.compat import (map, zip, range, lrange, lmap, u, OrderedDict, + OrderedDefaultdict) +from pandas import compat +import sys +import numpy as np +from pandas.core.common import (PandasError, _try_sort, _default_index, + _infer_dtype_from_scalar, notnull) +from pandas.core.categorical import Categorical +from pandas.core.index import (Index, MultiIndex, _ensure_index, + _get_combined_index) +from pandas.core.indexing import _maybe_droplevels, _is_list_like +from pandas.core.internals import (BlockManager, + create_block_manager_from_arrays, + create_block_manager_from_blocks) +from pandas.core.series import Series +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame, _shared_docs +from pandas.tools.util import cartesian_product +from pandas import compat +from pandas.util.decorators import (deprecate, Appender, Substitution, + deprecate_kwarg) +import pandas.core.common as com +import pandas.core.ops as ops +import pandas.core.nanops as nanops +import pandas.computation.expressions as expressions + + +_shared_doc_kwargs = dict( + axes='items, major_axis, minor_axis', + klass="Panel", + axes_single_arg="{0,1,2,'items','major_axis','minor_axis'}") +_shared_doc_kwargs['args_transpose'] = ("three positional arguments: each one" + "of\n %s" % + _shared_doc_kwargs['axes_single_arg']) + + +def _ensure_like_indices(time, panels): + """ + Makes sure that time and panels are conformable + """ + n_time = len(time) + n_panel = 
len(panels) + u_panels = np.unique(panels) # this sorts! + u_time = np.unique(time) + if len(u_time) == n_time: + time = np.tile(u_time, len(u_panels)) + if len(u_panels) == n_panel: + panels = np.repeat(u_panels, len(u_time)) + return time, panels + + +def panel_index(time, panels, names=['time', 'panel']): + """ + Returns a multi-index suitable for a panel-like DataFrame + + Parameters + ---------- + time : array-like + Time index, does not have to repeat + panels : array-like + Panel index, does not have to repeat + names : list, optional + List containing the names of the indices + + Returns + ------- + multi_index : MultiIndex + Time index is the first level, the panels are the second level. + + Examples + -------- + >>> years = range(1960,1963) + >>> panels = ['A', 'B', 'C'] + >>> panel_idx = panel_index(years, panels) + >>> panel_idx + MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'), + (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'), + (1962, 'C')], dtype=object) + + or + + >>> import numpy as np + >>> years = np.repeat(range(1960,1963), 3) + >>> panels = np.tile(['A', 'B', 'C'], 3) + >>> panel_idx = panel_index(years, panels) + >>> panel_idx + MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'), + (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'), + (1962, 'C')], dtype=object) + """ + time, panels = _ensure_like_indices(time, panels) + time_factor = Categorical.from_array(time) + panel_factor = Categorical.from_array(panels) + + labels = [time_factor.labels, panel_factor.labels] + levels = [time_factor.levels, panel_factor.levels] + return MultiIndex(levels, labels, sortorder=None, names=names, + verify_integrity=False) + + +class Panel(NDFrame): + + """ + Represents wide format panel data, stored as 3-dimensional array + + Parameters + ---------- + data : ndarray (items x major x minor), or dict of DataFrames + items : Index or array-like + axis=0 + major_axis : Index or array-like + axis=1 + minor_axis : Index or array-like + axis=2 + dtype : dtype, default None + Data type to force, otherwise infer + copy : boolean, default False + Copy data from inputs. 
Only affects DataFrame / 2d ndarray input + """ + + @property + def _constructor(self): + return type(self) + + _constructor_sliced = DataFrame + + def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, + copy=False, dtype=None): + self._init_data(data=data, items=items, major_axis=major_axis, + minor_axis=minor_axis, copy=copy, dtype=dtype) + + def _init_data(self, data, copy, dtype, **kwargs): + """ + Generate ND initialization; axes are passed + as required objects to __init__ + """ + if data is None: + data = {} + if dtype is not None: + dtype = self._validate_dtype(dtype) + + passed_axes = [kwargs.get(a) for a in self._AXIS_ORDERS] + axes = None + if isinstance(data, BlockManager): + if any(x is not None for x in passed_axes): + axes = [x if x is not None else y + for x, y in zip(passed_axes, data.axes)] + mgr = data + elif isinstance(data, dict): + mgr = self._init_dict(data, passed_axes, dtype=dtype) + copy = False + dtype = None + elif isinstance(data, (np.ndarray, list)): + mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy) + copy = False + dtype = None + else: # pragma: no cover + raise PandasError('Panel constructor not properly called!') + + NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype) + + def _init_dict(self, data, axes, dtype=None): + haxis = axes.pop(self._info_axis_number) + + # prefilter if haxis passed + if haxis is not None: + haxis = _ensure_index(haxis) + data = OrderedDict((k, v) for k, v + in compat.iteritems(data) if k in haxis) + else: + ks = list(data.keys()) + if not isinstance(data, OrderedDict): + ks = _try_sort(ks) + haxis = Index(ks) + + for k, v in compat.iteritems(data): + if isinstance(v, dict): + data[k] = self._constructor_sliced(v) + + # extract axis for remaining axes & create the slicemap + raxes = [self._extract_axis(self, data, axis=i) + if a is None else a for i, a in enumerate(axes)] + raxes_sm = self._extract_axes_for_slice(self, raxes) + + # shallow copy + arrays = [] + haxis_shape = [len(a) for a in raxes] + for h in haxis: + v = values = data.get(h) + if v is None: + values = np.empty(haxis_shape, dtype=dtype) + values.fill(np.nan) + elif isinstance(v, self._constructor_sliced): + d = raxes_sm.copy() + d['copy'] = False + v = v.reindex(**d) + if dtype is not None: + v = v.astype(dtype) + values = v.values + arrays.append(values) + + return self._init_arrays(arrays, haxis, [haxis] + raxes) + + def _init_arrays(self, arrays, arr_names, axes): + return create_block_manager_from_arrays(arrays, arr_names, axes) + + @classmethod + def from_dict(cls, data, intersect=False, orient='items', dtype=None): + """ + Construct Panel from dict of DataFrame objects + + Parameters + ---------- + data : dict + {field : DataFrame} + intersect : boolean + Intersect indexes of input DataFrames + orient : {'items', 'minor'}, default 'items' + The "orientation" of the data. If the keys of the passed dict + should be the items of the result panel, pass 'items' + (default). 
Otherwise if the columns of the values of the passed + DataFrame objects should be the items (which in the case of + mixed-dtype data you should do), instead pass 'minor' + + + Returns + ------- + Panel + """ + orient = orient.lower() + if orient == 'minor': + new_data = OrderedDefaultdict(dict) + for col, df in compat.iteritems(data): + for item, s in compat.iteritems(df): + new_data[item][col] = s + data = new_data + elif orient != 'items': # pragma: no cover + raise ValueError('Orientation must be one of {items, minor}.') + + d = cls._homogenize_dict(cls, data, intersect=intersect, dtype=dtype) + ks = list(d['data'].keys()) + if not isinstance(d['data'], OrderedDict): + ks = list(sorted(ks)) + d[cls._info_axis_name] = Index(ks) + return cls(**d) + + def __getitem__(self, key): + if isinstance(self._info_axis, MultiIndex): + return self._getitem_multilevel(key) + return super(Panel, self).__getitem__(key) + + def _getitem_multilevel(self, key): + info = self._info_axis + loc = info.get_loc(key) + if isinstance(loc, (slice, np.ndarray)): + new_index = info[loc] + result_index = _maybe_droplevels(new_index, key) + slices = [loc] + [slice(None) for x in range( + self._AXIS_LEN - 1)] + new_values = self.values[slices] + + d = self._construct_axes_dict(self._AXIS_ORDERS[1:]) + d[self._info_axis_name] = result_index + result = self._constructor(new_values, **d) + return result + else: + return self._get_item_cache(key) + + def _init_matrix(self, data, axes, dtype=None, copy=False): + values = self._prep_ndarray(self, data, copy=copy) + + if dtype is not None: + try: + values = values.astype(dtype) + except Exception: + raise ValueError('failed to cast to %s' % dtype) + + shape = values.shape + fixed_axes = [] + for i, ax in enumerate(axes): + if ax is None: + ax = _default_index(shape[i]) + else: + ax = _ensure_index(ax) + fixed_axes.append(ax) + + return create_block_manager_from_blocks([values], fixed_axes) + + #---------------------------------------------------------------------- + # Comparison methods + + def _compare_constructor(self, other, func): + if not self._indexed_same(other): + raise Exception('Can only compare identically-labeled ' + 'same type objects') + + new_data = {} + for col in self._info_axis: + new_data[col] = func(self[col], other[col]) + + d = self._construct_axes_dict(copy=False) + return self._constructor(data=new_data, **d) + + #---------------------------------------------------------------------- + # Magic methods + + def __unicode__(self): + """ + Return a string representation for a particular Panel + + Invoked by unicode(df) in py2 only. + Yields a Unicode String in both py2/py3. 
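+
+        The rendered text lists the class name, the dimensions, and the
+        first and last label of each axis.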
+ """ + + class_name = str(self.__class__) + + shape = self.shape + dims = u('Dimensions: %s') % ' x '.join( + ["%d (%s)" % (s, a) for a, s in zip(self._AXIS_ORDERS, shape)]) + + def axis_pretty(a): + v = getattr(self, a) + if len(v) > 0: + return u('%s axis: %s to %s') % (a.capitalize(), + com.pprint_thing(v[0]), + com.pprint_thing(v[-1])) + else: + return u('%s axis: None') % a.capitalize() + + output = '\n'.join( + [class_name, dims] + [axis_pretty(a) for a in self._AXIS_ORDERS]) + return output + + def _get_plane_axes_index(self, axis): + """ + Get my plane axes indexes: these are already + (as compared with higher level planes), + as we are returning a DataFrame axes indexes + """ + axis_name = self._get_axis_name(axis) + + if axis_name == 'major_axis': + index = 'minor_axis' + columns = 'items' + if axis_name == 'minor_axis': + index = 'major_axis' + columns = 'items' + elif axis_name == 'items': + index = 'major_axis' + columns = 'minor_axis' + + return index, columns + + def _get_plane_axes(self, axis): + """ + Get my plane axes indexes: these are already + (as compared with higher level planes), + as we are returning a DataFrame axes + """ + return [ self._get_axis(axi) for axi in self._get_plane_axes_index(axis) ] + + fromDict = from_dict + + def to_sparse(self, fill_value=None, kind='block'): + """ + Convert to SparsePanel + + Parameters + ---------- + fill_value : float, default NaN + kind : {'block', 'integer'} + + Returns + ------- + y : SparseDataFrame + """ + from pandas.core.sparse import SparsePanel + frames = dict(compat.iteritems(self)) + return SparsePanel(frames, items=self.items, + major_axis=self.major_axis, + minor_axis=self.minor_axis, + default_kind=kind, + default_fill_value=fill_value) + + def to_excel(self, path, na_rep='', engine=None, **kwargs): + """ + Write each DataFrame in Panel to a separate excel sheet + + Parameters + ---------- + path : string or ExcelWriter object + File path or existing ExcelWriter + na_rep : string, default '' + Missing data representation + engine : string, default None + write engine to use - you can also set this via the options + ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and + ``io.excel.xlsm.writer``. + + Other Parameters + ---------------- + float_format : string, default None + Format string for floating point numbers + cols : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : boolean, default True + Write row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + startrow : upper left cell row to dump data frame + startcol : upper left cell column to dump data frame + + Notes + ----- + Keyword arguments (and na_rep) are passed to the ``to_excel`` method + for each DataFrame written. 
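+
+        Examples
+        --------
+        An illustrative sketch only; ``wp`` stands for a hypothetical Panel
+        of DataFrames and the file name is made up:
+
+        >>> wp.to_excel('panel_sheets.xlsx', na_rep='NA')  # doctest: +SKIP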
+ """ + from pandas.io.excel import ExcelWriter + + if isinstance(path, compat.string_types): + writer = ExcelWriter(path, engine=engine) + else: + writer = path + kwargs['na_rep'] = na_rep + + for item, df in compat.iteritems(self): + name = str(item) + df.to_excel(writer, name, **kwargs) + writer.save() + + def as_matrix(self): + self._consolidate_inplace() + return self._data.as_matrix() + + #---------------------------------------------------------------------- + # Getting and setting elements + + def get_value(self, *args, **kwargs): + """ + Quickly retrieve single value at (item, major, minor) location + + Parameters + ---------- + item : item label (panel item) + major : major axis label (panel item row) + minor : minor axis label (panel item column) + takeable : interpret the passed labels as indexers, default False + + Returns + ------- + value : scalar value + """ + nargs = len(args) + nreq = self._AXIS_LEN + + # require an arg for each axis + if nargs != nreq: + raise TypeError('There must be an argument for each axis, you gave' + ' {0} args, but {1} are required'.format(nargs, + nreq)) + takeable = kwargs.get('takeable') + + if takeable is True: + lower = self._iget_item_cache(args[0]) + else: + lower = self._get_item_cache(args[0]) + + return lower.get_value(*args[1:], takeable=takeable) + + def set_value(self, *args, **kwargs): + """ + Quickly set single value at (item, major, minor) location + + Parameters + ---------- + item : item label (panel item) + major : major axis label (panel item row) + minor : minor axis label (panel item column) + value : scalar + takeable : interpret the passed labels as indexers, default False + + Returns + ------- + panel : Panel + If label combo is contained, will be reference to calling Panel, + otherwise a new object + """ + # require an arg for each axis and the value + nargs = len(args) + nreq = self._AXIS_LEN + 1 + + if nargs != nreq: + raise TypeError('There must be an argument for each axis plus the ' + 'value provided, you gave {0} args, but {1} are ' + 'required'.format(nargs, nreq)) + takeable = kwargs.get('takeable') + + try: + if takeable is True: + lower = self._iget_item_cache(args[0]) + else: + lower = self._get_item_cache(args[0]) + + lower.set_value(*args[1:], takeable=takeable) + return self + except KeyError: + axes = self._expand_axes(args) + d = self._construct_axes_dict_from(self, axes, copy=False) + result = self.reindex(**d) + args = list(args) + likely_dtype, args[-1] = _infer_dtype_from_scalar(args[-1]) + made_bigger = not np.array_equal( + axes[0], self._info_axis) + # how to make this logic simpler? 
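+            # if reindexing grew the items axis, the newly added item is all
+            # NaN; cast that item to the dtype inferred from the scalar so
+            # the recursive set_value call below can store the value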
+ if made_bigger: + com._possibly_cast_item(result, args[0], likely_dtype) + + return result.set_value(*args) + + def _box_item_values(self, key, values): + if self.ndim == values.ndim: + result = self._constructor(values) + + # a dup selection will yield a full ndim + if result._get_axis(0).is_unique: + result = result[key] + + return result + + d = self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:]) + return self._constructor_sliced(values, **d) + + def __setitem__(self, key, value): + shape = tuple(self.shape) + if isinstance(value, self._constructor_sliced): + value = value.reindex( + **self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:])) + mat = value.values + elif isinstance(value, np.ndarray): + if value.shape != shape[1:]: + raise ValueError( + 'shape of value must be {0}, shape of given object was ' + '{1}'.format(shape[1:], tuple(map(int, value.shape)))) + mat = np.asarray(value) + elif np.isscalar(value): + dtype, value = _infer_dtype_from_scalar(value) + mat = np.empty(shape[1:], dtype=dtype) + mat.fill(value) + else: + raise TypeError('Cannot set item of type: %s' % str(type(value))) + + mat = mat.reshape(tuple([1]) + shape[1:]) + NDFrame._set_item(self, key, mat) + + def _unpickle_panel_compat(self, state): # pragma: no cover + "Unpickle the panel" + _unpickle = com._unpickle_array + vals, items, major, minor = state + + items = _unpickle(items) + major = _unpickle(major) + minor = _unpickle(minor) + values = _unpickle(vals) + wp = Panel(values, items, major, minor) + self._data = wp._data + + def conform(self, frame, axis='items'): + """ + Conform input DataFrame to align with chosen axis pair. + + Parameters + ---------- + frame : DataFrame + axis : {'items', 'major', 'minor'} + + Axis the input corresponds to. E.g., if axis='major', then + the frame's columns would be items, and the index would be + values of the minor axis + + Returns + ------- + DataFrame + """ + axes = self._get_plane_axes(axis) + return frame.reindex(**self._extract_axes_for_slice(self, axes)) + + def head(self, n=5): + raise NotImplementedError + + def tail(self, n=5): + raise NotImplementedError + + def _needs_reindex_multi(self, axes, method, level): + """ don't allow a multi reindex on Panel or above ndim """ + return False + + def dropna(self, axis=0, how='any', inplace=False, **kwargs): + """ + Drop 2D from panel, holding passed axis constant + + Parameters + ---------- + axis : int, default 0 + Axis to hold constant. E.g. axis=1 will drop major_axis entries + having a certain amount of NA data + how : {'all', 'any'}, default 'any' + 'any': one or more values are NA in the DataFrame along the + axis. For 'all' they all must be. + inplace : bool, default False + If True, do operation inplace and return None. 
+ + Returns + ------- + dropped : Panel + """ + axis = self._get_axis_number(axis) + + values = self.values + mask = com.notnull(values) + + for ax in reversed(sorted(set(range(self._AXIS_LEN)) - set([axis]))): + mask = mask.sum(ax) + + per_slice = np.prod(values.shape[:axis] + values.shape[axis + 1:]) + + if how == 'all': + cond = mask > 0 + else: + cond = mask == per_slice + + new_ax = self._get_axis(axis)[cond] + result = self.reindex_axis(new_ax, axis=axis) + if inplace: + self._update_inplace(result) + else: + return result + + def _combine(self, other, func, axis=0): + if isinstance(other, Panel): + return self._combine_panel(other, func) + elif isinstance(other, DataFrame): + return self._combine_frame(other, func, axis=axis) + elif np.isscalar(other): + return self._combine_const(other, func) + + def _combine_const(self, other, func): + new_values = func(self.values, other) + d = self._construct_axes_dict() + return self._constructor(new_values, **d) + + def _combine_frame(self, other, func, axis=0): + index, columns = self._get_plane_axes(axis) + axis = self._get_axis_number(axis) + + other = other.reindex(index=index, columns=columns) + + if axis == 0: + new_values = func(self.values, other.values) + elif axis == 1: + new_values = func(self.values.swapaxes(0, 1), other.values.T) + new_values = new_values.swapaxes(0, 1) + elif axis == 2: + new_values = func(self.values.swapaxes(0, 2), other.values) + new_values = new_values.swapaxes(0, 2) + + return self._constructor(new_values, self.items, self.major_axis, + self.minor_axis) + + def _combine_panel(self, other, func): + items = self.items + other.items + major = self.major_axis + other.major_axis + minor = self.minor_axis + other.minor_axis + + # could check that everything's the same size, but forget it + this = self.reindex(items=items, major=major, minor=minor) + other = other.reindex(items=items, major=major, minor=minor) + + result_values = func(this.values, other.values) + + return self._constructor(result_values, items, major, minor) + + def major_xs(self, key, copy=None): + """ + Return slice of panel along major axis + + Parameters + ---------- + key : object + Major axis label + copy : boolean [deprecated] + Whether to make a copy of the data + + Returns + ------- + y : DataFrame + index -> minor axis, columns -> items + + Notes + ----- + major_xs is only for getting, not setting values. + + MultiIndex Slicers is a generic way to get/set values on any level or levels + it is a superset of major_xs functionality, see :ref:`MultiIndex Slicers ` + + """ + if copy is not None: + warnings.warn("copy keyword is deprecated, " + "default is to return a copy or a view if possible") + + return self.xs(key, axis=self._AXIS_LEN - 2) + + def minor_xs(self, key, copy=None): + """ + Return slice of panel along minor axis + + Parameters + ---------- + key : object + Minor axis label + copy : boolean [deprecated] + Whether to make a copy of the data + + Returns + ------- + y : DataFrame + index -> major axis, columns -> items + + Notes + ----- + minor_xs is only for getting, not setting values. 
+ + MultiIndex Slicers is a generic way to get/set values on any level or levels + it is a superset of minor_xs functionality, see :ref:`MultiIndex Slicers ` + + """ + if copy is not None: + warnings.warn("copy keyword is deprecated, " + "default is to return a copy or a view if possible") + + return self.xs(key, axis=self._AXIS_LEN - 1) + + def xs(self, key, axis=1, copy=None): + """ + Return slice of panel along selected axis + + Parameters + ---------- + key : object + Label + axis : {'items', 'major', 'minor}, default 1/'major' + copy : boolean [deprecated] + Whether to make a copy of the data + + Returns + ------- + y : ndim(self)-1 + + Notes + ----- + xs is only for getting, not setting values. + + MultiIndex Slicers is a generic way to get/set values on any level or levels + it is a superset of xs functionality, see :ref:`MultiIndex Slicers ` + + """ + if copy is not None: + warnings.warn("copy keyword is deprecated, " + "default is to return a copy or a view if possible") + + axis = self._get_axis_number(axis) + if axis == 0: + return self[key] + + self._consolidate_inplace() + axis_number = self._get_axis_number(axis) + new_data = self._data.xs(key, axis=axis_number, copy=False) + result = self._construct_return_type(new_data) + copy = new_data.is_mixed_type + result._set_is_copy(self, copy=copy) + return result + + _xs = xs + + def _ixs(self, i, axis=0): + """ + i : int, slice, or sequence of integers + axis : int + """ + + ax = self._get_axis(axis) + key = ax[i] + + # xs cannot handle a non-scalar key, so just reindex here + # if we have a multi-index and a single tuple, then its a reduction (GH 7516) + if not (isinstance(ax, MultiIndex) and isinstance(key, tuple)): + if _is_list_like(key): + indexer = {self._get_axis_name(axis): key} + return self.reindex(**indexer) + + # a reduction + if axis == 0: + values = self._data.iget(i) + return self._box_item_values(key, values) + + # xs by position + self._consolidate_inplace() + new_data = self._data.xs(i, axis=axis, copy=True, takeable=True) + return self._construct_return_type(new_data) + + def groupby(self, function, axis='major'): + """ + Group data on given axis, returning GroupBy object + + Parameters + ---------- + function : callable + Mapping function for chosen access + axis : {'major', 'minor', 'items'}, default 'major' + + Returns + ------- + grouped : PanelGroupBy + """ + from pandas.core.groupby import PanelGroupBy + axis = self._get_axis_number(axis) + return PanelGroupBy(self, function, axis=axis) + + def to_frame(self, filter_observations=True): + """ + Transform wide format into long (stacked) format as DataFrame whose + columns are the Panel's items and whose index is a MultiIndex formed + of the Panel's major and minor axes. 
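+        Without filtering the result has ``len(major_axis) * len(minor_axis)``
+        rows; with ``filter_observations=True`` incomplete rows are dropped.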
+ + Parameters + ---------- + filter_observations : boolean, default True + Drop (major, minor) pairs without a complete set of observations + across all the items + + Returns + ------- + y : DataFrame + """ + _, N, K = self.shape + + if filter_observations: + # shaped like the return DataFrame + mask = com.notnull(self.values).all(axis=0) + # size = mask.sum() + selector = mask.ravel() + else: + # size = N * K + selector = slice(None, None) + + data = {} + for item in self.items: + data[item] = self[item].values.ravel()[selector] + + def construct_multi_parts(idx, n_repeat, n_shuffle=1): + axis_idx = idx.to_hierarchical(n_repeat, n_shuffle) + labels = [x[selector] for x in axis_idx.labels] + levels = axis_idx.levels + names = axis_idx.names + return labels, levels, names + + def construct_index_parts(idx, major=True): + levels = [idx] + if major: + labels = [np.arange(N).repeat(K)[selector]] + names = idx.name or 'major' + else: + labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] + labels = [labels.ravel()[selector]] + names = idx.name or 'minor' + names = [names] + return labels, levels, names + + if isinstance(self.major_axis, MultiIndex): + major_labels, major_levels, major_names = construct_multi_parts( + self.major_axis, n_repeat=K) + else: + major_labels, major_levels, major_names = construct_index_parts( + self.major_axis) + + if isinstance(self.minor_axis, MultiIndex): + minor_labels, minor_levels, minor_names = construct_multi_parts( + self.minor_axis, n_repeat=N, n_shuffle=K) + else: + minor_labels, minor_levels, minor_names = construct_index_parts( + self.minor_axis, major=False) + + levels = major_levels + minor_levels + labels = major_labels + minor_labels + names = major_names + minor_names + + index = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) + + return DataFrame(data, index=index, columns=self.items) + + to_long = deprecate('to_long', to_frame) + toLong = deprecate('toLong', to_frame) + + def apply(self, func, axis='major', **kwargs): + """ + Applies function along input axis of the Panel + + Parameters + ---------- + func : function + Function to apply to each combination of 'other' axes + e.g. 
if axis = 'items', then the combination of major_axis/minor_axis + will be passed a Series + axis : {'major', 'minor', 'items'} + Additional keyword arguments will be passed as keywords to the function + + Examples + -------- + >>> p.apply(numpy.sqrt) # returns a Panel + >>> p.apply(lambda x: x.sum(), axis=0) # equiv to p.sum(0) + >>> p.apply(lambda x: x.sum(), axis=1) # equiv to p.sum(1) + >>> p.apply(lambda x: x.sum(), axis=2) # equiv to p.sum(2) + + Returns + ------- + result : Pandas Object + """ + + if kwargs and not isinstance(func, np.ufunc): + f = lambda x: func(x, **kwargs) + else: + f = func + + # 2d-slabs + if isinstance(axis, (tuple,list)) and len(axis) == 2: + return self._apply_2d(f, axis=axis) + + axis = self._get_axis_number(axis) + + # try ufunc like + if isinstance(f, np.ufunc): + try: + result = np.apply_along_axis(func, axis, self.values) + return self._wrap_result(result, axis=axis) + except (AttributeError): + pass + + # 1d + return self._apply_1d(f, axis=axis) + + def _apply_1d(self, func, axis): + + axis_name = self._get_axis_name(axis) + ax = self._get_axis(axis) + ndim = self.ndim + values = self.values + + # iter thru the axes + slice_axis = self._get_axis(axis) + slice_indexer = [0]*(ndim-1) + indexer = np.zeros(ndim, 'O') + indlist = list(range(ndim)) + indlist.remove(axis) + indexer[axis] = slice(None, None) + indexer.put(indlist, slice_indexer) + planes = [ self._get_axis(axi) for axi in indlist ] + shape = np.array(self.shape).take(indlist) + + # all the iteration points + points = cartesian_product(planes) + + results = [] + for i in range(np.prod(shape)): + + # construct the object + pts = tuple([ p[i] for p in points ]) + indexer.put(indlist, slice_indexer) + + obj = Series(values[tuple(indexer)],index=slice_axis,name=pts) + result = func(obj) + + results.append(result) + + # increment the indexer + slice_indexer[-1] += 1 + n = -1 + while (slice_indexer[n] >= shape[n]) and (n > (1-ndim)): + slice_indexer[n-1] += 1 + slice_indexer[n] = 0 + n -= 1 + + # empty object + if not len(results): + return self._constructor(**self._construct_axes_dict()) + + # same ndim as current + if isinstance(results[0],Series): + arr = np.vstack([ r.values for r in results ]) + arr = arr.T.reshape(tuple([len(slice_axis)] + list(shape))) + tranp = np.array([axis]+indlist).argsort() + arr = arr.transpose(tuple(list(tranp))) + return self._constructor(arr,**self._construct_axes_dict()) + + # ndim-1 shape + results = np.array(results).reshape(shape) + if results.ndim == 2 and axis_name != self._info_axis_name: + results = results.T + planes = planes[::-1] + return self._construct_return_type(results,planes) + + def _apply_2d(self, func, axis): + """ handle 2-d slices, equiv to iterating over the other axis """ + + ndim = self.ndim + axis = [ self._get_axis_number(a) for a in axis ] + + # construct slabs, in 2-d this is a DataFrame result + indexer_axis = list(range(ndim)) + for a in axis: + indexer_axis.remove(a) + indexer_axis = indexer_axis[0] + + slicer = [ slice(None,None) ] * ndim + ax = self._get_axis(indexer_axis) + + results = [] + for i, e in enumerate(ax): + + slicer[indexer_axis] = i + sliced = self.iloc[tuple(slicer)] + + obj = func(sliced) + results.append((e,obj)) + + return self._construct_return_type(dict(results)) + + def _reduce(self, op, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwds): + axis_name = self._get_axis_name(axis) + axis_number = self._get_axis_number(axis_name) + f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) + + 
result = f(self.values) + + axes = self._get_plane_axes(axis_name) + if result.ndim == 2 and axis_name != self._info_axis_name: + result = result.T + + return self._construct_return_type(result, axes) + + def _construct_return_type(self, result, axes=None, **kwargs): + """ return the type for the ndim of the result """ + ndim = getattr(result,'ndim',None) + + # need to assume they are the same + if ndim is None: + if isinstance(result,dict): + ndim = getattr(list(compat.itervalues(result))[0],'ndim',None) + + # a saclar result + if ndim is None: + ndim = 0 + + # have a dict, so top-level is +1 dim + else: + ndim += 1 + + # scalar + if ndim == 0: + return Series(result) + + # same as self + elif self.ndim == ndim: + """ return the construction dictionary for these axes """ + if axes is None: + return self._constructor(result) + return self._constructor(result, **self._construct_axes_dict()) + + # sliced + elif self.ndim == ndim + 1: + if axes is None: + return self._constructor_sliced(result) + return self._constructor_sliced( + result, **self._extract_axes_for_slice(self, axes)) + + raise PandasError('invalid _construct_return_type [self->%s] ' + '[result->%s]' % (self, result)) + + def _wrap_result(self, result, axis): + axis = self._get_axis_name(axis) + axes = self._get_plane_axes(axis) + if result.ndim == 2 and axis != self._info_axis_name: + result = result.T + + return self._construct_return_type(result, axes) + + @Appender(_shared_docs['reindex'] % _shared_doc_kwargs) + def reindex(self, items=None, major_axis=None, minor_axis=None, **kwargs): + major_axis = (major_axis if major_axis is not None + else kwargs.pop('major', None)) + minor_axis = (minor_axis if minor_axis is not None + else kwargs.pop('minor', None)) + return super(Panel, self).reindex(items=items, major_axis=major_axis, + minor_axis=minor_axis, **kwargs) + + @Appender(_shared_docs['rename'] % _shared_doc_kwargs) + def rename(self, items=None, major_axis=None, minor_axis=None, **kwargs): + major_axis = (major_axis if major_axis is not None + else kwargs.pop('major', None)) + minor_axis = (minor_axis if minor_axis is not None + else kwargs.pop('minor', None)) + return super(Panel, self).rename(items=items, major_axis=major_axis, + minor_axis=minor_axis, **kwargs) + + @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) + def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, + limit=None, fill_value=np.nan): + return super(Panel, self).reindex_axis(labels=labels, axis=axis, + method=method, level=level, + copy=copy, limit=limit, + fill_value=fill_value) + + @Appender(_shared_docs['transpose'] % _shared_doc_kwargs) + def transpose(self, *args, **kwargs): + return super(Panel, self).transpose(*args, **kwargs) + + def count(self, axis='major'): + """ + Return number of observations over requested axis. + + Parameters + ---------- + axis : {'items', 'major', 'minor'} or {0, 1, 2} + + Returns + ------- + count : DataFrame + """ + i = self._get_axis_number(axis) + + values = self.values + mask = np.isfinite(values) + result = mask.sum(axis=i,dtype='int64') + + return self._wrap_result(result, axis) + + @deprecate_kwarg(old_arg_name='lags', new_arg_name='periods') + def shift(self, periods=1, freq=None, axis='major'): + """ + Shift major or minor axis by specified number of leads/lags. 
Drops + periods right now compared with DataFrame.shift + + Parameters + ---------- + lags : int + axis : {'major', 'minor'} + + Returns + ------- + shifted : Panel + """ + if freq: + return self.tshift(periods, freq, axis=axis) + + if axis == 'items': + raise ValueError('Invalid axis') + + return super(Panel, self).slice_shift(periods, axis=axis) + + def tshift(self, periods=1, freq=None, axis='major', **kwds): + return super(Panel, self).tshift(periods, freq, axis, **kwds) + + def join(self, other, how='left', lsuffix='', rsuffix=''): + """ + Join items with other Panel either on major and minor axes column + + Parameters + ---------- + other : Panel or list of Panels + Index should be similar to one of the columns in this one + how : {'left', 'right', 'outer', 'inner'} + How to handle indexes of the two objects. Default: 'left' + for joining on index, None otherwise + * left: use calling frame's index + * right: use input frame's index + * outer: form union of indexes + * inner: use intersection of indexes + lsuffix : string + Suffix to use from left frame's overlapping columns + rsuffix : string + Suffix to use from right frame's overlapping columns + + Returns + ------- + joined : Panel + """ + from pandas.tools.merge import concat + + if isinstance(other, Panel): + join_major, join_minor = self._get_join_index(other, how) + this = self.reindex(major=join_major, minor=join_minor) + other = other.reindex(major=join_major, minor=join_minor) + merged_data = this._data.merge(other._data, lsuffix, rsuffix) + return self._constructor(merged_data) + else: + if lsuffix or rsuffix: + raise ValueError('Suffixes not supported when passing ' + 'multiple panels') + + if how == 'left': + how = 'outer' + join_axes = [self.major_axis, self.minor_axis] + elif how == 'right': + raise ValueError('Right join not supported with multiple ' + 'panels') + else: + join_axes = None + + return concat([self] + list(other), axis=0, join=how, + join_axes=join_axes, verify_integrity=True) + + def update(self, other, join='left', overwrite=True, filter_func=None, + raise_conflict=False): + """ + Modify Panel in place using non-NA values from passed + Panel, or object coercible to Panel. Aligns on items + + Parameters + ---------- + other : Panel, or object coercible to Panel + join : How to join individual DataFrames + {'left', 'right', 'outer', 'inner'}, default 'left' + overwrite : boolean, default True + If True then overwrite values for common keys in the calling panel + filter_func : callable(1d-array) -> 1d-array, default None + Can choose to replace values other than NA. Return True for values + that should be updated + raise_conflict : bool + If True, will raise an error if a DataFrame and other both + contain data in the same place. 
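+
+        Examples
+        --------
+        An illustrative sketch only; ``wp`` and ``other`` are hypothetical
+        Panels sharing the same items. With ``overwrite=False`` only NA
+        holes in the calling Panel are filled:
+
+        >>> wp.update(other, overwrite=False)  # doctest: +SKIP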
+ """ + + if not isinstance(other, self._constructor): + other = self._constructor(other) + + axis_name = self._info_axis_name + axis_values = self._info_axis + other = other.reindex(**{axis_name: axis_values}) + + for frame in axis_values: + self[frame].update(other[frame], join, overwrite, filter_func, + raise_conflict) + + def _get_join_index(self, other, how): + if how == 'left': + join_major, join_minor = self.major_axis, self.minor_axis + elif how == 'right': + join_major, join_minor = other.major_axis, other.minor_axis + elif how == 'inner': + join_major = self.major_axis.intersection(other.major_axis) + join_minor = self.minor_axis.intersection(other.minor_axis) + elif how == 'outer': + join_major = self.major_axis.union(other.major_axis) + join_minor = self.minor_axis.union(other.minor_axis) + return join_major, join_minor + + # miscellaneous data creation + @staticmethod + def _extract_axes(self, data, axes, **kwargs): + """ return a list of the axis indicies """ + return [self._extract_axis(self, data, axis=i, **kwargs) for i, a + in enumerate(axes)] + + @staticmethod + def _extract_axes_for_slice(self, axes): + """ return the slice dictionary for these axes """ + return dict([(self._AXIS_SLICEMAP[i], a) + for i, a in zip(self._AXIS_ORDERS[self._AXIS_LEN - + len(axes):], axes)]) + + @staticmethod + def _prep_ndarray(self, values, copy=True): + if not isinstance(values, np.ndarray): + values = np.asarray(values) + # NumPy strings are a pain, convert to object + if issubclass(values.dtype.type, compat.string_types): + values = np.array(values, dtype=object, copy=True) + else: + if copy: + values = values.copy() + if values.ndim != self._AXIS_LEN: + raise ValueError("The number of dimensions required is {0}, " + "but the number of dimensions of the " + "ndarray given was {1}".format(self._AXIS_LEN, + values.ndim)) + return values + + @staticmethod + def _homogenize_dict(self, frames, intersect=True, dtype=None): + """ + Conform set of _constructor_sliced-like objects to either + an intersection of indices / columns or a union. 
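+        Values that are plain dicts are first converted with the sliced
+        constructor (a DataFrame in the Panel case) before being aligned.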
+ + Parameters + ---------- + frames : dict + intersect : boolean, default True + + Returns + ------- + dict of aligned results & indicies + """ + + result = dict() + # caller differs dict/ODict, presered type + if isinstance(frames, OrderedDict): + result = OrderedDict() + + adj_frames = OrderedDict() + for k, v in compat.iteritems(frames): + if isinstance(v, dict): + adj_frames[k] = self._constructor_sliced(v) + else: + adj_frames[k] = v + + axes = self._AXIS_ORDERS[1:] + axes_dict = dict([(a, ax) for a, ax in zip(axes, self._extract_axes( + self, adj_frames, axes, intersect=intersect))]) + + reindex_dict = dict( + [(self._AXIS_SLICEMAP[a], axes_dict[a]) for a in axes]) + reindex_dict['copy'] = False + for key, frame in compat.iteritems(adj_frames): + if frame is not None: + result[key] = frame.reindex(**reindex_dict) + else: + result[key] = None + + axes_dict['data'] = result + return axes_dict + + @staticmethod + def _extract_axis(self, data, axis=0, intersect=False): + + index = None + if len(data) == 0: + index = Index([]) + elif len(data) > 0: + raw_lengths = [] + indexes = [] + + have_raw_arrays = False + have_frames = False + + for v in data.values(): + if isinstance(v, self._constructor_sliced): + have_frames = True + indexes.append(v._get_axis(axis)) + elif v is not None: + have_raw_arrays = True + raw_lengths.append(v.shape[axis]) + + if have_frames: + index = _get_combined_index(indexes, intersect=intersect) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError('ndarrays must match shape on axis %d' % axis) + + if have_frames: + if lengths[0] != len(index): + raise AssertionError('Length of data and index must match') + else: + index = Index(np.arange(lengths[0])) + + if index is None: + index = Index([]) + + return _ensure_index(index) + + @classmethod + def _add_aggregate_operations(cls, use_numexpr=True): + """ add the operations to the cls; evaluate the doc strings again """ + + # doc strings substitors + _agg_doc = """ +Wrapper method for %%s + +Parameters +---------- +other : %s or %s""" % (cls._constructor_sliced.__name__, cls.__name__) + """ +axis : {""" + ', '.join(cls._AXIS_ORDERS) + "}" + """ +Axis to broadcast over + +Returns +------- +""" + cls.__name__ + "\n" + + def _panel_arith_method(op, name, str_rep=None, default_axis=None, + fill_zeros=None, **eval_kwargs): + def na_op(x, y): + try: + result = expressions.evaluate(op, str_rep, x, y, + raise_on_error=True, + **eval_kwargs) + except TypeError: + result = op(x, y) + + # handles discrepancy between numpy and numexpr on division/mod + # by 0 though, given that these are generally (always?) + # non-scalars, I'm not sure whether it's worth it at the moment + result = com._fill_zeros(result, x, y, name, fill_zeros) + return result + + @Substitution(name) + @Appender(_agg_doc) + def f(self, other, axis=0): + return self._combine(other, na_op, axis=axis) + f.__name__ = name + return f + + # add `div`, `mul`, `pow`, etc.. 
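+        # the generated flex methods accept an ``axis`` keyword so that a
+        # DataFrame operand can be broadcast against the Panel along the
+        # requested axis (handled by _combine / _combine_frame)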
+ ops.add_flex_arithmetic_methods( + cls, _panel_arith_method, use_numexpr=use_numexpr, + flex_comp_method=ops._comp_method_PANEL) + +Panel._setup_axes(axes=['items', 'major_axis', 'minor_axis'], + info_axis=0, + stat_axis=1, + aliases={'major': 'major_axis', + 'minor': 'minor_axis'}, + slicers={'major_axis': 'index', + 'minor_axis': 'columns'}) + +ops.add_special_arithmetic_methods(Panel, **ops.panel_special_funcs) +Panel._add_aggregate_operations() +Panel._add_numeric_operations() + +WidePanel = Panel +LongPanel = DataFrame diff --git a/pandas/core/panel4d.py b/pandas/core/panel4d.py new file mode 100644 index 00000000..3d480464 --- /dev/null +++ b/pandas/core/panel4d.py @@ -0,0 +1,41 @@ +""" Panel4D: a 4-d dict like collection of panels """ + +from pandas.core.panelnd import create_nd_panel_factory +from pandas.core.panel import Panel + +Panel4D = create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', 'minor_axis'], + slices={'labels': 'labels', 'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2, + ns=dict(__doc__=""" + Represents a 4 dimensional structured + + Parameters + ---------- + data : ndarray (labels x items x major x minor), or dict of Panels + + labels : Index or array-like : axis=0 + items : Index or array-like : axis=1 + major_axis : Index or array-like: axis=2 + minor_axis : Index or array-like: axis=3 + + dtype : dtype, default None + Data type to force, otherwise infer + copy : boolean, default False + Copy data from inputs. Only affects DataFrame / 2d ndarray input + """) +) + + +def panel4d_init(self, data=None, labels=None, items=None, major_axis=None, + minor_axis=None, copy=False, dtype=None): + + self._init_data(data=data, labels=labels, items=items, + major_axis=major_axis, minor_axis=minor_axis, + copy=copy, dtype=dtype) + +Panel4D.__init__ = panel4d_init diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py new file mode 100644 index 00000000..3eebd511 --- /dev/null +++ b/pandas/core/panelnd.py @@ -0,0 +1,109 @@ +""" Factory methods to create N-D panels """ + +import pandas.lib as lib +from pandas.compat import zip +import pandas.compat as compat + + +def create_nd_panel_factory(klass_name, orders, slices, slicer, aliases=None, + stat_axis=2, info_axis=0, ns=None): + """ manufacture a n-d class: + + Parameters + ---------- + klass_name : the klass name + orders : the names of the axes in order (highest to lowest) + slices : a dictionary that defines how the axes map to the slice axis + slicer : the class representing a slice of this panel + aliases : a dictionary defining aliases for various axes + default = { major : major_axis, minor : minor_axis } + stat_axis : the default statistic axis default = 2 + info_axis : the info axis + + Returns + ------- + a class object representing this panel + + """ + + # if slicer is a name, get the object + if isinstance(slicer, compat.string_types): + import pandas + try: + slicer = getattr(pandas, slicer) + except: + raise Exception("cannot create this slicer [%s]" % slicer) + + # build the klass + ns = {} if not ns else ns + klass = type(klass_name, (slicer,), ns) + + # setup the axes + klass._setup_axes(axes=orders, info_axis=info_axis, stat_axis=stat_axis, + aliases=aliases, slicers=slices) + + klass._constructor_sliced = slicer + + # define the methods #### + def __init__(self, *args, **kwargs): + if not (kwargs.get('data') or len(args)): + raise Exception( + "must 
supply at least a data argument to [%s]" % klass_name) + if 'copy' not in kwargs: + kwargs['copy'] = False + if 'dtype' not in kwargs: + kwargs['dtype'] = None + self._init_data(*args, **kwargs) + klass.__init__ = __init__ + + def _get_plane_axes_index(self, axis): + """ return the sliced index for this object """ + + axis_name = self._get_axis_name(axis) + index = self._AXIS_ORDERS.index(axis) + + planes = [] + if index: + planes.extend(self._AXIS_ORDERS[0:index]) + if index != self._AXIS_LEN: + planes.extend(self._AXIS_ORDERS[index + 1:]) + + return planes + klass._get_plane_axes_index = _get_plane_axes_index + + def _combine(self, other, func, axis=0): + if isinstance(other, klass): + return self._combine_with_constructor(other, func) + return super(klass, self)._combine(other, func, axis=axis) + klass._combine = _combine + + def _combine_with_constructor(self, other, func): + + # combine labels to form new axes + new_axes = [] + for a in self._AXIS_ORDERS: + new_axes.append(getattr(self, a) + getattr(other, a)) + + # reindex: could check that everything's the same size, but forget it + d = dict([(a, ax) for a, ax in zip(self._AXIS_ORDERS, new_axes)]) + d['copy'] = False + this = self.reindex(**d) + other = other.reindex(**d) + + result_values = func(this.values, other.values) + + return self._constructor(result_values, **d) + klass._combine_with_constructor = _combine_with_constructor + + # set as NonImplemented operations which we don't support + for f in ['to_frame', 'to_excel', 'to_sparse', 'groupby', 'join', 'filter', + 'dropna', 'shift']: + def func(self, *args, **kwargs): + raise NotImplementedError + setattr(klass, f, func) + + # add the aggregate operations + klass._add_aggregate_operations() + klass._add_numeric_operations() + + return klass diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py new file mode 100644 index 00000000..e1712be7 --- /dev/null +++ b/pandas/core/reshape.py @@ -0,0 +1,1113 @@ +# pylint: disable=E1101,E1103 +# pylint: disable=W0703,W0622,W0613,W0201 +from pandas.compat import range, zip +from pandas import compat +import itertools + +import numpy as np + +from pandas.core.series import Series +from pandas.core.frame import DataFrame + +from pandas.core.categorical import Categorical +from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote, + isnull) +from pandas.core.groupby import (get_group_index, _compress_group_index, + decons_group_index) +import pandas.core.common as com +import pandas.algos as algos + +from pandas.core.index import MultiIndex, _get_na_value + + +class _Unstacker(object): + + """ + Helper class to unstack data / pivot with multi-level index + + Parameters + ---------- + level : int or str, default last level + Level to "unstack". Accepts a name for the level. + + Examples + -------- + >>> import pandas as pd + >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), + ... 
('two', 'a'), ('two', 'b')]) + >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s + one a 1 + b 2 + two a 3 + b 4 + dtype: float64 + + >>> s.unstack(level=-1) + a b + one 1 2 + two 3 4 + + >>> s.unstack(level=0) + one two + a 1 2 + b 3 4 + + Returns + ------- + unstacked : DataFrame + """ + + def __init__(self, values, index, level=-1, value_columns=None): + if values.ndim == 1: + values = values[:, np.newaxis] + self.values = values + self.value_columns = value_columns + + if value_columns is None and values.shape[1] != 1: # pragma: no cover + raise ValueError('must pass column labels for multi-column data') + + self.index = index + + if isinstance(self.index, MultiIndex): + if index._reference_duplicate_name(level): + msg = ("Ambiguous reference to {0}. The index " + "names are not unique.".format(level)) + raise ValueError(msg) + + self.level = self.index._get_level_number(level) + + levels = index.levels + labels = index.labels + + def _make_index(lev, lab): + values = _make_index_array_level(lev.values, lab) + i = lev._simple_new(values, lev.name, + freq=getattr(lev, 'freq', None), + tz=getattr(lev, 'tz', None)) + return i + + self.new_index_levels = [_make_index(lev, lab) + for lev, lab in zip(levels, labels)] + self.new_index_names = list(index.names) + + self.removed_name = self.new_index_names.pop(self.level) + self.removed_level = self.new_index_levels.pop(self.level) + + self._make_sorted_values_labels() + self._make_selectors() + + def _make_sorted_values_labels(self): + v = self.level + + labs = list(self.index.labels) + levs = list(self.index.levels) + to_sort = labs[:v] + labs[v + 1:] + [labs[v]] + sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] + + comp_index, obs_ids = get_compressed_ids(to_sort, sizes) + + # group_index = get_group_index(to_sort, sizes) + # comp_index, obs_ids = _compress_group_index(group_index) + + ngroups = len(obs_ids) + + indexer = algos.groupsort_indexer(comp_index, ngroups)[0] + indexer = _ensure_platform_int(indexer) + + self.sorted_values = com.take_nd(self.values, indexer, axis=0) + self.sorted_labels = [l.take(indexer) for l in to_sort] + + def _make_selectors(self): + new_levels = self.new_index_levels + + # make the mask + remaining_labels = self.sorted_labels[:-1] + level_sizes = [len(x) for x in new_levels] + + comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) + ngroups = len(obs_ids) + + comp_index = _ensure_platform_int(comp_index) + stride = self.index.levshape[self.level] + self.full_shape = ngroups, stride + + selector = self.sorted_labels[-1] + stride * comp_index + mask = np.zeros(np.prod(self.full_shape), dtype=bool) + mask.put(selector, True) + + if mask.sum() < len(self.index): + raise ValueError('Index contains duplicate entries, ' + 'cannot reshape') + + self.group_index = comp_index + self.mask = mask + self.unique_groups = obs_ids + self.compressor = comp_index.searchsorted(np.arange(ngroups)) + + def get_result(self): + # TODO: find a better way than this masking business + + values, value_mask = self.get_new_values() + columns = self.get_new_columns() + index = self.get_new_index() + + # filter out missing levels + if values.shape[1] > 0: + col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1]) + # rare case, level values not observed + if len(obs_ids) < self.full_shape[1]: + inds = (value_mask.sum(0) > 0).nonzero()[0] + values = com.take_nd(values, inds, axis=1) + columns = columns[inds] + + # we might have a missing index + if len(index) != values.shape[0]: + mask = 
isnull(index) + if mask.any(): + l = np.arange(len(index)) + values, orig_values = (np.empty((len(index), values.shape[1])), + values) + values.fill(np.nan) + values_indexer = com._ensure_int64(l[~mask]) + for i, j in enumerate(values_indexer): + values[j] = orig_values[i] + else: + index = index.take(self.unique_groups) + + return DataFrame(values, index=index, columns=columns) + + def get_new_values(self): + values = self.values + + # place the values + length, width = self.full_shape + stride = values.shape[1] + result_width = width * stride + result_shape = (length, result_width) + + # if our mask is all True, then we can use our existing dtype + if self.mask.all(): + dtype = values.dtype + new_values = np.empty(result_shape, dtype=dtype) + else: + dtype, fill_value = _maybe_promote(values.dtype) + new_values = np.empty(result_shape, dtype=dtype) + new_values.fill(fill_value) + + new_mask = np.zeros(result_shape, dtype=bool) + + # is there a simpler / faster way of doing this? + for i in range(values.shape[1]): + chunk = new_values[:, i * width: (i + 1) * width] + mask_chunk = new_mask[:, i * width: (i + 1) * width] + + chunk.flat[self.mask] = self.sorted_values[:, i] + mask_chunk.flat[self.mask] = True + + return new_values, new_mask + + def get_new_columns(self): + if self.value_columns is None: + return self.removed_level + + stride = len(self.removed_level) + width = len(self.value_columns) + propagator = np.repeat(np.arange(width), stride) + if isinstance(self.value_columns, MultiIndex): + new_levels = self.value_columns.levels + (self.removed_level,) + new_names = self.value_columns.names + (self.removed_name,) + + new_labels = [lab.take(propagator) + for lab in self.value_columns.labels] + new_labels.append(np.tile(np.arange(stride), width)) + else: + new_levels = [self.value_columns, self.removed_level] + new_names = [self.value_columns.name, self.removed_name] + + new_labels = [] + + new_labels.append(propagator) + new_labels.append(np.tile(np.arange(stride), width)) + + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + def get_new_index(self): + result_labels = [] + for cur in self.sorted_labels[:-1]: + labels = cur.take(self.compressor) + labels = _make_index_array_level(labels, cur) + result_labels.append(labels) + + # construct the new index + if len(self.new_index_levels) == 1: + new_index = self.new_index_levels[0] + new_index.name = self.new_index_names[0] + else: + new_index = MultiIndex(levels=self.new_index_levels, + labels=result_labels, + names=self.new_index_names, + verify_integrity=False) + + return new_index + + +def _make_index_array_level(lev, lab): + """ create the combined index array, preserving nans, return an array """ + mask = lab == -1 + if not mask.any(): + return lev + + l = np.arange(len(lab)) + mask_labels = np.empty(len(mask[mask]), dtype=object) + mask_labels.fill(_get_na_value(lev.dtype.type)) + mask_indexer = com._ensure_int64(l[mask]) + + labels = lev + labels_indexer = com._ensure_int64(l[~mask]) + + new_labels = np.empty(tuple([len(lab)]), dtype=object) + new_labels[labels_indexer] = labels + new_labels[mask_indexer] = mask_labels + + return new_labels + + +def _unstack_multiple(data, clocs): + if len(clocs) == 0: + return data + + # NOTE: This doesn't deal with hierarchical columns yet + + index = data.index + + clocs = [index._get_level_number(i) for i in clocs] + + rlocs = [i for i in range(index.nlevels) if i not in clocs] + + clevels = [index.levels[i] for i in clocs] + clabels = 
[index.labels[i] for i in clocs] + cnames = [index.names[i] for i in clocs] + rlevels = [index.levels[i] for i in rlocs] + rlabels = [index.labels[i] for i in rlocs] + rnames = [index.names[i] for i in rlocs] + + shape = [len(x) for x in clevels] + group_index = get_group_index(clabels, shape) + + comp_ids, obs_ids = _compress_group_index(group_index, sort=False) + recons_labels = decons_group_index(obs_ids, shape) + + dummy_index = MultiIndex(levels=rlevels + [obs_ids], + labels=rlabels + [comp_ids], + names=rnames + ['__placeholder__'], + verify_integrity=False) + + if isinstance(data, Series): + dummy = Series(data.values, index=dummy_index) + unstacked = dummy.unstack('__placeholder__') + new_levels = clevels + new_names = cnames + new_labels = recons_labels + else: + if isinstance(data.columns, MultiIndex): + result = data + for i in range(len(clocs)): + val = clocs[i] + result = result.unstack(val) + clocs = [val if i > val else val - 1 for val in clocs] + + return result + + dummy = DataFrame(data.values, index=dummy_index, + columns=data.columns) + + unstacked = dummy.unstack('__placeholder__') + if isinstance(unstacked, Series): + unstcols = unstacked.index + else: + unstcols = unstacked.columns + new_levels = [unstcols.levels[0]] + clevels + new_names = [data.columns.name] + cnames + + new_labels = [unstcols.labels[0]] + for rec in recons_labels: + new_labels.append(rec.take(unstcols.labels[-1])) + + new_columns = MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + if isinstance(unstacked, Series): + unstacked.index = new_columns + else: + unstacked.columns = new_columns + + return unstacked + + +def pivot(self, index=None, columns=None, values=None): + """ + See DataFrame.pivot + """ + if values is None: + indexed = self.set_index([index, columns]) + return indexed.unstack(columns) + else: + indexed = Series(self[values].values, + index=MultiIndex.from_arrays([self[index], + self[columns]])) + return indexed.unstack(columns) + + +def pivot_simple(index, columns, values): + """ + Produce 'pivot' table based on 3 columns of this DataFrame. + Uses unique values from index / columns and fills with values. + + Parameters + ---------- + index : ndarray + Labels to use to make new frame's index + columns : ndarray + Labels to use to make new frame's columns + values : ndarray + Values to use for populating new frame's values + + Notes + ----- + Obviously, all 3 of the input arguments must have the same length + + Returns + ------- + DataFrame + """ + if (len(index) != len(columns)) or (len(columns) != len(values)): + raise AssertionError('Length of index, columns, and values must be the' + ' same') + + if len(index) == 0: + return DataFrame(index=[]) + + hindex = MultiIndex.from_arrays([index, columns]) + series = Series(values.ravel(), index=hindex) + series = series.sortlevel(0) + return series.unstack() + + +def _slow_pivot(index, columns, values): + """ + Produce 'pivot' table based on 3 columns of this DataFrame. + Uses unique values from index / columns and fills with values. + + Parameters + ---------- + index : string or object + Column name to use to make new frame's index + columns : string or object + Column name to use to make new frame's columns + values : string or object + Column name to use for populating new frame's values + + Could benefit from some Cython here. 
+ """ + tree = {} + for i, (idx, col) in enumerate(zip(index, columns)): + if col not in tree: + tree[col] = {} + branch = tree[col] + branch[idx] = values[i] + + return DataFrame(tree) + + +def unstack(obj, level): + if isinstance(level, (tuple, list)): + return _unstack_multiple(obj, level) + + if isinstance(obj, DataFrame): + if isinstance(obj.index, MultiIndex): + return _unstack_frame(obj, level) + else: + return obj.T.stack(dropna=False) + else: + unstacker = _Unstacker(obj.values, obj.index, level=level) + return unstacker.get_result() + + +def _unstack_frame(obj, level): + from pandas.core.internals import BlockManager, make_block + + if obj._is_mixed_type: + unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy + obj.index, level=level, + value_columns=obj.columns) + new_columns = unstacker.get_new_columns() + new_index = unstacker.get_new_index() + new_axes = [new_columns, new_index] + + new_blocks = [] + mask_blocks = [] + for blk in obj._data.blocks: + blk_items = obj._data.items[blk.mgr_locs.indexer] + bunstacker = _Unstacker(blk.values.T, obj.index, level=level, + value_columns=blk_items) + new_items = bunstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = bunstacker.get_new_values() + + mblk = make_block(mask.T, placement=new_placement) + mask_blocks.append(mblk) + + newb = make_block(new_values.T, placement=new_placement) + new_blocks.append(newb) + + result = DataFrame(BlockManager(new_blocks, new_axes)) + mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) + return result.ix[:, mask_frame.sum(0) > 0] + else: + unstacker = _Unstacker(obj.values, obj.index, level=level, + value_columns=obj.columns) + return unstacker.get_result() + + +def get_compressed_ids(labels, sizes): + # no overflow + if com._long_prod(sizes) < 2 ** 63: + group_index = get_group_index(labels, sizes) + comp_index, obs_ids = _compress_group_index(group_index) + else: + n = len(labels[0]) + mask = np.zeros(n, dtype=bool) + for v in labels: + mask |= v < 0 + + while com._long_prod(sizes) >= 2 ** 63: + i = len(sizes) + while com._long_prod(sizes[:i]) >= 2 ** 63: + i -= 1 + + rem_index, rem_ids = get_compressed_ids(labels[:i], + sizes[:i]) + sizes = [len(rem_ids)] + sizes[i:] + labels = [rem_index] + labels[i:] + + return get_compressed_ids(labels, sizes) + + return comp_index, obs_ids + + +def stack(frame, level=-1, dropna=True): + """ + Convert DataFrame to Series with multi-level Index. Columns become the + second level of the resulting hierarchical index + + Returns + ------- + stacked : Series + """ + N, K = frame.shape + if isinstance(frame.columns, MultiIndex): + if frame.columns._reference_duplicate_name(level): + msg = ("Ambiguous reference to {0}. 
The column " + "names are not unique.".format(level)) + raise ValueError(msg) + + if isinstance(level, int) and level < 0: + level += frame.columns.nlevels + + level = frame.columns._get_level_number(level) + + if isinstance(frame.columns, MultiIndex): + return _stack_multi_columns(frame, level=level, dropna=dropna) + elif isinstance(frame.index, MultiIndex): + new_levels = list(frame.index.levels) + new_levels.append(frame.columns) + + new_labels = [lab.repeat(K) for lab in frame.index.labels] + new_labels.append(np.tile(np.arange(K), N).ravel()) + + new_names = list(frame.index.names) + new_names.append(frame.columns.name) + new_index = MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + else: + ilabels = np.arange(N).repeat(K) + clabels = np.tile(np.arange(K), N).ravel() + new_index = MultiIndex(levels=[frame.index, frame.columns], + labels=[ilabels, clabels], + names=[frame.index.name, frame.columns.name], + verify_integrity=False) + + new_values = frame.values.ravel() + if dropna: + mask = notnull(new_values) + new_values = new_values[mask] + new_index = new_index[mask] + return Series(new_values, index=new_index) + + +def _stack_multi_columns(frame, level=-1, dropna=True): + this = frame.copy() + + # this makes life much simpler + if level != frame.columns.nlevels - 1: + # roll levels to put selected level at end + roll_columns = this.columns + for i in range(level, frame.columns.nlevels - 1): + roll_columns = roll_columns.swaplevel(i, i + 1) + this.columns = roll_columns + + if not this.columns.is_lexsorted(): + this = this.sortlevel(0, axis=1) + + # tuple list excluding level for grouping columns + if len(frame.columns.levels) > 2: + tuples = list(zip(*[ + lev.values.take(lab) for lev, lab in + zip(this.columns.levels[:-1], this.columns.labels[:-1]) + ])) + unique_groups = [key for key, _ in itertools.groupby(tuples)] + new_names = this.columns.names[:-1] + new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) + else: + new_columns = unique_groups = this.columns.levels[0] + + # time to ravel the values + new_data = {} + level_vals = this.columns.levels[-1] + levsize = len(level_vals) + drop_cols = [] + for key in unique_groups: + loc = this.columns.get_loc(key) + slice_len = loc.stop - loc.start + # can make more efficient? + + if slice_len == 0: + drop_cols.append(key) + continue + elif slice_len != levsize: + chunk = this.ix[:, this.columns[loc]] + chunk.columns = level_vals.take(chunk.columns.labels[-1]) + value_slice = chunk.reindex(columns=level_vals).values + else: + if frame._is_mixed_type: + value_slice = this.ix[:, this.columns[loc]].values + else: + value_slice = this.values[:, loc] + + new_data[key] = value_slice.ravel() + + if len(drop_cols) > 0: + new_columns = new_columns - drop_cols + + N = len(this) + + if isinstance(this.index, MultiIndex): + new_levels = list(this.index.levels) + new_names = list(this.index.names) + new_labels = [lab.repeat(levsize) for lab in this.index.labels] + else: + new_levels = [this.index] + new_labels = [np.arange(N).repeat(levsize)] + new_names = [this.index.name] # something better? + + new_levels.append(frame.columns.levels[level]) + new_labels.append(np.tile(np.arange(levsize), N)) + new_names.append(frame.columns.names[level]) + + new_index = MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + result = DataFrame(new_data, index=new_index, columns=new_columns) + + # more efficient way to go about this? 
can do the whole masking biz but + # will only save a small amount of time... + if dropna: + result = result.dropna(axis=0, how='all') + + return result + + +def melt(frame, id_vars=None, value_vars=None, + var_name=None, value_name='value', col_level=None): + """ + "Unpivots" a DataFrame from wide format to long format, optionally leaving + identifier variables set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + frame : DataFrame + id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name : scalar, default 'value' + Name to use for the 'value' column. + col_level : int or string, optional + If columns are a MultiIndex then use this level to melt. + + See also + -------- + pivot_table + DataFrame.pivot + + Examples + -------- + >>> import pandas as pd + >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, + ... 'B': {0: 1, 1: 3, 2: 5}, + ... 'C': {0: 2, 1: 4, 2: 6}}) + >>> df + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> pd.melt(df, id_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 + + The names of 'variable' and 'value' columns can be customized: + + >>> pd.melt(df, id_vars=['A'], value_vars=['B'], + ... var_name='myVarname', value_name='myValname') + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 + + If you have multi-index columns: + + >>> df.columns = [list('ABC'), list('DEF')] + >>> df + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + + """ + # TODO: what about the existing index? + if id_vars is not None: + if not isinstance(id_vars, (tuple, list, np.ndarray)): + id_vars = [id_vars] + else: + id_vars = list(id_vars) + else: + id_vars = [] + + if value_vars is not None: + if not isinstance(value_vars, (tuple, list, np.ndarray)): + value_vars = [value_vars] + frame = frame.ix[:, id_vars + value_vars] + else: + frame = frame.copy() + + if col_level is not None: # allow list or other? 
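+ # NOTE: when col_level is given, only that level of the MultiIndex columns
+ # is kept; the other column levels are dropped before melting.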
+ # frame is a copy + frame.columns = frame.columns.get_level_values(col_level) + + if var_name is None: + if isinstance(frame.columns, MultiIndex): + if len(frame.columns.names) == len(set(frame.columns.names)): + var_name = frame.columns.names + else: + var_name = ['variable_%s' % i for i in + range(len(frame.columns.names))] + else: + var_name = [frame.columns.name if frame.columns.name is not None + else 'variable'] + if isinstance(var_name, compat.string_types): + var_name = [var_name] + + N, K = frame.shape + K -= len(id_vars) + + mdata = {} + for col in id_vars: + mdata[col] = np.tile(frame.pop(col).values, K) + + mcolumns = id_vars + var_name + [value_name] + + mdata[value_name] = frame.values.ravel('F') + for i, col in enumerate(var_name): + # asanyarray will keep the columns as an Index + mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N) + + return DataFrame(mdata, columns=mcolumns) + + +def lreshape(data, groups, dropna=True, label=None): + """ + Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + + Parameters + ---------- + data : DataFrame + groups : dict + {new_name : list_of_columns} + dropna : boolean, default True + + Examples + -------- + >>> import pandas as pd + >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], + ... 'team': ['Red Sox', 'Yankees'], + ... 'year1': [2007, 2008], 'year2': [2008, 2008]}) + >>> data + hr1 hr2 team year1 year2 + 0 514 545 Red Sox 2007 2008 + 1 573 526 Yankees 2007 2008 + + >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) + team hr year + 0 Red Sox 514 2007 + 1 Yankees 573 2007 + 2 Red Sox 545 2008 + 3 Yankees 526 2008 + + Returns + ------- + reshaped : DataFrame + """ + if isinstance(groups, dict): + keys = list(groups.keys()) + values = list(groups.values()) + else: + keys, values = zip(*groups) + + all_cols = list(set.union(*[set(x) for x in values])) + id_cols = list(data.columns.diff(all_cols)) + + K = len(values[0]) + + for seq in values: + if len(seq) != K: + raise ValueError('All column lists must be same length') + + mdata = {} + pivot_cols = [] + + for target, names in zip(keys, values): + mdata[target] = com._concat_compat([data[col].values for col in names]) + pivot_cols.append(target) + + for col in id_cols: + mdata[col] = np.tile(data[col].values, K) + + if dropna: + mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) + for c in pivot_cols: + mask &= notnull(mdata[c]) + if not mask.all(): + mdata = dict((k, v[mask]) for k, v in compat.iteritems(mdata)) + + return DataFrame(mdata, columns=id_cols + pivot_cols) + + +def wide_to_long(df, stubnames, i, j): + """ + Wide panel to long format. Less flexible but more user-friendly than melt. + + Parameters + ---------- + df : DataFrame + The wide-format DataFrame + stubnames : list + A list of stub names. The wide format variables are assumed to + start with the stub names. + i : str + The name of the id variable. + j : str + The name of the subobservation variable. + stubend : str + Regex to match for the end of the stubs. + + Returns + ------- + DataFrame + A DataFrame that contains each stub name as a variable as well as + variables for i and j. + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> np.random.seed(123) + >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, + ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, + ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, + ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, + ... "X" : dict(zip(range(3), np.random.randn(3))) + ... 
}) + >>> df["id"] = df.index + >>> df + A1970 A1980 B1970 B1980 X id + 0 a d 2.5 3.2 -1.085631 0 + 1 b e 1.2 1.3 0.997345 1 + 2 c f 0.7 0.1 0.282978 2 + >>> wide_to_long(df, ["A", "B"], i="id", j="year") + X A B + id year + 0 1970 -1.085631 a 2.5 + 1 1970 0.997345 b 1.2 + 2 1970 0.282978 c 0.7 + 0 1980 -1.085631 d 3.2 + 1 1980 0.997345 e 1.3 + 2 1980 0.282978 f 0.1 + + Notes + ----- + All extra variables are treated as extra id variables. This simply uses + `pandas.melt` under the hood, but is hard-coded to "do the right thing" + in a typicaly case. + """ + def get_var_names(df, regex): + return df.filter(regex=regex).columns.tolist() + + def melt_stub(df, stub, i, j): + varnames = get_var_names(df, "^" + stub) + newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub, + var_name=j) + newdf_j = newdf[j].str.replace(stub, "") + try: + newdf_j = newdf_j.astype(int) + except ValueError: + pass + newdf[j] = newdf_j + return newdf + + id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames)) + if i not in id_vars: + id_vars += [i] + + stub = stubnames.pop(0) + newdf = melt_stub(df, stub, id_vars, j) + + for stub in stubnames: + new = melt_stub(df, stub, id_vars, j) + newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False) + return newdf.set_index([i, j]) + + +def convert_dummies(data, cat_variables, prefix_sep='_'): + """ + Compute DataFrame with specified columns converted to dummy variables (0 / + 1). Result columns will be prefixed with the column name, then the level + name, e.g. 'A_foo' for column A and level foo + + Parameters + ---------- + data : DataFrame + cat_variables : list-like + Must be column names in the DataFrame + prefix_sep : string, default '_' + String to use to separate column name from dummy level + + Returns + ------- + dummies : DataFrame + """ + result = data.drop(cat_variables, axis=1) + for variable in cat_variables: + dummies = get_dummies(data[variable], prefix=variable, + prefix_sep=prefix_sep) + result = result.join(dummies) + return result + + +def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): + """ + Convert categorical variable into dummy/indicator variables + + Parameters + ---------- + data : array-like or Series + prefix : string, default None + String to append DataFrame column names + prefix_sep : string, default '_' + If appending prefix, separator/delimiter to use + dummy_na : bool, default False + Add a column to indicate NaNs, if False NaNs are ignored. + + Returns + ------- + dummies : DataFrame + + Examples + -------- + >>> import pandas as pd + >>> s = pd.Series(list('abca')) + + >>> get_dummies(s) + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + + >>> s1 = ['a', 'b', np.nan] + + >>> get_dummies(s1) + a b + 0 1 0 + 1 0 1 + 2 0 0 + + >>> get_dummies(s1, dummy_na=True) + a b NaN + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + + See also ``Series.str.get_dummies``. 
+ + """ + # Series avoids inconsistent NaN handling + cat = Categorical.from_array(Series(data)) + levels = cat.levels + + # if all NaN + if not dummy_na and len(levels) == 0: + if isinstance(data, Series): + index = data.index + else: + index = np.arange(len(data)) + return DataFrame(index=index) + + number_of_cols = len(levels) + if dummy_na: + number_of_cols += 1 + + dummy_mat = np.eye(number_of_cols).take(cat.labels, axis=0) + + if dummy_na: + levels = np.append(cat.levels, np.nan) + else: + # reset NaN GH4446 + dummy_mat[cat.labels == -1] = 0 + + if prefix is not None: + dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) + for v in levels] + else: + dummy_cols = levels + + if isinstance(data, Series): + index = data.index + else: + index = None + + return DataFrame(dummy_mat, index=index, columns=dummy_cols) + + +def make_axis_dummies(frame, axis='minor', transform=None): + """ + Construct 1-0 dummy variables corresponding to designated axis + labels + + Parameters + ---------- + frame : DataFrame + axis : {'major', 'minor'}, default 'minor' + transform : function, default None + Function to apply to axis labels first. For example, to + get "day of week" dummies in a time series regression + you might call:: + + make_axis_dummies(panel, axis='major', + transform=lambda d: d.weekday()) + Returns + ------- + dummies : DataFrame + Column names taken from chosen axis + """ + numbers = { + 'major': 0, + 'minor': 1 + } + num = numbers.get(axis, axis) + + items = frame.index.levels[num] + labels = frame.index.labels[num] + if transform is not None: + mapped_items = items.map(transform) + cat = Categorical.from_array(mapped_items.take(labels)) + labels = cat.labels + items = cat.levels + + values = np.eye(len(items), dtype=float) + values = values.take(labels, axis=0) + + return DataFrame(values, columns=items, index=frame.index) + + +def block2d_to_blocknd(values, placement, shape, labels, ref_items): + """ pivot to the labels shape """ + from pandas.core.internals import make_block + + panel_shape = (len(placement),) + shape + + # TODO: lexsort depth needs to be 2!! + + # Create observation selection vector using major and minor + # labels, for converting to panel format. 
+ selector = factor_indexer(shape[1:], labels) + mask = np.zeros(np.prod(shape), dtype=bool) + mask.put(selector, True) + + if mask.all(): + pvalues = np.empty(panel_shape, dtype=values.dtype) + else: + dtype, fill_value = _maybe_promote(values.dtype) + pvalues = np.empty(panel_shape, dtype=dtype) + pvalues.fill(fill_value) + + values = values + for i in range(len(placement)): + pvalues[i].flat[mask] = values[:, i] + + return make_block(pvalues, placement=placement) + + +def factor_indexer(shape, labels): + """ given a tuple of shape and a list of Categorical labels, return the + expanded label indexer + """ + mult = np.array(shape)[::-1].cumprod()[::-1] + return com._ensure_platform_int( + np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) diff --git a/pandas/core/series.py b/pandas/core/series.py new file mode 100644 index 00000000..a484efe7 --- /dev/null +++ b/pandas/core/series.py @@ -0,0 +1,2531 @@ +""" +Data structure for 1-dimensional cross-sectional and time series data +""" +from __future__ import division + +# pylint: disable=E1101,E1103 +# pylint: disable=W0703,W0622,W0613,W0201 + +import types +import warnings + +from numpy import nan, ndarray +import numpy as np +import numpy.ma as ma + +from pandas.core.common import (isnull, notnull, _is_bool_indexer, + _default_index, _maybe_upcast, + _asarray_tuplesafe, _infer_dtype_from_scalar, + is_list_like, _values_from_object, + _possibly_cast_to_datetime, _possibly_castable, + _possibly_convert_platform, _try_sort, + ABCSparseArray, _maybe_match_name, + _ensure_object, SettingWithCopyError) +from pandas.core.index import (Index, MultiIndex, InvalidIndexError, + _ensure_index) +from pandas.core.indexing import _check_bool_indexer, _maybe_convert_indices +from pandas.core import generic, base +from pandas.core.internals import SingleBlockManager +from pandas.core.categorical import Categorical +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex, Period +from pandas import compat +from pandas.util.terminal import get_terminal_size +from pandas.compat import zip, u, OrderedDict + +import pandas.core.array as pa +import pandas.core.ops as ops +from pandas.core.algorithms import select_n + +import pandas.core.common as com +import pandas.core.datetools as datetools +import pandas.core.format as fmt +import pandas.core.nanops as nanops +from pandas.util.decorators import Appender, cache_readonly + +import pandas.lib as lib +import pandas.tslib as tslib +import pandas.index as _index + +from numpy import percentile as _quantile +from pandas.core.config import get_option + +__all__ = ['Series'] + +_shared_doc_kwargs = dict( + axes='index', + klass='Series', + axes_single_arg="{0,'index'}" +) + + +def _coerce_method(converter): + """ install the scalar coercion methods """ + + def wrapper(self): + if len(self) == 1: + return converter(self.iloc[0]) + raise TypeError( + "cannot convert the series to {0}".format(str(converter))) + return wrapper + + +def _unbox(func): + @Appender(func.__doc__) + def f(self, *args, **kwargs): + result = func(self.values, *args, **kwargs) + if isinstance(result, (pa.Array, Series)) and result.ndim == 0: + # return NumPy type + return result.dtype.type(result.item()) + else: # pragma: no cover + return result + f.__name__ = func.__name__ + return f + +#---------------------------------------------------------------------- +# Series class + + +class Series(base.IndexOpsMixin, generic.NDFrame): + + """ + One-dimensional ndarray with axis labels (including time 
series). + + Labels need not be unique but must be any hashable type. The object + supports both integer- and label-based indexing and provides a host of + methods for performing operations involving the index. Statistical + methods from ndarray have been overridden to automatically exclude + missing data (currently represented as NaN) + + Operations between Series (+, -, /, *, **) align values based on their + associated index values-- they need not be the same length. The result + index will be the sorted union of the two indexes. + + Parameters + ---------- + data : array-like, dict, or scalar value + Contains data stored in Series + index : array-like or Index (1d) + Values must be unique and hashable, same length as data. Index + object (or other iterable of same length as data) Will default to + np.arange(len(data)) if not provided. If both a dict and index + sequence are used, the index will override the keys found in the + dict. + dtype : numpy.dtype or None + If None, dtype will be inferred + copy : boolean, default False + Copy input data + """ + _metadata = ['name'] + _allow_index_ops = True + + @property + def _allow_datetime_index_ops(self): + # disabling to invalidate datetime index ops (GH7206) + # return self.index.is_all_dates and isinstance(self.index, DatetimeIndex) + return False + + @property + def _allow_period_index_ops(self): + # disabling to invalidate period index ops (GH7206) + # return self.index.is_all_dates and isinstance(self.index, PeriodIndex) + return False + + def __init__(self, data=None, index=None, dtype=None, name=None, + copy=False, fastpath=False): + + # we are called internally, so short-circuit + if fastpath: + + # data is an ndarray, index is defined + if not isinstance(data, SingleBlockManager): + data = SingleBlockManager(data, index, fastpath=True) + if copy: + data = data.copy() + if index is None: + index = data.index + + else: + + if index is not None: + index = _ensure_index(index) + + if data is None: + data = {} + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, MultiIndex): + raise NotImplementedError + elif isinstance(data, Index): + # need to copy to avoid aliasing issues + if name is None: + name = data.name + + data = data._to_embed(keep_tz=True) + copy = True + elif isinstance(data, pa.Array): + pass + elif isinstance(data, Series): + if name is None: + name = data.name + if index is None: + index = data.index + else: + data = data.reindex(index, copy=copy) + data = data._data + elif isinstance(data, dict): + if index is None: + if isinstance(data, OrderedDict): + index = Index(data) + else: + index = Index(_try_sort(data)) + try: + if isinstance(index, DatetimeIndex): + # coerce back to datetime objects for lookup + data = lib.fast_multiget(data, index.astype('O'), + default=pa.NA) + elif isinstance(index, PeriodIndex): + data = [data.get(i, nan) for i in index] + else: + data = lib.fast_multiget(data, index.values, + default=pa.NA) + except TypeError: + data = [data.get(i, nan) for i in index] + + elif isinstance(data, SingleBlockManager): + if index is None: + index = data.index + else: + data = data.reindex(index, copy=copy) + elif isinstance(data, Categorical): + if name is None: + name = data.name + data = np.asarray(data) + elif isinstance(data, types.GeneratorType): + data = list(data) + elif isinstance(data, (set, frozenset)): + raise TypeError("{0!r} type is unordered" + "".format(data.__class__.__name__)) + else: + + # handle sparse passed here (and force conversion) + if 
isinstance(data, ABCSparseArray): + data = data.to_dense() + + if index is None: + if not is_list_like(data): + data = [data] + index = _default_index(len(data)) + + # create/copy the manager + if isinstance(data, SingleBlockManager): + if dtype is not None: + data = data.astype(dtype=dtype, raise_on_error=False) + elif copy: + data = data.copy() + else: + data = _sanitize_array(data, index, dtype, copy, + raise_cast_failure=True) + + data = SingleBlockManager(data, index, fastpath=True) + + generic.NDFrame.__init__(self, data, fastpath=True) + + object.__setattr__(self, 'name', name) + self._set_axis(0, index, fastpath=True) + + @classmethod + def from_array(cls, arr, index=None, name=None, copy=False, + fastpath=False): + # return a sparse series here + if isinstance(arr, ABCSparseArray): + from pandas.sparse.series import SparseSeries + cls = SparseSeries + + return cls(arr, index=index, name=name, copy=copy, fastpath=fastpath) + + @property + def _constructor(self): + return Series + + # types + @property + def _can_hold_na(self): + return self._data._can_hold_na + + @property + def is_time_series(self): + return self._subtyp in ['time_series', 'sparse_time_series'] + + _index = None + + def _set_axis(self, axis, labels, fastpath=False): + """ override generic, we want to set the _typ here """ + + if not fastpath: + labels = _ensure_index(labels) + + is_all_dates = labels.is_all_dates + if is_all_dates: + from pandas.tseries.index import DatetimeIndex + from pandas.tseries.period import PeriodIndex + if not isinstance(labels, (DatetimeIndex, PeriodIndex)): + labels = DatetimeIndex(labels) + + # need to set here becuase we changed the index + if fastpath: + self._data.set_axis(axis, labels) + self._set_subtyp(is_all_dates) + + object.__setattr__(self, '_index', labels) + if not fastpath: + self._data.set_axis(axis, labels) + + def _set_subtyp(self, is_all_dates): + if is_all_dates: + object.__setattr__(self, '_subtyp', 'time_series') + else: + object.__setattr__(self, '_subtyp', 'series') + + # ndarray compatibility + def item(self): + return self._data.values.item() + + @property + def data(self): + return self._data.values.data + + @property + def strides(self): + return self._data.values.strides + + @property + def size(self): + return self._data.values.size + + @property + def flags(self): + return self._data.values.flags + + @property + def dtype(self): + return self._data.dtype + + @property + def dtypes(self): + """ for compat """ + return self._data.dtype + + @property + def ftype(self): + return self._data.ftype + + @property + def ftypes(self): + """ for compat """ + return self._data.ftype + + @property + def shape(self): + return self._data.shape + + @property + def ndim(self): + return 1 + + @property + def base(self): + return self.values.base + + def ravel(self, order='C'): + return self.values.ravel(order=order) + + def compress(self, condition, axis=0, out=None, **kwargs): + # 1-d compat with numpy + return self[condition] + + def transpose(self): + """ support for compatiblity """ + return self + + T = property(transpose) + + def nonzero(self): + """ numpy like, returns same as nonzero """ + return self.values.nonzero() + + def put(self, *args, **kwargs): + self.values.put(*args, **kwargs) + + def __len__(self): + return len(self._data) + + def view(self, dtype=None): + return self._constructor(self.values.view(dtype), + index=self.index).__finalize__(self) + + def __array__(self, result=None): + """ + the array interface, return my values + """ + return self.values + 
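+ # NOTE: __array__, __array_prepare__ and __array_wrap__ implement the NumPy
+ # ufunc protocol: a ufunc such as np.sqrt(s) reads the data via __array__,
+ # and __array_wrap__ re-boxes the result as a Series sharing the original
+ # index, e.g. np.sqrt(Series([1., 4.])) -> Series([1., 2.]).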
+ def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc + """ + return self._constructor(result, index=self.index, + copy=False).__finalize__(self) + + def __array_prepare__(self, result, context=None): + """ + Gets called prior to a ufunc + """ + return result + + # complex + @property + def real(self): + return self.values.real + + @real.setter + def real(self, v): + self.values.real = v + + @property + def imag(self): + return self.values.imag + + @imag.setter + def imag(self, v): + self.values.imag = v + + # coercion + __float__ = _coerce_method(float) + __long__ = _coerce_method(int) + __int__ = _coerce_method(int) + + # we are preserving name here + def __getstate__(self): + return dict(_data=self._data, name=self.name) + + def _unpickle_series_compat(self, state): + if isinstance(state, dict): + self._data = state['_data'] + self.name = state['name'] + self.index = self._data.index + + elif isinstance(state, tuple): + + # < 0.12 series pickle + + nd_state, own_state = state + + # recreate the ndarray + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + + # backwards compat + index, name = own_state[0], None + if len(own_state) > 1: + name = own_state[1] + + # recreate + self._data = SingleBlockManager(data, index, fastpath=True) + self.index = index + self.name = name + + else: + raise Exception("cannot unpickle legacy formats -> [%s]" % state) + + # indexers + @property + def axes(self): + return [self.index] + + def _ixs(self, i, axis=0): + """ + Return the i-th value or values in the Series by location + + Parameters + ---------- + i : int, slice, or sequence of integers + + Returns + ------- + value : scalar (int) or Series (slice, sequence) + """ + try: + return _index.get_value_at(self.values, i) + except IndexError: + raise + except: + if isinstance(i, slice): + indexer = self.index._convert_slice_indexer(i, typ='iloc') + return self._get_values(indexer) + else: + label = self.index[i] + if isinstance(label, Index): + return self.take(i, axis=axis, convert=True) + else: + return _index.get_value_at(self, i) + + @property + def _is_mixed_type(self): + return False + + def _slice(self, slobj, axis=0, typ=None): + slobj = self.index._convert_slice_indexer(slobj, typ=typ or 'getitem') + return self._get_values(slobj) + + def __getitem__(self, key): + try: + result = self.index.get_value(self, key) + + if not np.isscalar(result): + if is_list_like(result) and not isinstance(result, Series): + + # we need to box if we have a non-unique index here + # otherwise have inline ndarray/lists + if not self.index.is_unique: + result = self._constructor(result, + index=[key]*len(result) + ,dtype=self.dtype).__finalize__(self) + + return result + except InvalidIndexError: + pass + except (KeyError, ValueError): + if isinstance(key, tuple) and isinstance(self.index, MultiIndex): + # kludge + pass + elif key is Ellipsis: + return self + elif _is_bool_indexer(key): + pass + else: + + # we can try to coerce the indexer (or this will raise) + new_key = self.index._convert_scalar_indexer(key) + if type(new_key) != type(key): + return self.__getitem__(new_key) + raise + + except Exception: + raise + + if com.is_iterator(key): + key = list(key) + + if _is_bool_indexer(key): + key = _check_bool_indexer(self.index, key) + + return self._get_with(key) + + def _get_with(self, key): + # other: fancy integer or otherwise + if isinstance(key, slice): + indexer = self.index._convert_slice_indexer(key, typ='getitem') + return 
self._get_values(indexer) + else: + if isinstance(key, tuple): + try: + return self._get_values_tuple(key) + except: + if len(key) == 1: + key = key[0] + if isinstance(key, slice): + return self._get_values(key) + raise + + # pragma: no cover + if not isinstance(key, (list, pa.Array, Series)): + key = list(key) + + if isinstance(key, Index): + key_type = key.inferred_type + else: + key_type = lib.infer_dtype(key) + + if key_type == 'integer': + if self.index.is_integer() or self.index.is_floating(): + return self.reindex(key) + else: + return self._get_values(key) + elif key_type == 'boolean': + return self._get_values(key) + else: + try: + # handle the dup indexing case (GH 4246) + if isinstance(key, (list, tuple)): + return self.ix[key] + + return self.reindex(key) + except Exception: + # [slice(0, 5, None)] will break if you convert to ndarray, + # e.g. as requested by np.median + # hack + if isinstance(key[0], slice): + return self._get_values(key) + raise + + def _get_values_tuple(self, key): + # mpl hackaround + if any(k is None for k in key): + return self._get_values(key) + + if not isinstance(self.index, MultiIndex): + raise ValueError('Can only tuple-index with a MultiIndex') + + # If key is contained, would have returned by now + indexer, new_index = self.index.get_loc_level(key) + return self._constructor(self.values[indexer], + index=new_index).__finalize__(self) + + def _get_values(self, indexer): + try: + return self._constructor(self._data.get_slice(indexer), + fastpath=True).__finalize__(self) + except Exception: + return self.values[indexer] + + def __setitem__(self, key, value): + try: + self._set_with_engine(key, value) + return + except (SettingWithCopyError): + raise + except (KeyError, ValueError): + values = self.values + if (com.is_integer(key) + and not self.index.inferred_type == 'integer'): + + values[key] = value + return + elif key is Ellipsis: + self[:] = value + return + elif _is_bool_indexer(key): + pass + elif com.is_timedelta64_dtype(self.dtype): + # reassign a null value to iNaT + if isnull(value): + value = tslib.iNaT + + try: + self.index._engine.set_value(self.values, key, value) + return + except (TypeError): + pass + + self.loc[key] = value + return + + except TypeError as e: + if isinstance(key, tuple) and not isinstance(self.index, + MultiIndex): + raise ValueError("Can only tuple-index with a MultiIndex") + + # python 3 type errors should be raised + if 'unorderable' in str(e): # pragma: no cover + raise IndexError(key) + + if _is_bool_indexer(key): + key = _check_bool_indexer(self.index, key) + try: + self.where(~key, value, inplace=True) + return + except (InvalidIndexError): + pass + + self._set_with(key, value) + + def _set_with_engine(self, key, value): + values = self.values + try: + self.index._engine.set_value(values, key, value) + self._check_setitem_copy() + return + except KeyError: + values[self.index.get_loc(key)] = value + return + + def _set_with(self, key, value): + # other: fancy integer or otherwise + if isinstance(key, slice): + indexer = self.index._convert_slice_indexer(key, typ='getitem') + return self._set_values(indexer, value) + else: + if isinstance(key, tuple): + try: + self._set_values(key, value) + except Exception: + pass + + if not isinstance(key, (list, Series, pa.Array, Series)): + key = list(key) + + if isinstance(key, Index): + key_type = key.inferred_type + else: + key_type = lib.infer_dtype(key) + + if key_type == 'integer': + if self.index.inferred_type == 'integer': + self._set_labels(key, value) + else: + 
return self._set_values(key, value) + elif key_type == 'boolean': + self._set_values(key.astype(np.bool_), value) + else: + self._set_labels(key, value) + + def _set_labels(self, key, value): + if isinstance(key, Index): + key = key.values + else: + key = _asarray_tuplesafe(key) + indexer = self.index.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise ValueError('%s not contained in the index' + % str(key[mask])) + self._set_values(indexer, value) + + def _set_values(self, key, value): + if isinstance(key, Series): + key = key.values + self._data = self._data.setitem(indexer=key, value=value) + self._maybe_update_cacher() + + # help out SparseSeries + _get_val_at = ndarray.__getitem__ + + def repeat(self, reps): + """ + See ndarray.repeat + """ + new_index = self.index.repeat(reps) + new_values = self.values.repeat(reps) + return self._constructor(new_values, + index=new_index).__finalize__(self) + + def reshape(self, *args, **kwargs): + """ + See numpy.ndarray.reshape + """ + if len(args) == 1 and hasattr(args[0], '__iter__'): + shape = args[0] + else: + shape = args + + if tuple(shape) == self.shape: + # XXX ignoring the "order" keyword. + return self + + return self.values.reshape(shape, **kwargs) + + iget_value = _ixs + iget = _ixs + irow = _ixs + + def get_value(self, label, takeable=False): + """ + Quickly retrieve single value at passed index label + + Parameters + ---------- + index : label + takeable : interpret the index as indexers, default False + + Returns + ------- + value : scalar value + """ + if takeable is True: + return self.values[label] + return self.index.get_value(self.values, label) + + def set_value(self, label, value, takeable=False): + """ + Quickly set single value at passed label. If label is not contained, a + new object is created with the label placed at the end of the result + index + + Parameters + ---------- + label : object + Partial indexing with MultiIndex not allowed + value : object + Scalar value + takeable : interpret the index as indexers, default False + + Returns + ------- + series : Series + If label is contained, will be reference to calling Series, + otherwise a new object + """ + try: + if takeable: + self.values[label] = value + else: + self.index._engine.set_value(self.values, label, value) + return self + except KeyError: + + # set using a non-recursive method + self.loc[label] = value + return self + + def reset_index(self, level=None, drop=False, name=None, inplace=False): + """ + Analogous to the :meth:`pandas.DataFrame.reset_index` function, see + docstring there. + + Parameters + ---------- + level : int, str, tuple, or list, default None + Only remove the given levels from the index. 
Removes all levels by + default + drop : boolean, default False + Do not try to insert index into dataframe columns + name : object, default None + The name of the column corresponding to the Series values + inplace : boolean, default False + Modify the Series in place (do not create a new object) + + Returns + ---------- + resetted : DataFrame, or Series if drop == True + """ + if drop: + new_index = pa.arange(len(self)) + if level is not None and isinstance(self.index, MultiIndex): + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if len(level) < len(self.index.levels): + new_index = self.index.droplevel(level) + + if inplace: + self.index = new_index + # set name if it was passed, otherwise, keep the previous name + self.name = name or self.name + else: + return self._constructor(self.values.copy(), + index=new_index).__finalize__(self) + elif inplace: + raise TypeError('Cannot reset_index inplace on a Series ' + 'to create a DataFrame') + else: + df = self.to_frame(name) + return df.reset_index(level=level, drop=drop) + + def __unicode__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + width, height = get_terminal_size() + max_rows = (height if get_option("display.max_rows") == 0 + else get_option("display.max_rows")) + if max_rows and len(self.index) > max_rows: + result = self._tidy_repr(min(30, max_rows - 4)) + elif len(self.index) > 0: + result = self._get_repr(print_header=True, + length=len(self) > 50, + name=True, + dtype=True) + elif self.name is None: + result = u('Series([], dtype: %s)') % (self.dtype) + else: + result = u('Series([], name: %s, dtype: %s)') % (self.name, + self.dtype) + return result + + def _tidy_repr(self, max_vals=20): + """ + + Internal function, should always return unicode string + """ + if max_vals > 1: + num = max_vals // 2 + else: + num = 1 + max_vals = 2 + head = self.iloc[:num]._get_repr(print_header=True, length=False, + dtype=False, name=False) + tail = self.iloc[-(max_vals - num):]._get_repr(print_header=False, + length=False, + name=False, + dtype=False) + result = head + '\n...\n' + tail + result = '%s\n%s' % (result, self._repr_footer()) + + return compat.text_type(result) + + def _repr_footer(self): + + # time series + if self.is_time_series: + if self.index.freq is not None: + freqstr = u('Freq: %s, ') % self.index.freqstr + else: + freqstr = u('') + + namestr = u("Name: %s, ") % com.pprint_thing( + self.name) if self.name is not None else "" + return u('%s%sLength: %d') % (freqstr, namestr, len(self)) + + # reg series + namestr = u("Name: %s, ") % com.pprint_thing( + self.name) if self.name is not None else "" + return u('%sLength: %d, dtype: %s') % (namestr, + len(self), + str(self.dtype.name)) + + def to_string(self, buf=None, na_rep='NaN', float_format=None, + length=False, dtype=False, name=False): + """ + Render a string representation of the Series + + Parameters + ---------- + buf : StringIO-like, optional + buffer to write to + na_rep : string, optional + string representation of NAN to use, default 'NaN' + float_format : one-parameter function, optional + formatter function to apply to columns' elements if they are floats + default None + length : boolean, default False + Add the Series length + dtype : boolean, default False + Add the Series dtype + name : boolean, default False + Add the Series name (which may be None) + + Returns + ------- 
+ formatted : string (if not buffer passed) + """ + + the_repr = self._get_repr(float_format=float_format, na_rep=na_rep, + length=length, dtype=dtype, name=name) + + # catch contract violations + if not isinstance(the_repr, compat.text_type): + raise AssertionError("result must be of type unicode, type" + " of result is {0!r}" + "".format(the_repr.__class__.__name__)) + + if buf is None: + return the_repr + else: + try: + buf.write(the_repr) + except AttributeError: + with open(buf, 'w') as f: + f.write(the_repr) + + def _get_repr( + self, name=False, print_header=False, length=True, dtype=True, + na_rep='NaN', float_format=None): + """ + + Internal function, should always return unicode string + """ + + formatter = fmt.SeriesFormatter(self, name=name, header=print_header, + length=length, dtype=dtype, + na_rep=na_rep, + float_format=float_format) + result = formatter.to_string() + + # TODO: following check prob. not neces. + if not isinstance(result, compat.text_type): + raise AssertionError("result must be of type unicode, type" + " of result is {0!r}" + "".format(result.__class__.__name__)) + return result + + def __iter__(self): + if np.issubdtype(self.dtype, np.datetime64): + return (lib.Timestamp(x) for x in self.values) + else: + return iter(self.values) + + def iteritems(self): + """ + Lazily iterate over (index, value) tuples + """ + return zip(iter(self.index), iter(self)) + + if compat.PY3: # pragma: no cover + items = iteritems + + #---------------------------------------------------------------------- + # unbox reductions + + all = _unbox(pa.Array.all) + any = _unbox(pa.Array.any) + + #---------------------------------------------------------------------- + # Misc public methods + + def keys(self): + "Alias for index" + return self.index + + @property + def values(self): + """ + Return Series as ndarray + + Returns + ------- + arr : numpy.ndarray + """ + return self._data.values + + def get_values(self): + """ same as values (but handles sparseness conversions); is a view """ + return self._data.values + + def tolist(self): + """ Convert Series to a nested list """ + return list(self) + + def to_dict(self): + """ + Convert Series to {label -> value} dict + + Returns + ------- + value_dict : dict + """ + return dict(compat.iteritems(self)) + + def to_frame(self, name=None): + """ + Convert Series to DataFrame + + Parameters + ---------- + name : object, default None + The passed name should substitute for the series name (if it has + one). 
+ + Returns + ------- + data_frame : DataFrame + """ + from pandas.core.frame import DataFrame + if name is None: + df = DataFrame(self) + else: + df = DataFrame({name: self}) + + return df + + def to_sparse(self, kind='block', fill_value=None): + """ + Convert Series to SparseSeries + + Parameters + ---------- + kind : {'block', 'integer'} + fill_value : float, defaults to NaN (missing) + + Returns + ------- + sp : SparseSeries + """ + from pandas.core.sparse import SparseSeries + return SparseSeries(self, kind=kind, + fill_value=fill_value).__finalize__(self) + + #---------------------------------------------------------------------- + # Statistics, overridden ndarray methods + + # TODO: integrate bottleneck + + def count(self, level=None): + """ + Return number of non-NA/null observations in the Series + + Parameters + ---------- + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a smaller Series + + Returns + ------- + nobs : int or Series (if level specified) + """ + if level is not None: + mask = notnull(self.values) + + if isinstance(level, compat.string_types): + level = self.index._get_level_number(level) + + level_index = self.index.levels[level] + + if len(self) == 0: + return self._constructor(0, index=level_index)\ + .__finalize__(self) + + # call cython function + max_bin = len(level_index) + labels = com._ensure_int64(self.index.labels[level]) + counts = lib.count_level_1d(mask.view(pa.uint8), + labels, max_bin) + return self._constructor(counts, + index=level_index).__finalize__(self) + + return notnull(_values_from_object(self)).sum() + + def mode(self): + """Returns the mode(s) of the dataset. + + Empty if nothing occurs at least 2 times. Always returns Series even + if only one value. + + Parameters + ---------- + sort : bool, default True + If True, will lexicographically sort values, if False skips + sorting. Result ordering when ``sort=False`` is not defined. + + Returns + ------- + modes : Series (sorted) + """ + # TODO: Add option for bins like value_counts() + from pandas.core.algorithms import mode + return mode(self) + + def drop_duplicates(self, take_last=False, inplace=False): + """ + Return Series with duplicate values removed + + Parameters + ---------- + take_last : boolean, default False + Take the last observed index in a group. Default first + inplace : boolean, default False + If True, performs operation inplace and returns None. + + Returns + ------- + deduplicated : Series + """ + duplicated = self.duplicated(take_last=take_last) + result = self[-duplicated] + if inplace: + return self._update_inplace(result) + else: + return result + + def duplicated(self, take_last=False): + """ + Return boolean Series denoting duplicate values + + Parameters + ---------- + take_last : boolean, default False + Take the last observed index in a group. Default first + + Returns + ------- + duplicated : Series + """ + keys = _ensure_object(self.values) + duplicated = lib.duplicated(keys, take_last=take_last) + return self._constructor(duplicated, + index=self.index).__finalize__(self) + + def idxmin(self, axis=None, out=None, skipna=True): + """ + Index of first occurrence of minimum of values. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + idxmin : Index of minimum of values + + Notes + ----- + This method is the Series version of ``ndarray.argmin``. 
+ + See Also + -------- + DataFrame.idxmin + """ + i = nanops.nanargmin(_values_from_object(self), skipna=skipna) + if i == -1: + return pa.NA + return self.index[i] + + def idxmax(self, axis=None, out=None, skipna=True): + """ + Index of first occurrence of maximum of values. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + idxmax : Index of maximum of values + + Notes + ----- + This method is the Series version of ``ndarray.argmax``. + + See Also + -------- + DataFrame.idxmax + """ + i = nanops.nanargmax(_values_from_object(self), skipna=skipna) + if i == -1: + return pa.NA + return self.index[i] + + # ndarray compat + argmin = idxmin + argmax = idxmax + + @Appender(pa.Array.round.__doc__) + def round(self, decimals=0, out=None): + """ + + """ + result = _values_from_object(self).round(decimals, out=out) + if out is None: + result = self._constructor(result, + index=self.index).__finalize__(self) + + return result + + def quantile(self, q=0.5): + """ + Return value at the given quantile, a la numpy.percentile. + + Parameters + ---------- + q : float or array-like, default 0.5 (50% quantile) + 0 <= q <= 1, the quantile(s) to compute + + Returns + ------- + quantile : float or Series + if ``q`` is an array, a Series will be returned where the + index is ``q`` and the values are the quantiles. + + Examples + -------- + + >>> s = Series([1, 2, 3, 4]) + >>> s.quantile(.5) + 2.5 + >>> s.quantile([.25, .5, .75]) + 0.25 1.75 + 0.50 2.50 + 0.75 3.25 + dtype: float64 + """ + valid_values = self.dropna().values + if len(valid_values) == 0: + return pa.NA + + def multi(values, qs): + if com.is_list_like(qs): + return Series([_quantile(values, x*100) + for x in qs], index=qs) + else: + return _quantile(values, qs*100) + + if com.is_datetime64_dtype(self): + values = _values_from_object(self).view('i8') + result = multi(values, q) + if com.is_list_like(q): + result = result.map(lib.Timestamp) + else: + result = lib.Timestamp(result) + else: + result = multi(valid_values, q) + + return result + + def ptp(self, axis=None, out=None): + return _values_from_object(self).ptp(axis, out) + + def corr(self, other, method='pearson', + min_periods=None): + """ + Compute correlation with `other` Series, excluding missing values + + Parameters + ---------- + other : Series + method : {'pearson', 'kendall', 'spearman'} + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + min_periods : int, optional + Minimum number of observations needed to have a valid result + + + Returns + ------- + correlation : float + """ + this, other = self.align(other, join='inner', copy=False) + if len(this) == 0: + return pa.NA + return nanops.nancorr(this.values, other.values, method=method, + min_periods=min_periods) + + def cov(self, other, min_periods=None): + """ + Compute covariance with Series, excluding missing values + + Parameters + ---------- + other : Series + min_periods : int, optional + Minimum number of observations needed to have a valid result + + Returns + ------- + covariance : float + + Normalized by N-1 (unbiased estimator). 
+ """ + this, other = self.align(other, join='inner') + if len(this) == 0: + return pa.NA + return nanops.nancov(this.values, other.values, + min_periods=min_periods) + + def diff(self, periods=1): + """ + 1st discrete difference of object + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming difference + + Returns + ------- + diffed : Series + """ + result = com.diff(_values_from_object(self), periods) + return self._constructor(result, index=self.index).__finalize__(self) + + def autocorr(self): + """ + Lag-1 autocorrelation + + Returns + ------- + autocorr : float + """ + return self.corr(self.shift(1)) + + def dot(self, other): + """ + Matrix multiplication with DataFrame or inner-product with Series + objects + + Parameters + ---------- + other : Series or DataFrame + + Returns + ------- + dot_product : scalar or Series + """ + from pandas.core.frame import DataFrame + if isinstance(other, (Series, DataFrame)): + common = self.index.union(other.index) + if (len(common) > len(self.index) or + len(common) > len(other.index)): + raise ValueError('matrices are not aligned') + + left = self.reindex(index=common, copy=False) + right = other.reindex(index=common, copy=False) + lvals = left.values + rvals = right.values + else: + left = self + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[0] != rvals.shape[0]: + raise Exception('Dot product shape mismatch, %s vs %s' % + (lvals.shape, rvals.shape)) + + if isinstance(other, DataFrame): + return self._constructor(np.dot(lvals, rvals), + index=other.columns).__finalize__(self) + elif isinstance(other, Series): + return np.dot(lvals, rvals) + elif isinstance(rvals, np.ndarray): + return np.dot(lvals, rvals) + else: # pragma: no cover + raise TypeError('unsupported type: %s' % type(other)) + +#------------------------------------------------------------------------------ +# Combination + + def append(self, to_append, verify_integrity=False): + """ + Concatenate two or more Series. The indexes must not overlap + + Parameters + ---------- + to_append : Series or list/tuple of Series + verify_integrity : boolean, default False + If True, raise Exception on creating index with duplicates + + Returns + ------- + appended : Series + """ + from pandas.tools.merge import concat + + if isinstance(to_append, (list, tuple)): + to_concat = [self] + to_append + else: + to_concat = [self, to_append] + return concat(to_concat, ignore_index=False, + verify_integrity=verify_integrity) + + def _binop(self, other, func, level=None, fill_value=None): + """ + Perform generic binary operation with optional fill value + + Parameters + ---------- + other : Series + func : binary operator + fill_value : float or object + Value to substitute for NA/null values. 
If both Series are NA in a + location, the result will be NA regardless of the passed fill value + level : int or level name, default None + Broadcast across a level, matching Index values on the + passed MultiIndex level + + Returns + ------- + combined : Series + """ + if not isinstance(other, Series): + raise AssertionError('Other operand must be Series') + + new_index = self.index + this = self + + if not self.index.equals(other.index): + this, other = self.align(other, level=level, join='outer') + new_index = this.index + + this_vals = this.values + other_vals = other.values + + if fill_value is not None: + this_mask = isnull(this_vals) + other_mask = isnull(other_vals) + this_vals = this_vals.copy() + other_vals = other_vals.copy() + + # one but not both + mask = this_mask ^ other_mask + this_vals[this_mask & mask] = fill_value + other_vals[other_mask & mask] = fill_value + + result = func(this_vals, other_vals) + name = _maybe_match_name(self, other) + return self._constructor(result, index=new_index).__finalize__(self) + + def combine(self, other, func, fill_value=nan): + """ + Perform elementwise binary operation on two Series using given function + with optional fill value when an index is missing from one Series or + the other + + Parameters + ---------- + other : Series or scalar value + func : function + fill_value : scalar value + + Returns + ------- + result : Series + """ + if isinstance(other, Series): + new_index = self.index + other.index + new_name = _maybe_match_name(self, other) + new_values = pa.empty(len(new_index), dtype=self.dtype) + for i, idx in enumerate(new_index): + lv = self.get(idx, fill_value) + rv = other.get(idx, fill_value) + new_values[i] = func(lv, rv) + else: + new_index = self.index + new_values = func(self.values, other) + new_name = self.name + return self._constructor(new_values, index=new_index, name=new_name) + + def combine_first(self, other): + """ + Combine Series values, choosing the calling Series's values + first. Result index will be the union of the two indexes + + Parameters + ---------- + other : Series + + Returns + ------- + y : Series + """ + new_index = self.index + other.index + this = self.reindex(new_index, copy=False) + other = other.reindex(new_index, copy=False) + name = _maybe_match_name(self, other) + rs_vals = com._where_compat(isnull(this), other.values, this.values) + return self._constructor(rs_vals, index=new_index).__finalize__(self) + + def update(self, other): + """ + Modify Series in place using non-NA values from passed + Series. Aligns on index + + Parameters + ---------- + other : Series + """ + other = other.reindex_like(self) + mask = notnull(other) + + self._data = self._data.putmask(mask=mask, new=other, inplace=True) + self._maybe_update_cacher() + + #---------------------------------------------------------------------- + # Reindexing, sorting + + def sort_index(self, ascending=True): + """ + Sort object by labels (along an axis) + + Parameters + ---------- + ascending : boolean or list, default True + Sort ascending vs. descending. 
Specify list for multiple sort + orders + + Examples + -------- + >>> result1 = s.sort_index(ascending=False) + >>> result2 = s.sort_index(ascending=[1, 0]) + + Returns + ------- + sorted_obj : Series + """ + index = self.index + if isinstance(index, MultiIndex): + from pandas.core.groupby import _lexsort_indexer + indexer = _lexsort_indexer(index.labels, orders=ascending) + indexer = com._ensure_platform_int(indexer) + new_labels = index.take(indexer) + else: + new_labels, indexer = index.order(return_indexer=True, + ascending=ascending) + + new_values = self.values.take(indexer) + return self._constructor(new_values, + index=new_labels).__finalize__(self) + + def argsort(self, axis=0, kind='quicksort', order=None): + """ + Overrides ndarray.argsort. Argsorts the value, omitting NA/null values, + and places the result in the same locations as the non-NA values + + Parameters + ---------- + axis : int (can only be zero) + kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See np.sort for more + information. 'mergesort' is the only stable algorithm + order : ignored + + Returns + ------- + argsorted : Series, with -1 indicated where nan values are present + + """ + values = self.values + mask = isnull(values) + + if mask.any(): + result = Series( + -1, index=self.index, name=self.name, dtype='int64') + notmask = ~mask + result[notmask] = np.argsort(values[notmask], kind=kind) + return self._constructor(result, + index=self.index).__finalize__(self) + else: + return self._constructor( + np.argsort(values, kind=kind), index=self.index, + dtype='int64').__finalize__(self) + + def rank(self, method='average', na_option='keep', ascending=True, + pct=False): + """ + Compute data ranks (1 through n). Equal values are assigned a rank that + is the average of the ranks of those values + + Parameters + ---------- + method : {'average', 'min', 'max', 'first', 'dense'} + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + na_option : {'keep'} + keep: leave NA values where they are + ascending : boolean, default True + False for ranks by high (1) to low (N) + pct : boolean, default False + Computes percentage rank of data + + Returns + ------- + ranks : Series + """ + from pandas.core.algorithms import rank + ranks = rank(self.values, method=method, na_option=na_option, + ascending=ascending, pct=pct) + return self._constructor(ranks, index=self.index).__finalize__(self) + + def sort(self, axis=0, ascending=True, kind='quicksort', na_position='last', inplace=True): + """ + Sort values and index labels by value. This is an inplace sort by default. + Series.order is the equivalent but returns a new Series. + + Parameters + ---------- + axis : int (can only be zero) + ascending : boolean, default True + Sort ascending. Passing False sorts descending + kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See np.sort for more + information. 'mergesort' is the only stable algorithm + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + inplace : boolean, default True + Do operation in place. 
+ + See Also + -------- + Series.order + """ + return self.order(ascending=ascending, + kind=kind, + na_position=na_position, + inplace=inplace) + + def order(self, na_last=None, ascending=True, kind='quicksort', na_position='last', inplace=False): + """ + Sorts Series object, by value, maintaining index-value link. + This will return a new Series by default. Series.sort is the equivalent but as an inplace method. + + Parameters + ---------- + na_last : boolean (optional, default=True) (DEPRECATED; use na_position) + Put NaN's at beginning or end + ascending : boolean, default True + Sort ascending. Passing False sorts descending + kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See np.sort for more + information. 'mergesort' is the only stable algorithm + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + inplace : boolean, default False + Do operation in place. + + Returns + ------- + y : Series + + See Also + -------- + Series.sort + """ + + # GH 5856/5853 + if inplace and self._is_cached: + raise ValueError("This Series is a view of some other array, to " + "sort in-place you must create a copy") + + if na_last is not None: + warnings.warn(("na_last is deprecated. Please use na_position instead"), + FutureWarning) + na_position = 'last' if na_last else 'first' + + def _try_kind_sort(arr): + # easier to ask forgiveness than permission + try: + # if kind==mergesort, it can fail for object dtype + return arr.argsort(kind=kind) + except TypeError: + # stable sort not available for object dtype + # uses the argsort default quicksort + return arr.argsort(kind='quicksort') + + arr = self.values + sortedIdx = pa.empty(len(self), dtype=np.int32) + + bad = isnull(arr) + + good = ~bad + idx = pa.arange(len(self)) + + argsorted = _try_kind_sort(arr[good]) + + if not ascending: + argsorted = argsorted[::-1] + + if na_position == 'last': + n = good.sum() + sortedIdx[:n] = idx[good][argsorted] + sortedIdx[n:] = idx[bad] + elif na_position == 'first': + n = bad.sum() + sortedIdx[n:] = idx[good][argsorted] + sortedIdx[:n] = idx[bad] + else: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + + result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx]) + + if inplace: + self._update_inplace(result) + else: + return result.__finalize__(self) + + def nlargest(self, n=5, take_last=False): + """Return the largest `n` elements. + + Parameters + ---------- + n : int + Return this many descending sorted values + take_last : bool + Where there are duplicate values, take the last duplicate + + Returns + ------- + top_n : Series + The n largest values in the Series, in sorted order + + Notes + ----- + Faster than ``.order(ascending=False).head(n)`` for small `n` relative + to the size of the ``Series`` object. + + See Also + -------- + Series.nsmallest + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> s = pd.Series(np.random.randn(1e6)) + >>> s.nlargest(10) # only sorts up to the N requested + """ + return select_n(self, n=n, take_last=take_last, method='nlargest') + + def nsmallest(self, n=5, take_last=False): + """Return the smallest `n` elements. 
+ + Parameters + ---------- + n : int + Return this many ascending sorted values + take_last : bool + Where there are duplicate values, take the last duplicate + + Returns + ------- + bottom_n : Series + The n smallest values in the Series, in sorted order + + Notes + ----- + Faster than ``.order().head(n)`` for small `n` relative to + the size of the ``Series`` object. + + See Also + -------- + Series.nlargest + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> s = pd.Series(np.random.randn(1e6)) + >>> s.nsmallest(10) # only sorts up to the N requested + """ + return select_n(self, n=n, take_last=take_last, method='nsmallest') + + def sortlevel(self, level=0, ascending=True, sort_remaining=True): + """ + Sort Series with MultiIndex by chosen level. Data will be + lexicographically sorted by the chosen level followed by the other + levels (in order) + + Parameters + ---------- + level : int or level name, default None + ascending : bool, default True + + Returns + ------- + sorted : Series + """ + if not isinstance(self.index, MultiIndex): + raise TypeError('can only sort by level with a hierarchical index') + + new_index, indexer = self.index.sortlevel(level, ascending=ascending, + sort_remaining=sort_remaining) + new_values = self.values.take(indexer) + return self._constructor(new_values, + index=new_index).__finalize__(self) + + def swaplevel(self, i, j, copy=True): + """ + Swap levels i and j in a MultiIndex + + Parameters + ---------- + i, j : int, string (can be mixed) + Level of index to be swapped. Can pass level name as string. + + Returns + ------- + swapped : Series + """ + new_index = self.index.swaplevel(i, j) + return self._constructor(self.values, index=new_index, + copy=copy).__finalize__(self) + + def reorder_levels(self, order): + """ + Rearrange index levels using input order. May not drop or duplicate + levels + + Parameters + ---------- + order: list of int representing new level order. + (reference level by number or key) + axis: where to reorder levels + + Returns + ------- + type of caller (new object) + """ + if not isinstance(self.index, MultiIndex): # pragma: no cover + raise Exception('Can only reorder levels on a hierarchical axis.') + + result = self.copy() + result.index = result.index.reorder_levels(order) + return result + + def unstack(self, level=-1): + """ + Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame + + Parameters + ---------- + level : int, string, or list of these, default last level + Level(s) to unstack, can pass level name + + Examples + -------- + >>> s + one a 1. + one b 2. + two a 3. + two b 4. + + >>> s.unstack(level=-1) + a b + one 1. 2. + two 3. 4. + + >>> s.unstack(level=0) + one two + a 1. 2. + b 3. 4. 
+ + Returns + ------- + unstacked : DataFrame + """ + from pandas.core.reshape import unstack + return unstack(self, level) + + #---------------------------------------------------------------------- + # function application + + def map(self, arg, na_action=None): + """ + Map values of Series using input correspondence (which can be + a dict, Series, or function) + + Parameters + ---------- + arg : function, dict, or Series + na_action : {None, 'ignore'} + If 'ignore', propagate NA values + + Examples + -------- + >>> x + one 1 + two 2 + three 3 + + >>> y + 1 foo + 2 bar + 3 baz + + >>> x.map(y) + one foo + two bar + three baz + + Returns + ------- + y : Series + same index as caller + """ + values = self.values + if com.is_datetime64_dtype(values.dtype): + values = lib.map_infer(values, lib.Timestamp) + + if na_action == 'ignore': + mask = isnull(values) + + def map_f(values, f): + return lib.map_infer_mask(values, f, mask.view(pa.uint8)) + else: + map_f = lib.map_infer + + if isinstance(arg, (dict, Series)): + if isinstance(arg, dict): + arg = self._constructor(arg, index=arg.keys()) + + indexer = arg.index.get_indexer(values) + new_values = com.take_1d(arg.values, indexer) + return self._constructor(new_values, + index=self.index).__finalize__(self) + else: + mapped = map_f(values, arg) + return self._constructor(mapped, + index=self.index).__finalize__(self) + + def apply(self, func, convert_dtype=True, args=(), **kwds): + """ + Invoke function on values of Series. Can be ufunc (a NumPy function + that applies to the entire Series) or a Python function that only works + on single values + + Parameters + ---------- + func : function + convert_dtype : boolean, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object + args : tuple + Positional arguments to pass to function in addition to the value + Additional keyword arguments will be passed as keywords to the function + + See also + -------- + Series.map: For element-wise operations + + Returns + ------- + y : Series or DataFrame if func returns a Series + """ + if len(self) == 0: + return Series() + + if kwds or args and not isinstance(func, np.ufunc): + f = lambda x: func(x, *args, **kwds) + else: + f = func + + if isinstance(f, np.ufunc): + return f(self) + + values = _values_from_object(self) + if com.is_datetime64_dtype(values.dtype): + values = lib.map_infer(values, lib.Timestamp) + + mapped = lib.map_infer(values, f, convert=convert_dtype) + if len(mapped) and isinstance(mapped[0], Series): + from pandas.core.frame import DataFrame + return DataFrame(mapped.tolist(), index=self.index) + else: + return self._constructor(mapped, + index=self.index).__finalize__(self) + + def _reduce(self, op, axis=0, skipna=True, numeric_only=None, + filter_type=None, **kwds): + """ perform a reduction operation """ + return op(_values_from_object(self), skipna=skipna, **kwds) + + def _reindex_indexer(self, new_index, indexer, copy): + if indexer is None: + if copy: + return self.copy() + return self + + # be subclass-friendly + new_values = com.take_1d(self.get_values(), indexer) + return self._constructor(new_values, index=new_index) + + def _needs_reindex_multi(self, axes, method, level): + """ check if we do need a multi reindex; this is for compat with + higher dims + """ + return False + + @Appender(generic._shared_docs['rename'] % _shared_doc_kwargs) + def rename(self, index=None, **kwargs): + return super(Series, self).rename(index=index, **kwargs) + + 
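+
+    # Editor's note: an illustrative sketch of ``map`` and ``apply`` as
+    # defined above (not part of the original source; the Series ``s`` and
+    # the dict ``lookup`` are hypothetical):
+    #
+    #   >>> s = Series(['cat', 'dog', np.nan])
+    #   >>> lookup = {'cat': 'kitten', 'dog': 'puppy'}
+    #   >>> s.map(lookup)                     # dict lookup; NaN propagates
+    #   >>> s.map(len, na_action='ignore')    # ``len`` not applied to NaN
+    #   >>> Series([1, 2, 3]).apply(lambda x: x ** 2)   # elementwise func
+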
@Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs) + def reindex(self, index=None, **kwargs): + return super(Series, self).reindex(index=index, **kwargs) + + def reindex_axis(self, labels, axis=0, **kwargs): + """ for compatibility with higher dims """ + if axis != 0: + raise ValueError("cannot reindex series on non-zero axis!") + return self.reindex(index=labels, **kwargs) + + def take(self, indices, axis=0, convert=True, is_copy=False): + """ + Analogous to ndarray.take, return Series corresponding to requested + indices + + Parameters + ---------- + indices : list / array of ints + convert : translate negative to positive indices (default) + + Returns + ------- + taken : Series + """ + # check/convert indicies here + if convert: + indices = _maybe_convert_indices( + indices, len(self._get_axis(axis))) + + indices = com._ensure_platform_int(indices) + new_index = self.index.take(indices) + new_values = self.values.take(indices) + return self._constructor(new_values, + index=new_index).__finalize__(self) + + def isin(self, values): + """ + Return a boolean :class:`~pandas.Series` showing whether each element + in the :class:`~pandas.Series` is exactly contained in the passed + sequence of ``values``. + + Parameters + ---------- + values : list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + ``list`` of one element. + + Returns + ------- + isin : Series (bool dtype) + + Raises + ------ + TypeError + * If ``values`` is a string + + See Also + -------- + pandas.DataFrame.isin + + Examples + -------- + + >>> s = pd.Series(list('abc')) + >>> s.isin(['a', 'c', 'e']) + 0 True + 1 False + 2 True + dtype: bool + + Passing a single string as ``s.isin('a')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['a']) + 0 True + 1 False + 2 False + dtype: bool + + """ + if not com.is_list_like(values): + raise TypeError("only list-like objects are allowed to be passed" + " to Series.isin(), you passed a " + "{0!r}".format(type(values).__name__)) + + # may need i8 conversion for proper membership testing + comps = _values_from_object(self) + if com.is_datetime64_dtype(self): + from pandas.tseries.tools import to_datetime + values = Series(to_datetime(values)).values.view('i8') + comps = comps.view('i8') + elif com.is_timedelta64_dtype(self): + from pandas.tseries.timedeltas import to_timedelta + values = Series(to_timedelta(values)).values.view('i8') + comps = comps.view('i8') + + value_set = set(values) + result = lib.ismember(comps, value_set) + return self._constructor(result, index=self.index).__finalize__(self) + + def between(self, left, right, inclusive=True): + """ + Return boolean Series equivalent to left <= series <= right. NA values + will be treated as False + + Parameters + ---------- + left : scalar + Left boundary + right : scalar + Right boundary + + Returns + ------- + is_between : Series + """ + if inclusive: + lmask = self >= left + rmask = self <= right + else: + lmask = self > left + rmask = self < right + + return lmask & rmask + + @classmethod + def from_csv(cls, path, sep=',', parse_dates=True, header=None, + index_col=0, encoding=None, infer_datetime_format=False): + """ + Read delimited file into Series + + Parameters + ---------- + path : string file path or file handle / StringIO + sep : string, default ',' + Field delimiter + parse_dates : boolean, default True + Parse dates. 
Different default from read_table + header : int, default 0 + Row to use at header (skip prior rows) + index_col : int or sequence, default 0 + Column to use for index. If a sequence is given, a MultiIndex + is used. Different default from read_table + encoding : string, optional + a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + infer_datetime_format: boolean, default False + If True and `parse_dates` is True for a column, try to infer the + datetime format based on the first datetime string. If the format + can be inferred, there often will be a large parsing speed-up. + + Returns + ------- + y : Series + """ + from pandas.core.frame import DataFrame + df = DataFrame.from_csv(path, header=header, index_col=index_col, + sep=sep, parse_dates=parse_dates, + encoding=encoding, + infer_datetime_format=infer_datetime_format) + result = df.icol(0) + result.index.name = result.name = None + return result + + def to_csv(self, path, index=True, sep=",", na_rep='', + float_format=None, header=False, + index_label=None, mode='w', nanRep=None, encoding=None, + date_format=None): + """ + Write Series to a comma-separated values (csv) file + + Parameters + ---------- + path : string file path or file handle / StringIO + na_rep : string, default '' + Missing data representation + float_format : string, default None + Format string for floating point numbers + header : boolean, default False + Write out series name + index : boolean, default True + Write row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + mode : Python write mode, default 'w' + sep : character, default "," + Field delimiter for the output file. + encoding : string, optional + a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + date_format: string, default None + Format string for datetime objects. + """ + from pandas.core.frame import DataFrame + df = DataFrame(self) + df.to_csv(path, index=index, sep=sep, na_rep=na_rep, + float_format=float_format, header=header, + index_label=index_label, mode=mode, nanRep=nanRep, + encoding=encoding, date_format=date_format) + + def dropna(self, axis=0, inplace=False, **kwargs): + """ + Return Series without null values + + Returns + ------- + valid : Series + inplace : boolean, default False + Do operation in place. + """ + axis = self._get_axis_number(axis or 0) + result = remove_na(self) + if inplace: + self._update_inplace(result) + else: + return result + + valid = lambda self, inplace=False, **kwargs: self.dropna(inplace=inplace, + **kwargs) + + def first_valid_index(self): + """ + Return label for first non-NA/null value + """ + if len(self) == 0: + return None + + mask = isnull(self.values) + i = mask.argmin() + if mask[i]: + return None + else: + return self.index[i] + + def last_valid_index(self): + """ + Return label for last non-NA/null value + """ + if len(self) == 0: + return None + + mask = isnull(self.values[::-1]) + i = mask.argmin() + if mask[i]: + return None + else: + return self.index[len(self) - i - 1] + + #---------------------------------------------------------------------- + # Time series-oriented methods + + def asof(self, where): + """ + Return last good (non-NaN) value in TimeSeries if value is NaN for + requested date. 
+ + If there is no good value, NaN is returned. + + Parameters + ---------- + where : date or array of dates + + Notes + ----- + Dates are assumed to be sorted + + Returns + ------- + value or NaN + """ + if isinstance(where, compat.string_types): + where = datetools.to_datetime(where) + + values = self.values + + if not hasattr(where, '__iter__'): + start = self.index[0] + if isinstance(self.index, PeriodIndex): + where = Period(where, freq=self.index.freq).ordinal + start = start.ordinal + + if where < start: + return pa.NA + loc = self.index.searchsorted(where, side='right') + if loc > 0: + loc -= 1 + while isnull(values[loc]) and loc > 0: + loc -= 1 + return values[loc] + + if not isinstance(where, Index): + where = Index(where) + + locs = self.index.asof_locs(where, notnull(values)) + new_values = com.take_1d(values, locs) + return self._constructor(new_values, index=where).__finalize__(self) + + @cache_readonly + def str(self): + from pandas.core.strings import StringMethods + return StringMethods(self) + + def to_timestamp(self, freq=None, how='start', copy=True): + """ + Cast to datetimeindex of timestamps, at *beginning* of period + + Parameters + ---------- + freq : string, default frequency of PeriodIndex + Desired frequency + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end + + Returns + ------- + ts : TimeSeries with DatetimeIndex + """ + new_values = self.values + if copy: + new_values = new_values.copy() + + new_index = self.index.to_timestamp(freq=freq, how=how) + return self._constructor(new_values, + index=new_index).__finalize__(self) + + def to_period(self, freq=None, copy=True): + """ + Convert TimeSeries from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed) + + Parameters + ---------- + freq : string, default + + Returns + ------- + ts : TimeSeries with PeriodIndex + """ + new_values = self.values + if copy: + new_values = new_values.copy() + + new_index = self.index.to_period(freq=freq) + return self._constructor(new_values, + index=new_index).__finalize__(self) + +Series._setup_axes(['index'], info_axis=0, stat_axis=0, + aliases={'rows': 0}) +Series._add_numeric_operations() +_INDEX_TYPES = ndarray, Index, list, tuple + +#------------------------------------------------------------------------------ +# Supplementary functions + + +def remove_na(series): + """ + Return series containing only true/non-NaN values, possibly empty. 
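+
+    For example (an editorial, illustrative sketch; ``s`` is hypothetical):
+
+    >>> s = Series([1.0, np.nan, 3.0])
+    >>> remove_na(s)   # -> values [1.0, 3.0], keeping original labels 0 and 2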
+ """ + return series[notnull(_values_from_object(series))] + + +def _sanitize_array(data, index, dtype=None, copy=False, + raise_cast_failure=False): + if dtype is not None: + dtype = np.dtype(dtype) + + if isinstance(data, ma.MaskedArray): + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = _maybe_upcast(data, copy=True) + data[mask] = fill_value + else: + data = data.copy() + + def _try_cast(arr, take_fast_path): + + # perf shortcut as this is the most common case + if take_fast_path: + if _possibly_castable(arr) and not copy and dtype is None: + return arr + + try: + arr = _possibly_cast_to_datetime(arr, dtype) + subarr = pa.array(arr, dtype=dtype, copy=copy) + except (ValueError, TypeError): + if dtype is not None and raise_cast_failure: + raise + else: # pragma: no cover + subarr = pa.array(arr, dtype=object, copy=copy) + return subarr + + # GH #846 + if isinstance(data, (pa.Array, Series)): + subarr = np.array(data, copy=False) + if dtype is not None: + + # possibility of nan -> garbage + if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): + if not isnull(data).any(): + subarr = _try_cast(data, True) + elif copy: + subarr = data.copy() + else: + if (com.is_datetime64_dtype(data.dtype) and + not com.is_datetime64_dtype(dtype)): + if dtype == object: + ints = np.asarray(data).view('i8') + subarr = tslib.ints_to_pydatetime(ints) + elif raise_cast_failure: + raise TypeError('Cannot cast datetime64 to %s' % dtype) + else: + subarr = _try_cast(data, True) + else: + # don't coerce Index types + # e.g. indexes can have different conversions (so don't fast path them) + # GH 6140 + subarr = _try_cast(data, not isinstance(data, Index)) + + if copy: + subarr = data.copy() + + elif isinstance(data, list) and len(data) > 0: + if dtype is not None: + try: + subarr = _try_cast(data, False) + except Exception: + if raise_cast_failure: # pragma: no cover + raise + subarr = pa.array(data, dtype=object, copy=copy) + subarr = lib.maybe_convert_objects(subarr) + + else: + subarr = _possibly_convert_platform(data) + + subarr = _possibly_cast_to_datetime(subarr, dtype) + + else: + subarr = _try_cast(data, False) + + # scalar like + if subarr.ndim == 0: + if isinstance(data, list): # pragma: no cover + subarr = pa.array(data, dtype=object) + elif index is not None: + value = data + + # figure out the dtype from the value (upcast if necessary) + if dtype is None: + dtype, value = _infer_dtype_from_scalar(value) + else: + # need to possibly convert the value here + value = _possibly_cast_to_datetime(value, dtype) + + subarr = pa.empty(len(index), dtype=dtype) + subarr.fill(value) + + else: + return subarr.item() + + # the result that we want + elif subarr.ndim == 1: + if index is not None: + + # a 1-element ndarray + if len(subarr) != len(index) and len(subarr) == 1: + value = subarr[0] + subarr = pa.empty(len(index), dtype=subarr.dtype) + subarr.fill(value) + + elif subarr.ndim > 1: + if isinstance(data, pa.Array): + raise Exception('Data must be 1-dimensional') + else: + subarr = _asarray_tuplesafe(data, dtype=dtype) + + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. 
+ if issubclass(subarr.dtype.type, compat.string_types): + subarr = pa.array(data, dtype=object, copy=copy) + + return subarr + +# backwards compatiblity +TimeSeries = Series + +#---------------------------------------------------------------------- +# Add plotting methods to Series + +import pandas.tools.plotting as _gfx + +Series.plot = _gfx.plot_series +Series.hist = _gfx.hist_series + +# Add arithmetic! +ops.add_flex_arithmetic_methods(Series, **ops.series_flex_funcs) +ops.add_special_arithmetic_methods(Series, **ops.series_special_funcs) diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py new file mode 100644 index 00000000..84149e55 --- /dev/null +++ b/pandas/core/sparse.py @@ -0,0 +1,10 @@ +""" +Data structures for sparse float data. Life is made simpler by dealing only +with float64 data +""" + +# pylint: disable=W0611 + +from pandas.sparse.series import SparseSeries +from pandas.sparse.frame import SparseDataFrame +from pandas.sparse.panel import SparsePanel diff --git a/pandas/core/strings.py b/pandas/core/strings.py new file mode 100644 index 00000000..3e730942 --- /dev/null +++ b/pandas/core/strings.py @@ -0,0 +1,1039 @@ +import numpy as np + +from pandas.compat import zip +from pandas.core.common import isnull, _values_from_object +from pandas.core.series import Series +from pandas.core.frame import DataFrame +import pandas.compat as compat +import re +import pandas.lib as lib +import warnings +import textwrap + + +def _get_array_list(arr, others): + if len(others) and isinstance(others[0], (list, np.ndarray)): + arrays = [arr] + list(others) + else: + arrays = [arr, others] + + return [np.asarray(x, dtype=object) for x in arrays] + + +def str_cat(arr, others=None, sep=None, na_rep=None): + """ + Concatenate arrays of strings with given separator + + Parameters + ---------- + arr : list or array-like + others : list or array, or list of arrays + sep : string or None, default None + na_rep : string or None, default None + If None, an NA in any array will propagate + + Returns + ------- + concat : array + """ + if sep is None: + sep = '' + + if others is not None: + arrays = _get_array_list(arr, others) + + n = _length_check(arrays) + masks = np.array([isnull(x) for x in arrays]) + cats = None + + if na_rep is None: + na_mask = np.logical_or.reduce(masks, axis=0) + + result = np.empty(n, dtype=object) + np.putmask(result, na_mask, np.nan) + + notmask = ~na_mask + + tuples = zip(*[x[notmask] for x in arrays]) + cats = [sep.join(tup) for tup in tuples] + + result[notmask] = cats + else: + for i, x in enumerate(arrays): + x = np.where(masks[i], na_rep, x) + if cats is None: + cats = x + else: + cats = cats + sep + x + + result = cats + + return result + else: + arr = np.asarray(arr, dtype=object) + mask = isnull(arr) + if na_rep is None and mask.any(): + return np.nan + return sep.join(np.where(mask, na_rep, arr)) + + +def _length_check(others): + n = None + for x in others: + if n is None: + n = len(x) + elif len(x) != n: + raise ValueError('All arrays must be same length') + + return n + + +def _na_map(f, arr, na_result=np.nan, dtype=object): + # should really _check_ for NA + return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) + + +def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): + if not len(arr): + return np.ndarray(0, dtype=dtype) + + if isinstance(arr, Series): + arr = arr.values + if not isinstance(arr, np.ndarray): + arr = np.asarray(arr, dtype=object) + if na_mask: + mask = isnull(arr) + try: + result = lib.map_infer_mask(arr, f, 
mask.view(np.uint8)) + except (TypeError, AttributeError): + def g(x): + try: + return f(x) + except (TypeError, AttributeError): + return na_value + return _map(g, arr, dtype=dtype) + if na_value is not np.nan: + np.putmask(result, mask, na_value) + if result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + else: + return lib.map_infer(arr, f) + + +def str_title(arr): + """ + Convert strings to titlecased version + + Returns + ------- + titled : array + """ + return _na_map(lambda x: x.title(), arr) + + +def str_count(arr, pat, flags=0): + """ + Count occurrences of pattern in each string + + Parameters + ---------- + arr : list or array-like + pat : string, valid regular expression + flags : int, default 0 (no flags) + re module flags, e.g. re.IGNORECASE + + Returns + ------- + counts : arrays + """ + regex = re.compile(pat, flags=flags) + f = lambda x: len(regex.findall(x)) + return _na_map(f, arr, dtype=int) + + +def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): + """ + Check whether given pattern is contained in each string in the array + + Parameters + ---------- + pat : string + Character sequence or regular expression + case : boolean, default True + If True, case sensitive + flags : int, default 0 (no flags) + re module flags, e.g. re.IGNORECASE + na : default NaN, fill value for missing values. + regex : bool, default True + If True use re.search, otherwise use Python in operator + + Returns + ------- + Series of boolean values + + See Also + -------- + match : analagous, but stricter, relying on re.match instead of re.search + + """ + if regex: + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + if regex.groups > 0: + warnings.warn("This pattern has match groups. To actually get the" + " groups, use str.extract.", UserWarning) + + f = lambda x: bool(regex.search(x)) + else: + if case: + f = lambda x: pat in x + else: + upper_pat = pat.upper() + f = lambda x: upper_pat in x + return _na_map(f, str_upper(arr), na, dtype=bool) + return _na_map(f, arr, na, dtype=bool) + + +def str_startswith(arr, pat, na=np.nan): + """ + Return boolean array indicating whether each string starts with passed + pattern + + Parameters + ---------- + pat : string + Character sequence + na : bool, default NaN + + Returns + ------- + startswith : array (boolean) + """ + f = lambda x: x.startswith(pat) + return _na_map(f, arr, na, dtype=bool) + + +def str_endswith(arr, pat, na=np.nan): + """ + Return boolean array indicating whether each string ends with passed + pattern + + Parameters + ---------- + pat : string + Character sequence + na : bool, default NaN + + Returns + ------- + endswith : array (boolean) + """ + f = lambda x: x.endswith(pat) + return _na_map(f, arr, na, dtype=bool) + + +def str_lower(arr): + """ + Convert strings in array to lowercase + + Returns + ------- + lowercase : array + """ + return _na_map(lambda x: x.lower(), arr) + + +def str_upper(arr): + """ + Convert strings in array to uppercase + + Returns + ------- + uppercase : array + """ + return _na_map(lambda x: x.upper(), arr) + + +def str_replace(arr, pat, repl, n=-1, case=True, flags=0): + """ + Replace + + Parameters + ---------- + pat : string + Character sequence or regular expression + repl : string + Replacement sequence + n : int, default -1 (all) + Number of replacements to make from start + case : boolean, default True + If True, case sensitive + flags : int, default 0 (no flags) + re module flags, e.g. 
re.IGNORECASE + + Returns + ------- + replaced : array + """ + use_re = not case or len(pat) > 1 or flags + + if use_re: + if not case: + flags |= re.IGNORECASE + regex = re.compile(pat, flags=flags) + n = n if n >= 0 else 0 + + def f(x): + return regex.sub(repl, x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) + + return _na_map(f, arr) + + +def str_repeat(arr, repeats): + """ + Duplicate each string in the array by indicated number of times + + Parameters + ---------- + repeats : int or array + Same value for all (int) or different value per (array) + + Returns + ------- + repeated : array + """ + if np.isscalar(repeats): + def rep(x): + try: + return compat.binary_type.__mul__(x, repeats) + except TypeError: + return compat.text_type.__mul__(x, repeats) + + return _na_map(rep, arr) + else: + def rep(x, r): + try: + return compat.binary_type.__mul__(x, r) + except TypeError: + return compat.text_type.__mul__(x, r) + + repeats = np.asarray(repeats, dtype=object) + result = lib.vec_binop(_values_from_object(arr), repeats, rep) + return result + + +def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): + """ + Deprecated: Find groups in each string using passed regular expression. + If as_indexer=True, determine if each string matches a regular expression. + + Parameters + ---------- + pat : string + Character sequence or regular expression + case : boolean, default True + If True, case sensitive + flags : int, default 0 (no flags) + re module flags, e.g. re.IGNORECASE + na : default NaN, fill value for missing values. + as_indexer : False, by default, gives deprecated behavior better achieved + using str_extract. True return boolean indexer. + + Returns + ------- + Series of boolean values + if as_indexer=True + Series of tuples + if as_indexer=False, default but deprecated + + See Also + -------- + contains : analagous, but less strict, relying on re.search instead of + re.match + extract : now preferred to the deprecated usage of match (as_indexer=False) + + Notes + ----- + To extract matched groups, which is the deprecated behavior of match, use + str.extract. + """ + + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + if (not as_indexer) and regex.groups > 0: + # Do this first, to make sure it happens even if the re.compile + # raises below. + warnings.warn("In future versions of pandas, match will change to" + " always return a bool indexer.", UserWarning) + + if as_indexer and regex.groups > 0: + warnings.warn("This pattern has match groups. To actually get the" + " groups, use str.extract.", UserWarning) + + # If not as_indexer and regex.groups == 0, this returns empty lists + # and is basically useless, so we will not warn. + + if (not as_indexer) and regex.groups > 0: + dtype = object + def f(x): + m = regex.match(x) + if m: + return m.groups() + else: + return [] + else: + # This is the new behavior of str_match. + dtype = bool + f = lambda x: bool(regex.match(x)) + + return _na_map(f, arr, na, dtype=dtype) + + +def _get_single_group_name(rx): + try: + return list(rx.groupindex.keys()).pop() + except IndexError: + return None + + +def str_extract(arr, pat, flags=0): + """ + Find groups in each string using passed regular expression + + Parameters + ---------- + pat : string + Pattern or regular expression + flags : int, default 0 (no flags) + re module flags, e.g. 
re.IGNORECASE + + Returns + ------- + extracted groups : Series (one group) or DataFrame (multiple groups) + Note that dtype of the result is always object, even when no match is + found and the result is a Series or DataFrame containing only NaN + values. + + Examples + -------- + A pattern with one group will return a Series. Non-matches will be NaN. + + >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)') + 0 1 + 1 2 + 2 NaN + dtype: object + + A pattern with more than one group will return a DataFrame. + + >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN NaN + + A pattern may contain optional groups. + + >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN 3 + + Named groups will become column names in the result. + + >>> Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\d)') + letter digit + 0 a 1 + 1 b 2 + 2 NaN NaN + + """ + regex = re.compile(pat, flags=flags) + # just to be safe, check this + if regex.groups == 0: + raise ValueError("This pattern contains no groups to capture.") + empty_row = [np.nan]*regex.groups + def f(x): + if not isinstance(x, compat.string_types): + return empty_row + m = regex.search(x) + if m: + return [np.nan if item is None else item for item in m.groups()] + else: + return empty_row + if regex.groups == 1: + result = Series([f(val)[0] for val in arr], + name=_get_single_group_name(regex), + index=arr.index, dtype=object) + else: + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + if arr.empty: + result = DataFrame(columns=columns, dtype=object) + else: + result = DataFrame([f(val) for val in arr], + columns=columns, + index=arr.index, + dtype=object) + return result + + +def str_get_dummies(arr, sep='|'): + """ + Split each string by sep and return a frame of dummy/indicator variables. + + Examples + -------- + >>> Series(['a|b', 'a', 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 1 0 0 + 2 1 0 1 + + >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 0 0 0 + 2 1 0 1 + + See also ``pd.get_dummies``. + + """ + # TODO remove this hack? + arr = arr.fillna('') + try: + arr = sep + arr + sep + except TypeError: + arr = sep + arr.astype(str) + sep + + tags = set() + for ts in arr.str.split(sep): + tags.update(ts) + tags = sorted(tags - set([""])) + + dummies = np.empty((len(arr), len(tags)), dtype=np.int64) + + for i, t in enumerate(tags): + pat = sep + t + sep + dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x) + return DataFrame(dummies, arr.index, tags) + + +def str_join(arr, sep): + """ + Join lists contained as elements in array, a la str.join + + Parameters + ---------- + sep : string + Delimiter + + Returns + ------- + joined : array + """ + return _na_map(sep.join, arr) + + +def str_len(arr): + """ + Compute length of each string in array. + + Returns + ------- + lengths : array + """ + return _na_map(len, arr, dtype=int) + + +def str_findall(arr, pat, flags=0): + """ + Find all occurrences of pattern or regular expression + + Parameters + ---------- + pat : string + Pattern or regular expression + flags : int, default 0 (no flags) + re module flags, e.g. 
re.IGNORECASE + + Returns + ------- + matches : array + """ + regex = re.compile(pat, flags=flags) + return _na_map(regex.findall, arr) + + +def str_pad(arr, width, side='left'): + """ + Pad strings with whitespace + + Parameters + ---------- + arr : list or array-like + width : int + Minimum width of resulting string; additional characters will be filled + with spaces + side : {'left', 'right', 'both'}, default 'left' + + Returns + ------- + padded : array + """ + if side == 'left': + f = lambda x: x.rjust(width) + elif side == 'right': + f = lambda x: x.ljust(width) + elif side == 'both': + f = lambda x: x.center(width) + else: # pragma: no cover + raise ValueError('Invalid side') + + return _na_map(f, arr) + + +def str_center(arr, width): + """ + "Center" strings, filling left and right side with additional whitespace + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with spaces + + Returns + ------- + centered : array + """ + return str_pad(arr, width, side='both') + + +def str_split(arr, pat=None, n=None): + """ + Split each string (a la re.split) in array by given pattern, propagating NA + values + + Parameters + ---------- + pat : string, default None + String or regular expression to split on. If None, splits on whitespace + n : int, default None (all) + + Notes + ----- + Both 0 and -1 will be interpreted as return all splits + + Returns + ------- + split : array + """ + if pat is None: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if len(pat) == 1: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if n is None or n == -1: + n = 0 + regex = re.compile(pat) + f = lambda x: regex.split(x, maxsplit=n) + + return _na_map(f, arr) + + +def str_slice(arr, start=None, stop=None, step=1): + """ + Slice substrings from each element in array + + Parameters + ---------- + start : int or None + stop : int or None + + Returns + ------- + sliced : array + """ + obj = slice(start, stop, step) + f = lambda x: x[obj] + return _na_map(f, arr) + + +def str_slice_replace(arr, start=None, stop=None, repl=None): + """ + + Parameters + ---------- + + Returns + ------- + replaced : array + """ + raise NotImplementedError + + +def str_strip(arr, to_strip=None): + """ + Strip whitespace (including newlines) from each string in the array + + Parameters + ---------- + to_strip : str or unicode + + Returns + ------- + stripped : array + """ + return _na_map(lambda x: x.strip(to_strip), arr) + + +def str_lstrip(arr, to_strip=None): + """ + Strip whitespace (including newlines) from left side of each string in the + array + + Parameters + ---------- + to_strip : str or unicode + + Returns + ------- + stripped : array + """ + return _na_map(lambda x: x.lstrip(to_strip), arr) + + +def str_rstrip(arr, to_strip=None): + """ + Strip whitespace (including newlines) from right side of each string in the + array + + Parameters + ---------- + to_strip : str or unicode + + Returns + ------- + stripped : array + """ + return _na_map(lambda x: x.rstrip(to_strip), arr) + + +def str_wrap(arr, width, **kwargs): + """ + Wrap long strings to be formatted in paragraphs + + Parameters + ---------- + Same keyword parameters and defaults as :class:`textwrap.TextWrapper` + width : int + Maximum line-width + expand_tabs : bool, optional + If true, tab characters will be expanded to spaces (default: True) + replace_whitespace : bool, optional + If true, each whitespace character (as defined by string.whitespace) 
remaining + after tab expansion will be replaced by a single space (default: True) + drop_whitespace : bool, optional + If true, whitespace that, after wrapping, happens to end up at the beginning + or end of a line is dropped (default: True) + break_long_words : bool, optional + If true, then words longer than width will be broken in order to ensure that + no lines are longer than width. If it is false, long words will not be broken, + and some lines may be longer than width. (default: True) + break_on_hyphens : bool, optional + If true, wrapping will occur preferably on whitespace and right after hyphens + in compound words, as it is customary in English. If false, only whitespaces + will be considered as potentially good places for line breaks, but you need + to set break_long_words to false if you want truly insecable words. + (default: True) + + Returns + ------- + wrapped : array + + Notes + ----- + Internally, this method uses a :class:`textwrap.TextWrapper` instance with default + settings. To achieve behavior matching R's stringr library str_wrap function, use + the arguments: + + expand_tabs = False + replace_whitespace = True + drop_whitespace = True + break_long_words = False + break_on_hyphens = False + + Examples + -------- + + >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) + >>> s.str.wrap(12) + 0 line to be\nwrapped + 1 another line\nto be\nwrapped + """ + kwargs['width'] = width + + tw = textwrap.TextWrapper(**kwargs) + + return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr) + + +def str_get(arr, i): + """ + Extract element from lists, tuples, or strings in each element in the array + + Parameters + ---------- + i : int + Integer index (location) + + Returns + ------- + items : array + """ + f = lambda x: x[i] if len(x) > i else np.nan + return _na_map(f, arr) + + +def str_decode(arr, encoding, errors="strict"): + """ + Decode character string to unicode using indicated encoding + + Parameters + ---------- + encoding : string + errors : string + + Returns + ------- + decoded : array + """ + f = lambda x: x.decode(encoding, errors) + return _na_map(f, arr) + + +def str_encode(arr, encoding, errors="strict"): + """ + Encode character string to some other encoding using indicated encoding + + Parameters + ---------- + encoding : string + errors : string + + Returns + ------- + encoded : array + """ + f = lambda x: x.encode(encoding, errors) + return _na_map(f, arr) + + +def _noarg_wrapper(f): + def wrapper(self): + result = f(self.series) + return self._wrap_result(result) + + wrapper.__name__ = f.__name__ + if f.__doc__: + wrapper.__doc__ = f.__doc__ + + return wrapper + + +def _pat_wrapper(f, flags=False, na=False, **kwargs): + def wrapper1(self, pat): + result = f(self.series, pat) + return self._wrap_result(result) + + def wrapper2(self, pat, flags=0, **kwargs): + result = f(self.series, pat, flags=flags, **kwargs) + return self._wrap_result(result) + + def wrapper3(self, pat, na=np.nan): + result = f(self.series, pat, na=na) + return self._wrap_result(result) + + wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 + + wrapper.__name__ = f.__name__ + if f.__doc__: + wrapper.__doc__ = f.__doc__ + + return wrapper + + +def copy(source): + "Copy a docstring from another source function (if present)" + def do_copy(target): + if source.__doc__: + target.__doc__ = source.__doc__ + return target + return do_copy + + +class StringMethods(object): + + """ + Vectorized string functions for Series. 
NAs stay NA unless handled + otherwise by a particular method. Patterned after Python's string methods, + with some inspiration from R's stringr package. + + Examples + -------- + >>> s.str.split('_') + >>> s.str.replace('_', '') + """ + + def __init__(self, series): + self.series = series + + def __getitem__(self, key): + if isinstance(key, slice): + return self.slice(start=key.start, stop=key.stop, + step=key.step) + else: + return self.get(key) + + def __iter__(self): + i = 0 + g = self.get(i) + while g.notnull().any(): + yield g + i += 1 + g = self.get(i) + + def _wrap_result(self, result): + if not hasattr(result, 'ndim'): + return result + elif result.ndim == 1: + name = getattr(result, 'name', None) + return Series(result, index=self.series.index, + name=name or self.series.name) + else: + assert result.ndim < 3 + return DataFrame(result, index=self.series.index) + + @copy(str_cat) + def cat(self, others=None, sep=None, na_rep=None): + result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep) + return self._wrap_result(result) + + @copy(str_split) + def split(self, pat=None, n=-1): + result = str_split(self.series, pat, n=n) + return self._wrap_result(result) + + @copy(str_get) + def get(self, i): + result = str_get(self.series, i) + return self._wrap_result(result) + + @copy(str_join) + def join(self, sep): + result = str_join(self.series, sep) + return self._wrap_result(result) + + @copy(str_contains) + def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + result = str_contains(self.series, pat, case=case, flags=flags, + na=na, regex=regex) + return self._wrap_result(result) + + @copy(str_match) + def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False): + result = str_match(self.series, pat, case=case, flags=flags, + na=na, as_indexer=as_indexer) + return self._wrap_result(result) + + @copy(str_replace) + def replace(self, pat, repl, n=-1, case=True, flags=0): + result = str_replace(self.series, pat, repl, n=n, case=case, + flags=flags) + return self._wrap_result(result) + + @copy(str_repeat) + def repeat(self, repeats): + result = str_repeat(self.series, repeats) + return self._wrap_result(result) + + @copy(str_pad) + def pad(self, width, side='left'): + result = str_pad(self.series, width, side=side) + return self._wrap_result(result) + + @copy(str_center) + def center(self, width): + result = str_center(self.series, width) + return self._wrap_result(result) + + @copy(str_slice) + def slice(self, start=None, stop=None, step=1): + result = str_slice(self.series, start, stop) + return self._wrap_result(result) + + @copy(str_slice) + def slice_replace(self, i=None, j=None): + raise NotImplementedError + + @copy(str_decode) + def decode(self, encoding, errors="strict"): + result = str_decode(self.series, encoding, errors) + return self._wrap_result(result) + + @copy(str_encode) + def encode(self, encoding, errors="strict"): + result = str_encode(self.series, encoding, errors) + return self._wrap_result(result) + + @copy(str_strip) + def strip(self, to_strip=None): + result = str_strip(self.series, to_strip) + return self._wrap_result(result) + + @copy(str_lstrip) + def lstrip(self, to_strip=None): + result = str_lstrip(self.series, to_strip) + return self._wrap_result(result) + + @copy(str_rstrip) + def rstrip(self, to_strip=None): + result = str_rstrip(self.series, to_strip) + return self._wrap_result(result) + + @copy(str_wrap) + def wrap(self, width, **kwargs): + result = str_wrap(self.series, width, **kwargs) + return 
self._wrap_result(result) + + @copy(str_get_dummies) + def get_dummies(self, sep='|'): + result = str_get_dummies(self.series, sep) + return self._wrap_result(result) + + count = _pat_wrapper(str_count, flags=True) + startswith = _pat_wrapper(str_startswith, na=True) + endswith = _pat_wrapper(str_endswith, na=True) + findall = _pat_wrapper(str_findall, flags=True) + extract = _pat_wrapper(str_extract, flags=True) + + len = _noarg_wrapper(str_len) + lower = _noarg_wrapper(str_lower) + upper = _noarg_wrapper(str_upper) + title = _noarg_wrapper(str_title) diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd new file mode 100644 index 00000000..97b6687d --- /dev/null +++ b/pandas/hashtable.pxd @@ -0,0 +1,24 @@ +from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, int64_t, float64_t + +# prototypes for sharing + +cdef class HashTable: + pass + +cdef class Int64HashTable(HashTable): + cdef kh_int64_t *table + + cpdef get_item(self, int64_t val) + cpdef set_item(self, int64_t key, Py_ssize_t val) + +cdef class Float64HashTable(HashTable): + cdef kh_float64_t *table + + cpdef get_item(self, float64_t val) + cpdef set_item(self, float64_t key, Py_ssize_t val) + +cdef class PyObjectHashTable(HashTable): + cdef kh_pymap_t *table + + cpdef get_item(self, object val) + cpdef set_item(self, object key, Py_ssize_t val) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx new file mode 100644 index 00000000..cf9428d5 --- /dev/null +++ b/pandas/hashtable.pyx @@ -0,0 +1,1064 @@ +from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check + +from khash cimport * +from numpy cimport * + +from util cimport _checknan +cimport util + +import numpy as np + +ONAN = np.nan + +cimport cython +cimport numpy as cnp + +cnp.import_array() +cnp.import_ufunc() + +cdef int64_t iNaT = util.get_nat() + +cdef extern from "datetime.h": + bint PyDateTime_Check(object o) + void PyDateTime_IMPORT() + +PyDateTime_IMPORT + +cdef extern from "Python.h": + int PySlice_Check(object) + + +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. 
Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr + + +cdef size_t _INIT_VEC_CAP = 32 + +cdef class ObjectVector: + + cdef: + size_t n, m + ndarray ao + PyObject **data + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = self.ao.data + + def __len__(self): + return self.n + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + cdef inline append(self, object o): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + Py_INCREF(o) + self.data[self.n] = o + self.n += 1 + + +cdef class Int64Vector: + + cdef: + size_t n, m + ndarray ao + int64_t *data + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=np.int64) + self.data = self.ao.data + + def __len__(self): + return self.n + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + cdef inline append(self, int64_t x): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + self.data[self.n] = x + self.n += 1 + +cdef class Float64Vector: + + cdef: + size_t n, m + ndarray ao + float64_t *data + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=np.float64) + self.data = self.ao.data + + def __len__(self): + return self.n + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + cdef inline append(self, float64_t x): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + self.data[self.n] = x + self.n += 1 + + +cdef class HashTable: + pass + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + def __cinit__(self, int size_hint=1): + self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cdef inline int check_type(self, object val): + return util.is_string_object(val) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int 
ret = 0 + object val + char *buf + khiter_t k + ObjectVector uniques = ObjectVector() + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + count += 1 + uniques.append(val) + + # return None + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + # return None + return reverse, labels + +cdef class Int32HashTable(HashTable): + cdef kh_int32_t *table + + def __init__(self, size_hint=1): + if size_hint is not None: + kh_resize_int32(self.table, size_hint) + + def __cinit__(self): + self.table = kh_init_int32() + + def __dealloc__(self): + kh_destroy_int32(self.table) + + cdef inline int check_type(self, object val): + return util.is_string_object(val) + + cpdef get_item(self, int32_t val): + cdef khiter_t k + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, int32_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, int32_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_int32(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_int32(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int32_t val + khiter_t k + + for i in range(n): + val = values[i] + k = kh_put_int32(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int32_t val + khiter_t k + ndarray[int32_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def factorize(self, ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + int32_t val + khiter_t k + + for i in range(n): + val = values[i] + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int32(self.table, val, &ret) + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + # return None + return reverse, labels + +cdef class Int64HashTable: #(HashTable): + # cdef kh_int64_t *table + + def __cinit__(self, size_hint=1): + self.table = kh_init_int64() + if size_hint is not None: + kh_resize_int64(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_int64(self.table) + + def __contains__(self, object key): + cdef khiter_t 
k + k = kh_get_int64(self.table, key) + return k != self.table.n_buckets + + def __len__(self): + return self.table.size + + cpdef get_item(self, int64_t val): + cdef khiter_t k + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, int64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, int64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_int64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_int64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map(self, ndarray[int64_t] keys, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t key + khiter_t k + + for i in range(n): + key = keys[i] + k = kh_put_int64(self.table, key, &ret) + self.table.vals[k] = values[i] + + def map_locations(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + + for i in range(n): + val = values[i] + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def factorize(self, ndarray[object] values): + reverse = {} + labels = self.get_labels(values, reverse, 0) + return reverse, labels + + def get_labels(self, ndarray[int64_t] values, Int64Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + int64_t val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return labels + + def get_labels_groupby(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + Py_ssize_t idx, count = 0 + int ret = 0 + int64_t val + khiter_t k + Int64Vector uniques = Int64Vector() + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return labels, arr_uniques + + def unique(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + ndarray result + int64_t val + khiter_t k + Int64Vector uniques = Int64Vector() + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k == self.table.n_buckets: + k = kh_put_int64(self.table, val, &ret) + uniques.append(val) + count += 1 + + result = uniques.to_array() + + return result + + +cdef class 
Float64HashTable(HashTable): + def __cinit__(self, size_hint=1): + self.table = kh_init_float64() + if size_hint is not None: + kh_resize_float64(self.table, size_hint) + + def __len__(self): + return self.table.size + + cpdef get_item(self, float64_t val): + cdef khiter_t k + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + cpdef set_item(self, float64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_float64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_float64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def __dealloc__(self): + kh_destroy_float64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_float64(self.table, key) + return k != self.table.n_buckets + + def factorize(self, ndarray[float64_t] values): + uniques = Float64Vector() + labels = self.get_labels(values, uniques, 0, -1) + return uniques.to_array(), labels + + def get_labels(self, ndarray[float64_t] values, + Float64Vector uniques, + Py_ssize_t count_prior, int64_t na_sentinel): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + float64_t val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + + if val != val: + labels[i] = na_sentinel + continue + + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return labels + + def map_locations(self, ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + khiter_t k + + for i in range(n): + k = kh_put_float64(self.table, values[i], &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def unique(self, ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + float64_t val + khiter_t k + Float64Vector uniques = Float64Vector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(self.table, val) + if k == self.table.n_buckets: + k = kh_put_float64(self.table, val, &ret) + uniques.append(val) + count += 1 + elif not seen_na: + seen_na = 1 + uniques.append(ONAN) + + return uniques.to_array() + +na_sentinel = object + +cdef class PyObjectHashTable(HashTable): + # cdef kh_pymap_t *table + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + def destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + if val != val or val is None: + val = na_sentinel + k = kh_get_pymap(self.table, val) + if k != 
self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + if key != key or key is None: + key = na_sentinel + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def lookup2(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + long hval + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + # for i in range(n): + # val = values[i] + # hval = PyObject_Hash(val) + # k = kh_get_pymap(self.table, val) + + return locs + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + object val + ndarray result + khiter_t k + ObjectVector uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + k = kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(ONAN) + + result = uniques.to_array() + + return result + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + + if val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return labels + + +cdef class Factorizer: + cdef public PyObjectHashTable table + cdef public ObjectVector uniques + cdef public Py_ssize_t count + + def __init__(self, size_hint): + self.table = PyObjectHashTable(size_hint) + self.uniques = ObjectVector() + self.count = 0 + + def get_count(self): + return self.count + + def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1): + """ + Factorize values with nans replaced by na_sentinel + >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) + array([ 0, 1, 20]) + """ + labels = self.table.get_labels(values, 
self.uniques, + self.count, na_sentinel) + mask = (labels == na_sentinel) + # sort on + if sort: + if labels.dtype != np.int_: + labels = labels.astype(np.int_) + sorter = self.uniques.to_array().argsort() + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + labels = reverse_indexer.take(labels, mode='clip') + labels[mask] = na_sentinel + self.count = len(self.uniques) + return labels + + def unique(self, ndarray[object] values): + # just for fun + return self.table.unique(values) + + +cdef class Int64Factorizer: + cdef public Int64HashTable table + cdef public Int64Vector uniques + cdef public Py_ssize_t count + + def __init__(self, size_hint): + self.table = Int64HashTable(size_hint) + self.uniques = Int64Vector() + self.count = 0 + + def get_count(self): + return self.count + + def factorize(self, ndarray[int64_t] values, sort=False, + na_sentinel=-1): + labels = self.table.get_labels(values, self.uniques, + self.count, na_sentinel) + + # sort on + if sort: + if labels.dtype != np.int_: + labels = labels.astype(np.int_) + + sorter = self.uniques.to_array().argsort() + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + labels = reverse_indexer.take(labels) + + self.count = len(self.uniques) + return labels + + +cdef build_count_table_int64(ndarray[int64_t] values, kh_int64_t *table): + cdef: + int k + Py_ssize_t i, n = len(values) + int ret = 0 + + kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + k = kh_get_int64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_int64(table, val, &ret) + table.vals[k] = 1 + + +cpdef value_count_int64(ndarray[int64_t] values): + cdef: + Py_ssize_t i + kh_int64_t *table + int ret = 0 + int k + + table = kh_init_int64() + build_count_table_int64(values, table) + + i = 0 + result_keys = np.empty(table.n_occupied, dtype=np.int64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_int64(table) + + return result_keys, result_counts + + +cdef build_count_table_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + kh_pymap_t *table): + cdef: + int k + Py_ssize_t i, n = len(values) + int ret = 0 + + kh_resize_pymap(table, n // 10) + + for i in range(n): + if mask[i]: + continue + + val = values[i] + k = kh_get_pymap(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_pymap(table, val, &ret) + table.vals[k] = 1 + + +cpdef value_count_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask): + cdef: + Py_ssize_t i = len(values) + kh_pymap_t *table + int k + + table = kh_init_pymap() + build_count_table_object(values, mask, table) + + i = 0 + result_keys = np.empty(table.n_occupied, dtype=object) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + for k in range(table.n_buckets): + if kh_exist_pymap(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_pymap(table) + + return result_keys, result_counts + + +def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): + cdef: + int count, max_count = 2 + int j = -1 # so you can do += + int k + Py_ssize_t i, n = len(values) + kh_pymap_t *table + int ret = 0 + + table = kh_init_pymap() + build_count_table_object(values, mask, table) + + modes = 
np.empty(table.n_buckets, dtype=np.object_) + for k in range(table.n_buckets): + if kh_exist_pymap(table, k): + count = table.vals[k] + + if count == max_count: + j += 1 + elif count > max_count: + max_count = count + j = 0 + else: + continue + modes[j] = table.keys[k] + + kh_destroy_pymap(table) + + return modes[:j+1] + + +def mode_int64(ndarray[int64_t] values): + cdef: + int val, max_val = 2 + int j = -1 # so you can do += + int k + kh_int64_t *table + list uniques = [] + + table = kh_init_int64() + + build_count_table_int64(values, table) + + modes = np.empty(table.n_buckets, dtype=np.int64) + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + val = table.vals[k] + + if val == max_val: + j += 1 + elif val > max_val: + max_val = val + j = 0 + else: + continue + modes[j] = table.keys[k] + + kh_destroy_int64(table) + + return modes[:j+1] diff --git a/pandas/index.pyx b/pandas/index.pyx new file mode 100644 index 00000000..3dcdbf20 --- /dev/null +++ b/pandas/index.pyx @@ -0,0 +1,616 @@ +from numpy cimport ndarray + +from numpy cimport (float64_t, int32_t, int64_t, uint8_t, + NPY_DATETIME) +cimport cython + +cimport numpy as cnp + +cnp.import_array() +cnp.import_ufunc() + +cimport util + +import numpy as np + +cimport tslib +from hashtable cimport * +from pandas import algos, tslib, hashtable as _hash +from pandas.tslib import Timestamp + +from datetime cimport (get_datetime64_value, _pydatetime_to_dts, + pandas_datetimestruct) + +from cpython cimport PyTuple_Check, PyList_Check + +cdef extern from "datetime.h": + bint PyDateTime_Check(object o) + void PyDateTime_IMPORT() + +cdef int64_t iNaT = util.get_nat() + +try: + from dateutil.tz import tzutc as _du_utc + import pytz + UTC = pytz.utc + have_pytz = True +except ImportError: + have_pytz = False + +PyDateTime_IMPORT + +cdef extern from "Python.h": + int PySlice_Check(object) + + +cdef inline is_definitely_invalid_key(object val): + if PyTuple_Check(val): + try: + hash(val) + except TypeError: + return True + + # we have a _data, means we are a NDFrame + return (PySlice_Check(val) or cnp.PyArray_Check(val) + or PyList_Check(val) or hasattr(val,'_data')) + +def get_value_at(ndarray arr, object loc): + if arr.descr.type_num == NPY_DATETIME: + return Timestamp(util.get_value_at(arr, loc)) + return util.get_value_at(arr, loc) + +def set_value_at(ndarray arr, object loc, object val): + return util.set_value_at(arr, loc, val) + + +# Don't populate hash tables in monotonic indexes larger than this +_SIZE_CUTOFF = 1000000 + + +cdef class IndexEngine: + + cdef readonly: + object vgetter + HashTable mapping + bint over_size_threshold + + cdef: + bint unique, monotonic + bint initialized, monotonic_check, unique_check + + def __init__(self, vgetter, n): + self.vgetter = vgetter + + self.over_size_threshold = n >= _SIZE_CUTOFF + + self.initialized = 0 + self.monotonic_check = 0 + + self.unique = 0 + self.monotonic = 0 + + def __contains__(self, object val): + self._ensure_mapping_populated() + hash(val) + return val in self.mapping + + cpdef get_value(self, ndarray arr, object key): + ''' + arr : 1-dimensional ndarray + ''' + cdef: + object loc + void* data_ptr + + loc = self.get_loc(key) + if PySlice_Check(loc) or cnp.PyArray_Check(loc): + return arr[loc] + else: + if arr.descr.type_num == NPY_DATETIME: + return Timestamp(util.get_value_at(arr, loc)) + return util.get_value_at(arr, loc) + + cpdef set_value(self, ndarray arr, object key, object value): + ''' + arr : 1-dimensional ndarray + ''' + cdef: + object loc + void* data_ptr + + 
loc = self.get_loc(key) + value = convert_scalar(arr, value) + + if PySlice_Check(loc) or cnp.PyArray_Check(loc): + arr[loc] = value + else: + util.set_value_at(arr, loc, value) + + cpdef get_loc(self, object val): + if is_definitely_invalid_key(val): + raise TypeError + + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + loc = _bin_search(values, val) # .searchsorted(val, side='left') + if util.get_value_at(values, loc) != val: + raise KeyError(val) + return loc + + self._ensure_mapping_populated() + if not self.unique: + return self._get_loc_duplicates(val) + + self._check_type(val) + + try: + return self.mapping.get_item(val) + except TypeError: + raise KeyError(val) + + cdef inline _get_loc_duplicates(self, object val): + cdef: + Py_ssize_t diff + + if self.is_monotonic: + values = self._get_index_values() + left = values.searchsorted(val, side='left') + right = values.searchsorted(val, side='right') + + diff = right - left + if diff == 0: + raise KeyError(val) + elif diff == 1: + return left + else: + return slice(left, right) + else: + return self._maybe_get_bool_indexer(val) + + cdef _maybe_get_bool_indexer(self, object val): + cdef: + ndarray[uint8_t] indexer + ndarray[object] values + int count = 0 + Py_ssize_t i, n + int last_true + + values = self._get_index_values() + n = len(values) + + result = np.empty(n, dtype=bool) + indexer = result.view(np.uint8) + + for i in range(n): + if values[i] == val: + count += 1 + indexer[i] = 1 + last_true = i + else: + indexer[i] = 0 + + if count == 0: + raise KeyError(val) + if count == 1: + return last_true + + return result + + property is_unique: + + def __get__(self): + if not self.unique_check: + self._do_unique_check() + + return self.unique == 1 + + property is_monotonic: + + def __get__(self): + if not self.monotonic_check: + self._do_monotonic_check() + + return self.monotonic == 1 + + cdef inline _do_monotonic_check(self): + try: + values = self._get_index_values() + self.monotonic, unique = self._call_monotonic(values) + + if unique is not None: + self.unique = unique + self.unique_check = 1 + + except TypeError: + self.monotonic = 0 + self.monotonic_check = 1 + + cdef _get_index_values(self): + return self.vgetter() + + cdef inline _do_unique_check(self): + self._ensure_mapping_populated() + + def _call_monotonic(self, values): + raise NotImplementedError + + cdef _make_hash_table(self, n): + raise NotImplementedError + + cdef _check_type(self, object val): + hash(val) + + cdef inline _ensure_mapping_populated(self): + if not self.initialized: + self.initialize() + + cdef initialize(self): + values = self._get_index_values() + + self.mapping = self._make_hash_table(len(values)) + self.mapping.map_locations(values) + + if len(self.mapping) == len(values): + self.unique = 1 + self.unique_check = 1 + + self.initialized = 1 + + def clear_mapping(self): + self.mapping = None + self.initialized = 0 + + def get_indexer(self, values): + self._ensure_mapping_populated() + return self.mapping.lookup(values) + + def get_indexer_non_unique(self, targets): + """ return an indexer suitable for takng from a non unique index + return the labels in the same order ast the target + and a missing indexer into the targets (which correspond + to the -1 indicies in the results """ + + cdef: + ndarray values, x + ndarray[int64_t] result, missing + set stargets + dict d = {} + object val + int count = 0, count_missing = 0 + Py_ssize_t i, j, n, n_t, 
n_alloc + + self._ensure_mapping_populated() + values = self._get_index_values() + stargets = set(targets) + n = len(values) + n_t = len(targets) + if n > 10000: + n_alloc = 10000 + else: + n_alloc = n + + result = np.empty(n_alloc, dtype=np.int64) + missing = np.empty(n_t, dtype=np.int64) + + # form the set of the results (like ismember) + members = np.empty(n, dtype=np.uint8) + for i in range(n): + val = util.get_value_1d(values, i) + if val in stargets: + if val not in d: + d[val] = [] + d[val].append(i) + + for i in range(n_t): + + val = util.get_value_1d(targets, i) + + # found + if val in d: + for j in d[val]: + + # realloc if needed + if count >= n_alloc: + n_alloc += 10000 + result = np.resize(result, n_alloc) + + result[count] = j + count += 1 + + # value not found + else: + + if count >= n_alloc: + n_alloc += 10000 + result = np.resize(result, n_alloc) + result[count] = -1 + count += 1 + missing[count_missing] = i + count_missing += 1 + + return result[0:count], missing[0:count_missing] + +cdef class Int64Engine(IndexEngine): + + cdef _get_index_values(self): + return algos.ensure_int64(self.vgetter()) + + cdef _make_hash_table(self, n): + return _hash.Int64HashTable(n) + + def _call_monotonic(self, values): + return algos.is_monotonic_int64(values) + + def get_pad_indexer(self, other, limit=None): + return algos.pad_int64(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + return algos.backfill_int64(self._get_index_values(), other, + limit=limit) + + cdef _check_type(self, object val): + hash(val) + if util.is_bool_object(val): + raise KeyError(val) + elif util.is_float_object(val): + raise KeyError(val) + + cdef _maybe_get_bool_indexer(self, object val): + cdef: + ndarray[uint8_t, cast=True] indexer + ndarray[int64_t] values + int count = 0 + Py_ssize_t i, n + int64_t ival + int last_true + + if not util.is_integer_object(val): + raise KeyError(val) + + ival = val + + values = self._get_index_values() + n = len(values) + + result = np.empty(n, dtype=bool) + indexer = result.view(np.uint8) + + for i in range(n): + if values[i] == val: + count += 1 + indexer[i] = 1 + last_true = i + else: + indexer[i] = 0 + + if count == 0: + raise KeyError(val) + if count == 1: + return last_true + + return result + +cdef class Float64Engine(IndexEngine): + + cdef _make_hash_table(self, n): + return _hash.Float64HashTable(n) + + cdef _get_index_values(self): + return algos.ensure_float64(self.vgetter()) + + cdef _maybe_get_bool_indexer(self, object val): + cdef: + ndarray[uint8_t] indexer + ndarray[float64_t] values + int count = 0 + Py_ssize_t i, n + int last_true + + values = self._get_index_values() + n = len(values) + + result = np.empty(n, dtype=bool) + indexer = result.view(np.uint8) + + for i in range(n): + if values[i] == val: + count += 1 + indexer[i] = 1 + last_true = i + else: + indexer[i] = 0 + + if count == 0: + raise KeyError(val) + if count == 1: + return last_true + + return result + + def _call_monotonic(self, values): + return algos.is_monotonic_float64(values) + + def get_pad_indexer(self, other, limit=None): + return algos.pad_float64(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + return algos.backfill_float64(self._get_index_values(), other, + limit=limit) + + +cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: + cdef: + Py_ssize_t mid, lo = 0, hi = len(values) - 1 + object pval + + if hi >= 0 and val > util.get_value_at(values, hi): + return 
len(values) + + while lo < hi: + mid = (lo + hi) // 2 + pval = util.get_value_at(values, mid) + if val < pval: + hi = mid + elif val > pval: + lo = mid + 1 + else: + while mid > 0 and val == util.get_value_at(values, mid - 1): + mid -= 1 + return mid + + if val <= util.get_value_at(values, mid): + return mid + else: + return mid + 1 + +_pad_functions = { + 'object' : algos.pad_object, + 'int64' : algos.pad_int64, + 'float64' : algos.pad_float64 +} + +_backfill_functions = { + 'object': algos.backfill_object, + 'int64': algos.backfill_int64, + 'float64': algos.backfill_float64 +} + +cdef class ObjectEngine(IndexEngine): + + cdef _make_hash_table(self, n): + return _hash.PyObjectHashTable(n) + + def _call_monotonic(self, values): + return algos.is_monotonic_object(values) + + def get_pad_indexer(self, other, limit=None): + return algos.pad_object(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + return algos.backfill_object(self._get_index_values(), other, + limit=limit) + + +cdef class DatetimeEngine(Int64Engine): + + def __contains__(self, object val): + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + conv = _to_i8(val) + loc = values.searchsorted(conv, side='left') + return util.get_value_at(values, loc) == conv + + self._ensure_mapping_populated() + return _to_i8(val) in self.mapping + + cdef _get_index_values(self): + return self.vgetter().view('i8') + + def _call_monotonic(self, values): + return algos.is_monotonic_int64(values) + + cpdef get_loc(self, object val): + if is_definitely_invalid_key(val): + raise TypeError + + # Welcome to the spaghetti factory + + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + val = _to_i8(val) + return self._get_loc_duplicates(val) + values = self._get_index_values() + conv = _to_i8(val) + loc = values.searchsorted(conv, side='left') + if loc == len(values) or util.get_value_at(values, loc) != conv: + raise KeyError(val) + return loc + + self._ensure_mapping_populated() + if not self.unique: + val = _to_i8(val) + return self._get_loc_duplicates(val) + + try: + return self.mapping.get_item(val.value) + except KeyError: + raise KeyError(val) + except AttributeError: + pass + + try: + val = _to_i8(val) + return self.mapping.get_item(val) + except TypeError: + self._date_check_type(val) + raise KeyError(val) + + cdef inline _date_check_type(self, object val): + hash(val) + if not util.is_integer_object(val): + raise KeyError(val) + + def get_indexer(self, values): + self._ensure_mapping_populated() + if values.dtype != 'M8[ns]': + return np.repeat(-1, len(values)).astype('i4') + values = np.asarray(values).view('i8') + return self.mapping.lookup(values) + + def get_pad_indexer(self, other, limit=None): + if other.dtype != 'M8[ns]': + return np.repeat(-1, len(other)).astype('i4') + other = np.asarray(other).view('i8') + return algos.pad_int64(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + if other.dtype != 'M8[ns]': + return np.repeat(-1, len(other)).astype('i4') + other = np.asarray(other).view('i8') + return algos.backfill_int64(self._get_index_values(), other, + limit=limit) + + +cpdef convert_scalar(ndarray arr, object value): + if arr.descr.type_num == NPY_DATETIME: + if isinstance(value,np.ndarray): + pass + elif isinstance(value, Timestamp): + return value.value + elif value is None or value != value: + 
return iNaT + else: + return Timestamp(value).value + + if issubclass(arr.dtype.type, (np.integer, np.bool_)): + if util.is_float_object(value) and value != value: + raise ValueError('Cannot assign nan to integer series') + + return value + +cdef inline _to_i8(object val): + cdef pandas_datetimestruct dts + try: + return val.value + except AttributeError: + if util.is_datetime64_object(val): + return get_datetime64_value(val) + elif PyDateTime_Check(val): + tzinfo = getattr(val, 'tzinfo', None) + ival = _pydatetime_to_dts(val, &dts) # Save the original date value so we can get the utcoffset from it. + if tzinfo is not None and not _is_utc(tzinfo): + offset = tslib._get_utcoffset(tzinfo, val) + ival -= tslib._delta_to_nanoseconds(offset) + return ival + return val + +cdef inline bint _is_utc(object tz): + return tz is UTC or isinstance(tz, _du_utc) diff --git a/pandas/info.py b/pandas/info.py new file mode 100644 index 00000000..754741c1 --- /dev/null +++ b/pandas/info.py @@ -0,0 +1,20 @@ +""" +pandas - a powerful data analysis and manipulation library for Python +===================================================================== + +See http://pandas.sourceforge.net for full documentation. Otherwise, see the +docstrings of the various objects in the pandas namespace: + +Series +DataFrame +Panel +Index +DatetimeIndex +HDFStore +bdate_range +date_range +read_csv +read_fwf +read_table +ols +""" diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/io/api.py b/pandas/io/api.py new file mode 100644 index 00000000..5fa8c7ef --- /dev/null +++ b/pandas/io/api.py @@ -0,0 +1,15 @@ +""" +Data IO api +""" + +from pandas.io.parsers import read_csv, read_table, read_fwf +from pandas.io.clipboard import read_clipboard +from pandas.io.excel import ExcelFile, ExcelWriter, read_excel +from pandas.io.pytables import HDFStore, Term, get_store, read_hdf +from pandas.io.json import read_json +from pandas.io.html import read_html +from pandas.io.sql import read_sql, read_sql_table, read_sql_query +from pandas.io.stata import read_stata +from pandas.io.pickle import read_pickle, to_pickle +from pandas.io.packers import read_msgpack, to_msgpack +from pandas.io.gbq import read_gbq diff --git a/pandas/io/auth.py b/pandas/io/auth.py new file mode 100644 index 00000000..74b6b130 --- /dev/null +++ b/pandas/io/auth.py @@ -0,0 +1,123 @@ +from __future__ import print_function +# see LICENSES directory for copyright and license +import os +import sys +import logging + +import httplib2 + +import apiclient.discovery as gapi +import gflags +import oauth2client.file as auth_file +import oauth2client.client as oauth +import oauth2client.tools as tools +OOB_CALLBACK_URN = oauth.OOB_CALLBACK_URN + + +class AuthenticationConfigError(ValueError): + pass + +FLOWS = {} +FLAGS = gflags.FLAGS +DEFAULT_SECRETS = os.path.join( + os.path.dirname(__file__), 'client_secrets.json') +DEFAULT_SCOPE = 'https://www.googleapis.com/auth/analytics.readonly' +DEFAULT_TOKEN_FILE = os.path.join(os.path.dirname(__file__), 'analytics.dat') +MISSING_CLIENT_MSG = """ +WARNING: Please configure OAuth 2.0 + +You need to populate the client_secrets.json file found at: + + %s + +with information from the APIs Console . 
+ +""" +DOC_URL = ('https://developers.google.com/api-client-library/python/guide/' + 'aaa_client_secrets') + +gflags.DEFINE_enum('logging_level', 'ERROR', + ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + 'Set the level of logging detail.') + +# Name of file that will store the access and refresh tokens to access +# the API without having to login each time. Make sure this file is in +# a secure place. + + +def process_flags(flags=[]): + """Uses the command-line flags to set the logging level. + + Args: + argv: List of command line arguments passed to the python script. + """ + + # Let the gflags module process the command-line arguments. + try: + FLAGS(flags) + except gflags.FlagsError as e: + print('%s\nUsage: %s ARGS\n%s' % (e, str(flags), FLAGS)) + sys.exit(1) + + # Set the logging according to the command-line flag. + logging.getLogger().setLevel(getattr(logging, FLAGS.logging_level)) + + +def get_flow(secret, scope, redirect): + """ + Retrieve an authentication flow object based on the given + configuration in the secret file name, the authentication scope, + and a redirect URN + """ + key = (secret, scope, redirect) + flow = FLOWS.get(key, None) + if flow is None: + msg = MISSING_CLIENT_MSG % secret + if not os.path.exists(secret): + raise AuthenticationConfigError(msg) + flow = oauth.flow_from_clientsecrets(secret, scope, + redirect_uri=redirect, + message=msg) + FLOWS[key] = flow + return flow + + +def make_token_store(fpath=None): + """create token storage from give file name""" + if fpath is None: + fpath = DEFAULT_TOKEN_FILE + return auth_file.Storage(fpath) + + +def authenticate(flow, storage=None): + """ + Try to retrieve a valid set of credentials from the token store if possible + Otherwise use the given authentication flow to obtain new credentials + and return an authenticated http object + + Parameters + ---------- + flow : authentication workflow + storage: token storage, default None + """ + http = httplib2.Http() + + # Prepare credentials, and authorize HTTP object with them. + credentials = storage.get() + if credentials is None or credentials.invalid: + credentials = tools.run(flow, storage) + + http = credentials.authorize(http) + return http + + +def init_service(http): + """ + Use the given http object to build the analytics service object + """ + return gapi.build('analytics', 'v3', http=http) + + +def reset_default_token_store(): + import os + os.remove(DEFAULT_TOKEN_FILE) diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py new file mode 100644 index 00000000..204eeab7 --- /dev/null +++ b/pandas/io/clipboard.py @@ -0,0 +1,97 @@ +""" io on the clipboard """ +from pandas import compat, get_option, DataFrame +from pandas.compat import StringIO + + +def read_clipboard(**kwargs): # pragma: no cover + """ + Read text from clipboard and pass to read_table. See read_table for the + full argument list + + If unspecified, `sep` defaults to '\s+' + + Returns + ------- + parsed : DataFrame + """ + from pandas.util.clipboard import clipboard_get + from pandas.io.parsers import read_table + text = clipboard_get() + + # try to decode (if needed on PY3) + # Strange. 
linux py33 doesn't complain, win py33 does + if compat.PY3: + try: + text = compat.bytes_to_str( + text, encoding=(kwargs.get('encoding') or + get_option('display.encoding')) + ) + except: + pass + + # Excel copies into clipboard with \t seperation + # inspect no more then the 10 first lines, if they + # all contain an equal number (>0) of tabs, infer + # that this came from excel and set 'sep' accordingly + lines = text[:10000].split('\n')[:-1][:10] + + # Need to remove leading white space, since read_table + # accepts: + # a b + # 0 1 2 + # 1 3 4 + + counts = set([x.lstrip().count('\t') for x in lines]) + if len(lines)>1 and len(counts) == 1 and counts.pop() != 0: + kwargs['sep'] = '\t' + + if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None: + kwargs['sep'] = '\s+' + + return read_table(StringIO(text), **kwargs) + + +def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover + """ + Attempt to write text representation of object to the system clipboard + The clipboard can be then pasted into Excel for example. + + Parameters + ---------- + obj : the object to write to the clipboard + excel : boolean, defaults to True + if True, use the provided separator, writing in a csv + format for allowing easy pasting into excel. + if False, write a string representation of the object + to the clipboard + sep : optional, defaults to tab + other keywords are passed to to_csv + + Notes + ----- + Requirements for your platform + - Linux: xclip, or xsel (with gtk or PyQt4 modules) + - Windows: + - OS X: + """ + from pandas.util.clipboard import clipboard_set + if excel is None: + excel = True + + if excel: + try: + if sep is None: + sep = '\t' + buf = StringIO() + obj.to_csv(buf, sep=sep, **kwargs) + clipboard_set(buf.getvalue()) + return + except: + pass + + if isinstance(obj, DataFrame): + # str(df) has various unhelpful defaults, like truncation + objstr = obj.to_string() + else: + objstr = str(obj) + clipboard_set(objstr) diff --git a/pandas/io/common.py b/pandas/io/common.py new file mode 100644 index 00000000..daf441f2 --- /dev/null +++ b/pandas/io/common.py @@ -0,0 +1,167 @@ +"""Common IO api utilities""" + +import sys +import zipfile +from contextlib import contextmanager, closing + +from pandas.compat import StringIO +from pandas import compat + + +if compat.PY3: + from urllib.request import urlopen, pathname2url + _urlopen = urlopen + from urllib.parse import urlparse as parse_url + import urllib.parse as compat_parse + from urllib.parse import (uses_relative, uses_netloc, uses_params, + urlencode, urljoin) + from urllib.error import URLError + from http.client import HTTPException +else: + from urllib2 import urlopen as _urlopen + from urllib import urlencode, pathname2url + from urlparse import urlparse as parse_url + from urlparse import uses_relative, uses_netloc, uses_params, urljoin + from urllib2 import URLError + from httplib import HTTPException + from contextlib import contextmanager, closing + from functools import wraps + + # @wraps(_urlopen) + @contextmanager + def urlopen(*args, **kwargs): + with closing(_urlopen(*args, **kwargs)) as f: + yield f + + +_VALID_URLS = set(uses_relative + uses_netloc + uses_params) +_VALID_URLS.discard('') + + +class PerformanceWarning(Warning): + pass + + +class DtypeWarning(Warning): + pass + + +def _is_url(url): + """Check to see if a URL has a valid protocol. + + Parameters + ---------- + url : str or unicode + + Returns + ------- + isurl : bool + If `url` has a valid protocol return True otherwise False. 
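# --- Illustrative sketch, not part of the imported pandas source ---
# The URL detection described above boils down to parsing the string and
# checking whether its scheme is on an accepted list. The scheme set below is
# an assumption for the example; the imported code builds its own list from
# urllib's uses_relative/uses_netloc/uses_params tables (Python 3 assumed).
from urllib.parse import urlparse

_EXAMPLE_SCHEMES = {"http", "https", "ftp", "file", "s3"}

def looks_like_url(candidate):
    """Return True when `candidate` parses with a recognised scheme."""
    try:
        return urlparse(candidate).scheme in _EXAMPLE_SCHEMES
    except (TypeError, ValueError):
        return False

# looks_like_url("http://example.com/data.csv") -> True
# looks_like_url("/home/user/data.csv")         -> False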
+ """ + try: + return parse_url(url).scheme in _VALID_URLS + except: + return False + + +def _is_s3_url(url): + """Check for an s3 url""" + try: + return parse_url(url).scheme == 's3' + except: + return False + + +def maybe_read_encoded_stream(reader, encoding=None): + """read an encoded stream from the reader and transform the bytes to + unicode if required based on the encoding + + Parameters + ---------- + reader : a streamable file-like object + encoding : optional, the encoding to attempt to read + + Returns + ------- + a tuple of (a stream of decoded bytes, the encoding which was used) + + """ + + if compat.PY3 or encoding is not None: # pragma: no cover + if encoding: + errors = 'strict' + else: + errors = 'replace' + encoding = 'utf-8' + reader = StringIO(reader.read().decode(encoding, errors)) + else: + encoding = None + return reader, encoding + + +def get_filepath_or_buffer(filepath_or_buffer, encoding=None): + """ + If the filepath_or_buffer is a url, translate and return the buffer + passthru otherwise. + + Parameters + ---------- + filepath_or_buffer : a url, filepath, or buffer + encoding : the encoding to use to decode py3 bytes, default is 'utf-8' + + Returns + ------- + a filepath_or_buffer, the encoding + """ + + if _is_url(filepath_or_buffer): + req = _urlopen(str(filepath_or_buffer)) + return maybe_read_encoded_stream(req, encoding) + + if _is_s3_url(filepath_or_buffer): + try: + import boto + except: + raise ImportError("boto is required to handle s3 files") + # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY + # are environment variables + parsed_url = parse_url(filepath_or_buffer) + + try: + conn = boto.connect_s3() + except boto.exception.NoAuthHandlerFound: + conn = boto.connect_s3(anon=True) + + b = conn.get_bucket(parsed_url.netloc) + k = boto.s3.key.Key(b) + k.key = parsed_url.path + filepath_or_buffer = StringIO(k.get_contents_as_string()) + return filepath_or_buffer, None + + return filepath_or_buffer, None + + +def file_path_to_url(path): + """ + converts an absolute native path to a FILE URL. 
+ + Parameters + ---------- + path : a path in native format + + Returns + ------- + a valid FILE URL + """ + return urljoin('file:', pathname2url(path)) + + +# ZipFile is not a context manager for <= 2.6 +# must be tuple index here since 2.6 doesn't use namedtuple for version_info +if sys.version_info[1] <= 6: + @contextmanager + def ZipFile(*args, **kwargs): + with closing(zipfile.ZipFile(*args, **kwargs)) as zf: + yield zf +else: + ZipFile = zipfile.ZipFile diff --git a/pandas/io/data.py b/pandas/io/data.py new file mode 100644 index 00000000..c40b91ff --- /dev/null +++ b/pandas/io/data.py @@ -0,0 +1,1203 @@ +""" +Module contains tools for collecting data from various remote sources + + +""" +import warnings +import tempfile +import datetime as dt +import time + +from collections import defaultdict + +import numpy as np + +from pandas.compat import( + StringIO, bytes_to_str, range, lrange, lmap, zip +) +import pandas.compat as compat +from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime +from pandas.core.common import is_list_like, PandasError +from pandas.io.parsers import TextParser +from pandas.io.common import urlopen, ZipFile, urlencode +from pandas.tseries.offsets import MonthBegin +from pandas.util.testing import _network_error_classes + + +class SymbolWarning(UserWarning): + pass + + +class RemoteDataError(PandasError, IOError): + pass + + +def DataReader(name, data_source=None, start=None, end=None, + retry_count=3, pause=0.001): + """ + Imports data from a number of online sources. + + Currently supports Yahoo! Finance, Google Finance, St. Louis FED (FRED) + and Kenneth French's data library. + + Parameters + ---------- + name : str or list of strs + the name of the dataset. Some data sources (yahoo, google, fred) will + accept a list of names. + data_source: str + the data source ("yahoo", "google", "fred", or "ff") + start : {datetime, None} + left boundary for range (defaults to 1/1/2010) + end : {datetime, None} + right boundary for range (defaults to today) + + Examples + ---------- + + # Data from Yahoo! 
Finance + gs = DataReader("GS", "yahoo") + + # Data from Google Finance + aapl = DataReader("AAPL", "google") + + # Data from FRED + vix = DataReader("VIXCLS", "fred") + + # Data from Fama/French + ff = DataReader("F-F_Research_Data_Factors", "famafrench") + ff = DataReader("F-F_Research_Data_Factors_weekly", "famafrench") + ff = DataReader("6_Portfolios_2x3", "famafrench") + ff = DataReader("F-F_ST_Reversal_Factor", "famafrench") + """ + start, end = _sanitize_dates(start, end) + + if data_source == "yahoo": + return get_data_yahoo(symbols=name, start=start, end=end, + adjust_price=False, chunksize=25, + retry_count=retry_count, pause=pause) + elif data_source == "google": + return get_data_google(symbols=name, start=start, end=end, + adjust_price=False, chunksize=25, + retry_count=retry_count, pause=pause) + elif data_source == "fred": + return get_data_fred(name, start, end) + elif data_source == "famafrench": + return get_data_famafrench(name) + + +def _sanitize_dates(start, end): + from pandas.core.datetools import to_datetime + start = to_datetime(start) + end = to_datetime(end) + if start is None: + start = dt.datetime(2010, 1, 1) + if end is None: + end = dt.datetime.today() + return start, end + + +def _in_chunks(seq, size): + """ + Return sequence in 'chunks' of size defined by size + """ + return (seq[pos:pos + size] for pos in range(0, len(seq), size)) + + +_yahoo_codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', + 'time': 't1', 'short_ratio': 's7'} + + +_YAHOO_QUOTE_URL = 'http://finance.yahoo.com/d/quotes.csv?' + + +def get_quote_yahoo(symbols): + """ + Get current yahoo quote + + Returns a DataFrame + """ + if isinstance(symbols, compat.string_types): + sym_list = symbols + else: + sym_list = '+'.join(symbols) + + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm + request = ''.join(compat.itervalues(_yahoo_codes)) # code request string + header = list(_yahoo_codes.keys()) + + data = defaultdict(list) + + url_str = _YAHOO_QUOTE_URL + 's=%s&f=%s' % (sym_list, request) + + with urlopen(url_str) as url: + lines = url.readlines() + + for line in lines: + fields = line.decode('utf-8').strip().split(',') + for i, field in enumerate(fields): + if field[-2:] == '%"': + v = float(field.strip('"%')) + elif field[0] == '"': + v = field.strip('"') + else: + try: + v = float(field) + except ValueError: + v = np.nan + data[header[i]].append(v) + + idx = data.pop('symbol') + return DataFrame(data, index=idx) + + +def get_quote_google(symbols): + raise NotImplementedError("Google Finance doesn't have this functionality") + + +def _retry_read_url(url, retry_count, pause, name): + for _ in range(retry_count): + time.sleep(pause) + + # kludge to close the socket ASAP + try: + with urlopen(url) as resp: + lines = resp.read() + except _network_error_classes: + pass + else: + rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, + parse_dates=True)[::-1] + # Yahoo! Finance sometimes does this awesome thing where they + # return 2 rows for the most recent business day + if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover + rs = rs[:-1] + return rs + + raise IOError("after %d tries, %s did not " + "return a 200 for url %r" % (retry_count, name, url)) + + +_HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?' + + +def _get_hist_yahoo(sym, start, end, retry_count, pause): + """ + Get historical data for the given name from yahoo. + Date format is datetime + + Returns a DataFrame. 
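# --- Illustrative sketch, not part of the imported pandas source ---
# Core shape of the retry-with-pause download loop used by _retry_read_url
# above: pause, attempt, retry on failure, give up after retry_count tries.
# `fetch` and the exception type are placeholders for the example; the real
# code retries only on a specific set of network error classes.
import time

def fetch_with_retries(fetch, retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)        # back off before every attempt
        try:
            return fetch()       # any callable returning the parsed result
        except IOError:          # stand-in for the network error classes
            pass
    raise IOError("no successful response after %d tries" % retry_count)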
+ """ + start, end = _sanitize_dates(start, end) + url = (_HISTORICAL_YAHOO_URL + 's=%s' % sym + + '&a=%s' % (start.month - 1) + + '&b=%s' % start.day + + '&c=%s' % start.year + + '&d=%s' % (end.month - 1) + + '&e=%s' % end.day + + '&f=%s' % end.year + + '&g=d' + + '&ignore=.csv') + return _retry_read_url(url, retry_count, pause, 'Yahoo!') + + +_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?' + + +def _get_hist_google(sym, start, end, retry_count, pause): + """ + Get historical data for the given name from google. + Date format is datetime + + Returns a DataFrame. + """ + start, end = _sanitize_dates(start, end) + + # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv + url = "%s%s" % (_HISTORICAL_GOOGLE_URL, + urlencode({"q": sym, + "startdate": start.strftime('%b %d, ' '%Y'), + "enddate": end.strftime('%b %d, %Y'), + "output": "csv"})) + return _retry_read_url(url, retry_count, pause, 'Google') + + +def _adjust_prices(hist_data, price_list=None): + """ + Return modifed DataFrame or Panel with adjusted prices based on + 'Adj Close' price. Adds 'Adj_Ratio' column. + """ + if price_list is None: + price_list = 'Open', 'High', 'Low', 'Close' + adj_ratio = hist_data['Adj Close'] / hist_data['Close'] + + data = hist_data.copy() + for item in price_list: + data[item] = hist_data[item] * adj_ratio + data['Adj_Ratio'] = adj_ratio + del data['Adj Close'] + return data + + +def _calc_return_index(price_df): + """ + Return a returns index from a input price df or series. Initial value + (typically NaN) is set to 1. + """ + df = price_df.pct_change().add(1).cumprod() + mask = df.ix[1].notnull() & df.ix[0].isnull() + df.ix[0][mask] = 1 + + # Check for first stock listings after starting date of index in ret_index + # If True, find first_valid_index and set previous entry to 1. + if (~mask).any(): + for sym in mask.index[~mask]: + tstamp = df[sym].first_valid_index() + t_idx = df.index.get_loc(tstamp) - 1 + df[sym].ix[t_idx] = 1 + + return df + + +_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?' + + +def get_components_yahoo(idx_sym): + """ + Returns DataFrame containing list of component information for + index represented in idx_sym from yahoo. Includes component symbol + (ticker), exchange, and name. 
+ + Parameters + ---------- + idx_sym : str + Stock index symbol + Examples: + '^DJI' (Dow Jones Industrial Average) + '^NYA' (NYSE Composite) + '^IXIC' (NASDAQ Composite) + + See: http://finance.yahoo.com/indices for other index symbols + + Returns + ------- + idx_df : DataFrame + """ + stats = 'snx' + # URL of form: + # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv + url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}' + + idx_mod = idx_sym.replace('^', '@%5E') + url_str = url.format(idx_mod, stats, 1) + + idx_df = DataFrame() + mask = [True] + comp_idx = 1 + + # LOOP across component index structure, + # break when no new components are found + while True in mask: + url_str = url.format(idx_mod, stats, comp_idx) + with urlopen(url_str) as resp: + raw = resp.read() + lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"') + lines = [line.strip().split('","') for line in lines] + + temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) + temp_df = temp_df.drop_duplicates() + temp_df = temp_df.set_index('ticker') + mask = ~temp_df.index.isin(idx_df.index) + + comp_idx = comp_idx + 50 + idx_df = idx_df.append(temp_df[mask]) + + return idx_df + + +def _dl_mult_symbols(symbols, start, end, chunksize, retry_count, pause, + method): + stocks = {} + for sym_group in _in_chunks(symbols, chunksize): + for sym in sym_group: + try: + stocks[sym] = method(sym, start, end, retry_count, pause) + except IOError: + warnings.warn('Failed to read symbol: {0!r}, replacing with ' + 'NaN.'.format(sym), SymbolWarning) + stocks[sym] = np.nan + + try: + return Panel(stocks).swapaxes('items', 'minor') + except AttributeError: + # cannot construct a panel with just 1D nans indicating no data + raise RemoteDataError("No data fetched using " + "{0!r}".format(method.__name__)) + + +_source_functions = {'google': _get_hist_google, 'yahoo': _get_hist_yahoo} + + +def _get_data_from(symbols, start, end, retry_count, pause, adjust_price, + ret_index, chunksize, source): + + src_fn = _source_functions[source] + + # If a single symbol, (e.g., 'GOOG') + if isinstance(symbols, (compat.string_types, int)): + hist_data = src_fn(symbols, start, end, retry_count, pause) + # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) + elif isinstance(symbols, DataFrame): + hist_data = _dl_mult_symbols(symbols.index, start, end, chunksize, + retry_count, pause, src_fn) + else: + hist_data = _dl_mult_symbols(symbols, start, end, chunksize, + retry_count, pause, src_fn) + if source.lower() == 'yahoo': + if ret_index: + hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) + if adjust_price: + hist_data = _adjust_prices(hist_data) + + return hist_data + + +def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, + pause=0.001, adjust_price=False, ret_index=False, + chunksize=25): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Yahoo! Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, array-like object (list, tuple, Series), or DataFrame + Single stock symbol (ticker), array-like object of symbols or + DataFrame with index containing stock symbols. + start : string, (defaults to '1/1/2010') + Starting date, timestamp. Parses many different kind of date + representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') + end : string, (defaults to today) + Ending date, timestamp. 
Same format as starting date. + retry_count : int, default 3 + Number of times to retry query request. + pause : int, default 0 + Time, in seconds, to pause between consecutive queries of chunks. If + single value given for symbol, represents the pause between retries. + adjust_price : bool, default False + If True, adjusts all prices in hist_data ('Open', 'High', 'Low', + 'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops + 'Adj Close'. + ret_index : bool, default False + If True, includes a simple return index 'Ret_Index' in hist_data. + chunksize : int, default 25 + Number of symbols to download consecutively before intiating pause. + + Returns + ------- + hist_data : DataFrame (str) or Panel (array-like object, DataFrame) + """ + return _get_data_from(symbols, start, end, retry_count, pause, + adjust_price, ret_index, chunksize, 'yahoo') + + +def get_data_google(symbols=None, start=None, end=None, retry_count=3, + pause=0.001, adjust_price=False, ret_index=False, + chunksize=25): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Google Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, array-like object (list, tuple, Series), or DataFrame + Single stock symbol (ticker), array-like object of symbols or + DataFrame with index containing stock symbols. + start : string, (defaults to '1/1/2010') + Starting date, timestamp. Parses many different kind of date + representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') + end : string, (defaults to today) + Ending date, timestamp. Same format as starting date. + retry_count : int, default 3 + Number of times to retry query request. + pause : int, default 0 + Time, in seconds, to pause between consecutive queries of chunks. If + single value given for symbol, represents the pause between retries. + chunksize : int, default 25 + Number of symbols to download consecutively before intiating pause. + + Returns + ------- + hist_data : DataFrame (str) or Panel (array-like object, DataFrame) + """ + return _get_data_from(symbols, start, end, retry_count, pause, + adjust_price, ret_index, chunksize, 'google') + + +_FRED_URL = "http://research.stlouisfed.org/fred2/series/" + + +def get_data_fred(name, start=dt.datetime(2010, 1, 1), + end=dt.datetime.today()): + """ + Get data for the given name from the St. Louis FED (FRED). + Date format is datetime + + Returns a DataFrame. + + If multiple names are passed for "series" then the index of the + DataFrame is the outer join of the indicies of each series. + """ + start, end = _sanitize_dates(start, end) + + if not is_list_like(name): + names = [name] + else: + names = name + + urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for + n in names] + + def fetch_data(url, name): + with urlopen(url) as resp: + data = read_csv(resp, index_col=0, parse_dates=True, + header=None, skiprows=1, names=["DATE", name], + na_values='.') + try: + return data.truncate(start, end) + except KeyError: + if data.ix[3].name[7:12] == 'Error': + raise IOError("Failed to get the data. 
Check that {0!r} is " + "a valid FRED series.".format(name)) + raise + df = concat([fetch_data(url, n) for url, n in zip(urls, names)], + axis=1, join='outer') + return df + + +_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' + + +def get_data_famafrench(name): + # path of zip files + zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name) + + with urlopen(zip_file_path) as url: + raw = url.read() + + with tempfile.TemporaryFile() as tmpf: + tmpf.write(raw) + + with ZipFile(tmpf, 'r') as zf: + data = zf.open(zf.namelist()[0]).readlines() + + line_lengths = np.array(lmap(len, data)) + file_edges = np.where(line_lengths == 2)[0] + + datasets = {} + edges = zip(file_edges + 1, file_edges[1:]) + for i, (left_edge, right_edge) in enumerate(edges): + dataset = [d.split() for d in data[left_edge:right_edge]] + if len(dataset) > 10: + ncol_raw = np.array(lmap(len, dataset)) + ncol = np.median(ncol_raw) + header_index = np.where(ncol_raw == ncol - 1)[0][-1] + header = dataset[header_index] + ds_header = dataset[header_index + 1:] + # to ensure the header is unique + header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header, + start=1)] + index = np.array([d[0] for d in ds_header], dtype=int) + dataset = np.array([d[1:] for d in ds_header], dtype=float) + datasets[i] = DataFrame(dataset, index, columns=header) + + return datasets + + +# Items needed for options class +CUR_MONTH = dt.datetime.now().month +CUR_YEAR = dt.datetime.now().year +CUR_DAY = dt.datetime.now().day + +def _unpack(row, kind): + def _parse_row_values(val): + ret = val.text_content() + if 'neg_arrow' in val.xpath('.//@class'): + try: + ret = float(ret.replace(',', ''))*(-1.0) + except ValueError: + ret = np.nan + + return ret + + els = row.xpath('.//%s' % kind) + return [_parse_row_values(val) for val in els] + +def _parse_options_data(table): + rows = table.xpath('.//tr') + header = _unpack(rows[0], kind='th') + data = [_unpack(row, kind='td') for row in rows[1:]] + # Use ',' as a thousands separator as we're pulling from the US site. + return TextParser(data, names=header, na_values=['N/A'], + thousands=',').get_chunk() + + +def _two_char_month(s): + return '{0:0>2}'.format(s) + + +class Options(object): + """ + ***Experimental*** + This class fetches call/put data for a given stock/expiry month. + + It is instantiated with a string representing the ticker symbol. + + The class has the following methods: + get_options_data:(month, year, expiry) + get_call_data:(month, year, expiry) + get_put_data: (month, year, expiry) + get_near_stock_price(opt_frame, above_below) + get_all_data(call, put) + get_forward_data(months, call, put) (deprecated) + + Examples + -------- + # Instantiate object with ticker + >>> aapl = Options('aapl', 'yahoo') + + # Fetch May 2014 call data + >>> expiry = datetime.date(2014, 5, 1) + >>> calls = aapl.get_call_data(expiry=expiry) + + # Can now access aapl.calls instance variable + >>> aapl.calls + + # Fetch May 2014 put data + >>> puts = aapl.get_put_data(expiry=expiry) + + # Can now access aapl.puts instance variable + >>> aapl.puts + + # cut down the call data to be 3 below and 3 above the stock price. 
+ >>> cut_calls = aapl.get_near_stock_price(call=True, above_below=3) + + # Fetch call and put data with expiry from now to 8 months out + >>> forward_data = aapl.get_forward_data(8, call=True, put=True) + + # Fetch all call and put data + >>> all_data = aapl.get_all_data() + """ + + _TABLE_LOC = {'calls': 9, 'puts': 13} + + def __init__(self, symbol, data_source=None): + """ Instantiates options_data with a ticker saved as symbol """ + self.symbol = symbol.upper() + if data_source is None: + warnings.warn("Options(symbol) is deprecated, use Options(symbol," + " data_source) instead", FutureWarning) + data_source = "yahoo" + if data_source != "yahoo": + raise NotImplementedError("currently only yahoo supported") + + def get_options_data(self, month=None, year=None, expiry=None): + """ + ***Experimental*** + Gets call/put data for the stock with the expiration data in the + given month and year + + Parameters + ---------- + expiry: datetime.date, optional(default=None) + The date when options expire (defaults to current month) + + Returns + ------- + pandas.DataFrame + A DataFrame with requested options data. + + Index: + Strike: Option strike, int + Expiry: Option expiry, datetime.date + Type: Call or Put, string + Symbol: Option symbol as reported on Yahoo, string + Columns: + Last: Last option price, float + Chg: Change from prior day, float + Bid: Bid price, float + Ask: Ask price, float + Vol: Volume traded, int64 + Open_Int: Open interest, int64 + IsNonstandard: True if the the deliverable is not 100 shares, otherwise false + Underlying: Ticker of the underlying security, string + Underlying_Price: Price of the underlying security, float64 + Quote_Time: Time of the quote, Timestamp + + Notes + ----- + Note: Format of returned data frame is dependent on Yahoo and may change. + + When called, this function will add instance variables named + calls and puts. See the following example: + + >>> aapl = Options('aapl', 'yahoo') # Create object + >>> aapl.calls # will give an AttributeError + >>> aapl.get_options() # Get data and set ivars + >>> aapl.calls # Doesn't throw AttributeError + + Also note that aapl.calls and appl.puts will always be the calls + and puts for the next expiry. If the user calls this method with + a different month or year, the ivar will be named callsMMYY or + putsMMYY where MM and YY are, respectively, two digit + representations of the month and year for the expiry of the + options. 
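# --- Illustrative sketch, not part of the imported pandas source ---
# How the callsMMYY / putsMMYY instance-variable names described above are
# formed from an expiry date: MM is the zero-padded month, YY the two-digit
# year. `expiry_attr_name` is a hypothetical helper for this example.
import datetime as dt

def expiry_attr_name(kind, expiry):
    # kind is 'calls' or 'puts'
    return "%s%02d%s" % (kind, expiry.month, str(expiry.year)[-2:])

# expiry_attr_name('calls', dt.date(2014, 5, 1)) -> 'calls0514'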
+ """ + return concat([f(month, year, expiry) + for f in (self.get_put_data, + self.get_call_data)]).sortlevel() + + _OPTIONS_BASE_URL = 'http://finance.yahoo.com/q/op?s={sym}' + + def _get_option_tables(self, expiry): + root = self._get_option_page_from_yahoo(expiry) + tables = self._parse_option_page_from_yahoo(root) + m1 = _two_char_month(expiry.month) + table_name = '_tables' + m1 + str(expiry.year)[-2:] + setattr(self, table_name, tables) + return tables + + def _get_option_page_from_yahoo(self, expiry): + + url = self._OPTIONS_BASE_URL.format(sym=self.symbol) + + m1 = _two_char_month(expiry.month) + + # if this month use other url + if expiry.month == CUR_MONTH and expiry.year == CUR_YEAR: + url += '+Options' + else: + url += '&m={year}-{m1}'.format(year=expiry.year, m1=m1) + + root = self._parse_url(url) + return root + + def _parse_option_page_from_yahoo(self, root): + + tables = root.xpath('.//table') + ntables = len(tables) + if ntables == 0: + raise RemoteDataError("No tables found") + + try: + self.underlying_price, self.quote_time = self._get_underlying_price(root) + except IndexError: + self.underlying_price, self.quote_time = np.nan, np.nan + + return tables + + def _get_underlying_price(self, root): + underlying_price = float(root.xpath('.//*[@class="time_rtq_ticker"]')[0]\ + .getchildren()[0].text) + + #Gets the time of the quote, note this is actually the time of the underlying price. + quote_time_text = root.xpath('.//*[@class="time_rtq"]')[0].getchildren()[0].text + if quote_time_text: + #weekend and prior to market open time format + split = quote_time_text.split(",") + timesplit = split[1].strip().split(":") + timestring = split[0] + ", " + timesplit[0].zfill(2) + ":" + timesplit[1] + quote_time = dt.datetime.strptime(timestring, "%b %d, %H:%M%p EDT") + quote_time = quote_time.replace(year=CUR_YEAR) + else: + quote_time_text = root.xpath('.//*[@class="time_rtq"]')[0].getchildren()[0].getchildren()[0].text + quote_time = dt.datetime.strptime(quote_time_text, "%H:%M%p EDT") + quote_time = quote_time.replace(year=CUR_YEAR, month=CUR_MONTH, day=CUR_DAY) + + return underlying_price, quote_time + + + def _get_option_data(self, month, year, expiry, name): + year, month, expiry = self._try_parse_dates(year, month, expiry) + m1 = _two_char_month(month) + table_name = '_tables' + m1 + str(year)[-2:] + + try: + tables = getattr(self, table_name) + except AttributeError: + tables = self._get_option_tables(expiry) + + ntables = len(tables) + table_loc = self._TABLE_LOC[name] + if table_loc - 1 > ntables: + raise RemoteDataError("Table location {0} invalid, {1} tables" + " found".format(table_loc, ntables)) + + option_data = _parse_options_data(tables[table_loc]) + option_data['Type'] = name[:-1] + option_data = self._process_data(option_data, name[:-1]) + + if month == CUR_MONTH and year == CUR_YEAR: + setattr(self, name, option_data) + + name += m1 + str(year)[-2:] + setattr(self, name, option_data) + return option_data + + def get_call_data(self, month=None, year=None, expiry=None): + """ + ***Experimental*** + Gets call/put data for the stock with the expiration data in the + given month and year + + Parameters + ---------- + expiry: datetime.date, optional(default=None) + The date when options expire (defaults to current month) + + Returns + ------- + call_data: pandas.DataFrame + A DataFrame with requested options data. 
+ + Index: + Strike: Option strike, int + Expiry: Option expiry, datetime.date + Type: Call or Put, string + Symbol: Option symbol as reported on Yahoo, string + Columns: + Last: Last option price, float + Chg: Change from prior day, float + Bid: Bid price, float + Ask: Ask price, float + Vol: Volume traded, int64 + Open_Int: Open interest, int64 + IsNonstandard: True if the the deliverable is not 100 shares, otherwise false + Underlying: Ticker of the underlying security, string + Underlying_Price: Price of the underlying security, float64 + Quote_Time: Time of the quote, Timestamp + + Notes + ----- + Note: Format of returned data frame is dependent on Yahoo and may change. + + When called, this function will add instance variables named + calls and puts. See the following example: + + >>> aapl = Options('aapl', 'yahoo') # Create object + >>> aapl.calls # will give an AttributeError + >>> aapl.get_call_data() # Get data and set ivars + >>> aapl.calls # Doesn't throw AttributeError + + Also note that aapl.calls will always be the calls for the next + expiry. If the user calls this method with a different month + or year, the ivar will be named callsMMYY where MM and YY are, + respectively, two digit representations of the month and year + for the expiry of the options. + """ + return self._get_option_data(month, year, expiry, 'calls').sortlevel() + + def get_put_data(self, month=None, year=None, expiry=None): + """ + ***Experimental*** + Gets put data for the stock with the expiration data in the + given month and year + + Parameters + ---------- + expiry: datetime.date, optional(default=None) + The date when options expire (defaults to current month) + + Returns + ------- + put_data: pandas.DataFrame + A DataFrame with requested options data. + + Index: + Strike: Option strike, int + Expiry: Option expiry, datetime.date + Type: Call or Put, string + Symbol: Option symbol as reported on Yahoo, string + Columns: + Last: Last option price, float + Chg: Change from prior day, float + Bid: Bid price, float + Ask: Ask price, float + Vol: Volume traded, int64 + Open_Int: Open interest, int64 + IsNonstandard: True if the the deliverable is not 100 shares, otherwise false + Underlying: Ticker of the underlying security, string + Underlying_Price: Price of the underlying security, float64 + Quote_Time: Time of the quote, Timestamp + + Notes + ----- + Note: Format of returned data frame is dependent on Yahoo and may change. + + When called, this function will add instance variables named + puts. See the following example: + + >>> aapl = Options('aapl') # Create object + >>> aapl.puts # will give an AttributeError + >>> aapl.get_put_data() # Get data and set ivars + >>> aapl.puts # Doesn't throw AttributeError + + return self.__setattr__(self, str(str(x) + str(y))) + + Also note that aapl.puts will always be the puts for the next + expiry. If the user calls this method with a different month + or year, the ivar will be named putsMMYY where MM and YY are, + repsectively, two digit representations of the month and year + for the expiry of the options. + """ + return self._get_option_data(month, year, expiry, 'puts').sortlevel() + + def get_near_stock_price(self, above_below=2, call=True, put=False, + month=None, year=None, expiry=None): + """ + ***Experimental*** + Returns a data frame of options that are near the current stock price. 
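+
+        A rough usage sketch (the ticker is only an example and a live
+        connection to Yahoo is assumed):
+
+        >>> aapl = Options('aapl', 'yahoo')
+        >>> near = aapl.get_near_stock_price(above_below=1)  # keeps at most 2*1 + 1 strikes per expiry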
+ + Parameters + ---------- + above_below: number, int, optional (default=2) + The number of strike prices above and below the stock price that + should be taken + + call: bool + Tells the function whether or not it should be using + self.calls + + put: bool + Tells the function weather or not it should be using + self.puts + + expiry: datetime.date, optional(default=None) + The date when options expire (defaults to current month) + + Returns + ------- + chopped: DataFrame + The resultant DataFrame chopped down to be 2 * above_below + 1 rows + desired. If there isn't data as far out as the user has asked for + then + + Note: Format of returned data frame is dependent on Yahoo and may change. + + """ + + to_ret = Series({'calls': call, 'puts': put}) + to_ret = to_ret[to_ret].index + + data = {} + + for nam in to_ret: + df = self._get_option_data(month, year, expiry, nam) + data[nam] = self.chop_data(df, above_below, self.underlying_price) + + return concat([data[nam] for nam in to_ret]).sortlevel() + + def chop_data(self, df, above_below=2, underlying_price=None): + """Returns a data frame only options that are near the current stock price.""" + + if not underlying_price: + try: + underlying_price = self.underlying_price + except AttributeError: + underlying_price = np.nan + + if not np.isnan(underlying_price): + start_index = np.where(df.index.get_level_values('Strike') + > underlying_price)[0][0] + + get_range = slice(start_index - above_below, + start_index + above_below + 1) + df = df[get_range].dropna(how='all') + + return df + + + + @staticmethod + def _try_parse_dates(year, month, expiry): + """ + Validates dates provided by user. Ensures the user either provided both a month and a year or an expiry. + + Parameters + ---------- + year: Calendar year, int (deprecated) + + month: Calendar month, int (deprecated) + + expiry: Expiry date (month and year), datetime.date, (preferred) + + Returns + ------- + Tuple of year (int), month (int), expiry (datetime.date) + """ + + #Checks if the user gave one of the month or the year but not both and did not provide an expiry: + if (month is not None and year is None) or (month is None and year is not None) and expiry is None: + msg = "You must specify either (`year` and `month`) or `expiry` " \ + "or none of these options for the current month." + raise ValueError(msg) + + if (year is not None or month is not None) and expiry is None: + warnings.warn("month, year arguments are deprecated, use expiry" + " instead", FutureWarning) + + if expiry is not None: + year = expiry.year + month = expiry.month + elif year is None and month is None: + year = CUR_YEAR + month = CUR_MONTH + expiry = dt.date(year, month, 1) + else: + expiry = dt.date(year, month, 1) + + return year, month, expiry + + def get_forward_data(self, months, call=True, put=False, near=False, + above_below=2): + """ + ***Experimental*** + Gets either call, put, or both data for months starting in the current + month and going out in the future a specified amount of time. + + Parameters + ---------- + months: number, int + How many months to go out in the collection of the data. This is + inclusive. + + call: bool, optional (default=True) + Whether or not to collect data for call options + + put: bool, optional (default=False) + Whether or not to collect data for put options. + + near: bool, optional (default=False) + Whether this function should get only the data near the + current stock price. 
Uses Options.get_near_stock_price + + above_below: number, int, optional (default=2) + The number of strike prices above and below the stock price that + should be taken if the near option is set to True + + Returns + ------- + pandas.DataFrame + A DataFrame with requested options data. + + Index: + Strike: Option strike, int + Expiry: Option expiry, datetime.date + Type: Call or Put, string + Symbol: Option symbol as reported on Yahoo, string + Columns: + Last: Last option price, float + Chg: Change from prior day, float + Bid: Bid price, float + Ask: Ask price, float + Vol: Volume traded, int64 + Open_Int: Open interest, int64 + IsNonstandard: True if the the deliverable is not 100 shares, otherwise false + Underlying: Ticker of the underlying security, string + Underlying_Price: Price of the underlying security, float64 + Quote_Time: Time of the quote, Timestamp + + Note: Format of returned data frame is dependent on Yahoo and may change. + + """ + warnings.warn("get_forward_data() is deprecated", FutureWarning) + in_months = lrange(CUR_MONTH, CUR_MONTH + months + 1) + in_years = [CUR_YEAR] * (months + 1) + + # Figure out how many items in in_months go past 12 + to_change = 0 + for i in range(months): + if in_months[i] > 12: + in_months[i] -= 12 + to_change += 1 + + # Change the corresponding items in the in_years list. + for i in range(1, to_change + 1): + in_years[-i] += 1 + + to_ret = Series({'calls': call, 'puts': put}) + to_ret = to_ret[to_ret].index + all_data = [] + + for name in to_ret: + + for mon in range(months): + m2 = in_months[mon] + y2 = in_years[mon] + + if not near: + m1 = _two_char_month(m2) + nam = name + str(m1) + str(y2)[2:] + + try: # Try to access on the instance + frame = getattr(self, nam) + except AttributeError: + meth_name = 'get_{0}_data'.format(name[:-1]) + frame = getattr(self, meth_name)(m2, y2) + else: + frame = self.get_near_stock_price(call=call, put=put, + above_below=above_below, + month=m2, year=y2) + frame = self._process_data(frame, name[:-1]) + + all_data.append(frame) + + return concat(all_data).sortlevel() + + def get_all_data(self, call=True, put=True): + """ + ***Experimental*** + Gets either call, put, or both data for all available months starting + in the current month. + + Parameters + ---------- + call: bool, optional (default=True) + Whether or not to collect data for call options + + put: bool, optional (default=True) + Whether or not to collect data for put options. + + Returns + ------- + pandas.DataFrame + A DataFrame with requested options data. + + Index: + Strike: Option strike, int + Expiry: Option expiry, datetime.date + Type: Call or Put, string + Symbol: Option symbol as reported on Yahoo, string + Columns: + Last: Last option price, float + Chg: Change from prior day, float + Bid: Bid price, float + Ask: Ask price, float + Vol: Volume traded, int64 + Open_Int: Open interest, int64 + IsNonstandard: True if the the deliverable is not 100 shares, otherwise false + Underlying: Ticker of the underlying security, string + Underlying_Price: Price of the underlying security, float64 + Quote_Time: Time of the quote, Timestamp + + Note: Format of returned data frame is dependent on Yahoo and may change. 
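+
+        A minimal sketch (the ticker and strike are illustrative and a live
+        connection to Yahoo is assumed):
+
+        >>> aapl = Options('aapl', 'yahoo')
+        >>> data = aapl.get_all_data()
+        >>> data.xs(100, level='Strike')  # one strike across all expiries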
+
+        """
+        to_ret = Series({'calls': call, 'puts': put})
+        to_ret = to_ret[to_ret].index
+
+        try:
+            months = self.months
+        except AttributeError:
+            months = self._get_expiry_months()
+
+        all_data = []
+
+        for name in to_ret:
+
+            for month in months:
+                m2 = month.month
+                y2 = month.year
+
+                m1 = _two_char_month(m2)
+                nam = name + str(m1) + str(y2)[2:]
+
+                try:  # Try to access on the instance
+                    frame = getattr(self, nam)
+                except AttributeError:
+                    meth_name = 'get_{0}_data'.format(name[:-1])
+                    frame = getattr(self, meth_name)(expiry=month)
+
+                all_data.append(frame)
+
+        return concat(all_data).sortlevel()
+
+    def _get_expiry_months(self):
+        """
+        Gets available expiry months.
+
+        Returns
+        -------
+        months : List of datetime objects
+        """
+
+        url = 'http://finance.yahoo.com/q/op?s={sym}'.format(sym=self.symbol)
+        root = self._parse_url(url)
+
+        try:
+            links = root.xpath('.//*[@id="yfncsumtab"]')[0].xpath('.//a')
+        except IndexError:
+            raise RemoteDataError('Expiry months not available')
+
+        month_gen = (element.attrib['href'].split('=')[-1]
+                     for element in links
+                     if '/q/op?s=' in element.attrib['href']
+                     and '&m=' in element.attrib['href'])
+
+        months = [dt.date(int(month.split('-')[0]),
+                          int(month.split('-')[1]), 1)
+                  for month in month_gen]
+
+        current_month_text = root.xpath('.//*[@id="yfncsumtab"]')[0].xpath('.//strong')[0].text
+        current_month = dt.datetime.strptime(current_month_text, '%b %y')
+        months.insert(0, current_month)
+        self.months = months
+
+        return months
+
+    def _parse_url(self, url):
+        """
+        Downloads and parses a URL, returns xml root.
+
+        """
+        try:
+            from lxml.html import parse
+        except ImportError:
+            raise ImportError("Please install lxml if you want to use the "
+                              "{0!r} class".format(self.__class__.__name__))
+        try:
+            doc = parse(url)
+        except _network_error_classes:
+            raise RemoteDataError("Unable to parse URL "
+                                  "{0!r}".format(url))
+        else:
+            root = doc.getroot()
+            if root is None:
+                raise RemoteDataError("Parsed URL {0!r} has no root "
+                                      "element".format(url))
+        return root
+
+
+    def _process_data(self, frame, type):
+        """
+        Adds columns for Expiry, IsNonstandard (i.e. deliverable is not 100 shares)
+        and Tag (the tag indicating what is actually deliverable, None if standard).
+
+        """
+        frame["Rootexp"] = frame.Symbol.str[0:-9]
+        frame["Root"] = frame.Rootexp.str[0:-6]
+        frame["Expiry"] = to_datetime(frame.Rootexp.str[-6:])
+        # Removes dashes in equity ticker to map to option ticker.
+ #Ex: BRK-B to BRKB140517C00100000 + frame["IsNonstandard"] = frame['Root'] != self.symbol.replace('-','') + del frame["Rootexp"] + frame["Underlying"] = self.symbol + frame['Underlying_Price'] = self.underlying_price + frame["Quote_Time"] = self.quote_time + frame.rename(columns={'Open Int': 'Open_Int'}, inplace=True) + frame['Type'] = type + frame.set_index(['Strike', 'Expiry', 'Type', 'Symbol'], inplace=True) + + return frame diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py new file mode 100644 index 00000000..3ffcef4b --- /dev/null +++ b/pandas/io/date_converters.py @@ -0,0 +1,62 @@ +"""This module is designed for community supported date conversion functions""" +from pandas.compat import range, map +import numpy as np +import pandas.lib as lib + + +def parse_date_time(date_col, time_col): + date_col = _maybe_cast(date_col) + time_col = _maybe_cast(time_col) + return lib.try_parse_date_and_time(date_col, time_col) + + +def parse_date_fields(year_col, month_col, day_col): + year_col = _maybe_cast(year_col) + month_col = _maybe_cast(month_col) + day_col = _maybe_cast(day_col) + return lib.try_parse_year_month_day(year_col, month_col, day_col) + + +def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, + second_col): + year_col = _maybe_cast(year_col) + month_col = _maybe_cast(month_col) + day_col = _maybe_cast(day_col) + hour_col = _maybe_cast(hour_col) + minute_col = _maybe_cast(minute_col) + second_col = _maybe_cast(second_col) + return lib.try_parse_datetime_components(year_col, month_col, day_col, + hour_col, minute_col, second_col) + + +def generic_parser(parse_func, *cols): + N = _check_columns(cols) + results = np.empty(N, dtype=object) + + for i in range(N): + args = [c[i] for c in cols] + results[i] = parse_func(*args) + + return results + + +def _maybe_cast(arr): + if not arr.dtype.type == np.object_: + arr = np.array(arr, dtype=object) + return arr + + +def _check_columns(cols): + if not len(cols): + raise AssertionError("There must be at least 1 column") + + head, tail = cols[0], cols[1:] + + N = len(head) + + for i, n in enumerate(map(len, tail)): + if n != N: + raise AssertionError('All columns must have the same length: {0}; ' + 'column {1} has length {2}'.format(N, i, n)) + + return N diff --git a/pandas/io/excel.py b/pandas/io/excel.py new file mode 100644 index 00000000..f81cf650 --- /dev/null +++ b/pandas/io/excel.py @@ -0,0 +1,864 @@ +""" +Module parse to/from Excel +""" + +#---------------------------------------------------------------------- +# ExcelFile class +import os +import datetime +import abc +import numpy as np + +from pandas.io.parsers import TextParser +from pandas.io.common import _is_url, _urlopen +from pandas.tseries.period import Period +from pandas import json +from pandas.compat import map, zip, reduce, range, lrange, u, add_metaclass +from pandas.core import config +from pandas.core.common import pprint_thing +import pandas.compat as compat +import pandas.compat.openpyxl_compat as openpyxl_compat +import pandas.core.common as com +from warnings import warn +from distutils.version import LooseVersion + +__all__ = ["read_excel", "ExcelWriter", "ExcelFile"] + +_writer_extensions = ["xlsx", "xls", "xlsm"] +_writers = {} + + +def register_writer(klass): + """Adds engine to the excel writer registry. You must use this method to + integrate with ``to_excel``. 
Also adds config options for any new + ``supported_extensions`` defined on the writer.""" + if not compat.callable(klass): + raise ValueError("Can only register callables as engines") + engine_name = klass.engine + _writers[engine_name] = klass + for ext in klass.supported_extensions: + if ext.startswith('.'): + ext = ext[1:] + if ext not in _writer_extensions: + config.register_option("io.excel.%s.writer" % ext, + engine_name, validator=str) + _writer_extensions.append(ext) + + +def get_writer(engine_name): + try: + return _writers[engine_name] + except KeyError: + raise ValueError("No Excel writer '%s'" % engine_name) + + +def read_excel(io, sheetname=0, **kwds): + """Read an Excel table into a pandas DataFrame + + Parameters + ---------- + io : string, file-like object, or xlrd workbook. + The string could be a URL. Valid URL schemes include http, ftp, s3, + and file. For file URLs, a host is expected. For instance, a local + file could be file://localhost/path/to/workbook.xlsx + sheetname : string or int, default 0 + Name of Excel sheet or the page number of the sheet + header : int, default 0 + Row to use for the column labels of the parsed DataFrame + skiprows : list-like + Rows to skip at the beginning (0-indexed) + skip_footer : int, default 0 + Rows at the end to skip (0-indexed) + index_col : int, default None + Column to use as the row labels of the DataFrame. Pass None if + there is no such column + parse_cols : int or list, default None + * If None then parse all columns, + * If int then indicates last column to be parsed + * If list of ints then indicates list of column numbers to be parsed + * If string then indicates comma separated list of column names and + column ranges (e.g. "A:E" or "A,C,E:F") + na_values : list-like, default None + List of additional strings to recognize as NA/NaN + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to + verbose : boolean, default False + Indicate number of NA values placed in non-numeric columns + engine: string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None or xlrd + convert_float : boolean, default True + convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally + has_index_names : boolean, default False + True if the cols defined in index_col have an index name and are + not in the header. Index name will be placed on a separate line below + the header. + + Returns + ------- + parsed : DataFrame + DataFrame from the passed in Excel file + + """ + if 'kind' in kwds: + kwds.pop('kind') + warn("kind keyword is no longer supported in read_excel and may be " + "removed in a future version", FutureWarning) + + engine = kwds.pop('engine', None) + + return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds) + + +class ExcelFile(object): + """ + Class for parsing tabular excel sheets into DataFrame objects. + Uses xlrd. See ExcelFile.parse for more documentation + + Parameters + ---------- + io : string, file-like object or xlrd workbook + If a string, expected to be a path to xls or xlsx file + engine: string, default None + If io is not a buffer or path, this must be set to identify io. 
+ Acceptable values are None or xlrd + """ + def __init__(self, io, **kwds): + + import xlrd # throw an ImportError if we need to + + ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) + if ver < (0, 9): # pragma: no cover + raise ImportError("pandas requires xlrd >= 0.9.0 for excel " + "support, current version " + xlrd.__VERSION__) + + self.io = io + + engine = kwds.pop('engine', None) + + if engine is not None and engine != 'xlrd': + raise ValueError("Unknown engine: %s" % engine) + + if isinstance(io, compat.string_types): + if _is_url(io): + data = _urlopen(io).read() + self.book = xlrd.open_workbook(file_contents=data) + else: + self.book = xlrd.open_workbook(io) + elif engine == 'xlrd' and isinstance(io, xlrd.Book): + self.book = io + elif not isinstance(io, xlrd.Book) and hasattr(io, "read"): + # N.B. xlrd.Book has a read attribute too + data = io.read() + self.book = xlrd.open_workbook(file_contents=data) + else: + raise ValueError('Must explicitly set engine if not passing in' + ' buffer or path for io.') + + def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, + index_col=None, parse_cols=None, parse_dates=False, + date_parser=None, na_values=None, thousands=None, chunksize=None, + convert_float=True, has_index_names=False, **kwds): + """Read an Excel table into DataFrame + + Parameters + ---------- + sheetname : string or integer + Name of Excel sheet or the page number of the sheet + header : int, default 0 + Row to use for the column labels of the parsed DataFrame + skiprows : list-like + Rows to skip at the beginning (0-indexed) + skip_footer : int, default 0 + Rows at the end to skip (0-indexed) + index_col : int, default None + Column to use as the row labels of the DataFrame. Pass None if + there is no such column + parse_cols : int or list, default None + * If None then parse all columns + * If int then indicates last column to be parsed + * If list of ints then indicates list of column numbers to be + parsed + * If string then indicates comma separated list of column names and + column ranges (e.g. "A:E" or "A,C,E:F") + parse_dates : boolean, default False + Parse date Excel values, + date_parser : function default None + Date parsing function + na_values : list-like, default None + List of additional strings to recognize as NA/NaN + thousands : str, default None + Thousands separator + chunksize : int, default None + Size of file chunk to read for lazy evaluation. + convert_float : boolean, default True + convert integral floats to int (i.e., 1.0 --> 1). If False, all + numeric data will be read in as floats: Excel stores all numbers as + floats internally. + has_index_names : boolean, default False + True if the cols defined in index_col have an index name and are + not in the header + + Returns + ------- + parsed : DataFrame + DataFrame parsed from the Excel file + """ + skipfooter = kwds.pop('skipfooter', None) + if skipfooter is not None: + skip_footer = skipfooter + + return self._parse_excel(sheetname=sheetname, header=header, + skiprows=skiprows, + index_col=index_col, + has_index_names=has_index_names, + parse_cols=parse_cols, + parse_dates=parse_dates, + date_parser=date_parser, na_values=na_values, + thousands=thousands, chunksize=chunksize, + skip_footer=skip_footer, + convert_float=convert_float, + **kwds) + + def _should_parse(self, i, parse_cols): + + def _range2cols(areas): + """ + Convert comma separated list of column names and column ranges to a + list of 0-based column indexes. 
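+            Column letters behave like bijective base-26 digits, so for
+            example 'AB' maps to (1 * 26 + 2) - 1 = 27.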
+ + >>> _range2cols('A:E') + [0, 1, 2, 3, 4] + >>> _range2cols('A,C,Z:AB') + [0, 2, 25, 26, 27] + """ + def _excel2num(x): + "Convert Excel column name like 'AB' to 0-based column index" + return reduce(lambda s, a: s * 26 + ord(a) - ord('A') + 1, + x.upper().strip(), 0) - 1 + + cols = [] + for rng in areas.split(','): + if ':' in rng: + rng = rng.split(':') + cols += lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1) + else: + cols.append(_excel2num(rng)) + return cols + + if isinstance(parse_cols, int): + return i <= parse_cols + elif isinstance(parse_cols, compat.string_types): + return i in _range2cols(parse_cols) + else: + return i in parse_cols + + def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, + index_col=None, has_index_names=None, parse_cols=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, chunksize=None, convert_float=True, + **kwds): + import xlrd + from xlrd import (xldate, XL_CELL_DATE, + XL_CELL_ERROR, XL_CELL_BOOLEAN, + XL_CELL_NUMBER) + + epoch1904 = self.book.datemode + + # xlrd >= 0.9.3 can return datetime objects directly. + if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): + xlrd_0_9_3 = True + else: + xlrd_0_9_3 = False + + if isinstance(sheetname, compat.string_types): + sheet = self.book.sheet_by_name(sheetname) + else: # assume an integer if not a string + sheet = self.book.sheet_by_index(sheetname) + + data = [] + should_parse = {} + for i in range(sheet.nrows): + row = [] + for j, (value, typ) in enumerate(zip(sheet.row_values(i), + sheet.row_types(i))): + if parse_cols is not None and j not in should_parse: + should_parse[j] = self._should_parse(j, parse_cols) + + if parse_cols is None or should_parse[j]: + if typ == XL_CELL_DATE: + if xlrd_0_9_3: + # Use the newer xlrd datetime handling. + value = xldate.xldate_as_datetime(value, epoch1904) + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. + year = (value.timetuple())[0:3] + if ((not epoch1904 and year == (1899, 12, 31)) + or (epoch1904 and year == (1904, 1, 1))): + value = datetime.time(value.hour, + value.minute, + value.second, + value.microsecond) + else: + # Use the xlrd <= 0.9.2 date handling. 
+ dt = xldate.xldate_as_tuple(value, epoch1904) + + if dt[0] < datetime.MINYEAR: + value = datetime.time(*dt[3:]) + else: + value = datetime.datetime(*dt) + + elif typ == XL_CELL_ERROR: + value = np.nan + elif typ == XL_CELL_BOOLEAN: + value = bool(value) + elif convert_float and typ == XL_CELL_NUMBER: + # GH5394 - Excel 'numbers' are always floats + # it's a minimal perf hit and less suprising + val = int(value) + if val == value: + value = val + + row.append(value) + + data.append(row) + + if header is not None: + data[header] = _trim_excel_header(data[header]) + + parser = TextParser(data, header=header, index_col=index_col, + has_index_names=has_index_names, + na_values=na_values, + thousands=thousands, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + skip_footer=skip_footer, + chunksize=chunksize, + **kwds) + + return parser.read() + + @property + def sheet_names(self): + return self.book.sheet_names() + + def close(self): + """close io if necessary""" + if hasattr(self.io, 'close'): + self.io.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + +def _trim_excel_header(row): + # trim header row so auto-index inference works + # xlrd uses '' , openpyxl None + while len(row) > 0 and (row[0] == '' or row[0] is None): + row = row[1:] + return row + + +def _conv_value(val): + # Convert numpy types to Python types for the Excel writers. + if com.is_integer(val): + val = int(val) + elif com.is_float(val): + val = float(val) + elif com.is_bool(val): + val = bool(val) + elif isinstance(val, Period): + val = "%s" % val + + return val + + +@add_metaclass(abc.ABCMeta) +class ExcelWriter(object): + """ + Class for writing DataFrame objects into excel sheets, default is to use + xlwt for xls, openpyxl for xlsx. See DataFrame.to_excel for typical usage. + + Parameters + ---------- + path : string + Path to xls or xlsx file. + engine : string (optional) + Engine to use for writing. If None, defaults to + ``io.excel..writer``. NOTE: can only be passed as a keyword + argument. + date_format : string, default None + Format string for dates written into Excel files (e.g. 'YYYY-MM-DD') + datetime_format : string, default None + Format string for datetime objects written into Excel files + (e.g. 'YYYY-MM-DD HH:MM:SS') + """ + # Defining an ExcelWriter implementation (see abstract methods for more...) + + # - Mandatory + # - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)`` + # --> called to write additional DataFrames to disk + # - ``supported_extensions`` (tuple of supported extensions), used to + # check that engine supports the given extension. + # - ``engine`` - string that gives the engine name. Necessary to + # instantiate class directly and bypass ``ExcelWriterMeta`` engine + # lookup. + # - ``save(self)`` --> called to save file to disk + # - Mostly mandatory (i.e. should at least exist) + # - book, cur_sheet, path + + # - Optional: + # - ``__init__(self, path, engine=None, **kwargs)`` --> always called + # with path as first argument. + + # You also need to register the class with ``register_writer()``. + # Technically, ExcelWriter implementations don't need to subclass + # ExcelWriter. 
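+
+    # As an illustration only (nothing below ships with pandas and the engine
+    # name is hypothetical), a minimal engine might look like:
+    #
+    #     class _DemoWriter(ExcelWriter):
+    #         engine = 'demo'
+    #         supported_extensions = ('.xlsx',)
+    #
+    #         def save(self):
+    #             ...  # persist self.book to self.path
+    #
+    #         def write_cells(self, cells, sheet_name=None,
+    #                         startrow=0, startcol=0):
+    #             for cell in cells:
+    #                 ...  # place cell.val at (startrow + cell.row,
+    #                      #                    startcol + cell.col)
+    #
+    #     register_writer(_DemoWriter)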
+ def __new__(cls, path, engine=None, **kwargs): + # only switch class if generic(ExcelWriter) + if cls == ExcelWriter: + if engine is None: + ext = os.path.splitext(path)[-1][1:] + try: + engine = config.get_option('io.excel.%s.writer' % ext) + except KeyError: + error = ValueError("No engine for filetype: '%s'" % ext) + raise error + cls = get_writer(engine) + + return object.__new__(cls) + + # declare external properties you can count on + book = None + curr_sheet = None + path = None + + @abc.abstractproperty + def supported_extensions(self): + "extensions that writer engine supports" + pass + + @abc.abstractproperty + def engine(self): + "name of engine" + pass + + @abc.abstractmethod + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + """ + Write given formated cells into Excel an excel sheet + + Parameters + ---------- + cells : generator + cell of formated data to save to Excel sheet + sheet_name : string, default None + Name of Excel sheet, if None, then use self.cur_sheet + startrow: upper left cell row to dump data frame + startcol: upper left cell column to dump data frame + """ + pass + + @abc.abstractmethod + def save(self): + """ + Save workbook to disk. + """ + pass + + def __init__(self, path, engine=None, + date_format=None, datetime_format=None, **engine_kwargs): + # validate that this engine can handle the extension + ext = os.path.splitext(path)[-1] + self.check_extension(ext) + + self.path = path + self.sheets = {} + self.cur_sheet = None + + if date_format is None: + self.date_format = 'YYYY-MM-DD' + else: + self.date_format = date_format + if datetime_format is None: + self.datetime_format = 'YYYY-MM-DD HH:MM:SS' + else: + self.datetime_format = datetime_format + + def _get_sheet_name(self, sheet_name): + if sheet_name is None: + sheet_name = self.cur_sheet + if sheet_name is None: # pragma: no cover + raise ValueError('Must pass explicit sheet_name or set ' + 'cur_sheet property') + return sheet_name + + @classmethod + def check_extension(cls, ext): + """checks that path's extension against the Writer's supported + extensions. If it isn't supported, raises UnsupportedFiletypeError.""" + if ext.startswith('.'): + ext = ext[1:] + if not any(ext in extension for extension in cls.supported_extensions): + msg = (u("Invalid extension for engine '%s': '%s'") % + (pprint_thing(cls.engine), pprint_thing(ext))) + raise ValueError(msg) + else: + return True + + # Allow use as a contextmanager + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def close(self): + """synonym for save, to make it more file-like""" + return self.save() + + +class _OpenpyxlWriter(ExcelWriter): + engine = 'openpyxl' + supported_extensions = ('.xlsx', '.xlsm') + + def __init__(self, path, engine=None, **engine_kwargs): + if not openpyxl_compat.is_compat(): + raise ValueError('Installed openpyxl is not supported at this ' + 'time. Use >={0} and ' + '<{1}.'.format(openpyxl_compat.start_ver, + openpyxl_compat.stop_ver)) + # Use the openpyxl module as the Excel writer. + from openpyxl.workbook import Workbook + + super(_OpenpyxlWriter, self).__init__(path, **engine_kwargs) + + # Create workbook object with default optimized_write=True. + self.book = Workbook() + # Openpyxl 1.6.1 adds a dummy sheet. We remove it. + if self.book.worksheets: + self.book.remove_sheet(self.book.worksheets[0]) + + def save(self): + """ + Save workbook to disk. 
+ """ + return self.book.save(self.path) + + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + # Write the frame cells using openpyxl. + from openpyxl.cell import get_column_letter + + sheet_name = self._get_sheet_name(sheet_name) + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.create_sheet() + wks.title = sheet_name + self.sheets[sheet_name] = wks + + for cell in cells: + colletter = get_column_letter(startcol + cell.col + 1) + xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) + xcell.value = _conv_value(cell.val) + style = None + if cell.style: + style = self._convert_to_style(cell.style) + for field in style.__fields__: + xcell.style.__setattr__(field, + style.__getattribute__(field)) + + if isinstance(cell.val, datetime.datetime): + xcell.style.number_format.format_code = self.datetime_format + elif isinstance(cell.val, datetime.date): + xcell.style.number_format.format_code = self.date_format + + if cell.mergestart is not None and cell.mergeend is not None: + cletterstart = get_column_letter(startcol + cell.col + 1) + cletterend = get_column_letter(startcol + cell.mergeend + 1) + + wks.merge_cells('%s%s:%s%s' % (cletterstart, + startrow + cell.row + 1, + cletterend, + startrow + cell.mergestart + 1)) + + # Excel requires that the format of the first cell in a merged + # range is repeated in the rest of the merged range. + if style: + first_row = startrow + cell.row + 1 + last_row = startrow + cell.mergestart + 1 + first_col = startcol + cell.col + 1 + last_col = startcol + cell.mergeend + 1 + + for row in range(first_row, last_row + 1): + for col in range(first_col, last_col + 1): + if row == first_row and col == first_col: + # Ignore first cell. It is already handled. + continue + colletter = get_column_letter(col) + xcell = wks.cell("%s%s" % (colletter, row)) + for field in style.__fields__: + xcell.style.__setattr__( + field, style.__getattribute__(field)) + + @classmethod + def _convert_to_style(cls, style_dict): + """ + converts a style_dict to an openpyxl style object + Parameters + ---------- + style_dict: style dictionary to convert + """ + + from openpyxl.style import Style + xls_style = Style() + for key, value in style_dict.items(): + for nk, nv in value.items(): + if key == "borders": + (xls_style.borders.__getattribute__(nk) + .__setattr__('border_style', nv)) + else: + xls_style.__getattribute__(key).__setattr__(nk, nv) + + return xls_style + +register_writer(_OpenpyxlWriter) + + +class _XlwtWriter(ExcelWriter): + engine = 'xlwt' + supported_extensions = ('.xls',) + + def __init__(self, path, engine=None, encoding=None, **engine_kwargs): + # Use the xlwt module as the Excel writer. + import xlwt + + super(_XlwtWriter, self).__init__(path, **engine_kwargs) + + if encoding is None: + encoding = 'ascii' + self.book = xlwt.Workbook(encoding=encoding) + self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format) + self.fm_date = xlwt.easyxf(num_format_str=self.date_format) + + def save(self): + """ + Save workbook to disk. + """ + return self.book.save(self.path) + + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + # Write the frame cells using xlwt. 
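+        # Each unique (cell style, number format) pair is converted to an
+        # xlwt style only once and cached in ``style_dict`` below, keyed by
+        # the JSON-serialised style plus the number format string.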
+ + sheet_name = self._get_sheet_name(sheet_name) + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.add_sheet(sheet_name) + self.sheets[sheet_name] = wks + + style_dict = {} + + for cell in cells: + val = _conv_value(cell.val) + + num_format_str = None + if isinstance(cell.val, datetime.datetime): + num_format_str = self.datetime_format + elif isinstance(cell.val, datetime.date): + num_format_str = self.date_format + + stylekey = json.dumps(cell.style) + if num_format_str: + stylekey += num_format_str + + if stylekey in style_dict: + style = style_dict[stylekey] + else: + style = self._convert_to_style(cell.style, num_format_str) + style_dict[stylekey] = style + + if cell.mergestart is not None and cell.mergeend is not None: + wks.write_merge(startrow + cell.row, + startrow + cell.mergestart, + startcol + cell.col, + startcol + cell.mergeend, + val, style) + else: + wks.write(startrow + cell.row, + startcol + cell.col, + val, style) + + @classmethod + def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',', + line_sep=';'): + """helper which recursively generate an xlwt easy style string + for example: + + hstyle = {"font": {"bold": True}, + "border": {"top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin"}, + "align": {"horiz": "center"}} + will be converted to + font: bold on; \ + border: top thin, right thin, bottom thin, left thin; \ + align: horiz center; + """ + if hasattr(item, 'items'): + if firstlevel: + it = ["%s: %s" % (key, cls._style_to_xlwt(value, False)) + for key, value in item.items()] + out = "%s " % (line_sep).join(it) + return out + else: + it = ["%s %s" % (key, cls._style_to_xlwt(value, False)) + for key, value in item.items()] + out = "%s " % (field_sep).join(it) + return out + else: + item = "%s" % item + item = item.replace("True", "on") + item = item.replace("False", "off") + return item + + @classmethod + def _convert_to_style(cls, style_dict, num_format_str=None): + """ + converts a style_dict to an xlwt style object + Parameters + ---------- + style_dict: style dictionary to convert + num_format_str: optional number format string + """ + import xlwt + + if style_dict: + xlwt_stylestr = cls._style_to_xlwt(style_dict) + style = xlwt.easyxf(xlwt_stylestr, field_sep=',', line_sep=';') + else: + style = xlwt.XFStyle() + if num_format_str is not None: + style.num_format_str = num_format_str + + return style + +register_writer(_XlwtWriter) + + +class _XlsxWriter(ExcelWriter): + engine = 'xlsxwriter' + supported_extensions = ('.xlsx',) + + def __init__(self, path, engine=None, + date_format=None, datetime_format=None, **engine_kwargs): + # Use the xlsxwriter module as the Excel writer. + import xlsxwriter + + super(_XlsxWriter, self).__init__(path, engine=engine, + date_format=date_format, + datetime_format=datetime_format, + **engine_kwargs) + + self.book = xlsxwriter.Workbook(path, **engine_kwargs) + + def save(self): + """ + Save workbook to disk. + """ + return self.book.close() + + def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): + # Write the frame cells using xlsxwriter. 
+ + sheet_name = self._get_sheet_name(sheet_name) + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = self.book.add_worksheet(sheet_name) + self.sheets[sheet_name] = wks + + style_dict = {} + + for cell in cells: + num_format_str = None + if isinstance(cell.val, datetime.datetime): + num_format_str = self.datetime_format + elif isinstance(cell.val, datetime.date): + num_format_str = self.date_format + + stylekey = json.dumps(cell.style) + if num_format_str: + stylekey += num_format_str + + if stylekey in style_dict: + style = style_dict[stylekey] + else: + style = self._convert_to_style(cell.style, num_format_str) + style_dict[stylekey] = style + + if cell.mergestart is not None and cell.mergeend is not None: + wks.merge_range(startrow + cell.row, + startcol + cell.col, + startrow + cell.mergestart, + startcol + cell.mergeend, + cell.val, style) + else: + wks.write(startrow + cell.row, + startcol + cell.col, + cell.val, style) + + def _convert_to_style(self, style_dict, num_format_str=None): + """ + converts a style_dict to an xlsxwriter format object + Parameters + ---------- + style_dict: style dictionary to convert + num_format_str: optional number format string + """ + + # Create a XlsxWriter format object. + xl_format = self.book.add_format() + + if num_format_str is not None: + xl_format.set_num_format(num_format_str) + + if style_dict is None: + return xl_format + + # Map the cell font to XlsxWriter font properties. + if style_dict.get('font'): + font = style_dict['font'] + if font.get('bold'): + xl_format.set_bold() + + # Map the alignment to XlsxWriter alignment properties. + alignment = style_dict.get('alignment') + if alignment: + if (alignment.get('horizontal') + and alignment['horizontal'] == 'center'): + xl_format.set_align('center') + if (alignment.get('vertical') + and alignment['vertical'] == 'top'): + xl_format.set_align('top') + + # Map the cell borders to XlsxWriter border properties. + if style_dict.get('borders'): + xl_format.set_border() + + return xl_format + +register_writer(_XlsxWriter) diff --git a/pandas/io/ga.py b/pandas/io/ga.py new file mode 100644 index 00000000..f0029948 --- /dev/null +++ b/pandas/io/ga.py @@ -0,0 +1,456 @@ +""" +1. Goto https://code.google.com/apis/console +2. Create new project +3. Goto APIs and register for OAuth2.0 for installed applications +4. 
Download JSON secret file and move into same directory as this file +""" +from datetime import datetime +import re +from pandas import compat +import numpy as np +from pandas import DataFrame +import pandas as pd +import pandas.io.parsers as psr +import pandas.lib as lib +from pandas.io.date_converters import generic_parser +import pandas.io.auth as auth +from pandas.util.decorators import Appender, Substitution + +from apiclient.errors import HttpError +from oauth2client.client import AccessTokenRefreshError +from pandas.compat import zip, u + +TYPE_MAP = {u('INTEGER'): int, u('FLOAT'): float, u('TIME'): int} + +NO_CALLBACK = auth.OOB_CALLBACK_URN +DOC_URL = auth.DOC_URL + +_QUERY_PARAMS = """metrics : list of str + Un-prefixed metric names (e.g., 'visitors' and not 'ga:visitors') +dimensions : list of str + Un-prefixed dimension variable names +start_date : str/date/datetime +end_date : str/date/datetime, optional + Defaults to today +segment : list of str, optional +filters : list of str, optional +start_index : int, default 1 +max_results : int, default 10000 + If >10000, must specify chunksize or ValueError will be raised""" + +_QUERY_DOC = """ +Construct a google analytics query using given parameters +Metrics and dimensions do not need the 'ga:' prefix + +Parameters +---------- +profile_id : str +%s +""" % _QUERY_PARAMS + +_GA_READER_DOC = """Given query parameters, return a DataFrame with all the +data or an iterator that returns DataFrames containing chunks of the data + +Parameters +---------- +%s +sort : bool/list, default True + Sort output by index or list of columns +chunksize : int, optional + If max_results >10000, specifies the number of rows per iteration +index_col : str/list of str/dict, optional + If unspecified then dimension variables are set as index +parse_dates : bool/list/dict, default True +keep_date_col : boolean, default False +date_parser : optional +na_values : optional +converters : optional +dayfirst : bool, default False + Informs date parsing +account_name : str, optional +account_id : str, optional +property_name : str, optional +property_id : str, optional +profile_name : str, optional +profile_id : str, optional +%%(extras)s +Returns +------- +data : DataFrame or DataFrame yielding iterator +""" % _QUERY_PARAMS + +_AUTH_PARAMS = """secrets : str, optional + File path to the secrets file +scope : str, optional + Authentication scope +token_file_name : str, optional + Path to token storage +redirect : str, optional + Local host redirect if unspecified +""" + + +def reset_token_store(): + """ + Deletes the default token store + """ + auth.reset_default_token_store() + + +@Substitution(extras=_AUTH_PARAMS) +@Appender(_GA_READER_DOC) +def read_ga(metrics, dimensions, start_date, **kwargs): + lst = ['secrets', 'scope', 'token_file_name', 'redirect'] + reader_kwds = dict((p, kwargs.pop(p)) for p in lst if p in kwargs) + reader = GAnalytics(**reader_kwds) + return reader.get_data(metrics=metrics, start_date=start_date, + dimensions=dimensions, **kwargs) + + +class OAuthDataReader(object): + """ + Abstract class for handling OAuth2 authentication using the Google + oauth2client library + """ + def __init__(self, scope, token_file_name, redirect): + """ + Parameters + ---------- + scope : str + Designates the authentication scope + token_file_name : str + Location of cache for authenticated tokens + redirect : str + Redirect URL + """ + self.scope = scope + self.token_store = auth.make_token_store(token_file_name) + self.redirect_url = redirect + + def 
authenticate(self, secrets): + """ + Run the authentication process and return an authorized + http object + + Parameters + ---------- + secrets : str + File name for client secrets + + Notes + ----- + See google documention for format of secrets file + %s + """ % DOC_URL + flow = self._create_flow(secrets) + return auth.authenticate(flow, self.token_store) + + def _create_flow(self, secrets): + """ + Create an authentication flow based on the secrets file + + Parameters + ---------- + secrets : str + File name for client secrets + + Notes + ----- + See google documentation for format of secrets file + %s + """ % DOC_URL + return auth.get_flow(secrets, self.scope, self.redirect_url) + + +class GDataReader(OAuthDataReader): + """ + Abstract class for reading data from google APIs using OAuth2 + Subclasses must implement create_query method + """ + def __init__(self, scope=auth.DEFAULT_SCOPE, + token_file_name=auth.DEFAULT_TOKEN_FILE, + redirect=NO_CALLBACK, secrets=auth.DEFAULT_SECRETS): + super(GDataReader, self).__init__(scope, token_file_name, redirect) + self._service = self._init_service(secrets) + + @property + def service(self): + """The authenticated request service object""" + return self._service + + def _init_service(self, secrets): + """ + Build an authenticated google api request service using the given + secrets file + """ + http = self.authenticate(secrets) + return auth.init_service(http) + + def get_account(self, name=None, id=None, **kwargs): + """ Retrieve an account that matches the name, id, or some account + attribute specified in **kwargs + + Parameters + ---------- + name : str, optional + id : str, optional + """ + accounts = self.service.management().accounts().list().execute() + return _get_match(accounts, name, id, **kwargs) + + def get_web_property(self, account_id=None, name=None, id=None, **kwargs): + """ + Retrieve a web property given and account and property name, id, or + custom attribute + + Parameters + ---------- + account_id : str, optional + name : str, optional + id : str, optional + """ + prop_store = self.service.management().webproperties() + kwds = {} + if account_id is not None: + kwds['accountId'] = account_id + prop_for_acct = prop_store.list(**kwds).execute() + return _get_match(prop_for_acct, name, id, **kwargs) + + def get_profile(self, account_id=None, web_property_id=None, name=None, + id=None, **kwargs): + + """ + Retrieve the right profile for the given account, web property, and + profile attribute (name, id, or arbitrary parameter in kwargs) + + Parameters + ---------- + account_id : str, optional + web_property_id : str, optional + name : str, optional + id : str, optional + """ + profile_store = self.service.management().profiles() + kwds = {} + if account_id is not None: + kwds['accountId'] = account_id + if web_property_id is not None: + kwds['webPropertyId'] = web_property_id + profiles = profile_store.list(**kwds).execute() + return _get_match(profiles, name, id, **kwargs) + + def create_query(self, *args, **kwargs): + raise NotImplementedError() + + @Substitution(extras='') + @Appender(_GA_READER_DOC) + def get_data(self, metrics, start_date, end_date=None, + dimensions=None, segment=None, filters=None, start_index=1, + max_results=10000, index_col=None, parse_dates=True, + keep_date_col=False, date_parser=None, na_values=None, + converters=None, sort=True, dayfirst=False, + account_name=None, account_id=None, property_name=None, + property_id=None, profile_name=None, profile_id=None, + chunksize=None): + if chunksize is None 
and max_results > 10000: + raise ValueError('Google API returns maximum of 10,000 rows, ' + 'please set chunksize') + + account = self.get_account(account_name, account_id) + web_property = self.get_web_property(account.get('id'), property_name, + property_id) + profile = self.get_profile(account.get('id'), web_property.get('id'), + profile_name, profile_id) + + profile_id = profile.get('id') + + if index_col is None and dimensions is not None: + if isinstance(dimensions, compat.string_types): + dimensions = [dimensions] + index_col = _clean_index(list(dimensions), parse_dates) + + def _read(start, result_size): + query = self.create_query(profile_id, metrics, start_date, + end_date=end_date, dimensions=dimensions, + segment=segment, filters=filters, + start_index=start, + max_results=result_size) + + try: + rs = query.execute() + rows = rs.get('rows', []) + col_info = rs.get('columnHeaders', []) + return self._parse_data(rows, col_info, index_col, + parse_dates=parse_dates, + keep_date_col=keep_date_col, + date_parser=date_parser, + dayfirst=dayfirst, + na_values=na_values, + converters=converters, sort=sort) + except HttpError as inst: + raise ValueError('Google API error %s: %s' % (inst.resp.status, + inst._get_reason())) + + if chunksize is None: + return _read(start_index, max_results) + + def iterator(): + curr_start = start_index + + while curr_start < max_results: + yield _read(curr_start, chunksize) + curr_start += chunksize + return iterator() + + def _parse_data(self, rows, col_info, index_col, parse_dates=True, + keep_date_col=False, date_parser=None, dayfirst=False, + na_values=None, converters=None, sort=True): + # TODO use returned column types + col_names = _get_col_names(col_info) + df = psr._read(rows, dict(index_col=index_col, parse_dates=parse_dates, + date_parser=date_parser, dayfirst=dayfirst, + na_values=na_values, + keep_date_col=keep_date_col, + converters=converters, + header=None, names=col_names)) + + if isinstance(sort, bool) and sort: + return df.sort_index() + elif isinstance(sort, (compat.string_types, list, tuple, np.ndarray)): + return df.sort_index(by=sort) + + return df + + +class GAnalytics(GDataReader): + + @Appender(_QUERY_DOC) + def create_query(self, profile_id, metrics, start_date, end_date=None, + dimensions=None, segment=None, filters=None, + start_index=None, max_results=10000, **kwargs): + qry = format_query(profile_id, metrics, start_date, end_date=end_date, + dimensions=dimensions, segment=segment, + filters=filters, start_index=start_index, + max_results=max_results, **kwargs) + try: + return self.service.data().ga().get(**qry) + except TypeError as error: + raise ValueError('Error making query: %s' % error) + + +def format_query(ids, metrics, start_date, end_date=None, dimensions=None, + segment=None, filters=None, sort=None, start_index=None, + max_results=10000, **kwargs): + if isinstance(metrics, compat.string_types): + metrics = [metrics] + met = ','.join(['ga:%s' % x for x in metrics]) + + start_date = pd.to_datetime(start_date).strftime('%Y-%m-%d') + if end_date is None: + end_date = datetime.today() + end_date = pd.to_datetime(end_date).strftime('%Y-%m-%d') + + qry = dict(ids='ga:%s' % str(ids), + metrics=met, + start_date=start_date, + end_date=end_date) + qry.update(kwargs) + + names = ['dimensions', 'filters', 'sort'] + lst = [dimensions, filters, sort] + [_maybe_add_arg(qry, n, d) for n, d in zip(names, lst)] + + if isinstance(segment, compat.string_types): + if re.match("^[a-zA-Z0-9\-\_]+$", segment): + _maybe_add_arg(qry, 
'segment', segment, 'gaid:') + else: + _maybe_add_arg(qry, 'segment', segment, 'dynamic::ga') + elif isinstance(segment, int): + _maybe_add_arg(qry, 'segment', segment, 'gaid:') + elif segment: + raise ValueError("segment must be string for dynamic and int ID") + + if start_index is not None: + qry['start_index'] = str(start_index) + + if max_results is not None: + qry['max_results'] = str(max_results) + + return qry + + +def _maybe_add_arg(query, field, data, prefix='ga'): + if data is not None: + if isinstance(data, (compat.string_types, int)): + data = [data] + data = ','.join(['%s:%s' % (prefix, x) for x in data]) + query[field] = data + + +def _get_match(obj_store, name, id, **kwargs): + key, val = None, None + if len(kwargs) > 0: + key = list(kwargs.keys())[0] + val = list(kwargs.values())[0] + + if name is None and id is None and key is None: + return obj_store.get('items')[0] + + name_ok = lambda item: name is not None and item.get('name') == name + id_ok = lambda item: id is not None and item.get('id') == id + key_ok = lambda item: key is not None and item.get(key) == val + + match = None + if obj_store.get('items'): + # TODO look up gapi for faster lookup + for item in obj_store.get('items'): + if name_ok(item) or id_ok(item) or key_ok(item): + return item + + +def _clean_index(index_dims, parse_dates): + _should_add = lambda lst: pd.Index(lst).isin(index_dims).all() + to_remove = [] + to_add = [] + + if isinstance(parse_dates, (list, tuple, np.ndarray)): + for lst in parse_dates: + if isinstance(lst, (list, tuple, np.ndarray)): + if _should_add(lst): + to_add.append('_'.join(lst)) + to_remove.extend(lst) + elif isinstance(parse_dates, dict): + for name, lst in compat.iteritems(parse_dates): + if isinstance(lst, (list, tuple, np.ndarray)): + if _should_add(lst): + to_add.append(name) + to_remove.extend(lst) + + index_dims = pd.Index(index_dims) + to_remove = pd.Index(set(to_remove)) + to_add = pd.Index(set(to_add)) + + return index_dims - to_remove + to_add + + +def _get_col_names(header_info): + return [x['name'][3:] for x in header_info] + + +def _get_column_types(header_info): + return [(x['name'][3:], x['columnType']) for x in header_info] + + +def _get_dim_names(header_info): + return [x['name'][3:] for x in header_info + if x['columnType'] == u('DIMENSION')] + + +def _get_met_names(header_info): + return [x['name'][3:] for x in header_info + if x['columnType'] == u('METRIC')] + + +def _get_data_types(header_info): + return [(x['name'][3:], TYPE_MAP.get(x['dataType'], object)) + for x in header_info] diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py new file mode 100644 index 00000000..76848a62 --- /dev/null +++ b/pandas/io/gbq.py @@ -0,0 +1,435 @@ +from datetime import datetime +import json +import logging +import sys +from time import sleep +import uuid + +import numpy as np +import pkg_resources + +from distutils.version import LooseVersion +from pandas import compat +from pandas.core.api import DataFrame +from pandas.tools.merge import concat +from pandas.core.common import PandasError + + +_GOOGLE_API_CLIENT_INSTALLED = False +_GOOGLE_API_CLIENT_VALID_VERSION = False +_GOOGLE_FLAGS_INSTALLED = False +_GOOGLE_FLAGS_VALID_VERSION = False +_HTTPLIB2_INSTALLED = False + +if not compat.PY3: + + try: + from apiclient.discovery import build + from apiclient.http import MediaFileUpload + from apiclient.errors import HttpError + + from oauth2client.client import OAuth2WebServerFlow + from oauth2client.client import AccessTokenRefreshError + from oauth2client.client import 
flow_from_clientsecrets
+        from oauth2client.file import Storage
+        from oauth2client.tools import run
+        _GOOGLE_API_CLIENT_INSTALLED = True
+        _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version
+
+        if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= LooseVersion('1.2.0'):
+            _GOOGLE_API_CLIENT_VALID_VERSION = True
+
+    except ImportError:
+        _GOOGLE_API_CLIENT_INSTALLED = False
+
+
+    try:
+        import gflags as flags
+        _GOOGLE_FLAGS_INSTALLED = True
+
+        _GOOGLE_FLAGS_VERSION = pkg_resources.get_distribution('python-gflags').version
+
+        if LooseVersion(_GOOGLE_FLAGS_VERSION) >= LooseVersion('2.0.0'):
+            _GOOGLE_FLAGS_VALID_VERSION = True
+
+    except ImportError:
+        _GOOGLE_FLAGS_INSTALLED = False
+
+    try:
+        import httplib2
+        _HTTPLIB2_INSTALLED = True
+    except ImportError:
+        _HTTPLIB2_INSTALLED = False
+
+
+logger = logging.getLogger('pandas.io.gbq')
+logger.setLevel(logging.ERROR)
+
+class InvalidPageToken(PandasError, IOError):
+    """
+    Raised when Google BigQuery fails to return,
+    or returns a duplicate page token.
+    """
+    pass
+
+class InvalidQueryException(PandasError, IOError):
+    """
+    Raised when a malformed query is given to read_gbq.
+    """
+    pass
+
+class AccessDeniedException(PandasError, IOError):
+    """
+    Raised when invalid credentials are provided, or tokens have expired.
+    """
+    pass
+
+class NotFoundException(PandasError, IOError):
+    """
+    Raised when the project_id/table provided in the query could not be found.
+    """
+    pass
+
+class TermsOfServiceNotAcceptedException(PandasError, IOError):
+    """
+    Raised when the terms of service were not accepted or have been unaccepted.
+    """
+    pass
+
+class UnknownGBQException(PandasError, IOError):
+    """
+    Raised when an unrecognized Google API Error occurs.
+    """
+    pass
+
+
+class InvalidColumnOrder(PandasError, IOError):
+    """
+    Raised when the provided column order for output
+    results DataFrame does not match the schema
+    returned by BigQuery.
+ """ + pass + +class GbqConnector: + def __init__(self, project_id, reauth=False): + self.project_id = project_id + self.reauth = reauth + self.credentials = self.get_credentials() + self.service = self.get_service(self.credentials) + + def get_credentials(self): + flow = OAuth2WebServerFlow(client_id='495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd.apps.googleusercontent.com', + client_secret='kOc9wMptUtxkcIFbtZCcrEAc', + scope='https://www.googleapis.com/auth/bigquery', + redirect_uri='urn:ietf:wg:oauth:2.0:oob') + + storage = Storage('bigquery_credentials.dat') + credentials = storage.get() + + if credentials is None or credentials.invalid or self.reauth: + credentials = run(flow, storage) + + return credentials + + def get_service(self, credentials): + http = httplib2.Http() + http = credentials.authorize(http) + bigquery_service = build('bigquery', 'v2', http=http) + + return bigquery_service + + def run_query(self, query): + job_collection = self.service.jobs() + job_data = { + 'configuration': { + 'query': { + 'query': query + #'allowLargeResults', 'createDisposition', 'preserveNulls', destinationTable, useQueryCache + } + } + } + + try: + query_reply = job_collection.insert(projectId=self.project_id, + body=job_data).execute() + status = query_reply['status'] + except AccessTokenRefreshError: + raise AccessDeniedException("The credentials have been revoked or expired, please re-run" + "the application to re-authorize") + except HttpError as ex: + status = json.loads(ex.content)['error'] + + + errors = status.get('errors', None) + + if errors: + reasons = [error['reason'] for error in errors] + if 'accessDenied' in reasons: + raise AccessDeniedException + if 'invalidQuery' in reasons: + raise InvalidQueryException + if 'notFound' in reasons: + raise NotFoundException + if 'termsOfServiceNotAccepted' in reasons: + raise TermsOfServiceNotAcceptedException + else: + raise UnknownGBQException(errors) + + job_reference = query_reply['jobReference'] + + while(not 'jobComplete' in query_reply): + print('Job not yet complete...') + query_reply = job_collection.getQueryResults( + projectId=job_reference['projectId'], + jobId=job_reference['jobId']).execute() + + total_rows = int(query_reply['totalRows']) + result_pages = list() + seen_page_tokens = list() + current_row = 0 + #Only read schema on first page + schema = query_reply['schema'] + + # Loop through each page of data + while('rows' in query_reply and current_row < total_rows): + page = query_reply['rows'] + result_pages.append(page) + current_row += len(page) + page_token = query_reply.get('pageToken', None) + + if not page_token and current_row < total_rows: + raise InvalidPageToken("Required pageToken was missing. 
Recieved {0} of {1} rows".format(current_row,total_rows)) + + elif page_token in seen_page_tokens: + raise InvalidPageToken("A duplicate pageToken was returned") + + seen_page_tokens.append(page_token) + query_reply = job_collection.getQueryResults( + projectId = job_reference['projectId'], + jobId = job_reference['jobId'], + pageToken = page_token).execute() + + if (current_row < total_rows): + raise InvalidPageToken() + + return schema, result_pages + + def load_data(self, dataframe, dataset_id, table_id, chunksize, verbose): + job_id = uuid.uuid4().hex + rows = [] + remaining_rows = len(dataframe) + + if verbose: + total_rows = remaining_rows + sys.stdout.write("\n\n") + sys.stdout.flush() + + for index, row in dataframe.reset_index(drop=True).iterrows(): + row_dict = dict() + row_dict['json'] = json.loads(row.to_json(force_ascii = False, + date_unit = 's', + date_format = 'iso')) + row_dict['insertId'] = job_id + str(index) + rows.append(row_dict) + remaining_rows -= 1 + + if (len(rows) % chunksize == 0) or (remaining_rows == 0): + if verbose: + sys.stdout.write("\rStreaming Insert is {0}% Complete".format(((total_rows - remaining_rows) * 100) / total_rows)) + sys.stdout.flush() + + body = {'rows': rows} + response = self.service.tabledata().insertAll( + projectId = self.project_id, + datasetId = dataset_id, + tableId = table_id, + body = body).execute() + if 'insertErrors' in response: + raise UnknownGBQException(response) + + sleep(1) # Maintains the inserts "per second" rate per API + rows = [] + + if verbose: + sys.stdout.write("\n") + sys.stdout.flush() + +def _parse_data(schema, rows): + # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing + dtype_map = {'INTEGER': np.dtype(float), + 'FLOAT': np.dtype(float), + 'TIMESTAMP': 'M8[ns]'} # This seems to be buggy without + # nanosecond indicator + + fields = schema['fields'] + col_types = [field['type'] for field in fields] + col_names = [field['name'].encode('ascii', 'ignore') for field in fields] + col_dtypes = [dtype_map.get(field['type'], object) for field in fields] + page_array = np.zeros((len(rows),), + dtype=zip(col_names, col_dtypes)) + + for row_num, raw_row in enumerate(rows): + entries = raw_row.get('f', []) + for col_num, field_type in enumerate(col_types): + field_value = _parse_entry(entries[col_num].get('v', ''), + field_type) + page_array[row_num][col_num] = field_value + + return DataFrame(page_array) + +def _parse_entry(field_value, field_type): + if field_value is None or field_value == 'null': + return None + if field_type == 'INTEGER' or field_type == 'FLOAT': + return float(field_value) + elif field_type == 'TIMESTAMP': + timestamp = datetime.utcfromtimestamp(float(field_value)) + return np.datetime64(timestamp) + elif field_type == 'BOOLEAN': + return field_value == 'true' + return field_value + +def _test_imports(): + _GOOGLE_API_CLIENT_INSTALLED + _GOOGLE_API_CLIENT_VALID_VERSION + _GOOGLE_FLAGS_INSTALLED + _GOOGLE_FLAGS_VALID_VERSION + _HTTPLIB2_INSTALLED + + if compat.PY3: + raise NotImplementedError("Google's libraries do not support Python 3 yet") + + if not _GOOGLE_API_CLIENT_INSTALLED: + raise ImportError('Could not import Google API Client.') + + if not _GOOGLE_FLAGS_INSTALLED: + raise ImportError('Could not import Google Command Line Flags Module.') + + if not _GOOGLE_API_CLIENT_VALID_VERSION: + raise ImportError("pandas requires google-api-python-client >= 1.2.0 for Google " + "BigQuery support, current version " + _GOOGLE_API_CLIENT_VERSION) + + 
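+    # The minimum-version requirements above and below compare dotted
+    # version strings; distutils' LooseVersion gives the intended ordering,
+    # e.g. (made-up version numbers):
+    #
+    #   >>> from distutils.version import LooseVersion
+    #   >>> LooseVersion('1.2.0') >= '1.2.0'
+    #   True
+    #   >>> LooseVersion('1.1.0') >= '1.2.0'
+    #   False
+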
if not _GOOGLE_FLAGS_VALID_VERSION: + raise ImportError("pandas requires python-gflags >= 2.0.0 for Google " + "BigQuery support, current version " + _GOOGLE_FLAGS_VERSION) + + if not _HTTPLIB2_INSTALLED: + raise ImportError("pandas requires httplib2 for Google BigQuery support") + +def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=False): + """Load data from Google BigQuery. + + THIS IS AN EXPERIMENTAL LIBRARY + + The main method a user calls to execute a Query in Google BigQuery and read results + into a pandas DataFrame using the v2 Google API client for Python. Documentation for + the API is available at https://developers.google.com/api-client-library/python/. + Authentication to the Google BigQuery service is via OAuth 2.0 using the product name + 'pandas GBQ'. + + Parameters + ---------- + query : str + SQL-Like Query to return data values + project_id : str + Google BigQuery Account project ID. + index_col : str (optional) + Name of result column to use for index in results DataFrame + col_order : list(str) (optional) + List of BigQuery column names in the desired order for results + DataFrame + reauth : boolean (default False) + Force Google BigQuery to reauthenticate the user. This is useful + if multiple accounts are used. + + Returns + ------- + df: DataFrame + DataFrame representing results of query + + """ + + _test_imports() + + if not project_id: + raise TypeError("Missing required parameter: project_id") + + connector = GbqConnector(project_id, reauth = reauth) + schema, pages = connector.run_query(query) + dataframe_list = [] + while len(pages) > 0: + page = pages.pop() + dataframe_list.append(_parse_data(schema, page)) + + final_df = concat(dataframe_list, ignore_index = True) + + # Reindex the DataFrame on the provided column + if index_col is not None: + if index_col in final_df.columns: + final_df.set_index(index_col, inplace = True) + else: + raise InvalidColumnOrder( + 'Index column "{0}" does not exist in DataFrame.' + .format(index_col) + ) + + # Change the order of columns in the DataFrame based on provided list + if col_order is not None: + if sorted(col_order) == sorted(final_df.columns): + final_df = final_df[col_order] + else: + raise InvalidColumnOrder( + 'Column order does not match this DataFrame.' + ) + + # Downcast floats to integers and objects to booleans + # if there are no NaN's. This is presently due to a + # limitation of numpy in handling missing data. + final_df._data = final_df._data.downcast(dtypes='infer') + return final_df + +def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000, + verbose=True, reauth=False): + """Write a DataFrame to a Google BigQuery table. + + THIS IS AN EXPERIMENTAL LIBRARY + + If the table exists, the dataframe will be written to the table using + the defined table schema and column types. For simplicity, this method + uses the Google BigQuery streaming API. The to_gbq method chunks data + into a default chunk size of 10,000. Failures return the complete error + response which can be quite long depending on the size of the insert. + There are several important limitations of the Google streaming API + which are detailed at: + https://developers.google.com/bigquery/streaming-data-into-bigquery. + + Parameters + ---------- + dataframe : DataFrame + DataFrame to be written + destination_table : string + Name of table to be written, in the form 'dataset.tablename' + project_id : str + Google BigQuery Account project ID. 
+ chunksize : int (default 10000) + Number of rows to be inserted in each chunk from the dataframe. + verbose : boolean (default True) + Show percentage complete + reauth : boolean (default False) + Force Google BigQuery to reauthenticate the user. This is useful + if multiple accounts are used. + + """ + _test_imports() + + if not project_id: + raise TypeError("Missing required parameter: project_id") + + if not '.' in destination_table: + raise NotFoundException("Invalid Table Name. Should be of the form 'datasetId.tableId' ") + + connector = GbqConnector(project_id, reauth = reauth) + dataset_id, table_id = destination_table.rsplit('.',1) + + connector.load_data(dataframe, dataset_id, table_id, chunksize, verbose) diff --git a/pandas/io/html.py b/pandas/io/html.py new file mode 100644 index 00000000..5ea6ca36 --- /dev/null +++ b/pandas/io/html.py @@ -0,0 +1,851 @@ +""":mod:`pandas.io.html` is a module containing functionality for dealing with +HTML IO. + +""" + +import os +import re +import numbers +import collections +import warnings + +from distutils.version import LooseVersion + +import numpy as np + +from pandas.io.common import _is_url, urlopen, parse_url +from pandas.io.parsers import TextParser +from pandas.compat import (lrange, lmap, u, string_types, iteritems, text_type, + raise_with_traceback) +from pandas.core import common as com +from pandas import Series + + +try: + import bs4 +except ImportError: + _HAS_BS4 = False +else: + _HAS_BS4 = True + + +try: + import lxml +except ImportError: + _HAS_LXML = False +else: + _HAS_LXML = True + + +try: + import html5lib +except ImportError: + _HAS_HTML5LIB = False +else: + _HAS_HTML5LIB = True + + +############# +# READ HTML # +############# +_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}') + + +def _remove_whitespace(s, regex=_RE_WHITESPACE): + """Replace extra whitespace inside of a string with a single space. + + Parameters + ---------- + s : str or unicode + The string from which to remove extra whitespace. + + regex : regex + The regular expression to use to remove extra whitespace. + + Returns + ------- + subd : str or unicode + `s` with all extra whitespace replaced with a single space. + """ + return regex.sub(' ', s.strip()) + + +def _get_skiprows(skiprows): + """Get an iterator given an integer, slice or container. + + Parameters + ---------- + skiprows : int, slice, container + The iterator to use to skip rows; can also be a slice. + + Raises + ------ + TypeError + * If `skiprows` is not a slice, integer, or Container + + Returns + ------- + it : iterable + A proper iterator to use to skip rows of a DataFrame. + """ + if isinstance(skiprows, slice): + return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) + elif isinstance(skiprows, numbers.Integral) or com.is_list_like(skiprows): + return skiprows + elif skiprows is None: + return 0 + raise TypeError('%r is not a valid type for skipping rows' % + type(skiprows).__name__) + + +def _read(obj): + """Try to read from a url, file or string. 
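+
+    For example, a raw HTML string such as '<table>...</table>', an open
+    file object, or a URL like 'http://example.com/page.html' (a placeholder
+    address) can all be passed here.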
+ + Parameters + ---------- + obj : str, unicode, or file-like + + Returns + ------- + raw_text : str + """ + if _is_url(obj): + with urlopen(obj) as url: + text = url.read() + elif hasattr(obj, 'read'): + text = obj.read() + elif isinstance(obj, string_types): + text = obj + try: + if os.path.isfile(text): + with open(text, 'rb') as f: + return f.read() + except TypeError: + pass + else: + raise TypeError("Cannot read object of type %r" % type(obj).__name__) + return text + + +class _HtmlFrameParser(object): + """Base class for parsers that parse HTML into DataFrames. + + Parameters + ---------- + io : str or file-like + This can be either a string of raw HTML, a valid URL using the HTTP, + FTP, or FILE protocols or a file-like object. + + match : str or regex + The text to match in the document. + + attrs : dict + List of HTML element attributes to match. + + Attributes + ---------- + io : str or file-like + raw HTML, URL, or file-like object + + match : regex + The text to match in the raw HTML + + attrs : dict-like + A dictionary of valid table attributes to use to search for table + elements. + + Notes + ----- + To subclass this class effectively you must override the following methods: + * :func:`_build_doc` + * :func:`_text_getter` + * :func:`_parse_td` + * :func:`_parse_tables` + * :func:`_parse_tr` + * :func:`_parse_thead` + * :func:`_parse_tbody` + * :func:`_parse_tfoot` + See each method's respective documentation for details on their + functionality. + """ + def __init__(self, io, match, attrs, encoding): + self.io = io + self.match = match + self.attrs = attrs + self.encoding = encoding + + def parse_tables(self): + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._build_table(table) for table in tables) + + def _parse_raw_data(self, rows): + """Parse the raw data into a list of lists. + + Parameters + ---------- + rows : iterable of node-like + A list of row elements. + + text_getter : callable + A callable that gets the text from an individual node. This must be + defined by subclasses. + + column_finder : callable + A callable that takes a row node as input and returns a list of the + column node in that row. This must be defined by subclasses. + + Returns + ------- + data : list of list of strings + """ + data = [[_remove_whitespace(self._text_getter(col)) for col in + self._parse_td(row)] for row in rows] + return data + + def _text_getter(self, obj): + """Return the text of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + text : str or unicode + The text from an individual DOM node. + """ + raise NotImplementedError + + def _parse_td(self, obj): + """Return the td elements from a row element. + + Parameters + ---------- + obj : node-like + + Returns + ------- + columns : list of node-like + These are the elements of each row, i.e., the columns. + """ + raise NotImplementedError + + def _parse_tables(self, doc, match, attrs): + """Return all tables from the parsed DOM. + + Parameters + ---------- + doc : tree-like + The DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + mutliple tables on a page. + + Raises + ------ + ValueError + * If `match` does not match any text in the document. + + Returns + ------- + tables : list of node-like + A list of
<table> elements to be parsed into raw data.
+        """
+        raise NotImplementedError
+
+    def _parse_tr(self, table):
+        """Return the list of row elements from the parsed table element.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        rows : list of node-like
+            A list of row elements of a table, usually <tr> or <th>
+            elements.
+        """
+        raise NotImplementedError
+
+    def _parse_thead(self, table):
+        """Return the header of a table.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        thead : node-like
+            A <thead>...</thead> element.
+        """
+        raise NotImplementedError
+
+    def _parse_tbody(self, table):
+        """Return the body of the table.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        tbody : node-like
+            A <tbody>...</tbody> element.
+        """
+        raise NotImplementedError
+
+    def _parse_tfoot(self, table):
+        """Return the footer of the table if any.
+
+        Parameters
+        ----------
+        table : node-like
+            A table element that contains row elements.
+
+        Returns
+        -------
+        tfoot : node-like
+            A <tfoot>...</tfoot> element.
+        """
+        raise NotImplementedError
+
+    def _build_doc(self):
+        """Return a tree-like object that can be used to iterate over the DOM.
+
+        Returns
+        -------
+        obj : tree-like
+        """
+        raise NotImplementedError
+
+    def _build_table(self, table):
+        header = self._parse_raw_thead(table)
+        body = self._parse_raw_tbody(table)
+        footer = self._parse_raw_tfoot(table)
+        return header, body, footer
+
+    def _parse_raw_thead(self, table):
+        thead = self._parse_thead(table)
+        res = []
+        if thead:
+            res = lmap(self._text_getter, self._parse_th(thead[0]))
+        return np.array(res).squeeze() if res and len(res) == 1 else res
+
+    def _parse_raw_tfoot(self, table):
+        tfoot = self._parse_tfoot(table)
+        res = []
+        if tfoot:
+            res = lmap(self._text_getter, self._parse_td(tfoot[0]))
+        return np.array(res).squeeze() if res and len(res) == 1 else res
+
+    def _parse_raw_tbody(self, table):
+        tbody = self._parse_tbody(table)
+
+        try:
+            res = self._parse_tr(tbody[0])
+        except IndexError:
+            res = self._parse_tr(table)
+        return self._parse_raw_data(res)
+
+
+class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
+    """HTML to DataFrame parser that uses BeautifulSoup under the hood.
+
+    See Also
+    --------
+    pandas.io.html._HtmlFrameParser
+    pandas.io.html._LxmlFrameParser
+
+    Notes
+    -----
+    Documentation strings for this class are in the base class
+    :class:`pandas.io.html._HtmlFrameParser`.
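+
+    This backend is selected through :func:`read_html`'s ``flavor`` argument,
+    with ``'bs4'`` and ``'html5lib'`` both mapping here, e.g. (the URL is a
+    placeholder)::
+
+        read_html('http://example.com/tables.html', flavor='bs4')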
+ """ + def __init__(self, *args, **kwargs): + super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, + **kwargs) + from bs4 import SoupStrainer + self._strainer = SoupStrainer('table') + + def _text_getter(self, obj): + return obj.text + + def _parse_td(self, row): + return row.find_all(('td', 'th')) + + def _parse_tr(self, element): + return element.find_all('tr') + + def _parse_th(self, element): + return element.find_all('th') + + def _parse_thead(self, table): + return table.find_all('thead') + + def _parse_tbody(self, table): + return table.find_all('tbody') + + def _parse_tfoot(self, table): + return table.find_all('tfoot') + + def _parse_tables(self, doc, match, attrs): + element_name = self._strainer.name + tables = doc.find_all(element_name, attrs=attrs) + + if not tables: + raise ValueError('No tables found') + + result = [] + unique_tables = set() + + for table in tables: + if (table not in unique_tables and + table.find(text=match) is not None): + result.append(table) + unique_tables.add(table) + + if not result: + raise ValueError("No tables found matching pattern %r" % + match.pattern) + return result + + def _setup_build_doc(self): + raw_text = _read(self.io) + if not raw_text: + raise ValueError('No text parsed from document: %s' % self.io) + return raw_text + + def _build_doc(self): + from bs4 import BeautifulSoup + return BeautifulSoup(self._setup_build_doc(), features='html5lib', + from_encoding=self.encoding) + + +def _build_xpath_expr(attrs): + """Build an xpath expression to simulate bs4's ability to pass in kwargs to + search for attributes when using the lxml parser. + + Parameters + ---------- + attrs : dict + A dict of HTML attributes. These are NOT checked for validity. + + Returns + ------- + expr : unicode + An XPath expression that checks for the given HTML attributes. + """ + # give class attribute as class_ because class is a python keyword + if 'class_' in attrs: + attrs['class'] = attrs.pop('class_') + + s = [u("@%s=%r") % (k, v) for k, v in iteritems(attrs)] + return u('[%s]') % ' and '.join(s) + + +_re_namespace = {'re': 'http://exslt.org/regular-expressions'} +_valid_schemes = 'http', 'file', 'ftp' + + +class _LxmlFrameParser(_HtmlFrameParser): + """HTML to DataFrame parser that uses lxml under the hood. + + Warning + ------- + This parser can only handle HTTP, FTP, and FILE urls. + + See Also + -------- + _HtmlFrameParser + _BeautifulSoupLxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`_HtmlFrameParser`. + """ + def __init__(self, *args, **kwargs): + super(_LxmlFrameParser, self).__init__(*args, **kwargs) + + def _text_getter(self, obj): + return obj.text_content() + + def _parse_td(self, row): + return row.xpath('.//td|.//th') + + def _parse_tr(self, table): + expr = './/tr[normalize-space()]' + return table.xpath(expr) + + def _parse_tables(self, doc, match, kwargs): + pattern = match.pattern + + # 1. check all descendants for the given pattern and only search tables + # 2. 
go up the tree until we find a table + query = '//table//*[re:test(text(), %r)]/ancestor::table' + xpath_expr = u(query) % pattern + + # if any table attributes were given build an xpath expression to + # search for them + if kwargs: + xpath_expr += _build_xpath_expr(kwargs) + + tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + + if not tables: + raise ValueError("No tables found matching regex %r" % pattern) + return tables + + def _build_doc(self): + """ + Raises + ------ + ValueError + * If a URL that lxml cannot parse is passed. + + Exception + * Any other ``Exception`` thrown. For example, trying to parse a + URL that is syntactically correct on a machine with no internet + connection will fail. + + See Also + -------- + pandas.io.html._HtmlFrameParser._build_doc + """ + from lxml.html import parse, fromstring, HTMLParser + from lxml.etree import XMLSyntaxError + + parser = HTMLParser(recover=False, encoding=self.encoding) + + try: + # try to parse the input in the simplest way + r = parse(self.io, parser=parser) + + try: + r = r.getroot() + except AttributeError: + pass + except (UnicodeDecodeError, IOError): + # if the input is a blob of html goop + if not _is_url(self.io): + r = fromstring(self.io, parser=parser) + + try: + r = r.getroot() + except AttributeError: + pass + else: + # not a url + scheme = parse_url(self.io).scheme + if scheme not in _valid_schemes: + # lxml can't parse it + msg = ('%r is not a valid url scheme, valid schemes are ' + '%s') % (scheme, _valid_schemes) + raise ValueError(msg) + else: + # something else happened: maybe a faulty connection + raise + else: + if not hasattr(r, 'text_content'): + raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + return r + + def _parse_tbody(self, table): + return table.xpath('.//tbody') + + def _parse_thead(self, table): + return table.xpath('.//thead') + + def _parse_tfoot(self, table): + return table.xpath('.//tfoot') + + def _parse_raw_thead(self, table): + expr = './/thead//th' + return [_remove_whitespace(x.text_content()) for x in + table.xpath(expr)] + + def _parse_raw_tfoot(self, table): + expr = './/tfoot//th' + return [_remove_whitespace(x.text_content()) for x in + table.xpath(expr)] + + +def _expand_elements(body): + lens = Series(lmap(len, body)) + lens_max = lens.max() + not_max = lens[lens != lens_max] + + empty = [''] + for ind, length in iteritems(not_max): + body[ind] += empty * (lens_max - length) + + +def _data_to_frame(data, header, index_col, skiprows, infer_types, + parse_dates, tupleize_cols, thousands): + head, body, _ = data # _ is footer which is rarely used: ignore for now + + if head: + body = [head] + body + + if header is None: # special case when a table has + + + + + + +
<th>
elements + header = 0 + + # fill out elements of body that are "ragged" + _expand_elements(body) + + tp = TextParser(body, header=header, index_col=index_col, + skiprows=_get_skiprows(skiprows), + parse_dates=parse_dates, tupleize_cols=tupleize_cols, + thousands=thousands) + df = tp.read() + + if infer_types: # TODO: rm this code so infer_types has no effect in 0.14 + df = df.convert_objects(convert_dates='coerce') + else: + df = df.applymap(text_type) + return df + + +_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser, + 'html5lib': _BeautifulSoupHtml5LibFrameParser, + 'bs4': _BeautifulSoupHtml5LibFrameParser} + + +def _parser_dispatch(flavor): + """Choose the parser based on the input flavor. + + Parameters + ---------- + flavor : str + The type of parser to use. This must be a valid backend. + + Returns + ------- + cls : _HtmlFrameParser subclass + The parser class based on the requested input flavor. + + Raises + ------ + ValueError + * If `flavor` is not a valid backend. + ImportError + * If you do not have the requested `flavor` + """ + valid_parsers = list(_valid_parsers.keys()) + if flavor not in valid_parsers: + raise ValueError('%r is not a valid flavor, valid flavors are %s' % + (flavor, valid_parsers)) + + if flavor in ('bs4', 'html5lib'): + if not _HAS_HTML5LIB: + raise ImportError("html5lib not found please install it") + if not _HAS_BS4: + raise ImportError("bs4 not found please install it") + if bs4.__version__ == LooseVersion('4.2.0'): + raise ValueError("You're using a version" + " of BeautifulSoup4 (4.2.0) that has been" + " known to cause problems on certain" + " operating systems such as Debian. " + "Please install a version of" + " BeautifulSoup4 != 4.2.0, both earlier" + " and later releases will work.") + else: + if not _HAS_LXML: + raise ImportError("lxml not found please install it") + return _valid_parsers[flavor] + + +def _print_as_set(s): + return '{%s}' % ', '.join([com.pprint_thing(el) for el in s]) + + +def _validate_flavor(flavor): + if flavor is None: + flavor = 'lxml', 'bs4' + elif isinstance(flavor, string_types): + flavor = flavor, + elif isinstance(flavor, collections.Iterable): + if not all(isinstance(flav, string_types) for flav in flavor): + raise TypeError('Object of type %r is not an iterable of strings' % + type(flavor).__name__) + else: + fmt = '{0!r}' if isinstance(flavor, string_types) else '{0}' + fmt += ' is not a valid flavor' + raise ValueError(fmt.format(flavor)) + + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError('%s is not a valid set of flavors, valid flavors are ' + '%s' % (_print_as_set(flavor_set), + _print_as_set(valid_flavors))) + return flavor + + +def _parse(flavor, io, match, header, index_col, skiprows, infer_types, + parse_dates, tupleize_cols, thousands, attrs, encoding): + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here + + # hack around python 3 deleting the exception variable + retained = None + for flav in flavor: + parser = _parser_dispatch(flav) + p = parser(io, compiled_match, attrs, encoding) + + try: + tables = p.parse_tables() + except Exception as caught: + retained = caught + else: + break + else: + raise_with_traceback(retained) + + return [_data_to_frame(table, header, index_col, skiprows, infer_types, + parse_dates, tupleize_cols, thousands) + for table in tables] + + +def read_html(io, match='.+', flavor=None, header=None, index_col=None, 
+ skiprows=None, infer_types=None, attrs=None, parse_dates=False, + tupleize_cols=False, thousands=',', encoding=None): + r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. + + Parameters + ---------- + io : str or file-like + A URL, a file-like object, or a raw string containing HTML. Note that + lxml only accepts the http, ftp and file url protocols. If you have a + URL that starts with ``'https'`` you might try removing the ``'s'``. + + match : str or compiled regular expression, optional + The set of tables containing text matching this regex or string will be + returned. Unless the HTML is extremely simple you will probably need to + pass a non-empty string here. Defaults to '.+' (match any non-empty + string). The default value will return all tables contained on a page. + This value is converted to a regular expression so that there is + consistent behavior between Beautiful Soup and lxml. + + flavor : str or None, container of strings + The parsing engine to use. 'bs4' and 'html5lib' are synonymous with + each other, they are both there for backwards compatibility. The + default of ``None`` tries to use ``lxml`` to parse and if that fails it + falls back on ``bs4`` + ``html5lib``. + + header : int or list-like or None, optional + The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to + make the columns headers. + + index_col : int or list-like or None, optional + The column (or list of columns) to use to create the index. + + skiprows : int or list-like or slice or None, optional + 0-based. Number of rows to skip after parsing the column integer. If a + sequence of integers or a slice is given, will skip the rows indexed by + that sequence. Note that a single element sequence means 'skip the nth + row' whereas an integer means 'skip n rows'. + + infer_types : bool, optional + This option is deprecated in 0.13, an will have no effect in 0.14. It + defaults to ``True``. + + attrs : dict or None, optional + This is a dictionary of attributes that you can pass to use to identify + the table in the HTML. These are not checked for validity before being + passed to lxml or Beautiful Soup. However, these attributes must be + valid HTML table attributes to work correctly. For example, :: + + attrs = {'id': 'table'} + + is a valid attribute dictionary because the 'id' HTML tag attribute is + a valid HTML attribute for *any* HTML tag as per `this document + `__. :: + + attrs = {'asdf': 'table'} + + is *not* a valid attribute dictionary because 'asdf' is not a valid + HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 + table attributes can be found `here + `__. A + working draft of the HTML 5 spec can be found `here + `__. It contains the + latest information on table attributes for the modern web. + + parse_dates : bool, optional + See :func:`~pandas.io.parsers.read_csv` for more details. In 0.13, this + parameter can sometimes interact strangely with ``infer_types``. If you + get a large number of ``NaT`` values in your results, consider passing + ``infer_types=False`` and manually converting types afterwards. + + tupleize_cols : bool, optional + If ``False`` try to parse multiple header rows into a + :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to + ``False``. + + thousands : str, optional + Separator to use to parse thousands. Defaults to ``','``. + + encoding : str or None, optional + The encoding used to decode the web page. 
Defaults to ``None``. ``None``
+        preserves the previous encoding behavior, which depends on the
+        underlying parser library (e.g., the parser library will try to use
+        the encoding provided by the document).
+
+    Returns
+    -------
+    dfs : list of DataFrames
+
+    Notes
+    -----
+    Before using this function you should read the :ref:`gotchas about the
+    HTML parsing libraries <html-gotchas>`.
+
+    Expect to do some cleanup after you call this function. For example, you
+    might need to manually assign column names if the column names are
+    converted to NaN when you pass the `header=0` argument. We try to assume as
+    little as possible about the structure of the table and push the
+    idiosyncrasies of the HTML contained in the table to the user.
+
+    This function searches for ``<table>`` elements and only for ``<tr>``
+    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>
`` + element in the table. ```` stands for "table data". + + Similar to :func:`~pandas.read_csv` the `header` argument is applied + **after** `skiprows` is applied. + + This function will *always* return a list of :class:`DataFrame` *or* + it will fail, e.g., it will *not* return an empty list. + + Examples + -------- + See the :ref:`read_html documentation in the IO section of the docs + ` for some examples of reading in HTML tables. + + See Also + -------- + pandas.io.parsers.read_csv + """ + if infer_types is not None: + warnings.warn("infer_types will have no effect in 0.14", FutureWarning) + else: + infer_types = True # TODO: remove effect of this in 0.14 + + # Type check here. We don't want to parse only to fail because of an + # invalid value of an integer skiprows. + if isinstance(skiprows, numbers.Integral) and skiprows < 0: + raise ValueError('cannot skip rows starting from the end of the ' + 'data (you passed a negative value)') + return _parse(flavor, io, match, header, index_col, skiprows, infer_types, + parse_dates, tupleize_cols, thousands, attrs, encoding) diff --git a/pandas/io/json.py b/pandas/io/json.py new file mode 100644 index 00000000..4ed325df --- /dev/null +++ b/pandas/io/json.py @@ -0,0 +1,756 @@ +# pylint: disable-msg=E1101,W0613,W0603 + +import os +import copy +from collections import defaultdict +import numpy as np + +import pandas.json as _json +from pandas.tslib import iNaT +from pandas.compat import long, u +from pandas import compat, isnull +from pandas import Series, DataFrame, to_datetime +from pandas.io.common import get_filepath_or_buffer +import pandas.core.common as com + +loads = _json.loads +dumps = _json.dumps +### interface to/from ### + + +def to_json(path_or_buf, obj, orient=None, date_format='epoch', + double_precision=10, force_ascii=True, date_unit='ms', + default_handler=None): + + if isinstance(obj, Series): + s = SeriesWriter( + obj, orient=orient, date_format=date_format, + double_precision=double_precision, ensure_ascii=force_ascii, + date_unit=date_unit, default_handler=default_handler).write() + elif isinstance(obj, DataFrame): + s = FrameWriter( + obj, orient=orient, date_format=date_format, + double_precision=double_precision, ensure_ascii=force_ascii, + date_unit=date_unit, default_handler=default_handler).write() + else: + raise NotImplementedError + + if isinstance(path_or_buf, compat.string_types): + with open(path_or_buf, 'w') as fh: + fh.write(s) + elif path_or_buf is None: + return s + else: + path_or_buf.write(s) + + +class Writer(object): + + def __init__(self, obj, orient, date_format, double_precision, + ensure_ascii, date_unit, default_handler=None): + self.obj = obj + + if orient is None: + orient = self._default_orient + + self.orient = orient + self.date_format = date_format + self.double_precision = double_precision + self.ensure_ascii = ensure_ascii + self.date_unit = date_unit + self.default_handler = default_handler + + self.is_copy = None + self._format_axes() + + def _format_axes(self): + raise NotImplementedError + + def write(self): + return dumps( + self.obj, + orient=self.orient, + double_precision=self.double_precision, + ensure_ascii=self.ensure_ascii, + date_unit=self.date_unit, + iso_dates=self.date_format == 'iso', + default_handler=self.default_handler) + + +class SeriesWriter(Writer): + _default_orient = 'index' + + def _format_axes(self): + if not self.obj.index.is_unique and self.orient == 'index': + raise ValueError("Series index must be unique for orient=" + "'%s'" % self.orient) + + +class 
FrameWriter(Writer): + _default_orient = 'columns' + + def _format_axes(self): + """ try to axes if they are datelike """ + if not self.obj.index.is_unique and self.orient in ( + 'index', 'columns'): + raise ValueError("DataFrame index must be unique for orient=" + "'%s'." % self.orient) + if not self.obj.columns.is_unique and self.orient in ( + 'index', 'columns', 'records'): + raise ValueError("DataFrame columns must be unique for orient=" + "'%s'." % self.orient) + + +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, + convert_axes=True, convert_dates=True, keep_default_dates=True, + numpy=False, precise_float=False, date_unit=None): + """ + Convert a JSON string to pandas object + + Parameters + ---------- + filepath_or_buffer : a valid JSON string or file-like + The string could be a URL. Valid URL schemes include http, ftp, s3, and + file. For file URLs, a host is expected. For instance, a local file + could be ``file://localhost/path/to/table.json`` + + orient + + * `Series` + + - default is ``'index'`` + - allowed values are: ``{'split','records','index'}`` + - The Series index must be unique for orient ``'index'``. + + * `DataFrame` + + - default is ``'columns'`` + - allowed values are: {'split','records','index','columns','values'} + - The DataFrame index must be unique for orients 'index' and + 'columns'. + - The DataFrame columns must be unique for orients 'index', + 'columns', and 'records'. + + * The format of the JSON string + + - split : dict like + ``{index -> [index], columns -> [columns], data -> [values]}`` + - records : list like + ``[{column -> value}, ... , {column -> value}]`` + - index : dict like ``{index -> {column -> value}}`` + - columns : dict like ``{column -> {index -> value}}`` + - values : just the values array + + typ : type of object to recover (series or frame), default 'frame' + dtype : boolean or dict, default True + If True, infer dtypes, if a dict of column to dtype, then use those, + if False, then don't infer dtypes at all, applies only to the data. + convert_axes : boolean, default True + Try to convert the axes to the proper dtypes. + convert_dates : boolean, default True + List of columns to parse for dates; If True, then try to parse + datelike columns default is True + keep_default_dates : boolean, default True. + If parsing dates, then parse the default datelike columns + numpy : boolean, default False + Direct decoding to numpy arrays. Supports numeric data only, but + non-numeric column and index labels are supported. Note also that the + JSON ordering MUST be the same for each term if numpy=True. + precise_float : boolean, default False. + Set to enable usage of higher precision (strtod) function when + decoding string to double values. Default (False) is to use fast but + less precise builtin functionality + date_unit : string, default None + The timestamp unit to detect if converting dates. The default behaviour + is to try and detect the correct precision, but if this is not desired + then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, + milliseconds, microseconds or nanoseconds respectively. 
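+
+    As a small illustration, a frame survives a default round trip (the
+    column and index values below are arbitrary)::
+
+        df = DataFrame({'a': [1, 2]}, index=['x', 'y'])
+        s = df.to_json()        # '{"a":{"x":1,"y":2}}'
+        read_json(s)            # an equivalent DataFrame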
+ + Returns + ------- + result : Series or DataFrame + """ + + filepath_or_buffer, _ = get_filepath_or_buffer(path_or_buf) + if isinstance(filepath_or_buffer, compat.string_types): + try: + exists = os.path.exists(filepath_or_buffer) + + # if the filepath is too long will raise here + # 5874 + except (TypeError,ValueError): + exists = False + + if exists: + with open(filepath_or_buffer, 'r') as fh: + json = fh.read() + else: + json = filepath_or_buffer + elif hasattr(filepath_or_buffer, 'read'): + json = filepath_or_buffer.read() + else: + json = filepath_or_buffer + + obj = None + if typ == 'frame': + obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, + keep_default_dates, numpy, precise_float, + date_unit).parse() + + if typ == 'series' or obj is None: + if not isinstance(dtype, bool): + dtype = dict(data=dtype) + obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, + keep_default_dates, numpy, precise_float, + date_unit).parse() + + return obj + + +class Parser(object): + + _STAMP_UNITS = ('s', 'ms', 'us', 'ns') + _MIN_STAMPS = { + 's': long(31536000), + 'ms': long(31536000000), + 'us': long(31536000000000), + 'ns': long(31536000000000000)} + + def __init__(self, json, orient, dtype=True, convert_axes=True, + convert_dates=True, keep_default_dates=False, numpy=False, + precise_float=False, date_unit=None): + self.json = json + + if orient is None: + orient = self._default_orient + + self.orient = orient + self.dtype = dtype + + if orient == "split": + numpy = False + + if date_unit is not None: + date_unit = date_unit.lower() + if date_unit not in self._STAMP_UNITS: + raise ValueError('date_unit must be one of %s' % + (self._STAMP_UNITS,)) + self.min_stamp = self._MIN_STAMPS[date_unit] + else: + self.min_stamp = self._MIN_STAMPS['s'] + + self.numpy = numpy + self.precise_float = precise_float + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.date_unit = date_unit + self.keep_default_dates = keep_default_dates + self.obj = None + + def check_keys_split(self, decoded): + "checks that dict has only the appropriate keys for orient='split'" + bad_keys = set(decoded.keys()).difference(set(self._split_keys)) + if bad_keys: + bad_keys = ", ".join(bad_keys) + raise ValueError(u("JSON data had unexpected key(s): %s") % + com.pprint_thing(bad_keys)) + + def parse(self): + + # try numpy + numpy = self.numpy + if numpy: + self._parse_numpy() + + else: + self._parse_no_numpy() + + if self.obj is None: + return None + if self.convert_axes: + self._convert_axes() + self._try_convert_types() + return self.obj + + def _convert_axes(self): + """ try to convert axes """ + for axis in self.obj._AXIS_NUMBERS.keys(): + new_axis, result = self._try_convert_data( + axis, self.obj._get_axis(axis), use_dtypes=False, + convert_dates=True) + if result: + setattr(self.obj, axis, new_axis) + + def _try_convert_types(self): + raise NotImplementedError + + def _try_convert_data(self, name, data, use_dtypes=True, + convert_dates=True): + """ try to parse a ndarray like into a column by inferring dtype """ + + # don't try to coerce, unless a force conversion + if use_dtypes: + if self.dtype is False: + return data, False + elif self.dtype is True: + pass + + else: + + # dtype to force + dtype = (self.dtype.get(name) + if isinstance(self.dtype, dict) else self.dtype) + if dtype is not None: + try: + dtype = np.dtype(dtype) + return data.astype(dtype), True + except: + return data, False + + if convert_dates: + new_data, result = self._try_convert_to_date(data) 
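+            # e.g. an int64 column of epoch seconds such as 1356998400
+            # (2013-01-01) is picked up here, while values at or below
+            # self.min_stamp (one year of seconds by default) are left alone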
+ if result: + return new_data, True + + result = False + + if data.dtype == 'object': + + # try float + try: + data = data.astype('float64') + result = True + except: + pass + + if data.dtype.kind == 'f': + + if data.dtype != 'float64': + + # coerce floats to 64 + try: + data = data.astype('float64') + result = True + except: + pass + + # do't coerce 0-len data + if len(data) and (data.dtype == 'float' or data.dtype == 'object'): + + # coerce ints if we can + try: + new_data = data.astype('int64') + if (new_data == data).all(): + data = new_data + result = True + except: + pass + + # coerce ints to 64 + if data.dtype == 'int': + + # coerce floats to 64 + try: + data = data.astype('int64') + result = True + except: + pass + + return data, result + + def _try_convert_to_date(self, data): + """ try to parse a ndarray like into a date column + try to coerce object in epoch/iso formats and + integer/float in epcoh formats, return a boolean if parsing + was successful """ + + # no conversion on empty + if not len(data): + return data, False + + new_data = data + if new_data.dtype == 'object': + try: + new_data = data.astype('int64') + except: + pass + + # ignore numbers that are out of range + if issubclass(new_data.dtype.type, np.number): + in_range = (isnull(new_data.values) | (new_data > self.min_stamp) | + (new_data.values == iNaT)) + if not in_range.all(): + return data, False + + date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS + for date_unit in date_units: + try: + new_data = to_datetime(new_data, errors='raise', + unit=date_unit) + except OverflowError: + continue + except: + break + return new_data, True + return data, False + + def _try_convert_dates(self): + raise NotImplementedError + + +class SeriesParser(Parser): + _default_orient = 'index' + _split_keys = ('name', 'index', 'data') + + def _parse_no_numpy(self): + + json = self.json + orient = self.orient + if orient == "split": + decoded = dict((str(k), v) + for k, v in compat.iteritems(loads( + json, + precise_float=self.precise_float))) + self.check_keys_split(decoded) + self.obj = Series(dtype=None, **decoded) + else: + self.obj = Series( + loads(json, precise_float=self.precise_float), dtype=None) + + def _parse_numpy(self): + + json = self.json + orient = self.orient + if orient == "split": + decoded = loads(json, dtype=None, numpy=True, + precise_float=self.precise_float) + decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) + self.check_keys_split(decoded) + self.obj = Series(**decoded) + elif orient == "columns" or orient == "index": + self.obj = Series(*loads(json, dtype=None, numpy=True, + labelled=True, + precise_float=self.precise_float)) + else: + self.obj = Series(loads(json, dtype=None, numpy=True, + precise_float=self.precise_float)) + + def _try_convert_types(self): + if self.obj is None: + return + obj, result = self._try_convert_data( + 'data', self.obj, convert_dates=self.convert_dates) + if result: + self.obj = obj + + +class FrameParser(Parser): + _default_orient = 'columns' + _split_keys = ('columns', 'index', 'data') + + def _parse_numpy(self): + + json = self.json + orient = self.orient + + if orient == "columns": + args = loads(json, dtype=None, numpy=True, labelled=True, + precise_float=self.precise_float) + if args: + args = (args[0].T, args[2], args[1]) + self.obj = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=None, numpy=True, + precise_float=self.precise_float) + decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) + 
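+            # a 'split' payload carries exactly these keys (values are
+            # illustrative): {"columns": ["a"], "index": [0, 1],
+            # "data": [[1], [2]]}; check_keys_split() verifies them before
+            # the dict is splatted into the DataFrame constructor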
self.check_keys_split(decoded) + self.obj = DataFrame(**decoded) + elif orient == "values": + self.obj = DataFrame(loads(json, dtype=None, numpy=True, + precise_float=self.precise_float)) + else: + self.obj = DataFrame(*loads(json, dtype=None, numpy=True, + labelled=True, + precise_float=self.precise_float)) + + def _parse_no_numpy(self): + + json = self.json + orient = self.orient + + if orient == "columns": + self.obj = DataFrame( + loads(json, precise_float=self.precise_float), dtype=None) + elif orient == "split": + decoded = dict((str(k), v) + for k, v in compat.iteritems(loads( + json, + precise_float=self.precise_float))) + self.check_keys_split(decoded) + self.obj = DataFrame(dtype=None, **decoded) + elif orient == "index": + self.obj = DataFrame( + loads(json, precise_float=self.precise_float), dtype=None).T + else: + self.obj = DataFrame( + loads(json, precise_float=self.precise_float), dtype=None) + + def _process_converter(self, f, filt=None): + """ take a conversion function and possibly recreate the frame """ + + if filt is None: + filt = lambda col, c: True + + needs_new_obj = False + new_obj = dict() + for i, (col, c) in enumerate(self.obj.iteritems()): + if filt(col, c): + new_data, result = f(col, c) + if result: + c = new_data + needs_new_obj = True + new_obj[i] = c + + if needs_new_obj: + + # possibly handle dup columns + new_obj = DataFrame(new_obj, index=self.obj.index) + new_obj.columns = self.obj.columns + self.obj = new_obj + + def _try_convert_types(self): + if self.obj is None: + return + if self.convert_dates: + self._try_convert_dates() + + self._process_converter( + lambda col, c: self._try_convert_data(col, c, convert_dates=False)) + + def _try_convert_dates(self): + if self.obj is None: + return + + # our columns to parse + convert_dates = self.convert_dates + if convert_dates is True: + convert_dates = [] + convert_dates = set(convert_dates) + + def is_ok(col): + """ return if this col is ok to try for a date parse """ + if not isinstance(col, compat.string_types): + return False + + if (col.endswith('_at') or + col.endswith('_time') or + col.lower() == 'modified' or + col.lower() == 'date' or + col.lower() == 'datetime'): + return True + return False + + self._process_converter( + lambda col, c: self._try_convert_to_date(c), + lambda col, c: ((self.keep_default_dates and is_ok(col)) + or col in convert_dates)) + + +#---------------------------------------------------------------------- +# JSON normalization routines + +def nested_to_record(ds, prefix="", level=0): + """a simplified json_normalize + + converts a nested dict into a flat dict ("record"), unlike json_normalize, + it does not attempt to extract a subset of the data. + + Parameters + ---------- + ds : dict or list of dicts + + Returns + ------- + d - dict or list of dicts, matching `ds` + + Example: + IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), + nested=dict(e=dict(c=1,d=2),d=2))) + Out[52]: + {'dict1.c': 1, + 'dict1.d': 2, + 'flat1': 1, + 'nested.d': 2, + 'nested.e.c': 1, + 'nested.e.d': 2} + """ + singleton = False + if isinstance(ds, dict): + ds = [ds] + singleton = True + + new_ds = [] + for d in ds: + + new_d = copy.deepcopy(d) + for k, v in d.items(): + # each key gets renamed with prefix + if level == 0: + newkey = str(k) + else: + newkey = prefix + '.' 
+ str(k) + + # only dicts gets recurse-flattend + # only at level>1 do we rename the rest of the keys + if not isinstance(v, dict): + if level != 0: # so we skip copying for top level, common case + v = new_d.pop(k) + new_d[newkey] = v + continue + else: + v = new_d.pop(k) + new_d.update(nested_to_record(v, newkey, level+1)) + new_ds.append(new_d) + + if singleton: + return new_ds[0] + return new_ds + + +def json_normalize(data, record_path=None, meta=None, + meta_prefix=None, + record_prefix=None): + """ + "Normalize" semi-structured JSON data into a flat table + + Parameters + ---------- + data : dict or list of dicts + Unserialized JSON objects + record_path : string or list of strings, default None + Path in each object to list of records. If not passed, data will be + assumed to be an array of records + meta : list of paths (string or list of strings) + Fields to use as metadata for each record in resulting table + record_prefix : string, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + path to records is ['foo', 'bar'] + meta_prefix : string, default None + + Examples + -------- + data = [{'state': 'Florida', + 'shortname': 'FL', + 'info': { + 'governor': 'Rick Scott' + }, + 'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}]}, + {'state': 'Ohio', + 'shortname': 'OH', + 'info': { + 'governor': 'John Kasich' + }, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] + + result = json_normalize(data, 'counties', ['state', 'shortname', + ['info', 'governor']]) + + state governor + Florida Rick Scott + + + Returns + ------- + frame : DataFrame + """ + def _pull_field(js, spec): + result = js + if isinstance(spec, list): + for field in spec: + result = result[field] + else: + result = result[spec] + + return result + + # A bit of a hackjob + if isinstance(data, dict): + data = [data] + + if record_path is None: + if any([isinstance(x, dict) for x in compat.itervalues(data[0])]): + # naive normalization, this is idempotent for flat records + # and potentially will inflate the data considerably for + # deeply nested structures: + # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} + # + # TODO: handle record value which are lists, at least error + # reasonably + data = nested_to_record(data) + return DataFrame(data) + elif not isinstance(record_path, list): + record_path = [record_path] + + if meta is None: + meta = [] + elif not isinstance(meta, list): + meta = [meta] + + for i, x in enumerate(meta): + if not isinstance(x, list): + meta[i] = [x] + + # Disastrously inefficient for now + records = [] + lengths = [] + + meta_vals = defaultdict(list) + meta_keys = ['.'.join(val) for val in meta] + + def _recursive_extract(data, path, seen_meta, level=0): + if len(path) > 1: + for obj in data: + for val, key in zip(meta, meta_keys): + if level + 1 == len(val): + seen_meta[key] = _pull_field(obj, val[-1]) + + _recursive_extract(obj[path[0]], path[1:], + seen_meta, level=level+1) + else: + for obj in data: + recs = _pull_field(obj, path[0]) + + # For repeating the metadata later + lengths.append(len(recs)) + + for val, key in zip(meta, meta_keys): + if level + 1 > len(val): + meta_val = seen_meta[key] + else: + meta_val = _pull_field(obj, val[level:]) + meta_vals[key].append(meta_val) + + records.extend(recs) + + _recursive_extract(data, record_path, {}, level=0) + + result = DataFrame(records) + + if record_prefix is 
not None: + result.rename(columns=lambda x: record_prefix + x, inplace=True) + + # Data types, a problem + for k, v in compat.iteritems(meta_vals): + if meta_prefix is not None: + k = meta_prefix + k + + if k in result: + raise ValueError('Conflicting metadata name %s, ' + 'need distinguishing prefix ' % k) + + result[k] = np.array(v).repeat(lengths) + + return result diff --git a/pandas/io/packers.py b/pandas/io/packers.py new file mode 100644 index 00000000..7da86565 --- /dev/null +++ b/pandas/io/packers.py @@ -0,0 +1,642 @@ +""" +Msgpack serializer support for reading and writing pandas data structures +to disk +""" + +# portions of msgpack_numpy package, by Lev Givon were incorporated +# into this module (and tests_packers.py) + +""" +License +======= + +Copyright (c) 2013, Lev Givon. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. +* Neither the name of Lev Givon nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" + +import os +from datetime import datetime, date, timedelta +from dateutil.parser import parse + +import numpy as np +from pandas import compat +from pandas.compat import u, PY3 +from pandas import ( + Timestamp, Period, Series, DataFrame, Panel, Panel4D, + Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, + NaT +) +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.array import BlockIndex, IntIndex +from pandas.core.generic import NDFrame +from pandas.core.common import needs_i8_conversion +from pandas.io.common import get_filepath_or_buffer +from pandas.core.internals import BlockManager, make_block +import pandas.core.internals as internals + +from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer +import zlib + +try: + import blosc + _BLOSC = True +except: + _BLOSC = False + +# until we can pass this into our conversion functions, +# this is pretty hacky +compressor = None + + +def to_msgpack(path_or_buf, *args, **kwargs): + """ + msgpack (serialize) object to input file path + + THIS IS AN EXPERIMENTAL LIBRARY and the storage format + may not be stable until a future release. 
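+
+    A minimal round trip (the file name is a placeholder)::
+
+        to_msgpack('frame.msg', df)
+        df = read_msgpack('frame.msg')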
+ + Parameters + ---------- + path_or_buf : string File path, buffer-like, or None + if None, return generated string + args : an object or objects to serialize + append : boolean whether to append to an existing msgpack + (default is False) + compress : type of compressor (zlib or blosc), default to None (no + compression) + """ + global compressor + compressor = kwargs.pop('compress', None) + append = kwargs.pop('append', None) + if append: + mode = 'a+b' + else: + mode = 'wb' + + def writer(fh): + for a in args: + fh.write(pack(a, **kwargs)) + + if isinstance(path_or_buf, compat.string_types): + with open(path_or_buf, mode) as fh: + writer(fh) + elif path_or_buf is None: + buf = compat.BytesIO() + writer(buf) + return buf.getvalue() + else: + writer(path_or_buf) + + +def read_msgpack(path_or_buf, iterator=False, **kwargs): + """ + Load msgpack pandas object from the specified + file path + + THIS IS AN EXPERIMENTAL LIBRARY and the storage format + may not be stable until a future release. + + Parameters + ---------- + path_or_buf : string File path, BytesIO like or string + iterator : boolean, if True, return an iterator to the unpacker + (default is False) + + Returns + ------- + obj : type of object stored in file + + """ + path_or_buf, _ = get_filepath_or_buffer(path_or_buf) + if iterator: + return Iterator(path_or_buf) + + def read(fh): + l = list(unpack(fh)) + if len(l) == 1: + return l[0] + return l + + # see if we have an actual file + if isinstance(path_or_buf, compat.string_types): + + try: + exists = os.path.exists(path_or_buf) + except (TypeError,ValueError): + exists = False + + if exists: + with open(path_or_buf, 'rb') as fh: + return read(fh) + + # treat as a string-like + if not hasattr(path_or_buf, 'read'): + + try: + fh = compat.BytesIO(path_or_buf) + return read(fh) + finally: + fh.close() + + # a buffer like + return read(path_or_buf) + +dtype_dict = {21: np.dtype('M8[ns]'), + u('datetime64[ns]'): np.dtype('M8[ns]'), + u('datetime64[us]'): np.dtype('M8[us]'), + 22: np.dtype('m8[ns]'), + u('timedelta64[ns]'): np.dtype('m8[ns]'), + u('timedelta64[us]'): np.dtype('m8[us]')} + + +def dtype_for(t): + if t in dtype_dict: + return dtype_dict[t] + return np.typeDict[t] + +c2f_dict = {'complex': np.float64, + 'complex128': np.float64, + 'complex64': np.float32} + +# numpy 1.6.1 compat +if hasattr(np, 'float128'): + c2f_dict['complex256'] = np.float128 + + +def c2f(r, i, ctype_name): + """ + Convert strings to complex number instance with specified numpy type. 
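+
+    For example, c2f('1.0', '2.0', 'complex128') returns
+    np.complex128(1 + 2j).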
+ """ + + ftype = c2f_dict[ctype_name] + return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i)) + + +def convert(values): + """ convert the numpy values to a list """ + + dtype = values.dtype + if needs_i8_conversion(dtype): + values = values.view('i8') + v = values.ravel() + + # convert object + if dtype == np.object_: + return v.tolist() + + if compressor == 'zlib': + + # return string arrays like they are + if dtype == np.object_: + return v.tolist() + + # convert to a bytes array + v = v.tostring() + return zlib.compress(v) + + elif compressor == 'blosc' and _BLOSC: + + # return string arrays like they are + if dtype == np.object_: + return v.tolist() + + # convert to a bytes array + v = v.tostring() + return blosc.compress(v, typesize=dtype.itemsize) + + # ndarray (on original dtype) + return v.tostring() + + +def unconvert(values, dtype, compress=None): + + if dtype == np.object_: + return np.array(values, dtype=object) + + if compress == 'zlib': + + values = zlib.decompress(values) + return np.frombuffer(values, dtype=dtype) + + elif compress == 'blosc': + + if not _BLOSC: + raise Exception("cannot uncompress w/o blosc") + + # decompress + values = blosc.decompress(values) + + return np.frombuffer(values, dtype=dtype) + + # from a string + return np.fromstring(values.encode('latin1'), dtype=dtype) + + +def encode(obj): + """ + Data encoder + """ + + tobj = type(obj) + if isinstance(obj, Index): + if isinstance(obj, PeriodIndex): + return {'typ': 'period_index', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'freq': getattr(obj, 'freqstr', None), + 'dtype': obj.dtype.num, + 'data': convert(obj.asi8)} + elif isinstance(obj, DatetimeIndex): + tz = getattr(obj, 'tz', None) + + # store tz info and data as UTC + if tz is not None: + tz = tz.zone + obj = obj.tz_convert('UTC') + return {'typ': 'datetime_index', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'dtype': obj.dtype.num, + 'data': convert(obj.asi8), + 'freq': getattr(obj, 'freqstr', None), + 'tz': tz} + elif isinstance(obj, MultiIndex): + return {'typ': 'multi_index', + 'klass': obj.__class__.__name__, + 'names': getattr(obj, 'names', None), + 'dtype': obj.dtype.num, + 'data': convert(obj.values)} + else: + return {'typ': 'index', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'dtype': obj.dtype.num, + 'data': convert(obj.values)} + elif isinstance(obj, Series): + if isinstance(obj, SparseSeries): + raise NotImplementedError( + 'msgpack sparse series is not implemented' + ) + #d = {'typ': 'sparse_series', + # 'klass': obj.__class__.__name__, + # 'dtype': obj.dtype.num, + # 'index': obj.index, + # 'sp_index': obj.sp_index, + # 'sp_values': convert(obj.sp_values), + # 'compress': compressor} + #for f in ['name', 'fill_value', 'kind']: + # d[f] = getattr(obj, f, None) + #return d + else: + return {'typ': 'series', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'index': obj.index, + 'dtype': obj.dtype.num, + 'data': convert(obj.values), + 'compress': compressor} + elif issubclass(tobj, NDFrame): + if isinstance(obj, SparseDataFrame): + raise NotImplementedError( + 'msgpack sparse frame is not implemented' + ) + #d = {'typ': 'sparse_dataframe', + # 'klass': obj.__class__.__name__, + # 'columns': obj.columns} + #for f in ['default_fill_value', 'default_kind']: + # d[f] = getattr(obj, f, None) + #d['data'] = dict([(name, ss) + # for name, ss in compat.iteritems(obj)]) + #return d + elif isinstance(obj, SparsePanel): + raise 
NotImplementedError( + 'msgpack sparse frame is not implemented' + ) + #d = {'typ': 'sparse_panel', + # 'klass': obj.__class__.__name__, + # 'items': obj.items} + #for f in ['default_fill_value', 'default_kind']: + # d[f] = getattr(obj, f, None) + #d['data'] = dict([(name, df) + # for name, df in compat.iteritems(obj)]) + #return d + else: + + data = obj._data + if not data.is_consolidated(): + data = data.consolidate() + + # the block manager + return {'typ': 'block_manager', + 'klass': obj.__class__.__name__, + 'axes': data.axes, + 'blocks': [{'items': data.items.take(b.mgr_locs), + 'values': convert(b.values), + 'shape': b.values.shape, + 'dtype': b.dtype.num, + 'klass': b.__class__.__name__, + 'compress': compressor + } for b in data.blocks]} + + elif isinstance(obj, (datetime, date, np.datetime64, timedelta, + np.timedelta64)): + if isinstance(obj, Timestamp): + tz = obj.tzinfo + if tz is not None: + tz = tz.zone + offset = obj.offset + if offset is not None: + offset = offset.freqstr + return {'typ': 'timestamp', + 'value': obj.value, + 'offset': offset, + 'tz': tz} + elif isinstance(obj, np.timedelta64): + return {'typ': 'timedelta64', + 'data': obj.view('i8')} + elif isinstance(obj, timedelta): + return {'typ': 'timedelta', + 'data': (obj.days, obj.seconds, obj.microseconds)} + elif isinstance(obj, np.datetime64): + return {'typ': 'datetime64', + 'data': str(obj)} + elif isinstance(obj, datetime): + return {'typ': 'datetime', + 'data': obj.isoformat()} + elif isinstance(obj, date): + return {'typ': 'date', + 'data': obj.isoformat()} + raise Exception("cannot encode this datetimelike object: %s" % obj) + elif isinstance(obj, Period): + return {'typ': 'period', + 'ordinal': obj.ordinal, + 'freq': obj.freq} + elif isinstance(obj, BlockIndex): + return {'typ': 'block_index', + 'klass': obj.__class__.__name__, + 'blocs': obj.blocs, + 'blengths': obj.blengths, + 'length': obj.length} + elif isinstance(obj, IntIndex): + return {'typ': 'int_index', + 'klass': obj.__class__.__name__, + 'indices': obj.indices, + 'length': obj.length} + elif isinstance(obj, np.ndarray): + return {'typ': 'ndarray', + 'shape': obj.shape, + 'ndim': obj.ndim, + 'dtype': obj.dtype.num, + 'data': convert(obj), + 'compress': compressor} + elif isinstance(obj, np.number): + if np.iscomplexobj(obj): + return {'typ': 'np_scalar', + 'sub_typ': 'np_complex', + 'dtype': obj.dtype.name, + 'real': obj.real.__repr__(), + 'imag': obj.imag.__repr__()} + else: + return {'typ': 'np_scalar', + 'dtype': obj.dtype.name, + 'data': obj.__repr__()} + elif isinstance(obj, complex): + return {'typ': 'np_complex', + 'real': obj.real.__repr__(), + 'imag': obj.imag.__repr__()} + + return obj + + +def decode(obj): + """ + Decoder for deserializing numpy data types. 
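+
+    Used by ``unpack`` as the msgpack ``object_hook``: any dict carrying a
+    'typ' key is rebuilt into the corresponding pandas/numpy object, and
+    anything else is returned unchanged.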
+ """ + + typ = obj.get('typ') + if typ is None: + return obj + elif typ == 'timestamp': + return Timestamp(obj['value'], tz=obj['tz'], offset=obj['offset']) + elif typ == 'period': + return Period(ordinal=obj['ordinal'], freq=obj['freq']) + elif typ == 'index': + dtype = dtype_for(obj['dtype']) + data = unconvert(obj['data'], np.typeDict[obj['dtype']], + obj.get('compress')) + return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) + elif typ == 'multi_index': + data = unconvert(obj['data'], np.typeDict[obj['dtype']], + obj.get('compress')) + data = [tuple(x) for x in data] + return globals()[obj['klass']].from_tuples(data, names=obj['names']) + elif typ == 'period_index': + data = unconvert(obj['data'], np.int64, obj.get('compress')) + d = dict(name=obj['name'], freq=obj['freq']) + return globals()[obj['klass']](data, **d) + elif typ == 'datetime_index': + data = unconvert(obj['data'], np.int64, obj.get('compress')) + d = dict(name=obj['name'], freq=obj['freq'], verify_integrity=False) + result = globals()[obj['klass']](data, **d) + tz = obj['tz'] + + # reverse tz conversion + if tz is not None: + result = result.tz_localize('UTC').tz_convert(tz) + return result + + elif typ == 'series': + dtype = dtype_for(obj['dtype']) + index = obj['index'] + return globals()[obj['klass']](unconvert(obj['data'], dtype, + obj['compress']), + index=index, name=obj['name']) + elif typ == 'block_manager': + axes = obj['axes'] + + def create_block(b): + values = unconvert(b['values'], dtype_for(b['dtype']), + b['compress']).reshape(b['shape']) + return make_block(values=values, + klass=getattr(internals, b['klass']), + placement=axes[0].get_indexer(b['items'])) + + blocks = [create_block(b) for b in obj['blocks']] + return globals()[obj['klass']](BlockManager(blocks, axes)) + elif typ == 'datetime': + return parse(obj['data']) + elif typ == 'datetime64': + return np.datetime64(parse(obj['data'])) + elif typ == 'date': + return parse(obj['data']).date() + elif typ == 'timedelta': + return timedelta(*obj['data']) + elif typ == 'timedelta64': + return np.timedelta64(int(obj['data'])) + #elif typ == 'sparse_series': + # dtype = dtype_for(obj['dtype']) + # return globals()[obj['klass']]( + # unconvert(obj['sp_values'], dtype, obj['compress']), + # sparse_index=obj['sp_index'], index=obj['index'], + # fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name']) + #elif typ == 'sparse_dataframe': + # return globals()[obj['klass']]( + # obj['data'], columns=obj['columns'], + # default_fill_value=obj['default_fill_value'], + # default_kind=obj['default_kind'] + # ) + #elif typ == 'sparse_panel': + # return globals()[obj['klass']]( + # obj['data'], items=obj['items'], + # default_fill_value=obj['default_fill_value'], + # default_kind=obj['default_kind']) + elif typ == 'block_index': + return globals()[obj['klass']](obj['length'], obj['blocs'], + obj['blengths']) + elif typ == 'int_index': + return globals()[obj['klass']](obj['length'], obj['indices']) + elif typ == 'ndarray': + return unconvert(obj['data'], np.typeDict[obj['dtype']], + obj.get('compress')).reshape(obj['shape']) + elif typ == 'np_scalar': + if obj.get('sub_typ') == 'np_complex': + return c2f(obj['real'], obj['imag'], obj['dtype']) + else: + dtype = dtype_for(obj['dtype']) + try: + return dtype(obj['data']) + except: + return dtype.type(obj['data']) + elif typ == 'np_complex': + return complex(obj['real'] + '+' + obj['imag'] + 'j') + elif isinstance(obj, (dict, list, set)): + return obj + else: + return obj + + +def pack(o, 
default=encode, + encoding='latin1', unicode_errors='strict', use_single_float=False): + """ + Pack an object and return the packed bytes. + """ + + return Packer(default=default, encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float).pack(o) + + +def unpack(packed, object_hook=decode, + list_hook=None, use_list=False, encoding='latin1', + unicode_errors='strict', object_pairs_hook=None): + """ + Unpack a packed object, return an iterator + Note: packed lists will be returned as tuples + """ + + return Unpacker(packed, object_hook=object_hook, + list_hook=list_hook, + use_list=use_list, encoding=encoding, + unicode_errors=unicode_errors, + object_pairs_hook=object_pairs_hook) + + +class Packer(_Packer): + + def __init__(self, default=encode, + encoding='latin1', + unicode_errors='strict', + use_single_float=False): + super(Packer, self).__init__(default=default, + encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float) + + +class Unpacker(_Unpacker): + + def __init__(self, file_like=None, read_size=0, use_list=False, + object_hook=decode, + object_pairs_hook=None, list_hook=None, encoding='latin1', + unicode_errors='strict', max_buffer_size=0): + super(Unpacker, self).__init__(file_like=file_like, + read_size=read_size, + use_list=use_list, + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, + list_hook=list_hook, + encoding=encoding, + unicode_errors=unicode_errors, + max_buffer_size=max_buffer_size) + + +class Iterator(object): + + """ manage the unpacking iteration, + close the file on completion """ + + def __init__(self, path, **kwargs): + self.path = path + self.kwargs = kwargs + + def __iter__(self): + + needs_closing = True + try: + + # see if we have an actual file + if isinstance(self.path, compat.string_types): + + try: + path_exists = os.path.exists(self.path) + except TypeError: + path_exists = False + + if path_exists: + fh = open(self.path, 'rb') + else: + fh = compat.BytesIO(self.path) + + else: + + if not hasattr(self.path, 'read'): + fh = compat.BytesIO(self.path) + + else: + + # a file-like + needs_closing = False + fh = self.path + + unpacker = unpack(fh) + for o in unpacker: + yield o + finally: + if needs_closing: + fh.close() diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py new file mode 100644 index 00000000..6d2afbad --- /dev/null +++ b/pandas/io/parsers.py @@ -0,0 +1,2320 @@ +""" +Module contains tools for processing files into DataFrames or other objects +""" +from __future__ import print_function +from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map +from pandas import compat +import re +import csv +import warnings + +import numpy as np + +from pandas.core.index import Index, MultiIndex +from pandas.core.frame import DataFrame +import datetime +import pandas.core.common as com +from pandas.core.config import get_option +from pandas.io.date_converters import generic_parser +from pandas.io.common import get_filepath_or_buffer +from pandas.tseries import tools + +from pandas.util.decorators import Appender + +import pandas.lib as lib +import pandas.tslib as tslib +import pandas.parser as _parser + +class ParserWarning(Warning): + pass + +_parser_params = """Also supports optionally iterating or breaking of the file +into chunks. + +Parameters +---------- +filepath_or_buffer : string or file handle / StringIO. The string could be + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a + host is expected. 
For instance, a local file could be + file ://localhost/path/to/table.csv +%s +lineterminator : string (length 1), default None + Character to break file into lines. Only valid with C parser +quotechar : string (length 1) + The character used to denote the start and end of a quoted item. Quoted + items can include the delimiter and it will be ignored. +quoting : int or csv.QUOTE_* instance, default None + Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of + QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). + Default (None) results in QUOTE_MINIMAL behavior. +skipinitialspace : boolean, default False + Skip spaces after delimiter +escapechar : string (length 1), default None + One-character string used to escape delimiter when quoting is QUOTE_NONE. +dtype : Type name or dict of column -> type + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} + (Unsupported with engine='python') +compression : {'gzip', 'bz2', None}, default None + For on-the-fly decompression of on-disk data +dialect : string or csv.Dialect instance, default None + If None defaults to Excel dialect. Ignored if sep longer than 1 char + See csv.Dialect documentation for more details +header : int row number(s) to use as the column names, and the start of the + data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly + pass ``header=0`` to be able to replace existing names. The header can be + a list of integers that specify row locations for a multi-index on the + columns E.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example are skipped). Note that this parameter + ignores commented lines, so header=0 denotes the first line of + data rather than the first line of the file. +skiprows : list-like or integer + Line numbers to skip (0-indexed) or number of lines to skip (int) + at the start of the file +index_col : int or sequence or False, default None + Column to use as the row labels of the DataFrame. If a sequence is given, a + MultiIndex is used. If you have a malformed file with delimiters at the end + of each line, you might consider index_col=False to force pandas to _not_ + use the first column as the index (row names) +names : array-like + List of column names to use. If file contains no header row, then you + should explicitly pass header=None +prefix : string or None (default) + Prefix to add to column numbers when no header, e.g 'X' for X0, X1, ... +na_values : list-like or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values +true_values : list + Values to consider as True +false_values : list + Values to consider as False +keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to +parse_dates : boolean, list of ints or names, list of lists, or dict + If True -> try parsing the index. + If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column. + If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column. + {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' + A fast-path exists for iso8601-formatted dates. +keep_date_col : boolean, default False + If True and parse_dates specifies combining multiple columns then + keep the original columns. +date_parser : function + Function to use for converting a sequence of string columns to an + array of datetime instances. 
The default uses dateutil.parser.parser + to do the conversion. +dayfirst : boolean, default False + DD/MM format dates, international and European format +thousands : str, default None + Thousands separator +comment : str, default None + Indicates remainder of line should not be parsed. If found at the + beginning of a line, the line will be ignored altogether. This parameter + must be a single character. Also, fully commented lines + are ignored by the parameter `header` but not by `skiprows`. For example, + if comment='#', parsing '#empty\n1,2,3\na,b,c' with `header=0` will + result in '1,2,3' being treated as the header. +decimal : str, default '.' + Character to recognize as decimal point. E.g. use ',' for European data +nrows : int, default None + Number of rows of file to read. Useful for reading pieces of large files +iterator : boolean, default False + Return TextFileReader object +chunksize : int, default None + Return TextFileReader object for iteration +skipfooter : int, default 0 + Number of lines at bottom of file to skip (Unsupported with engine='c') +converters : dict. optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels +verbose : boolean, default False + Indicate number of NA values placed in non-numeric columns +delimiter : string, default None + Alternative argument name for sep. Regular expressions are accepted. +encoding : string, default None + Encoding to use for UTF when reading/writing (ex. 'utf-8') +squeeze : boolean, default False + If the parsed data only contains one column then return a Series +na_filter : boolean, default True + Detect missing value markers (empty strings and the value of na_values). In + data without any NAs, passing na_filter=False can improve the performance + of reading a large file +usecols : array-like + Return a subset of the columns. + Results in much faster parsing time and lower memory usage. +mangle_dupe_cols : boolean, default True + Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' +tupleize_cols : boolean, default False + Leave a list of tuples on columns as is (default is to convert to + a Multi Index on the columns) +error_bad_lines : boolean, default True + Lines with too many fields (e.g. a csv line with too many commas) will by + default cause an exception to be raised, and no DataFrame will be returned. + If False, then these "bad lines" will dropped from the DataFrame that is + returned. (Only valid with C parser) +warn_bad_lines : boolean, default True + If error_bad_lines is False, and warn_bad_lines is True, a warning for each + "bad line" will be output. (Only valid with C parser). +infer_datetime_format : boolean, default False + If True and parse_dates is enabled for a column, attempt to infer + the datetime format to speed up the processing + +Returns +------- +result : DataFrame or TextParser +""" + +_csv_params = """sep : string, default ',' + Delimiter to use. If sep is None, will try to automatically determine + this. Regular expressions are accepted. +engine : {'c', 'python'} + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete.""" + +_table_params = """sep : string, default \\t (tab-stop) + Delimiter to use. Regular expressions are accepted. +engine : {'c', 'python'} + Parser engine to use. 
The C engine is faster while the python engine is + currently more feature-complete.""" + +_read_csv_doc = """ +Read CSV (comma-separated) file into DataFrame + +%s +""" % (_parser_params % _csv_params) + +_read_table_doc = """ +Read general delimited file into DataFrame + +%s +""" % (_parser_params % _table_params) + +_fwf_widths = """\ +colspecs : list of pairs (int, int) or 'infer'. optional + A list of pairs (tuples) giving the extents of the fixed-width + fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try + detecting the column specifications from the first 100 rows of + the data (default='infer'). +widths : list of ints. optional + A list of field widths which can be used instead of 'colspecs' if + the intervals are contiguous. +""" + +_read_fwf_doc = """ +Read a table of fixed-width formatted lines into DataFrame + +%s + +Also, 'delimiter' is used to specify the filler character of the +fields if it is not spaces (e.g., '~'). +""" % (_parser_params % _fwf_widths) + + +def _read(filepath_or_buffer, kwds): + "Generic reader of line files." + encoding = kwds.get('encoding', None) + skipfooter = kwds.pop('skipfooter', None) + if skipfooter is not None: + kwds['skip_footer'] = skipfooter + + filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer, + encoding) + + if kwds.get('date_parser', None) is not None: + if isinstance(kwds['parse_dates'], bool): + kwds['parse_dates'] = True + + # Extract some of the arguments (pass chunksize on). + iterator = kwds.get('iterator', False) + nrows = kwds.pop('nrows', None) + chunksize = kwds.get('chunksize', None) + + # Create the parser. + parser = TextFileReader(filepath_or_buffer, **kwds) + + if (nrows is not None) and (chunksize is not None): + raise NotImplementedError("'nrows' and 'chunksize' can not be used" + " together yet.") + elif nrows is not None: + return parser.read(nrows) + elif chunksize or iterator: + return parser + + return parser.read() + +_parser_defaults = { + 'delimiter': None, + + 'doublequote': True, + 'escapechar': None, + 'quotechar': '"', + 'quoting': csv.QUOTE_MINIMAL, + 'skipinitialspace': False, + 'lineterminator': None, + + 'header': 'infer', + 'index_col': None, + 'names': None, + 'prefix': None, + 'skiprows': None, + 'na_values': None, + 'true_values': None, + 'false_values': None, + 'skip_footer': 0, + 'converters': None, + + 'keep_default_na': True, + 'thousands': None, + 'comment': None, + + # 'engine': 'c', + 'parse_dates': False, + 'keep_date_col': False, + 'dayfirst': False, + 'date_parser': None, + + 'usecols': None, + + # 'nrows': None, + # 'iterator': False, + 'chunksize': None, + 'verbose': False, + 'encoding': None, + 'squeeze': False, + 'compression': None, + 'mangle_dupe_cols': True, + 'tupleize_cols': False, + 'infer_datetime_format': False, +} + + +_c_parser_defaults = { + 'delim_whitespace': False, + 'as_recarray': False, + 'na_filter': True, + 'compact_ints': False, + 'use_unsigned': False, + 'low_memory': True, + 'memory_map': False, + 'buffer_lines': None, + 'error_bad_lines': True, + 'warn_bad_lines': True, + 'dtype': None, + 'decimal': b'.' 
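+    # these options are honoured only by the C engine; see
+    # TextFileReader._get_options_with_defaults, which raises ValueError when
+    # a non-default value is combined with another engine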
+} + +_fwf_defaults = { + 'colspecs': 'infer', + 'widths': None, +} + +_c_unsupported = set(['skip_footer']) +_python_unsupported = set(_c_parser_defaults.keys()) + + +def _make_parser_function(name, sep=','): + + default_sep = sep + + def parser_f(filepath_or_buffer, + sep=sep, + dialect=None, + compression=None, + + doublequote=True, + escapechar=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + skipinitialspace=False, + lineterminator=None, + + header='infer', + index_col=None, + names=None, + prefix=None, + skiprows=None, + skipfooter=None, + skip_footer=0, + na_values=None, + na_fvalues=None, + true_values=None, + false_values=None, + delimiter=None, + converters=None, + dtype=None, + usecols=None, + + engine=None, + delim_whitespace=False, + as_recarray=False, + na_filter=True, + compact_ints=False, + use_unsigned=False, + low_memory=_c_parser_defaults['low_memory'], + buffer_lines=None, + warn_bad_lines=True, + error_bad_lines=True, + + keep_default_na=True, + thousands=None, + comment=None, + decimal=b'.', + + parse_dates=False, + keep_date_col=False, + dayfirst=False, + date_parser=None, + + memory_map=False, + nrows=None, + iterator=False, + chunksize=None, + + verbose=False, + encoding=None, + squeeze=False, + mangle_dupe_cols=True, + tupleize_cols=False, + infer_datetime_format=False): + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + if delim_whitespace and delimiter is not default_sep: + raise ValueError("Specified a delimiter with both sep and"\ + " delim_whitespace=True; you can only specify one.") + + if engine is not None: + engine_specified = True + else: + engine = 'c' + engine_specified = False + + kwds = dict(delimiter=delimiter, + engine=engine, + dialect=dialect, + compression=compression, + engine_specified=engine_specified, + + doublequote=doublequote, + escapechar=escapechar, + quotechar=quotechar, + quoting=quoting, + skipinitialspace=skipinitialspace, + lineterminator=lineterminator, + + header=header, + index_col=index_col, + names=names, + prefix=prefix, + skiprows=skiprows, + na_values=na_values, + na_fvalues=na_fvalues, + true_values=true_values, + false_values=false_values, + keep_default_na=keep_default_na, + thousands=thousands, + comment=comment, + decimal=decimal, + + parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, + date_parser=date_parser, + + nrows=nrows, + iterator=iterator, + chunksize=chunksize, + skipfooter=skipfooter or skip_footer, + converters=converters, + dtype=dtype, + usecols=usecols, + verbose=verbose, + encoding=encoding, + squeeze=squeeze, + memory_map=memory_map, + + na_filter=na_filter, + compact_ints=compact_ints, + use_unsigned=use_unsigned, + delim_whitespace=delim_whitespace, + as_recarray=as_recarray, + warn_bad_lines=warn_bad_lines, + error_bad_lines=error_bad_lines, + low_memory=low_memory, + buffer_lines=buffer_lines, + mangle_dupe_cols=mangle_dupe_cols, + tupleize_cols=tupleize_cols, + infer_datetime_format=infer_datetime_format) + + return _read(filepath_or_buffer, kwds) + + parser_f.__name__ = name + + return parser_f + +read_csv = _make_parser_function('read_csv', sep=',') +read_csv = Appender(_read_csv_doc)(read_csv) + +read_table = _make_parser_function('read_table', sep='\t') +read_table = Appender(_read_table_doc)(read_table) + + +@Appender(_read_fwf_doc) +def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds): + # Check input arguments. 
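+    # A colspecs list or widths must be supplied, but not both explicitly;
+    # e.g. widths=[5, 3, 4] is translated below into the half-open
+    # colspecs [(0, 5), (5, 8), (8, 12)].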
+ if colspecs is None and widths is None: + raise ValueError("Must specify either colspecs or widths") + elif colspecs not in (None, 'infer') and widths is not None: + raise ValueError("You must specify only one of 'widths' and " + "'colspecs'") + + # Compute 'colspecs' from 'widths', if specified. + if widths is not None: + colspecs, col = [], 0 + for w in widths: + colspecs.append((col, col + w)) + col += w + + kwds['colspecs'] = colspecs + kwds['engine'] = 'python-fwf' + return _read(filepath_or_buffer, kwds) + + +# common NA values +# no longer excluding inf representations +# '1.#INF','-1.#INF', '1.#INF000000', +_NA_VALUES = set([ + '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA', '#NA', + 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' +]) + + +class TextFileReader(object): + """ + + Passed dialect overrides any of the related parser options + + """ + + def __init__(self, f, engine=None, **kwds): + + self.f = f + + if engine is not None: + engine_specified = True + else: + engine = 'python' + engine_specified = False + + self._engine_specified = kwds.get('engine_specified', engine_specified) + + if kwds.get('dialect') is not None: + dialect = kwds['dialect'] + kwds['delimiter'] = dialect.delimiter + kwds['doublequote'] = dialect.doublequote + kwds['escapechar'] = dialect.escapechar + kwds['skipinitialspace'] = dialect.skipinitialspace + kwds['quotechar'] = dialect.quotechar + kwds['quoting'] = dialect.quoting + + if kwds.get('header', 'infer') == 'infer': + kwds['header'] = 0 if kwds.get('names') is None else None + + self.orig_options = kwds + + # miscellanea + self.engine = engine + self._engine = None + + options = self._get_options_with_defaults(engine) + + self.chunksize = options.pop('chunksize', None) + self.squeeze = options.pop('squeeze', False) + + # might mutate self.engine + self.options, self.engine = self._clean_options(options, engine) + if 'has_index_names' in kwds: + self.options['has_index_names'] = kwds['has_index_names'] + + self._make_engine(self.engine) + + def _get_options_with_defaults(self, engine): + kwds = self.orig_options + + options = {} + + for argname, default in compat.iteritems(_parser_defaults): + options[argname] = kwds.get(argname, default) + + for argname, default in compat.iteritems(_c_parser_defaults): + if argname in kwds: + value = kwds[argname] + + if engine != 'c' and value != default: + raise ValueError('The %r option is not supported with the' + ' %r engine' % (argname, engine)) + else: + value = default + options[argname] = value + + if engine == 'python-fwf': + for argname, default in compat.iteritems(_fwf_defaults): + options[argname] = kwds.get(argname, default) + + return options + + def _clean_options(self, options, engine): + result = options.copy() + + engine_specified = self._engine_specified + fallback_reason = None + + sep = options['delimiter'] + delim_whitespace = options['delim_whitespace'] + + # C engine not supported yet + if engine == 'c': + if options['skip_footer'] > 0: + fallback_reason = "the 'c' engine does not support"\ + " skip_footer" + engine = 'python' + + if sep is None and not delim_whitespace: + if engine == 'c': + fallback_reason = "the 'c' engine does not support"\ + " sep=None with delim_whitespace=False" + engine = 'python' + elif sep is not None and len(sep) > 1: + if engine == 'c' and sep == '\s+': + result['delim_whitespace'] = True + del result['delimiter'] + elif engine not in ('python', 'python-fwf'): + # wait until regex engine integrated + fallback_reason = "the 'c' engine 
does not support"\ + " regex separators" + engine = 'python' + + if fallback_reason and engine_specified: + raise ValueError(fallback_reason) + + if engine == 'c': + for arg in _c_unsupported: + del result[arg] + + if 'python' in engine: + for arg in _python_unsupported: + if fallback_reason and result[arg] != _c_parser_defaults[arg]: + msg = ("Falling back to the 'python' engine because" + " {reason}, but this causes {option!r} to be" + " ignored as it is not supported by the 'python'" + " engine.").format(reason=fallback_reason, option=arg) + if arg == 'dtype': + msg += " (Note the 'converters' option provides"\ + " similar functionality.)" + raise ValueError(msg) + del result[arg] + + if fallback_reason: + warnings.warn(("Falling back to the 'python' engine because" + " {0}; you can avoid this warning by specifying" + " engine='python'.").format(fallback_reason), + ParserWarning) + + index_col = options['index_col'] + names = options['names'] + converters = options['converters'] + na_values = options['na_values'] + skiprows = options['skiprows'] + + # really delete this one + keep_default_na = result.pop('keep_default_na') + + if _is_index_col(index_col): + if not isinstance(index_col, (list, tuple, np.ndarray)): + index_col = [index_col] + result['index_col'] = index_col + + names = list(names) if names is not None else names + + # type conversion-related + if converters is not None: + if not isinstance(converters, dict): + raise TypeError('Type converters must be a dict or' + ' subclass, input was ' + 'a {0!r}'.format(type(converters).__name__)) + else: + converters = {} + + # Converting values to NA + na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + + if com.is_integer(skiprows): + skiprows = lrange(skiprows) + skiprows = set() if skiprows is None else set(skiprows) + + # put stuff back + result['names'] = names + result['converters'] = converters + result['na_values'] = na_values + result['na_fvalues'] = na_fvalues + result['skiprows'] = skiprows + + return result, engine + + def __iter__(self): + try: + if self.chunksize: + while True: + yield self.read(self.chunksize) + else: + yield self.read() + except StopIteration: + pass + + def _make_engine(self, engine='c'): + if engine == 'c': + self._engine = CParserWrapper(self.f, **self.options) + else: + if engine == 'python': + klass = PythonParser + elif engine == 'python-fwf': + klass = FixedWidthFieldParser + self._engine = klass(self.f, **self.options) + + def _failover_to_python(self): + raise NotImplementedError + + def read(self, nrows=None): + if nrows is not None: + if self.options.get('skip_footer'): + raise ValueError('skip_footer not supported for iteration') + + ret = self._engine.read(nrows) + + if self.options.get('as_recarray'): + return ret + + # May alter columns / col_dict + index, columns, col_dict = self._create_index(ret) + + df = DataFrame(col_dict, columns=columns, index=index) + + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]] + return df + + def _create_index(self, ret): + index, columns, col_dict = ret + return index, columns, col_dict + + def get_chunk(self, size=None): + if size is None: + size = self.chunksize + return self.read(nrows=size) + + +def _is_index_col(col): + return col is not None and col is not False + + +class ParserBase(object): + + def __init__(self, kwds): + self.names = kwds.get('names') + self.orig_names = None + self.prefix = kwds.pop('prefix', None) + + self.index_col = kwds.get('index_col', None) + self.index_names = None + self.col_names 
= None + + self.parse_dates = kwds.pop('parse_dates', False) + self.date_parser = kwds.pop('date_parser', None) + self.dayfirst = kwds.pop('dayfirst', False) + self.keep_date_col = kwds.pop('keep_date_col', False) + + self.na_values = kwds.get('na_values') + self.na_fvalues = kwds.get('na_fvalues') + self.true_values = kwds.get('true_values') + self.false_values = kwds.get('false_values') + self.tupleize_cols = kwds.get('tupleize_cols', False) + self.infer_datetime_format = kwds.pop('infer_datetime_format', False) + + self._date_conv = _make_date_converter( + date_parser=self.date_parser, + dayfirst=self.dayfirst, + infer_datetime_format=self.infer_datetime_format + ) + + # validate header options for mi + self.header = kwds.get('header') + if isinstance(self.header, (list, tuple, np.ndarray)): + if kwds.get('as_recarray'): + raise ValueError("cannot specify as_recarray when " + "specifying a multi-index header") + if kwds.get('usecols'): + raise ValueError("cannot specify usecols when " + "specifying a multi-index header") + if kwds.get('names'): + raise ValueError("cannot specify names when " + "specifying a multi-index header") + + # validate index_col that only contains integers + if self.index_col is not None: + is_sequence = isinstance(self.index_col, (list, tuple, + np.ndarray)) + if not (is_sequence and + all(map(com.is_integer, self.index_col)) or + com.is_integer(self.index_col)): + raise ValueError("index_col must only contain row numbers " + "when specifying a multi-index header") + + self._name_processed = False + + @property + def _has_complex_date_col(self): + return (isinstance(self.parse_dates, dict) or + (isinstance(self.parse_dates, list) and + len(self.parse_dates) > 0 and + isinstance(self.parse_dates[0], list))) + + def _should_parse_dates(self, i): + if isinstance(self.parse_dates, bool): + return self.parse_dates + else: + name = self.index_names[i] + j = self.index_col[i] + + if np.isscalar(self.parse_dates): + return (j == self.parse_dates) or (name == self.parse_dates) + else: + return (j in self.parse_dates) or (name in self.parse_dates) + + def _extract_multi_indexer_columns(self, header, index_names, col_names, + passed_names=False): + """ extract and return the names, index_names, col_names + header is a list-of-lists returned from the parsers """ + if len(header) < 2: + return header[0], index_names, col_names, passed_names + + # the names are the tuples of the header that are not the index cols + # 0 is the name of the index, assuming index_col is a list of column + # numbers + ic = self.index_col + if ic is None: + ic = [] + + if not isinstance(ic, (list, tuple, np.ndarray)): + ic = [ic] + sic = set(ic) + + # clean the index_names + index_names = header.pop(-1) + index_names, names, index_col = _clean_index_names(index_names, + self.index_col) + + # extract the columns + field_count = len(header[0]) + + def extract(r): + return tuple([r[i] for i in range(field_count) if i not in sic]) + + columns = lzip(*[extract(r) for r in header]) + names = ic + columns + + # if we find 'Unnamed' all of a single level, then our header was too + # long + for n in range(len(columns[0])): + if all(['Unnamed' in c[n] for c in columns]): + raise _parser.CParserError( + "Passed header=[%s] are too many rows for this " + "multi_index of columns" + % ','.join([str(x) for x in self.header]) + ) + + # clean the column names (if we have an index_col) + if len(ic): + col_names = [r[0] if len(r[0]) and 'Unnamed' not in r[0] else None + for r in header] + else: + col_names = [None] * 
len(header) + + passed_names = True + + return names, index_names, col_names, passed_names + + def _maybe_make_multi_index_columns(self, columns, col_names=None): + # possibly create a column mi here + if (not self.tupleize_cols and len(columns) and + not isinstance(columns, MultiIndex) and + all([isinstance(c, tuple) for c in columns])): + columns = MultiIndex.from_tuples(columns, names=col_names) + return columns + + def _make_index(self, data, alldata, columns, indexnamerow=False): + if not _is_index_col(self.index_col) or not self.index_col: + index = None + + elif not self._has_complex_date_col: + index = self._get_simple_index(alldata, columns) + index = self._agg_index(index) + + elif self._has_complex_date_col: + if not self._name_processed: + (self.index_names, _, + self.index_col) = _clean_index_names(list(columns), + self.index_col) + self._name_processed = True + index = self._get_complex_date_index(data, columns) + index = self._agg_index(index, try_parse_dates=False) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index = index.set_names(indexnamerow[:coffset]) + + # maybe create a mi on the columns + columns = self._maybe_make_multi_index_columns(columns, self.col_names) + + return index, columns + + _implicit_index = False + + def _get_simple_index(self, data, columns): + def ix(col): + if not isinstance(col, compat.string_types): + return col + raise ValueError('Index %s invalid' % col) + index = None + + to_remove = [] + index = [] + for idx in self.index_col: + i = ix(idx) + to_remove.append(i) + index.append(data[i]) + + # remove index items from content and columns, don't pop in + # loop + for i in reversed(sorted(to_remove)): + data.pop(i) + if not self._implicit_index: + columns.pop(i) + + return index + + def _get_complex_date_index(self, data, col_names): + def _get_name(icol): + if isinstance(icol, compat.string_types): + return icol + + if col_names is None: + raise ValueError(('Must supply column order to use %s as ' + 'index') % str(icol)) + + for i, c in enumerate(col_names): + if i == icol: + return c + + index = None + + to_remove = [] + index = [] + for idx in self.index_col: + name = _get_name(idx) + to_remove.append(name) + index.append(data[name]) + + # remove index items from content and columns, don't pop in + # loop + for c in reversed(sorted(to_remove)): + data.pop(c) + col_names.remove(c) + + return index + + def _agg_index(self, index, try_parse_dates=True): + arrays = [] + for i, arr in enumerate(index): + + if (try_parse_dates and self._should_parse_dates(i)): + arr = self._date_conv(arr) + + col_na_values = self.na_values + col_na_fvalues = self.na_fvalues + + if isinstance(self.na_values, dict): + col_name = self.index_names[i] + if col_name is not None: + col_na_values, col_na_fvalues = _get_na_values( + col_name, self.na_values, self.na_fvalues) + + arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues) + arrays.append(arr) + + index = MultiIndex.from_arrays(arrays, names=self.index_names) + + return index + + def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, + converters=None): + result = {} + for c, values in compat.iteritems(dct): + conv_f = None if converters is None else converters.get(c, None) + col_na_values, col_na_fvalues = _get_na_values(c, na_values, + na_fvalues) + coerce_type = True + if conv_f is not None: + values = lib.map_infer(values, conv_f) + coerce_type = False + cvals, na_count = self._convert_types( + values, set(col_na_values) | 
col_na_fvalues, coerce_type) + result[c] = cvals + if verbose and na_count: + print('Filled %d NA values in column %s' % (na_count, str(c))) + return result + + def _convert_types(self, values, na_values, try_num_bool=True): + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + mask = lib.ismember(values, na_values) + na_count = mask.sum() + if na_count > 0: + if com.is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + if try_num_bool: + try: + result = lib.maybe_convert_numeric(values, na_values, False) + except Exception: + result = values + if values.dtype == np.object_: + na_count = lib.sanitize_objects(result, na_values, False) + else: + result = values + if values.dtype == np.object_: + na_count = lib.sanitize_objects(values, na_values, False) + + if result.dtype == np.object_ and try_num_bool: + result = lib.maybe_convert_bool(values, + true_values=self.true_values, + false_values=self.false_values) + + return result, na_count + + def _do_date_conversions(self, names, data): + # returns data, columns + if self.parse_dates is not None: + data, names = _process_date_conversion( + data, self._date_conv, self.parse_dates, self.index_col, + self.index_names, names, keep_date_col=self.keep_date_col) + + return names, data + + +class CParserWrapper(ParserBase): + """ + + """ + + def __init__(self, src, **kwds): + self.kwds = kwds + kwds = kwds.copy() + + self.as_recarray = kwds.get('as_recarray', False) + ParserBase.__init__(self, kwds) + + if 'utf-16' in (kwds.get('encoding') or ''): + if isinstance(src, compat.string_types): + src = open(src, 'rb') + src = com.UTF8Recoder(src, kwds['encoding']) + kwds['encoding'] = 'utf-8' + + # #2442 + kwds['allow_leading_cols'] = self.index_col is not False + + self._reader = _parser.TextReader(src, **kwds) + + # XXX + self.usecols = self._reader.usecols + + passed_names = self.names is None + + if self._reader.header is None: + self.names = None + else: + if len(self._reader.header) > 1: + # we have a multi index in the columns + self.names, self.index_names, self.col_names, passed_names = ( + self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, + passed_names + ) + ) + else: + self.names = list(self._reader.header[0]) + + if self.names is None: + if self.prefix: + self.names = ['%s%d' % (self.prefix, i) + for i in range(self._reader.table_width)] + else: + self.names = lrange(self._reader.table_width) + + # If the names were inferred (not passed by user) and usedcols is + # defined, then ensure names refers to the used columns, not the + # document's columns. 
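+        # e.g. usecols=['b', 2] against inferred names ['a', 'b', 'c', 'd']
+        # resolves to col_indices [1, 2], leaving self.names == ['b', 'c']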
+ if self.usecols and passed_names: + col_indices = [] + for u in self.usecols: + if isinstance(u, string_types): + col_indices.append(self.names.index(u)) + else: + col_indices.append(u) + self.names = [n for i, n in enumerate(self.names) + if i in col_indices] + if len(self.names) < len(self.usecols): + raise ValueError("Usecols do not match names.") + + self._set_noconvert_columns() + + self.orig_names = self.names + + if not self._has_complex_date_col: + if (self._reader.leading_cols == 0 and + _is_index_col(self.index_col)): + + self._name_processed = True + (index_names, self.names, + self.index_col) = _clean_index_names(self.names, + self.index_col) + + if self.index_names is None: + self.index_names = index_names + + if self._reader.header is None and not passed_names: + self.index_names = [None] * len(self.index_names) + + self._implicit_index = self._reader.leading_cols > 0 + + def _set_noconvert_columns(self): + names = self.names + + def _set(x): + if com.is_integer(x): + self._reader.set_noconvert(x) + else: + self._reader.set_noconvert(names.index(x)) + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + def set_error_bad_lines(self, status): + self._reader.set_error_bad_lines(int(status)) + + def read(self, nrows=None): + if self.as_recarray: + # what to do if there are leading columns? + return self._reader.read(nrows) + + try: + data = self._reader.read(nrows) + except StopIteration: + if nrows is None: + return None, self.names, {} + else: + raise + + names = self.names + + if self._reader.leading_cols: + if self._has_complex_date_col: + raise NotImplementedError('file structure not yet supported') + + # implicit index, no index names + arrays = [] + + for i in range(self._reader.leading_cols): + if self.index_col is None: + values = data.pop(i) + else: + values = data.pop(self.index_col[i]) + + values = self._maybe_parse_dates(values, i, + try_parse_dates=True) + arrays.append(values) + + index = MultiIndex.from_arrays(arrays) + + if self.usecols is not None: + names = self._filter_usecols(names) + + # rename dict keys + data = sorted(data.items()) + data = dict((k, v) for k, (i, v) in zip(names, data)) + + names, data = self._do_date_conversions(names, data) + + else: + # rename dict keys + data = sorted(data.items()) + + # ugh, mutation + names = list(self.orig_names) + + if self.usecols is not None: + names = self._filter_usecols(names) + + # columns as list + alldata = [x[1] for x in data] + + data = dict((k, v) for k, (i, v) in zip(names, data)) + + names, data = self._do_date_conversions(names, data) + index, names = self._make_index(data, alldata, names) + + # maybe create a mi on the columns + names = self._maybe_make_multi_index_columns(names, self.col_names) + + return index, names, data + + def _filter_usecols(self, names): + # hackish + if self.usecols is not None and len(names) != len(self.usecols): + names = [name for i, name in enumerate(names) + if i in self.usecols or name in self.usecols] + return names + + def _get_index_names(self): + names = list(self._reader.header[0]) + idx_names = None + + if self._reader.leading_cols == 0 and self.index_col is not None: + (idx_names, names, + self.index_col) = _clean_index_names(names, self.index_col) + + return names, idx_names + + def _maybe_parse_dates(self, 
values, index, try_parse_dates=True): + if try_parse_dates and self._should_parse_dates(index): + values = self._date_conv(values) + return values + + +def TextParser(*args, **kwds): + """ + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. Also enables iterating + lazily over chunks of large files + + Parameters + ---------- + data : file-like object or list + delimiter : separator character to use + dialect : str or csv.Dialect instance, default None + Ignored if delimiter is longer than 1 character + names : sequence, default + header : int, default 0 + Row to use to parse column labels. Defaults to the first row. Prior + rows will be discarded + index_col : int or list, default None + Column or columns to use as the (possibly hierarchical) index + has_index_names: boolean, default False + True if the cols defined in index_col have an index name and are + not in the header + na_values : iterable, default None + Custom NA values + keep_default_na : bool, default True + thousands : str, default None + Thousands separator + comment : str, default None + Comment out remainder of line + parse_dates : boolean, default False + keep_date_col : boolean, default False + date_parser : function, default None + skiprows : list of integers + Row numbers to skip + skip_footer : int + Number of line at bottom of file to skip + encoding : string, default None + Encoding to use for UTF when reading/writing (ex. 'utf-8') + squeeze : boolean, default False + returns Series if only one column + infer_datetime_format: boolean, default False + If True and `parse_dates` is True for a column, try to infer the + datetime format based on the first datetime string. If the format + can be inferred, there often will be a large parsing speed-up. + """ + kwds['engine'] = 'python' + return TextFileReader(*args, **kwds) + + +def count_empty_vals(vals): + return sum([1 for v in vals if v == '' or v is None]) + + +def _wrap_compressed(f, compression, encoding=None): + """wraps compressed fileobject in a decompressing fileobject + NOTE: For all files in Python 3.2 and for bzip'd files under all Python + versions, this means reading in the entire file and then re-wrapping it in + StringIO. + """ + compression = compression.lower() + encoding = encoding or get_option('display.encoding') + if compression == 'gzip': + import gzip + + f = gzip.GzipFile(fileobj=f) + if compat.PY3_2: + # 3.2's gzip doesn't support read1 + f = StringIO(f.read().decode(encoding)) + elif compat.PY3: + from io import TextIOWrapper + + f = TextIOWrapper(f) + return f + elif compression == 'bz2': + import bz2 + + # bz2 module can't take file objects, so have to run through decompress + # manually + data = bz2.decompress(f.read()) + if compat.PY3: + data = data.decode(encoding) + f = StringIO(data) + return f + else: + raise ValueError('do not recognize compression method %s' + % compression) + + +class PythonParser(ParserBase): + + def __init__(self, f, **kwds): + """ + Workhorse function for processing nested list into DataFrame + + Should be replaced by np.genfromtxt eventually? 
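+
+        `f` may be a file path, an open file-like object, or a list of
+        already-split rows; anything without a ``readline`` method is used
+        directly as the row source.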
+ """ + ParserBase.__init__(self, kwds) + + self.data = None + self.buf = [] + self.pos = 0 + self.line_pos = 0 + + self.encoding = kwds['encoding'] + self.compression = kwds['compression'] + self.skiprows = kwds['skiprows'] + + self.skip_footer = kwds['skip_footer'] + self.delimiter = kwds['delimiter'] + + self.quotechar = kwds['quotechar'] + self.escapechar = kwds['escapechar'] + self.doublequote = kwds['doublequote'] + self.skipinitialspace = kwds['skipinitialspace'] + self.lineterminator = kwds['lineterminator'] + self.quoting = kwds['quoting'] + self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) + self.usecols = kwds['usecols'] + + self.names_passed = kwds['names'] or None + + self.has_index_names = False + if 'has_index_names' in kwds: + self.has_index_names = kwds['has_index_names'] + + self.verbose = kwds['verbose'] + self.converters = kwds['converters'] + + self.thousands = kwds['thousands'] + self.comment = kwds['comment'] + self._comment_lines = [] + + if isinstance(f, compat.string_types): + f = com._get_handle(f, 'r', encoding=self.encoding, + compression=self.compression) + elif self.compression: + f = _wrap_compressed(f, self.compression, self.encoding) + # in Python 3, convert BytesIO or fileobjects passed with an encoding + elif compat.PY3 and isinstance(f, compat.BytesIO): + from io import TextIOWrapper + + f = TextIOWrapper(f, encoding=self.encoding) + + # Set self.data to something that can read lines. + if hasattr(f, 'readline'): + self._make_reader(f) + else: + self.data = f + + # Get columns in two steps: infer from data, then + # infer column indices from self.usecols if is is specified. + self._col_indices = None + self.columns, self.num_original_columns = self._infer_columns() + + # Now self.columns has the set of columns that we will process. + # The original set is stored in self.original_columns. + if len(self.columns) > 1: + # we are processing a multi index column + self.columns, self.index_names, self.col_names, _ = ( + self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names + ) + ) + # Update list of original names to include all indices. + self.num_original_columns = len(self.columns) + else: + self.columns = self.columns[0] + + # get popped off for index + self.orig_names = list(self.columns) + + # needs to be cleaned/refactored + # multiple date column thing turning into a real spaghetti factory + if not self._has_complex_date_col: + (index_names, + self.orig_names, self.columns) = self._get_index_name(self.columns) + self._name_processed = True + if self.index_names is None: + self.index_names = index_names + self._first_chunk = True + + if self.parse_dates: + self._no_thousands_columns = self._set_no_thousands_columns() + else: + self._no_thousands_columns = None + + def _set_no_thousands_columns(self): + # Create a set of column ids that are not to be stripped of thousands + # operators. 
+ noconvert_columns = set() + + def _set(x): + if com.is_integer(x): + noconvert_columns.add(x) + else: + noconvert_columns.add(self.columns.index(x)) + + if isinstance(self.parse_dates, list): + for val in self.parse_dates: + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + + elif isinstance(self.parse_dates, dict): + for val in self.parse_dates.values(): + if isinstance(val, list): + for k in val: + _set(k) + else: + _set(val) + return noconvert_columns + + def _make_reader(self, f): + sep = self.delimiter + + if sep is None or len(sep) == 1: + if self.lineterminator: + raise ValueError('Custom line terminators not supported in ' + 'python parser (yet)') + + class MyDialect(csv.Dialect): + delimiter = self.delimiter + quotechar = self.quotechar + escapechar = self.escapechar + doublequote = self.doublequote + skipinitialspace = self.skipinitialspace + quoting = self.quoting + lineterminator = '\n' + + dia = MyDialect + + sniff_sep = True + + if sep is not None: + sniff_sep = False + dia.delimiter = sep + # attempt to sniff the delimiter + if sniff_sep: + line = f.readline() + while self.pos in self.skiprows: + self.pos += 1 + line = f.readline() + + line = self._check_comments([line])[0] + + self.pos += 1 + self.line_pos += 1 + sniffed = csv.Sniffer().sniff(line) + dia.delimiter = sniffed.delimiter + if self.encoding is not None: + self.buf.extend(list( + com.UnicodeReader(StringIO(line), + dialect=dia, + encoding=self.encoding))) + else: + self.buf.extend(list(csv.reader(StringIO(line), + dialect=dia))) + + if self.encoding is not None: + reader = com.UnicodeReader(f, dialect=dia, + encoding=self.encoding, + strict=True) + else: + reader = csv.reader(f, dialect=dia, + strict=True) + + else: + def _read(): + line = next(f) + pat = re.compile(sep) + yield pat.split(line.strip()) + for line in f: + yield pat.split(line.strip()) + reader = _read() + + self.data = reader + + def read(self, rows=None): + try: + content = self._get_lines(rows) + except StopIteration: + if self._first_chunk: + content = [] + else: + raise + + # done with first read, next time raise StopIteration + self._first_chunk = False + + columns = list(self.orig_names) + if not len(content): # pragma: no cover + # DataFrame with the right metadata, even though it's length 0 + return _get_empty_meta(self.orig_names, + self.index_col, + self.index_names) + + # handle new style for names in index + count_empty_content_vals = count_empty_vals(content[0]) + indexnamerow = None + if self.has_index_names and count_empty_content_vals == len(columns): + indexnamerow = content[0] + content = content[1:] + + alldata = self._rows_to_cols(content) + data = self._exclude_implicit_index(alldata) + + columns, data = self._do_date_conversions(self.columns, data) + + data = self._convert_data(data) + index, columns = self._make_index(data, alldata, columns, indexnamerow) + + return index, columns, data + + def _exclude_implicit_index(self, alldata): + + if self._implicit_index: + excl_indices = self.index_col + + data = {} + offset = 0 + for i, col in enumerate(self.orig_names): + while i + offset in excl_indices: + offset += 1 + data[col] = alldata[i + offset] + else: + data = dict((k, v) for k, v in zip(self.orig_names, alldata)) + + return data + + # legacy + def get_chunk(self, size=None): + if size is None: + size = self.chunksize + return self.read(nrows=size) + + def _convert_data(self, data): + # apply converters + clean_conv = {} + + for col, f in compat.iteritems(self.converters): + if isinstance(col, 
int) and col not in self.orig_names: + col = self.orig_names[col] + clean_conv[col] = f + + return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, + self.verbose, clean_conv) + + def _infer_columns(self): + names = self.names + num_original_columns = 0 + clear_buffer = True + if self.header is not None: + header = self.header + + # we have a mi columns, so read an extra line + if isinstance(header, (list, tuple, np.ndarray)): + have_mi_columns = True + header = list(header) + [header[-1] + 1] + else: + have_mi_columns = False + header = [header] + + columns = [] + for level, hr in enumerate(header): + line = self._buffered_line() + + while self.line_pos <= hr: + line = self._next_line() + unnamed_count = 0 + this_columns = [] + for i, c in enumerate(line): + if c == '': + if have_mi_columns: + this_columns.append('Unnamed: %d_level_%d' + % (i, level)) + else: + this_columns.append('Unnamed: %d' % i) + unnamed_count += 1 + else: + this_columns.append(c) + + if not have_mi_columns and self.mangle_dupe_cols: + counts = {} + for i, col in enumerate(this_columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + this_columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 + elif have_mi_columns: + + # if we have grabbed an extra line, but its not in our + # format so save in the buffer, and create an blank extra + # line for the rest of the parsing code + if hr == header[-1]: + lc = len(this_columns) + ic = (len(self.index_col) + if self.index_col is not None else 0) + if lc != unnamed_count and lc-ic > unnamed_count: + clear_buffer = False + this_columns = [None] * lc + self.buf = [self.buf[-1]] + + columns.append(this_columns) + if len(columns) == 1: + num_original_columns = len(this_columns) + + if clear_buffer: + self._clear_buffer() + + if names is not None: + if ((self.usecols is not None + and len(names) != len(self.usecols)) + or (self.usecols is None + and len(names) != len(columns[0]))): + raise ValueError('Number of passed names did not match ' + 'number of header fields in the file') + if len(columns) > 1: + raise TypeError('Cannot pass names with multi-index ' + 'columns') + + if self.usecols is not None: + # Set _use_cols. We don't store columns because they are + # overwritten. + self._handle_usecols(columns, names) + else: + self._col_indices = None + num_original_columns = len(names) + columns = [names] + else: + columns = self._handle_usecols(columns, columns[0]) + else: + # header is None + line = self._buffered_line() + ncols = len(line) + num_original_columns = ncols + if not names: + if self.prefix: + columns = [['%s%d' % (self.prefix,i) for i in range(ncols)]] + else: + columns = [lrange(ncols)] + columns = self._handle_usecols(columns, columns[0]) + else: + if self.usecols is None or len(names) == num_original_columns: + columns = self._handle_usecols([names], names) + num_original_columns = len(names) + else: + if self.usecols and len(names) != len(self.usecols): + raise ValueError( + 'Number of passed names did not match number of ' + 'header fields in the file' + ) + # Ignore output but set used columns. + self._handle_usecols([names], names) + columns = [names] + num_original_columns = ncols + + return columns, num_original_columns + + def _handle_usecols(self, columns, usecols_key): + """ + Sets self._col_indices + + usecols_key is used if there are string usecols. 
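+
+        For example, usecols=['b'] with usecols_key=['a', 'b', 'c'] sets
+        self._col_indices to [1] and keeps only that column from each
+        header row.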
+ """ + if self.usecols is not None: + if any([isinstance(u, string_types) for u in self.usecols]): + if len(columns) > 1: + raise ValueError("If using multiple headers, usecols must " + "be integers.") + col_indices = [] + for u in self.usecols: + if isinstance(u, string_types): + col_indices.append(usecols_key.index(u)) + else: + col_indices.append(u) + else: + col_indices = self.usecols + + columns = [[n for i, n in enumerate(column) if i in col_indices] + for column in columns] + self._col_indices = col_indices + return columns + + def _buffered_line(self): + """ + Return a line from buffer, filling buffer if required. + """ + if len(self.buf) > 0: + return self.buf[0] + else: + return self._next_line() + + def _empty(self, line): + return not line or all(not x for x in line) + + def _next_line(self): + if isinstance(self.data, list): + while self.pos in self.skiprows: + self.pos += 1 + + while True: + try: + line = self._check_comments([self.data[self.pos]])[0] + self.pos += 1 + # either uncommented or blank to begin with + if self._empty(self.data[self.pos - 1]) or line: + break + except IndexError: + raise StopIteration + else: + while self.pos in self.skiprows: + next(self.data) + self.pos += 1 + + while True: + orig_line = next(self.data) + line = self._check_comments([orig_line])[0] + self.pos += 1 + if self._empty(orig_line) or line: + break + + self.line_pos += 1 + self.buf.append(line) + + return line + + def _check_comments(self, lines): + if self.comment is None: + return lines + ret = [] + for l in lines: + rl = [] + for x in l: + if (not isinstance(x, compat.string_types) or + self.comment not in x): + rl.append(x) + else: + x = x[:x.find(self.comment)] + if len(x) > 0: + rl.append(x) + break + ret.append(rl) + return ret + + def _check_thousands(self, lines): + if self.thousands is None: + return lines + nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands) + ret = [] + for l in lines: + rl = [] + for i, x in enumerate(l): + if (not isinstance(x, compat.string_types) or + self.thousands not in x or + (self._no_thousands_columns + and i in self._no_thousands_columns) + or nonnum.search(x.strip())): + rl.append(x) + else: + rl.append(x.replace(self.thousands, '')) + ret.append(rl) + return ret + + def _clear_buffer(self): + self.buf = [] + + _implicit_index = False + + def _get_index_name(self, columns): + """ + Try several cases to get lines: + + 0) There are headers on row 0 and row 1 and their + total summed lengths equals the length of the next line. + Treat row 0 as columns and row 1 as indices + 1) Look for implicit index: there are more columns + on row 1 than row 0. If this is true, assume that row + 1 lists index columns and row 0 lists normal columns. + 2) Get index from the columns if it was listed. + """ + orig_names = list(columns) + columns = list(columns) + + try: + line = self._next_line() + except StopIteration: + line = None + + try: + next_line = self._next_line() + except StopIteration: + next_line = None + + # implicitly index_col=0 b/c 1 fewer column names + implicit_first_cols = 0 + if line is not None: + # leave it 0, #2442 + # Case 1 + if self.index_col is not False: + implicit_first_cols = len(line) - self.num_original_columns + + # Case 0 + if next_line is not None: + if len(next_line) == len(line) + self.num_original_columns: + # column and index names on diff rows + self.index_col = lrange(len(line)) + self.buf = self.buf[1:] + + for c in reversed(line): + columns.insert(0, c) + + # Update list of original names to include all indices. 
+ orig_names = list(columns) + self.num_original_columns = len(columns) + return line, orig_names, columns + + if implicit_first_cols > 0: + # Case 1 + self._implicit_index = True + if self.index_col is None: + self.index_col = lrange(implicit_first_cols) + + index_name = None + + else: + # Case 2 + (index_name, columns_, + self.index_col) = _clean_index_names(columns, self.index_col) + + return index_name, orig_names, columns + + def _rows_to_cols(self, content): + zipped_content = list(lib.to_object_array(content).T) + + col_len = self.num_original_columns + zip_len = len(zipped_content) + + if self._implicit_index: + col_len += len(self.index_col) + + if self.skip_footer < 0: + raise ValueError('skip footer cannot be negative') + + # Loop through rows to verify lengths are correct. + if col_len != zip_len and self.index_col is not False: + i = 0 + for (i, l) in enumerate(content): + if len(l) != col_len: + break + + footers = 0 + if self.skip_footer: + footers = self.skip_footer + + row_num = self.pos - (len(content) - i + footers) + + msg = ('Expected %d fields in line %d, saw %d' % + (col_len, row_num + 1, zip_len)) + raise ValueError(msg) + + if self.usecols: + if self._implicit_index: + zipped_content = [ + a for i, a in enumerate(zipped_content) + if (i < len(self.index_col) + or i - len(self.index_col) in self._col_indices) + ] + else: + zipped_content = [a for i, a in enumerate(zipped_content) + if i in self._col_indices] + return zipped_content + + def _get_lines(self, rows=None): + source = self.data + lines = self.buf + new_rows = None + + # already fetched some number + if rows is not None: + + # we already have the lines in the buffer + if len(self.buf) >= rows: + new_rows, self.buf = self.buf[:rows], self.buf[rows:] + + # need some lines + else: + rows -= len(self.buf) + + if new_rows is None: + if isinstance(source, list): + if self.pos > len(source): + raise StopIteration + if rows is None: + new_rows = source[self.pos:] + new_pos = len(source) + else: + new_rows = source[self.pos:self.pos + rows] + new_pos = self.pos + rows + + # Check for stop rows. n.b.: self.skiprows is a set. 
+ if self.skiprows: + new_rows = [row for i, row in enumerate(new_rows) + if i + self.pos not in self.skiprows] + + lines.extend(new_rows) + self.pos = new_pos + + else: + new_rows = [] + try: + if rows is not None: + for _ in range(rows): + new_rows.append(next(source)) + lines.extend(new_rows) + else: + rows = 0 + while True: + try: + new_rows.append(next(source)) + rows += 1 + except csv.Error as inst: + if 'newline inside string' in str(inst): + row_num = str(self.pos + rows) + msg = ('EOF inside string starting with ' + 'line ' + row_num) + raise Exception(msg) + raise + except StopIteration: + if self.skiprows: + new_rows = [row for i, row in enumerate(new_rows) + if self.pos + i not in self.skiprows] + lines.extend(new_rows) + if len(lines) == 0: + raise + self.pos += len(new_rows) + + self.buf = [] + else: + lines = new_rows + + if self.skip_footer: + lines = lines[:-self.skip_footer] + + lines = self._check_comments(lines) + return self._check_thousands(lines) + + +def _make_date_converter(date_parser=None, dayfirst=False, + infer_datetime_format=False): + def converter(*date_cols): + if date_parser is None: + strs = _concat_date_cols(date_cols) + try: + return tools.to_datetime( + com._ensure_object(strs), + utc=None, + box=False, + dayfirst=dayfirst, + infer_datetime_format=infer_datetime_format + ) + except: + return lib.try_parse_dates(strs, dayfirst=dayfirst) + else: + try: + result = date_parser(*date_cols) + if isinstance(result, datetime.datetime): + raise Exception('scalar parser') + return result + except Exception: + try: + return lib.try_parse_dates(_concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst) + except Exception: + return generic_parser(date_parser, *date_cols) + + return converter + + +def _process_date_conversion(data_dict, converter, parse_spec, + index_col, index_names, columns, + keep_date_col=False): + def _isindex(colspec): + return ((isinstance(index_col, list) and + colspec in index_col) + or (isinstance(index_names, list) and + colspec in index_names)) + + new_cols = [] + new_data = {} + + orig_names = columns + columns = list(columns) + + date_cols = set() + + if parse_spec is None or isinstance(parse_spec, bool): + return data_dict, columns + + if isinstance(parse_spec, list): + # list of column lists + for colspec in parse_spec: + if np.isscalar(colspec): + if isinstance(colspec, int) and colspec not in data_dict: + colspec = orig_names[colspec] + if _isindex(colspec): + continue + data_dict[colspec] = converter(data_dict[colspec]) + else: + new_name, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names) + if new_name in data_dict: + raise ValueError('New date column already in dict %s' % + new_name) + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + elif isinstance(parse_spec, dict): + # dict of new name to column list + for new_name, colspec in compat.iteritems(parse_spec): + if new_name in data_dict: + raise ValueError('Date column %s already in dict' % + new_name) + + _, col, old_names = _try_convert_dates(converter, colspec, + data_dict, orig_names) + + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + data_dict.update(new_data) + new_cols.extend(columns) + + if not keep_date_col: + for c in list(date_cols): + data_dict.pop(c) + new_cols.remove(c) + + return data_dict, new_cols + + +def _try_convert_dates(parser, colspec, data_dict, columns): + colset = set(columns) + colnames = [] + + for c in colspec: + if c in colset: + 
colnames.append(c) + elif isinstance(c, int) and c not in columns: + colnames.append(str(columns[c])) + else: + colnames.append(c) + + new_name = '_'.join([str(x) for x in colnames]) + to_parse = [data_dict[c] for c in colnames if c in data_dict] + + new_col = parser(*to_parse) + return new_name, new_col, colnames + + +def _clean_na_values(na_values, keep_default_na=True): + + if na_values is None: + if keep_default_na: + na_values = _NA_VALUES + else: + na_values = [] + na_fvalues = set() + elif isinstance(na_values, dict): + if keep_default_na: + for k, v in compat.iteritems(na_values): + v = set(list(v)) | _NA_VALUES + na_values[k] = v + na_fvalues = dict([ + (k, _floatify_na_values(v)) for k, v in na_values.items() + ]) + else: + if not com.is_list_like(na_values): + na_values = [na_values] + na_values = _stringify_na_values(na_values) + if keep_default_na: + na_values = na_values | _NA_VALUES + + na_fvalues = _floatify_na_values(na_values) + + return na_values, na_fvalues + + +def _clean_index_names(columns, index_col): + if not _is_index_col(index_col): + return None, columns, index_col + + columns = list(columns) + + cp_cols = list(columns) + index_names = [] + + # don't mutate + index_col = list(index_col) + + for i, c in enumerate(index_col): + if isinstance(c, compat.string_types): + index_names.append(c) + for j, name in enumerate(cp_cols): + if name == c: + index_col[i] = j + columns.remove(name) + break + else: + name = cp_cols[c] + columns.remove(name) + index_names.append(name) + + # hack + if isinstance(index_names[0], compat.string_types)\ + and 'Unnamed' in index_names[0]: + index_names[0] = None + + return index_names, columns, index_col + + +def _get_empty_meta(columns, index_col, index_names): + columns = list(columns) + + if index_col is not None: + index = MultiIndex.from_arrays([[]] * len(index_col), + names=index_names) + for n in index_col: + columns.pop(n) + else: + index = Index([]) + + return index, columns, {} + + +def _floatify_na_values(na_values): + # create float versions of the na_values + result = set() + for v in na_values: + try: + v = float(v) + if not np.isnan(v): + result.add(v) + except: + pass + return result + + +def _stringify_na_values(na_values): + """ return a stringified and numeric for these values """ + result = [] + for x in na_values: + result.append(str(x)) + result.append(x) + try: + v = float(x) + + # we are like 999 here + if v == int(v): + v = int(v) + result.append("%s.0" % v) + result.append(str(v)) + + result.append(v) + except: + pass + try: + result.append(int(x)) + except: + pass + return set(result) + + +def _get_na_values(col, na_values, na_fvalues): + if isinstance(na_values, dict): + if col in na_values: + values = na_values[col] + fvalues = na_fvalues[col] + return na_values[col], na_fvalues[col] + else: + return _NA_VALUES, set() + else: + return na_values, na_fvalues + + +def _get_col_names(colspec, columns): + colset = set(columns) + colnames = [] + for c in colspec: + if c in colset: + colnames.append(c) + elif isinstance(c, int): + colnames.append(columns[c]) + return colnames + + +def _concat_date_cols(date_cols): + if len(date_cols) == 1: + if compat.PY3: + return np.array([compat.text_type(x) for x in date_cols[0]], + dtype=object) + else: + return np.array([ + str(x) if not isinstance(x, compat.string_types) else x + for x in date_cols[0] + ], dtype=object) + + rs = np.array([' '.join([compat.text_type(y) for y in x]) + for x in zip(*date_cols)], dtype=object) + return rs + + +class FixedWidthReader(object): + 
""" + A reader of fixed-width lines. + """ + def __init__(self, f, colspecs, delimiter, comment): + self.f = f + self.buffer = None + self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' + self.comment = comment + if colspecs == 'infer': + self.colspecs = self.detect_colspecs() + else: + self.colspecs = colspecs + + if not isinstance(self.colspecs, (tuple, list)): + raise TypeError("column specifications must be a list or tuple, " + "input was a %r" % type(colspecs).__name__) + + for colspec in self.colspecs: + + if not (isinstance(colspec, (tuple, list)) and + len(colspec) == 2 and + isinstance(colspec[0], (int, np.integer, type(None))) and + isinstance(colspec[1], (int, np.integer, type(None)))): + raise TypeError('Each column specification must be ' + '2 element tuple or list of integers') + + def get_rows(self, n): + rows = [] + for i, row in enumerate(self.f, 1): + rows.append(row) + if i >= n: + break + self.buffer = iter(rows) + return rows + + def detect_colspecs(self, n=100): + # Regex escape the delimiters + delimiters = ''.join([r'\%s' % x for x in self.delimiter]) + pattern = re.compile('([^%s]+)' % delimiters) + rows = self.get_rows(n) + max_len = max(map(len, rows)) + mask = np.zeros(max_len + 1, dtype=int) + if self.comment is not None: + rows = [row.partition(self.comment)[0] for row in rows] + for row in rows: + for m in pattern.finditer(row): + mask[m.start():m.end()] = 1 + shifted = np.roll(mask, 1) + shifted[0] = 0 + edges = np.where((mask ^ shifted) == 1)[0] + return list(zip(edges[::2], edges[1::2])) + + def next(self): + if self.buffer is not None: + try: + line = next(self.buffer) + except StopIteration: + self.buffer = None + line = next(self.f) + else: + line = next(self.f) + # Note: 'colspecs' is a sequence of half-open intervals. + return [line[fromm:to].strip(self.delimiter) + for (fromm, to) in self.colspecs] + + # Iterator protocol in Python 3 uses __next__() + __next__ = next + + +class FixedWidthFieldParser(PythonParser): + """ + Specialization that Converts fixed-width fields into DataFrames. + See PythonParser for details. + """ + def __init__(self, f, **kwds): + # Support iterators, convert to a list. + self.colspecs = kwds.pop('colspecs') + + PythonParser.__init__(self, f, **kwds) + + def _make_reader(self, f): + self.data = FixedWidthReader(f, self.colspecs, self.delimiter, + self.comment) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py new file mode 100644 index 00000000..e80bfec9 --- /dev/null +++ b/pandas/io/pickle.py @@ -0,0 +1,65 @@ +from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3 + + +def to_pickle(obj, path): + """ + Pickle (serialize) object to input file path + + Parameters + ---------- + obj : any object + path : string + File path + """ + with open(path, 'wb') as f: + pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) + + +def read_pickle(path): + """ + Load pickled pandas object (or any other pickled object) from the specified + file path + + Warning: Loading pickled data received from untrusted sources can be + unsafe. 
See: http://docs.python.org/2.7/library/pickle.html + + Parameters + ---------- + path : string + File path + + Returns + ------- + unpickled : type of object stored in file + """ + + def try_read(path, encoding=None): + # try with cPickle + # try with current pickle, if we have a Type Error then + # try with the compat pickle to handle subclass changes + # pass encoding only if its not None as py2 doesn't handle + # the param + + # cpickle + # GH 6899 + try: + with open(path, 'rb') as fh: + return pkl.load(fh) + except: + + # reg/patched pickle + try: + with open(path, 'rb') as fh: + return pc.load(fh, encoding=encoding, compat=False) + + # compat pickle + except: + with open(path, 'rb') as fh: + return pc.load(fh, encoding=encoding, compat=True) + + try: + return try_read(path) + except: + if PY3: + return try_read(path, encoding='latin1') + raise diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py new file mode 100644 index 00000000..cee1867e --- /dev/null +++ b/pandas/io/pytables.py @@ -0,0 +1,4375 @@ +""" +High level interface to PyTables for reading and writing pandas data structures +to disk +""" + +# pylint: disable-msg=E1101,W0613,W0603 +from datetime import datetime, date +import time +import re +import copy +import itertools +import warnings +import os + +import numpy as np +from pandas import (Series, TimeSeries, DataFrame, Panel, Panel4D, Index, + MultiIndex, Int64Index, Timestamp, _np_version_under1p7) +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.array import BlockIndex, IntIndex +from pandas.tseries.api import PeriodIndex, DatetimeIndex +from pandas.core.base import StringMixin +from pandas.core.common import adjoin, pprint_thing +from pandas.core.algorithms import match, unique +from pandas.core.categorical import Categorical +from pandas.core.common import _asarray_tuplesafe +from pandas.core.internals import BlockManager, make_block +from pandas.core.reshape import block2d_to_blocknd, factor_indexer +from pandas.core.index import _ensure_index +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type +import pandas.core.common as com +from pandas.tools.merge import concat +from pandas import compat +from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter +from pandas.io.common import PerformanceWarning +from pandas.core.config import get_option +from pandas.computation.pytables import Expr, maybe_expression + +import pandas.lib as lib +import pandas.algos as algos +import pandas.tslib as tslib + +from contextlib import contextmanager +from distutils.version import LooseVersion + +# versioning attribute +_version = '0.10.1' + +# PY3 encoding if we don't specify +_default_encoding = 'UTF-8' + +def _ensure_decoded(s): + """ if we have bytes, decode them to unicde """ + if isinstance(s, np.bytes_): + s = s.decode('UTF-8') + return s + + +def _ensure_encoding(encoding): + # set the encoding if we need + if encoding is None: + if PY3: + encoding = _default_encoding + return encoding + + +Term = Expr + + +def _ensure_term(where, scope_level): + """ + ensure that the where is a Term or a list of Term + this makes sure that we are capturing the scope of variables + that are passed + create the terms here with a frame_level=2 (we are 2 levels down) + """ + + # only consider list/tuple here as an ndarray is automaticaly a coordinate + # list + level = scope_level + 1 + if isinstance(where, (list, tuple)): + wlist = [] + for w in filter(lambda x: x is not None, where): + if not 
maybe_expression(w): + wlist.append(w) + else: + wlist.append(Term(w, scope_level=level)) + where = wlist + elif maybe_expression(where): + where = Term(where, scope_level=level) + return where + + +class PossibleDataLossError(Exception): + pass + + +class ClosedFileError(Exception): + pass + + +class IncompatibilityWarning(Warning): + pass + +incompatibility_doc = """ +where criteria is being ignored as this version [%s] is too old (or +not-defined), read the file in and write it out to a new file to upgrade (with +the copy_to method) +""" + + +class AttributeConflictWarning(Warning): + pass + +attribute_conflict_doc = """ +the [%s] attribute of the existing index is [%s] which conflicts with the new +[%s], resetting the attribute to None +""" + + +class DuplicateWarning(Warning): + pass + +duplicate_doc = """ +duplicate entries in table, taking most recently appended +""" + +performance_doc = """ +your performance may suffer as PyTables will pickle object types that it cannot +map directly to c-types [inferred_type->%s,key->%s] [items->%s] +""" + +# formats +_FORMAT_MAP = { + u('f'): 'fixed', + u('fixed'): 'fixed', + u('t'): 'table', + u('table'): 'table', +} + +format_deprecate_doc = """ +the table keyword has been deprecated +use the format='fixed(f)|table(t)' keyword instead + fixed(f) : specifies the Fixed format + and is the default for put operations + table(t) : specifies the Table format + and is the default for append operations +""" + +# map object types +_TYPE_MAP = { + + Series: u('series'), + SparseSeries: u('sparse_series'), + TimeSeries: u('series'), + DataFrame: u('frame'), + SparseDataFrame: u('sparse_frame'), + Panel: u('wide'), + Panel4D: u('ndim'), + SparsePanel: u('sparse_panel') +} + +# storer class map +_STORER_MAP = { + u('TimeSeries'): 'LegacySeriesFixed', + u('Series'): 'LegacySeriesFixed', + u('DataFrame'): 'LegacyFrameFixed', + u('DataMatrix'): 'LegacyFrameFixed', + u('series'): 'SeriesFixed', + u('sparse_series'): 'SparseSeriesFixed', + u('frame'): 'FrameFixed', + u('sparse_frame'): 'SparseFrameFixed', + u('wide'): 'PanelFixed', + u('sparse_panel'): 'SparsePanelFixed', +} + +# table class map +_TABLE_MAP = { + u('generic_table'): 'GenericTable', + u('appendable_series'): 'AppendableSeriesTable', + u('appendable_multiseries'): 'AppendableMultiSeriesTable', + u('appendable_frame'): 'AppendableFrameTable', + u('appendable_multiframe'): 'AppendableMultiFrameTable', + u('appendable_panel'): 'AppendablePanelTable', + u('appendable_ndim'): 'AppendableNDimTable', + u('worm'): 'WORMTable', + u('legacy_frame'): 'LegacyFrameTable', + u('legacy_panel'): 'LegacyPanelTable', +} + +# axes map +_AXES_MAP = { + DataFrame: [0], + Panel: [1, 2], + Panel4D: [1, 2, 3], +} + +# register our configuration options +from pandas.core import config +dropna_doc = """ +: boolean + drop ALL nan rows when appending to a table +""" +format_doc = """ +: format + default format writing format, if None, then + put will default to 'fixed' and append will default to 'table' +""" + +with config.config_prefix('io.hdf'): + config.register_option('dropna_table', True, dropna_doc, + validator=config.is_bool) + config.register_option( + 'default_format', None, format_doc, + validator=config.is_one_of_factory(['fixed', 'table', None]) + ) + +# oh the troubles to reduce import time +_table_mod = None +_table_supports_index = False +_table_file_open_policy_is_strict = False + +def _tables(): + global _table_mod + global _table_supports_index + global _table_file_open_policy_is_strict + if _table_mod is 
None: + import tables + _table_mod = tables + + # version requirements + ver = tables.__version__ + _table_supports_index = LooseVersion(ver) >= '2.3' + + # set the file open policy + # return the file open policy; this changes as of pytables 3.1 + # depending on the HDF5 version + try: + _table_file_open_policy_is_strict = tables.file._FILE_OPEN_POLICY == 'strict' + except: + pass + + return _table_mod + +@contextmanager +def get_store(path, **kwargs): + """ + Creates an HDFStore instance. This function can be used in a with statement + + Parameters + ---------- + same as HDFStore + + Examples + -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) + >>> with get_store('test.h5') as store: + ... store['foo'] = bar # write to HDF5 + ... bar = store['foo'] # retrieve + """ + store = None + try: + store = HDFStore(path, **kwargs) + yield store + finally: + if store is not None: + store.close() + + +# interface to/from ### + +def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, + append=None, **kwargs): + """ store this object, close it if we opened it """ + if append: + f = lambda store: store.append(key, value, **kwargs) + else: + f = lambda store: store.put(key, value, **kwargs) + + if isinstance(path_or_buf, string_types): + with get_store(path_or_buf, mode=mode, complevel=complevel, + complib=complib) as store: + f(store) + else: + f(path_or_buf) + + +def read_hdf(path_or_buf, key, **kwargs): + """ read from the store, close it if we opened it + + Retrieve pandas object stored in file, optionally based on where + criteria + + Parameters + ---------- + path_or_buf : path (string), or buffer to read from + key : group identifier in the store + where : list of Term (or convertable) objects, optional + start : optional, integer (defaults to None), row number to start + selection + stop : optional, integer (defaults to None), row number to stop + selection + columns : optional, a list of columns that if not None, will limit the + return columns + iterator : optional, boolean, return an iterator, default False + chunksize : optional, nrows to include in iteration, return an iterator + auto_close : optional, boolean, should automatically close the store + when finished, default is False + + Returns + ------- + The selected object + + """ + + # grab the scope + if 'where' in kwargs: + kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) + + f = lambda store, auto_close: store.select( + key, auto_close=auto_close, **kwargs) + + if isinstance(path_or_buf, string_types): + + # can't auto open/close if we are using an iterator + # so delegate to the iterator + store = HDFStore(path_or_buf, **kwargs) + try: + return f(store, True) + except: + + # if there is an error, close the store + try: + store.close() + except: + pass + + raise + + # a passed store; user controls open/close + f(path_or_buf, False) + + +class HDFStore(StringMixin): + + """ + dict-like IO interface for storing pandas objects in PyTables + either Fixed or Table format. + + Parameters + ---------- + path : string + File path to HDF5 file + mode : {'a', 'w', 'r', 'r+'}, default 'a' + + ``'r'`` + Read-only; no data can be modified. + ``'w'`` + Write; a new file is created (an existing file with the same + name would be deleted). + ``'a'`` + Append; an existing file is opened for reading and writing, + and if the file does not exist it is created. + ``'r+'`` + It is similar to ``'a'``, but the file must already exist. 
+ complevel : int, 1-9, default 0 + If a complib is specified compression will be applied + where possible + complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None + If complevel is > 0 apply compression to objects written + in the store wherever possible + fletcher32 : bool, default False + If applying compression use the fletcher32 checksum + + Examples + -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) + >>> store = HDFStore('test.h5') + >>> store['foo'] = bar # write to HDF5 + >>> bar = store['foo'] # retrieve + >>> store.close() + """ + + def __init__(self, path, mode=None, complevel=None, complib=None, + fletcher32=False, **kwargs): + try: + import tables + except ImportError: # pragma: no cover + raise ImportError('HDFStore requires PyTables') + + self._path = path + if mode is None: + mode = 'a' + self._mode = mode + self._handle = None + self._complevel = complevel + self._complib = complib + self._fletcher32 = fletcher32 + self._filters = None + self.open(mode=mode, **kwargs) + + @property + def root(self): + """ return the root node """ + self._check_if_open() + return self._handle.root + + @property + def filename(self): + return self._path + + def __getitem__(self, key): + return self.get(key) + + def __setitem__(self, key, value): + self.put(key, value) + + def __delitem__(self, key): + return self.remove(key) + + def __getattr__(self, name): + """ allow attribute access to get stores """ + self._check_if_open() + try: + return self.get(name) + except: + pass + raise AttributeError("'%s' object has no attribute '%s'" % + (type(self).__name__, name)) + + def __contains__(self, key): + """ check for existance of this key + can match the exact pathname or the pathnm w/o the leading '/' + """ + node = self.get_node(key) + if node is not None: + name = node._v_pathname + if name == key or name[1:] == key: + return True + return False + + def __len__(self): + return len(self.groups()) + + def __unicode__(self): + output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) + if self.is_open: + lkeys = list(self.keys()) + if len(lkeys): + keys = [] + values = [] + + for k in lkeys: + try: + s = self.get_storer(k) + if s is not None: + keys.append(pprint_thing(s.pathname or k)) + values.append( + pprint_thing(s or 'invalid_HDFStore node')) + except Exception as detail: + keys.append(k) + values.append("[invalid_HDFStore node: %s]" + % pprint_thing(detail)) + + output += adjoin(12, keys, values) + else: + output += 'Empty' + else: + output += "File is CLOSED" + + return output + + def keys(self): + """ + Return a (potentially unordered) list of the keys corresponding to the + objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. 
+ have the leading '/' + """ + return [n._v_pathname for n in self.groups()] + + def items(self): + """ + iterate on key->group + """ + for g in self.groups(): + yield g._v_pathname, g + + iteritems = items + + def open(self, mode='a', **kwargs): + """ + Open the file in the specified mode + + Parameters + ---------- + mode : {'a', 'w', 'r', 'r+'}, default 'a' + See HDFStore docstring or tables.openFile for info about modes + """ + tables = _tables() + + if self._mode != mode: + + # if we are changing a write mode to read, ok + if self._mode in ['a', 'w'] and mode in ['r', 'r+']: + pass + elif mode in ['w']: + + # this would truncate, raise here + if self.is_open: + raise PossibleDataLossError( + "Re-opening the file [{0}] with mode [{1}] " + "will delete the current file!" + .format(self._path, self._mode) + ) + + self._mode = mode + + # close and reopen the handle + if self.is_open: + self.close() + + if self._complib is not None: + if self._complevel is None: + self._complevel = 9 + self._filters = _tables().Filters(self._complevel, + self._complib, + fletcher32=self._fletcher32) + + try: + self._handle = tables.openFile(self._path, self._mode, **kwargs) + except (IOError) as e: # pragma: no cover + if 'can not be written' in str(e): + print('Opening %s in read-only mode' % self._path) + self._handle = tables.openFile(self._path, 'r', **kwargs) + else: + raise + + except (ValueError) as e: + + # trap PyTables >= 3.1 FILE_OPEN_POLICY exception + # to provide an updated message + if 'FILE_OPEN_POLICY' in str(e): + + e = ValueError("PyTables [{version}] no longer supports opening multiple files\n" + "even in read-only mode on this HDF5 version [{hdf_version}]. You can accept this\n" + "and not open the same file multiple times at once,\n" + "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 which allows\n" + "files to be opened multiple times at once\n".format(version=tables.__version__, + hdf_version=tables.getHDF5Version())) + + raise e + + except (Exception) as e: + + # trying to read from a non-existant file causes an error which + # is not part of IOError, make it one + if self._mode == 'r' and 'Unable to open/create file' in str(e): + raise IOError(str(e)) + raise + + def close(self): + """ + Close the PyTables file handle + """ + if self._handle is not None: + self._handle.close() + self._handle = None + + @property + def is_open(self): + """ + return a boolean indicating whether the file is open + """ + if self._handle is None: + return False + return bool(self._handle.isopen) + + def flush(self, fsync=False): + """ + Force all buffered modifications to be written to disk. + + Parameters + ---------- + fsync : bool (default False) + call ``os.fsync()`` on the file handle to force writing to disk. + + Notes + ----- + Without ``fsync=True``, flushing may not guarantee that the OS writes + to disk. With fsync, the operation will block until the OS claims the + file has been written; however, other caching layers may still + interfere. 
+ """ + if self._handle is not None: + self._handle.flush() + if fsync: + try: + os.fsync(self._handle.fileno()) + except: + pass + + def get(self, key): + """ + Retrieve pandas object stored in file + + Parameters + ---------- + key : object + + Returns + ------- + obj : type of object stored in file + """ + group = self.get_node(key) + if group is None: + raise KeyError('No object named %s in the file' % key) + return self._read_group(group) + + def select(self, key, where=None, start=None, stop=None, columns=None, + iterator=False, chunksize=None, auto_close=False, **kwargs): + """ + Retrieve pandas object stored in file, optionally based on where + criteria + + Parameters + ---------- + key : object + where : list of Term (or convertable) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + columns : a list of columns that if not None, will limit the return + columns + iterator : boolean, return an iterator, default False + chunksize : nrows to include in iteration, return an iterator + auto_close : boolean, should automatically close the store when + finished, default is False + + Returns + ------- + The selected object + + """ + group = self.get_node(key) + if group is None: + raise KeyError('No object named %s in the file' % key) + + # create the storer and axes + where = _ensure_term(where, scope_level=1) + s = self._create_storer(group) + s.infer_axes() + + # what we are actually going to do for a chunk + def func(_start, _stop): + return s.read(where=where, start=_start, stop=_stop, + columns=columns, **kwargs) + + if iterator or chunksize is not None: + if not s.is_table: + raise TypeError( + "can only use an iterator or chunksize on a table") + return TableIterator(self, func, nrows=s.nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) + + return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, + auto_close=auto_close).get_values() + + def select_as_coordinates( + self, key, where=None, start=None, stop=None, **kwargs): + """ + return the selection as an Index + + Parameters + ---------- + key : object + where : list of Term (or convertable) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + """ + where = _ensure_term(where, scope_level=1) + return self.get_storer(key).read_coordinates(where=where, start=start, + stop=stop, **kwargs) + + def select_column(self, key, column, **kwargs): + """ + return a single column from the table. 
This is generally only useful to + select an indexable + + Parameters + ---------- + key : object + column: the column of interest + + Exceptions + ---------- + raises KeyError if the column is not found (or key is not a valid + store) + raises ValueError if the column can not be extracted individually (it + is part of a data block) + + """ + return self.get_storer(key).read_column(column=column, **kwargs) + + def select_as_multiple(self, keys, where=None, selector=None, columns=None, + start=None, stop=None, iterator=False, + chunksize=None, auto_close=False, **kwargs): + """ Retrieve pandas objects from multiple tables + + Parameters + ---------- + keys : a list of the tables + selector : the table to apply the where criteria (defaults to keys[0] + if not supplied) + columns : the columns I want back + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + iterator : boolean, return an iterator, default False + chunksize : nrows to include in iteration, return an iterator + + Exceptions + ---------- + raises KeyError if keys or selector is not found or keys is empty + raises TypeError if keys is not a list or tuple + raises ValueError if the tables are not ALL THE SAME DIMENSIONS + """ + + # default to single select + where = _ensure_term(where, scope_level=1) + if isinstance(keys, (list, tuple)) and len(keys) == 1: + keys = keys[0] + if isinstance(keys, string_types): + return self.select(key=keys, where=where, columns=columns, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, **kwargs) + + if not isinstance(keys, (list, tuple)): + raise TypeError("keys must be a list/tuple") + + if not len(keys): + raise ValueError("keys must have a non-zero length") + + if selector is None: + selector = keys[0] + + # collect the tables + tbls = [self.get_storer(k) for k in keys] + s = self.get_storer(selector) + + # validate rows + nrows = None + for t, k in itertools.chain([(s,selector)], zip(tbls, keys)): + if t is None: + raise KeyError("Invalid table [%s]" % k) + if not t.is_table: + raise TypeError( + "object [%s] is not a table, and cannot be used in all " + "select as multiple" % t.pathname + ) + + if nrows is None: + nrows = t.nrows + elif t.nrows != nrows: + raise ValueError( + "all tables must have exactly the same nrows!") + + # axis is the concentation axes + axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] + + def func(_start, _stop): + if where is not None: + c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs) + else: + c = None + + objs = [t.read(where=c, start=_start, stop=_stop, + columns=columns, **kwargs) for t in tbls] + + # concat and return + return concat(objs, axis=axis, + verify_integrity=False).consolidate() + + if iterator or chunksize is not None: + return TableIterator(self, func, nrows=nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) + + return TableIterator(self, func, nrows=nrows, start=start, stop=stop, + auto_close=auto_close).get_values() + + def put(self, key, value, format=None, append=False, **kwargs): + """ + Store object in HDFStore + + Parameters + ---------- + key : object + value : {Series, DataFrame, Panel} + format : 'fixed(f)|table(t)', default is 'fixed' + fixed(f) : Fixed format + Fast writing/reading. 
Not-appendable, nor searchable + table(t) : Table format + Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching + / selecting subsets of the data + append : boolean, default False + This will force Table format, append the input data to the + existing. + encoding : default None, provide an encoding for strings + dropna : boolean, default True, do not write an ALL nan row to + the store settable by the option 'io.hdf.dropna_table' + """ + if format is None: + format = get_option("io.hdf.default_format") or 'fixed' + kwargs = self._validate_format(format, kwargs) + self._write_to_group(key, value, append=append, **kwargs) + + def remove(self, key, where=None, start=None, stop=None): + """ + Remove pandas object partially by specifying the where condition + + Parameters + ---------- + key : string + Node to remove or delete rows from + where : list of Term (or convertable) objects, optional + start : integer (defaults to None), row number to start selection + stop : integer (defaults to None), row number to stop selection + + Returns + ------- + number of rows removed (or None if not a Table) + + Exceptions + ---------- + raises KeyError if key is not a valid store + + """ + where = _ensure_term(where, scope_level=1) + try: + s = self.get_storer(key) + except: + + if where is not None: + raise ValueError( + "trying to remove a node with a non-None where clause!") + + # we are actually trying to remove a node (with children) + s = self.get_node(key) + if s is not None: + s._f_remove(recursive=True) + return None + + if s is None: + raise KeyError('No object named %s in the file' % key) + + # remove the node + if where is None and start is None and stop is None: + s.group._f_remove(recursive=True) + + # delete from the table + else: + if not s.is_table: + raise ValueError( + 'can only remove with where on objects written as tables') + return s.delete(where=where, start=start, stop=stop) + + def append(self, key, value, format=None, append=True, columns=None, + dropna=None, **kwargs): + """ + Append to Table in file. Node must already exist and be Table + format. 
+ + Parameters + ---------- + key : object + value : {Series, DataFrame, Panel, Panel4D} + format: 'table' is the default + table(t) : table format + Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching + / selecting subsets of the data + append : boolean, default True, append the input data to the + existing + data_columns : list of columns to create as data columns, or True to + use all columns + min_itemsize : dict of columns that specify minimum string sizes + nan_rep : string to use as string nan represenation + chunksize : size to chunk the writing + expectedrows : expected TOTAL row size of this table + encoding : default None, provide an encoding for strings + dropna : boolean, default True, do not write an ALL nan row to + the store settable by the option 'io.hdf.dropna_table' + Notes + ----- + Does *not* check if data being appended overlaps with existing + data in the table, so be careful + """ + if columns is not None: + raise TypeError("columns is not a supported keyword in append, " + "try data_columns") + + if dropna is None: + dropna = get_option("io.hdf.dropna_table") + if format is None: + format = get_option("io.hdf.default_format") or 'table' + kwargs = self._validate_format(format, kwargs) + self._write_to_group(key, value, append=append, dropna=dropna, + **kwargs) + + def append_to_multiple(self, d, value, selector, data_columns=None, + axes=None, dropna=True, **kwargs): + """ + Append to multiple tables + + Parameters + ---------- + d : a dict of table_name to table_columns, None is acceptable as the + values of one node (this will get all the remaining columns) + value : a pandas object + selector : a string that designates the indexable table; all of its + columns will be designed as data_columns, unless data_columns is + passed, in which case these are used + data_columns : list of columns to create as data columns, or True to + use all columns + dropna : if evaluates to True, drop rows from all tables if any single + row in each table has all NaN + + Notes + ----- + axes parameter is currently not accepted + + """ + if axes is not None: + raise TypeError("axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables independently instead") + + if not isinstance(d, dict): + raise ValueError( + "append_to_multiple must have a dictionary specified as the " + "way to split the value" + ) + + if selector not in d: + raise ValueError( + "append_to_multiple requires a selector that is in passed dict" + ) + + # figure out the splitting axis (the non_index_axis) + axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] + + # figure out how to split the value + remain_key = None + remain_values = [] + for k, v in d.items(): + if v is None: + if remain_key is not None: + raise ValueError( + "append_to_multiple can only have one value in d that " + "is None" + ) + remain_key = k + else: + remain_values.extend(v) + if remain_key is not None: + ordered = value.axes[axis] + ordd = ordered - Index(remain_values) + ordd = sorted(ordered.get_indexer(ordd)) + d[remain_key] = ordered.take(ordd) + + # data_columns + if data_columns is None: + data_columns = d[selector] + + # ensure rows are synchronized across the tables + if dropna: + idxs = (value[cols].dropna(how='all').index for cols in d.values()) + valid_index = next(idxs) + for index in idxs: + valid_index = valid_index.intersection(index) + value = value.ix[valid_index] + + # append + for k, v in d.items(): + dc 
= data_columns if k == selector else None + + # compute the val + val = value.reindex_axis(v, axis=axis) + + self.append(k, val, data_columns=dc, **kwargs) + + def create_table_index(self, key, **kwargs): + """ Create a pytables index on the table + Paramaters + ---------- + key : object (the node to index) + + Exceptions + ---------- + raises if the node is not a table + + """ + + # version requirements + _tables() + if not _table_supports_index: + raise ValueError("PyTables >= 2.3 is required for table indexing") + + s = self.get_storer(key) + if s is None: + return + + if not s.is_table: + raise TypeError( + "cannot create table index on a Fixed format store") + s.create_index(**kwargs) + + def groups(self): + """return a list of all the top-level nodes (that are not themselves a + pandas storage object) + """ + _tables() + self._check_if_open() + return [ + g for g in self._handle.walkNodes() + if (getattr(g._v_attrs, 'pandas_type', None) or + getattr(g, 'table', None) or + (isinstance(g, _table_mod.table.Table) and + g._v_name != u('table'))) + ] + + def get_node(self, key): + """ return the node with the key or None if it does not exist """ + self._check_if_open() + try: + if not key.startswith('/'): + key = '/' + key + return self._handle.getNode(self.root, key) + except: + return None + + def get_storer(self, key): + """ return the storer object for a key, raise if not in the file """ + group = self.get_node(key) + if group is None: + return None + s = self._create_storer(group) + s.infer_axes() + return s + + def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, + complevel=None, fletcher32=False, overwrite=True): + """ copy the existing store to a new file, upgrading in place + + Parameters + ---------- + propindexes: restore indexes in copied file (defaults to True) + keys : list of keys to include in the copy (defaults to all) + overwrite : overwrite (remove and replace) existing nodes in the + new store (default is True) + mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + + Returns + ------- + open file handle of the new store + + """ + new_store = HDFStore( + file, + mode=mode, + complib=complib, + complevel=complevel, + fletcher32=fletcher32) + if keys is None: + keys = list(self.keys()) + if not isinstance(keys, (tuple, list)): + keys = [keys] + for k in keys: + s = self.get_storer(k) + if s is not None: + + if k in new_store: + if overwrite: + new_store.remove(k) + + data = self.select(k) + if s.is_table: + + index = False + if propindexes: + index = [a.name for a in s.axes if a.is_indexed] + new_store.append( + k, data, index=index, + data_columns=getattr(s, 'data_columns', None), + encoding=s.encoding + ) + else: + new_store.put(k, data, encoding=s.encoding) + + return new_store + + # private methods ###### + def _check_if_open(self): + if not self.is_open: + raise ClosedFileError("{0} file is not open!".format(self._path)) + + def _validate_format(self, format, kwargs): + """ validate / deprecate formats; return the new kwargs """ + kwargs = kwargs.copy() + + # table arg + table = kwargs.pop('table', None) + + if table is not None: + warnings.warn(format_deprecate_doc, FutureWarning) + + if table: + format = 'table' + else: + format = 'fixed' + + # validate + try: + kwargs['format'] = _FORMAT_MAP[format.lower()] + except: + raise TypeError("invalid HDFStore format specified [{0}]" + .format(format)) + + return kwargs + + def _create_storer(self, group, format=None, value=None, append=False, + **kwargs): + """ return a suitable class 
to operate """ + + def error(t): + raise TypeError( + "cannot properly create the storer for: [%s] [group->%s," + "value->%s,format->%s,append->%s,kwargs->%s]" + % (t, group, type(value), format, append, kwargs) + ) + + pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) + tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) + + # infer the pt from the passed value + if pt is None: + if value is None: + + _tables() + if (getattr(group, 'table', None) or + isinstance(group, _table_mod.table.Table)): + pt = u('frame_table') + tt = u('generic_table') + else: + raise TypeError( + "cannot create a storer if the object is not existing " + "nor a value are passed") + else: + + try: + pt = _TYPE_MAP[type(value)] + except: + error('_TYPE_MAP') + + # we are actually a table + if format == 'table': + pt += u('_table') + + # a storer node + if u('table') not in pt: + try: + return globals()[_STORER_MAP[pt]](self, group, **kwargs) + except: + error('_STORER_MAP') + + # existing node (and must be a table) + if tt is None: + + # if we are a writer, determin the tt + if value is not None: + + if pt == u('series_table'): + index = getattr(value, 'index', None) + if index is not None: + if index.nlevels == 1: + tt = u('appendable_series') + elif index.nlevels > 1: + tt = u('appendable_multiseries') + elif pt == u('frame_table'): + index = getattr(value, 'index', None) + if index is not None: + if index.nlevels == 1: + tt = u('appendable_frame') + elif index.nlevels > 1: + tt = u('appendable_multiframe') + elif pt == u('wide_table'): + tt = u('appendable_panel') + elif pt == u('ndim_table'): + tt = u('appendable_ndim') + + else: + + # distiguish between a frame/table + tt = u('legacy_panel') + try: + fields = group.table._v_attrs.fields + if len(fields) == 1 and fields[0] == u('value'): + tt = u('legacy_frame') + except: + pass + + try: + return globals()[_TABLE_MAP[tt]](self, group, **kwargs) + except: + error('_TABLE_MAP') + + def _write_to_group(self, key, value, format, index=True, append=False, + complib=None, encoding=None, **kwargs): + group = self.get_node(key) + + # remove the node if we are not appending + if group is not None and not append: + self._handle.removeNode(group, recursive=True) + group = None + + # we don't want to store a table node at all if are object is 0-len + # as there are not dtypes + if getattr(value, 'empty', None) and (format == 'table' or append): + return + + if group is None: + paths = key.split('/') + + # recursively create the groups + path = '/' + for p in paths: + if not len(p): + continue + new_path = path + if not path.endswith('/'): + new_path += '/' + new_path += p + group = self.get_node(new_path) + if group is None: + group = self._handle.createGroup(path, p) + path = new_path + + s = self._create_storer(group, format, value, append=append, + encoding=encoding, **kwargs) + if append: + # raise if we are trying to append to a Fixed format, + # or a table that exists (and we are putting) + if (not s.is_table or + (s.is_table and format == 'fixed' and s.is_exists)): + raise ValueError('Can only append to Tables') + if not s.is_exists: + s.set_object_info() + else: + s.set_object_info() + + if not s.is_table and complib: + raise ValueError( + 'Compression not supported on Fixed format stores' + ) + + # write the object + s.write(obj=value, append=append, complib=complib, **kwargs) + + if s.is_table and index: + s.create_index(columns=index) + + def _read_group(self, group, **kwargs): + s = self._create_storer(group) + s.infer_axes() + 
return s.read(**kwargs) + + +class TableIterator(object): + + """ define the iteration interface on a table + + Parameters + ---------- + + store : the reference store + func : the function to get results + nrows : the rows to iterate on + start : the passed start value (default is None) + stop : the passed stop value (default is None) + chunksize : the passed chunking valeu (default is 50000) + auto_close : boolean, automatically close the store at the end of + iteration, default is False + kwargs : the passed kwargs + """ + + def __init__(self, store, func, nrows, start=None, stop=None, + chunksize=None, auto_close=False): + self.store = store + self.func = func + self.nrows = nrows or 0 + self.start = start or 0 + + if stop is None: + stop = self.nrows + self.stop = min(self.nrows, stop) + + if chunksize is None: + chunksize = 100000 + + self.chunksize = chunksize + self.auto_close = auto_close + + def __iter__(self): + current = self.start + while current < self.stop: + stop = current + self.chunksize + v = self.func(current, stop) + current = stop + + if v is None: + continue + + yield v + + self.close() + + def close(self): + if self.auto_close: + self.store.close() + + def get_values(self): + results = self.func(self.start, self.stop) + self.close() + return results + + +class IndexCol(StringMixin): + + """ an index column description class + + Parameters + ---------- + + axis : axis which I reference + values : the ndarray like converted values + kind : a string description of this type + typ : the pytables type + pos : the position in the pytables + + """ + is_an_indexable = True + is_data_indexable = True + _info_fields = ['freq', 'tz', 'index_name'] + + def __init__(self, values=None, kind=None, typ=None, cname=None, + itemsize=None, name=None, axis=None, kind_attr=None, pos=None, + freq=None, tz=None, index_name=None, **kwargs): + self.values = values + self.kind = kind + self.typ = typ + self.itemsize = itemsize + self.name = name + self.cname = cname + self.kind_attr = kind_attr + self.axis = axis + self.pos = pos + self.freq = freq + self.tz = tz + self.index_name = index_name + self.table = None + + if name is not None: + self.set_name(name, kind_attr) + if pos is not None: + self.set_pos(pos) + + def set_name(self, name, kind_attr=None): + """ set the name of this indexer """ + self.name = name + self.kind_attr = kind_attr or "%s_kind" % name + if self.cname is None: + self.cname = name + + return self + + def set_axis(self, axis): + """ set the axis over which I index """ + self.axis = axis + + return self + + def set_pos(self, pos): + """ set the position of this column in the Table """ + self.pos = pos + if pos is not None and self.typ is not None: + self.typ._v_pos = pos + return self + + def set_table(self, table): + self.table = table + return self + + def __unicode__(self): + temp = tuple( + map(pprint_thing, + (self.name, + self.cname, + self.axis, + self.pos, + self.kind))) + return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp + + def __eq__(self, other): + """ compare 2 col items """ + return all([getattr(self, a, None) == getattr(other, a, None) + for a in ['name', 'cname', 'axis', 'pos']]) + + def __ne__(self, other): + return not self.__eq__(other) + + @property + def is_indexed(self): + """ return whether I am an indexed column """ + try: + return getattr(self.table.cols, self.cname).is_indexed + except: + False + + def copy(self): + new_self = copy.copy(self) + return new_self + + def infer(self, table): + """infer this column from the table: create 
and return a new object""" + new_self = self.copy() + new_self.set_table(table) + new_self.get_attr() + return new_self + + def convert(self, values, nan_rep, encoding): + """ set the values from this selection: take = take ownership """ + try: + values = values[self.cname] + except: + pass + + values = _maybe_convert(values, self.kind, encoding) + + kwargs = dict() + if self.freq is not None: + kwargs['freq'] = _ensure_decoded(self.freq) + if self.index_name is not None: + kwargs['name'] = _ensure_decoded(self.index_name) + try: + self.values = Index(values, **kwargs) + except: + + # if the output freq is different that what we recorded, + # it should be None (see also 'doc example part 2') + if 'freq' in kwargs: + kwargs['freq'] = None + self.values = Index(values, **kwargs) + + # set the timezone if indicated + # we stored in utc, so reverse to local timezone + if self.tz is not None: + self.values = self.values.tz_localize( + 'UTC').tz_convert(_ensure_decoded(self.tz)) + + return self + + def take_data(self): + """ return the values & release the memory """ + self.values, values = None, self.values + return values + + @property + def attrs(self): + return self.table._v_attrs + + @property + def description(self): + return self.table.description + + @property + def col(self): + """ return my current col description """ + return getattr(self.description, self.cname, None) + + @property + def cvalues(self): + """ return my cython values """ + return self.values + + def __iter__(self): + return iter(self.values) + + def maybe_set_size(self, min_itemsize=None, **kwargs): + """ maybe set a string col itemsize: + min_itemsize can be an interger or a dict with this columns name + with an integer size """ + if _ensure_decoded(self.kind) == u('string'): + + if isinstance(min_itemsize, dict): + min_itemsize = min_itemsize.get(self.name) + + if min_itemsize is not None and self.typ.itemsize < min_itemsize: + self.typ = _tables( + ).StringCol(itemsize=min_itemsize, pos=self.pos) + + def validate_and_set(self, table, append, **kwargs): + self.set_table(table) + self.validate_col() + self.validate_attr(append) + self.set_attr() + + def validate_col(self, itemsize=None): + """ validate this column: return the compared against itemsize """ + + # validate this column for string truncation (or reset to the max size) + if _ensure_decoded(self.kind) == u('string'): + c = self.col + if c is not None: + if itemsize is None: + itemsize = self.itemsize + if c.itemsize < itemsize: + raise ValueError( + "Trying to store a string with len [%s] in [%s] " + "column but\nthis column has a limit of [%s]!\n" + "Consider using min_itemsize to preset the sizes on " + "these columns" % (itemsize, self.cname, c.itemsize)) + return c.itemsize + + return None + + def validate_attr(self, append): + # check for backwards incompatibility + if append: + existing_kind = getattr(self.attrs, self.kind_attr, None) + if existing_kind is not None and existing_kind != self.kind: + raise TypeError("incompatible kind in col [%s - %s]" % + (existing_kind, self.kind)) + + def update_info(self, info): + """ set/update the info for this indexable with the key/value + if there is a conflict raise/warn as needed """ + + for key in self._info_fields: + + value = getattr(self, key, None) + idx = _get_info(info, self.name) + + existing_value = idx.get(key) + if key in idx and value is not None and existing_value != value: + + # frequency/name just warn + if key in ['freq', 'index_name']: + ws = attribute_conflict_doc % (key, existing_value, 
value) + warnings.warn(ws, AttributeConflictWarning) + + # reset + idx[key] = None + setattr(self, key, None) + + else: + raise ValueError( + "invalid info for [%s] for [%s], existing_value [%s] " + "conflicts with new value [%s]" + % (self.name, key, existing_value, value)) + else: + if value is not None or existing_value is not None: + idx[key] = value + + return self + + def set_info(self, info): + """ set my state from the passed info """ + idx = info.get(self.name) + if idx is not None: + self.__dict__.update(idx) + + def get_attr(self): + """ set the kind for this colummn """ + self.kind = getattr(self.attrs, self.kind_attr, None) + + def set_attr(self): + """ set the kind for this colummn """ + setattr(self.attrs, self.kind_attr, self.kind) + + +class GenericIndexCol(IndexCol): + + """ an index which is not represented in the data of the table """ + + @property + def is_indexed(self): + return False + + def convert(self, values, nan_rep, encoding): + """ set the values from this selection: take = take ownership """ + + self.values = Int64Index(np.arange(self.table.nrows)) + return self + + def get_attr(self): + pass + + def set_attr(self): + pass + + +class DataCol(IndexCol): + + """ a data holding column, by definition this is not indexable + + Parameters + ---------- + + data : the actual data + cname : the column name in the table to hold the data (typically + values) + """ + is_an_indexable = False + is_data_indexable = False + _info_fields = ['tz'] + + @classmethod + def create_for_block( + cls, i=None, name=None, cname=None, version=None, **kwargs): + """ return a new datacol with the block i """ + + if cname is None: + cname = name or 'values_block_%d' % i + if name is None: + name = cname + + # prior to 0.10.1, we named values blocks like: values_block_0 an the + # name values_0 + try: + if version[0] == 0 and version[1] <= 10 and version[2] == 0: + m = re.search("values_block_(\d+)", name) + if m: + name = "values_%s" % m.groups()[0] + except: + pass + + return cls(name=name, cname=cname, **kwargs) + + def __init__(self, values=None, kind=None, typ=None, + cname=None, data=None, block=None, **kwargs): + super(DataCol, self).__init__( + values=values, kind=kind, typ=typ, cname=cname, **kwargs) + self.dtype = None + self.dtype_attr = u("%s_dtype" % self.name) + self.set_data(data) + + def __unicode__(self): + return "name->%s,cname->%s,dtype->%s,shape->%s" % ( + self.name, self.cname, self.dtype, self.shape + ) + + def __eq__(self, other): + """ compare 2 col items """ + return all([getattr(self, a, None) == getattr(other, a, None) + for a in ['name', 'cname', 'dtype', 'pos']]) + + def set_data(self, data, dtype=None): + self.data = data + if data is not None: + if dtype is not None: + self.dtype = dtype + self.set_kind() + elif self.dtype is None: + self.dtype = data.dtype.name + self.set_kind() + + def take_data(self): + """ return the data & release the memory """ + self.data, data = None, self.data + return data + + def set_kind(self): + # set my kind if we can + if self.dtype is not None: + dtype = _ensure_decoded(self.dtype) + if dtype.startswith(u('string')) or dtype.startswith(u('bytes')): + self.kind = 'string' + elif dtype.startswith(u('float')): + self.kind = 'float' + elif dtype.startswith(u('int')) or dtype.startswith(u('uint')): + self.kind = 'integer' + elif dtype.startswith(u('date')): + self.kind = 'datetime' + elif dtype.startswith(u('timedelta')): + self.kind = 'timedelta' + elif dtype.startswith(u('bool')): + self.kind = 'bool' + else: + raise 
AssertionError( + "cannot interpret dtype of [%s] in [%s]" % (dtype, self)) + + # set my typ if we need + if self.typ is None: + self.typ = getattr(self.description, self.cname, None) + + def set_atom(self, block, block_items, existing_col, min_itemsize, + nan_rep, info, encoding=None, **kwargs): + """ create and setup my atom from the block b """ + + self.values = list(block_items) + dtype = block.dtype.name + rvalues = block.values.ravel() + inferred_type = lib.infer_dtype(rvalues) + + if inferred_type == 'datetime64': + self.set_atom_datetime64(block) + elif dtype == 'timedelta64[ns]': + if _np_version_under1p7: + raise TypeError( + "timdelta64 is not supported under under numpy < 1.7") + self.set_atom_timedelta64(block) + elif inferred_type == 'date': + raise TypeError( + "[date] is not implemented as a table column") + elif inferred_type == 'datetime': + if getattr(rvalues[0], 'tzinfo', None) is not None: + + # if this block has more than one timezone, raise + try: + # pytz timezones: compare on zone name (to avoid issues with DST being a different zone to STD). + zones = [r.tzinfo.zone for r in rvalues] + except: + # dateutil timezones: compare on == + zones = [r.tzinfo for r in rvalues] + if any(zones[0] != zone_i for zone_i in zones[1:]): + raise TypeError( + "too many timezones in this block, create separate " + "data columns" + ) + else: + if len(set(zones)) != 1: + raise TypeError( + "too many timezones in this block, create separate " + "data columns" + ) + + # convert this column to datetime64[ns] utc, and save the tz + index = DatetimeIndex(rvalues) + tz = getattr(index, 'tz', None) + if tz is None: + raise TypeError( + "invalid timezone specification") + + values = index.tz_convert('UTC').values.view('i8') + + # store a converted timezone + zone = tslib.get_timezone(index.tz) + if zone is None: + zone = tslib.tot_seconds(index.tz.utcoffset()) + self.tz = zone + + self.update_info(info) + self.set_atom_datetime64( + block, values.reshape(block.values.shape)) + + else: + raise TypeError( + "[datetime] is not implemented as a table column") + elif inferred_type == 'unicode': + raise TypeError( + "[unicode] is not implemented as a table column") + + # this is basically a catchall; if say a datetime64 has nans then will + # end up here ### + elif inferred_type == 'string' or dtype == 'object': + self.set_atom_string( + block, block_items, + existing_col, + min_itemsize, + nan_rep, + encoding) + else: + self.set_atom_data(block) + + return self + + def get_atom_string(self, block, itemsize): + return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) + + def set_atom_string(self, block, block_items, existing_col, min_itemsize, + nan_rep, encoding): + # fill nan items with myself, don't disturb the blocks by + # trying to downcast + block = block.fillna(nan_rep, downcast=False)[0] + data = block.values + + # see if we have a valid string type + inferred_type = lib.infer_dtype(data.ravel()) + if inferred_type != 'string': + + # we cannot serialize this data, so report an exception on a column + # by column basis + for i, item in enumerate(block_items): + + col = block.iget(i) + inferred_type = lib.infer_dtype(col.ravel()) + if inferred_type != 'string': + raise TypeError( + "Cannot serialize the column [%s] because\n" + "its data contents are [%s] object dtype" + % (item, inferred_type) + ) + + # itemsize is the maximum length of a string (along any dimension) + itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + + # specified min_itemsize? 
+ if isinstance(min_itemsize, dict): + min_itemsize = int(min_itemsize.get( + self.name) or min_itemsize.get('values') or 0) + itemsize = max(min_itemsize or 0, itemsize) + + # check for column in the values conflicts + if existing_col is not None: + eci = existing_col.validate_col(itemsize) + if eci > itemsize: + itemsize = eci + + self.itemsize = itemsize + self.kind = 'string' + self.typ = self.get_atom_string(block, itemsize) + self.set_data(self.convert_string_data(data, itemsize, encoding)) + + def convert_string_data(self, data, itemsize, encoding): + return _convert_string_array(data, encoding, itemsize) + + def get_atom_coltype(self): + """ return the PyTables column class for this column """ + if self.kind.startswith('uint'): + col_name = "UInt%sCol" % self.kind[4:] + else: + col_name = "%sCol" % self.kind.capitalize() + + return getattr(_tables(), col_name) + + def get_atom_data(self, block): + return self.get_atom_coltype()(shape=block.shape[0]) + + def set_atom_data(self, block): + self.kind = block.dtype.name + self.typ = self.get_atom_data(block) + self.set_data(block.values.astype(self.typ.type)) + + def get_atom_datetime64(self, block): + return _tables().Int64Col(shape=block.shape[0]) + + def set_atom_datetime64(self, block, values=None): + self.kind = 'datetime64' + self.typ = self.get_atom_datetime64(block) + if values is None: + values = block.values.view('i8') + self.set_data(values, 'datetime64') + + def get_atom_timedelta64(self, block): + return _tables().Int64Col(shape=block.shape[0]) + + def set_atom_timedelta64(self, block, values=None): + self.kind = 'timedelta64' + self.typ = self.get_atom_timedelta64(block) + if values is None: + values = block.values.view('i8') + self.set_data(values, 'timedelta64') + + @property + def shape(self): + return getattr(self.data, 'shape', None) + + @property + def cvalues(self): + """ return my cython values """ + return self.data + + def validate_attr(self, append): + """validate that we have the same order as the existing & same dtype""" + if append: + existing_fields = getattr(self.attrs, self.kind_attr, None) + if (existing_fields is not None and + existing_fields != list(self.values)): + raise ValueError("appended items do not match existing items" + " in table!") + + existing_dtype = getattr(self.attrs, self.dtype_attr, None) + if (existing_dtype is not None and + existing_dtype != self.dtype): + raise ValueError("appended items dtype do not match existing " + "items dtype in table!") + + def convert(self, values, nan_rep, encoding): + """set the data from this selection (and convert to the correct dtype + if we can) + """ + try: + values = values[self.cname] + except: + pass + self.set_data(values) + + # convert to the correct dtype + if self.dtype is not None: + dtype = _ensure_decoded(self.dtype) + + # reverse converts + if dtype == u('datetime64'): + # recreate the timezone + if self.tz is not None: + + # data should be 2-dim here + # we stored as utc, so just set the tz + + index = DatetimeIndex( + self.data.ravel(), tz='UTC').tz_convert(self.tz) + self.data = np.array( + index.tolist(), dtype=object).reshape(self.data.shape) + + else: + self.data = np.asarray(self.data, dtype='M8[ns]') + + elif dtype == u('timedelta64'): + self.data = np.asarray(self.data, dtype='m8[ns]') + elif dtype == u('date'): + try: + self.data = np.array( + [date.fromordinal(v) for v in self.data], dtype=object) + except ValueError: + self.data = np.array( + [date.fromtimestamp(v) for v in self.data], + dtype=object) + elif dtype == 
u('datetime'): + self.data = np.array( + [datetime.fromtimestamp(v) for v in self.data], + dtype=object) + else: + + try: + self.data = self.data.astype(dtype) + except: + self.data = self.data.astype('O') + + # convert nans / decode + if _ensure_decoded(self.kind) == u('string'): + self.data = _unconvert_string_array( + self.data, nan_rep=nan_rep, encoding=encoding) + + return self + + def get_attr(self): + """ get the data for this colummn """ + self.values = getattr(self.attrs, self.kind_attr, None) + self.dtype = getattr(self.attrs, self.dtype_attr, None) + self.set_kind() + + def set_attr(self): + """ set the data for this colummn """ + setattr(self.attrs, self.kind_attr, self.values) + if self.dtype is not None: + setattr(self.attrs, self.dtype_attr, self.dtype) + + +class DataIndexableCol(DataCol): + + """ represent a data column that can be indexed """ + is_data_indexable = True + + def get_atom_string(self, block, itemsize): + return _tables().StringCol(itemsize=itemsize) + + def get_atom_data(self, block): + return self.get_atom_coltype()() + + def get_atom_datetime64(self, block): + return _tables().Int64Col() + + def get_atom_timedelta64(self, block): + return _tables().Int64Col() + + +class GenericDataIndexableCol(DataIndexableCol): + + """ represent a generic pytables data column """ + + def get_attr(self): + pass + + +class Fixed(StringMixin): + + """ represent an object in my store + facilitate read/write of various types of objects + this is an abstract base class + + Parameters + ---------- + + parent : my parent HDFStore + group : the group node where the table resides + """ + pandas_kind = None + obj_type = None + ndim = None + is_table = False + + def __init__(self, parent, group, encoding=None, **kwargs): + self.parent = parent + self.group = group + self.encoding = _ensure_encoding(encoding) + self.set_version() + + @property + def is_old_version(self): + return (self.version[0] <= 0 and self.version[1] <= 10 and + self.version[2] < 1) + + def set_version(self): + """ compute and set our version """ + version = _ensure_decoded( + getattr(self.group._v_attrs, 'pandas_version', None)) + try: + self.version = tuple([int(x) for x in version.split('.')]) + if len(self.version) == 2: + self.version = self.version + (0,) + except: + self.version = (0, 0, 0) + + @property + def pandas_type(self): + return _ensure_decoded(getattr(self.group._v_attrs, + 'pandas_type', None)) + + @property + def format_type(self): + return 'fixed' + + def __unicode__(self): + """ return a pretty representation of myself """ + self.infer_axes() + s = self.shape + if s is not None: + if isinstance(s, (list, tuple)): + s = "[%s]" % ','.join([pprint_thing(x) for x in s]) + return "%-12.12s (shape->%s)" % (self.pandas_type, s) + return self.pandas_type + + def set_object_info(self): + """ set my pandas type & version """ + self.attrs.pandas_type = str(self.pandas_kind) + self.attrs.pandas_version = str(_version) + self.set_version() + + def copy(self): + new_self = copy.copy(self) + return new_self + + @property + def storage_obj_type(self): + return self.obj_type + + @property + def shape(self): + return self.nrows + + @property + def pathname(self): + return self.group._v_pathname + + @property + def _handle(self): + return self.parent._handle + + @property + def _filters(self): + return self.parent._filters + + @property + def _complevel(self): + return self.parent._complevel + + @property + def _fletcher32(self): + return self.parent._fletcher32 + + @property + def _complib(self): + return 
self.parent._complib + + @property + def attrs(self): + return self.group._v_attrs + + def set_attrs(self): + """ set our object attributes """ + pass + + def get_attrs(self): + """ get our object attributes """ + pass + + @property + def storable(self): + """ return my storable """ + return self.group + + @property + def is_exists(self): + return False + + @property + def nrows(self): + return getattr(self.storable, 'nrows', None) + + def validate(self, other): + """ validate against an existing storable """ + if other is None: + return + return True + + def validate_version(self, where=None): + """ are we trying to operate on an old version? """ + return True + + def infer_axes(self): + """ infer the axes of my storer + return a boolean indicating if we have a valid storer or not """ + + s = self.storable + if s is None: + return False + self.get_attrs() + return True + + def read(self, **kwargs): + raise NotImplementedError( + "cannot read on an abstract storer: subclasses should implement") + + def write(self, **kwargs): + raise NotImplementedError( + "cannot write on an abstract storer: sublcasses should implement") + + def delete(self, where=None, start=None, stop=None, **kwargs): + """ support fully deleting the node in its entirety (only) - where specification must be None """ + if where is None and start is None and stop is None: + self._handle.removeNode(self.group, recursive=True) + return None + + raise TypeError("cannot delete on an abstract storer") + + +class GenericFixed(Fixed): + + """ a generified fixed version """ + _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} + _reverse_index_map = dict([(v, k) + for k, v in compat.iteritems(_index_type_map)]) + attributes = [] + + # indexer helpders + def _class_to_alias(self, cls): + return self._index_type_map.get(cls, '') + + def _alias_to_class(self, alias): + if isinstance(alias, type): # pragma: no cover + # compat: for a short period of time master stored types + return alias + return self._reverse_index_map.get(alias, Index) + + def _get_index_factory(self, klass): + if klass == DatetimeIndex: + def f(values, freq=None, tz=None): + return DatetimeIndex._simple_new(values, None, freq=freq, + tz=tz) + return f + return klass + + def validate_read(self, kwargs): + if kwargs.get('columns') is not None: + raise TypeError("cannot pass a column specification when reading " + "a Fixed format store. this store must be " + "selected in its entirety") + if kwargs.get('where') is not None: + raise TypeError("cannot pass a where specification when reading " + "from a Fixed format store. 
this store must be " + "selected in its entirety") + + @property + def is_exists(self): + return True + + def set_attrs(self): + """ set our object attributes """ + self.attrs.encoding = self.encoding + + def get_attrs(self): + """ retrieve our attributes """ + self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) + for n in self.attributes: + setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) + + def write(self, obj, **kwargs): + self.set_attrs() + + def read_array(self, key): + """ read an array for the specified node (off of group """ + import tables + node = getattr(self.group, key) + data = node[:] + attrs = node._v_attrs + + transposed = getattr(attrs, 'transposed', False) + + if isinstance(node, tables.VLArray): + ret = data[0] + else: + dtype = getattr(attrs, 'value_type', None) + shape = getattr(attrs, 'shape', None) + + if shape is not None: + # length 0 axis + ret = np.empty(shape, dtype=dtype) + else: + ret = data + + if dtype == u('datetime64'): + ret = np.array(ret, dtype='M8[ns]') + elif dtype == u('timedelta64'): + if _np_version_under1p7: + raise TypeError( + "timedelta64 is not supported under under numpy < 1.7") + ret = np.array(ret, dtype='m8[ns]') + + if transposed: + return ret.T + else: + return ret + + def read_index(self, key): + variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key)) + + if variety == u('multi'): + return self.read_multi_index(key) + elif variety == u('block'): + return self.read_block_index(key) + elif variety == u('sparseint'): + return self.read_sparse_intindex(key) + elif variety == u('regular'): + _, index = self.read_index_node(getattr(self.group, key)) + return index + else: # pragma: no cover + raise TypeError('unrecognized index variety: %s' % variety) + + def write_index(self, key, index): + if isinstance(index, MultiIndex): + setattr(self.attrs, '%s_variety' % key, 'multi') + self.write_multi_index(key, index) + elif isinstance(index, BlockIndex): + setattr(self.attrs, '%s_variety' % key, 'block') + self.write_block_index(key, index) + elif isinstance(index, IntIndex): + setattr(self.attrs, '%s_variety' % key, 'sparseint') + self.write_sparse_intindex(key, index) + else: + setattr(self.attrs, '%s_variety' % key, 'regular') + converted = _convert_index(index, self.encoding, + self.format_type).set_name('index') + self.write_array(key, converted.values) + node = getattr(self.group, key) + node._v_attrs.kind = converted.kind + node._v_attrs.name = index.name + + if isinstance(index, (DatetimeIndex, PeriodIndex)): + node._v_attrs.index_class = self._class_to_alias(type(index)) + + if hasattr(index, 'freq'): + node._v_attrs.freq = index.freq + + if hasattr(index, 'tz') and index.tz is not None: + zone = tslib.get_timezone(index.tz) + if zone is None: + zone = tslib.tot_seconds(index.tz.utcoffset()) + node._v_attrs.tz = zone + + def write_block_index(self, key, index): + self.write_array('%s_blocs' % key, index.blocs) + self.write_array('%s_blengths' % key, index.blengths) + setattr(self.attrs, '%s_length' % key, index.length) + + def read_block_index(self, key): + length = getattr(self.attrs, '%s_length' % key) + blocs = self.read_array('%s_blocs' % key) + blengths = self.read_array('%s_blengths' % key) + return BlockIndex(length, blocs, blengths) + + def write_sparse_intindex(self, key, index): + self.write_array('%s_indices' % key, index.indices) + setattr(self.attrs, '%s_length' % key, index.length) + + def read_sparse_intindex(self, key): + length = getattr(self.attrs, '%s_length' % key) + 
indices = self.read_array('%s_indices' % key) + return IntIndex(length, indices) + + def write_multi_index(self, key, index): + setattr(self.attrs, '%s_nlevels' % key, index.nlevels) + + for i, (lev, lab, name) in enumerate(zip(index.levels, + index.labels, + index.names)): + # write the level + level_key = '%s_level%d' % (key, i) + conv_level = _convert_index(lev, self.encoding, + self.format_type).set_name(level_key) + self.write_array(level_key, conv_level.values) + node = getattr(self.group, level_key) + node._v_attrs.kind = conv_level.kind + node._v_attrs.name = name + + # write the name + setattr(node._v_attrs, '%s_name%d' % (key, i), name) + + # write the labels + label_key = '%s_label%d' % (key, i) + self.write_array(label_key, lab) + + def read_multi_index(self, key): + nlevels = getattr(self.attrs, '%s_nlevels' % key) + + levels = [] + labels = [] + names = [] + for i in range(nlevels): + level_key = '%s_level%d' % (key, i) + name, lev = self.read_index_node(getattr(self.group, level_key)) + levels.append(lev) + names.append(name) + + label_key = '%s_label%d' % (key, i) + lab = self.read_array(label_key) + labels.append(lab) + + return MultiIndex(levels=levels, labels=labels, names=names, + verify_integrity=True) + + def read_index_node(self, node): + data = node[:] + # If the index was an empty array write_array_empty() will + # have written a sentinel. Here we relace it with the original. + if ('shape' in node._v_attrs and + self._is_empty_array(getattr(node._v_attrs, 'shape'))): + data = np.empty(getattr(node._v_attrs, 'shape'), + dtype=getattr(node._v_attrs, 'value_type')) + kind = _ensure_decoded(node._v_attrs.kind) + name = None + + if 'name' in node._v_attrs: + name = node._v_attrs.name + + index_class = self._alias_to_class(getattr(node._v_attrs, + 'index_class', '')) + factory = self._get_index_factory(index_class) + + kwargs = {} + if u('freq') in node._v_attrs: + kwargs['freq'] = node._v_attrs['freq'] + + if u('tz') in node._v_attrs: + kwargs['tz'] = node._v_attrs['tz'] + + if kind in (u('date'), u('datetime')): + index = factory( + _unconvert_index(data, kind, encoding=self.encoding), + dtype=object, **kwargs) + else: + index = factory( + _unconvert_index(data, kind, encoding=self.encoding), **kwargs) + + index.name = name + + return name, index + + def write_array_empty(self, key, value): + """ write a 0-len array """ + + # ugly hack for length 0 axes + arr = np.empty((1,) * value.ndim) + self._handle.createArray(self.group, key, arr) + getattr(self.group, key)._v_attrs.value_type = str(value.dtype) + getattr(self.group, key)._v_attrs.shape = value.shape + + def _is_empty_array(self, shape): + """Returns true if any axis is zero length.""" + return any(x == 0 for x in shape) + + def write_array(self, key, value, items=None): + if key in self.group: + self._handle.removeNode(self.group, key) + + # Transform needed to interface with pytables row/col notation + empty_array = self._is_empty_array(value.shape) + transposed = False + + if not empty_array: + value = value.T + transposed = True + + if self._filters is not None: + atom = None + try: + # get the atom for this datatype + atom = _tables().Atom.from_dtype(value.dtype) + except ValueError: + pass + + if atom is not None: + # create an empty chunked array and fill it from value + if not empty_array: + ca = self._handle.createCArray(self.group, key, atom, + value.shape, + filters=self._filters) + ca[:] = value + getattr(self.group, key)._v_attrs.transposed = transposed + + else: + self.write_array_empty(key, value) 
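For context, the Fixed storers written by this class hierarchy are read back only in their entirety, which is what validate_read above enforces; only Table-format stores support where/columns selection. A minimal sketch of the user-visible difference, with illustrative names:

    import pandas as pd

    df = pd.DataFrame({'A': range(5), 'B': list('abcde')})
    with pd.HDFStore('fixed_demo.h5') as store:
        store.put('df_fixed', df)                    # default: Fixed format
        store.put('df_table', df, format='table')    # queryable Table format
        whole = store['df_fixed']                    # fixed stores read back whole
        subset = store.select('df_table', where='index < 3')
        # store.select('df_fixed', where='index < 3') raises TypeError,
        # as enforced by validate_read() above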
+ + return + + if value.dtype.type == np.object_: + + # infer the type, warn if we have a non-string type here (for + # performance) + inferred_type = lib.infer_dtype(value.ravel()) + if empty_array: + pass + elif inferred_type == 'string': + pass + else: + try: + items = list(items) + except: + pass + ws = performance_doc % (inferred_type, key, items) + warnings.warn(ws, PerformanceWarning) + + vlarr = self._handle.createVLArray(self.group, key, + _tables().ObjectAtom()) + vlarr.append(value) + else: + if empty_array: + self.write_array_empty(key, value) + else: + if value.dtype.type == np.datetime64: + self._handle.createArray(self.group, key, value.view('i8')) + getattr( + self.group, key)._v_attrs.value_type = 'datetime64' + elif value.dtype.type == np.timedelta64: + self._handle.createArray(self.group, key, value.view('i8')) + getattr( + self.group, key)._v_attrs.value_type = 'timedelta64' + else: + self._handle.createArray(self.group, key, value) + + getattr(self.group, key)._v_attrs.transposed = transposed + + +class LegacyFixed(GenericFixed): + + def read_index_legacy(self, key): + node = getattr(self.group, key) + data = node[:] + kind = node._v_attrs.kind + return _unconvert_index_legacy(data, kind, encoding=self.encoding) + + +class LegacySeriesFixed(LegacyFixed): + + def read(self, **kwargs): + self.validate_read(kwargs) + index = self.read_index_legacy('index') + values = self.read_array('values') + return Series(values, index=index) + + +class LegacyFrameFixed(LegacyFixed): + + def read(self, **kwargs): + self.validate_read(kwargs) + index = self.read_index_legacy('index') + columns = self.read_index_legacy('columns') + values = self.read_array('values') + return DataFrame(values, index=index, columns=columns) + + +class SeriesFixed(GenericFixed): + pandas_kind = u('series') + attributes = ['name'] + + @property + def shape(self): + try: + return len(getattr(self.group, 'values')), + except: + return None + + def read(self, **kwargs): + self.validate_read(kwargs) + index = self.read_index('index') + values = self.read_array('values') + return Series(values, index=index, name=self.name) + + def write(self, obj, **kwargs): + super(SeriesFixed, self).write(obj, **kwargs) + self.write_index('index', obj.index) + self.write_array('values', obj.values) + self.attrs.name = obj.name + + +class SparseSeriesFixed(GenericFixed): + pandas_kind = u('sparse_series') + attributes = ['name', 'fill_value', 'kind'] + + def read(self, **kwargs): + self.validate_read(kwargs) + index = self.read_index('index') + sp_values = self.read_array('sp_values') + sp_index = self.read_index('sp_index') + return SparseSeries(sp_values, index=index, sparse_index=sp_index, + kind=self.kind or u('block'), + fill_value=self.fill_value, + name=self.name) + + def write(self, obj, **kwargs): + super(SparseSeriesFixed, self).write(obj, **kwargs) + self.write_index('index', obj.index) + self.write_index('sp_index', obj.sp_index) + self.write_array('sp_values', obj.sp_values) + self.attrs.name = obj.name + self.attrs.fill_value = obj.fill_value + self.attrs.kind = obj.kind + + +class SparseFrameFixed(GenericFixed): + pandas_kind = u('sparse_frame') + attributes = ['default_kind', 'default_fill_value'] + + def read(self, **kwargs): + self.validate_read(kwargs) + columns = self.read_index('columns') + sdict = {} + for c in columns: + key = 'sparse_series_%s' % c + s = SparseSeriesFixed(self.parent, getattr(self.group, key)) + s.infer_axes() + sdict[c] = s.read() + return SparseDataFrame(sdict, columns=columns, + 
default_kind=self.default_kind, + default_fill_value=self.default_fill_value) + + def write(self, obj, **kwargs): + """ write it as a collection of individual sparse series """ + super(SparseFrameFixed, self).write(obj, **kwargs) + for name, ss in compat.iteritems(obj): + key = 'sparse_series_%s' % name + if key not in self.group._v_children: + node = self._handle.createGroup(self.group, key) + else: + node = getattr(self.group, key) + s = SparseSeriesFixed(self.parent, node) + s.write(ss) + self.attrs.default_fill_value = obj.default_fill_value + self.attrs.default_kind = obj.default_kind + self.write_index('columns', obj.columns) + + +class SparsePanelFixed(GenericFixed): + pandas_kind = u('sparse_panel') + attributes = ['default_kind', 'default_fill_value'] + + def read(self, **kwargs): + self.validate_read(kwargs) + items = self.read_index('items') + + sdict = {} + for name in items: + key = 'sparse_frame_%s' % name + s = SparseFrameFixed(self.parent, getattr(self.group, key)) + s.infer_axes() + sdict[name] = s.read() + return SparsePanel(sdict, items=items, default_kind=self.default_kind, + default_fill_value=self.default_fill_value) + + def write(self, obj, **kwargs): + super(SparsePanelFixed, self).write(obj, **kwargs) + self.attrs.default_fill_value = obj.default_fill_value + self.attrs.default_kind = obj.default_kind + self.write_index('items', obj.items) + + for name, sdf in compat.iteritems(obj): + key = 'sparse_frame_%s' % name + if key not in self.group._v_children: + node = self._handle.createGroup(self.group, key) + else: + node = getattr(self.group, key) + s = SparseFrameFixed(self.parent, node) + s.write(sdf) + + +class BlockManagerFixed(GenericFixed): + attributes = ['ndim', 'nblocks'] + is_shape_reversed = False + + @property + def shape(self): + try: + ndim = self.ndim + + # items + items = 0 + for i in range(self.nblocks): + node = getattr(self.group, 'block%d_items' % i) + shape = getattr(node, 'shape', None) + if shape is not None: + items += shape[0] + + # data shape + node = getattr(self.group, 'block0_values') + shape = getattr(node, 'shape', None) + if shape is not None: + shape = list(shape[0:(ndim - 1)]) + else: + shape = [] + + shape.append(items) + + # hacky - this works for frames, but is reversed for panels + if self.is_shape_reversed: + shape = shape[::-1] + + return shape + except: + return None + + def read(self, **kwargs): + self.validate_read(kwargs) + + axes = [] + for i in range(self.ndim): + ax = self.read_index('axis%d' % i) + axes.append(ax) + + items = axes[0] + blocks = [] + for i in range(self.nblocks): + blk_items = self.read_index('block%d_items' % i) + values = self.read_array('block%d_values' % i) + blk = make_block(values, + placement=items.get_indexer(blk_items)) + blocks.append(blk) + + return self.obj_type(BlockManager(blocks, axes)) + + def write(self, obj, **kwargs): + super(BlockManagerFixed, self).write(obj, **kwargs) + data = obj._data + if not data.is_consolidated(): + data = data.consolidate() + + self.attrs.ndim = data.ndim + for i, ax in enumerate(data.axes): + self.write_index('axis%d' % i, ax) + + # Supporting mixed-type DataFrame objects...nontrivial + self.attrs.nblocks = len(data.blocks) + for i, blk in enumerate(data.blocks): + # I have no idea why, but writing values before items fixed #2299 + blk_items = data.items.take(blk.mgr_locs) + self.write_array('block%d_values' % i, blk.values, items=blk_items) + self.write_index('block%d_items' % i, blk_items) + + +class FrameFixed(BlockManagerFixed): + pandas_kind = u('frame') 
+ obj_type = DataFrame + + +class PanelFixed(BlockManagerFixed): + pandas_kind = u('wide') + obj_type = Panel + is_shape_reversed = True + + def write(self, obj, **kwargs): + obj._consolidate_inplace() + return super(PanelFixed, self).write(obj, **kwargs) + + +class Table(Fixed): + + """ represent a table: + facilitate read/write of various types of tables + + Attrs in Table Node + ------------------- + These are attributes that are store in the main table node, they are + necessary to recreate these tables when read back in. + + index_axes : a list of tuples of the (original indexing axis and + index column) + non_index_axes: a list of tuples of the (original index axis and + columns on a non-indexing axis) + values_axes : a list of the columns which comprise the data of this + table + data_columns : a list of the columns that we are allowing indexing + (these become single columns in values_axes), or True to force all + columns + nan_rep : the string to use for nan representations for string + objects + levels : the names of levels + + """ + pandas_kind = u('wide_table') + table_type = None + levels = 1 + is_table = True + is_shape_reversed = False + + def __init__(self, *args, **kwargs): + super(Table, self).__init__(*args, **kwargs) + self.index_axes = [] + self.non_index_axes = [] + self.values_axes = [] + self.data_columns = [] + self.info = dict() + self.nan_rep = None + self.selection = None + + @property + def table_type_short(self): + return self.table_type.split('_')[0] + + @property + def format_type(self): + return 'table' + + def __unicode__(self): + """ return a pretty representatgion of myself """ + self.infer_axes() + dc = ",dc->[%s]" % ','.join( + self.data_columns) if len(self.data_columns) else '' + + ver = '' + if self.is_old_version: + ver = "[%s]" % '.'.join([str(x) for x in self.version]) + + return "%-12.12s%s (typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % ( + self.pandas_type, ver, self.table_type_short, self.nrows, + self.ncols, ','.join([a.name for a in self.index_axes]), dc + ) + + def __getitem__(self, c): + """ return the axis for c """ + for a in self.axes: + if c == a.name: + return a + return None + + def validate(self, other): + """ validate against an existing table """ + if other is None: + return + + if other.table_type != self.table_type: + raise TypeError("incompatible table_type with existing [%s - %s]" % + (other.table_type, self.table_type)) + + for c in ['index_axes', 'non_index_axes', 'values_axes']: + sv = getattr(self, c, None) + ov = getattr(other, c, None) + if sv != ov: + + # show the error for the specific axes + for i, sax in enumerate(sv): + oax = ov[i] + if sax != oax: + raise ValueError( + "invalid combinate of [%s] on appending data [%s] " + "vs current table [%s]" % (c, sax, oax)) + + # should never get here + raise Exception( + "invalid combinate of [%s] on appending data [%s] vs " + "current table [%s]" % (c, sv, ov)) + + @property + def is_multi_index(self): + """the levels attribute is 1 or a list in the case of a multi-index""" + return isinstance(self.levels, list) + + def validate_multiindex(self, obj): + """validate that we can store the multi-index; reset and return the + new object + """ + levels = [l if l is not None else "level_{0}".format(i) + for i, l in enumerate(obj.index.names)] + try: + return obj.reset_index(), levels + except ValueError: + raise ValueError("duplicate names/columns in the multi-index when " + "storing as a table") + + @property + def nrows_expected(self): + """ based on our axes, compute the 
expected nrows """ + return np.prod([i.cvalues.shape[0] for i in self.index_axes]) + + @property + def is_exists(self): + """ has this table been created """ + return u('table') in self.group + + @property + def storable(self): + return getattr(self.group, 'table', None) + + @property + def table(self): + """ return the table group (this is my storable) """ + return self.storable + + @property + def dtype(self): + return self.table.dtype + + @property + def description(self): + return self.table.description + + @property + def axes(self): + return itertools.chain(self.index_axes, self.values_axes) + + @property + def ncols(self): + """ the number of total columns in the values axes """ + return sum([len(a.values) for a in self.values_axes]) + + @property + def is_transposed(self): + return False + + @property + def data_orientation(self): + """return a tuple of my permutated axes, non_indexable at the front""" + return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes], + [int(a.axis) for a in self.index_axes])) + + def queryables(self): + """ return a dict of the kinds allowable columns for this object """ + + # compute the values_axes queryables + return dict( + [(a.cname, a.kind) for a in self.index_axes] + + [(self.storage_obj_type._AXIS_NAMES[axis], None) + for axis, values in self.non_index_axes] + + [(v.cname, v.kind) for v in self.values_axes + if v.name in set(self.data_columns)] + ) + + def index_cols(self): + """ return a list of my index cols """ + return [(i.axis, i.cname) for i in self.index_axes] + + def values_cols(self): + """ return a list of my values cols """ + return [i.cname for i in self.values_axes] + + def set_info(self): + """ update our table index info """ + self.attrs.info = self.info + + def set_attrs(self): + """ set our table type & indexables """ + self.attrs.table_type = str(self.table_type) + self.attrs.index_cols = self.index_cols() + self.attrs.values_cols = self.values_cols() + self.attrs.non_index_axes = self.non_index_axes + self.attrs.data_columns = self.data_columns + self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding + self.attrs.levels = self.levels + self.set_info() + + def get_attrs(self): + """ retrieve our attributes """ + self.non_index_axes = getattr( + self.attrs, 'non_index_axes', None) or [] + self.data_columns = getattr( + self.attrs, 'data_columns', None) or [] + self.info = getattr( + self.attrs, 'info', None) or dict() + self.nan_rep = getattr(self.attrs, 'nan_rep', None) + self.encoding = _ensure_encoding( + getattr(self.attrs, 'encoding', None)) + self.levels = getattr( + self.attrs, 'levels', None) or [] + t = self.table + self.index_axes = [ + a.infer(t) for a in self.indexables if a.is_an_indexable + ] + self.values_axes = [ + a.infer(t) for a in self.indexables if not a.is_an_indexable + ] + + def validate_version(self, where=None): + """ are we trying to operate on an old version? 
""" + if where is not None: + if (self.version[0] <= 0 and self.version[1] <= 10 and + self.version[2] < 1): + ws = incompatibility_doc % '.'.join( + [str(x) for x in self.version]) + warnings.warn(ws, IncompatibilityWarning) + + def validate_min_itemsize(self, min_itemsize): + """validate the min_itemisze doesn't contain items that are not in the + axes this needs data_columns to be defined + """ + if min_itemsize is None: + return + if not isinstance(min_itemsize, dict): + return + + q = self.queryables() + for k, v in min_itemsize.items(): + + # ok, apply generally + if k == 'values': + continue + if k not in q: + raise ValueError( + "min_itemsize has the key [%s] which is not an axis or " + "data_column" % k) + + @property + def indexables(self): + """ create/cache the indexables if they don't exist """ + if self._indexables is None: + + self._indexables = [] + + # index columns + self._indexables.extend([ + IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols) + ]) + + # values columns + dc = set(self.data_columns) + base_pos = len(self._indexables) + + def f(i, c): + klass = DataCol + if c in dc: + klass = DataIndexableCol + return klass.create_for_block(i=i, name=c, pos=base_pos + i, + version=self.version) + + self._indexables.extend( + [f(i, c) for i, c in enumerate(self.attrs.values_cols)]) + + return self._indexables + + def create_index(self, columns=None, optlevel=None, kind=None): + """ + Create a pytables index on the specified columns + note: cannot index Time64Col() currently; PyTables must be >= 2.3 + + + Paramaters + ---------- + columns : False (don't create an index), True (create all columns + index), None or list_like (the indexers to index) + optlevel: optimization level (defaults to 6) + kind : kind of index (defaults to 'medium') + + Exceptions + ---------- + raises if the node is not a table + + """ + + if not self.infer_axes(): + return + if columns is False: + return + + # index all indexables and data_columns + if columns is None or columns is True: + columns = [a.cname for a in self.axes if a.is_data_indexable] + if not isinstance(columns, (tuple, list)): + columns = [columns] + + kw = dict() + if optlevel is not None: + kw['optlevel'] = optlevel + if kind is not None: + kw['kind'] = kind + + table = self.table + for c in columns: + v = getattr(table.cols, c, None) + if v is not None: + + # remove the index if the kind/optlevel have changed + if v.is_indexed: + index = v.index + cur_optlevel = index.optlevel + cur_kind = index.kind + + if kind is not None and cur_kind != kind: + v.removeIndex() + else: + kw['kind'] = cur_kind + + if optlevel is not None and cur_optlevel != optlevel: + v.removeIndex() + else: + kw['optlevel'] = cur_optlevel + + # create the index + if not v.is_indexed: + v.createIndex(**kw) + + def read_axes(self, where, **kwargs): + """create and return the axes sniffed from the table: return boolean + for success + """ + + # validate the version + self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): + return False + + # create the selection + self.selection = Selection(self, where=where, **kwargs) + values = self.selection.select() + + # convert the data + for a in self.axes: + a.set_info(self.info) + a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding) + + return True + + def get_object(self, obj): + """ return the data for this obj """ + return obj + + def validate_data_columns(self, data_columns, min_itemsize): + """take the input data_columns and 
min_itemize and create a data + columns spec + """ + + if not len(self.non_index_axes): + return [] + + axis, axis_labels = self.non_index_axes[0] + info = self.info.get(axis, dict()) + if info.get('type') == 'MultiIndex' and data_columns: + raise ValueError("cannot use a multi-index on axis [{0}] with " + "data_columns {1}".format(axis, data_columns)) + + # evaluate the passed data_columns, True == use all columns + # take only valide axis labels + if data_columns is True: + data_columns = axis_labels + elif data_columns is None: + data_columns = [] + + # if min_itemsize is a dict, add the keys (exclude 'values') + if isinstance(min_itemsize, dict): + + existing_data_columns = set(data_columns) + data_columns.extend([ + k for k in min_itemsize.keys() + if k != 'values' and k not in existing_data_columns + ]) + + # return valid columns in the order of our axis + return [c for c in data_columns if c in axis_labels] + + def create_axes(self, axes, obj, validate=True, nan_rep=None, + data_columns=None, min_itemsize=None, **kwargs): + """ create and return the axes + leagcy tables create an indexable column, indexable index, + non-indexable fields + + Parameters: + ----------- + axes: a list of the axes in order to create (names or numbers of + the axes) + obj : the object to create axes on + validate: validate the obj against an existing object already + written + min_itemsize: a dict of the min size for a column in bytes + nan_rep : a values to use for string column nan_rep + encoding : the encoding for string values + data_columns : a list of columns that we want to create separate to + allow indexing (or True will force all columns) + + """ + + # set the default axes if needed + if axes is None: + try: + axes = _AXES_MAP[type(obj)] + except: + raise TypeError("cannot properly create the storer for: " + "[group->%s,value->%s]" + % (self.group._v_name, type(obj))) + + # map axes to numbers + axes = [obj._get_axis_number(a) for a in axes] + + # do we have an existing table (if so, use its axes & data_columns) + if self.infer_axes(): + existing_table = self.copy() + existing_table.infer_axes() + axes = [a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep + self.encoding = existing_table.encoding + self.info = copy.copy(existing_table.info) + else: + existing_table = None + + # currently support on ndim-1 axes + if len(axes) != self.ndim - 1: + raise ValueError( + "currently only support ndim-1 indexers in an AppendableTable") + + # create according to the new data + self.non_index_axes = [] + self.data_columns = [] + + # nan_representation + if nan_rep is None: + nan_rep = 'nan' + + self.nan_rep = nan_rep + + # create axes to index and non_index + index_axes_map = dict() + for i, a in enumerate(obj.axes): + + if i in axes: + name = obj._AXIS_NAMES[i] + index_axes_map[i] = _convert_index( + a, self.encoding, self.format_type + ).set_name(name).set_axis(i) + else: + + # we might be able to change the axes on the appending data if + # necessary + append_axis = list(a) + if existing_table is not None: + indexer = len(self.non_index_axes) + exist_axis = existing_table.non_index_axes[indexer][1] + if append_axis != exist_axis: + + # ahah! 
-> reindex + if sorted(append_axis) == sorted(exist_axis): + append_axis = exist_axis + + # the non_index_axes info + info = _get_info(self.info, i) + info['names'] = list(a.names) + info['type'] = a.__class__.__name__ + + self.non_index_axes.append((i, append_axis)) + + # set axis positions (based on the axes) + self.index_axes = [ + index_axes_map[a].set_pos(j).update_info(self.info) + for j, a in enumerate(axes) + ] + j = len(self.index_axes) + + # check for column conflicts + if validate: + for a in self.axes: + a.maybe_set_size(min_itemsize=min_itemsize) + + # reindex by our non_index_axes & compute data_columns + for a in self.non_index_axes: + obj = _reindex_axis(obj, a[0], a[1]) + + def get_blk_items(mgr, blocks): + return [mgr.items.take(blk.mgr_locs) for blk in blocks] + + # figure out data_columns and get out blocks + block_obj = self.get_object(obj).consolidate() + blocks = block_obj._data.blocks + blk_items = get_blk_items(block_obj._data, blocks) + if len(self.non_index_axes): + axis, axis_labels = self.non_index_axes[0] + data_columns = self.validate_data_columns( + data_columns, min_itemsize) + if len(data_columns): + mgr = block_obj.reindex_axis( + Index(axis_labels) - Index(data_columns), + axis=axis + )._data + + blocks = list(mgr.blocks) + blk_items = get_blk_items(mgr, blocks) + for c in data_columns: + mgr = block_obj.reindex_axis([c], axis=axis)._data + blocks.extend(mgr.blocks) + blk_items.extend(get_blk_items(mgr, mgr.blocks)) + + # reorder the blocks in the same order as the existing_table if we can + if existing_table is not None: + by_items = dict([(tuple(b_items.tolist()), (b, b_items)) + for b, b_items in zip(blocks, blk_items)]) + new_blocks = [] + new_blk_items = [] + for ea in existing_table.values_axes: + items = tuple(ea.values) + try: + b, b_items = by_items.pop(items) + new_blocks.append(b) + new_blk_items.append(b_items) + except: + raise ValueError( + "cannot match existing table structure for [%s] on " + "appending data" % ','.join(com.pprint_thing(item) for + item in items)) + blocks = new_blocks + blk_items = new_blk_items + + # add my values + self.values_axes = [] + for i, (b, b_items) in enumerate(zip(blocks, blk_items)): + + # shape of the data column are the indexable axes + klass = DataCol + name = None + + # we have a data_column + if (data_columns and len(b_items) == 1 and + b_items[0] in data_columns): + klass = DataIndexableCol + name = b_items[0] + self.data_columns.append(name) + + # make sure that we match up the existing columns + # if we have an existing table + if existing_table is not None and validate: + try: + existing_col = existing_table.values_axes[i] + except: + raise ValueError("Incompatible appended table [%s] with " + "existing table [%s]" + % (blocks, existing_table.values_axes)) + else: + existing_col = None + + try: + col = klass.create_for_block( + i=i, name=name, version=self.version) + col.set_atom(block=b, block_items=b_items, + existing_col=existing_col, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + encoding=self.encoding, + info=self.info, + **kwargs) + col.set_pos(j) + + self.values_axes.append(col) + except (NotImplementedError, ValueError, TypeError) as e: + raise e + except Exception as detail: + raise Exception( + "cannot find the correct atom type -> " + "[dtype->%s,items->%s] %s" + % (b.dtype.name, b_items, str(detail)) + ) + j += 1 + + # validate our min_itemsize + self.validate_min_itemsize(min_itemsize) + + # validate the axes if we have an existing table + if validate: + 
self.validate(existing_table) + + def process_axes(self, obj, columns=None): + """ process axes filters """ + + # make sure to include levels if we have them + if columns is not None and self.is_multi_index: + for n in self.levels: + if n not in columns: + columns.insert(0, n) + + # reorder by any non_index_axes & limit to the select columns + for axis, labels in self.non_index_axes: + obj = _reindex_axis(obj, axis, labels, columns) + + # apply the selection filters (but keep in the same order) + if self.selection.filter is not None: + for field, op, filt in self.selection.filter.format(): + + def process_filter(field, filt): + + for axis_name in obj._AXIS_NAMES.values(): + axis_number = obj._get_axis_number(axis_name) + axis_values = obj._get_axis(axis_name) + + # see if the field is the name of an axis + if field == axis_name: + + # if we have a multi-index, then need to include + # the levels + if self.is_multi_index: + filt = filt + Index(self.levels) + + takers = op(axis_values, filt) + return obj.ix._getitem_axis(takers, + axis=axis_number) + + # this might be the name of a file IN an axis + elif field in axis_values: + + # we need to filter on this dimension + values = _ensure_index(getattr(obj, field).values) + filt = _ensure_index(filt) + + # hack until we support reversed dim flags + if isinstance(obj, DataFrame): + axis_number = 1 - axis_number + takers = op(values, filt) + return obj.ix._getitem_axis(takers, + axis=axis_number) + + raise ValueError( + "cannot find the field [%s] for filtering!" % field) + + obj = process_filter(field, filt) + + return obj + + def create_description(self, complib=None, complevel=None, + fletcher32=False, expectedrows=None): + """ create the description of the table from the axes & values """ + + # expected rows estimate + if expectedrows is None: + expectedrows = max(self.nrows_expected, 10000) + d = dict(name='table', expectedrows=expectedrows) + + # description from the axes & values + d['description'] = dict([(a.cname, a.typ) for a in self.axes]) + + if complib: + if complevel is None: + complevel = self._complevel or 9 + filters = _tables().Filters( + complevel=complevel, complib=complib, + fletcher32=fletcher32 or self._fletcher32) + d['filters'] = filters + elif self._filters is not None: + d['filters'] = self._filters + + return d + + def read_coordinates(self, where=None, start=None, stop=None, **kwargs): + """select coordinates (row numbers) from a table; return the + coordinates object + """ + + # validate the version + self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): + return False + + # create the selection + self.selection = Selection( + self, where=where, start=start, stop=stop, **kwargs) + coords = self.selection.select_coords() + if self.selection.filter is not None: + for field, op, filt in self.selection.filter.format(): + data = self.read_column(field, start=coords.min(), stop=coords.max()+1) + coords = coords[op(data.iloc[coords-coords.min()], filt).values] + + return Index(coords) + + def read_column(self, column, where=None, start=None, stop=None, **kwargs): + """return a single column from the table, generally only indexables + are interesting + """ + + # validate the version + self.validate_version() + + # infer the data kind + if not self.infer_axes(): + return False + + if where is not None: + raise TypeError("read_column does not currently accept a where " + "clause") + + # find the axes + for a in self.axes: + if column == a.name: + + if not a.is_data_indexable: + raise ValueError( + 
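read_coordinates and read_column above back the public select_as_coordinates and select_column methods, and the returned coordinates can be fed back into a later select. A minimal sketch with illustrative names:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(10), 'B': np.random.randn(10)})
    with pd.HDFStore('coords_demo.h5') as store:
        store.append('df', df, data_columns=['A'])
        coords = store.select_as_coordinates('df', where='A > 5')  # row numbers
        col = store.select_column('df', 'A')                       # a single column
        subset = store.select('df', where=coords)                  # reuse coordinates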
"column [%s] can not be extracted individually; it is " + "not data indexable" % column) + + # column must be an indexable or a data column + c = getattr(self.table.cols, column) + a.set_info(self.info) + return Series(a.convert(c[start:stop], nan_rep=self.nan_rep, + encoding=self.encoding).take_data()) + + raise KeyError("column [%s] not found in the table" % column) + + +class WORMTable(Table): + + """ a write-once read-many table: this format DOES NOT ALLOW appending to a + table. writing is a one-time operation the data are stored in a format + that allows for searching the data on disk + """ + table_type = u('worm') + + def read(self, **kwargs): + """ read the indicies and the indexing array, calculate offset rows and + return """ + raise NotImplementedError("WORMTable needs to implement read") + + def write(self, **kwargs): + """ write in a format that we can search later on (but cannot append + to): write out the indicies and the values using _write_array + (e.g. a CArray) create an indexing table so that we can search + """ + raise NotImplementedError("WORKTable needs to implement write") + + +class LegacyTable(Table): + + """ an appendable table: allow append/query/delete operations to a + (possibily) already existing appendable table this table ALLOWS + append (but doesn't require them), and stores the data in a format + that can be easily searched + + """ + _indexables = [ + IndexCol(name='index', axis=1, pos=0), + IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'), + DataCol(name='fields', cname='values', kind_attr='fields', pos=2) + ] + table_type = u('legacy') + ndim = 3 + + def write(self, **kwargs): + raise TypeError("write operations are not allowed on legacy tables!") + + def read(self, where=None, columns=None, **kwargs): + """we have n indexable columns, with an arbitrary number of data + axes + """ + + if not self.read_axes(where=where, **kwargs): + return None + + factors = [Categorical.from_array(a.values) for a in self.index_axes] + levels = [f.levels for f in factors] + N = [len(f.levels) for f in factors] + labels = [f.labels for f in factors] + + # compute the key + key = factor_indexer(N[1:], labels) + + objs = [] + if len(unique(key)) == len(key): + + sorter, _ = algos.groupsort_indexer( + com._ensure_int64(key), np.prod(N)) + sorter = com._ensure_platform_int(sorter) + + # create the objs + for c in self.values_axes: + + # the data need to be sorted + sorted_values = c.take_data().take(sorter, axis=0) + if sorted_values.ndim == 1: + sorted_values = sorted_values.reshape(sorted_values.shape[0],1) + + take_labels = [l.take(sorter) for l in labels] + items = Index(c.values) + block = block2d_to_blocknd( + values=sorted_values, placement=np.arange(len(items)), + shape=tuple(N), labels=take_labels, ref_items=items) + + # create the object + mgr = BlockManager([block], [items] + levels) + obj = self.obj_type(mgr) + + # permute if needed + if self.is_transposed: + obj = obj.transpose( + *tuple(Series(self.data_orientation).argsort())) + + objs.append(obj) + + else: + warnings.warn(duplicate_doc, DuplicateWarning) + + # reconstruct + long_index = MultiIndex.from_arrays( + [i.values for i in self.index_axes]) + + for c in self.values_axes: + lp = DataFrame(c.data, index=long_index, columns=c.values) + + # need a better algorithm + tuple_index = long_index._tuple_index + + unique_tuples = lib.fast_unique(tuple_index) + unique_tuples = _asarray_tuplesafe(unique_tuples) + + indexer = match(unique_tuples, tuple_index) + indexer = 
com._ensure_platform_int(indexer) + + new_index = long_index.take(indexer) + new_values = lp.values.take(indexer, axis=0) + + lp = DataFrame(new_values, index=new_index, columns=lp.columns) + objs.append(lp.to_panel()) + + # create the composite object + if len(objs) == 1: + wp = objs[0] + else: + wp = concat(objs, axis=0, verify_integrity=False).consolidate() + + # apply the selection filters & axis orderings + wp = self.process_axes(wp, columns=columns) + + return wp + + +class LegacyFrameTable(LegacyTable): + + """ support the legacy frame table """ + pandas_kind = u('frame_table') + table_type = u('legacy_frame') + obj_type = Panel + + def read(self, *args, **kwargs): + return super(LegacyFrameTable, self).read(*args, **kwargs)['value'] + + +class LegacyPanelTable(LegacyTable): + + """ support the legacy panel table """ + table_type = u('legacy_panel') + obj_type = Panel + + +class AppendableTable(LegacyTable): + + """ suppor the new appendable table formats """ + _indexables = None + table_type = u('appendable') + + def write(self, obj, axes=None, append=False, complib=None, + complevel=None, fletcher32=None, min_itemsize=None, + chunksize=None, expectedrows=None, dropna=True, **kwargs): + + if not append and self.is_exists: + self._handle.removeNode(self.group, 'table') + + # create the axes + self.create_axes(axes=axes, obj=obj, validate=append, + min_itemsize=min_itemsize, + **kwargs) + + if not self.is_exists: + + # create the table + options = self.create_description(complib=complib, + complevel=complevel, + fletcher32=fletcher32, + expectedrows=expectedrows) + + # set the table attributes + self.set_attrs() + + # create the table + table = self._handle.createTable(self.group, **options) + + else: + table = self.table + + # update my info + self.set_info() + + # validate the axes and set the kinds + for a in self.axes: + a.validate_and_set(table, append) + + # add the rows + self.write_data(chunksize, dropna=dropna) + + def write_data(self, chunksize, dropna=True): + """ we form the data into a 2-d including indexes,values,mask + write chunk-by-chunk """ + + names = self.dtype.names + nrows = self.nrows_expected + + # if dropna==True, then drop ALL nan rows + if dropna: + + masks = [] + for a in self.values_axes: + + # figure the mask: only do if we can successfully process this + # column, otherwise ignore the mask + mask = com.isnull(a.data).all(axis=0) + masks.append(mask.astype('u1')) + + # consolidate masks + mask = masks[0] + for m in masks[1:]: + mask = mask & m + mask = mask.ravel() + + else: + + mask = np.empty(nrows, dtype='u1') + mask.fill(False) + + # broadcast the indexes if needed + indexes = [a.cvalues for a in self.index_axes] + nindexes = len(indexes) + bindexes = [] + for i, idx in enumerate(indexes): + + # broadcast to all other indexes except myself + if i > 0 and i < nindexes: + repeater = np.prod( + [indexes[bi].shape[0] for bi in range(0, i)]) + idx = np.tile(idx, repeater) + + if i < nindexes - 1: + repeater = np.prod([indexes[bi].shape[0] + for bi in range(i + 1, nindexes)]) + idx = np.repeat(idx, repeater) + + bindexes.append(idx) + + # transpose the values so first dimension is last + # reshape the values if needed + values = [a.take_data() for a in self.values_axes] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) + for v in values] + bvalues = [] + for i, v in enumerate(values): + new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape + bvalues.append(values[i].ravel().reshape(new_shape)) + + # write the chunks + if chunksize is 
None: + chunksize = 100000 + + chunks = int(nrows / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) + if start_i >= end_i: + break + + self.write_data_chunk( + indexes=[a[start_i:end_i] for a in bindexes], + mask=mask[start_i:end_i], + values=[v[start_i:end_i] for v in bvalues]) + + def write_data_chunk(self, indexes, mask, values): + + # 0 len + for v in values: + if not np.prod(v.shape): + return + + try: + nrows = indexes[0].shape[0] + rows = np.empty(nrows, dtype=self.dtype) + names = self.dtype.names + nindexes = len(indexes) + + # indexes + for i, idx in enumerate(indexes): + rows[names[i]] = idx + + # values + for i, v in enumerate(values): + rows[names[i + nindexes]] = v + + # mask + rows = rows[~mask.ravel().astype(bool)] + + except Exception as detail: + raise Exception("cannot create row-data -> %s" % detail) + + try: + if len(rows): + self.table.append(rows) + self.table.flush() + except Exception as detail: + raise TypeError("tables cannot write this data -> %s" % detail) + + def delete(self, where=None, start=None, stop=None, **kwargs): + + # delete all rows (and return the nrows) + if where is None or not len(where): + if start is None and stop is None: + nrows = self.nrows + self._handle.removeNode(self.group, recursive=True) + else: + # pytables<3.0 would remove a single row with stop=None + if stop is None: + stop = self.nrows + nrows = self.table.removeRows(start=start, stop=stop) + self.table.flush() + return nrows + + # infer the data kind + if not self.infer_axes(): + return None + + # create the selection + table = self.table + self.selection = Selection(self, where, start=start, stop=stop, **kwargs) + values = self.selection.select_coords() + + # delete the rows in reverse order + l = Series(values).order() + ln = len(l) + + if ln: + + # construct groups of consecutive rows + diff = l.diff() + groups = list(diff[diff > 1].index) + + # 1 group + if not len(groups): + groups = [0] + + # final element + if groups[-1] != ln: + groups.append(ln) + + # initial element + if groups[0] != 0: + groups.insert(0, 0) + + # we must remove in reverse order! 
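The chunked write path and the delete path above correspond to the chunksize argument of HDFStore.append and to HDFStore.remove with a where clause. A minimal sketch; the file name, key, and sizes are illustrative:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(1000), 'B': np.random.randn(1000)})
    with pd.HDFStore('append_demo.h5') as store:
        # rows are written chunk-by-chunk, as in write_data() above
        store.append('df', df, chunksize=250, data_columns=['A'])
        # delete a slice of rows; the number of rows removed is returned
        n_removed = store.remove('df', where='A > 900')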
+ pg = groups.pop() + for g in reversed(groups): + rows = l.take(lrange(g, pg)) + table.removeRows(start=rows[rows.index[0] + ], stop=rows[rows.index[-1]] + 1) + pg = g + + self.table.flush() + + # return the number of rows removed + return ln + + +class AppendableFrameTable(AppendableTable): + + """ suppor the new appendable table formats """ + pandas_kind = u('frame_table') + table_type = u('appendable_frame') + ndim = 2 + obj_type = DataFrame + + @property + def is_transposed(self): + return self.index_axes[0].axis == 1 + + def get_object(self, obj): + """ these are written transposed """ + if self.is_transposed: + obj = obj.T + return obj + + def read(self, where=None, columns=None, **kwargs): + + if not self.read_axes(where=where, **kwargs): + return None + + info = (self.info.get(self.non_index_axes[0][0], dict()) + if len(self.non_index_axes) else dict()) + index = self.index_axes[0].values + frames = [] + for a in self.values_axes: + + # we could have a multi-index constructor here + # _ensure_index doesn't recognized our list-of-tuples here + if info.get('type') == 'MultiIndex': + cols = MultiIndex.from_tuples(a.values) + else: + cols = Index(a.values) + names = info.get('names') + if names is not None: + cols.set_names(names, inplace=True) + + if self.is_transposed: + values = a.cvalues + index_ = cols + cols_ = Index(index, name=getattr(index, 'name', None)) + else: + values = a.cvalues.T + index_ = Index(index, name=getattr(index, 'name', None)) + cols_ = cols + + # if we have a DataIndexableCol, its shape will only be 1 dim + if values.ndim == 1: + values = values.reshape(1, values.shape[0]) + + block = make_block(values, placement=np.arange(len(cols_))) + mgr = BlockManager([block], [cols_, index_]) + frames.append(DataFrame(mgr)) + + if len(frames) == 1: + df = frames[0] + else: + df = concat(frames, axis=1, verify_integrity=False).consolidate() + + # apply the selection filters & axis orderings + df = self.process_axes(df, columns=columns) + + return df + + +class AppendableSeriesTable(AppendableFrameTable): + """ support the new appendable table formats """ + pandas_kind = u('series_table') + table_type = u('appendable_series') + ndim = 2 + obj_type = Series + storage_obj_type = DataFrame + + @property + def is_transposed(self): + return False + + def get_object(self, obj): + return obj + + def write(self, obj, data_columns=None, **kwargs): + """ we are going to write this as a frame table """ + if not isinstance(obj, DataFrame): + name = obj.name or 'values' + obj = DataFrame({name: obj}, index=obj.index) + obj.columns = [name] + return super(AppendableSeriesTable, self).write( + obj=obj, data_columns=obj.columns, **kwargs) + + def read(self, columns=None, **kwargs): + + is_multi_index = self.is_multi_index + if columns is not None and is_multi_index: + for n in self.levels: + if n not in columns: + columns.insert(0, n) + s = super(AppendableSeriesTable, self).read(columns=columns, **kwargs) + if is_multi_index: + s.set_index(self.levels, inplace=True) + + s = s.iloc[:, 0] + + # remove the default name + if s.name == 'values': + s.name = None + return s + + +class AppendableMultiSeriesTable(AppendableSeriesTable): + """ support the new appendable table formats """ + pandas_kind = u('series_table') + table_type = u('appendable_multiseries') + + def write(self, obj, **kwargs): + """ we are going to write this as a frame table """ + name = obj.name or 'values' + obj, self.levels = self.validate_multiindex(obj) + cols = list(self.levels) + cols.append(name) + obj.columns = 
cols + return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs) + + +class GenericTable(AppendableFrameTable): + """ a table that read/writes the generic pytables table format """ + pandas_kind = u('frame_table') + table_type = u('generic_table') + ndim = 2 + obj_type = DataFrame + + @property + def pandas_type(self): + return self.pandas_kind + + @property + def storable(self): + return getattr(self.group, 'table', None) or self.group + + def get_attrs(self): + """ retrieve our attributes """ + self.non_index_axes = [] + self.nan_rep = None + self.levels = [] + t = self.table + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in self.indexables if not a.is_an_indexable] + self.data_columns = [a.name for a in self.values_axes] + + @property + def indexables(self): + """ create the indexables from the table description """ + if self._indexables is None: + + d = self.description + + # the index columns is just a simple index + self._indexables = [GenericIndexCol(name='index', axis=0)] + + for i, n in enumerate(d._v_names): + + dc = GenericDataIndexableCol( + name=n, pos=i, values=[n], version=self.version) + self._indexables.append(dc) + + return self._indexables + + def write(self, **kwargs): + raise NotImplementedError("cannot write on an generic table") + + +class AppendableMultiFrameTable(AppendableFrameTable): + + """ a frame with a multi-index """ + table_type = u('appendable_multiframe') + obj_type = DataFrame + ndim = 2 + _re_levels = re.compile("^level_\d+$") + + @property + def table_type_short(self): + return u('appendable_multi') + + def write(self, obj, data_columns=None, **kwargs): + if data_columns is None: + data_columns = [] + elif data_columns is True: + data_columns = obj.columns[:] + obj, self.levels = self.validate_multiindex(obj) + for n in self.levels: + if n not in data_columns: + data_columns.insert(0, n) + return super(AppendableMultiFrameTable, self).write( + obj=obj, data_columns=data_columns, **kwargs) + + def read(self, **kwargs): + + df = super(AppendableMultiFrameTable, self).read(**kwargs) + df = df.set_index(self.levels) + + # remove names for 'level_%d' + df.index = df.index.set_names([ + None if self._re_levels.search(l) else l for l in df.index.names + ]) + + return df + +class AppendablePanelTable(AppendableTable): + + """ suppor the new appendable table formats """ + table_type = u('appendable_panel') + ndim = 3 + obj_type = Panel + + def get_object(self, obj): + """ these are written transposed """ + if self.is_transposed: + obj = obj.transpose(*self.data_orientation) + return obj + + @property + def is_transposed(self): + return self.data_orientation != tuple(range(self.ndim)) + + +class AppendableNDimTable(AppendablePanelTable): + + """ suppor the new appendable table formats """ + table_type = u('appendable_ndim') + ndim = 4 + obj_type = Panel4D + + +def _reindex_axis(obj, axis, labels, other=None): + ax = obj._get_axis(axis) + labels = _ensure_index(labels) + + # try not to reindex even if other is provided + # if it equals our current index + if other is not None: + other = _ensure_index(other) + if (other is None or labels.equals(other)) and labels.equals(ax): + return obj + + labels = _ensure_index(labels.unique()) + if other is not None: + labels = labels & _ensure_index(other.unique()) + if not labels.equals(ax): + slicer = [slice(None, None)] * obj.ndim + slicer[axis] = labels + obj = obj.loc[tuple(slicer)] + return obj + + +def _get_info(info, name): + """ 
get/create the info for this name """ + try: + idx = info[name] + except: + idx = info[name] = dict() + return idx + + +def _convert_index(index, encoding=None, format_type=None): + index_name = getattr(index, 'name', None) + + if isinstance(index, DatetimeIndex): + converted = index.asi8 + return IndexCol(converted, 'datetime64', _tables().Int64Col(), + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), + index_name=index_name) + elif isinstance(index, (Int64Index, PeriodIndex)): + atom = _tables().Int64Col() + return IndexCol( + index.values, 'integer', atom, freq=getattr(index, 'freq', None), + index_name=index_name) + + if isinstance(index, MultiIndex): + raise TypeError('MultiIndex not supported here!') + + inferred_type = lib.infer_dtype(index) + + values = np.asarray(index) + + if inferred_type == 'datetime64': + converted = values.view('i8') + return IndexCol(converted, 'datetime64', _tables().Int64Col(), + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), + index_name=index_name) + elif inferred_type == 'datetime': + converted = np.array([(time.mktime(v.timetuple()) + + v.microsecond / 1E6) for v in values], + dtype=np.float64) + return IndexCol(converted, 'datetime', _tables().Time64Col(), + index_name=index_name) + elif inferred_type == 'date': + converted = np.array([v.toordinal() for v in values], + dtype=np.int32) + return IndexCol(converted, 'date', _tables().Time32Col(), + index_name=index_name) + elif inferred_type == 'string': + # atom = _tables().ObjectAtom() + # return np.asarray(values, dtype='O'), 'object', atom + + converted = _convert_string_array(values, encoding) + itemsize = converted.dtype.itemsize + return IndexCol( + converted, 'string', _tables().StringCol(itemsize), + itemsize=itemsize, index_name=index_name + ) + elif inferred_type == 'unicode': + if format_type == 'fixed': + atom = _tables().ObjectAtom() + return IndexCol(np.asarray(values, dtype='O'), 'object', atom, + index_name=index_name) + raise TypeError( + "[unicode] is not supported as a in index type for [{0}] formats" + .format(format_type) + ) + + elif inferred_type == 'integer': + # take a guess for now, hope the values fit + atom = _tables().Int64Col() + return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom, + index_name=index_name) + elif inferred_type == 'floating': + atom = _tables().Float64Col() + return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom, + index_name=index_name) + else: # pragma: no cover + atom = _tables().ObjectAtom() + return IndexCol(np.asarray(values, dtype='O'), 'object', atom, + index_name=index_name) + + +def _unconvert_index(data, kind, encoding=None): + kind = _ensure_decoded(kind) + if kind == u('datetime64'): + index = DatetimeIndex(data) + elif kind == u('datetime'): + index = np.array([datetime.fromtimestamp(v) for v in data], + dtype=object) + elif kind == u('date'): + try: + index = np.array( + [date.fromordinal(v) for v in data], dtype=object) + except (ValueError): + index = np.array( + [date.fromtimestamp(v) for v in data], dtype=object) + elif kind in (u('integer'), u('float')): + index = np.array(data) + elif kind in (u('string')): + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) + elif kind == u('object'): + index = np.array(data[0]) + else: # pragma: no cover + raise ValueError('unrecognized index type %s' % kind) + return index + + +def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): + kind = _ensure_decoded(kind) + if kind == u('datetime'): + index 
= lib.time64_to_datetime(data) + elif kind in (u('integer')): + index = np.array(data, dtype=object) + elif kind in (u('string')): + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) + else: # pragma: no cover + raise ValueError('unrecognized index type %s' % kind) + return index + + +def _convert_string_array(data, encoding, itemsize=None): + + # encode if needed + if encoding is not None and len(data): + f = np.vectorize(lambda x: x.encode(encoding), otypes=[np.object]) + data = f(data) + + # create the sized dtype + if itemsize is None: + itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + + data = np.array(data, dtype="S%d" % itemsize) + return data + +def _unconvert_string_array(data, nan_rep=None, encoding=None): + """ deserialize a string array, possibly decoding """ + shape = data.shape + data = np.array(data.ravel(), dtype=object) + + # guard against a None encoding in PY3 (because of a legacy + # where the passed encoding is actually None) + encoding = _ensure_encoding(encoding) + if encoding is not None and len(data): + + try: + itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + if compat.PY3: + dtype = "U{0}".format(itemsize) + else: + dtype = "S{0}".format(itemsize) + data = data.astype(dtype).astype(object) + except (Exception) as e: + f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) + data = f(data) + + if nan_rep is None: + nan_rep = 'nan' + + data = lib.string_array_replace_from_nan_rep(data, nan_rep) + return data.reshape(shape) + + +def _maybe_convert(values, val_kind, encoding): + if _need_convert(val_kind): + conv = _get_converter(val_kind, encoding) + # conv = np.frompyfunc(conv, 1, 1) + values = conv(values) + return values + + +def _get_converter(kind, encoding): + kind = _ensure_decoded(kind) + if kind == 'datetime64': + return lambda x: np.array(x, dtype='M8[ns]') + elif kind == 'datetime': + return lib.convert_timestamps + elif kind == 'string': + return lambda x: _unconvert_string_array(x, encoding=encoding) + else: # pragma: no cover + raise ValueError('invalid kind %s' % kind) + + +def _need_convert(kind): + kind = _ensure_decoded(kind) + if kind in (u('datetime'), u('datetime64'), u('string')): + return True + return False + + +class Selection(object): + + """ + Carries out a selection operation on a tables.Table object. 
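    A ``where`` can be an expression compiled against the table's queryables
    (its axes plus any ``data_columns``), or a list of row coordinates / a
    boolean mask. A minimal sketch through the public HDFStore API (the file
    name, key and data are illustrative):

    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4], 'B': list('abcde')})
    >>> store = pd.HDFStore('demo.h5', mode='w')
    >>> store.append('df', df, data_columns=['B'])
    >>> store.select('df', where='B == "c"')      # expression over the queryables
       A  B
    2  2  c
    >>> store.select('df', where=[0, 2, 4])       # explicit row coordinates
       A  B
    0  0  a
    2  2  c
    4  4  e
    >>> store.close()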
+ + Parameters + ---------- + table : a Table object + where : list of Terms (or convertable to) + start, stop: indicies to start and/or stop selection + + """ + + def __init__(self, table, where=None, start=None, stop=None, **kwargs): + self.table = table + self.where = where + self.start = start + self.stop = stop + self.condition = None + self.filter = None + self.terms = None + self.coordinates = None + + if com.is_list_like(where): + + # see if we have a passed coordinate like + try: + inferred = lib.infer_dtype(where) + if inferred == 'integer' or inferred == 'boolean': + where = np.array(where) + if where.dtype == np.bool_: + start, stop = self.start, self.stop + if start is None: + start = 0 + if stop is None: + stop = self.table.nrows + self.coordinates = np.arange(start, stop)[where] + elif issubclass(where.dtype.type, np.integer): + if ((self.start is not None and + (where < self.start).any()) or + (self.stop is not None and + (where >= self.stop).any())): + raise ValueError( + "where must have index locations >= start and " + "< stop" + ) + self.coordinates = where + + except: + pass + + if self.coordinates is None: + + self.terms = self.generate(where) + + # create the numexpr & the filter + if self.terms is not None: + self.condition, self.filter = self.terms.evaluate() + + def generate(self, where): + """ where can be a : dict,list,tuple,string """ + if where is None: + return None + + q = self.table.queryables() + try: + return Expr(where, queryables=q, encoding=self.table.encoding) + except NameError as detail: + # raise a nice message, suggesting that the user should use + # data_columns + raise ValueError( + "The passed where expression: {0}\n" + " contains an invalid variable reference\n" + " all of the variable refrences must be a " + "reference to\n" + " an axis (e.g. 'index' or 'columns'), or a " + "data_column\n" + " The currently defined references are: {1}\n" + .format(where, ','.join(q.keys())) + ) + + def select(self): + """ + generate the selection + """ + if self.condition is not None: + return self.table.table.readWhere(self.condition.format(), + start=self.start, stop=self.stop) + elif self.coordinates is not None: + return self.table.table.readCoordinates(self.coordinates) + return self.table.table.read(start=self.start, stop=self.stop) + + def select_coords(self): + """ + generate the selection + """ + start, stop = self.start, self.stop + nrows = self.table.nrows + if start is None: + start = 0 + elif start < 0: + start += nrows + if self.stop is None: + stop = nrows + elif stop < 0: + stop += nrows + + if self.condition is not None: + return self.table.table.getWhereList(self.condition.format(), + start=start, stop=stop, + sort=True) + elif self.coordinates is not None: + return self.coordinates + + return np.arange(start, stop) + +# utilities ### + +def timeit(key, df, fn=None, remove=True, **kwargs): + if fn is None: + fn = 'timeit.h5' + store = HDFStore(fn, mode='w') + store.append(key, df, **kwargs) + store.close() + + if remove: + os.remove(fn) diff --git a/pandas/io/sql.py b/pandas/io/sql.py new file mode 100644 index 00000000..23ca80d7 --- /dev/null +++ b/pandas/io/sql.py @@ -0,0 +1,1244 @@ +""" +Collection of query wrappers / abstractions to both facilitate data +retrieval and to reduce dependency on DB-specific API. 
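A minimal round trip through the DBAPI (sqlite3) path; the in-memory
connection and the table name ``demo`` are illustrative:

>>> import sqlite3
>>> import pandas as pd
>>> conn = sqlite3.connect(':memory:')
>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
>>> df.to_sql('demo', conn, index=False)
>>> pd.read_sql('SELECT * FROM demo', conn)
   a  b
0  1  x
1  2  y
2  3  z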
+""" +from __future__ import print_function, division +from datetime import datetime, date, timedelta + +import warnings +import traceback +import itertools +import re +import numpy as np + +import pandas.core.common as com +from pandas.compat import lzip, map, zip, raise_with_traceback, string_types +from pandas.core.api import DataFrame, Series +from pandas.core.base import PandasObject +from pandas.tseries.tools import to_datetime + + +class SQLAlchemyRequired(ImportError): + pass + + +class DatabaseError(IOError): + pass + + +#------------------------------------------------------------------------------ +# Helper functions + +_SQLALCHEMY_INSTALLED = None + +def _is_sqlalchemy_engine(con): + global _SQLALCHEMY_INSTALLED + if _SQLALCHEMY_INSTALLED is None: + try: + import sqlalchemy + _SQLALCHEMY_INSTALLED = True + + from distutils.version import LooseVersion + ver = LooseVersion(sqlalchemy.__version__) + # For sqlalchemy versions < 0.8.2, the BIGINT type is recognized + # for a sqlite engine, which results in a warning when trying to + # read/write a DataFrame with int64 values. (GH7433) + if ver < '0.8.2': + from sqlalchemy import BigInteger + from sqlalchemy.ext.compiler import compiles + + @compiles(BigInteger, 'sqlite') + def compile_big_int_sqlite(type_, compiler, **kw): + return 'INTEGER' + except ImportError: + _SQLALCHEMY_INSTALLED = False + + if _SQLALCHEMY_INSTALLED: + import sqlalchemy + return isinstance(con, sqlalchemy.engine.Engine) + else: + return False + + +def _convert_params(sql, params): + """convert sql and params args to DBAPI2.0 compliant format""" + args = [sql] + if params is not None: + if hasattr(params, 'keys'): # test if params is a mapping + args += [params] + else: + args += [list(params)] + return args + + +def _handle_date_column(col, format=None): + if isinstance(format, dict): + return to_datetime(col, **format) + else: + if format in ['D', 's', 'ms', 'us', 'ns']: + return to_datetime(col, coerce=True, unit=format) + elif issubclass(col.dtype.type, np.floating) or issubclass(col.dtype.type, np.integer): + # parse dates as timestamp + format = 's' if format is None else format + return to_datetime(col, coerce=True, unit=format) + else: + return to_datetime(col, coerce=True, format=format) + + +def _parse_date_columns(data_frame, parse_dates): + """ Force non-datetime columns to be read as such. + Supports both string formatted and integer timestamp columns + """ + # handle non-list entries for parse_dates gracefully + if parse_dates is True or parse_dates is None or parse_dates is False: + parse_dates = [] + + if not hasattr(parse_dates, '__iter__'): + parse_dates = [parse_dates] + + for col_name in parse_dates: + df_col = data_frame[col_name] + try: + fmt = parse_dates[col_name] + except TypeError: + fmt = None + data_frame[col_name] = _handle_date_column(df_col, format=fmt) + + return data_frame + + +def execute(sql, con, cur=None, params=None): + """ + Execute the given SQL query using the provided connection object. + + Parameters + ---------- + sql : string + Query to be executed + con : SQLAlchemy engine or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + cur : deprecated, cursor is obtained from connection + params : list or tuple, optional + List of parameters to pass to execute method. 
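    For a DBAPI connection the parameters are forwarded to ``cursor.execute``;
    a small self-contained sketch using sqlite3's ``?`` placeholder style (the
    table name ``demo`` is illustrative):

    >>> import sqlite3
    >>> import pandas as pd
    >>> from pandas.io import sql
    >>> conn = sqlite3.connect(':memory:')
    >>> pd.DataFrame({'a': [1, 2, 3]}).to_sql('demo', conn, index=False)
    >>> cur = sql.execute('SELECT a FROM demo WHERE a > ?', conn, params=[1])
    >>> cur.fetchall()
    [(2,), (3,)]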
+ + Returns + ------- + Results Iterable + """ + if cur is None: + pandas_sql = pandasSQL_builder(con) + else: + pandas_sql = pandasSQL_builder(cur, is_cursor=True) + args = _convert_params(sql, params) + return pandas_sql.execute(*args) + + +#------------------------------------------------------------------------------ +#--- Deprecated tquery and uquery + +def _safe_fetch(cur): + try: + result = cur.fetchall() + if not isinstance(result, list): + result = list(result) + return result + except Exception as e: # pragma: no cover + excName = e.__class__.__name__ + if excName == 'OperationalError': + return [] + +def tquery(sql, con=None, cur=None, retry=True): + """ + DEPRECATED. Returns list of tuples corresponding to each row in given sql + query. + + If only one column selected, then plain list is returned. + + To obtain the same result in the future, you can use the following: + + >>> execute(sql, con, params).fetchall() + + Parameters + ---------- + sql: string + SQL query to be executed + con: DBAPI2 connection + cur: deprecated, cursor is obtained from connection + + Returns + ------- + Results Iterable + + """ + warnings.warn( + "tquery is deprecated, and will be removed in future versions. " + "You can use ``execute(...).fetchall()`` instead.", + FutureWarning) + + cur = execute(sql, con, cur=cur) + result = _safe_fetch(cur) + + if con is not None: + try: + cur.close() + con.commit() + except Exception as e: + excName = e.__class__.__name__ + if excName == 'OperationalError': # pragma: no cover + print('Failed to commit, may need to restart interpreter') + else: + raise + + traceback.print_exc() + if retry: + return tquery(sql, con=con, retry=False) + + if result and len(result[0]) == 1: + # python 3 compat + result = list(lzip(*result)[0]) + elif result is None: # pragma: no cover + result = [] + + return result + + +def uquery(sql, con=None, cur=None, retry=True, params=None): + """ + DEPRECATED. Does the same thing as tquery, but instead of returning results, it + returns the number of rows affected. Good for update queries. + + To obtain the same result in the future, you can use the following: + + >>> execute(sql, con).rowcount + + Parameters + ---------- + sql: string + SQL query to be executed + con: DBAPI2 connection + cur: deprecated, cursor is obtained from connection + params: list or tuple, optional + List of parameters to pass to execute method. + + Returns + ------- + Number of affected rows + + """ + warnings.warn( + "uquery is deprecated, and will be removed in future versions. " + "You can use ``execute(...).rowcount`` instead.", + FutureWarning) + + cur = execute(sql, con, cur=cur, params=params) + + result = cur.rowcount + try: + con.commit() + except Exception as e: + excName = e.__class__.__name__ + if excName != 'OperationalError': + raise + + traceback.print_exc() + if retry: + print('Looks like your connection failed, reconnecting...') + return uquery(sql, con, retry=False) + return result + + +#------------------------------------------------------------------------------ +#--- Read and write to DataFrames + +def read_sql_table(table_name, con, index_col=None, coerce_float=True, + parse_dates=None, columns=None): + """Read SQL database table into a DataFrame. + + Given a table name and an SQLAlchemy engine, returns a DataFrame. + This function does not support DBAPI connections. 
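    For example, with an SQLAlchemy engine (requires SQLAlchemy; the engine URL
    and the table name ``demo`` are illustrative):

    >>> import pandas as pd
    >>> from sqlalchemy import create_engine
    >>> engine = create_engine('sqlite:///:memory:')
    >>> pd.DataFrame({'a': [1, 2, 3]}).to_sql('demo', engine, index=False)
    >>> pd.read_sql_table('demo', engine, columns=['a'])
       a
    0  1
    1  2
    2  3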
+ + Parameters + ---------- + table_name : string + Name of SQL table in database + con : SQLAlchemy engine + Sqlite DBAPI conncection mode not supported + index_col : string, optional + Column to set as index + coerce_float : boolean, default True + Attempt to convert values to non-string, non-numeric objects (like + decimal.Decimal) to floating point. Can result in loss of Precision. + parse_dates : list or dict + - List of column names to parse as dates + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite + columns : list + List of column names to select from sql table + + Returns + ------- + DataFrame + + See also + -------- + read_sql_query : Read SQL query into a DataFrame. + read_sql + + """ + if not _is_sqlalchemy_engine(con): + raise NotImplementedError("read_sql_table only supported for " + "SQLAlchemy engines.") + import sqlalchemy + from sqlalchemy.schema import MetaData + meta = MetaData(con) + try: + meta.reflect(only=[table_name]) + except sqlalchemy.exc.InvalidRequestError: + raise ValueError("Table %s not found" % table_name) + + pandas_sql = PandasSQLAlchemy(con, meta=meta) + table = pandas_sql.read_table( + table_name, index_col=index_col, coerce_float=coerce_float, + parse_dates=parse_dates, columns=columns) + + if table is not None: + return table + else: + raise ValueError("Table %s not found" % table_name, con) + + +def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, + parse_dates=None): + """Read SQL query into a DataFrame. + + Returns a DataFrame corresponding to the result set of the query + string. Optionally provide an `index_col` parameter to use one of the + columns as the index, otherwise default integer index will be used. + + Parameters + ---------- + sql : string + SQL query to be executed + con : SQLAlchemy engine or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + index_col : string, optional + Column name to use as index for the returned DataFrame object. + coerce_float : boolean, default True + Attempt to convert values to non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets + params : list, tuple or dict, optional + List of parameters to pass to execute method. 
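        A self-contained sketch combining ``params`` and ``parse_dates``
        (the table and column names below are illustrative):

        >>> import sqlite3
        >>> import pandas as pd
        >>> conn = sqlite3.connect(':memory:')
        >>> _ = conn.execute("CREATE TABLE logs (ts TEXT, n INTEGER)")
        >>> _ = conn.execute("INSERT INTO logs VALUES ('2014-07-01', 1)")
        >>> frame = pd.read_sql_query('SELECT * FROM logs WHERE n >= ?', conn,
        ...                           params=(1,), parse_dates=['ts'])
        >>> frame['ts'].dtype                    # parsed to datetime64[ns]
        dtype('<M8[ns]')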
+ parse_dates : list or dict + - List of column names to parse as dates + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite + + Returns + ------- + DataFrame + + See also + -------- + read_sql_table : Read SQL database table into a DataFrame + read_sql + + """ + pandas_sql = pandasSQL_builder(con) + return pandas_sql.read_sql( + sql, index_col=index_col, params=params, coerce_float=coerce_float, + parse_dates=parse_dates) + + +def read_sql(sql, con, index_col=None, coerce_float=True, params=None, + parse_dates=None, columns=None): + """ + Read SQL query or database table into a DataFrame. + + Parameters + ---------- + sql : string + SQL query to be executed or database table name. + con : SQLAlchemy engine or DBAPI2 connection (legacy mode) + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + index_col : string, optional + column name to use as index for the returned DataFrame object. + coerce_float : boolean, default True + Attempt to convert values to non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets + params : list, tuple or dict, optional + List of parameters to pass to execute method. + parse_dates : list or dict + - List of column names to parse as dates + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps + - Dict of ``{column_name: arg dict}``, where the arg dict corresponds + to the keyword arguments of :func:`pandas.to_datetime` + Especially useful with databases without native Datetime support, + such as SQLite + columns : list + List of column names to select from sql table (only used when reading + a table). + + Returns + ------- + DataFrame + + Notes + ----- + This function is a convenience wrapper around ``read_sql_table`` and + ``read_sql_query`` (and for backward compatibility) and will delegate + to the specific function depending on the provided input (database + table name or sql query). + + See also + -------- + read_sql_table : Read SQL database table into a DataFrame + read_sql_query : Read SQL query into a DataFrame + + """ + pandas_sql = pandasSQL_builder(con) + + if isinstance(pandas_sql, PandasSQLLegacy): + return pandas_sql.read_sql( + sql, index_col=index_col, params=params, + coerce_float=coerce_float, parse_dates=parse_dates) + + if pandas_sql.has_table(sql): + pandas_sql.meta.reflect(only=[sql]) + return pandas_sql.read_table( + sql, index_col=index_col, coerce_float=coerce_float, + parse_dates=parse_dates, columns=columns) + else: + return pandas_sql.read_sql( + sql, index_col=index_col, params=params, + coerce_float=coerce_float, parse_dates=parse_dates) + + +def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True, + index_label=None): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table + con : SQLAlchemy engine or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. 
+ If a DBAPI2 object, only sqlite3 is supported. + flavor : {'sqlite', 'mysql'}, default 'sqlite' + The flavor of SQL to use. Ignored when using SQLAlchemy engine. + 'mysql' is deprecated and will be removed in future versions, but it + will be further supported through SQLAlchemy engines. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + + """ + if if_exists not in ('fail', 'replace', 'append'): + raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) + + pandas_sql = pandasSQL_builder(con, flavor=flavor) + + if isinstance(frame, Series): + frame = frame.to_frame() + elif not isinstance(frame, DataFrame): + raise NotImplementedError + + pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, + index_label=index_label) + + +def has_table(table_name, con, flavor='sqlite'): + """ + Check if DataBase has named table. + + Parameters + ---------- + table_name: string + Name of SQL table + con: SQLAlchemy engine or sqlite3 DBAPI2 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. + If a DBAPI2 object, only sqlite3 is supported. + flavor: {'sqlite', 'mysql'}, default 'sqlite' + The flavor of SQL to use. Ignored when using SQLAlchemy engine. + 'mysql' is deprecated and will be removed in future versions, but it + will be further supported through SQLAlchemy engines. + + Returns + ------- + boolean + """ + pandas_sql = pandasSQL_builder(con, flavor=flavor) + return pandas_sql.has_table(table_name) + +table_exists = has_table + + +_MYSQL_WARNING = ("The 'mysql' flavor with DBAPI connection is deprecated " + "and will be removed in future versions. " + "MySQL will be further supported with SQLAlchemy engines.") + +def pandasSQL_builder(con, flavor=None, meta=None, is_cursor=False): + """ + Convenience function to return the correct PandasSQL subclass based on the + provided parameters + """ + # When support for DBAPI connections is removed, + # is_cursor should not be necessary. + if _is_sqlalchemy_engine(con): + return PandasSQLAlchemy(con, meta=meta) + else: + if flavor == 'mysql': + warnings.warn(_MYSQL_WARNING, FutureWarning) + return PandasSQLLegacy(con, flavor, is_cursor=is_cursor) + + +class PandasSQLTable(PandasObject): + """ + For mapping Pandas tables to SQL tables. + Uses fact that table is reflected by SQLAlchemy to + do better type convertions. + Also holds various flags needed to avoid having to + pass them between functions all the time. + """ + # TODO: support for multiIndex + def __init__(self, name, pandas_sql_engine, frame=None, index=True, + if_exists='fail', prefix='pandas', index_label=None): + self.name = name + self.pd_sql = pandas_sql_engine + self.prefix = prefix + self.frame = frame + self.index = self._index_name(index, index_label) + + if frame is not None: + # We want to write a frame + if self.pd_sql.has_table(self.name): + if if_exists == 'fail': + raise ValueError("Table '%s' already exists." 
% name) + elif if_exists == 'replace': + self.pd_sql.drop_table(self.name) + self.table = self._create_table_statement() + self.create() + elif if_exists == 'append': + self.table = self.pd_sql.get_table(self.name) + if self.table is None: + self.table = self._create_table_statement() + else: + raise ValueError( + "'{0}' is not valid for if_exists".format(if_exists)) + else: + self.table = self._create_table_statement() + self.create() + else: + # no data provided, read-only mode + self.table = self.pd_sql.get_table(self.name) + + if self.table is None: + raise ValueError("Could not init table '%s'" % name) + + def exists(self): + return self.pd_sql.has_table(self.name) + + def sql_schema(self): + from sqlalchemy.schema import CreateTable + return str(CreateTable(self.table)) + + def create(self): + self.table.create() + + def insert_statement(self): + return self.table.insert() + + def maybe_asscalar(self, i): + try: + return np.asscalar(i) + except AttributeError: + return i + + def insert_data(self): + if self.index is not None: + temp = self.frame.copy() + temp.index.names = self.index + try: + temp.reset_index(inplace=True) + except ValueError as err: + raise ValueError( + "duplicate name in index/columns: {0}".format(err)) + else: + temp = self.frame + + return temp + + def insert(self): + ins = self.insert_statement() + data_list = [] + temp = self.insert_data() + keys = list(map(str, temp.columns)) + + for t in temp.itertuples(): + data = dict((k, self.maybe_asscalar(v)) + for k, v in zip(keys, t[1:])) + data_list.append(data) + + self.pd_sql.execute(ins, data_list) + + def read(self, coerce_float=True, parse_dates=None, columns=None): + + if columns is not None and len(columns) > 0: + from sqlalchemy import select + cols = [self.table.c[n] for n in columns] + if self.index is not None: + [cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]] + sql_select = select(cols) + else: + sql_select = self.table.select() + + result = self.pd_sql.execute(sql_select) + data = result.fetchall() + column_names = result.keys() + + self.frame = DataFrame.from_records( + data, columns=column_names, coerce_float=coerce_float) + + self._harmonize_columns(parse_dates=parse_dates) + + if self.index is not None: + self.frame.set_index(self.index, inplace=True) + + return self.frame + + def _index_name(self, index, index_label): + # for writing: index=True to include index in sql table + if index is True: + nlevels = self.frame.index.nlevels + # if index_label is specified, set this as index name(s) + if index_label is not None: + if not isinstance(index_label, list): + index_label = [index_label] + if len(index_label) != nlevels: + raise ValueError( + "Length of 'index_label' should match number of " + "levels, which is {0}".format(nlevels)) + else: + return index_label + # return the used column labels for the index columns + if nlevels == 1 and 'index' not in self.frame.columns and self.frame.index.name is None: + return ['index'] + else: + return [l if l is not None else "level_{0}".format(i) + for i, l in enumerate(self.frame.index.names)] + + # for reading: index=(list of) string to specify column to set as index + elif isinstance(index, string_types): + return [index] + elif isinstance(index, list): + return index + else: + return None + + def _create_table_statement(self): + from sqlalchemy import Table, Column + + columns = list(map(str, self.frame.columns)) + column_types = map(self._sqlalchemy_type, self.frame.dtypes) + + columns = [Column(name, typ) + for name, typ in zip(columns, 
column_types)] + + if self.index is not None: + for i, idx_label in enumerate(self.index[::-1]): + idx_type = self._sqlalchemy_type( + self.frame.index.get_level_values(i)) + columns.insert(0, Column(idx_label, idx_type, index=True)) + + return Table(self.name, self.pd_sql.meta, *columns) + + def _harmonize_columns(self, parse_dates=None): + """ Make a data_frame's column type align with an sql_table + column types + Need to work around limited NA value support. + Floats are always fine, ints must always + be floats if there are Null values. + Booleans are hard because converting bool column with None replaces + all Nones with false. Therefore only convert bool if there are no + NA values. + Datetimes should already be converted + to np.datetime if supported, but here we also force conversion + if required + """ + # handle non-list entries for parse_dates gracefully + if parse_dates is True or parse_dates is None or parse_dates is False: + parse_dates = [] + + if not hasattr(parse_dates, '__iter__'): + parse_dates = [parse_dates] + + for sql_col in self.table.columns: + col_name = sql_col.name + try: + df_col = self.frame[col_name] + # the type the dataframe column should have + col_type = self._numpy_type(sql_col.type) + + if col_type is datetime or col_type is date: + if not issubclass(df_col.dtype.type, np.datetime64): + self.frame[col_name] = _handle_date_column(df_col) + + elif col_type is float: + # floats support NA, can always convert! + self.frame[col_name].astype(col_type, copy=False) + + elif len(df_col) == df_col.count(): + # No NA values, can convert ints and bools + if col_type is int or col_type is bool: + self.frame[col_name].astype(col_type, copy=False) + + # Handle date parsing + if col_name in parse_dates: + try: + fmt = parse_dates[col_name] + except TypeError: + fmt = None + self.frame[col_name] = _handle_date_column( + df_col, format=fmt) + + except KeyError: + pass # this column not in results + + def _sqlalchemy_type(self, arr_or_dtype): + from sqlalchemy.types import (BigInteger, Float, Text, Boolean, + DateTime, Date, Interval) + + if arr_or_dtype is date: + return Date + if com.is_datetime64_dtype(arr_or_dtype): + try: + tz = arr_or_dtype.tzinfo + return DateTime(timezone=True) + except: + return DateTime + if com.is_timedelta64_dtype(arr_or_dtype): + warnings.warn("the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", UserWarning) + return BigInteger + elif com.is_float_dtype(arr_or_dtype): + return Float + elif com.is_integer_dtype(arr_or_dtype): + # TODO: Refine integer size. + return BigInteger + elif com.is_bool_dtype(arr_or_dtype): + return Boolean + return Text + + def _numpy_type(self, sqltype): + from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date + + if isinstance(sqltype, Float): + return float + if isinstance(sqltype, Integer): + # TODO: Refine integer size. + return int + if isinstance(sqltype, DateTime): + # Caution: np.datetime64 is also a subclass of np.number. 
+ return datetime + if isinstance(sqltype, Date): + return date + if isinstance(sqltype, Boolean): + return bool + return object + + +class PandasSQL(PandasObject): + """ + Subclasses Should define read_sql and to_sql + """ + + def read_sql(self, *args, **kwargs): + raise ValueError( + "PandasSQL must be created with an SQLAlchemy engine or connection+sql flavor") + + def to_sql(self, *args, **kwargs): + raise ValueError( + "PandasSQL must be created with an SQLAlchemy engine or connection+sql flavor") + + +class PandasSQLAlchemy(PandasSQL): + """ + This class enables convertion between DataFrame and SQL databases + using SQLAlchemy to handle DataBase abstraction + """ + + def __init__(self, engine, meta=None): + self.engine = engine + if not meta: + from sqlalchemy.schema import MetaData + meta = MetaData(self.engine) + + self.meta = meta + + def execute(self, *args, **kwargs): + """Simple passthrough to SQLAlchemy engine""" + return self.engine.execute(*args, **kwargs) + + def read_table(self, table_name, index_col=None, coerce_float=True, + parse_dates=None, columns=None): + + table = PandasSQLTable(table_name, self, index=index_col) + return table.read(coerce_float=coerce_float, + parse_dates=parse_dates, columns=columns) + + def read_sql(self, sql, index_col=None, coerce_float=True, + parse_dates=None, params=None): + args = _convert_params(sql, params) + + result = self.execute(*args) + data = result.fetchall() + columns = result.keys() + + data_frame = DataFrame.from_records( + data, columns=columns, coerce_float=coerce_float) + + _parse_date_columns(data_frame, parse_dates) + + if index_col is not None: + data_frame.set_index(index_col, inplace=True) + + return data_frame + + def to_sql(self, frame, name, if_exists='fail', index=True, + index_label=None): + table = PandasSQLTable( + name, self, frame=frame, index=index, if_exists=if_exists, + index_label=index_label) + table.insert() + + @property + def tables(self): + return self.meta.tables + + def has_table(self, name): + return self.engine.has_table(name) + + def get_table(self, table_name): + return self.meta.tables.get(table_name) + + def drop_table(self, table_name): + if self.engine.has_table(table_name): + self.meta.reflect(only=[table_name]) + self.get_table(table_name).drop() + self.meta.clear() + + def _create_sql_schema(self, frame, table_name): + table = PandasSQLTable(table_name, self, frame=frame) + return str(table.sql_schema()) + + +# ---- SQL without SQLAlchemy --- +# Flavour specific sql strings and handler class for access to DBs without +# SQLAlchemy installed +# SQL type convertions for each DB +_SQL_TYPES = { + 'text': { + 'mysql': 'VARCHAR (63)', + 'sqlite': 'TEXT', + }, + 'float': { + 'mysql': 'FLOAT', + 'sqlite': 'REAL', + }, + 'int': { + 'mysql': 'BIGINT', + 'sqlite': 'INTEGER', + }, + 'datetime': { + 'mysql': 'DATETIME', + 'sqlite': 'TIMESTAMP', + }, + 'date': { + 'mysql': 'DATE', + 'sqlite': 'TIMESTAMP', + }, + 'bool': { + 'mysql': 'BOOLEAN', + 'sqlite': 'INTEGER', + } +} + +# SQL enquote and wildcard symbols +_SQL_SYMB = { + 'mysql': { + 'br_l': '`', + 'br_r': '`', + 'wld': '%s' + }, + 'sqlite': { + 'br_l': '[', + 'br_r': ']', + 'wld': '?' + } +} + + +_SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. " + "In pandas versions < 0.14, spaces were converted to " + "underscores.") + + +class PandasSQLTableLegacy(PandasSQLTable): + """Patch the PandasSQLTable for legacy support. 
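    The ``_SQL_TYPES`` / ``_SQL_SYMB`` mappings above drive the generated DDL
    and quoting; e.g. through the module-level ``get_schema`` helper (the
    frame and the table name ``demo`` are illustrative):

    >>> import pandas as pd
    >>> from pandas.io import sql
    >>> frame = pd.DataFrame({'a': [1.5]})
    >>> ddl_sqlite = sql.get_schema(frame, 'demo', flavor='sqlite')  # column as '[a] REAL'
    >>> ddl_mysql = sql.get_schema(frame, 'demo', flavor='mysql')    # column as '`a` FLOAT'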
+ Instead of a table variable just use the Create Table + statement""" + def sql_schema(self): + return str(self.table) + + def create(self): + self.pd_sql.execute(self.table) + + def insert_statement(self): + names = list(map(str, self.frame.columns)) + flv = self.pd_sql.flavor + br_l = _SQL_SYMB[flv]['br_l'] # left val quote char + br_r = _SQL_SYMB[flv]['br_r'] # right val quote char + wld = _SQL_SYMB[flv]['wld'] # wildcard char + + if self.index is not None: + [names.insert(0, idx) for idx in self.index[::-1]] + + bracketed_names = [br_l + column + br_r for column in names] + col_names = ','.join(bracketed_names) + wildcards = ','.join([wld] * len(names)) + insert_statement = 'INSERT INTO %s (%s) VALUES (%s)' % ( + self.name, col_names, wildcards) + return insert_statement + + def insert(self): + ins = self.insert_statement() + temp = self.insert_data() + data_list = [] + + for t in temp.itertuples(): + data = tuple((self.maybe_asscalar(v) for v in t[1:])) + data_list.append(data) + + cur = self.pd_sql.con.cursor() + cur.executemany(ins, data_list) + cur.close() + self.pd_sql.con.commit() + + def _create_table_statement(self): + "Return a CREATE TABLE statement to suit the contents of a DataFrame." + + columns = list(map(str, self.frame.columns)) + pat = re.compile('\s+') + if any(map(pat.search, columns)): + warnings.warn(_SAFE_NAMES_WARNING) + column_types = [self._sql_type_name(typ) for typ in self.frame.dtypes] + + if self.index is not None: + for i, idx_label in enumerate(self.index[::-1]): + columns.insert(0, idx_label) + column_types.insert(0, self._sql_type_name(self.frame.index.get_level_values(i).dtype)) + + flv = self.pd_sql.flavor + + br_l = _SQL_SYMB[flv]['br_l'] # left val quote char + br_r = _SQL_SYMB[flv]['br_r'] # right val quote char + + col_template = br_l + '%s' + br_r + ' %s' + + columns = ',\n '.join(col_template % + x for x in zip(columns, column_types)) + template = """CREATE TABLE %(name)s ( + %(columns)s + )""" + create_statement = template % {'name': self.name, 'columns': columns} + return create_statement + + def _sql_type_name(self, dtype): + pytype = dtype.type + pytype_name = "text" + if issubclass(pytype, np.floating): + pytype_name = "float" + elif com.is_timedelta64_dtype(pytype): + warnings.warn("the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", UserWarning) + pytype_name = "int" + elif issubclass(pytype, np.integer): + pytype_name = "int" + elif issubclass(pytype, np.datetime64) or pytype is datetime: + # Caution: np.datetime64 is also a subclass of np.number. 
+ pytype_name = "datetime" + elif pytype is datetime.date: + pytype_name = "date" + elif issubclass(pytype, np.bool_): + pytype_name = "bool" + + return _SQL_TYPES[pytype_name][self.pd_sql.flavor] + + +class PandasSQLLegacy(PandasSQL): + + def __init__(self, con, flavor, is_cursor=False): + self.is_cursor = is_cursor + self.con = con + if flavor is None: + flavor = 'sqlite' + if flavor not in ['sqlite', 'mysql']: + raise NotImplementedError + else: + self.flavor = flavor + + def execute(self, *args, **kwargs): + if self.is_cursor: + cur = self.con + else: + cur = self.con.cursor() + try: + if kwargs: + cur.execute(*args, **kwargs) + else: + cur.execute(*args) + return cur + except Exception as e: + try: + self.con.rollback() + except Exception: # pragma: no cover + ex = DatabaseError( + "Execution failed on sql: %s\n%s\nunable to rollback" % (args[0], e)) + raise_with_traceback(ex) + + ex = DatabaseError("Execution failed on sql: %s" % args[0]) + raise_with_traceback(ex) + + def read_sql(self, sql, index_col=None, coerce_float=True, params=None, + parse_dates=None): + args = _convert_params(sql, params) + cursor = self.execute(*args) + columns = [col_desc[0] for col_desc in cursor.description] + data = self._fetchall_as_list(cursor) + cursor.close() + + data_frame = DataFrame.from_records( + data, columns=columns, coerce_float=coerce_float) + + _parse_date_columns(data_frame, parse_dates) + + if index_col is not None: + data_frame.set_index(index_col, inplace=True) + return data_frame + + def _fetchall_as_list(self, cur): + result = cur.fetchall() + if not isinstance(result, list): + result = list(result) + return result + + def to_sql(self, frame, name, if_exists='fail', index=True, + index_label=None): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame: DataFrame + name: name of SQL table + flavor: {'sqlite', 'mysql'}, default 'sqlite' + if_exists: {'fail', 'replace', 'append'}, default 'fail' + fail: If table exists, do nothing. + replace: If table exists, drop it, recreate it, and insert data. + append: If table exists, insert data. Create if does not exist. + + """ + table = PandasSQLTableLegacy( + name, self, frame=frame, index=index, if_exists=if_exists, + index_label=index_label) + table.insert() + + def has_table(self, name): + flavor_map = { + 'sqlite': ("SELECT name FROM sqlite_master " + "WHERE type='table' AND name='%s';") % name, + 'mysql': "SHOW TABLES LIKE '%s'" % name} + query = flavor_map.get(self.flavor) + + return len(self.execute(query).fetchall()) > 0 + + def get_table(self, table_name): + return None # not supported in Legacy mode + + def drop_table(self, name): + drop_sql = "DROP TABLE %s" % name + self.execute(drop_sql) + + def _create_sql_schema(self, frame, table_name): + table = PandasSQLTableLegacy(table_name, self, frame=frame) + return str(table.sql_schema()) + + +def get_schema(frame, name, flavor='sqlite', keys=None, con=None): + """ + Get the SQL db table schema for the given frame. + + Parameters + ---------- + frame : DataFrame + name : string + name of SQL table + flavor : {'sqlite', 'mysql'}, default 'sqlite' + The flavor of SQL to use. Ignored when using SQLAlchemy engine. + 'mysql' is deprecated and will be removed in future versions, but it + will be further supported through SQLAlchemy engines. + keys : string or sequence + columns to use a primary key + con: an open SQL database connection object or an SQLAlchemy engine + Using SQLAlchemy makes it possible to use any DB supported by that + library. 
+ If a DBAPI2 object, only sqlite3 is supported. + + """ + + if con is None: + if flavor == 'mysql': + warnings.warn(_MYSQL_WARNING, FutureWarning) + return _get_schema_legacy(frame, name, flavor, keys) + + pandas_sql = pandasSQL_builder(con=con, flavor=flavor) + return pandas_sql._create_sql_schema(frame, name) + + +def _get_schema_legacy(frame, name, flavor, keys=None): + """Old function from 0.13.1. To keep backwards compatibility. + When mysql legacy support is dropped, it should be possible to + remove this code + """ + + def get_sqltype(dtype, flavor): + pytype = dtype.type + pytype_name = "text" + if issubclass(pytype, np.floating): + pytype_name = "float" + elif issubclass(pytype, np.integer): + pytype_name = "int" + elif issubclass(pytype, np.datetime64) or pytype is datetime: + # Caution: np.datetime64 is also a subclass of np.number. + pytype_name = "datetime" + elif pytype is datetime.date: + pytype_name = "date" + elif issubclass(pytype, np.bool_): + pytype_name = "bool" + + return _SQL_TYPES[pytype_name][flavor] + + lookup_type = lambda dtype: get_sqltype(dtype, flavor) + + column_types = lzip(frame.dtypes.index, map(lookup_type, frame.dtypes)) + if flavor == 'sqlite': + columns = ',\n '.join('[%s] %s' % x for x in column_types) + else: + columns = ',\n '.join('`%s` %s' % x for x in column_types) + + keystr = '' + if keys is not None: + if isinstance(keys, string_types): + keys = (keys,) + keystr = ', PRIMARY KEY (%s)' % ','.join(keys) + template = """CREATE TABLE %(name)s ( + %(columns)s + %(keystr)s + );""" + create_statement = template % {'name': name, 'columns': columns, + 'keystr': keystr} + return create_statement + + +# legacy names, with depreciation warnings and copied docs + +def read_frame(*args, **kwargs): + """DEPRECATED - use read_sql + """ + warnings.warn("read_frame is deprecated, use read_sql", FutureWarning) + return read_sql(*args, **kwargs) + + +def frame_query(*args, **kwargs): + """DEPRECATED - use read_sql + """ + warnings.warn("frame_query is deprecated, use read_sql", FutureWarning) + return read_sql(*args, **kwargs) + + +def write_frame(frame, name, con, flavor='sqlite', if_exists='fail', **kwargs): + """DEPRECATED - use to_sql + + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + con : DBAPI2 connection + flavor : {'sqlite', 'mysql'}, default 'sqlite' + The flavor of SQL to use. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default False + Write DataFrame index as a column + + Notes + ----- + This function is deprecated in favor of ``to_sql``. There are however + two differences: + + - With ``to_sql`` the index is written to the sql database by default. To + keep the behaviour this function you need to specify ``index=False``. + - The new ``to_sql`` function supports sqlalchemy engines to work with + different sql flavors. 
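    i.e. the following two calls are intended to be equivalent, up to the
    deprecation warning (the connection and the table name ``demo`` are
    illustrative):

    >>> import sqlite3
    >>> import pandas as pd
    >>> from pandas.io import sql
    >>> conn = sqlite3.connect(':memory:')
    >>> df = pd.DataFrame({'a': [1, 2, 3]})
    >>> sql.write_frame(df, 'demo', conn)                          # index not written
    >>> df.to_sql('demo', conn, if_exists='replace', index=False)  # current spelling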
+ + See also + -------- + pandas.DataFrame.to_sql + + """ + warnings.warn("write_frame is deprecated, use to_sql", FutureWarning) + + # for backwards compatibility, set index=False when not specified + index = kwargs.pop('index', False) + return to_sql(frame, name, con, flavor=flavor, if_exists=if_exists, + index=index, **kwargs) + + +# Append wrapped function docstrings +read_frame.__doc__ += read_sql.__doc__ +frame_query.__doc__ += read_sql.__doc__ diff --git a/pandas/io/stata.py b/pandas/io/stata.py new file mode 100644 index 00000000..ed6b540b --- /dev/null +++ b/pandas/io/stata.py @@ -0,0 +1,1378 @@ +""" +Module contains tools for processing Stata files into DataFrames + +The StataReader below was originally written by Joe Presbrey as part of PyDTA. +It has been extended and improved by Skipper Seabold from the Statsmodels +project who also developed the StataWriter and was finally added to pandas in +an once again improved version. + +You can find more information on http://presbrey.mit.edu/PyDTA and +http://statsmodels.sourceforge.net/devel/ +""" +# TODO: Fix this module so it can use cross-compatible zip, map, and range +import numpy as np + +import sys +import struct +from pandas.core.base import StringMixin +from pandas.core.frame import DataFrame +from pandas.core.series import Series +from pandas.core.categorical import Categorical +import datetime +from pandas import compat +from pandas.compat import long, lrange, lmap, lzip, text_type, string_types +from pandas import isnull +from pandas.io.common import get_filepath_or_buffer +from pandas.tslib import NaT + +def read_stata(filepath_or_buffer, convert_dates=True, + convert_categoricals=True, encoding=None, index=None): + """ + Read Stata file into DataFrame + + Parameters + ---------- + filepath_or_buffer : string or file-like object + Path to .dta file or object implementing a binary read() functions + convert_dates : boolean, defaults to True + Convert date variables to DataFrame time values + convert_categoricals : boolean, defaults to True + Read value labels and convert columns to Categorical/Factor variables + encoding : string, None or encoding + Encoding used to parse the files. Note that Stata doesn't + support unicode. None defaults to cp1252. + index : identifier of index column + identifier of column that should be used as index of the DataFrame + """ + reader = StataReader(filepath_or_buffer, encoding) + + return reader.data(convert_dates, convert_categoricals, index) + +_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] + + +def _stata_elapsed_date_to_datetime(date, fmt): + """ + Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + date : int + The Stata Internal Format date to convert to datetime according to fmt + fmt : str + The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + + Examples + -------- + >>> _stata_elapsed_date_to_datetime(52, "%tw") + datetime.datetime(1961, 1, 1, 0, 0) + + Notes + ----- + datetime/c - tc + milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day + datetime/C - tC - NOT IMPLEMENTED + milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds + date - td + days since 01jan1960 (01jan1960 = 0) + weekly date - tw + weeks since 1960w1 + This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. + The datetime value is the start of the week in terms of days in the + year, not ISO calendar weeks. 
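        As a worked check of the %tw rule (mirroring the doctest above, not
        part of the reader itself):

        >>> import datetime
        >>> date, epoch = 52, datetime.datetime(1960, 1, 1)
        >>> datetime.datetime(epoch.year + date // 52, 1, 1) + datetime.timedelta((date % 52) * 7)
        datetime.datetime(1961, 1, 1, 0, 0)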
+ monthly date - tm + months since 1960m1 + quarterly date - tq + quarters since 1960q1 + half-yearly date - th + half-years since 1960h1 yearly + date - ty + years since 0000 + + If you don't have pandas with datetime support, then you can't do + milliseconds accurately. + """ + #NOTE: we could run into overflow / loss of precision situations here + # casting to int, but I'm not sure what to do. datetime won't deal with + # numpy types and numpy datetime isn't mature enough / we can't rely on + # pandas version > 0.7.1 + #TODO: IIRC relative delta doesn't play well with np.datetime? + #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly + if np.isnan(date): + return NaT + + date = int(date) + stata_epoch = datetime.datetime(1960, 1, 1) + if fmt in ["%tc", "tc"]: + from dateutil.relativedelta import relativedelta + return stata_epoch + relativedelta(microseconds=date * 1000) + elif fmt in ["%tC", "tC"]: + from warnings import warn + warn("Encountered %tC format. Leaving in Stata Internal Format.") + return date + elif fmt in ["%td", "td", "%d", "d"]: + return stata_epoch + datetime.timedelta(int(date)) + elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) + day_delta = (date % 52) * 7 + return year + datetime.timedelta(int(day_delta)) + elif fmt in ["%tm", "tm"]: + year = stata_epoch.year + date // 12 + month_delta = (date % 12) + 1 + return datetime.datetime(year, month_delta, 1) + elif fmt in ["%tq", "tq"]: + year = stata_epoch.year + date // 4 + month_delta = (date % 4) * 3 + 1 + return datetime.datetime(year, month_delta, 1) + elif fmt in ["%th", "th"]: + year = stata_epoch.year + date // 2 + month_delta = (date % 2) * 6 + 1 + return datetime.datetime(year, month_delta, 1) + elif fmt in ["%ty", "ty"]: + if date > 0: + return datetime.datetime(date, 1, 1) + else: # don't do negative years bc can't mix dtypes in column + raise ValueError("Year 0 and before not implemented") + else: + raise ValueError("Date fmt %s not understood" % fmt) + + +def _datetime_to_stata_elapsed(date, fmt): + """ + Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + date : datetime.datetime + The date to convert to the Stata Internal Format given by fmt + fmt : str + The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + """ + if not isinstance(date, datetime.datetime): + raise ValueError("date should be datetime.datetime format") + stata_epoch = datetime.datetime(1960, 1, 1) + # Handle NaTs + if date is NaT: + # Missing value for dates ('.'), assumed always double + # TODO: Should be moved so a const somewhere, and consolidated + return struct.unpack(' 6) + elif fmt in ["%ty", "ty"]: + return date.year + else: + raise ValueError("fmt %s not understood" % fmt) + + +class PossiblePrecisionLoss(Warning): + pass + + +precision_loss_doc = """ +Column converted from %s to %s, and some data are outside of the lossless +conversion range. This may result in a loss of precision in the saved data. +""" + + +class InvalidColumnName(Warning): + pass + + +invalid_name_doc = """ +Not all pandas column names were valid Stata variable names. 
+The following replacements have been made: + + {0} + +If this is not what you expect, please make sure you have Stata-compliant +column names in your DataFrame (strings only, max 32 characters, only alphanumerics and +underscores, no Stata reserved words) +""" + +def _cast_to_stata_types(data): + """Checks the dtypes of the columns of a pandas DataFrame for + compatibility with the data types and ranges supported by Stata, and + converts if necessary. + + Parameters + ---------- + data : DataFrame + The DataFrame to check and convert + + Notes + ----- + Numeric columns must be one of int8, int16, int32, float32 or float64, with + some additional value restrictions on the integer data types. int8 and + int16 columns are checked for violations of the value restrictions and + upcast if needed. int64 data is not usable in Stata, and so it is + downcast to int32 whenever the value are in the int32 range, and + sidecast to float64 when larger than this range. If the int64 values + are outside of the range of those perfectly representable as float64 values, + a warning is raised. + """ + ws = '' + for col in data: + dtype = data[col].dtype + if dtype == np.int8: + if data[col].max() > 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype == np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype == np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) + + return data + + +class StataMissingValue(StringMixin): + """ + An observation's missing value. + + Parameters + ----------- + offset + value + + Attributes + ---------- + string + value + + Notes + ----- + More information: + """ + # TODO: Needs test + def __init__(self, offset, value): + self._value = value + value_type = type(value) + if value_type in int: + loc = value - offset + elif value_type in (float, np.float32, np.float64): + if value <= np.finfo(np.float32).max: # float32 + conv_str, byte_loc, scale = '
+ self.format_version = int(self.path_or_buf.read(3)) + if self.format_version not in [117]: + raise ValueError("Version of given Stata file is not 104, " + "105, 108, 113 (Stata 8/9), 114 (Stata " + "10/11), 115 (Stata 12) or 117 (Stata 13)") + self.path_or_buf.read(21) # + self.byteorder = self.path_or_buf.read(3) == "MSF" and '>' or '<' + self.path_or_buf.read(15) # + self.nvar = struct.unpack(self.byteorder + 'H', + self.path_or_buf.read(2))[0] + self.path_or_buf.read(7) # + self.nobs = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] + self.path_or_buf.read(11) # + strlen = struct.unpack('b', self.path_or_buf.read(1))[0] + self.time_stamp = self._null_terminate(self.path_or_buf.read(strlen)) + self.path_or_buf.read(26) #
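+            # Illustrative aside, assuming a little-endian file: each offset in
+            # this header is an 8-byte integer decoded with struct.unpack using
+            # the byteorder detected above, e.g.
+            #     struct.unpack('<q', b'\x10\x00\x00\x00\x00\x00\x00\x00')[0] == 16
+            # which is why the format strings below are self.byteorder + 'q'.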
+ self.path_or_buf.read(8) # 0x0000000000000000 + self.path_or_buf.read(8) # position of + seek_vartypes = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 16 + seek_varnames = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 + seek_sortlist = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 + seek_formats = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9 + seek_value_label_names = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19 + seek_variable_labels = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17 + self.path_or_buf.read(8) # + self.data_location = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6 + self.seek_strls = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 7 + self.seek_value_labels = struct.unpack( + self.byteorder + 'q', self.path_or_buf.read(8))[0] + 14 + #self.path_or_buf.read(8) # + #self.path_or_buf.read(8) # EOF + self.path_or_buf.seek(seek_vartypes) + typlist = [struct.unpack(self.byteorder + 'H', + self.path_or_buf.read(2))[0] + for i in range(self.nvar)] + self.typlist = [None]*self.nvar + try: + i = 0 + for typ in typlist: + if typ <= 2045: + self.typlist[i] = typ + elif typ == 32768: + raise ValueError("Long strings are not supported") + else: + self.typlist[i] = self.TYPE_MAP_XML[typ] + i += 1 + except: + raise ValueError("cannot convert stata types [{0}]" + .format(','.join(typlist))) + self.dtyplist = [None]*self.nvar + try: + i = 0 + for typ in typlist: + if typ <= 2045: + self.dtyplist[i] = str(typ) + else: + self.dtyplist[i] = self.DTYPE_MAP_XML[typ] + i += 1 + except: + raise ValueError("cannot convert stata dtypes [{0}]" + .format(','.join(typlist))) + + self.path_or_buf.seek(seek_varnames) + self.varlist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] + + self.path_or_buf.seek(seek_sortlist) + self.srtlist = struct.unpack( + self.byteorder + ('h' * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)) + )[:-1] + + self.path_or_buf.seek(seek_formats) + self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) + for i in range(self.nvar)] + + self.path_or_buf.seek(seek_value_label_names) + self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] + + self.path_or_buf.seek(seek_variable_labels) + self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) + for i in range(self.nvar)] + else: + # header + self.format_version = struct.unpack('b', first_char)[0] + if self.format_version not in [104, 105, 108, 113, 114, 115]: + raise ValueError("Version of given Stata file is not 104, " + "105, 108, 113 (Stata 8/9), 114 (Stata " + "10/11), 115 (Stata 12) or 117 (Stata 13)") + self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[0] == 0x1 and '>' or '<' + self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0] + self.path_or_buf.read(1) # unused + + self.nvar = struct.unpack(self.byteorder + 'H', + self.path_or_buf.read(2))[0] + self.nobs = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] + if self.format_version > 105: + self.data_label = self._null_terminate(self.path_or_buf.read(81)) + else: + self.data_label = self._null_terminate(self.path_or_buf.read(32)) + if self.format_version > 104: + self.time_stamp = self._null_terminate(self.path_or_buf.read(18)) + + # descriptors + if self.format_version > 108: + typlist = [ord(self.path_or_buf.read(1)) + for i in 
range(self.nvar)] + else: + typlist = [ + self.OLD_TYPE_MAPPING[ + self._decode_bytes(self.path_or_buf.read(1)) + ] for i in range(self.nvar) + ] + + try: + self.typlist = [self.TYPE_MAP[typ] for typ in typlist] + except: + raise ValueError("cannot convert stata types [{0}]" + .format(','.join(typlist))) + try: + self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] + except: + raise ValueError("cannot convert stata dtypes [{0}]" + .format(','.join(typlist))) + + if self.format_version > 108: + self.varlist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] + else: + self.varlist = [self._null_terminate(self.path_or_buf.read(9)) + for i in range(self.nvar)] + self.srtlist = struct.unpack( + self.byteorder + ('h' * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)) + )[:-1] + if self.format_version > 113: + self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) + for i in range(self.nvar)] + elif self.format_version > 104: + self.fmtlist = [self._null_terminate(self.path_or_buf.read(12)) + for i in range(self.nvar)] + else: + self.fmtlist = [self._null_terminate(self.path_or_buf.read(7)) + for i in range(self.nvar)] + if self.format_version > 108: + self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) + for i in range(self.nvar)] + else: + self.lbllist = [self._null_terminate(self.path_or_buf.read(9)) + for i in range(self.nvar)] + if self.format_version > 105: + self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) + for i in range(self.nvar)] + else: + self.vlblist = [self._null_terminate(self.path_or_buf.read(32)) + for i in range(self.nvar)] + + # ignore expansion fields (Format 105 and later) + # When reading, read five bytes; the last four bytes now tell you + # the size of the next read, which you discard. You then continue + # like this until you read 5 bytes of zeros. 
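+            # Hypothetical sketch of one expansion field in a format-113+ file:
+            #     b'\x01'              one-byte data type (non-zero, so not the end)
+            #     b'\x03\x00\x00\x00'  four-byte length (here 3, byteorder-dependent)
+            #     b'xyz'               payload, read and discarded
+            # The loop below stops once the data type byte is zero, i.e. the
+            # "5 bytes of zeros" terminator described above.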
+ + if self.format_version > 104: + while True: + data_type = struct.unpack(self.byteorder + 'b', + self.path_or_buf.read(1))[0] + if self.format_version > 108: + data_len = struct.unpack(self.byteorder + 'i', + self.path_or_buf.read(4))[0] + else: + data_len = struct.unpack(self.byteorder + 'h', + self.path_or_buf.read(2))[0] + if data_type == 0: + break + self.path_or_buf.read(data_len) + + # necessary data to continue parsing + self.data_location = self.path_or_buf.tell() + + self.has_string_data = len([x for x in self.typlist + if type(x) is int]) > 0 + + """Calculate size of a data record.""" + self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist) + + def _calcsize(self, fmt): + return (type(fmt) is int and fmt + or struct.calcsize(self.byteorder + fmt)) + + def _col_size(self, k=None): + if k is None: + return self.col_sizes + else: + return self.col_sizes[k] + + def _unpack(self, fmt, byt): + d = struct.unpack(self.byteorder + fmt, byt)[0] + if fmt[-1] in self.VALID_RANGE: + nmin, nmax = self.VALID_RANGE[fmt[-1]] + if d < nmin or d > nmax: + if self._missing_values: + return StataMissingValue(nmax, d) + else: + return None + return d + + def _null_terminate(self, s): + if compat.PY3 or self._encoding is not None: # have bytes not strings, + # so must decode + null_byte = b"\0" + try: + s = s[:s.index(null_byte)] + except: + pass + return s.decode(self._encoding or self._default_encoding) + else: + null_byte = "\0" + try: + return s.lstrip(null_byte)[:s.index(null_byte)] + except: + return s + + def _next(self): + typlist = self.typlist + if self.has_string_data: + data = [None] * self.nvar + for i in range(len(data)): + if type(typlist[i]) is int: + data[i] = self._null_terminate( + self.path_or_buf.read(typlist[i]) + ) + else: + data[i] = self._unpack( + typlist[i], self.path_or_buf.read(self._col_size(i)) + ) + return data + else: + return list( + map( + lambda i: self._unpack(typlist[i], + self.path_or_buf.read( + self._col_size(i) + )), + range(self.nvar) + ) + ) + + def _dataset(self): + """ + Returns a Python generator object for iterating over the dataset. + + + Parameters + ---------- + + Returns + ------- + Generator object for iterating over the dataset. Yields each row of + observations as a list by default. + + Notes + ----- + If missing_values is True during instantiation of StataReader then + observations with _StataMissingValue(s) are not filtered and should + be handled by your applcation. + """ + + self.path_or_buf.seek(self.data_location) + + for i in range(self.nobs): + yield self._next() + + def _read_value_labels(self): + if self.format_version >= 117: + self.path_or_buf.seek(self.seek_value_labels) + else: + if not self._data_read: + raise Exception("Data has not been read. Because of the " + "layout of Stata files, this is necessary " + "before reading value labels.") + if self._value_labels_read: + raise Exception("Value labels have already been read.") + + self.value_label_dict = dict() + + if self.format_version <= 108: + # Value labels are not supported in version 108 and earlier. 
+ return + + while True: + if self.format_version >= 117: + if self.path_or_buf.read(5) == b' + break # end o f variable lable table + + slength = self.path_or_buf.read(4) + if not slength: + break # end of variable lable table (format < 117) + labname = self._null_terminate(self.path_or_buf.read(33)) + self.path_or_buf.read(3) # padding + + n = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] + txtlen = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] + off = [] + for i in range(n): + off.append(struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0]) + val = [] + for i in range(n): + val.append(struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0]) + txt = self.path_or_buf.read(txtlen) + self.value_label_dict[labname] = dict() + for i in range(n): + self.value_label_dict[labname][val[i]] = ( + self._null_terminate(txt[off[i]:]) + ) + + if self.format_version >= 117: + self.path_or_buf.read(6) # + self._value_labels_read = True + + def _read_strls(self): + self.path_or_buf.seek(self.seek_strls) + self.GSO = dict() + while True: + if self.path_or_buf.read(3) != b'GSO': + break + + v_o = struct.unpack(self.byteorder + 'L', + self.path_or_buf.read(8))[0] + typ = self.path_or_buf.read(1) + length = struct.unpack(self.byteorder + 'I', + self.path_or_buf.read(4))[0] + self.GSO[v_o] = self.path_or_buf.read(length-1) + self.path_or_buf.read(1) # zero-termination + + def data(self, convert_dates=True, convert_categoricals=True, index=None): + """ + Reads observations from Stata file, converting them into a dataframe + + Parameters + ---------- + convert_dates : boolean, defaults to True + Convert date variables to DataFrame time values + convert_categoricals : boolean, defaults to True + Read value labels and convert columns to Categorical/Factor + variables + index : identifier of index column + identifier of column that should be used as index of the DataFrame + + Returns + ------- + y : DataFrame instance + """ + if self._data_read: + raise Exception("Data has already been read.") + self._data_read = True + + if self.format_version >= 117: + self._read_strls() + + stata_dta = self._dataset() + + data = [] + for rownum, line in enumerate(stata_dta): + # doesn't handle missing value objects, just casts + # None will only work without missing value object. 
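+            # Hypothetical illustration of the cast below: a parsed line such
+            # as [1.0, None, 'abc'] is appended as the tuple (1.0, nan, 'abc');
+            # None is the only missing marker seen here for scalar types,
+            # since missing strings come back as '' rather than None.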
+ for i, val in enumerate(line): + #NOTE: This will only be scalar types because missing strings + # are empty not None in Stata + if val is None: + line[i] = np.nan + data.append(tuple(line)) + + if convert_categoricals: + self._read_value_labels() + + if len(data)==0: + data = DataFrame(columns=self.varlist, index=index) + else: + data = DataFrame(data, columns=self.varlist, index=index) + + cols_ = np.where(self.dtyplist)[0] + for i in cols_: + if self.dtyplist[i] is not None: + col = data.columns[i] + if data[col].dtype is not np.dtype(object): + data[col] = Series(data[col], data[col].index, + self.dtyplist[i]) + + if convert_dates: + cols = np.where(lmap(lambda x: x in _date_formats, + self.fmtlist))[0] + for i in cols: + col = data.columns[i] + data[col] = data[col].apply(_stata_elapsed_date_to_datetime, + args=(self.fmtlist[i],)) + + if convert_categoricals: + cols = np.where( + lmap(lambda x: x in compat.iterkeys(self.value_label_dict), + self.lbllist) + )[0] + for i in cols: + col = data.columns[i] + labeled_data = np.copy(data[col]) + labeled_data = labeled_data.astype(object) + for k, v in compat.iteritems( + self.value_label_dict[self.lbllist[i]]): + labeled_data[(data[col] == k).values] = v + data[col] = Categorical.from_array(labeled_data) + + return data + + def data_label(self): + """Returns data label of Stata file""" + return self.data_label + + def variable_labels(self): + """Returns variable labels as a dict, associating each variable name + with corresponding label + """ + return dict(zip(self.varlist, self.vlblist)) + + def value_labels(self): + """Returns a dict, associating each variable name a dict, associating + each value its corresponding label + """ + if not self._value_labels_read: + self._read_value_labels() + + return self.value_label_dict + + +def _open_file_binary_write(fname, encoding): + if hasattr(fname, 'write'): + #if 'b' not in fname.mode: + return fname + return open(fname, "wb") + + +def _set_endianness(endianness): + if endianness.lower() in ["<", "little"]: + return "<" + elif endianness.lower() in [">", "big"]: + return ">" + else: # pragma : no cover + raise ValueError("Endianness %s not understood" % endianness) + + +def _pad_bytes(name, length): + """ + Takes a char string and pads it wih null bytes until it's length chars + """ + return name + "\x00" * (length - len(name)) + + +def _default_names(nvar): + """ + Returns default Stata names v1, v2, ... vnvar + """ + return ["v%d" % i for i in range(1, nvar+1)] + + +def _convert_datetime_to_stata_type(fmt): + """ + Converts from one of the stata date formats to a type in TYPE_MAP + """ + if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq", + "%tq", "th", "%th", "ty", "%ty"]: + return np.float64 # Stata expects doubles for SIFs + else: + raise ValueError("fmt %s not understood" % fmt) + + +def _maybe_convert_to_int_keys(convert_dates, varlist): + new_dict = {} + for key in convert_dates: + if not convert_dates[key].startswith("%"): # make sure proper fmts + convert_dates[key] = "%" + convert_dates[key] + if key in varlist: + new_dict.update({varlist.index(key): convert_dates[key]}) + else: + if not isinstance(key, int): + raise ValueError( + "convert_dates key is not in varlist and is not an int" + ) + new_dict.update({key: convert_dates[key]}) + return new_dict + + +def _dtype_to_stata_type(dtype): + """ + Converts dtype types to stata types. Returns the byte of the given ordinal. + See TYPE_MAP and comments for an explanation. This is also explained in + the dta spec. 
+ 1 - 244 are strings of this length + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double + + If there are dates to convert, then dtype will already have the correct + type inserted. + """ + #TODO: expand to handle datetime to integer conversion + if dtype.type == np.string_: + return chr(dtype.itemsize) + elif dtype.type == np.object_: # try to coerce it to the biggest string + # not memory efficient, what else could we + # do? + return chr(244) + elif dtype == np.float64: + return chr(255) + elif dtype == np.float32: + return chr(254) + elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: + return chr(252) + elif dtype == np.int8: + return chr(251) + else: # pragma : no cover + raise ValueError("Data type %s not currently understood. " + "Please report an error to the developers." % dtype) + + +def _dtype_to_default_stata_fmt(dtype): + """ + Maps numpy dtype to stata's default format for this type. Not terribly + important since users can change this in Stata. Semantics are + + string -> "%DDs" where DD is the length of the string + float64 -> "%10.0g" + float32 -> "%9.0g" + int64 -> "%9.0g" + int32 -> "%12.0g" + int16 -> "%8.0g" + int8 -> "%8.0g" + """ + #TODO: expand this to handle a default datetime format? + if dtype.type == np.string_: + return "%" + str(dtype.itemsize) + "s" + elif dtype.type == np.object_: + return "%244s" + elif dtype == np.float64: + return "%10.0g" + elif dtype == np.float32: + return "%9.0g" + elif dtype == np.int32: + return "%12.0g" + elif dtype == np.int8 or dtype == np.int16: + return "%8.0g" + else: # pragma : no cover + raise ValueError("Data type %s not currently understood. " + "Please report an error to the developers." % dtype) + + +class StataWriter(StataParser): + """ + A class for writing Stata binary dta files from array-like objects + + Parameters + ---------- + fname : file path or buffer + Where to save the dta file. + data : array-like + Array-like input to save. Pandas objects are also accepted. + convert_dates : dict + Dictionary mapping column of datetime types to the stata internal + format that you want to use for the dates. Options are + 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a + number or a name. + encoding : str + Default is latin-1. Note that Stata does not support unicode. + byteorder : str + Can be ">", "<", "little", or "big". The default is None which uses + `sys.byteorder` + time_stamp : datetime + A date time to use when writing the file. Can be None, in which + case the current time is used. + dataset_label : str + A label for the data set. Should be 80 characters or smaller. + + Returns + ------- + writer : StataWriter instance + The StataWriter instance has a write_file method, which will + write the file to the given `fname`. 
+ + Examples + -------- + >>> writer = StataWriter('./data_file.dta', data) + >>> writer.write_file() + + Or with dates + + >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'}) + >>> writer.write_file() + """ + def __init__(self, fname, data, convert_dates=None, write_index=True, + encoding="latin-1", byteorder=None, time_stamp=None, + data_label=None): + super(StataWriter, self).__init__(encoding) + self._convert_dates = convert_dates + self._write_index = write_index + self._time_stamp = time_stamp + self._data_label = data_label + # attach nobs, nvars, data, varlist, typlist + self._prepare_pandas(data) + + if byteorder is None: + byteorder = sys.byteorder + self._byteorder = _set_endianness(byteorder) + self._file = _open_file_binary_write( + fname, self._encoding or self._default_encoding + ) + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} + + def _write(self, to_write): + """ + Helper to call encode before writing to file for Python 3 compat. + """ + if compat.PY3: + self._file.write(to_write.encode(self._encoding or + self._default_encoding)) + else: + self._file.write(to_write) + + + def _replace_nans(self, data): + # return data + """Checks floating point data columns for nans, and replaces these with + the generic Stata for missing value (.)""" + for c in data: + dtype = data[c].dtype + if dtype in (np.float32, np.float64): + if dtype == np.float32: + replacement = self.MISSING_VALUES['f'] + else: + replacement = self.MISSING_VALUES['d'] + data[c] = data[c].fillna(replacement) + + return data + + def _check_column_names(self, data): + """Checks column names to ensure that they are valid Stata column names. + This includes checks for: + * Non-string names + * Stata keywords + * Variables that start with numbers + * Variables with names that are too long + + When an illegal variable name is detected, it is converted, and if dates + are exported, the variable name is propogated to the date conversion + dictionary + """ + converted_names = [] + columns = list(data.columns) + original_columns = columns[:] + + duplicate_var_id = 0 + for j, name in enumerate(columns): + orig_name = name + if not isinstance(name, string_types): + name = text_type(name) + + for c in name: + if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and \ + (c < '0' or c > '9') and c != '_': + name = name.replace(c, '_') + + # Variable name must not be a reserved word + if name in self.RESERVED_WORDS: + name = '_' + name + + # Variable name may not start with a number + if name[0] >= '0' and name[0] <= '9': + name = '_' + name + + name = name[:min(len(name), 32)] + + if not name == orig_name: + # check for duplicates + while columns.count(name) > 0: + # prepend ascending number to avoid duplicates + name = '_' + str(duplicate_var_id) + name + name = name[:min(len(name), 32)] + duplicate_var_id += 1 + + # need to possibly encode the orig name if its unicode + try: + orig_name = orig_name.encode('utf-8') + except: + pass + converted_names.append('{0} -> {1}'.format(orig_name, name)) + + columns[j] = name + + data.columns = columns + + # Check date conversion, and fix key if needed + if self._convert_dates: + for c, o in zip(columns, original_columns): + if c != o: + self._convert_dates[c] = self._convert_dates[o] + del self._convert_dates[o] + + if converted_names: + import warnings + + ws = invalid_name_doc.format('\n '.join(converted_names)) + warnings.warn(ws, InvalidColumnName) + + return data + + def _prepare_pandas(self, data): + #NOTE: we might need a different API / class 
for pandas objects so + # we can set different semantics - handle this with a PR to pandas.io + class DataFrameRowIter(object): + def __init__(self, data): + self.data = data + + def __iter__(self): + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] + + if self._write_index: + data = data.reset_index() + # Check columns for compatibility with stata + data = _cast_to_stata_types(data) + # Ensure column names are strings + data = self._check_column_names(data) + # Replace NaNs with Stata missing values + data = self._replace_nans(data) + self.datarows = DataFrameRowIter(data) + self.nobs, self.nvar = data.shape + self.data = data + self.varlist = data.columns.tolist() + dtypes = data.dtypes + if self._convert_dates is not None: + self._convert_dates = _maybe_convert_to_int_keys( + self._convert_dates, self.varlist + ) + for key in self._convert_dates: + new_type = _convert_datetime_to_stata_type( + self._convert_dates[key] + ) + dtypes[key] = np.dtype(new_type) + self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes] + self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes] + # set the given format for the datetime cols + if self._convert_dates is not None: + for key in self._convert_dates: + self.fmtlist[key] = self._convert_dates[key] + + def write_file(self): + self._write_header(time_stamp=self._time_stamp, + data_label=self._data_label) + self._write_descriptors() + self._write_variable_labels() + # write 5 zeros for expansion fields + self._write(_pad_bytes("", 5)) + if self._convert_dates is None: + self._write_data_nodates() + else: + self._write_data_dates() + #self._write_value_labels() + self._file.close() + + def _write_header(self, data_label=None, time_stamp=None): + byteorder = self._byteorder + # ds_format - just use 114 + self._file.write(struct.pack("b", 114)) + # byteorder + self._write(byteorder == ">" and "\x01" or "\x02") + # filetype + self._write("\x01") + # unused + self._write("\x00") + # number of vars, 2 bytes + self._file.write(struct.pack(byteorder+"h", self.nvar)[:2]) + # number of obs, 4 bytes + self._file.write(struct.pack(byteorder+"i", self.nobs)[:4]) + # data label 81 bytes, char, null terminated + if data_label is None: + self._file.write(self._null_terminate(_pad_bytes("", 80))) + else: + self._file.write( + self._null_terminate(_pad_bytes(data_label[:80], 80)) + ) + # time stamp, 18 bytes, char, null terminated + # format dd Mon yyyy hh:mm + if time_stamp is None: + time_stamp = datetime.datetime.now() + elif not isinstance(time_stamp, datetime.datetime): + raise ValueError("time_stamp should be datetime type") + self._file.write( + self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M")) + ) + + def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, + fmtlist=None, lbllist=None): + nvar = self.nvar + # typlist, length nvar, format byte array + for typ in self.typlist: + self._write(typ) + + # varlist names are checked by _check_column_names + # varlist, requires null terminated + for name in self.varlist: + name = self._null_terminate(name, True) + name = _pad_bytes(name[:32], 33) + self._write(name) + + # srtlist, 2*(nvar+1), int array, encoded by byteorder + srtlist = _pad_bytes("", (2*(nvar+1))) + self._write(srtlist) + + # fmtlist, 49*nvar, char array + for fmt in self.fmtlist: + self._write(_pad_bytes(fmt, 49)) + + # lbllist, 33*nvar, char array + #NOTE: this is where you could get fancy with pandas categorical type + for i in range(nvar): + self._write(_pad_bytes("", 33)) + + def 
_write_variable_labels(self, labels=None): + nvar = self.nvar + if labels is None: + for i in range(nvar): + self._write(_pad_bytes("", 81)) + + def _write_data_nodates(self): + data = self.datarows + byteorder = self._byteorder + TYPE_MAP = self.TYPE_MAP + typlist = self.typlist + for row in data: + #row = row.squeeze().tolist() # needed for structured arrays + for i, var in enumerate(row): + typ = ord(typlist[i]) + if typ <= 244: # we've got a string + if var is None or var == np.nan: + var = _pad_bytes('', typ) + if len(var) < typ: + var = _pad_bytes(var, typ) + if compat.PY3: + self._write(var) + else: + self._write(var.encode(self._encoding)) + else: + try: + self._file.write(struct.pack(byteorder + TYPE_MAP[typ], + var)) + except struct.error: + # have to be strict about type pack won't do any + # kind of casting + self._file.write(struct.pack(byteorder+TYPE_MAP[typ], + self.type_converters[typ](var))) + + def _write_data_dates(self): + convert_dates = self._convert_dates + data = self.datarows + byteorder = self._byteorder + TYPE_MAP = self.TYPE_MAP + MISSING_VALUES = self.MISSING_VALUES + typlist = self.typlist + for row in data: + #row = row.squeeze().tolist() # needed for structured arrays + for i, var in enumerate(row): + typ = ord(typlist[i]) + #NOTE: If anyone finds this terribly slow, there is + # a vectorized way to convert dates, see genfromdta for going + # from int to datetime and reverse it. will copy data though + if i in convert_dates: + var = _datetime_to_stata_elapsed(var, self.fmtlist[i]) + if typ <= 244: # we've got a string + if len(var) < typ: + var = _pad_bytes(var, typ) + if compat.PY3: + self._write(var) + else: + self._write(var.encode(self._encoding)) + else: + self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) + + def _null_terminate(self, s, as_string=False): + null_byte = '\x00' + if compat.PY3 and not as_string: + s += null_byte + return s.encode(self._encoding) + else: + s += null_byte + return s diff --git a/pandas/io/tests/__init__.py b/pandas/io/tests/__init__.py new file mode 100644 index 00000000..e6089154 --- /dev/null +++ b/pandas/io/tests/__init__.py @@ -0,0 +1,4 @@ + +def setUp(): + import socket + socket.setdefaulttimeout(5) diff --git a/pandas/io/tests/data/banklist.csv b/pandas/io/tests/data/banklist.csv new file mode 100644 index 00000000..e7900830 --- /dev/null +++ b/pandas/io/tests/data/banklist.csv @@ -0,0 +1,507 @@ +Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date +Banks of Wisconsin d/b/a Bank of Kenosha,Kenosha,WI,35386,"North Shore Bank, FSB",31-May-13,31-May-13 +Central Arizona Bank,Scottsdale,AZ,34527,Western State Bank,14-May-13,20-May-13 +Sunrise Bank,Valdosta,GA,58185,Synovus Bank,10-May-13,21-May-13 +Pisgah Community Bank,Asheville,NC,58701,"Capital Bank, N.A.",10-May-13,14-May-13 +Douglas County Bank,Douglasville,GA,21649,Hamilton State Bank,26-Apr-13,16-May-13 +Parkway Bank,Lenoir,NC,57158,"CertusBank, National Association",26-Apr-13,17-May-13 +Chipola Community Bank,Marianna,FL,58034,First Federal Bank of Florida,19-Apr-13,16-May-13 +Heritage Bank of North Florida,Orange Park,FL,26680,FirstAtlantic Bank,19-Apr-13,16-May-13 +First Federal Bank,Lexington,KY,29594,Your Community Bank,19-Apr-13,23-Apr-13 +Gold Canyon Bank,Gold Canyon,AZ,58066,"First Scottsdale Bank, National Association",5-Apr-13,9-Apr-13 +Frontier Bank,LaGrange,GA,16431,HeritageBank of the South,8-Mar-13,26-Mar-13 +Covenant Bank,Chicago,IL,22476,Liberty Bank and Trust Company,15-Feb-13,4-Mar-13 +1st Regents 
Bank,Andover,MN,57157,First Minnesota Bank,18-Jan-13,28-Feb-13 +Westside Community Bank,University Place,WA,33997,Sunwest Bank,11-Jan-13,24-Jan-13 +Community Bank of the Ozarks,Sunrise Beach,MO,27331,Bank of Sullivan,14-Dec-12,24-Jan-13 +Hometown Community Bank,Braselton,GA,57928,"CertusBank, National Association",16-Nov-12,24-Jan-13 +Citizens First National Bank,Princeton,IL,3731,Heartland Bank and Trust Company,2-Nov-12,24-Jan-13 +Heritage Bank of Florida,Lutz,FL,35009,Centennial Bank,2-Nov-12,24-Jan-13 +NOVA Bank,Berwyn,PA,27148,No Acquirer,26-Oct-12,24-Jan-13 +Excel Bank,Sedalia,MO,19189,Simmons First National Bank,19-Oct-12,24-Jan-13 +First East Side Savings Bank,Tamarac,FL,28144,Stearns Bank N.A.,19-Oct-12,24-Jan-13 +GulfSouth Private Bank,Destin,FL,58073,SmartBank,19-Oct-12,24-Jan-13 +First United Bank,Crete,IL,20685,"Old Plank Trail Community Bank, National Association",28-Sep-12,15-Nov-12 +Truman Bank,St. Louis,MO,27316,Simmons First National Bank,14-Sep-12,17-Dec-12 +First Commercial Bank,Bloomington,MN,35246,Republic Bank & Trust Company,7-Sep-12,17-Dec-12 +Waukegan Savings Bank,Waukegan,IL,28243,First Midwest Bank,3-Aug-12,11-Oct-12 +Jasper Banking Company,Jasper,GA,16240,Stearns Bank N.A.,27-Jul-12,17-Dec-12 +Second Federal Savings and Loan Association of Chicago,Chicago,IL,27986,Hinsdale Bank & Trust Company,20-Jul-12,14-Jan-13 +Heartland Bank,Leawood,KS,1361,Metcalf Bank,20-Jul-12,17-Dec-12 +First Cherokee State Bank,Woodstock,GA,32711,Community & Southern Bank,20-Jul-12,31-Oct-12 +Georgia Trust Bank,Buford,GA,57847,Community & Southern Bank,20-Jul-12,17-Dec-12 +The Royal Palm Bank of Florida,Naples,FL,57096,First National Bank of the Gulf Coast,20-Jul-12,7-Jan-13 +Glasgow Savings Bank,Glasgow,MO,1056,Regional Missouri Bank,13-Jul-12,11-Oct-12 +Montgomery Bank & Trust,Ailey,GA,19498,Ameris Bank,6-Jul-12,31-Oct-12 +The Farmers Bank of Lynchburg,Lynchburg,TN,1690,Clayton Bank and Trust,15-Jun-12,31-Oct-12 +Security Exchange Bank,Marietta,GA,35299,Fidelity Bank,15-Jun-12,10-Oct-12 +Putnam State Bank,Palatka,FL,27405,Harbor Community Bank,15-Jun-12,10-Oct-12 +Waccamaw Bank,Whiteville,NC,34515,First Community Bank,8-Jun-12,8-Nov-12 +Farmers' and Traders' State Bank,Shabbona,IL,9257,First State Bank,8-Jun-12,10-Oct-12 +Carolina Federal Savings Bank,Charleston,SC,35372,Bank of North Carolina,8-Jun-12,31-Oct-12 +First Capital Bank,Kingfisher,OK,416,F & M Bank,8-Jun-12,10-Oct-12 +"Alabama Trust Bank, National Association",Sylacauga,AL,35224,Southern States Bank,18-May-12,20-May-13 +"Security Bank, National Association",North Lauderdale,FL,23156,Banesco USA,4-May-12,31-Oct-12 +Palm Desert National Bank,Palm Desert,CA,23632,Pacific Premier Bank,27-Apr-12,17-May-13 +Plantation Federal Bank,Pawleys Island,SC,32503,First Federal Bank,27-Apr-12,17-May-13 +"Inter Savings Bank, fsb D/B/A InterBank, fsb",Maple Grove,MN,31495,Great Southern Bank,27-Apr-12,17-May-13 +HarVest Bank of Maryland,Gaithersburg,MD,57766,Sonabank,27-Apr-12,17-May-13 +Bank of the Eastern Shore,Cambridge,MD,26759,No Acquirer,27-Apr-12,17-Oct-12 +"Fort Lee Federal Savings Bank, FSB",Fort Lee,NJ,35527,Alma Bank,20-Apr-12,17-May-13 +Fidelity Bank,Dearborn,MI,33883,The Huntington National Bank,30-Mar-12,16-May-13 +Premier Bank,Wilmette,IL,35419,International Bank of Chicago,23-Mar-12,17-Oct-12 +Covenant Bank & Trust,Rock Spring,GA,58068,"Stearns Bank, N.A.",23-Mar-12,31-Oct-12 +New City Bank,Chicago,IL,57597,No Acquirer,9-Mar-12,29-Oct-12 +Global Commerce Bank,Doraville,GA,34046,Metro City Bank,2-Mar-12,31-Oct-12 +Home 
Savings of America,Little Falls,MN,29178,No Acquirer,24-Feb-12,17-Dec-12 +Central Bank of Georgia,Ellaville,GA,5687,Ameris Bank,24-Feb-12,9-Aug-12 +SCB Bank,Shelbyville,IN,29761,"First Merchants Bank, National Association",10-Feb-12,25-Mar-13 +Charter National Bank and Trust,Hoffman Estates,IL,23187,"Barrington Bank & Trust Company, National Association",10-Feb-12,25-Mar-13 +BankEast,Knoxville,TN,19869,U.S.Bank National Association,27-Jan-12,8-Mar-13 +Patriot Bank Minnesota,Forest Lake,MN,34823,First Resource Bank,27-Jan-12,12-Sep-12 +Tennessee Commerce Bank,Franklin,TN,35296,Republic Bank & Trust Company,27-Jan-12,20-Nov-12 +First Guaranty Bank and Trust Company of Jacksonville,Jacksonville,FL,16579,"CenterState Bank of Florida, N.A.",27-Jan-12,12-Sep-12 +American Eagle Savings Bank,Boothwyn,PA,31581,"Capital Bank, N.A.",20-Jan-12,25-Jan-13 +The First State Bank,Stockbridge,GA,19252,Hamilton State Bank,20-Jan-12,25-Jan-13 +Central Florida State Bank,Belleview,FL,57186,"CenterState Bank of Florida, N.A.",20-Jan-12,25-Jan-13 +Western National Bank,Phoenix,AZ,57917,Washington Federal,16-Dec-11,13-Aug-12 +Premier Community Bank of the Emerald Coast,Crestview,FL,58343,Summit Bank,16-Dec-11,12-Sep-12 +Central Progressive Bank,Lacombe,LA,19657,First NBC Bank,18-Nov-11,13-Aug-12 +Polk County Bank,Johnston,IA,14194,Grinnell State Bank,18-Nov-11,15-Aug-12 +Community Bank of Rockmart,Rockmart,GA,57860,Century Bank of Georgia,10-Nov-11,13-Aug-12 +SunFirst Bank,Saint George,UT,57087,Cache Valley Bank,4-Nov-11,16-Nov-12 +"Mid City Bank, Inc.",Omaha,NE,19397,Premier Bank,4-Nov-11,15-Aug-12 +All American Bank,Des Plaines,IL,57759,International Bank of Chicago,28-Oct-11,15-Aug-12 +Community Banks of Colorado,Greenwood Village,CO,21132,"Bank Midwest, N.A.",21-Oct-11,2-Jan-13 +Community Capital Bank,Jonesboro,GA,57036,State Bank and Trust Company,21-Oct-11,8-Nov-12 +Decatur First Bank,Decatur,GA,34392,Fidelity Bank,21-Oct-11,8-Nov-12 +Old Harbor Bank,Clearwater,FL,57537,1st United Bank,21-Oct-11,8-Nov-12 +Country Bank,Aledo,IL,35395,Blackhawk Bank & Trust,14-Oct-11,15-Aug-12 +First State Bank,Cranford,NJ,58046,Northfield Bank,14-Oct-11,8-Nov-12 +"Blue Ridge Savings Bank, Inc.",Asheville,NC,32347,Bank of North Carolina,14-Oct-11,8-Nov-12 +Piedmont Community Bank,Gray,GA,57256,State Bank and Trust Company,14-Oct-11,22-Jan-13 +Sun Security Bank,Ellington,MO,20115,Great Southern Bank,7-Oct-11,7-Nov-12 +The RiverBank,Wyoming,MN,10216,Central Bank,7-Oct-11,7-Nov-12 +First International Bank,Plano,TX,33513,American First National Bank,30-Sep-11,9-Oct-12 +Citizens Bank of Northern California,Nevada City,CA,33983,Tri Counties Bank,23-Sep-11,9-Oct-12 +Bank of the Commonwealth,Norfolk,VA,20408,Southern Bank and Trust Company,23-Sep-11,9-Oct-12 +The First National Bank of Florida,Milton,FL,25155,CharterBank,9-Sep-11,6-Sep-12 +CreekSide Bank,Woodstock,GA,58226,Georgia Commerce Bank,2-Sep-11,6-Sep-12 +Patriot Bank of Georgia,Cumming,GA,58273,Georgia Commerce Bank,2-Sep-11,2-Nov-12 +First Choice Bank,Geneva,IL,57212,Inland Bank & Trust,19-Aug-11,15-Aug-12 +First Southern National Bank,Statesboro,GA,57239,Heritage Bank of the South,19-Aug-11,2-Nov-12 +Lydian Private Bank,Palm Beach,FL,35356,"Sabadell United Bank, N.A.",19-Aug-11,2-Nov-12 +Public Savings Bank,Huntingdon Valley,PA,34130,"Capital Bank, N.A.",18-Aug-11,15-Aug-12 +The First National Bank of Olathe,Olathe,KS,4744,Enterprise Bank & Trust,12-Aug-11,23-Aug-12 +Bank of Whitman,Colfax,WA,22528,Columbia State Bank,5-Aug-11,16-Aug-12 +Bank of 
Shorewood,Shorewood,IL,22637,Heartland Bank and Trust Company,5-Aug-11,16-Aug-12 +Integra Bank National Association,Evansville,IN,4392,Old National Bank,29-Jul-11,16-Aug-12 +"BankMeridian, N.A.",Columbia,SC,58222,SCBT National Association,29-Jul-11,2-Nov-12 +Virginia Business Bank,Richmond,VA,58283,Xenith Bank,29-Jul-11,9-Oct-12 +Bank of Choice,Greeley,CO,2994,"Bank Midwest, N.A.",22-Jul-11,12-Sep-12 +LandMark Bank of Florida,Sarasota,FL,35244,American Momentum Bank,22-Jul-11,2-Nov-12 +Southshore Community Bank,Apollo Beach,FL,58056,American Momentum Bank,22-Jul-11,2-Nov-12 +Summit Bank,Prescott,AZ,57442,The Foothills Bank,15-Jul-11,16-Aug-12 +First Peoples Bank,Port St. Lucie,FL,34870,"Premier American Bank, N.A.",15-Jul-11,2-Nov-12 +High Trust Bank,Stockbridge,GA,19554,Ameris Bank,15-Jul-11,2-Nov-12 +One Georgia Bank,Atlanta,GA,58238,Ameris Bank,15-Jul-11,2-Nov-12 +Signature Bank,Windsor,CO,57835,Points West Community Bank,8-Jul-11,26-Oct-12 +Colorado Capital Bank,Castle Rock,CO,34522,First-Citizens Bank & Trust Company,8-Jul-11,15-Jan-13 +First Chicago Bank & Trust,Chicago,IL,27935,Northbrook Bank & Trust Company,8-Jul-11,9-Sep-12 +Mountain Heritage Bank,Clayton,GA,57593,First American Bank and Trust Company,24-Jun-11,2-Nov-12 +First Commercial Bank of Tampa Bay,Tampa,FL,27583,Stonegate Bank,17-Jun-11,2-Nov-12 +McIntosh State Bank,Jackson,GA,19237,Hamilton State Bank,17-Jun-11,2-Nov-12 +Atlantic Bank and Trust,Charleston,SC,58420,"First Citizens Bank and Trust Company, Inc.",3-Jun-11,31-Oct-12 +First Heritage Bank,Snohomish,WA,23626,Columbia State Bank,27-May-11,28-Jan-13 +Summit Bank,Burlington,WA,513,Columbia State Bank,20-May-11,22-Jan-13 +First Georgia Banking Company,Franklin,GA,57647,"CertusBank, National Association",20-May-11,13-Nov-12 +Atlantic Southern Bank,Macon,GA,57213,"CertusBank, National Association",20-May-11,31-Oct-12 +Coastal Bank,Cocoa Beach,FL,34898,"Florida Community Bank, a division of Premier American Bank, N.A.",6-May-11,30-Nov-12 +Community Central Bank,Mount Clemens,MI,34234,Talmer Bank & Trust,29-Apr-11,16-Aug-12 +The Park Avenue Bank,Valdosta,GA,19797,Bank of the Ozarks,29-Apr-11,30-Nov-12 +First Choice Community Bank,Dallas,GA,58539,Bank of the Ozarks,29-Apr-11,22-Jan-13 +Cortez Community Bank,Brooksville,FL,57625,"Florida Community Bank, a division of Premier American Bank, N.A.",29-Apr-11,30-Nov-12 +First National Bank of Central Florida,Winter Park,FL,26297,"Florida Community Bank, a division of Premier American Bank, N.A.",29-Apr-11,30-Nov-12 +Heritage Banking Group,Carthage,MS,14273,Trustmark National Bank,15-Apr-11,30-Nov-12 +Rosemount National Bank,Rosemount,MN,24099,Central Bank,15-Apr-11,16-Aug-12 +Superior Bank,Birmingham,AL,17750,"Superior Bank, National Association",15-Apr-11,30-Nov-12 +Nexity Bank,Birmingham,AL,19794,AloStar Bank of Commerce,15-Apr-11,4-Sep-12 +New Horizons Bank,East Ellijay,GA,57705,Citizens South Bank,15-Apr-11,16-Aug-12 +Bartow County Bank,Cartersville,GA,21495,Hamilton State Bank,15-Apr-11,22-Jan-13 +Nevada Commerce Bank,Las Vegas,NV,35418,City National Bank,8-Apr-11,9-Sep-12 +Western Springs National Bank and Trust,Western Springs,IL,10086,Heartland Bank and Trust Company,8-Apr-11,22-Jan-13 +The Bank of Commerce,Wood Dale,IL,34292,Advantage National Bank Group,25-Mar-11,22-Jan-13 +Legacy Bank,Milwaukee,WI,34818,Seaway Bank and Trust Company,11-Mar-11,12-Sep-12 +First National Bank of Davis,Davis,OK,4077,The Pauls Valley National Bank,11-Mar-11,20-Aug-12 +Valley Community Bank,St. 
Charles,IL,34187,First State Bank,25-Feb-11,12-Sep-12 +"San Luis Trust Bank, FSB",San Luis Obispo,CA,34783,First California Bank,18-Feb-11,20-Aug-12 +Charter Oak Bank,Napa,CA,57855,Bank of Marin,18-Feb-11,12-Sep-12 +Citizens Bank of Effingham,Springfield,GA,34601,Heritage Bank of the South,18-Feb-11,2-Nov-12 +Habersham Bank,Clarkesville,GA,151,SCBT National Association,18-Feb-11,2-Nov-12 +Canyon National Bank,Palm Springs,CA,34692,Pacific Premier Bank,11-Feb-11,12-Sep-12 +Badger State Bank,Cassville,WI,13272,Royal Bank,11-Feb-11,12-Sep-12 +Peoples State Bank,Hamtramck,MI,14939,First Michigan Bank,11-Feb-11,22-Jan-13 +Sunshine State Community Bank,Port Orange,FL,35478,"Premier American Bank, N.A.",11-Feb-11,2-Nov-12 +Community First Bank Chicago,Chicago,IL,57948,Northbrook Bank & Trust Company,4-Feb-11,20-Aug-12 +North Georgia Bank,Watkinsville,GA,35242,BankSouth,4-Feb-11,2-Nov-12 +American Trust Bank,Roswell,GA,57432,Renasant Bank,4-Feb-11,31-Oct-12 +First Community Bank,Taos,NM,12261,"U.S. Bank, N.A.",28-Jan-11,12-Sep-12 +FirsTier Bank,Louisville,CO,57646,No Acquirer,28-Jan-11,12-Sep-12 +Evergreen State Bank,Stoughton,WI,5328,McFarland State Bank,28-Jan-11,12-Sep-12 +The First State Bank,Camargo,OK,2303,Bank 7,28-Jan-11,12-Sep-12 +United Western Bank,Denver,CO,31293,First-Citizens Bank & Trust Company,21-Jan-11,12-Sep-12 +The Bank of Asheville,Asheville,NC,34516,First Bank,21-Jan-11,2-Nov-12 +CommunitySouth Bank & Trust,Easley,SC,57868,"CertusBank, National Association",21-Jan-11,2-Nov-12 +Enterprise Banking Company,McDonough,GA,19758,No Acquirer,21-Jan-11,2-Nov-12 +Oglethorpe Bank,Brunswick,GA,57440,Bank of the Ozarks,14-Jan-11,2-Nov-12 +Legacy Bank,Scottsdale,AZ,57820,Enterprise Bank & Trust,7-Jan-11,12-Sep-12 +First Commercial Bank of Florida,Orlando,FL,34965,First Southern Bank,7-Jan-11,2-Nov-12 +Community National Bank,Lino Lakes,MN,23306,Farmers & Merchants Savings Bank,17-Dec-10,20-Aug-12 +First Southern Bank,Batesville,AR,58052,Southern Bank,17-Dec-10,20-Aug-12 +"United Americas Bank, N.A.",Atlanta,GA,35065,State Bank and Trust Company,17-Dec-10,2-Nov-12 +"Appalachian Community Bank, FSB",McCaysville,GA,58495,Peoples Bank of East Tennessee,17-Dec-10,31-Oct-12 +Chestatee State Bank,Dawsonville,GA,34578,Bank of the Ozarks,17-Dec-10,2-Nov-12 +"The Bank of Miami,N.A.",Coral Gables,FL,19040,1st United Bank,17-Dec-10,2-Nov-12 +Earthstar Bank,Southampton,PA,35561,Polonia Bank,10-Dec-10,20-Aug-12 +Paramount Bank,Farmington Hills,MI,34673,Level One Bank,10-Dec-10,20-Aug-12 +First Banking Center,Burlington,WI,5287,First Michigan Bank,19-Nov-10,20-Aug-12 +Allegiance Bank of North America,Bala Cynwyd,PA,35078,VIST Bank,19-Nov-10,20-Aug-12 +Gulf State Community Bank,Carrabelle,FL,20340,Centennial Bank,19-Nov-10,2-Nov-12 +Copper Star Bank,Scottsdale,AZ,35463,"Stearns Bank, N.A.",12-Nov-10,20-Aug-12 +Darby Bank & Trust Co.,Vidalia,GA,14580,Ameris Bank,12-Nov-10,15-Jan-13 +Tifton Banking Company,Tifton,GA,57831,Ameris Bank,12-Nov-10,2-Nov-12 +First Vietnamese American Bank,Westminster,CA,57885,Grandpoint Bank,5-Nov-10,12-Sep-12 +Pierce Commercial Bank,Tacoma,WA,34411,Heritage Bank,5-Nov-10,20-Aug-12 +Western Commercial Bank,Woodland Hills,CA,58087,First California Bank,5-Nov-10,12-Sep-12 +K Bank,Randallstown,MD,31263,Manufacturers and Traders Trust Company (M&T Bank),5-Nov-10,20-Aug-12 +"First Arizona Savings, A FSB",Scottsdale,AZ,32582,No Acquirer,22-Oct-10,20-Aug-12 +Hillcrest Bank,Overland Park,KS,22173,"Hillcrest Bank, N.A.",22-Oct-10,20-Aug-12 +First Suburban National 
Bank,Maywood,IL,16089,Seaway Bank and Trust Company,22-Oct-10,20-Aug-12 +The First National Bank of Barnesville,Barnesville,GA,2119,United Bank,22-Oct-10,2-Nov-12 +The Gordon Bank,Gordon,GA,33904,Morris Bank,22-Oct-10,2-Nov-12 +Progress Bank of Florida,Tampa,FL,32251,Bay Cities Bank,22-Oct-10,2-Nov-12 +First Bank of Jacksonville,Jacksonville,FL,27573,Ameris Bank,22-Oct-10,2-Nov-12 +Premier Bank,Jefferson City,MO,34016,Providence Bank,15-Oct-10,20-Aug-12 +WestBridge Bank and Trust Company,Chesterfield,MO,58205,Midland States Bank,15-Oct-10,20-Aug-12 +"Security Savings Bank, F.S.B.",Olathe,KS,30898,Simmons First National Bank,15-Oct-10,20-Aug-12 +Shoreline Bank,Shoreline,WA,35250,GBC International Bank,1-Oct-10,20-Aug-12 +Wakulla Bank,Crawfordville,FL,21777,Centennial Bank,1-Oct-10,2-Nov-12 +North County Bank,Arlington,WA,35053,Whidbey Island Bank,24-Sep-10,20-Aug-12 +Haven Trust Bank Florida,Ponte Vedra Beach,FL,58308,First Southern Bank,24-Sep-10,5-Nov-12 +Maritime Savings Bank,West Allis,WI,28612,"North Shore Bank, FSB",17-Sep-10,20-Aug-12 +Bramble Savings Bank,Milford,OH,27808,Foundation Bank,17-Sep-10,20-Aug-12 +The Peoples Bank,Winder,GA,182,Community & Southern Bank,17-Sep-10,5-Nov-12 +First Commerce Community Bank,Douglasville,GA,57448,Community & Southern Bank,17-Sep-10,15-Jan-13 +Bank of Ellijay,Ellijay,GA,58197,Community & Southern Bank,17-Sep-10,15-Jan-13 +ISN Bank,Cherry Hill,NJ,57107,Customers Bank,17-Sep-10,22-Aug-12 +Horizon Bank,Bradenton,FL,35061,Bank of the Ozarks,10-Sep-10,5-Nov-12 +Sonoma Valley Bank,Sonoma,CA,27259,Westamerica Bank,20-Aug-10,12-Sep-12 +Los Padres Bank,Solvang,CA,32165,Pacific Western Bank,20-Aug-10,12-Sep-12 +Butte Community Bank,Chico,CA,33219,"Rabobank, N.A.",20-Aug-10,12-Sep-12 +Pacific State Bank,Stockton,CA,27090,"Rabobank, N.A.",20-Aug-10,12-Sep-12 +ShoreBank,Chicago,IL,15640,Urban Partnership Bank,20-Aug-10,16-May-13 +Imperial Savings and Loan Association,Martinsville,VA,31623,"River Community Bank, N.A.",20-Aug-10,24-Aug-12 +Independent National Bank,Ocala,FL,27344,"CenterState Bank of Florida, N.A.",20-Aug-10,5-Nov-12 +Community National Bank at Bartow,Bartow,FL,25266,"CenterState Bank of Florida, N.A.",20-Aug-10,5-Nov-12 +Palos Bank and Trust Company,Palos Heights,IL,17599,First Midwest Bank,13-Aug-10,22-Aug-12 +Ravenswood Bank,Chicago,IL,34231,Northbrook Bank & Trust Company,6-Aug-10,22-Aug-12 +LibertyBank,Eugene,OR,31964,Home Federal Bank,30-Jul-10,22-Aug-12 +The Cowlitz Bank,Longview,WA,22643,Heritage Bank,30-Jul-10,22-Aug-12 +Coastal Community Bank,Panama City Beach,FL,9619,Centennial Bank,30-Jul-10,5-Nov-12 +Bayside Savings Bank,Port Saint Joe,FL,57669,Centennial Bank,30-Jul-10,5-Nov-12 +Northwest Bank & Trust,Acworth,GA,57658,State Bank and Trust Company,30-Jul-10,5-Nov-12 +Home Valley Bank,Cave Junction,OR,23181,South Valley Bank & Trust,23-Jul-10,12-Sep-12 +SouthwestUSA Bank,Las Vegas,NV,35434,Plaza Bank,23-Jul-10,22-Aug-12 +Community Security Bank,New Prague,MN,34486,Roundbank,23-Jul-10,12-Sep-12 +Thunder Bank,Sylvan Grove,KS,10506,The Bennington State Bank,23-Jul-10,13-Sep-12 +Williamsburg First National Bank,Kingstree,SC,17837,"First Citizens Bank and Trust Company, Inc.",23-Jul-10,5-Nov-12 +Crescent Bank and Trust Company,Jasper,GA,27559,Renasant Bank,23-Jul-10,5-Nov-12 +Sterling Bank,Lantana,FL,32536,IBERIABANK,23-Jul-10,5-Nov-12 +"Mainstreet Savings Bank, FSB",Hastings,MI,28136,Commercial Bank,16-Jul-10,13-Sep-12 +Olde Cypress Community Bank,Clewiston,FL,28864,"CenterState Bank of Florida, N.A.",16-Jul-10,5-Nov-12 +Turnberry 
Bank,Aventura,FL,32280,NAFH National Bank,16-Jul-10,5-Nov-12 +Metro Bank of Dade County,Miami,FL,25172,NAFH National Bank,16-Jul-10,5-Nov-12 +First National Bank of the South,Spartanburg,SC,35383,NAFH National Bank,16-Jul-10,5-Nov-12 +Woodlands Bank,Bluffton,SC,32571,Bank of the Ozarks,16-Jul-10,5-Nov-12 +Home National Bank,Blackwell,OK,11636,RCB Bank,9-Jul-10,10-Dec-12 +USA Bank,Port Chester,NY,58072,New Century Bank,9-Jul-10,14-Sep-12 +Ideal Federal Savings Bank,Baltimore,MD,32456,No Acquirer,9-Jul-10,14-Sep-12 +Bay National Bank,Baltimore,MD,35462,"Bay Bank, FSB",9-Jul-10,15-Jan-13 +High Desert State Bank,Albuquerque,NM,35279,First American Bank,25-Jun-10,14-Sep-12 +First National Bank,Savannah,GA,34152,"The Savannah Bank, N.A.",25-Jun-10,5-Nov-12 +Peninsula Bank,Englewood,FL,26563,"Premier American Bank, N.A.",25-Jun-10,5-Nov-12 +Nevada Security Bank,Reno,NV,57110,Umpqua Bank,18-Jun-10,23-Aug-12 +Washington First International Bank,Seattle,WA,32955,East West Bank,11-Jun-10,14-Sep-12 +TierOne Bank,Lincoln,NE,29341,Great Western Bank,4-Jun-10,14-Sep-12 +Arcola Homestead Savings Bank,Arcola,IL,31813,No Acquirer,4-Jun-10,14-Sep-12 +First National Bank,Rosedale,MS,15814,The Jefferson Bank,4-Jun-10,5-Nov-12 +Sun West Bank,Las Vegas,NV,34785,City National Bank,28-May-10,14-Sep-12 +"Granite Community Bank, NA",Granite Bay,CA,57315,Tri Counties Bank,28-May-10,14-Sep-12 +Bank of Florida - Tampa,Tampa,FL,57814,EverBank,28-May-10,5-Nov-12 +Bank of Florida - Southwest,Naples,FL,35106,EverBank,28-May-10,5-Nov-12 +Bank of Florida - Southeast,Fort Lauderdale,FL,57360,EverBank,28-May-10,5-Nov-12 +Pinehurst Bank,Saint Paul,MN,57735,Coulee Bank,21-May-10,26-Oct-12 +Midwest Bank and Trust Company,Elmwood Park,IL,18117,"FirstMerit Bank, N.A.",14-May-10,23-Aug-12 +Southwest Community Bank,Springfield,MO,34255,Simmons First National Bank,14-May-10,23-Aug-12 +New Liberty Bank,Plymouth,MI,35586,Bank of Ann Arbor,14-May-10,23-Aug-12 +Satilla Community Bank,Saint Marys,GA,35114,Ameris Bank,14-May-10,5-Nov-12 +1st Pacific Bank of California,San Diego,CA,35517,City National Bank,7-May-10,13-Dec-12 +Towne Bank of Arizona,Mesa,AZ,57697,Commerce Bank of Arizona,7-May-10,23-Aug-12 +Access Bank,Champlin,MN,16476,PrinsBank,7-May-10,23-Aug-12 +The Bank of Bonifay,Bonifay,FL,14246,First Federal Bank of Florida,7-May-10,5-Nov-12 +Frontier Bank,Everett,WA,22710,"Union Bank, N.A.",30-Apr-10,15-Jan-13 +BC National Banks,Butler,MO,17792,Community First Bank,30-Apr-10,23-Aug-12 +Champion Bank,Creve Coeur,MO,58362,BankLiberty,30-Apr-10,23-Aug-12 +CF Bancorp,Port Huron,MI,30005,First Michigan Bank,30-Apr-10,15-Jan-13 +Westernbank Puerto Rico,Mayaguez,PR,31027,Banco Popular de Puerto Rico,30-Apr-10,5-Nov-12 +R-G Premier Bank of Puerto Rico,Hato Rey,PR,32185,Scotiabank de Puerto Rico,30-Apr-10,5-Nov-12 +Eurobank,San Juan,PR,27150,Oriental Bank and Trust,30-Apr-10,5-Nov-12 +Wheatland Bank,Naperville,IL,58429,Wheaton Bank & Trust,23-Apr-10,23-Aug-12 +Peotone Bank and Trust Company,Peotone,IL,10888,First Midwest Bank,23-Apr-10,23-Aug-12 +Lincoln Park Savings Bank,Chicago,IL,30600,Northbrook Bank & Trust Company,23-Apr-10,23-Aug-12 +New Century Bank,Chicago,IL,34821,"MB Financial Bank, N.A.",23-Apr-10,23-Aug-12 +Citizens Bank and Trust Company of Chicago,Chicago,IL,34658,Republic Bank of Chicago,23-Apr-10,23-Aug-12 +Broadway Bank,Chicago,IL,22853,"MB Financial Bank, N.A.",23-Apr-10,23-Aug-12 +"Amcore Bank, National Association",Rockford,IL,3735,Harris N.A.,23-Apr-10,23-Aug-12 +City Bank,Lynnwood,WA,21521,Whidbey Island 
Bank,16-Apr-10,14-Sep-12 +Tamalpais Bank,San Rafael,CA,33493,"Union Bank, N.A.",16-Apr-10,23-Aug-12 +Innovative Bank,Oakland,CA,23876,Center Bank,16-Apr-10,23-Aug-12 +Butler Bank,Lowell,MA,26619,People's United Bank,16-Apr-10,23-Aug-12 +Riverside National Bank of Florida,Fort Pierce,FL,24067,"TD Bank, N.A.",16-Apr-10,5-Nov-12 +AmericanFirst Bank,Clermont,FL,57724,"TD Bank, N.A.",16-Apr-10,31-Oct-12 +First Federal Bank of North Florida,Palatka,FL,28886,"TD Bank, N.A.",16-Apr-10,15-Jan-13 +Lakeside Community Bank,Sterling Heights,MI,34878,No Acquirer,16-Apr-10,23-Aug-12 +Beach First National Bank,Myrtle Beach,SC,34242,Bank of North Carolina,9-Apr-10,5-Nov-12 +Desert Hills Bank,Phoenix,AZ,57060,New York Community Bank,26-Mar-10,23-Aug-12 +Unity National Bank,Cartersville,GA,34678,Bank of the Ozarks,26-Mar-10,14-Sep-12 +Key West Bank,Key West,FL,34684,Centennial Bank,26-Mar-10,23-Aug-12 +McIntosh Commercial Bank,Carrollton,GA,57399,CharterBank,26-Mar-10,23-Aug-12 +State Bank of Aurora,Aurora,MN,8221,Northern State Bank,19-Mar-10,23-Aug-12 +First Lowndes Bank,Fort Deposit,AL,24957,First Citizens Bank,19-Mar-10,23-Aug-12 +Bank of Hiawassee,Hiawassee,GA,10054,Citizens South Bank,19-Mar-10,23-Aug-12 +Appalachian Community Bank,Ellijay,GA,33989,Community & Southern Bank,19-Mar-10,31-Oct-12 +Advanta Bank Corp.,Draper,UT,33535,No Acquirer,19-Mar-10,14-Sep-12 +Century Security Bank,Duluth,GA,58104,Bank of Upson,19-Mar-10,23-Aug-12 +American National Bank,Parma,OH,18806,The National Bank and Trust Company,19-Mar-10,23-Aug-12 +Statewide Bank,Covington,LA,29561,Home Bank,12-Mar-10,23-Aug-12 +Old Southern Bank,Orlando,FL,58182,Centennial Bank,12-Mar-10,23-Aug-12 +The Park Avenue Bank,New York,NY,27096,Valley National Bank,12-Mar-10,23-Aug-12 +LibertyPointe Bank,New York,NY,58071,Valley National Bank,11-Mar-10,23-Aug-12 +Centennial Bank,Ogden,UT,34430,No Acquirer,5-Mar-10,14-Sep-12 +Waterfield Bank,Germantown,MD,34976,No Acquirer,5-Mar-10,23-Aug-12 +Bank of Illinois,Normal,IL,9268,Heartland Bank and Trust Company,5-Mar-10,23-Aug-12 +Sun American Bank,Boca Raton,FL,27126,First-Citizens Bank & Trust Company,5-Mar-10,23-Aug-12 +Rainier Pacific Bank,Tacoma,WA,38129,Umpqua Bank,26-Feb-10,23-Aug-12 +Carson River Community Bank,Carson City,NV,58352,Heritage Bank of Nevada,26-Feb-10,15-Jan-13 +"La Jolla Bank, FSB",La Jolla,CA,32423,"OneWest Bank, FSB",19-Feb-10,24-Aug-12 +George Washington Savings Bank,Orland Park,IL,29952,"FirstMerit Bank, N.A.",19-Feb-10,24-Aug-12 +The La Coste National Bank,La Coste,TX,3287,Community National Bank,19-Feb-10,14-Sep-12 +Marco Community Bank,Marco Island,FL,57586,Mutual of Omaha Bank,19-Feb-10,24-Aug-12 +1st American State Bank of Minnesota,Hancock,MN,15448,"Community Development Bank, FSB",5-Feb-10,24-Aug-12 +American Marine Bank,Bainbridge Island,WA,16730,Columbia State Bank,29-Jan-10,24-Aug-12 +First Regional Bank,Los Angeles,CA,23011,First-Citizens Bank & Trust Company,29-Jan-10,24-Aug-12 +Community Bank and Trust,Cornelia,GA,5702,SCBT National Association,29-Jan-10,15-Jan-13 +"Marshall Bank, N.A.",Hallock,MN,16133,United Valley Bank,29-Jan-10,23-Aug-12 +Florida Community Bank,Immokalee,FL,5672,"Premier American Bank, N.A.",29-Jan-10,15-Jan-13 +First National Bank of Georgia,Carrollton,GA,16480,Community & Southern Bank,29-Jan-10,13-Dec-12 +Columbia River Bank,The Dalles,OR,22469,Columbia State Bank,22-Jan-10,14-Sep-12 +Evergreen Bank,Seattle,WA,20501,Umpqua Bank,22-Jan-10,15-Jan-13 +Charter Bank,Santa Fe,NM,32498,Charter Bank,22-Jan-10,23-Aug-12 +Bank of 
Leeton,Leeton,MO,8265,"Sunflower Bank, N.A.",22-Jan-10,15-Jan-13 +Premier American Bank,Miami,FL,57147,"Premier American Bank, N.A.",22-Jan-10,13-Dec-12 +Barnes Banking Company,Kaysville,UT,1252,No Acquirer,15-Jan-10,23-Aug-12 +St. Stephen State Bank,St. Stephen,MN,17522,First State Bank of St. Joseph,15-Jan-10,23-Aug-12 +Town Community Bank & Trust,Antioch,IL,34705,First American Bank,15-Jan-10,23-Aug-12 +Horizon Bank,Bellingham,WA,22977,Washington Federal Savings and Loan Association,8-Jan-10,23-Aug-12 +"First Federal Bank of California, F.S.B.",Santa Monica,CA,28536,"OneWest Bank, FSB",18-Dec-09,23-Aug-12 +Imperial Capital Bank,La Jolla,CA,26348,City National Bank,18-Dec-09,5-Sep-12 +Independent Bankers' Bank,Springfield,IL,26820,The Independent BankersBank (TIB),18-Dec-09,23-Aug-12 +New South Federal Savings Bank,Irondale,AL,32276,Beal Bank,18-Dec-09,23-Aug-12 +Citizens State Bank,New Baltimore,MI,1006,No Acquirer,18-Dec-09,5-Nov-12 +Peoples First Community Bank,Panama City,FL,32167,Hancock Bank,18-Dec-09,5-Nov-12 +RockBridge Commercial Bank,Atlanta,GA,58315,No Acquirer,18-Dec-09,5-Nov-12 +SolutionsBank,Overland Park,KS,4731,Arvest Bank,11-Dec-09,23-Aug-12 +"Valley Capital Bank, N.A.",Mesa,AZ,58399,Enterprise Bank & Trust,11-Dec-09,23-Aug-12 +"Republic Federal Bank, N.A.",Miami,FL,22846,1st United Bank,11-Dec-09,5-Nov-12 +Greater Atlantic Bank,Reston,VA,32583,Sonabank,4-Dec-09,5-Nov-12 +Benchmark Bank,Aurora,IL,10440,"MB Financial Bank, N.A.",4-Dec-09,23-Aug-12 +AmTrust Bank,Cleveland,OH,29776,New York Community Bank,4-Dec-09,5-Nov-12 +The Tattnall Bank,Reidsville,GA,12080,Heritage Bank of the South,4-Dec-09,5-Nov-12 +First Security National Bank,Norcross,GA,26290,State Bank and Trust Company,4-Dec-09,5-Nov-12 +The Buckhead Community Bank,Atlanta,GA,34663,State Bank and Trust Company,4-Dec-09,5-Nov-12 +Commerce Bank of Southwest Florida,Fort Myers,FL,58016,Central Bank,20-Nov-09,5-Nov-12 +Pacific Coast National Bank,San Clemente,CA,57914,Sunwest Bank,13-Nov-09,22-Aug-12 +Orion Bank,Naples,FL,22427,IBERIABANK,13-Nov-09,5-Nov-12 +"Century Bank, F.S.B.",Sarasota,FL,32267,IBERIABANK,13-Nov-09,22-Aug-12 +United Commercial Bank,San Francisco,CA,32469,East West Bank,6-Nov-09,5-Nov-12 +Gateway Bank of St. Louis,St. Louis,MO,19450,Central Bank of Kansas City,6-Nov-09,22-Aug-12 +Prosperan Bank,Oakdale,MN,35074,"Alerus Financial, N.A.",6-Nov-09,22-Aug-12 +Home Federal Savings Bank,Detroit,MI,30329,Liberty Bank and Trust Company,6-Nov-09,22-Aug-12 +United Security Bank,Sparta,GA,22286,Ameris Bank,6-Nov-09,15-Jan-13 +North Houston Bank,Houston,TX,18776,U.S. Bank N.A.,30-Oct-09,22-Aug-12 +Madisonville State Bank,Madisonville,TX,33782,U.S. Bank N.A.,30-Oct-09,22-Aug-12 +Citizens National Bank,Teague,TX,25222,U.S. Bank N.A.,30-Oct-09,22-Aug-12 +Park National Bank,Chicago,IL,11677,U.S. Bank N.A.,30-Oct-09,22-Aug-12 +Pacific National Bank,San Francisco,CA,30006,U.S. Bank N.A.,30-Oct-09,22-Aug-12 +California National Bank,Los Angeles,CA,34659,U.S. Bank N.A.,30-Oct-09,5-Sep-12 +San Diego National Bank,San Diego,CA,23594,U.S. Bank N.A.,30-Oct-09,22-Aug-12 +Community Bank of Lemont,Lemont,IL,35291,U.S. Bank N.A.,30-Oct-09,15-Jan-13 +"Bank USA, N.A.",Phoenix,AZ,32218,U.S. 
Bank N.A.,30-Oct-09,22-Aug-12 +First DuPage Bank,Westmont,IL,35038,First Midwest Bank,23-Oct-09,22-Aug-12 +Riverview Community Bank,Otsego,MN,57525,Central Bank,23-Oct-09,22-Aug-12 +Bank of Elmwood,Racine,WI,18321,Tri City National Bank,23-Oct-09,22-Aug-12 +Flagship National Bank,Bradenton,FL,35044,First Federal Bank of Florida,23-Oct-09,22-Aug-12 +Hillcrest Bank Florida,Naples,FL,58336,Stonegate Bank,23-Oct-09,22-Aug-12 +American United Bank,Lawrenceville,GA,57794,Ameris Bank,23-Oct-09,5-Sep-12 +Partners Bank,Naples,FL,57959,Stonegate Bank,23-Oct-09,15-Jan-13 +San Joaquin Bank,Bakersfield,CA,23266,Citizens Business Bank,16-Oct-09,22-Aug-12 +Southern Colorado National Bank,Pueblo,CO,57263,Legacy Bank,2-Oct-09,5-Sep-12 +Jennings State Bank,Spring Grove,MN,11416,Central Bank,2-Oct-09,21-Aug-12 +Warren Bank,Warren,MI,34824,The Huntington National Bank,2-Oct-09,21-Aug-12 +Georgian Bank,Atlanta,GA,57151,"First Citizens Bank and Trust Company, Inc.",25-Sep-09,21-Aug-12 +"Irwin Union Bank, F.S.B.",Louisville,KY,57068,"First Financial Bank, N.A.",18-Sep-09,5-Sep-12 +Irwin Union Bank and Trust Company,Columbus,IN,10100,"First Financial Bank, N.A.",18-Sep-09,21-Aug-12 +Venture Bank,Lacey,WA,22868,First-Citizens Bank & Trust Company,11-Sep-09,21-Aug-12 +Brickwell Community Bank,Woodbury,MN,57736,CorTrust Bank N.A.,11-Sep-09,15-Jan-13 +"Corus Bank, N.A.",Chicago,IL,13693,"MB Financial Bank, N.A.",11-Sep-09,21-Aug-12 +First State Bank,Flagstaff,AZ,34875,Sunwest Bank,4-Sep-09,15-Jan-13 +Platinum Community Bank,Rolling Meadows,IL,35030,No Acquirer,4-Sep-09,21-Aug-12 +Vantus Bank,Sioux City,IN,27732,Great Southern Bank,4-Sep-09,21-Aug-12 +InBank,Oak Forest,IL,20203,"MB Financial Bank, N.A.",4-Sep-09,21-Aug-12 +First Bank of Kansas City,Kansas City,MO,25231,Great American Bank,4-Sep-09,21-Aug-12 +Affinity Bank,Ventura,CA,27197,Pacific Western Bank,28-Aug-09,21-Aug-12 +Mainstreet Bank,Forest Lake,MN,1909,Central Bank,28-Aug-09,21-Aug-12 +Bradford Bank,Baltimore,MD,28312,Manufacturers and Traders Trust Company (M&T Bank),28-Aug-09,15-Jan-13 +Guaranty Bank,Austin,TX,32618,BBVA Compass,21-Aug-09,21-Aug-12 +CapitalSouth Bank,Birmingham,AL,22130,IBERIABANK,21-Aug-09,15-Jan-13 +First Coweta Bank,Newnan,GA,57702,United Bank,21-Aug-09,15-Jan-13 +ebank,Atlanta,GA,34682,"Stearns Bank, N.A.",21-Aug-09,21-Aug-12 +Community Bank of Nevada,Las Vegas,NV,34043,No Acquirer,14-Aug-09,21-Aug-12 +Community Bank of Arizona,Phoenix,AZ,57645,MidFirst Bank,14-Aug-09,21-Aug-12 +"Union Bank, National Association",Gilbert,AZ,34485,MidFirst Bank,14-Aug-09,21-Aug-12 +Colonial Bank,Montgomery,AL,9609,"Branch Banking & Trust Company, (BB&T)",14-Aug-09,5-Sep-12 +Dwelling House Savings and Loan Association,Pittsburgh,PA,31559,"PNC Bank, N.A.",14-Aug-09,15-Jan-13 +Community First Bank,Prineville,OR,23268,Home Federal Bank,7-Aug-09,15-Jan-13 +Community National Bank of Sarasota County,Venice,FL,27183,"Stearns Bank, N.A.",7-Aug-09,20-Aug-12 +First State Bank,Sarasota,FL,27364,"Stearns Bank, N.A.",7-Aug-09,20-Aug-12 +Mutual Bank,Harvey,IL,18659,United Central Bank,31-Jul-09,20-Aug-12 +First BankAmericano,Elizabeth,NJ,34270,Crown Bank,31-Jul-09,20-Aug-12 +Peoples Community Bank,West Chester,OH,32288,"First Financial Bank, N.A.",31-Jul-09,20-Aug-12 +Integrity Bank,Jupiter,FL,57604,Stonegate Bank,31-Jul-09,20-Aug-12 +First State Bank of Altus,Altus,OK,9873,Herring Bank,31-Jul-09,20-Aug-12 +Security Bank of Jones County,Gray,GA,8486,State Bank and Trust Company,24-Jul-09,20-Aug-12 +Security Bank of Houston County,Perry,GA,27048,State Bank and 
Trust Company,24-Jul-09,20-Aug-12 +Security Bank of Bibb County,Macon,GA,27367,State Bank and Trust Company,24-Jul-09,20-Aug-12 +Security Bank of North Metro,Woodstock,GA,57105,State Bank and Trust Company,24-Jul-09,20-Aug-12 +Security Bank of North Fulton,Alpharetta,GA,57430,State Bank and Trust Company,24-Jul-09,20-Aug-12 +Security Bank of Gwinnett County,Suwanee,GA,57346,State Bank and Trust Company,24-Jul-09,20-Aug-12 +Waterford Village Bank,Williamsville,NY,58065,"Evans Bank, N.A.",24-Jul-09,20-Aug-12 +Temecula Valley Bank,Temecula,CA,34341,First-Citizens Bank & Trust Company,17-Jul-09,20-Aug-12 +Vineyard Bank,Rancho Cucamonga,CA,23556,California Bank & Trust,17-Jul-09,20-Aug-12 +BankFirst,Sioux Falls,SD,34103,"Alerus Financial, N.A.",17-Jul-09,20-Aug-12 +First Piedmont Bank,Winder,GA,34594,First American Bank and Trust Company,17-Jul-09,15-Jan-13 +Bank of Wyoming,Thermopolis,WY,22754,Central Bank & Trust,10-Jul-09,20-Aug-12 +Founders Bank,Worth,IL,18390,The PrivateBank and Trust Company,2-Jul-09,20-Aug-12 +Millennium State Bank of Texas,Dallas,TX,57667,State Bank of Texas,2-Jul-09,26-Oct-12 +First National Bank of Danville,Danville,IL,3644,"First Financial Bank, N.A.",2-Jul-09,20-Aug-12 +Elizabeth State Bank,Elizabeth,IL,9262,Galena State Bank and Trust Company,2-Jul-09,20-Aug-12 +Rock River Bank,Oregon,IL,15302,The Harvard State Bank,2-Jul-09,20-Aug-12 +First State Bank of Winchester,Winchester,IL,11710,The First National Bank of Beardstown,2-Jul-09,20-Aug-12 +John Warner Bank,Clinton,IL,12093,State Bank of Lincoln,2-Jul-09,20-Aug-12 +Mirae Bank,Los Angeles,CA,57332,Wilshire State Bank,26-Jun-09,20-Aug-12 +MetroPacific Bank,Irvine,CA,57893,Sunwest Bank,26-Jun-09,20-Aug-12 +Horizon Bank,Pine City,MN,9744,"Stearns Bank, N.A.",26-Jun-09,20-Aug-12 +Neighborhood Community Bank,Newnan,GA,35285,CharterBank,26-Jun-09,20-Aug-12 +Community Bank of West Georgia,Villa Rica,GA,57436,No Acquirer,26-Jun-09,17-Aug-12 +First National Bank of Anthony,Anthony,KS,4614,Bank of Kansas,19-Jun-09,17-Aug-12 +Cooperative Bank,Wilmington,NC,27837,First Bank,19-Jun-09,17-Aug-12 +Southern Community Bank,Fayetteville,GA,35251,United Community Bank,19-Jun-09,17-Aug-12 +Bank of Lincolnwood,Lincolnwood,IL,17309,Republic Bank of Chicago,5-Jun-09,17-Aug-12 +Citizens National Bank,Macomb,IL,5757,Morton Community Bank,22-May-09,4-Sep-12 +Strategic Capital Bank,Champaign,IL,35175,Midland States Bank,22-May-09,4-Sep-12 +"BankUnited, FSB",Coral Gables,FL,32247,BankUnited,21-May-09,17-Aug-12 +Westsound Bank,Bremerton,WA,34843,Kitsap Bank,8-May-09,4-Sep-12 +America West Bank,Layton,UT,35461,Cache Valley Bank,1-May-09,17-Aug-12 +Citizens Community Bank,Ridgewood,NJ,57563,North Jersey Community Bank,1-May-09,4-Sep-12 +"Silverton Bank, NA",Atlanta,GA,26535,No Acquirer,1-May-09,17-Aug-12 +First Bank of Idaho,Ketchum,ID,34396,"U.S. 
Bank, N.A.",24-Apr-09,17-Aug-12 +First Bank of Beverly Hills,Calabasas,CA,32069,No Acquirer,24-Apr-09,4-Sep-12 +Michigan Heritage Bank,Farmington Hills,MI,34369,Level One Bank,24-Apr-09,17-Aug-12 +American Southern Bank,Kennesaw,GA,57943,Bank of North Georgia,24-Apr-09,17-Aug-12 +Great Basin Bank of Nevada,Elko,NV,33824,Nevada State Bank,17-Apr-09,4-Sep-12 +American Sterling Bank,Sugar Creek,MO,8266,Metcalf Bank,17-Apr-09,31-Aug-12 +New Frontier Bank,Greeley,CO,34881,No Acquirer,10-Apr-09,4-Sep-12 +Cape Fear Bank,Wilmington,NC,34639,First Federal Savings and Loan Association,10-Apr-09,17-Aug-12 +Omni National Bank,Atlanta,GA,22238,No Acquirer,27-Mar-09,17-Aug-12 +"TeamBank, NA",Paola,KS,4754,Great Southern Bank,20-Mar-09,17-Aug-12 +Colorado National Bank,Colorado Springs,CO,18896,Herring Bank,20-Mar-09,17-Aug-12 +FirstCity Bank,Stockbridge,GA,18243,No Acquirer,20-Mar-09,17-Aug-12 +Freedom Bank of Georgia,Commerce,GA,57558,Northeast Georgia Bank,6-Mar-09,17-Aug-12 +Security Savings Bank,Henderson,NV,34820,Bank of Nevada,27-Feb-09,7-Sep-12 +Heritage Community Bank,Glenwood,IL,20078,"MB Financial Bank, N.A.",27-Feb-09,17-Aug-12 +Silver Falls Bank,Silverton,OR,35399,Citizens Bank,20-Feb-09,17-Aug-12 +Pinnacle Bank of Oregon,Beaverton,OR,57342,Washington Trust Bank of Spokane,13-Feb-09,17-Aug-12 +Corn Belt Bank & Trust Co.,Pittsfield,IL,16500,The Carlinville National Bank,13-Feb-09,17-Aug-12 +Riverside Bank of the Gulf Coast,Cape Coral,FL,34563,TIB Bank,13-Feb-09,17-Aug-12 +Sherman County Bank,Loup City,NE,5431,Heritage Bank,13-Feb-09,17-Aug-12 +County Bank,Merced,CA,22574,Westamerica Bank,6-Feb-09,4-Sep-12 +Alliance Bank,Culver City,CA,23124,California Bank & Trust,6-Feb-09,16-Aug-12 +FirstBank Financial Services,McDonough,GA,57017,Regions Bank,6-Feb-09,16-Aug-12 +Ocala National Bank,Ocala,FL,26538,"CenterState Bank of Florida, N.A.",30-Jan-09,4-Sep-12 +Suburban FSB,Crofton,MD,30763,Bank of Essex,30-Jan-09,16-Aug-12 +MagnetBank,Salt Lake City,UT,58001,No Acquirer,30-Jan-09,16-Aug-12 +1st Centennial Bank,Redlands,CA,33025,First California Bank,23-Jan-09,16-Aug-12 +Bank of Clark County,Vancouver,WA,34959,Umpqua Bank,16-Jan-09,16-Aug-12 +National Bank of Commerce,Berkeley,IL,19733,Republic Bank of Chicago,16-Jan-09,16-Aug-12 +Sanderson State Bank,Sanderson,TX,11568,The Pecos County State Bank,12-Dec-08,4-Sep-12 +Haven Trust Bank,Duluth,GA,35379,"Branch Banking & Trust Company, (BB&T)",12-Dec-08,16-Aug-12 +First Georgia Community Bank,Jackson,GA,34301,United Bank,5-Dec-08,16-Aug-12 +PFF Bank & Trust,Pomona,CA,28344,"U.S. Bank, N.A.",21-Nov-08,4-Jan-13 +Downey Savings & Loan,Newport Beach,CA,30968,"U.S. 
Bank, N.A.",21-Nov-08,4-Jan-13 +Community Bank,Loganville,GA,16490,Bank of Essex,21-Nov-08,4-Sep-12 +Security Pacific Bank,Los Angeles,CA,23595,Pacific Western Bank,7-Nov-08,28-Aug-12 +"Franklin Bank, SSB",Houston,TX,26870,Prosperity Bank,7-Nov-08,16-Aug-12 +Freedom Bank,Bradenton,FL,57930,Fifth Third Bank,31-Oct-08,16-Aug-12 +Alpha Bank & Trust,Alpharetta,GA,58241,"Stearns Bank, N.A.",24-Oct-08,16-Aug-12 +Meridian Bank,Eldred,IL,13789,National Bank,10-Oct-08,31-May-12 +Main Street Bank,Northville,MI,57654,Monroe Bank & Trust,10-Oct-08,16-Aug-12 +Washington Mutual Bank,Henderson,NV,32633,JP Morgan Chase Bank,25-Sep-08,16-Aug-12 +Ameribank,Northfork,WV,6782,The Citizens Savings Bank,19-Sep-08,16-Aug-12 +Silver State Bank,Henderson,NV,34194,Nevada State Bank,5-Sep-08,16-Aug-12 +Integrity Bank,Alpharetta,GA,35469,Regions Bank,29-Aug-08,16-Aug-12 +Columbian Bank & Trust,Topeka,KS,22728,Citizens Bank & Trust,22-Aug-08,16-Aug-12 +First Priority Bank,Bradenton,FL,57523,SunTrust Bank,1-Aug-08,16-Aug-12 +"First Heritage Bank, NA",Newport Beach,CA,57961,Mutual of Omaha Bank,25-Jul-08,28-Aug-12 +First National Bank of Nevada,Reno,NV,27011,Mutual of Omaha Bank,25-Jul-08,28-Aug-12 +IndyMac Bank,Pasadena,CA,29730,"OneWest Bank, FSB",11-Jul-08,28-Aug-12 +"First Integrity Bank, NA",Staples,MN,12736,First International Bank and Trust,30-May-08,28-Aug-12 +"ANB Financial, NA",Bentonville,AR,33901,Pulaski Bank and Trust Company,9-May-08,28-Aug-12 +Hume Bank,Hume,MO,1971,Security Bank,7-Mar-08,28-Aug-12 +Douglass National Bank,Kansas City,MO,24660,Liberty Bank and Trust Company,25-Jan-08,26-Oct-12 +Miami Valley Bank,Lakeview,OH,16848,The Citizens Banking Company,4-Oct-07,28-Aug-12 +NetBank,Alpharetta,GA,32575,ING DIRECT,28-Sep-07,28-Aug-12 +Metropolitan Savings Bank,Pittsburgh,PA,35353,Allegheny Valley Bank of Pittsburgh,2-Feb-07,27-Oct-10 +Bank of Ephraim,Ephraim,UT,1249,Far West Bank,25-Jun-04,9-Apr-08 +Reliance Bank,White Plains,NY,26778,Union State Bank,19-Mar-04,9-Apr-08 +Guaranty National Bank of Tallahassee,Tallahassee,FL,26838,Hancock Bank of Florida,12-Mar-04,5-Jun-12 +Dollar Savings Bank,Newark,NJ,31330,No Acquirer,14-Feb-04,9-Apr-08 +Pulaski Savings Bank,Philadelphia,PA,27203,Earthstar Bank,14-Nov-03,22-Jul-05 +First National Bank of Blanchardville,Blanchardville,WI,11639,The Park Bank,9-May-03,5-Jun-12 +Southern Pacific Bank,Torrance,CA,27094,Beal Bank,7-Feb-03,20-Oct-08 +Farmers Bank of Cheneyville,Cheneyville,LA,16445,Sabine State Bank & Trust,17-Dec-02,20-Oct-04 +Bank of Alamo,Alamo,TN,9961,No Acquirer,8-Nov-02,18-Mar-05 +AmTrade International Bank,Atlanta,GA,33784,No Acquirer,30-Sep-02,11-Sep-06 +Universal Federal Savings Bank,Chicago,IL,29355,Chicago Community Bank,27-Jun-02,9-Apr-08 +Connecticut Bank of Commerce,Stamford,CT,19183,Hudson United Bank,26-Jun-02,14-Feb-12 +New Century Bank,Shelby Township,MI,34979,No Acquirer,28-Mar-02,18-Mar-05 +Net 1st National Bank,Boca Raton,FL,26652,Bank Leumi USA,1-Mar-02,9-Apr-08 +"NextBank, NA",Phoenix,AZ,22314,No Acquirer,7-Feb-02,27-Aug-10 +Oakwood Deposit Bank Co.,Oakwood,OH,8966,The State Bank & Trust Company,1-Feb-02,25-Oct-12 +Bank of Sierra Blanca,Sierra Blanca,TX,22002,The Security State Bank of Pecos,18-Jan-02,6-Nov-03 +"Hamilton Bank, NA",Miami,FL,24382,Israel Discount Bank of New York,11-Jan-02,5-Jun-12 +Sinclair National Bank,Gravette,AR,34248,Delta Trust & Bank,7-Sep-01,10-Feb-04 +"Superior Bank, FSB",Hinsdale,IL,32646,"Superior Federal, FSB",27-Jul-01,5-Jun-12 +Malta National Bank,Malta,OH,6629,North Valley Bank,3-May-01,18-Nov-02 +First 
Alliance Bank & Trust Co.,Manchester,NH,34264,Southern New Hampshire Bank & Trust,2-Feb-01,18-Feb-03 +National State Bank of Metropolis,Metropolis,IL,3815,Banterra Bank of Marion,14-Dec-00,17-Mar-05 +Bank of Honolulu,Honolulu,HI,21029,Bank of the Orient,13-Oct-00,17-Mar-05 diff --git a/pandas/io/tests/data/banklist.html b/pandas/io/tests/data/banklist.html new file mode 100644 index 00000000..8ec1561f --- /dev/null +++ b/pandas/io/tests/data/banklist.html @@ -0,0 +1,4885 @@ + + + + +FDIC: Failed Bank List + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Skip Header +
Federal Deposit Insurance Corporation
Each depositor insured to at least $250,000 per insured bank

Failed Bank List


The FDIC is often appointed as receiver for failed banks. This page contains useful information for the customers and vendors of these banks. This includes information on the acquiring bank (if applicable), how your accounts and loans are affected, and how vendors can file claims against the receivership. Failed Financial Institution Contact Search displays point of contact information related to failed banks.


This list includes banks which have failed since October 1, 2000. To search for banks that failed prior to those on this page, visit this link: Failures and Assistance Transactions


Failed Bank List - CSV file (Updated on Mondays. Also opens in Excel - Excel Help)



Bank Name | City | ST | CERT | Acquiring Institution | Closing Date | Updated Date
Banks of Wisconsin d/b/a Bank of KenoshaKenoshaWI35386North Shore Bank, FSBMay 31, 2013May 31, 2013
Central Arizona BankScottsdaleAZ34527Western State BankMay 14, 2013May 20, 2013
Sunrise BankValdostaGA58185Synovus BankMay 10, 2013May 21, 2013
Pisgah Community BankAshevilleNC58701Capital Bank, N.A.May 10, 2013May 14, 2013
Douglas County BankDouglasvilleGA21649Hamilton State BankApril 26, 2013May 16, 2013
Parkway BankLenoirNC57158CertusBank, National AssociationApril 26, 2013May 17, 2013
Chipola Community BankMariannaFL58034First Federal Bank of FloridaApril 19, 2013May 16, 2013
Heritage Bank of North FloridaOrange ParkFL26680FirstAtlantic BankApril 19, 2013May 16, 2013
First Federal BankLexingtonKY29594Your Community BankApril 19, 2013April 23, 2013
Gold Canyon BankGold CanyonAZ58066First Scottsdale Bank, National AssociationApril 5, 2013April 9, 2013
Frontier BankLaGrangeGA16431HeritageBank of the SouthMarch 8, 2013March 26, 2013
Covenant BankChicagoIL22476Liberty Bank and Trust CompanyFebruary 15, 2013March 4, 2013
1st Regents BankAndoverMN57157First Minnesota BankJanuary 18, 2013February 28, 2013
Westside Community BankUniversity PlaceWA33997Sunwest BankJanuary 11, 2013January 24, 2013
Community Bank of the OzarksSunrise BeachMO27331Bank of SullivanDecember 14, 2012January 24, 2013
Hometown Community BankBraseltonGA57928CertusBank, National AssociationNovember 16, 2012January 24, 2013
Citizens First National BankPrincetonIL3731Heartland Bank and Trust CompanyNovember 2, 2012January 24, 2013
Heritage Bank of FloridaLutzFL35009Centennial BankNovember 2, 2012January 24, 2013
NOVA BankBerwynPA27148No AcquirerOctober 26, 2012January 24, 2013
Excel BankSedaliaMO19189Simmons First National BankOctober 19, 2012January 24, 2013
First East Side Savings BankTamaracFL28144Stearns Bank N.A.October 19, 2012January 24, 2013
GulfSouth Private BankDestinFL58073SmartBankOctober 19, 2012January 24, 2013
First United BankCreteIL20685Old Plank Trail Community Bank, National AssociationSeptember 28, 2012November 15, 2012
Truman BankSt. LouisMO27316Simmons First National BankSeptember 14, 2012December 17, 2012
First Commercial BankBloomingtonMN35246Republic Bank & Trust CompanySeptember 7, 2012December 17, 2012
Waukegan Savings BankWaukeganIL28243First Midwest BankAugust 3, 2012October 11, 2012
Jasper Banking CompanyJasperGA16240Stearns Bank N.A.July 27, 2012December 17, 2012
Second Federal Savings and Loan Association of ChicagoChicagoIL27986Hinsdale Bank & Trust CompanyJuly 20, 2012January 14, 2013
Heartland BankLeawoodKS1361Metcalf BankJuly 20, 2012December 17, 2012
First Cherokee State BankWoodstockGA32711Community & Southern BankJuly 20, 2012October 31, 2012
Georgia Trust BankBufordGA57847Community & Southern BankJuly 20, 2012December 17, 2012
The Royal Palm Bank of FloridaNaplesFL57096First National Bank of the Gulf CoastJuly 20, 2012January 7, 2013
Glasgow Savings BankGlasgowMO1056Regional Missouri BankJuly 13, 2012October 11, 2012
Montgomery Bank & TrustAileyGA19498Ameris BankJuly 6, 2012October 31, 2012
The Farmers Bank of LynchburgLynchburgTN1690Clayton Bank and TrustJune 15, 2012October 31, 2012
Security Exchange BankMariettaGA35299Fidelity BankJune 15, 2012October 10, 2012
Putnam State BankPalatkaFL27405Harbor Community BankJune 15, 2012October 10, 2012
Waccamaw BankWhitevilleNC34515First Community BankJune 8, 2012November 8, 2012
Farmers' and Traders' State BankShabbonaIL9257First State BankJune 8, 2012October 10, 2012
Carolina Federal Savings BankCharlestonSC35372Bank of North CarolinaJune 8, 2012October 31, 2012
First Capital BankKingfisherOK416F & M BankJune 8, 2012October 10, 2012
Alabama Trust Bank, National AssociationSylacaugaAL35224Southern States BankMay 18, 2012May 20, 2013
Security Bank, National AssociationNorth LauderdaleFL23156Banesco USAMay 4, 2012October 31, 2012
Palm Desert National BankPalm DesertCA23632Pacific Premier BankApril 27, 2012May 17, 2013
Plantation Federal BankPawleys IslandSC32503First Federal BankApril 27, 2012May 17, 2013
Inter Savings Bank, fsb D/B/A InterBank, fsbMaple GroveMN31495Great Southern BankApril 27, 2012May 17, 2013
HarVest Bank of MarylandGaithersburgMD57766SonabankApril 27, 2012May 17, 2013
Bank of the Eastern ShoreCambridgeMD26759No AcquirerApril 27, 2012October 17, 2012
Fort Lee Federal Savings Bank, FSBFort LeeNJ35527Alma BankApril 20, 2012May 17, 2013
Fidelity BankDearbornMI33883The Huntington National BankMarch 30, 2012May 16, 2013
Premier BankWilmetteIL35419International Bank of ChicagoMarch 23, 2012October 17, 2012
Covenant Bank & TrustRock SpringGA58068Stearns Bank, N.A.March 23, 2012October 31, 2012
New City BankChicagoIL57597No AcquirerMarch 9, 2012October 29, 2012
Global Commerce BankDoravilleGA34046Metro City BankMarch 2, 2012October 31, 2012
Home Savings of AmericaLittle FallsMN29178No AcquirerFebruary 24, 2012December 17, 2012
Central Bank of GeorgiaEllavilleGA5687Ameris BankFebruary 24, 2012August 9, 2012
SCB BankShelbyvilleIN29761First Merchants Bank, National AssociationFebruary 10, 2012March 25, 2013
Charter National Bank and TrustHoffman EstatesIL23187Barrington Bank & Trust Company, National AssociationFebruary 10, 2012March 25, 2013
BankEastKnoxvilleTN19869U.S.Bank National AssociationJanuary 27, 2012March 8, 2013
Patriot Bank MinnesotaForest LakeMN34823First Resource BankJanuary 27, 2012September 12, 2012
Tennessee Commerce BankFranklinTN35296Republic Bank & Trust CompanyJanuary 27, 2012November 20, 2012
First Guaranty Bank and Trust Company of JacksonvilleJacksonvilleFL16579CenterState Bank of Florida, N.A.January 27, 2012September 12, 2012
American Eagle Savings BankBoothwynPA31581Capital Bank, N.A.January 20, 2012January 25, 2013
The First State BankStockbridgeGA19252Hamilton State BankJanuary 20, 2012January 25, 2013
Central Florida State BankBelleviewFL57186CenterState Bank of Florida, N.A.January 20, 2012January 25, 2013
Western National BankPhoenixAZ57917Washington FederalDecember 16, 2011August 13, 2012
Premier Community Bank of the Emerald CoastCrestviewFL58343Summit BankDecember 16, 2011September 12, 2012
Central Progressive BankLacombeLA19657First NBC BankNovember 18, 2011August 13, 2012
Polk County BankJohnstonIA14194Grinnell State BankNovember 18, 2011August 15, 2012
Community Bank of RockmartRockmartGA57860Century Bank of GeorgiaNovember 10, 2011August 13, 2012
SunFirst BankSaint GeorgeUT57087Cache Valley BankNovember 4, 2011November 16, 2012
Mid City Bank, Inc.OmahaNE19397Premier BankNovember 4, 2011August 15, 2012
All American BankDes PlainesIL57759International Bank of ChicagoOctober 28, 2011August 15, 2012
Community Banks of ColoradoGreenwood VillageCO21132Bank Midwest, N.A.October 21, 2011January 2, 2013
Community Capital BankJonesboroGA57036State Bank and Trust CompanyOctober 21, 2011November 8, 2012
Decatur First BankDecaturGA34392Fidelity BankOctober 21, 2011November 8, 2012
Old Harbor BankClearwaterFL575371st United BankOctober 21, 2011November 8, 2012
Country BankAledoIL35395Blackhawk Bank & TrustOctober 14, 2011August 15, 2012
First State BankCranfordNJ58046Northfield BankOctober 14, 2011November 8, 2012
Blue Ridge Savings Bank, Inc.AshevilleNC32347Bank of North CarolinaOctober 14, 2011November 8, 2012
Piedmont Community BankGrayGA57256State Bank and Trust CompanyOctober 14, 2011January 22, 2013
Sun Security BankEllingtonMO20115Great Southern BankOctober 7, 2011November 7, 2012
The RiverBankWyomingMN10216Central BankOctober 7, 2011November 7, 2012
First International BankPlanoTX33513American First National BankSeptember 30, 2011October 9, 2012
Citizens Bank of Northern CaliforniaNevada CityCA33983Tri Counties BankSeptember 23, 2011October 9, 2012
Bank of the CommonwealthNorfolkVA20408Southern Bank and Trust CompanySeptember 23, 2011October 9, 2012
The First National Bank of FloridaMiltonFL25155CharterBankSeptember 9, 2011September 6, 2012
CreekSide BankWoodstockGA58226Georgia Commerce BankSeptember 2, 2011September 6, 2012
Patriot Bank of GeorgiaCummingGA58273Georgia Commerce BankSeptember 2, 2011November 2, 2012
First Choice BankGenevaIL57212Inland Bank & TrustAugust 19, 2011August 15, 2012
First Southern National BankStatesboroGA57239Heritage Bank of the SouthAugust 19, 2011November 2, 2012
Lydian Private BankPalm BeachFL35356Sabadell United Bank, N.A.August 19, 2011November 2, 2012
Public Savings BankHuntingdon ValleyPA34130Capital Bank, N.A.August 18, 2011August 15, 2012
The First National Bank of OlatheOlatheKS4744Enterprise Bank & TrustAugust 12, 2011August 23, 2012
Bank of WhitmanColfaxWA22528Columbia State BankAugust 5, 2011August 16, 2012
Bank of ShorewoodShorewoodIL22637Heartland Bank and Trust CompanyAugust 5, 2011August 16, 2012
Integra Bank National AssociationEvansvilleIN4392Old National BankJuly 29, 2011August 16, 2012
BankMeridian, N.A.ColumbiaSC58222SCBT National AssociationJuly 29, 2011November 2, 2012
Virginia Business BankRichmondVA58283Xenith BankJuly 29, 2011October 9, 2012
Bank of ChoiceGreeleyCO2994Bank Midwest, N.A.July 22, 2011September 12, 2012
LandMark Bank of FloridaSarasotaFL35244American Momentum BankJuly 22, 2011November 2, 2012
Southshore Community BankApollo BeachFL58056American Momentum BankJuly 22, 2011November 2, 2012
Summit BankPrescottAZ57442The Foothills BankJuly 15, 2011August 16, 2012
First Peoples BankPort St. LucieFL34870Premier American Bank, N.A.July 15, 2011November 2, 2012
High Trust BankStockbridgeGA19554Ameris BankJuly 15, 2011November 2, 2012
One Georgia BankAtlantaGA58238Ameris BankJuly 15, 2011November 2, 2012
Signature BankWindsorCO57835Points West Community BankJuly 8, 2011October 26, 2012
Colorado Capital BankCastle RockCO34522First-Citizens Bank & Trust CompanyJuly 8, 2011January 15, 2013
First Chicago Bank & TrustChicagoIL27935Northbrook Bank & Trust CompanyJuly 8, 2011September 9, 2012
Mountain Heritage BankClaytonGA57593First American Bank and Trust CompanyJune 24, 2011November 2, 2012
First Commercial Bank of Tampa BayTampaFL27583Stonegate BankJune 17, 2011November 2, 2012
McIntosh State BankJacksonGA19237Hamilton State BankJune 17, 2011November 2, 2012
Atlantic Bank and TrustCharlestonSC58420First Citizens Bank and Trust Company, Inc.June 3, 2011October 31, 2012
First Heritage BankSnohomishWA23626Columbia State BankMay 27, 2011January 28, 2013
Summit BankBurlingtonWA513Columbia State BankMay 20, 2011January 22, 2013
First Georgia Banking CompanyFranklinGA57647CertusBank, National AssociationMay 20, 2011November 13, 2012
Atlantic Southern BankMaconGA57213CertusBank, National AssociationMay 20, 2011October 31, 2012
Coastal BankCocoa BeachFL34898Florida Community Bank, a division of Premier American Bank, N.A.May 6, 2011November 30, 2012
Community Central BankMount ClemensMI34234Talmer Bank & TrustApril 29, 2011August 16, 2012
The Park Avenue BankValdostaGA19797Bank of the OzarksApril 29, 2011November 30, 2012
First Choice Community BankDallasGA58539Bank of the OzarksApril 29, 2011January 22, 2013
Cortez Community BankBrooksvilleFL57625Florida Community Bank, a division of Premier American Bank, N.A.April 29, 2011November 30, 2012
First National Bank of Central FloridaWinter ParkFL26297Florida Community Bank, a division of Premier American Bank, N.A.April 29, 2011November 30, 2012
Heritage Banking GroupCarthageMS14273Trustmark National BankApril 15, 2011November 30, 2012
Rosemount National BankRosemountMN24099Central BankApril 15, 2011August 16, 2012
Superior BankBirminghamAL17750Superior Bank, National AssociationApril 15, 2011November 30, 2012
Nexity BankBirminghamAL19794AloStar Bank of CommerceApril 15, 2011September 4, 2012
New Horizons BankEast EllijayGA57705Citizens South BankApril 15, 2011August 16, 2012
Bartow County BankCartersvilleGA21495Hamilton State BankApril 15, 2011January 22, 2013
Nevada Commerce BankLas VegasNV35418City National BankApril 8, 2011September 9, 2012
Western Springs National Bank and TrustWestern SpringsIL10086Heartland Bank and Trust CompanyApril 8, 2011January 22, 2013
The Bank of CommerceWood DaleIL34292Advantage National Bank GroupMarch 25, 2011January 22, 2013
Legacy BankMilwaukeeWI34818Seaway Bank and Trust CompanyMarch 11, 2011September 12, 2012
First National Bank of DavisDavisOK4077The Pauls Valley National BankMarch 11, 2011August 20, 2012
Valley Community BankSt. CharlesIL34187First State BankFebruary 25, 2011September 12, 2012
San Luis Trust Bank, FSBSan Luis ObispoCA34783First California BankFebruary 18, 2011August 20, 2012
Charter Oak BankNapaCA57855Bank of MarinFebruary 18, 2011September 12, 2012
Citizens Bank of EffinghamSpringfieldGA34601Heritage Bank of the SouthFebruary 18, 2011November 2, 2012
Habersham BankClarkesvilleGA151SCBT National AssociationFebruary 18, 2011November 2, 2012
Canyon National BankPalm SpringsCA34692Pacific Premier BankFebruary 11, 2011September 12, 2012
Badger State BankCassvilleWI13272Royal BankFebruary 11, 2011September 12, 2012
Peoples State BankHamtramckMI14939First Michigan BankFebruary 11, 2011January 22, 2013
Sunshine State Community BankPort OrangeFL35478Premier American Bank, N.A.February 11, 2011November 2, 2012
Community First Bank ChicagoChicagoIL57948Northbrook Bank & Trust CompanyFebruary 4, 2011August 20, 2012
North Georgia BankWatkinsvilleGA35242BankSouthFebruary 4, 2011November 2, 2012
American Trust BankRoswellGA57432Renasant BankFebruary 4, 2011October 31, 2012
First Community BankTaosNM12261U.S. Bank, N.A.January 28, 2011September 12, 2012
FirsTier BankLouisvilleCO57646No AcquirerJanuary 28, 2011September 12, 2012
Evergreen State BankStoughtonWI5328McFarland State BankJanuary 28, 2011September 12, 2012
The First State BankCamargoOK2303Bank 7January 28, 2011September 12, 2012
United Western BankDenverCO31293First-Citizens Bank & Trust CompanyJanuary 21, 2011September 12, 2012
The Bank of AshevilleAshevilleNC34516First BankJanuary 21, 2011November 2, 2012
CommunitySouth Bank & TrustEasleySC57868CertusBank, National AssociationJanuary 21, 2011November 2, 2012
Enterprise Banking CompanyMcDonoughGA19758No AcquirerJanuary 21, 2011November 2, 2012
Oglethorpe BankBrunswickGA57440Bank of the OzarksJanuary 14, 2011November 2, 2012
Legacy BankScottsdaleAZ57820Enterprise Bank & TrustJanuary 7, 2011September 12, 2012
First Commercial Bank of FloridaOrlandoFL34965First Southern BankJanuary 7, 2011November 2, 2012
Community National BankLino LakesMN23306Farmers & Merchants Savings BankDecember 17, 2010August 20, 2012
First Southern BankBatesvilleAR58052Southern BankDecember 17, 2010August 20, 2012
United Americas Bank, N.A.AtlantaGA35065State Bank and Trust CompanyDecember 17, 2010November 2, 2012
Appalachian Community Bank, FSBMcCaysvilleGA58495Peoples Bank of East TennesseeDecember 17, 2010October 31, 2012
Chestatee State BankDawsonvilleGA34578Bank of the OzarksDecember 17, 2010November 2, 2012
The Bank of Miami,N.A.Coral GablesFL190401st United BankDecember 17, 2010November 2, 2012
Earthstar BankSouthamptonPA35561Polonia BankDecember 10, 2010August 20, 2012
Paramount BankFarmington HillsMI34673Level One BankDecember 10, 2010August 20, 2012
First Banking CenterBurlingtonWI5287First Michigan BankNovember 19, 2010August 20, 2012
Allegiance Bank of North AmericaBala CynwydPA35078VIST BankNovember 19, 2010August 20, 2012
Gulf State Community BankCarrabelleFL20340Centennial BankNovember 19, 2010November 2, 2012
Copper Star BankScottsdaleAZ35463Stearns Bank, N.A.November 12, 2010August 20, 2012
Darby Bank & Trust Co.VidaliaGA14580Ameris BankNovember 12, 2010January 15, 2013
Tifton Banking CompanyTiftonGA57831Ameris BankNovember 12, 2010November 2, 2012
First Vietnamese American BankWestminsterCA57885Grandpoint BankNovember 5, 2010September 12, 2012
Pierce Commercial BankTacomaWA34411Heritage BankNovember 5, 2010August 20, 2012
Western Commercial BankWoodland HillsCA58087First California BankNovember 5, 2010September 12, 2012
K BankRandallstownMD31263Manufacturers and Traders Trust Company (M&T Bank)November 5, 2010August 20, 2012
First Arizona Savings, A FSBScottsdaleAZ32582No AcquirerOctober 22, 2010August 20, 2012
Hillcrest BankOverland ParkKS22173Hillcrest Bank, N.A.October 22, 2010August 20, 2012
First Suburban National BankMaywoodIL16089Seaway Bank and Trust CompanyOctober 22, 2010August 20, 2012
The First National Bank of BarnesvilleBarnesvilleGA2119United BankOctober 22, 2010November 2, 2012
The Gordon BankGordonGA33904Morris BankOctober 22, 2010November 2, 2012
Progress Bank of FloridaTampaFL32251Bay Cities BankOctober 22, 2010November 2, 2012
First Bank of JacksonvilleJacksonvilleFL27573Ameris BankOctober 22, 2010November 2, 2012
Premier BankJefferson CityMO34016Providence BankOctober 15, 2010August 20, 2012
WestBridge Bank and Trust CompanyChesterfieldMO58205Midland States BankOctober 15, 2010August 20, 2012
Security Savings Bank, F.S.B.OlatheKS30898Simmons First National BankOctober 15, 2010August 20, 2012
Shoreline BankShorelineWA35250GBC International BankOctober 1, 2010August 20, 2012
Wakulla BankCrawfordvilleFL21777Centennial BankOctober 1, 2010November 2, 2012
North County BankArlingtonWA35053Whidbey Island BankSeptember 24, 2010August 20, 2012
Haven Trust Bank FloridaPonte Vedra BeachFL58308First Southern BankSeptember 24, 2010November 5, 2012
Maritime Savings BankWest AllisWI28612North Shore Bank, FSBSeptember 17, 2010August 20, 2012
Bramble Savings BankMilfordOH27808Foundation BankSeptember 17, 2010August 20, 2012
The Peoples BankWinderGA182Community & Southern BankSeptember 17, 2010November 5, 2012
First Commerce Community BankDouglasvilleGA57448Community & Southern BankSeptember 17, 2010January 15, 2013
Bank of EllijayEllijayGA58197Community & Southern BankSeptember 17, 2010January 15, 2013
ISN BankCherry HillNJ57107Customers BankSeptember 17, 2010August 22, 2012
Horizon BankBradentonFL35061Bank of the OzarksSeptember 10, 2010November 5, 2012
Sonoma Valley BankSonomaCA27259Westamerica BankAugust 20, 2010September 12, 2012
Los Padres BankSolvangCA32165Pacific Western BankAugust 20, 2010September 12, 2012
Butte Community BankChicoCA33219Rabobank, N.A.August 20, 2010September 12, 2012
Pacific State BankStocktonCA27090Rabobank, N.A.August 20, 2010September 12, 2012
ShoreBankChicagoIL15640Urban Partnership BankAugust 20, 2010May 16, 2013
Imperial Savings and Loan AssociationMartinsvilleVA31623River Community Bank, N.A.August 20, 2010August 24, 2012
Independent National BankOcalaFL27344CenterState Bank of Florida, N.A.August 20, 2010November 5, 2012
Community National Bank at BartowBartowFL25266CenterState Bank of Florida, N.A.August 20, 2010November 5, 2012
Palos Bank and Trust CompanyPalos HeightsIL17599First Midwest BankAugust 13, 2010August 22, 2012
Ravenswood BankChicagoIL34231Northbrook Bank & Trust CompanyAugust 6, 2010August 22, 2012
LibertyBankEugeneOR31964Home Federal BankJuly 30, 2010August 22, 2012
The Cowlitz BankLongviewWA22643Heritage BankJuly 30, 2010August 22, 2012
Coastal Community BankPanama City BeachFL9619Centennial BankJuly 30, 2010November 5, 2012
Bayside Savings BankPort Saint JoeFL57669Centennial BankJuly 30, 2010November 5, 2012
Northwest Bank & TrustAcworthGA57658State Bank and Trust CompanyJuly 30, 2010November 5, 2012
Home Valley BankCave JunctionOR23181South Valley Bank & TrustJuly 23, 2010September 12, 2012
SouthwestUSA BankLas VegasNV35434Plaza BankJuly 23, 2010August 22, 2012
Community Security BankNew PragueMN34486RoundbankJuly 23, 2010September 12, 2012
Thunder BankSylvan GroveKS10506The Bennington State BankJuly 23, 2010September 13, 2012
Williamsburg First National BankKingstreeSC17837First Citizens Bank and Trust Company, Inc.July 23, 2010November 5, 2012
Crescent Bank and Trust CompanyJasperGA27559Renasant BankJuly 23, 2010November 5, 2012
Sterling BankLantanaFL32536IBERIABANKJuly 23, 2010November 5, 2012
Mainstreet Savings Bank, FSBHastingsMI28136Commercial BankJuly 16, 2010September 13, 2012
Olde Cypress Community BankClewistonFL28864CenterState Bank of Florida, N.A.July 16, 2010November 5, 2012
Turnberry BankAventuraFL32280NAFH National BankJuly 16, 2010November 5, 2012
Metro Bank of Dade CountyMiamiFL25172NAFH National BankJuly 16, 2010November 5, 2012
First National Bank of the SouthSpartanburgSC35383NAFH National BankJuly 16, 2010November 5, 2012
Woodlands BankBlufftonSC32571Bank of the OzarksJuly 16, 2010November 5, 2012
Home National BankBlackwellOK11636RCB BankJuly 9, 2010December 10, 2012
USA BankPort ChesterNY58072New Century BankJuly 9, 2010September 14, 2012
Ideal Federal Savings BankBaltimoreMD32456No AcquirerJuly 9, 2010September 14, 2012
Bay National BankBaltimoreMD35462Bay Bank, FSBJuly 9, 2010January 15, 2013
High Desert State BankAlbuquerqueNM35279First American BankJune 25, 2010September 14, 2012
First National BankSavannahGA34152The Savannah Bank, N.A.June 25, 2010November 5, 2012
Peninsula BankEnglewoodFL26563Premier American Bank, N.A.June 25, 2010November 5, 2012
Nevada Security BankRenoNV57110Umpqua BankJune 18, 2010August 23, 2012
Washington First International BankSeattleWA32955East West BankJune 11, 2010September 14, 2012
TierOne BankLincolnNE29341Great Western BankJune 4, 2010September 14, 2012
Arcola Homestead Savings BankArcolaIL31813No AcquirerJune 4, 2010September 14, 2012
First National BankRosedaleMS15814The Jefferson BankJune 4, 2010November 5, 2012
Sun West BankLas VegasNV34785City National BankMay 28, 2010September 14, 2012
Granite Community Bank, NAGranite BayCA57315Tri Counties BankMay 28, 2010September 14, 2012
Bank of Florida - TampaTampaFL57814EverBankMay 28, 2010November 5, 2012
Bank of Florida - SouthwestNaplesFL35106EverBankMay 28, 2010November 5, 2012
Bank of Florida - SoutheastFort LauderdaleFL57360EverBankMay 28, 2010November 5, 2012
Pinehurst BankSaint PaulMN57735Coulee BankMay 21, 2010October 26, 2012
Midwest Bank and Trust CompanyElmwood ParkIL18117FirstMerit Bank, N.A.May 14, 2010August 23, 2012
Southwest Community BankSpringfieldMO34255Simmons First National BankMay 14, 2010August 23, 2012
New Liberty BankPlymouthMI35586Bank of Ann ArborMay 14, 2010August 23, 2012
Satilla Community BankSaint MarysGA35114Ameris BankMay 14, 2010November 5, 2012
1st Pacific Bank of CaliforniaSan DiegoCA35517City National BankMay 7, 2010December 13, 2012
Towne Bank of ArizonaMesaAZ57697Commerce Bank of ArizonaMay 7, 2010August 23, 2012
Access BankChamplinMN16476PrinsBankMay 7, 2010August 23, 2012
The Bank of BonifayBonifayFL14246First Federal Bank of FloridaMay 7, 2010November 5, 2012
Frontier BankEverettWA22710Union Bank, N.A.April 30, 2010January 15, 2013
BC National BanksButlerMO17792Community First BankApril 30, 2010August 23, 2012
Champion BankCreve CoeurMO58362BankLibertyApril 30, 2010August 23, 2012
CF BancorpPort HuronMI30005First Michigan BankApril 30, 2010January 15, 2013
Westernbank Puerto RicoMayaguezPR31027Banco Popular de Puerto RicoApril 30, 2010November 5, 2012
R-G Premier Bank of Puerto RicoHato ReyPR32185Scotiabank de Puerto RicoApril 30, 2010November 5, 2012
EurobankSan JuanPR27150Oriental Bank and TrustApril 30, 2010November 5, 2012
Wheatland BankNapervilleIL58429Wheaton Bank & TrustApril 23, 2010August 23, 2012
Peotone Bank and Trust CompanyPeotoneIL10888First Midwest BankApril 23, 2010August 23, 2012
Lincoln Park Savings BankChicagoIL30600Northbrook Bank & Trust CompanyApril 23, 2010August 23, 2012
New Century BankChicagoIL34821MB Financial Bank, N.A.April 23, 2010August 23, 2012
Citizens Bank and Trust Company of ChicagoChicagoIL34658Republic Bank of ChicagoApril 23, 2010August 23, 2012
Broadway BankChicagoIL22853MB Financial Bank, N.A.April 23, 2010August 23, 2012
Amcore Bank, National AssociationRockfordIL3735Harris N.A.April 23, 2010August 23, 2012
City BankLynnwoodWA21521Whidbey Island BankApril 16, 2010September 14, 2012
Tamalpais BankSan RafaelCA33493Union Bank, N.A.April 16, 2010August 23, 2012
Innovative BankOaklandCA23876Center BankApril 16, 2010August 23, 2012
Butler BankLowellMA26619People's United BankApril 16, 2010August 23, 2012
Riverside National Bank of FloridaFort PierceFL24067TD Bank, N.A.April 16, 2010November 5, 2012
AmericanFirst BankClermontFL57724TD Bank, N.A.April 16, 2010October 31, 2012
First Federal Bank of North FloridaPalatkaFL28886TD Bank, N.A.April 16, 2010January 15, 2013
Lakeside Community BankSterling HeightsMI34878No AcquirerApril 16, 2010August 23, 2012
Beach First National BankMyrtle BeachSC34242Bank of North CarolinaApril 9, 2010November 5, 2012
Desert Hills BankPhoenixAZ57060New York Community BankMarch 26, 2010August 23, 2012
Unity National BankCartersvilleGA34678Bank of the OzarksMarch 26, 2010September 14, 2012
Key West BankKey WestFL34684Centennial BankMarch 26, 2010August 23, 2012
McIntosh Commercial BankCarrolltonGA57399CharterBankMarch 26, 2010August 23, 2012
State Bank of AuroraAuroraMN8221Northern State BankMarch 19, 2010August 23, 2012
First Lowndes BankFort DepositAL24957First Citizens BankMarch 19, 2010August 23, 2012
Bank of HiawasseeHiawasseeGA10054Citizens South BankMarch 19, 2010August 23, 2012
Appalachian Community BankEllijayGA33989Community & Southern BankMarch 19, 2010October 31, 2012
Advanta Bank Corp.DraperUT33535No AcquirerMarch 19, 2010September 14, 2012
Century Security BankDuluthGA58104Bank of UpsonMarch 19, 2010August 23, 2012
American National BankParmaOH18806The National Bank and Trust CompanyMarch 19, 2010August 23, 2012
Statewide BankCovingtonLA29561Home BankMarch 12, 2010August 23, 2012
Old Southern BankOrlandoFL58182Centennial BankMarch 12, 2010August 23, 2012
The Park Avenue BankNew YorkNY27096Valley National BankMarch 12, 2010August 23, 2012
LibertyPointe BankNew YorkNY58071Valley National BankMarch 11, 2010August 23, 2012
Centennial BankOgdenUT34430No AcquirerMarch 5, 2010September 14, 2012
Waterfield BankGermantownMD34976No AcquirerMarch 5, 2010August 23, 2012
Bank of IllinoisNormalIL9268Heartland Bank and Trust CompanyMarch 5, 2010August 23, 2012
Sun American BankBoca RatonFL27126First-Citizens Bank & Trust CompanyMarch 5, 2010August 23, 2012
Rainier Pacific BankTacomaWA38129Umpqua BankFebruary 26, 2010August 23, 2012
Carson River Community BankCarson CityNV58352Heritage Bank of NevadaFebruary 26, 2010January 15, 2013
La Jolla Bank, FSBLa JollaCA32423OneWest Bank, FSBFebruary 19, 2010August 24, 2012
George Washington Savings BankOrland ParkIL29952FirstMerit Bank, N.A.February 19, 2010August 24, 2012
The La Coste National BankLa CosteTX3287Community National BankFebruary 19, 2010September 14, 2012
Marco Community BankMarco IslandFL57586Mutual of Omaha BankFebruary 19, 2010August 24, 2012
1st American State Bank of MinnesotaHancockMN15448Community Development Bank, FSBFebruary 5, 2010August 24, 2012
American Marine BankBainbridge IslandWA16730Columbia State BankJanuary 29, 2010August 24, 2012
First Regional BankLos AngelesCA23011First-Citizens Bank & Trust CompanyJanuary 29, 2010August 24, 2012
Community Bank and TrustCorneliaGA5702SCBT National AssociationJanuary 29, 2010January 15, 2013
Marshall Bank, N.A.HallockMN16133United Valley BankJanuary 29, 2010August 23, 2012
Florida Community BankImmokaleeFL5672Premier American Bank, N.A.January 29, 2010January 15, 2013
First National Bank of GeorgiaCarrolltonGA16480Community & Southern BankJanuary 29, 2010December 13, 2012
Columbia River BankThe DallesOR22469Columbia State BankJanuary 22, 2010September 14, 2012
Evergreen BankSeattleWA20501Umpqua BankJanuary 22, 2010January 15, 2013
Charter BankSanta FeNM32498Charter BankJanuary 22, 2010August 23, 2012
Bank of LeetonLeetonMO8265Sunflower Bank, N.A.January 22, 2010January 15, 2013
Premier American BankMiamiFL57147Premier American Bank, N.A.January 22, 2010December 13, 2012
Barnes Banking CompanyKaysvilleUT1252No AcquirerJanuary 15, 2010August 23, 2012
St. Stephen State BankSt. StephenMN17522First State Bank of St. JosephJanuary 15, 2010August 23, 2012
Town Community Bank & TrustAntiochIL34705First American BankJanuary 15, 2010August 23, 2012
Horizon BankBellinghamWA22977Washington Federal Savings and Loan AssociationJanuary 8, 2010August 23, 2012
First Federal Bank of California, F.S.B.Santa MonicaCA28536OneWest Bank, FSBDecember 18, 2009August 23, 2012
Imperial Capital BankLa JollaCA26348City National BankDecember 18, 2009September 5, 2012
Independent Bankers' BankSpringfieldIL26820The Independent BankersBank (TIB)December 18, 2009August 23, 2012
New South Federal Savings BankIrondaleAL32276Beal BankDecember 18, 2009August 23, 2012
Citizens State BankNew BaltimoreMI1006No AcquirerDecember 18, 2009November 5, 2012
Peoples First Community BankPanama CityFL32167Hancock BankDecember 18, 2009November 5, 2012
RockBridge Commercial BankAtlantaGA58315No AcquirerDecember 18, 2009November 5, 2012
SolutionsBankOverland ParkKS4731Arvest BankDecember 11, 2009August 23, 2012
Valley Capital Bank, N.A.MesaAZ58399Enterprise Bank & TrustDecember 11, 2009August 23, 2012
Republic Federal Bank, N.A.MiamiFL228461st United BankDecember 11, 2009November 5, 2012
Greater Atlantic BankRestonVA32583SonabankDecember 4, 2009November 5, 2012
Benchmark BankAuroraIL10440MB Financial Bank, N.A.December 4, 2009August 23, 2012
AmTrust BankClevelandOH29776New York Community BankDecember 4, 2009November 5, 2012
The Tattnall BankReidsvilleGA12080Heritage Bank of the SouthDecember 4, 2009November 5, 2012
First Security National BankNorcrossGA26290State Bank and Trust CompanyDecember 4, 2009November 5, 2012
The Buckhead Community BankAtlantaGA34663State Bank and Trust CompanyDecember 4, 2009November 5, 2012
Commerce Bank of Southwest FloridaFort MyersFL58016Central BankNovember 20, 2009November 5, 2012
Pacific Coast National BankSan ClementeCA57914Sunwest BankNovember 13, 2009August 22, 2012
Orion BankNaplesFL22427IBERIABANKNovember 13, 2009November 5, 2012
Century Bank, F.S.B.SarasotaFL32267IBERIABANKNovember 13, 2009August 22, 2012
United Commercial BankSan FranciscoCA32469East West BankNovember 6, 2009November 5, 2012
Gateway Bank of St. LouisSt. LouisMO19450Central Bank of Kansas CityNovember 6, 2009August 22, 2012
Prosperan BankOakdaleMN35074Alerus Financial, N.A.November 6, 2009August 22, 2012
Home Federal Savings BankDetroitMI30329Liberty Bank and Trust CompanyNovember 6, 2009August 22, 2012
United Security BankSpartaGA22286Ameris BankNovember 6, 2009January 15, 2013
North Houston BankHoustonTX18776U.S. Bank N.A.October 30, 2009August 22, 2012
Madisonville State BankMadisonvilleTX33782U.S. Bank N.A.October 30, 2009August 22, 2012
Citizens National BankTeagueTX25222U.S. Bank N.A.October 30, 2009August 22, 2012
Park National BankChicagoIL11677U.S. Bank N.A.October 30, 2009August 22, 2012
Pacific National BankSan FranciscoCA30006U.S. Bank N.A.October 30, 2009August 22, 2012
California National BankLos AngelesCA34659U.S. Bank N.A.October 30, 2009September 5, 2012
San Diego National BankSan DiegoCA23594U.S. Bank N.A.October 30, 2009August 22, 2012
Community Bank of LemontLemontIL35291U.S. Bank N.A.October 30, 2009January 15, 2013
Bank USA, N.A.PhoenixAZ32218U.S. Bank N.A.October 30, 2009August 22, 2012
First DuPage BankWestmontIL35038First Midwest BankOctober 23, 2009August 22, 2012
Riverview Community BankOtsegoMN57525Central BankOctober 23, 2009August 22, 2012
Bank of ElmwoodRacineWI18321Tri City National BankOctober 23, 2009August 22, 2012
Flagship National BankBradentonFL35044First Federal Bank of FloridaOctober 23, 2009August 22, 2012
Hillcrest Bank FloridaNaplesFL58336Stonegate BankOctober 23, 2009August 22, 2012
American United BankLawrencevilleGA57794Ameris BankOctober 23, 2009September 5, 2012
Partners BankNaplesFL57959Stonegate BankOctober 23, 2009January 15, 2013
San Joaquin BankBakersfieldCA23266Citizens Business BankOctober 16, 2009August 22, 2012
Southern Colorado National BankPuebloCO57263Legacy BankOctober 2, 2009September 5, 2012
Jennings State BankSpring GroveMN11416Central BankOctober 2, 2009August 21, 2012
Warren BankWarrenMI34824The Huntington National BankOctober 2, 2009August 21, 2012
Georgian BankAtlantaGA57151First Citizens Bank and Trust Company, Inc.September 25, 2009August 21, 2012
Irwin Union Bank, F.S.B.LouisvilleKY57068First Financial Bank, N.A.September 18, 2009September 5, 2012
Irwin Union Bank and Trust CompanyColumbusIN10100First Financial Bank, N.A.September 18, 2009August 21, 2012
Venture BankLaceyWA22868First-Citizens Bank & Trust CompanySeptember 11, 2009August 21, 2012
Brickwell Community BankWoodburyMN57736CorTrust Bank N.A.September 11, 2009January 15, 2013
Corus Bank, N.A.ChicagoIL13693MB Financial Bank, N.A.September 11, 2009August 21, 2012
First State BankFlagstaffAZ34875Sunwest BankSeptember 4, 2009January 15, 2013
Platinum Community BankRolling MeadowsIL35030No AcquirerSeptember 4, 2009August 21, 2012
Vantus BankSioux CityIN27732Great Southern BankSeptember 4, 2009August 21, 2012
InBankOak ForestIL20203MB Financial Bank, N.A.September 4, 2009August 21, 2012
First Bank of Kansas CityKansas CityMO25231Great American BankSeptember 4, 2009August 21, 2012
Affinity BankVenturaCA27197Pacific Western BankAugust 28, 2009August 21, 2012
Mainstreet BankForest LakeMN1909Central BankAugust 28, 2009August 21, 2012
Bradford BankBaltimoreMD28312Manufacturers and Traders Trust Company (M&T Bank)August 28, 2009January 15, 2013
Guaranty BankAustinTX32618BBVA CompassAugust 21, 2009August 21, 2012
CapitalSouth BankBirminghamAL22130IBERIABANKAugust 21, 2009January 15, 2013
First Coweta BankNewnanGA57702United BankAugust 21, 2009January 15, 2013
ebankAtlantaGA34682Stearns Bank, N.A.August 21, 2009August 21, 2012
Community Bank of NevadaLas VegasNV34043No AcquirerAugust 14, 2009August 21, 2012
Community Bank of ArizonaPhoenixAZ57645MidFirst BankAugust 14, 2009August 21, 2012
Union Bank, National AssociationGilbertAZ34485MidFirst BankAugust 14, 2009August 21, 2012
Colonial BankMontgomeryAL9609Branch Banking & Trust Company, (BB&T)August 14, 2009September 5, 2012
Dwelling House Savings and Loan AssociationPittsburghPA31559PNC Bank, N.A.August 14, 2009January 15, 2013
Community First BankPrinevilleOR23268Home Federal BankAugust 7, 2009January 15, 2013
Community National Bank of Sarasota CountyVeniceFL27183Stearns Bank, N.A.August 7, 2009August 20, 2012
First State BankSarasotaFL27364Stearns Bank, N.A.August 7, 2009August 20, 2012
Mutual BankHarveyIL18659United Central BankJuly 31, 2009August 20, 2012
First BankAmericanoElizabethNJ34270Crown BankJuly 31, 2009August 20, 2012
Peoples Community BankWest ChesterOH32288First Financial Bank, N.A.July 31, 2009August 20, 2012
Integrity BankJupiterFL57604Stonegate BankJuly 31, 2009August 20, 2012
First State Bank of AltusAltusOK9873Herring BankJuly 31, 2009August 20, 2012
Security Bank of Jones CountyGrayGA8486State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Houston CountyPerryGA27048State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Bibb CountyMaconGA27367State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North MetroWoodstockGA57105State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North FultonAlpharettaGA57430State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Gwinnett CountySuwaneeGA57346State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Waterford Village BankWilliamsvilleNY58065Evans Bank, N.A.July 24, 2009August 20, 2012
Temecula Valley BankTemeculaCA34341First-Citizens Bank & Trust CompanyJuly 17, 2009August 20, 2012
Vineyard BankRancho CucamongaCA23556California Bank & TrustJuly 17, 2009August 20, 2012
BankFirstSioux FallsSD34103Alerus Financial, N.A.July 17, 2009August 20, 2012
First Piedmont BankWinderGA34594First American Bank and Trust CompanyJuly 17, 2009January 15, 2013
Bank of WyomingThermopolisWY22754Central Bank & TrustJuly 10, 2009August 20, 2012
Founders BankWorthIL18390The PrivateBank and Trust CompanyJuly 2, 2009August 20, 2012
Millennium State Bank of TexasDallasTX57667State Bank of TexasJuly 2, 2009October 26, 2012
First National Bank of DanvilleDanvilleIL3644First Financial Bank, N.A.July 2, 2009August 20, 2012
Elizabeth State BankElizabethIL9262Galena State Bank and Trust CompanyJuly 2, 2009August 20, 2012
Rock River BankOregonIL15302The Harvard State BankJuly 2, 2009August 20, 2012
First State Bank of WinchesterWinchesterIL11710The First National Bank of BeardstownJuly 2, 2009August 20, 2012
John Warner BankClintonIL12093State Bank of LincolnJuly 2, 2009August 20, 2012
Mirae BankLos AngelesCA57332Wilshire State BankJune 26, 2009August 20, 2012
MetroPacific BankIrvineCA57893Sunwest BankJune 26, 2009August 20, 2012
Horizon BankPine CityMN9744Stearns Bank, N.A.June 26, 2009August 20, 2012
Neighborhood Community BankNewnanGA35285CharterBankJune 26, 2009August 20, 2012
Community Bank of West GeorgiaVilla RicaGA57436No AcquirerJune 26, 2009August 17, 2012
First National Bank of AnthonyAnthonyKS4614Bank of KansasJune 19, 2009August 17, 2012
Cooperative BankWilmingtonNC27837First BankJune 19, 2009August 17, 2012
Southern Community BankFayettevilleGA35251United Community BankJune 19, 2009August 17, 2012
Bank of LincolnwoodLincolnwoodIL17309Republic Bank of ChicagoJune 5, 2009August 17, 2012
Citizens National BankMacombIL5757Morton Community BankMay 22, 2009September 4, 2012
Strategic Capital BankChampaignIL35175Midland States BankMay 22, 2009September 4, 2012
BankUnited, FSBCoral GablesFL32247BankUnitedMay 21, 2009August 17, 2012
Westsound BankBremertonWA34843Kitsap BankMay 8, 2009September 4, 2012
America West BankLaytonUT35461Cache Valley BankMay 1, 2009August 17, 2012
Citizens Community BankRidgewoodNJ57563North Jersey Community BankMay 1, 2009September 4, 2012
Silverton Bank, NAAtlantaGA26535No AcquirerMay 1, 2009August 17, 2012
First Bank of IdahoKetchumID34396U.S. Bank, N.A.April 24, 2009August 17, 2012
First Bank of Beverly HillsCalabasasCA32069No AcquirerApril 24, 2009September 4, 2012
Michigan Heritage BankFarmington HillsMI34369Level One BankApril 24, 2009August 17, 2012
American Southern BankKennesawGA57943Bank of North GeorgiaApril 24, 2009August 17, 2012
Great Basin Bank of NevadaElkoNV33824Nevada State BankApril 17, 2009September 4, 2012
American Sterling BankSugar CreekMO8266Metcalf BankApril 17, 2009August 31, 2012
New Frontier BankGreeleyCO34881No AcquirerApril 10, 2009September 4, 2012
Cape Fear BankWilmingtonNC34639First Federal Savings and Loan AssociationApril 10, 2009August 17, 2012
Omni National BankAtlantaGA22238No AcquirerMarch 27, 2009August 17, 2012
TeamBank, NAPaolaKS4754Great Southern BankMarch 20, 2009August 17, 2012
Colorado National BankColorado SpringsCO18896Herring BankMarch 20, 2009August 17, 2012
FirstCity BankStockbridgeGA18243No AcquirerMarch 20, 2009August 17, 2012
Freedom Bank of GeorgiaCommerceGA57558Northeast Georgia BankMarch 6, 2009August 17, 2012
Security Savings BankHendersonNV34820Bank of NevadaFebruary 27, 2009September 7, 2012
Heritage Community BankGlenwoodIL20078MB Financial Bank, N.A.February 27, 2009August 17, 2012
Silver Falls BankSilvertonOR35399Citizens BankFebruary 20, 2009August 17, 2012
Pinnacle Bank of OregonBeavertonOR57342Washington Trust Bank of SpokaneFebruary 13, 2009August 17, 2012
Corn Belt Bank & Trust Co.PittsfieldIL16500The Carlinville National BankFebruary 13, 2009August 17, 2012
Riverside Bank of the Gulf CoastCape CoralFL34563TIB BankFebruary 13, 2009August 17, 2012
Sherman County BankLoup CityNE5431Heritage BankFebruary 13, 2009August 17, 2012
County BankMercedCA22574Westamerica BankFebruary 6, 2009September 4, 2012
Alliance BankCulver CityCA23124California Bank & TrustFebruary 6, 2009August 16, 2012
FirstBank Financial ServicesMcDonoughGA57017Regions BankFebruary 6, 2009August 16, 2012
Ocala National BankOcalaFL26538CenterState Bank of Florida, N.A.January 30, 2009September 4, 2012
Suburban FSBCroftonMD30763Bank of EssexJanuary 30, 2009August 16, 2012
MagnetBankSalt Lake CityUT58001No AcquirerJanuary 30, 2009August 16, 2012
1st Centennial BankRedlandsCA33025First California BankJanuary 23, 2009August 16, 2012
Bank of Clark CountyVancouverWA34959Umpqua BankJanuary 16, 2009August 16, 2012
National Bank of CommerceBerkeleyIL19733Republic Bank of ChicagoJanuary 16, 2009August 16, 2012
Sanderson State Bank
En Espanol
SandersonTX11568The Pecos County State BankDecember 12, 2008September 4, 2012
Haven Trust BankDuluthGA35379Branch Banking & Trust Company, (BB&T)December 12, 2008August 16, 2012
First Georgia Community BankJacksonGA34301United BankDecember 5, 2008August 16, 2012
PFF Bank & TrustPomonaCA28344U.S. Bank, N.A.November 21, 2008January 4, 2013
Downey Savings & LoanNewport BeachCA30968U.S. Bank, N.A.November 21, 2008January 4, 2013
Community BankLoganvilleGA16490Bank of EssexNovember 21, 2008September 4, 2012
Security Pacific BankLos AngelesCA23595Pacific Western BankNovember 7, 2008August 28, 2012
Franklin Bank, SSBHoustonTX26870Prosperity BankNovember 7, 2008August 16, 2012
Freedom BankBradentonFL57930Fifth Third BankOctober 31, 2008August 16, 2012
Alpha Bank & TrustAlpharettaGA58241Stearns Bank, N.A.October 24, 2008August 16, 2012
Meridian BankEldredIL13789National BankOctober 10, 2008May 31, 2012
Main Street BankNorthvilleMI57654Monroe Bank & TrustOctober 10, 2008August 16, 2012
Washington Mutual Bank
(Including its subsidiary Washington Mutual Bank FSB)
HendersonNV32633JP Morgan Chase BankSeptember 25, 2008August 16, 2012
AmeribankNorthforkWV6782The Citizens Savings Bank

Pioneer Community Bank, Inc.
September 19, 2008August 16, 2012
Silver State Bank
En Espanol
HendersonNV34194Nevada State BankSeptember 5, 2008August 16, 2012
Integrity BankAlpharettaGA35469Regions BankAugust 29, 2008August 16, 2012
Columbian Bank & TrustTopekaKS22728Citizens Bank & TrustAugust 22, 2008August 16, 2012
First Priority BankBradentonFL57523SunTrust BankAugust 1, 2008August 16, 2012
First Heritage Bank, NANewport BeachCA57961Mutual of Omaha BankJuly 25, 2008August 28, 2012
First National Bank of NevadaRenoNV27011Mutual of Omaha BankJuly 25, 2008August 28, 2012
IndyMac BankPasadenaCA29730OneWest Bank, FSBJuly 11, 2008August 28, 2012
First Integrity Bank, NAStaplesMN12736First International Bank and TrustMay 30, 2008August 28, 2012
ANB Financial, NABentonvilleAR33901Pulaski Bank and Trust CompanyMay 9, 2008August 28, 2012
Hume BankHumeMO1971Security BankMarch 7, 2008August 28, 2012
Douglass National BankKansas CityMO24660Liberty Bank and Trust CompanyJanuary 25, 2008October 26, 2012
Miami Valley BankLakeviewOH16848The Citizens Banking CompanyOctober 4, 2007August 28, 2012
NetBankAlpharettaGA32575ING DIRECTSeptember 28, 2007August 28, 2012
Metropolitan Savings BankPittsburghPA35353Allegheny Valley Bank of PittsburghFebruary 2, 2007October 27, 2010
Bank of EphraimEphraimUT1249Far West BankJune 25, 2004April 9, 2008
Reliance BankWhite PlainsNY26778Union State BankMarch 19, 2004April 9, 2008
Guaranty National Bank of TallahasseeTallahasseeFL26838Hancock Bank of FloridaMarch 12, 2004June 5, 2012
Dollar Savings BankNewarkNJ31330No AcquirerFebruary 14, 2004April 9, 2008
Pulaski Savings BankPhiladelphiaPA27203Earthstar BankNovember 14, 2003July 22, 2005
First National Bank of BlanchardvilleBlanchardvilleWI11639The Park BankMay 9, 2003June 5, 2012
Southern Pacific BankTorranceCA27094Beal BankFebruary 7, 2003October 20, 2008
Farmers Bank of CheneyvilleCheneyvilleLA16445Sabine State Bank & TrustDecember 17, 2002October 20, 2004
Bank of AlamoAlamoTN9961No AcquirerNovember 8, 2002March 18, 2005
AmTrade International Bank
En Espanol
AtlantaGA33784No AcquirerSeptember 30, 2002September 11, 2006
Universal Federal Savings BankChicagoIL29355Chicago Community BankJune 27, 2002April 9, 2008
Connecticut Bank of CommerceStamfordCT19183Hudson United BankJune 26, 2002February 14, 2012
New Century BankShelby TownshipMI34979No AcquirerMarch 28, 2002March 18, 2005
Net 1st National BankBoca RatonFL26652Bank Leumi USAMarch 1, 2002April 9, 2008
NextBank, NAPhoenixAZ22314No AcquirerFebruary 7, 2002August 27, 2010
Oakwood Deposit Bank Co.OakwoodOH8966The State Bank & Trust CompanyFebruary 1, 2002October 25, 2012
Bank of Sierra BlancaSierra BlancaTX22002The Security State Bank of PecosJanuary 18, 2002November 6, 2003
Hamilton Bank, NA
En Espanol
MiamiFL24382Israel Discount Bank of New YorkJanuary 11, 2002June 5, 2012
Sinclair National BankGravetteAR34248Delta Trust & BankSeptember 7, 2001February 10, 2004
Superior Bank, FSBHinsdaleIL32646Superior Federal, FSBJuly 27, 2001June 5, 2012
Malta National BankMaltaOH6629North Valley BankMay 3, 2001November 18, 2002
First Alliance Bank & Trust Co.ManchesterNH34264Southern New Hampshire Bank & TrustFebruary 2, 2001February 18, 2003
National State Bank of MetropolisMetropolisIL3815Banterra Bank of MarionDecember 14, 2000March 17, 2005
Bank of HonoluluHonoluluHI21029Bank of the OrientOctober 13, 2000March 17, 2005
diff --git a/pandas/io/tests/data/computer_sales_page.html b/pandas/io/tests/data/computer_sales_page.html new file mode 100644 index 00000000..ff2b031b --- /dev/null +++ b/pandas/io/tests/data/computer_sales_page.html @@ -0,0 +1,619 @@
 
                                                      Three months ended    Six months ended
                                                            April 30             April 30
                                                        2013       2012       2013       2012
                                                                   In millions
Net revenue:
Notebooks                                             $3,718     $4,900     $7,846     $9,842
Desktops                                               3,103      3,827      6,424      7,033
Workstations                                             521        537      1,056      1,072
Other                                                    242        206        462        415
Personal Systems                                       7,584      9,470     15,788     18,362
Supplies                                               4,122      4,060      8,015      8,139
Commercial Hardware                                    1,398      1,479      2,752      2,968
Consumer Hardware                                        561        593      1,240      1,283
Printing                                               6,081      6,132     12,007     12,390
Printing and Personal Systems Group                   13,665     15,602     27,795     30,752
Industry Standard Servers                              2,806      3,186      5,800      6,258
Technology Services                                    2,272      2,335      4,515      4,599
Storage                                                  857        990      1,690      1,945
Networking                                               618        614      1,226      1,200
Business Critical Systems                                266        421        572        826
Enterprise Group                                       6,819      7,546     13,803     14,828
Infrastructure Technology Outsourcing                  3,721      3,954      7,457      7,934
Application and Business Services                      2,278      2,535      4,461      4,926
Enterprise Services                                    5,999      6,489     11,918     12,860
Software                                                 941        970      1,867      1,916
HP Financial Services                                    881        968      1,838      1,918
Corporate Investments                                     10          7         14         37
Total segments                                        28,315     31,582     57,235     62,311
Eliminations of intersegment net revenue and other      (733)      (889)    (1,294)    (1,582)
Total HP consolidated net revenue                    $27,582    $30,693    $55,941    $60,729
          
diff --git a/pandas/io/tests/data/gbq_fake_job.txt b/pandas/io/tests/data/gbq_fake_job.txt new file mode 100644 index 00000000..2a0f09bc --- /dev/null +++ b/pandas/io/tests/data/gbq_fake_job.txt @@ -0,0 +1 @@ +{u'status': {u'state': u'DONE'}, u'kind': u'bigquery#job', u'statistics': {u'query': {u'cacheHit': True, u'totalBytesProcessed': u'0'}, u'endTime': u'1377668744674', u'totalBytesProcessed': u'0', u'startTime': u'1377668744466'}, u'jobReference': {u'projectId': u'57288129629', u'jobId': u'bqjob_r5f956972f0190bdf_00000140c374bf42_2'}, u'etag': u'"4PTsVxg68bQkQs1RJ1Ndewqkgg4/oO4VmgFrAku4N6FWci9s7iFIftc"', u'configuration': {u'query': {u'createDisposition': u'CREATE_IF_NEEDED', u'query': u'SELECT * FROM [publicdata:samples.shakespeare]', u'writeDisposition': u'WRITE_TRUNCATE', u'destinationTable': {u'projectId': u'57288129629', u'tableId': u'anonb5ec450da88eeeb78a27784ea482ee75a146d442', u'datasetId': u'_d0b4f5f0d50dc68a3eb0fa6cba66a9a8687d9253'}}}, u'id': u'57288129629:bqjob_r5f956972f0190bdf_00000140c374bf42_2', u'selfLink': u'https://www.googleapis.com/bigquery/v2/projects/57288129629/jobs/bqjob_r5f956972f0190bdf_00000140c374bf42_2'} \ No newline at end of file diff --git a/pandas/io/tests/data/html_encoding/chinese_utf16.html b/pandas/io/tests/data/html_encoding/chinese_utf16.html new file mode 100644 index 0000000000000000000000000000000000000000..59fffc0d19c5731ad445d2b1c15b68ef5337e8ab GIT binary patch literal 824 zcmb7DNeaS15UjJW7`%%{@Z$CY?}8T@jT%7>PCO`nKtCYfe3cLJ18TLyXc7_z2}wHD z)z#HK7=wp$7w}W@1jnIM7DQ6-|#YmLWb2f zUtQNHJC#f-+X^{P+JWG|+W$Ps&J-%~aXUzC*X$PE^gdpG0#{e=t9TQx&i1>VuBZFi k@2i~mzprND31xjfhONC?4{`%n6K0~fk)=czo17f}C$`Vc17Z4c6m zM^#4s{23V;S>Im~yAg3O8gVSH#hECA@j%>(Q$d{fJ+UG-+Go-8L@*bhQHyoVCGK_3 z^^Om*8-Xu=`=TdGi*viw7!7*7YF$wbtjC)%d9dcA@V=ZAaXcRc@2diTbXVd++=^{c z1fGtQ4|8@zSKdYOjPqh1xiel9csQ2#T@=sM^Vju#Q4h{;T92%w4t4$W_rJ~xE&FzF zah{w{?O7B9>$}#q#!qWq(?#m!KEu1GOYk&|I8a07y;?KP*;zZUUtV`oEnI_1nsN1- zr6+2SyubJ7h&S=^@;~*O$8Q<++Ej1kb=-He_QPjCV(T#C_2{>sYq|`-ypOKl$=}D_ P-K=R;8+88fa=(87IBImA literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/html_encoding/chinese_utf8.html b/pandas/io/tests/data/html_encoding/chinese_utf8.html new file mode 100644 index 00000000..ad1ca33a --- /dev/null +++ b/pandas/io/tests/data/html_encoding/chinese_utf8.html @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + +
   0   1
0 漊煻獌 漊煻獌
1 袟袘觕 袟袘觕
2 埱娵徖 埱娵徖
\ No newline at end of file diff --git a/pandas/io/tests/data/html_encoding/letz_latin1.html b/pandas/io/tests/data/html_encoding/letz_latin1.html new file mode 100644 index 00000000..7b4b99cb --- /dev/null +++ b/pandas/io/tests/data/html_encoding/letz_latin1.html @@ -0,0 +1,26 @@
   0   1
0 Gét Gét
1
2 iech iech
\ No newline at end of file diff --git a/pandas/io/tests/data/iris.csv b/pandas/io/tests/data/iris.csv new file mode 100644 index 00000000..c19b9c36 --- /dev/null +++ b/pandas/io/tests/data/iris.csv @@ -0,0 +1,151 @@ +SepalLength,SepalWidth,PetalLength,PetalWidth,Name +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica 
+7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/pandas/io/tests/data/legacy_hdf/legacy.h5 b/pandas/io/tests/data/legacy_hdf/legacy.h5 new file mode 100644 index 0000000000000000000000000000000000000000..38b822dd169945b5f5022a219cc784a4c9951320 GIT binary patch literal 14928 zcmeHN2|Sfq8^0(aNlls-)tzdjME0fVxU%M2vRxdOIp-4O%o+G)mJFVB)2b> zR!SU1C>2wKk%ViP>|4lp-|f3TrP24zeA85~pZ9;3=RNN^&w0-Cod0=msJ^!D6rnjn zB;*nhAWa~NqDkU5j9;cJu@WLr)H~n_9vt`K$l}EkJS2V+Qa1}P7scBr^I-)Ic)5j{ zjt+?xV&MLp&<5Sx#dQlO6X@v4 zboJ-jvnC<%foN`QqRqpJUbwlE1SY({0m44XV(*tD`rjx1emOE;okBGHe({gCmWa=V zGq5h2;yva1qmRSi@o+pO>H~2{Nc3A2alo7lu`T4{<5&bDf)pd0oQZgzmzNiPQh?(x zkpv%)=wuFXuH|@(SZ+FQQAnYL8L#nZJ&g3@h=~&qQ_RdLHk>?H{%t>wm?$AHFQdT8 zjg%jsp3#RmH_ZNd|84&%#6lnhMj>IQ`*3}Ys{U-VOdXnvA;WUrCc2Wbs;$4VqShJ} zIjX5?fRhr{z{Fpkrf2DE%kXtIwAHtuu3=dC2T-?K(3q;$dQMs@6cq;3NZU8qc8i|5 zk-s5b(NLKYpcD|`tK{WG*L77=Hc{T>>`Hf)w=&Z6u-37qF^z1Tf&y*b6cv2dIT^?) zY+7UKq2;ckX1>nD*2%!%YK@Y=v5m67vBD->McqJy&1tzUG$3 zI*yK;186Q@pNf@cND*qJh?G2|6krD7KXgv^Qm=)}oqlUGwV+&J~O5cICj* zZ3za2T@7%CnqcS7NhPrKzVO7_;w)%hQd0SAPAL#ns~O-CZ-)!geN}8D?f{aDII|pC z85vqzQ52`|L~%>R9u%M62t~0$G7?2?lNc0ZnLndA7|lYFmUR(DPo*>z^VVmfxOjIi zin~|^D3(7eLa~ifg(B6x4n@|%W)vsfXhYHASvQIS#uJ|-{VNO*LUDQYR227R&OouI zem07Iwo6g8_)Z4Jqyz;Ng&(P+=-#D)Am6H*nxpxH@cG52z&*pcu*kXnY4D6bSjueJ zb|fepyf|qhq?%F&9d15}kIHxfLlQ#r{F|Zy%cxH*n%w{rCy*q>O5ecU*L-{u=)VDx z74z!y5*lE)Mc>BRU)O-C8fUdv);)s9&UA0DkS>FEGn4E#oahG^=1(r`yD$Xp?b3gU zF{^?_#mfd(@TCF%FvI%P^-(}Tl=n>X+-D#(M_R_ZI~59xSlOOl(+LNbP{rBOU2x@G z+bP~bselrebh>&OoVZ&L%t5fqn zyZNPPt|b3$z!~kUW{Ol4H5i_JP$EU5=nCbhFrcP zjJCsY^mcLd-xv_ga76`>u!K+vB0g$61~}^aIeKBWNcp?@3Rwngf4qF9bOk0)?DMeP zm;2|U0b~t~94imb_jz!`n~zt1sNHkrksYT1X)}lLAWb25J#wGuAK*v#c0=r05Vdtx z{JmrKgAd=YP9phZ(;jV#XpOmW^R zOp--MjPM3zKHlYTM$U`5s~;WXrSJALm0x2D9Iu}_F&mR7_T9Mpzrbgrfw9g-_ThF! 
zsNs)37a@3YtT^@t8vY{gy86(#kI9eZy(8oPe|vt(Lz;}mdffjJ{jQvYb(qWNMNv8A zT$1pAh{V`_Z}}9gg};~I8_nnU{mur3@GG>r*l54pF%66rN4fStzPQ4be>=G5fA2AhT7S)Q2Cm-{EDfzffDjE~R(wvRLg{NYV{=NJk7T1x{Vwdb9JTV4fdmIwU zfjou)UQGC(6N}$3_o>wWb#Yw?wNn}%SVs2)r=9=t-3bvLdw)8J+cEJZHbb8pFX#HD zxA#vYc{lew`c^5s8AQHx5|XJbfk|X(E&H4% zaQ^ZJT`QLs&e?tU9owU?p|l+TuR8(4zu83%;mnN(w*>z zaonK#&H->cQAuqmGXs{{x5*7|dkJ@_UcILGT_sd^lrpEfJq2+hZ#vKK?Sm}WBX{eL zS3>&4V;&tBs=!sZZjJ2kSztwE((`2YSs+81)*r!}1`gD0$l4{526@W;4d!_~ftUFQ zXY2F6fT;^3ykgqw;NI=5r$JAj1Njv$p356`EsK<~ot!xElTJ1qTrc8xes44MovI^g*j@^L zig-kQl2QQV*H;&a=rzFRy^^p_fQwL(Q+hY~rA_posTSZ9zgWq+Jd@pTpuLmc6>8NE#b;eRs2JS{dG#3a*_R4HB14?^t9~UoK>eO&|N1eI1pSuP@!0*L2D|tF*LH-l$TM=GQ2^FkY8tclco|j2e>lmA(87q>#g^u6I^Lnad?YafwghNv2`3pFSH*77_Tt zG`tdiZr=_G-d2fkbXd}uEBjF1hJ5!ia^xJ66B|XYNBoldqJhsw17q>bJbYadiukj5 zCT$t^1{(e%?tbp$=9y$Utiy!+_;8--z7%WW@69t$;CjHP?=2vBW*+|i{4D{z0NaIg zGT6`6#uEEv%184%f_HH3e|+bdT=}>8&%4i$|7f0xetPE23&EJMDf)l&9IhTd)NlOn zdVV+0#J;<0{+rKZ7XHWc%tOkU#{XKL`Dg3@lUev*2J)9^pky^Kxc}@KX#Tm8-d*E6 zUGwW;!?CP=Cpmnt9+t-J>}ml&9&epOmP-J!?hcGxk8+s4%P_<(z7x)EbW$*w*9aay zc~u>@{yw<+u(sc30vpEMuav1=R0XNM$>Jt^TEY3;_Wc*<6oPuWc7fyO9bms{_T7rs zSFlt3)PtNDrw9I9t3jdI$><+ezJ`19^|XfVs-V=JoCVvGdO?=N#YHyWzW{qpSiF2e zEszYk{b2A|Jx~ysrt3{@0z6UIQqGd^fmTJv-lMv$;2)i~X@{;S0K343@8m3dfpht` zqmifYf(=!V8`xXeFvF&g{5_=#lz7s7ww-SW`aAk}2(Ntw*{-HLX!onYy!Gl^4((_F zvZu~IIrQ=gbU&T_;^d)PxKvtVi|*lWu(@jcw~>!{GCDTNOpZVI3W$6iQhM064SYp) z6MQ;ltuv5t^$X0lVP*>1@ebc!d_0x?i9g773V>!UxKr z+ghXP*Y|Wo+5@RW^$%{t#i?E4TCg4%T=jc(di^y(qu5^d=^cQASsFlS#ubp@F!P>y zOB?JPvba}t^(;K2bP6O!6v81dp`FrlQK0X5wm_(CCS2^Uus7g!4M^O*Dk!XY7`UYQ z(+xY*;R*5liEW_|U_|hip)A!dC{etOA?QrXXg+Ls>e!2VsC{f1Fwd_52f{br-TpKa z9L;fy6mzWrb`Q7aR;yP7i+Hn+21Pd1F7mLf%pCyYB7&6zce0_Lg44AMhblOHxZ;5Q z<3iY4U(@HY_9f&yAtjX_(+FO=#b!&LZUzZ`hWl?l=mJ-7?Dd;5aR8V+HVqNqT?bko x8?i3QR6}yy+Rzlh2C-)qD{|JB!30;^z>@kW@LtB&8lx@sKrE{CS2Ol6@LyVSHP`?E literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_hdf/legacy_0.10.h5 b/pandas/io/tests/data/legacy_hdf/legacy_0.10.h5 new file mode 100644 index 0000000000000000000000000000000000000000..b1439ef16361abbc0756fbf7d344fd65d8a1a473 GIT binary patch literal 238321 zcmeEP1zZ(P*WW9Mh@u#X2_kkN4d&3&T_)1fDF_yJcOWK;ofw$lV1pvkCDIMjDWKo% z-Q9bENA}+5jqmsV-q%&`Y|MXW&Y3f3&YUy5yGKi1Q&3>60ETb;{1`7L#N4HRevw=K zB_8G}>X^E|n%rTK>25N;-jCQ}V0;*UZ8Ujah`j$q;&xa$dEU%KLjyxOsryqYc^Ql4 zBlf8KQ0M=o{~H|8(oiwNDUBe>WgejY%+16JO0ISX6UXa_1RIP^XYS#0$dCS+X`AUG zLBmF%KU^*;xn*WTKiYl{?kf0?D*mH}4I|~Xw_N|-`}WSSrl(?Rips<5(I5R$^;lwJ zXpYmy&X3ocAG6RfG1WFSVCKiW4wYY8QVJd6`IVR{1NAgKztK`N6;(YAQ)^8_6MYpk z>%QZevD$y#LUMe1*g z6V4v?4(rf?t(&KvtCY3BjoVrWU-q#x>U{ViX0y)OR~n^M2K*S8cuM|-@JwhkH84~s z`Z#gS&r76z{ZhEq6Z^_RoTudYup@mwiq6p+>GMq|FXxP;&o`R9ypTga`n&x7tCYTz zBKc=Twi{a?ai>QIV}xG9^$Miidf2!-;9SS^ch}PflK5F%PgC^q zqkWX0Sy5jM*R$w9+LW2U6kn*6jq6X~?48~5jD6uqb=|Ezyfl8y%f`dr#@E^}(2H43 zW8Ej2_yqj zd!qjYnK_%1w6nEp5oPbWOrnLGuh0v`NAQ&GC~*V$Egdi}8U9XxFnf8T+pnh%%h zOhc)x(`be1DBaq2dpn*c?NLd=(~0BK#b=<@^R|l@o~~FueZ@?a-l~tfiKjod8n@%A z_Q=vDvru~KO3+C>-LX_@>ui*!R>mjdsl)~wu{kL95o~eC(^u#BM&apQ6Rqo#DD5iB zX~9#ywcZ+YQF?}t?>L@ThMX>(D|qOb%@(b_89*zvzoFW&c(_Fto=%wF2vY}K)qPd> z5{x#MD2lgjfb+g@D_$i&L^%unH}$4 z0n(&$ecJCA0zRXbyOM1U@bpu^x8+zp*r&>$n}7NPc%{!X(rRfbs2On6r8A%w)@c+L z#qX{M(=Ke-F>hfB=!>sw^t|SoSgHmo>KLr?@X=J)mDMuRRB~6a@K$kJE90VPr6Rk` z&);&bgR8cgbCBjb8!aCeoHRnm`+GsZJPf21jSPIAz?(=zW?L^9+{byx@;89<8 zn2L#s%2HHkHa*K9>5+>@#w(Vlh8law9r=;PpxpTOHQ0sT>Rva|E_3baPE6#?{P=jq|= z=d;$%&(lZ1OIl#M9bsKd`Vm&Nq^F~!uY;ejfa+RbXAcKoUv-;60WTQ=J8Ns(wa#vS z&K}m*0-m-m4t9P5Ua|rPekNXW0@LliOnxC{G+YR3c zIVJWnthdf&_hW8#~{Ef>1$=8fynWd>eT=0{`{w({XQ6(>p% 
zEy8VFF>xx9pBT-k*8G$HuW+FE98!xUJa$q~4*aX*ic~yo_e7$c>tuQC@s?XVTZ(yu zKk+bvkN?T>uGq1^ThYCgl={E@wqjF6+=zrEvOyXAZ5=y^=^N(`GW)AM}q zqlx5+$-{eJ>3K{t4;wwB=lR}8{N#z4DZQ`sJZ6n|r2nMn)qmd+TfhBvUfO;Bf&5?p zmHti2kI;9&;xs=X`~;L=hw2}c51ZZJrq3i=F@|zEFb{k8H+Fk)=g+13`SxMN-NT6F zZ}(3UhVhpPwtbpW16C*bCo7nB!4+01#Y;EmgUaof`JZh02(?#4YDJhQLiahvS`UW( zgsE>#RD-}>IKEz1cF3nw9lX_WEj<>Auxq zTe-LGpn3JcYLb}zFR^qecxim|5Saofqat!6{Co^Js4Zb!|Dh5VI%^I2wxbRj?(@x^ z99IHd7mK-l-CPbV*6@{wDZK)h>;~4`&CCJ3?-Fm^^vMA>Z?8V?e4Gzb4xL(Gd9@9` zo*2SAcuE7<`ouduKDHj-UdhODvB`mLzB4|2tFMQ`mpiAO`_u|o6c@iR=P3fpm(o<` zWmiK5$*;CI9ExDfhr@e=Q{thT$R`(%qg^m|#t6-E8J(bB>K%AKq7EK7I`u`r+-6{M zE?Ib3OdO~iza%d%D-|5Oco(W{`3enkopgrRCBy3ZEoq6r8la`u#jYD6)sR2j#?rYW z3JU(}7r1j$88Ck59QAZS3fTTZ-)>*wFKF6mX*FOU7V}8B|F4-zMR42d#wU?&Mev3C z!hvqiHDJ)*(vF0PG@vud@_FT@LU7UWQSgbRF4$PR+(EP+$;Fw_l2q3itfQ; zVo%)^(d*0wr$2n%niW|Jz0V}i{;bso-_4!lxBpc%thng8eD%F1aO(bY5Ao-v@WL{Xz;}(QhNe%TlZma1fsVPeo2#XX)EXlPcU>PJ zO+OjOKqYe@KWR%D4HZ3U&v|}Uy6U!m8gAakn%?g6{*G45*1G%I2dXXechPrK^j7o? zG}6&m(DN`^=dGmVp9Oi$iz`)nY*o!jj$r8=I{ zOSISMYI&&WnYh}j>#ft+Jz`GniTzhVC*RKrYgTzdal0-VfOJSr*3z=|$%I?-KtQB}&ZFRnJqzX(8J>ER2umQL}*6UE?&jIQK z(?yea*8*J&y<&%N%^+CvRX=rrV(1(9Khh0qhazL&ZTD8G1IqmK8=XsIz}kjziPO2w zaOHacy1ApSfJfKml_eh)fUAyr23-eU!v{&0Pp1#gfWBKgVr`>4pxhi8&xp^7;N!wK zUq%clfp}*Z=e{ zZq`n5dR|6(G0`rRKcGh^`E&4J&jHH6yjqbU)t>CNz4bn4edxu!0lwx5mrHUTkEa{4 zjEA|7X$i_fIrXNy;R}|~G9c4w%x5H~3HYIvNjB59WzZRS8y8O>Yy6Q7{Io27S{6U; zj8-MFPRj|{pan#OvFl{HgkRLd)5Dr5+h!g9909(5D)_JEqSZ)ls65kYZqjIR5-I`h z$)G)SQ?w_G_T=$Bv^WaCu0ZbNITZ1ITTf3WP>DLgugt>_&^iemftEWF2lxfF>dD5> zOB#_UiRw#@8+|_yf%{1*K6#+$EUc-8Y3Tev?gysICEAC1OZHP7NQSWd+B^h{crYnu=zS_$+<>c4)&pZ` zALj+`!)J>zIzStKuKT(lwmdU0WB39q9E%s(gKXp^#uxwKjyimq7JeO1@eS{;L>2ja zV$eftxGxyT;GRG{59Q$d-I{_PsGec|*eA&LG#ep{ErV64(Fy{yh$` z)&Y_WO4blU#d;q8pSRnP5TlKOq@1}1T3bU;-CD}pKt*2zKaa}@B{%pn)t}hvkT;Ec9vwQinJ@DQ2W{*CBNd7%ujypW+!@tMNbB9O$ z`1g1P?(nEjv&Wn6^h;T5YU?q3BHnH|?DiTCbPYzS-9?c1ke9h{O{yN zhK2uo<;t>f-Sy?~%T>ltqxuLU-AdJ4j)ni7?j-UvVXU9u(=X4u&p$@L0t??a{S@E$ zP#f@L1}27Q2!8>KHMP3=uk!=;x*y6u7h=vU z*mG_IjN3`w{VCM*W9;X_D0_*-v+RF1zWcm{T`q+q{7$6ZFJ)~{yxIaU7dK%jyB7Bt z;mtJ|Gr|*eo_H?_3rCWQ%`VY;6AZsE>5_`gDH| z!_m#t#t$)}kNQYctjFtvFA<~YApJT_?-#Q&R82^?8R3=dK5ytzADQm;!TllBeUaZx z7GDB4n0)H87k^~B*GH@yj;s&qc4O8E?X$Vl2VZ2n*Mq;?{fPFJk?W%#vK;C`It-ch zK>OY6LHVmVJ>RN;YJ*{%{PtEB%xp;J|7`z@9O%6sU5I3Ym2ppQP_ARDpK0~H{I`#& zc)@5x0;)uo*L(iRIbN`uc>}t50lsvWsyDNr6Y&DJitiaTU}65AL1L-)r~Lfg>))e}vgmC@B!;r3C=5;$?#id)0%4Zu0B!lCf4^H? 
zAb2@(X(O_$$TvK&aS?XJ#eDL{z2gG0X2kVBzkiPdz2gGwNI2!s`{&~Vl>Oe*zuY}8 z@b~@uH1Pq}C_xa?-A~JoOZ1En{C)pEm+^r<{rg!IBwtxR0^V75a{QsPvp6&1d$LMGK`TM4y z;v08`wzK|Se)_aN{Pa`9d$uI5jPZ@7kcq=TzkfLJhXa2&@P`9`IPix9|3w^N z+lznv?Rc{?&}*CUc>_7$<-*@?#gv9#{_P(AcHD7ASy`lin{q=j9m|v(in(l+azl}h zceXv=qh5df?#x!Ad)Fx8ckk<+jYiJQX2aS@{{kofe1toZ|Brv3;KTpgZ(siTwlzID z@$d4_KXoL^$t26`J#XafpVwmEfX+WZ2Ic6Xr|`Fb;!EJsr!`2|e7E>y1bM%{{qQ^f zdO!4x0I^PAWg5P4968_7FY>yhUv=g$8E|8&LNegS^v+WOQ>J&G3Yaq2@#6>hI&$Vh zell*zT*ptw4bd`wv{)al-zVb-l%Jw+|9f};6!U#8-5l_}^)dPV11`Q}K$Y9~dVR|O ziZiyG|D9Qd|2_Y^7*iIlNP+yT|GkcReJTn%t08_-?`gt$hx=&oH3B_+*2q1BpTm9R zxGNgJE$Nb`U2~7G;m0u((I1W>-6PoV{82s1L%uxxfm9qzwJYT#@9u-@fkFPeMI?rk zU%kx(Wkj15Nklilx{(iYDH&Z-BX-$-_23}l%&9*7>RQZI4HW86$}I+Q+!A+hcI?*90m{`9}^59iXK-lspDOMiNw{%|h+>8L-k^>ugp+5Ygp z{kfFB7DV@;){V3M;QyFEo9z$(o&F%&m)Xp$pYH9)t_RyM-d&gc9{)SJVf(}Trk}!b z=g+3(y|-QR@A7B2^x@C$xI}4Lf`zye#5a~ABo6=l{^7tM4*cQ39}fKCz#k6$7jb}X z?{$A~0XOT;DSM}fpOuUC-@o<$N_AgX{2Tu-<<4OGe<^nc)9FFEGwA)lT==Cae@~Bf z={@|?zgc(vd$$ZCfX&w){Q2XD=GqVa`|p;=lIAMWTAbI3$7~}yEB}8{pZ!A$_Z;ZU z4;{R@=RIuEw{P){bf5jx-@c;mZbR=LttoPbQM8 z*J|=U|LTNV*RD){KY2CDA8zNT|M+`G1erV^O%g<{gXPDa;b`+B`#Yz2)9&3we(FBN z(VyS{6%O?7=gK7EYe@Z1@uB;7squ5J!h=RNV~aBwIaN7%vK$N#32{P}mi--`S1HlUCh>-mgC{9!44BL`$w9Em}_kqM6u zu(#gN2lRe#B%8lI>E$fvRDSN>W6Irk8~T%9x5KcQ5H=0J+ashNy>BY?cURfx2q{P3 z9Y4!H_q#oQ^5nW*zq`slN9o_hz3g-UNqcO4^`HKJPWpcJkA9byT0cbf4^|wyUq5D# z;ru<$iOl9?kR5}HA5-xXs=u)NUH(3z{gv@$2&}Dlzhk!xcXU&B=tQy~gpqgdeIHK! z)Y|Q$;k>sEplwgUT~*tB*w#<>{cF!QP_#Yc+UG+h@WqOxVaFZXK#a_aZSEt!!ew0x zEVk)IgAv7BR_CWxLGv7K-ltg|@Y2logZ^K(!e_%}3*-~qp-|V{;M}5Iz_X=3TIX#F z3~!QE7%FrZ%yZuDBxh0yZ+mQVu2(65ftU7)SA}K(&FSe616)(UM|shqk&?0Ct55aZeCv_H>|W#_UYH37$WI&g)YNyAx92lg$oUk4g&rl1loL*P4Bd8FRmll9^=h|UrmgJY&mWkl5VtcVg zNg{mR3WDBSSAc@>#i3>!ia>5hyhOOjdlFn(Hg11F3QM)sr{3z+W5MZ=N0f9oQF%BxpaW1Dit^>5Ax-z_oSh`}}iqf#J?3 z**91#d^jX%hv|b_IH=^!7v-PLz%j_!@1SWr$mUJDZB+9WR0kfNqw*^SI7qlC-`W!k zIs(^rm941)l2^(_^e&e}=@q92j#^#|w)~37s+R8p*K1}4je1cHhu$rWD!W++vu(F| zoa6Zh&sDt1A26^2P99&n?|VxwoG3W;OYNg7$Y_+`W%aV3^14g&g&nW;Q@$Vbi&2o* z0bkYLif}>iy7zB8`2OZ4!I&Y@%P;1iONXBVTrARdJc8vrhhC5?ZinJImxn0$R6()H zyyA6!*AE8>H^EQ3YM=Qg zRKpjGjY89&G{CzBvx{aby@rjm%6-Ehr9jc(56Y#VKZ7fiv#Z+D%m>=OHC?D$soac^O21ci@~Eo&6n)06JTglm1xM_HaMvM=*@%X2Z7{rbzSL# zuV8_@UeWz|EpSK8Q-{2?0vJ5pW^=)g0?4=i#X(2(oe;s(sUf@Nf51b7Ewl8p>)`t6PK;pKJ-~DfLroDIhYl}BXgA(oGxhtuiMWCobV`vhQ8kt6#Tw}mr{yneqJpF zDvZhMqw|X4p^d@gwbiO&amCIZ!@noM8Ot*zN)zirqiWqo-w6dEqHgEpct#z3>0MMS zZS(^nA>e#Rs`+8c^R+1oduRB4cwnC(*h+2mwgpo zej8*rMBaVia2cA{kJZ_u*9_)1deztEmBOLIN%hj(KEdR*iWh4oGof~>FW>ob(ZHbQ z(F60J4CTUdL!Hb1d9YRHd<=g;G8|A*ENVWW98TyD1%}9d2HTH`WB1=Tz|;w)7o_4! 
z;Y2Id^ViS42MJG1<)sZO!F}sX;=GZ$(C?ex26f>&C_76^xW1+Wu8#g5^*X8!td1J~ zJosG$7zxvWzg{66rlMG$|EwH@9Ltd#i~3pO(X&QJPGtiRhec|$+c0GV>DAg_f(oGA z%YrXE=3+7ZX6br~eD8$ok9DaBoT~$E=B-PIZ%u`Z&2y^#HF%X@-Fx>^leZqagdQ?` zG5aSR^JVaciGERV;+L;RAUhd+E$@G5#@jOJyLqh0I-L%%>D#3mO|fRsAO{mcT@>1zh^K}?Pnw0`!uX@zI+o{p~!nleM=}fbvb>-8SQo;|3T=yh*bkz zUvbj)MO_n|{oM7sz|>M0`XKtQ)SE)k*mm#C*}Ph4Fis$O?lbyhThVG1EIQo{bg+y2=w@Z_T%on{?S@RVMQ!-GRGWRq@F=h-3v_pR<|5Uqz- zqZ&L{94my)Q7R=Tw&uX7ev?&iNR^=Waa!xwm$Il0T;4nV#-oHX9hXN*1^jcU%gb@kO8uw z7EeG>4a|1j;UksZ3U4{>u>aDY4~Ov?Shp@H0@GAXSCnka25Ze9S@C`-1YV&rzYZ_^ z3?A&8TrNB&8uE_1qBcvv5!CHoKUpfS5a#;l2Grlcl#6YghE}@N0o55>7XR8(4SjvQ zOz-Wf2M1-R$`|LQ!`y*qW>37B2*S(;Xv{j&0{m+}jE*h44Q7?YeU-kYVE%wB7YxNP z<%-3#%QwbWfJq_+b*(q+Kw++GwM%Rf6xhE|TwCTje5#L?#cof6y!DCujh7Tc>9r># zcOR*M^FJ;6;lsERFFtg=n;O3DjaP!J*7GeTraL6j&*(VM# zV)jZm2#@ot0FgSfu_G*!fV%f~Q!x2EXi#`^I{91;G?MRk`l0rBSiLw-Ml7xg?mYbI zV%zaHC^L88vu`3{;FS0IEC?1jqq;u@o=npa)_GhH!P3TxORx<-x-k0OkS&Gd2@ zFl?_?-jo8E_&hgUG&c`OcdU@|h)9AH-uIu^bw3Ua96E2PeNi)1w~ARNR@??3Ze3lQ ztH4mMdA&!dwK5%E^_8`an|l#Hm6jU+zOxSOS=#S`fNwp_%u(p#o81P)M6HAm&if2j zm%m&1aprgEnfSn|(5n-e`-*Jy2j78tko>I1&@MRA^SLHw+6Xfr34HjK)&y}xrss;&|^6a@XeDP0EMz1;X|8~Xm!_^wm7 z0&*>&LEZGi54A>Exj6pznANRt&4^0rA1}*5lF-zV^Q|FRZ=-B5aAy{X9xf|>PwqZg zudG`+Y%8XG;oJt%_hALdesztnF8cx2pE%>%b-D%)IrQtvORzaQij5I?l=W9Mb2bb|@_!ccKLx zsgUAdaJmJu*K-YGt}FBNvGMTr^7N$~=zXm#Bd6yG^Y`Y-?EIYGQ-l`Fq3!Q{mj(AD z^fmuy&u4hasWrVHfn85xKEr7q!p{GnJ}>3YpTVYwyZIbDKez2l_=ot(3FlwuvD|&< zo|?C^+jj)nzI9Cbp?)~cS2?#YaX$un%Q{YB_wU>%Fy)RY$jBRSM42qB|RM-eI5LK1ytAiI(s+I&|>|t#!fF3$vwp} z<1EkN_78v>f2pS$(E1s)w5JDJmBEPC(VQTh-vBB!-Y;Ds)&_Qp-<>u0ODp68t%hM$ z9y2zbF|3DD>4x{JPB*}~!9F6dpVf1NR%b9KMDF~x<7zsvXlm1U5h(yO3nf;~Zu~%x zmf}00G+9uE!H9d(Ja5mFHn9K7?A)p@OgY&h{O#Bqzv$^E(5e_FJ-ooV>6PuDO(+5jQKSAWu0^( zoOSfY++f=b;AK(8SEwFMU-sq@@m);1E93SB%pQ{iZjNr*K6XI`P&YUby=hAcec1$B zm4QuOtMkKg;yZBZqo8)PR3)@i7v4~Xy!^E7O`s9o@6EEV6qja!_LY0x;tw;F6PBF3 zy)C$ao0=oK7sL;p`T0!-eE2wM$7<1EzHQz1w>B~--{x}WMjbT5IIm1@uA1jXc z0TtIXp?t7G;;K`H^koxhM7KY`#n4Wt5)kEa7pqyB4gJHjOJv6X;s%ZAHu1Xqd0Rw2 z7_a~=YkZA-S@xD=bw}rMgGO?0^y$>dZ>I{tg5*hW9o!SZ<6{fnFY5Y9kDl;cXYv%B z?)#l_k2Vkd3FXbdxgR;70Ie@9kKQ{hh8r}Z`$drRTnV#xVAMQu?Q>TXVa3wHms2H5 zxk0O8jNBya)Lfl`?Y5g8S--y?3~txmoNdwu z72eq0($s_WXaNs?p{a;&2LGP`DJeC1Qi#3=X zdb$ZT(rY7pOMhK9S`U9Nyd;p|(gt<%*H1p4z*X654D2~?{57#tO+fnWiRfn{17bSb zjh38H=fHEKY(%$E!v52`=Uc$Fn-Ao4UlziJiD`L1ORDMVzJ6Hx4zs<@Grko)I@$RSPS8~nx)xwvmp z0bCPX9|<&D=+T-Fm+4HyI`_uv- z*p$mki+#Rv>eQj<%eEp!`jxXkUAToPm{JwQRxG5XSxq7qI=69=V!9~f{ zNr}6k!{p)mZ@q02>FFlQM(sT&$~;UgAsYr+`dmEowidd9a|S=AbFjF!w%y*&q-zM^pY zniFUw=Zi$%WQ4jj!0JOuy2{6|z%WMA=Z!l#aE(ACx|2lS+(AKcXgO-R`&IFL=xgE- z8Sy-Yp65w>R8nxd8Sy0}ALndr0M&jUzs+nc02zyRJ&3Wn!VMbHEwg}cmrPX(7_78e z=I#3e_&6&0w$JinZqSHs-5-I%(u1ntE?0h;nxXkHCTCzlJBo$T_7bAz$ZmMA?XL->R_H;hRdp z;MphI!`D>8QMU&D7;>M3J}1yfuMNHE+4#Vy76zK#+8J=E4qTsS_dWi3EPacuSUr8k zOkA(Y_?7gZGmlR>`P;ITrM_9PD@bZ)o7F>Z(1`BTj!m*28{R|z5cAi2-133Y^^Z|& zPV>ajN-9w{ve!;5nYWfztRUdT|(T!pA8zRSBf2@U~v8PtvBajuMud} z-cfrJY+qEjz?Gv8Td#Rv2@C>5wFppjlH5PI=etECXQeu}*6fo9T8!o`opW+&03wMUjNnT6V$@lDTt z#r&ULaQ((Y`3YT(;LxXjrH1=BXhi~z=)V1O9egCrP;NauK*u({9{hT!oV_=WD>Rbx zyqXz1%-f5=bHn|i%U{)lRR>R9HnVS}=NeHqYHuY*^frEx6sRc#kI6-rfSjr->8_(3 zG}EaoK_?O246HV5iXddvgX@Ct48?;!!Q{2sJu07@veCRmd;S1;*rEc6 zt*{FXSyKoPjnA4I+t@~5_Ku}WTW2%rK496XwDlu6D&q-+HYdZCd(9_Go{gX{n?R%S zdwJ@^>m>$C5M(Q{bVDhZYRW zE|~FIGQ>S5lN&VB=LP9kO|)M%z{B+`NLyd6jysW21k@>*MkfHcFpF?{~Y_JiVgWGEn z*S?JXvZx7MSUBc}ooOMME-hUx#&|_fH-ScUUv_z=Zd{1Pq<*|!si|56vM;_!Y^=M( z4I1_DueBGVO0zzI#)w=KUbI%PTsz_9^!O9ppiz5&7c~*jSIULL4KbHC-f4vA((2Z{ zY2?sDeFR(FnRM%{db>k;X9?Ic>{$Mc_&1<*QTjg1$N}^tFo8yLUd2;%dWCfo7|b}F 
zV>jjpEGT%gGD@x$(2m~(8nyS!FCR4w?xw;`SyMJ$(5r%jPQ6|lsD^P9uW7^^f1@Ed)GaPqDb>*Ko>^u*i@R9IX4hCR*1eaXD9$vyzR9_zn(q zJeo5`stTl4jW6TT7|80qxQ~EpSJJIGvp}=-m+!U)k9$akR|okAAF%l zcNOKdFzKGIcC96HOfksnT)y$_h;-03h+oOPlR?V$u3_3xjJ z^FHmd?f{bDqP>gP2k6{ayO+MLhvGqg6IsF;*o`W-de8&;p4D6}e0dhCTji@xzxTeR2gL}o?ppjk^U9fQeNT(Q3zI=>y4ORyY9$AYBw_l?{ zqY4-JwN*&q_PKw)c%tj(dbs$G#K(6(zJlrR`vpz7-^>jf-NyFF5aEv&`S5fEcGz}P zCTyKM;nCD{bef4(Hlq9P`Q~fU!#{wahW4(Ln$5tNarn4^$3AY#M(rIq*RB>DQVcGX ztY?IKyaHP#Mfq**=y;CGX2nB?he*^_l*WO<1zo}8_tk^*5l>3}vl%pFC<~4Bn!XVb zk2{tEJBosz-P+O$EW&gKKW(7%fV0p@&JT*2<~w$lfHL*m(lfH<@MFOTk*lV3^D!10 z(QOzOaJIoR6O8-VdDWsd4`f?Uo@7{0HxeN<84tx6VvWYxP4nyFtwp@q(hZeBRN0_Y zK!^j~tazyXy;*bb`!<0OD}?lvPS(McV(*4lQ@W@Ft8CQXDj`p!r@2*uoR7_ei{l!> zz8TZK=Ga8hw>MEXlJgS*5h^JU-hqB_^ZdattD(w}lF{;RbUsZ~HWd$LoOiU$+L&Aj z!^8(*>GHwN5*dG=L`W{PIXw=?Ljj}>ZAJqXp@qS(lN8bZ7I&;em zLpkuAjE6E9eE!0G>p#_jyKOfD8bSn=g-S(!Ovs_r$f#@rjr6&l#<6`$-Zh~AbNL{( zxE82AVgBC3Tj}gI78;FX=Z3dVG%#YsJOlAQ{^3`khgJ3=Uq?Ef%0i>|j(TOeDKzsn zsIOhf(>S0E1|*z0o@d9QY$_g#mF?LrqIx6>I%imJ5kAlfM~LR`b`7VSL8G!+@lcm= ztF2#PE0}-VM1s$_9*&i7&A+{kF51FEqxMeSZ;|AxTLm3Pj1tSamj_}igRB|f=zLr( zG@@H(#^KbcsuD0sP+`^$DptOO){0HL3re4;O#6%XCGDC}nbm`JE}BPGwtt_&VueJ*3k5;{K_zhWhwy{^wb3c5cVLFg$`=V~bfZ5i zn-veO$=@y))%5~Of8SP!4Q>WQkK37+2ei<)7=cE9rn8|V-+Zqx0hU9c-Kp6Pp#93? zs}oF{=xa{KLm7-u2U0fa&TIg81I$+@C>4T+*>!h;c@>R>vfx;bpLNVrzz2`Z$Vrm(9C(s%1LwT zn&HDXnVMs3D#7gS)1^hj=w>x6G#bC9_iAm4(D(t&G?&1W-*TYD&tFF>rqZp3VxbY; zHK+T%@PnhESeJx#d`unW4SMYSup@(>ZZaN# zbAv{e*gI_F(Nz#Vo#(93G+d|=o)NgPZ0sUBPZ0tUWg|HcJKR5A$Ug^`y-nC+7r+}c z@YJ`eexh-ZwttiHPzL52{~^7#G!@)dFj}-|QZbxvl^t_L{4;&c$#^KnC_QQS#dAeI ztO=R1;E`@7Fqpsl++7zAy@VAH9pg87-JD2-KI_efXHjP1#7V1*=}T zy1uvwgzO3*`n0PCxJQgjv>hMM4I0fu!$;Jfce+~xC)^!=#ARdy7%um|3PVyu%QY$< ziVZP+$*3Ch5KK*ceS^oh4u(6Vy{J0Op_dS7G=7I%%{scxuNpWWiX6LO@qm~ydLIwX z*iEg_V#A!_!8h#?s@Ivi+E7KM?Pt^ULif*R6G=GtM_?0sHy=>I5}_4 zI=v3KJTE7B$P*`eG%FtZ*dv2)Wa(SzH;n&s(7|_bg}XxBTFo!?TqEP57{h33ou9qJ zYj{vsaJ(xYrd-$fC5lIv!`y%s53P7KP-t{{0jQbscxK4>N~mevIBJp$hgl7QM&q~j zo%p@Gz88ZlMpNaMq9=d!$dlJQUmV?<(1+S!M-KqqX$fP<6DL1g|tUtJ>( zGYbNZ^x9r)HDAW4Pw=bQ=e2>uGC||hhcxRbn!7HEo3~Df!&f9|ICQA1&@cF`JRlPBp3NGd%dR&o%bD~$#^KnI8$8HIjK4w zrk(NAJ-@60@E^RlZ`wsVp9c$#B&9*LgD(uOXy{diicwU zYo6~^{8l{`=xQP z*Qj_XgO@S!s8?YVYy*pL?XBqqhjoi9Z>Foz%Q=BYbf?K*cd(t<4!jF)Khv961%3Co zb!?31FfU=nLnoiuexoAiD;&Ii_w`$^OJLQq&Tu)uTyDxndTrCyM)^ms-{3;GS<_Va zdY>B4)dlXt&CvI|LB{tW4z>eo73e2?+SKrlgN04T zLoo(#G{5nu-K8)#N#l-E zCvW61hLZ763=^9$&dTpg32Znf2WlJ9`w9{w9p<}nu&~K^D8?9ct?b4Q$4dCr?aiqV z7m`3%x#^UETsr?b;u?WQ4FVt*PhG^#m) zMms}iYJHfKnE>y{hOC&KR|mHl4E7qYNcXG*3ytJlpLg<{XXBp0*Xx$=-*WaUFs%I9 zx$6{%`8O*b`uOw>rP{Md8$NKqsdzIJtSDZg7-3sRFXs|OJQRKFNYd+Ja_t*9C&#<^ zd~poi>}Xusu!w_|OQ6ww%=X+(N0G@bz-ejc_LA>tL`uJX=<#w6BQSwR^O8lGX}=a( z6oTWsr}C+fX@cRW^44?~a*!4Rjr6%$xc$M!C>9%L$?tBL*$fo>PaW}a4~O|T6%WPC zqHc-HPG|&5l}ATd6ug3?bj{{We1_4tHyIDbFbH%c6Zu==`O%dlijW^U_xZV$mvtPp z0~rrxU@3yG>lxQ8(VTzs8OfQI@Zf=%pk!gXr!-v>2(Vk4`pEMmu`JdnI{PFGj0e8+QjN&ZJwXiCT<|hoUtHHyVv5JV^yz*PKqfwif`C zfW|ZaOE}C0S@FRyBcTI>#0qm#qhqWvA_BTR3<`S@BS*^n?X7zLtP>!_M!O z-B|?`CrN_lDb@5`Bhbh`zrJNx{7K0hP-l}wpvU-1AZePnS9s2M`t~N!sJ-iE4%wP$ zk_alFCnxLADuI*Eg<0rZaD_&4zM>?|PItj)Fud^m9p%;suxS5~XwOxV^fhP2LqE3i zR}Bv>gwLj|21Vne;O6BCbyh+t^ff2rp%@mv=hbe{rL8a_Z1?1@v<|51BN{r{or6v# z(1`B6yiVJWjcfwNiv$9&qu*ip5_75CIJ&qXk`@Au=xz^Hd|n+_4jlG4ZP`1l1X``w zS5{`iA;Lw)LmB6C7G2=oSP3lVD>CM*wSaA>;m%E*v?41WD)spDvSlxGz~pgyYfUBy z!~{kh`xr0rou21pJQTw^RXZd#zrKgtee?zo8}ec4nz6pM^W z?-zPK9wY|cSDWEp3gh-TYJU!+vs+NvgS2m`8*Js6%QTJ5+i)4?lHX4b^Q5OvvkneKF#XQn4k1qqvD~AFw9d#RWw=Us?5fiG3zy1#FUgWCpbdt{VkI<}m=(q>D9`WLbX 
z%&me=Efdui%4NYP`hm@j5h?UMC(wv)vw^Qp9n>uYS=U_-Wh$kDNX^+(-X+mJU5bjN z;-QSiI&v?6J~|7H9L5GJrtWpw^C#^u$mePDTNpUVa zH{^qvN&0J6Jg2jI5SomKqGwx_B5Gx{vq3}jx={4;OE_oP)JaDVa*$L4jpk!l8%vM8 z&QAid4+o`&<`saS7w24;4|z||H3E(Lcg&+lsmEO(0{pmq2y#oHt4MnP81 z(R?gLp!Hng=u+4)|4R9qhDH#wRW!_?Kiz6agr?%5C`Ogwx${;g4D%JcI%0qyG(LAI zMEL{NKyI zT=;ynv||+Q>KInd(fB=Wn7c^ysdjKA$zJF7t@DgB<&OA8ebWyt79XRdQ?g1aTQtyJ#J16PF%mlog6 zhO}cS84qP(Q*t^UTGU(0ux^Wx#dxq-F|Rp?@|sKp30ve&kH z6#kIOsK2JhYhqhe_P87I<^U{4VdbR%o~=>GWP34w6d7 zLovp+8NBkPt7G6(TVun=Vuj$=%ZHbeuG9GyQO(JCC!E`e!R?Sg+YYl7}8}Oh5-d!@`plEOk9CbFSAtRYXi?QON7xFKZ9QyDE z4B0A^aCGN87`OSwCW(*N=p~gE4=p#Db}>n=9qjKao+`I05B6_tcV96jlN&UW^E#>5 zvUA)Uq0K10HD*&1fqua1O}<8SaUw)F6%S<$2@y0%I`IJrS!)hPb~+d{-~^A$nJ4r- zXT?K5?X}y!KCu|SXc^ii)E{|>ekCb+iMDWqMs|zDzF6%8AF7~n=!wT&1ubBXRkYv1 z5p*+2RC885bZ?+u>CAKK0Dd>xo357uO^2@vI`ZQtH)W&ods9MQf?r}IxT^WWTWnJy zcrP_idEKHmdNeB@nlrLC=ego95G~oif#S@F(&8Cc7y8RfPkBfyya<0*( zzPRH{BaA#3kgM7l3kOW9nVYuo5Ixlf*U;>TgT=1axw!=%CfcKq_;mLtNz;+RnCjSK-xJJf9 zF|6P4F|$0zHp3|s`R!s9^I^i%{kh|oR?*j-KqI;X)>NL*UiA(>6K=eERQVztZ7^)@ zg&QyEYtD*?3Tf{@ZolU}m@P9fd*VSAo#zPrc;pe#GBb8;qy7Inw z=dd5}dqH}*RSXB)fsBWuH}1%XA5QRm38rcvD+yc17h_h=`)cBLy7dmIY$_hgfblCI z#ecsBi{1oZVECqhSL*WemUHUqxkkl9u@UVpdFf*5Fy`3!le09k;F+V2mN5(He44DX zk$t{skn5V#V-@h0N0|(ErxNgeTX(JXDTnc!iia}ZHJvUiUt0{#@~anb_?!%mCSb2a ze{mSUS@F=H!@o4kpf_qAF+!1tT^~WD;kom=j z;42N0JdITa^wL7cLoqDYT(M9ssR6bPGyb@-b{AZ5obRBm);D^dlkrdncH~p$Sd$S= zV9X3b$N44Ya9e43)~yK~xJJc8vD_hrJpL*HT?6T?UX2nBaE8kdG5tRf5ON%xo{w#n;=SS~+rCLId zrsAR4z4=mFk2kf!AcMU7x`)2Q83Jd23StlFxkkl98AJI3olY)BtJhP#c=Ri>LE-gf z{Y~c4tuaCEO`wt8GV48m?w-N*&|fWk)25jZfX3~sLj)&quzJXND1+fKbj%YJyjT8a zZuErzav@NC7Fkh%z6e086^W>#3Wh1%oe=kYcQ%Z+&kR%lUJJ+QB?eh}aOj}~8rdxe z#BYWcZE1m{wKVLW%zq0E=WoCFf9>6AT#Z}UKk&UvnUk5!awtPIDZ{avG^bHWNku9t zB~6ksLuMlLkSVi_8M4Z(NOMZ_sChRh(zBfHoLA?z&#UM2|NNidE3d9~t?PTOweI`2 zlEJ~<`x^FF2jj-5TL*Jj??0Pus=U4nW*=}$CQ#5;yg=(8Gex4acm)8a-9hbO8 z_h%*ezV{LQzTf}chnCb%EkE8rhe-8yjFG)`pMJlp@Uc_-cXyJkWB=TTesKRD6}|O4 zjotdHM#Vpy8mZi@67v!9=JD%3l*hX~v{7a%d-{Lc?z}!@Z5ovu+G~U9```VH$v?T- z_xFQ_j!&CU@HH+gq;0(3+(1{<)J&Vv6!-gn|8pOzxYH%5uLDnG%Pc=>@xu)?cTPYA zpZy&AZ&%B&`%oV5aAJU&Oa3XkPBv`+*;D~Ft-Q5&iQa$8&HA0C7k|;esgABU?o#cp z*hGSQw%ZyV7xC8f>pqmnugg4jCLy|zhDLl;Jj;InT<_6Syxv|f{Qb=Tz7OR;?mvHV z?Y<)Fn#z6@J0XLV-H!5}RQ|i4Ut{LK?n8Mz_wdX$mX{mpyjdx0?h^rBkrY=hyY>2i z$<69ss36b#F}{{gmv2>>EF&N`#og|HnBGEuYyRs#l*c>MwR*$Dr~)b(t2B7v#dsQ@ z6zYD=Uc@^8`#zLs6{4r!*zFUk)HZ8+eDo6aTdkq5Rd(a|GynTOl>N(f@8#4}Kvi!_ z9+ypMqN_)*i;E2v@m2HdK9u!)NQTmCN%ns#`E)t=Mo>!{txw*3(YyM;ENOB}3e^#8SlR@asO5&u_bzZJTc$PizOKO0K zhxSxZ_tB5fq}^U7>h+sgOm~=F36tt8Vu${^4`shetSC<<{7xm+PMi99mu?f?v!J(( zjMDG^T@71nf8U4lznzU48K>Dq%8LJp?ntO1ZQEbgE$S`e&Ewa7D37OLcafiUF z{F3_oJ8cg?&`Rlo+jDyq5w$%xCau_&^k1UcxfZ=n&f)2>5)xY;U@$xV4Rt1OpU5>Q z{(iOmb06wHL~3rWTnzcVLp-J7csZ4hR+$&0nDYDFzwSeMyd6C}GsCh>S8nZF}U;fL?*>}koC&?oQZY?BG zL4VhM_LB;N#@8#xKIc;VfA2T=Vnz^v00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHaf zKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_ z009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz z00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_< z0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZaf&Y5~g+6T+vb{C= z{G%hC@`q(5k$rY2Hg6mwAm;5UL4LW7g#XJ#?AM>po}P{l-aMW#P>au3+i_&<6aRCh z-eZr1*`|7uGeXL_>3$jgeV{gSWsp3lPz z6{k0+&~tNoJX~oVOXrVmmDKDS$eD4P;$9l}6X;z&g)3Jh(Ikq?~e_iqrzY0k_+bc zC9}Tg(8w+iY-~=R=TsGDWPOy*?lmiOODk>D{V>FRS~VH{aJr^T-$D_AI()vivC+`$ zE|p~Jpoxc;ITsS=>;qk#@(MVoI^wb6xIccb)wX%5Wa{&T78qdwM;iGE4lfY8Q6WZ$?dq5r(Cl8GCqilOE63D!)+7aL|+W z{MgosVa5+{@HAFvY)G4JRU;yh_0h3&Xjn?#dve25>B{O;Su|^TcmjQ$AR>^h)kdEu 
z8#8)+p&|1&d>pm)8*To)DJ$BwNJJppk2~Ax(^#=)`t0naRCA3S!V7C}nxs-nxVF`z z&XY@i*2tUc)24WQ)GK;-{>Otim0hWmQsLmL&!PfZAI-I1N;fTQqz&iBsg86lChoK6 ze*JN&oO7zejI72~KPs6Lxtl7u%rX|F=hDl|+k$%*)rpuYTdRDlhr3?8wv!7wc9&nv z7ZKN-_ZDBaa&N21{famKsxd)5`h$MbO4?t4*L6&!&zIk(~D{1zP%0ZHWHkNl`xR4Rpl`?XLL`ekOlLeDleMzm1qsQ;uc%{ofAhG z$ojY^V)O%3-F&)yzFhyD@2TX-;A9!?@Jvpe)-kWYu^RdONTT~YhSYX#IB9Qg5*EtQ7^sIV1U+>^ixu(y=D{RSujpw;fIblZD$12|RciEbiw6E>V z@11>8sAkAKsiGBWoHYsqb@;p~CdcI5FNtZio4twN7#>ZDgG5p{d+v7V-nB8Dz-r`G zYHf2moR>&uK5XzfdZCsI?DG92!;&~N3IkamJqPz18(WZ1YGWkl41bhO9j`a<^h)K% z5eBkzHRPa}MdpZVI=|jnb>`&;^6*7VrPHN+PE}zbJ9FAQ4;o(F-9(4GuX7shdx9>w zcQW_3lYn!o$}5IUm-~6<@Q29v{d09KH9bDUGvjS3k<5HlyWu?dDJKkM&qu$N^)p7_ zDXc99qk**7QmJg3n|W2{W@Z^@#?5Wlx3U`fd^zodh z`SYIJ?mgQ(iQT$|Hhcbv;>;)vWczW>*YADyc5kMA{}@#0E6dXutb9mYDy3dTAbVfk z=O(Ec$a_U>#&2s(l=whr1xf5!Ggu&Es%))3daat^;vY_z551EvA*iOa#!Swcs#DAv zIC@yd82O)T)lBw@l=Z$6dSsqnl!j9&>Hnx;lX3rY5rJAfo(j**LDj#LZho^j=~zky z705~_ZVcvrA`3IJK8m{xtm*vrhI%bKljpoLg(fTV!uGAo5|NSBxFMaixtHhBOPd<^ z>)K_};6A>e-Oi?Q26|q1Tg__Z^Th(UNp5-EM#!HkP{m*AgH9Y#W`zAEY6co)380Dk39$c35s$BVj97M?y=SY~v235p{`25gD7rG`Jq+ z!i?%C?pKc#}!h zG$Fpv9OeG?oKqDBvi%rztXRqXRxGg#zol!KUP5~qZd%#>k~rt{Q5eW-EOS07e&I?z zIe4vIEnKyPtV$5Cdv~2LB9OfUoZ6^9_4tBjk`SNLxWvwv6u8FFZpY&|r+Tga=~Y%E zkGG=v?xTWY@z^1AR@^_eyn&v#iGM8dxQ=qE3Io}GOxNuBZsN>B8g7=?_m=aAs`ZZ2tAH8U=@TTfUBZOFtJ6a@v!eOcy$jP_KW*{k|#;WNWo=U$w{2 z`_+?8)^!VlFK3YPp~;iZUak_6k)1ho8t-%L$ZF!f@K&8pk2-pxRqczTv8buC{ivE{ z)_dLgR60OB?9iV5C3LsgT>I~rpK)e17*M7+>1Q9~x1Q7QWmG{=Y+JN8Ix>N#O!5i# zmN`kd`X~%!XO8jQh`FC+TWPpnp0e!RFc;Z^e z??l7n@Hwpm+%*a_vS&x11rlwB&l^e7YV{jpFG}c;e$wJvK~0=fJ$f9aw^~FXtI;T4 z(O$4TkB$p0eNvxONxltf^&~&ui3ntUe5V|oY~j~L>rX}R`<_%qH@)U5hsdW=u6Beh zQr|rJ=d{5iiN3F8uqS%(%aFgn?|Wda2%;tbeGHJTn`;MyVr* z$cI^X>vy1vbE?8XwjX20yna`=w4AQ#J$do6DXG+T)~)v;L*hjQvi&GHXk%l4tb~l{ zR`G4{cL9;PpxAfpe2xMz@dv1r;-GdDzf2q{-i-kMrg0Z_KE=fP;43rIW7&YbR^HEA8QOZWShP>52 zpO&naNj#I<^Ur&8XA}mq8pmr7%(-HCk?iyQR2OIXie7neCT>z4_h+CmkoEDSUyRhY zfL+vV;K0#Q$Fk^y<#LPa+PN2iFp#ZP@2fE<7az^18)n9w*r=63-CF&(_bSSuTx-?4 zcgrePBaiPL_hs^kh#I=n&Chsw=tp9j!Jm`q%KeQj3}nv^DXV7nS0?2&;a#@b^RNce zf98Ztm6nSlGP36I;G)T)iEraX1hN_xw&?cRKDC-I?|1Hw z)$=cOT-nvRIUBjZJK|1UIm;r3Io}hqj&U6ukFE^MD;9{`!h3{q*S3xUM363=EmB?gER|C+u(}z0k^piD`6n(W1dxB>afLCbgW_bci&`h(4kYB z&$=IY$%!KjWNX#+m~7?BUEip*_Hh}L2m#p^CoMLkphQF<>tofBC)4}SswH_f-#6}U z%%j6gpFHR@G?z2bc&*pmsXzDQXW1vd?zSEDnR9brmFj%5-${D(9)5Suw?|!bd* zhSq`K+llo+_w&6huLELZ1 z!a&x?%eAp)#+TQUH68vu$q2(l(mv z@AbIn_%Bo{!_r`vCil@H%*f81zRT^EV}jx+Q9M>=vG)$;C6A20cRiCcV`cd9qMv=_ zS?#cYGe<9j5y)z+$nV#1Fu#f%?RBH9&$D`BoO0{W=Ml-2 z>m(Ou{Q2w%ReMqKDV@#;te9yYf0ZWJNM@%!$`g^1eOqa(6lXYZ7SP1uW+UzQRg$h< ziSmJH?oZwSdpC8c7y=N000bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_ z009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz z00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_< z0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb z2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$## zAOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;| zfB*y_009U<00RFj0xoKcmRV^n_gU@F&M)@_|P=sW_zt( zqxLTKe;_lx|9%0-5Nn*wHI*C`6<4{kSrjS6Oi5YYO#809`E8_i2lch7H0b&`!7^qb12MPDoA4nfnf|+ljf#%3 zm!*mMGDkLxsB6`_i{I?06S{96#vhYI7Jk2EqT6zVMp~!;IQ*c6V-}7x=KGqs`7JQ9 zVneK#eD}U$S|dGJXI2q$r-q!|E#Y^4{};-CkR@&HU(7Ltg;9X+>KQIdrdq5c51Oat ztqgia)^?k;Yn(w5@$t2CIBF42SKbg$aeLmtQOLAqPMXV1&AjziGmB+qo(HZktfKRe z$Y0rYE|vPlcr8{Nl0q9gqF#R6|B7Q4CSLBA>bh#~?iOqoS7skt7k9da>{;dFqBb#_ zm`~W=`><>Q4VbgC>U4WK#}H-#TGLktdIg#ovmti6kNK!oQBFq9TIq4XtCmcDZnU!f zP!m0$D!=J;)mM%}bOZh8t1A1f)b(UTbc=bS|17PO*qcVlPCd|7!$M$E^Y(55k=i!O 
zX^lb?#}Lz|&+ze3@iAYc#)b$882Of5Y#@geW;-^iHIf2_`) zGY{}GF`KTbqsN*$|9(G{%Q6kLVCCbldKqP8LC}u`gTZf!gM3+zO=bbdEF4xlu5wvu zK37|r&EnMC$h1!nUlNt0_j7ma6p{`_Tm4(J1@w6POQl(6r5r>mWM;1~K{+7%^*#eFs9JHKuSL+z5Xgl!P7`7Abr0!G@ zfyKMw6SsEI){v`9o}8?s{sp5`jz8ta{`(#=)iXCSSmvN)Y0gIY8d4wB&##JZu`|on z+F3(-j~%dA-zJm}IN`P6?CwJH??VKv)^bvwZth~O$Y$}et$6b3UF{@z!2o}Y_&lmL zB6COD>q7ccO@B9o^hT_Ui50Lc04})6J!&%^Vhv zy59Qc&RWXGR&0p2IeT?|A{*$6TaV7PH>6RO^=`H~ON+_lOKx`sAFDWq(4Xd}s%+%% z@8Q7;8RiijX<%PQO+9+gEE(2G0vEL0nL0L=o~-N8&K#1@F@&D`0v9cp!0FxwY>1aF zHgmU5`a(m7gie@Z`i^vVkN)wtwL*T+=xh!PH7y4(O+DXLW;$$vf40u(Rg!&= z$W$4ag^VvHzN;GEdW}e=A0m!gdWJM}3}Lj;#AemBWe$$ZnMK%Bx%DrGrP27sy+3;k zKGC}hi}Eab))OPgfZn>T>E&xefwBH+`##@u?q*gDbsWrV zw$tQj)6n1=aXRux<*lxM+h?aHhm`Ni@d@=GROf<`~^}Dcg zTz-Ar%frculQNb1%D(``MxFyRts*l*GTeZ(hG%KU~+^oRM6C*0+ZA&&%1%Ra(GY3+YAYQqCtQNpBzK z4{~+z^pxV1-&lGn`9nP1Kc|T{Bra^T zEB8aR;Rg?zkEj2?k1F?ln|hZq^)qUC9{t~V-l=dc+1jbUe2t|mPajp)SXhmlB7ZQD z9+u6e_@t1anbU*WWRz!gV6dP694SQO=sCyMjy*TaH)y7RILFbW%tU%ooO0?<#Wn9D zyvw!G;%w@qb0}^)T2i94rPePO4e5DInufPOz|Q&C~kv_5E4{vK_(K7ajPrbmu) zeXPxG<#7%9fA_v&zvn6(FI`IwZm_eHtD}#lkDJ4A`tw5m{crN#d6adn&3$Z~)t|<_ zzkZap|M79we;W7x`ILL(iq-PD~w|wt>MnR!gnx2L-x1naJwWhDR zeBy@ls>^nR)3;u@*CA)f#r5%z8!3HooX!F z+;zD=->Olq>zJxV|wvtQ-9~EGehg(~WU1!^Yi_7I;GQws3{!R+jxV^&Rt`YzE{L{p?S58t57Kn*mncuy%kosT&$Jj&bHqFVP8Z5B>P5Pk8 zrMWbhF75AkHRijVcW$~sN2D4oU~+z5obiLJRD%UNEC@-iyPQoPEU>_Q&7uw7x2cbi z!&5_z`DIBurI~xPMX3f0G(TM!Y_a4m&4mTloCqD{UYkfYSRf?tv+=sabEwAn2}a{h z_!JAfih>(E!2)fgn`V9bU4|&taBi79#FQ`A*KM}K{xq4gK<~{- z(+3pYr#@I9w3$o#4~F?vg9Rpxh^+LBsG%BLeww$*lwTEbv1Q!oMzl^@AnL&7W>Gut zP#-L?sr#8PqMD{t4Hj5kyIpn4&T6U=HP$ZDj1QS=u`wm?7|n$RR+|r+(%Pqn`e1>K z>pia6yp}y!pk(xz@O6b1RO4RZ^&~TXU*%@K>b8eyE-a8+-0w|FbCGJWfXLZM=n%%6rHy`DYk%`K^}zyJ!;<=4HBX`% zM?Hq_H|Kku&r5QdQ%QZWz;F9v0}SrorMa*`^4@K)OReuv4HoEldgo8J2cFVg-ef(h zGUsQu7KrdjIcPJdfcl70SKna4$6tT2?1<*i zqEv$ghU$J{+^k0atz?0&O&5)S@l`(6V1dm8zxds9y}Z^7H*d|f;4e16ZZPCoHub>* zOJ^kviX4_qb76tr3mx6}@T&Gw4Hjs-R#2;Pd?oxd|C+(9AOHafKmY;|fB*y_009U< z00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa z0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV= z5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHaf zKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_ z009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz z00bZa0SG|ge@UP@rz2OgY{UwVP-lB>&QAWI!fA6F(!WsOU_ak*pNP4k{&Kv9^sXhn zROD|QH#y9Ay1!3^@5CT>T!m{QAJeNpu2X*=i?(dU{`$C=hm#Xksrz87Kg{K|WBD3N z0Q>(afuT+gBWS{7>SxVXE61hiDJzd1P*$E29a^9N@9vcKYHO$}mr%~H%^AtxRXBC| zzVLGPa+MY^NB0ArTRFd+B)xr@KgiX=(^HC5eq-sSYTe2GUg&Ip^=%cC{3#)NcA^Izq36J0KmR$> zcN(W7-DvDyv*%{{2F>&j=eVX)+(dd&oO0?<#Wn9Dyvw!G;%w@qb0}^W9avT>xc#i=KQ`JJ))Ezyc`C~{#1iDPd9f*73nCO zgS1ZSpEv9%-v6$EigdkbQ&C~kv_5E49@7oi=dbIOeoB<%dh`{Oowl?hehx2FGA^cqh8vnj`ctluWaKQWX zZz7*}a-e^ZpY*Ju{Fd*X&nPI=O4HL&<~Gy}wbt}CmrvYqUUk{e&BJ}Pr*!0BxhdIxQ^o`sRsJ4NLXacKUqSNt471t6Nlq z1zPU&PIat(Lp4}n#%}c|swTft4Hj5#H|E>HHrJ`fqRm~G>+`J|)w=Ha`L-a{V1cxw zb*^H2_g9SpR|4MDYBZq3R zz;wgN+5KzM1gQoKH2Uo1vDrmWXfB$;8af7i`xV<=HOg|yg9YZQ&Q2cPGnwYX0-bwb z+TFKf5%s|WC#xGJe127x<}$W!o3{Z!cCqaVvnd7C2Mb)7>(gE@zL;vTz_0_iN_W4y zKz*>lmoKjw+}&0}ee7Rrb;N*wqx*Zv8AWsRj$oPq9z;*tL=R z=oO%6YsCN3D!90XlkC9)-H(|h?$vujb76r8I%jo84K5-N7DzR{kQ39socf4-d3dQ2 zzu>zAIhVF1(Og)-bW8sZr!G`cA1qKaS+|Gyq?l^3fcm7^OYwJ7sK)IT4tI_C$LF6W zw!LzaYOp{|?8^M^rG?Z73pmCeTDNIV2GwAJ^>5M#O)kx)xpZlNzpF9d<-Bv#1v(Dtf3k#5F2$bAh+UI z>VpMDpY?D3j2_WkSim*cv|ET}Hq|&(cJ`nNzdgYD_~AbCeaiycZatfAxlu%Yu)r=u z3*+dobEpp%a6H##gsz 
zQ4JPo8{IVP)9*4wsfKgQ+##lXvA%Az751melm&WkPMSWT=sxwq0-?=Z(tj|_ry49U zVMJu5UqlVn*z(i7O{V;+h>I=bMmM5$$^uabE;ozXafkX~flb}dd=b?&oocYa>e}t9 zQ+8HUji|A9k!F0zREv!%amQ#bEU?;q(3I9bHPis7ZsL~~(*+~R(3QksiYg9UVsv?@7XUL{CU|%KGV1c1DyC=T z`<4Z&6XN_fSIYMb3oLE)+gkgZuc;3f$QqW^@2Ytc)i~-gbiX;@>wI34%bZH;g9U!u z7aL%3_b$za1(Nq}dtGXMhib4u$J0B1vOVyW=JF=%QI$DAJ0|_%xV!Sb&jMF^7kQ-C zoTj<3z_N&T3sYvw&oLHQ)7ISJ3b)7lG$$~e0)H~pplTWD+7O32}u=};ecc>2* z7|^YCNl<7yd9Xl)N6JB)IR(^5jJo;;3qJn(gJnlFe-@=0EHG5}3*%-r@^2*zbZxq5 z{EM&hsRj#d9{9!Ymh0uUUbuN{rUieo`E`RK&$6ix7FaqfVNm3-WSR>L^j_%bzK2(} zmuj#;+qHsPjpHlfpZV7eW(5HVKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb z2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$## zAOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;| zfB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U< z00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa z0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0{=?_%{d*p(&pAG z^8TbA+iP=n@^=+Zo70f~h582j`G)&M%nkLI<1M6jE$PLwIc{>8?{t5k2;Yf8?6?Zo zL_Ve`XLHh7*m>McROQJ2`nZ>elM_{``(Ud-%&D8Pd<`Xl{eP6eP$!2GG~qGzvu3M^ zFvY(L9Py-o>H9h8%r-Ge~5?s=QPoVe7fs2xKU0Xp3d%Waz1sUNXJMq~GyJvYlYXr_NS z$2FDWCen-UWy)b99M`;y@GjRzi?gYd4k&JJJ=yhnUVoW1q|x$NRn-BOEOxF5Ye}le zak}-DD5u@VUq>`JFz5H()TJECqs_}5M_Jd}+{eaQ{b}6$>qlApA0KD^r*ZF}Pq{ZfKF-o-u(PYH(kEFc9u%Xk z;rEr7e_`4Dtlo`#|30MrzmB)3uoUN|CxGH({ciln?oF024}164(1(`nEPMD&lXfhH zo>Q9AOF6%vZ-jqD;Bm-1zh^uHfByZUhZ${M*$(dzfB*y_009X60|gv7E+gV) zL3jRkw-~PnukwWvrq9M0HP03K!VR0;cGd_3m4v-l)-B5VDOG4zozyuh=DNtO>0ufh z^+p_|Bz&HB>abd9hInSPsrRq(X~LpMDO-H2Qbb23p>=rpi%A9*VxLu;IuIE4oO)mhYKJ>C8E0N8#<`kZ z5r@VK(}Ti#&$m;xmwI3cjcQXnST8OUPi^j|KQt#(EWLer(hkk1g46>`xOiZ#Nw)#j z!rNPWBd*205T^ONcrAO8EJ{5r+T3-yKL7pN#^uiDFNLpDH-s*5dLrt3wfknn(rW5~ zC7k-^Y=5g!MZ$t##_7Dcmn*L9u+^`_nemP1 z9;p-U_V9f|cRm;04Qfr7q&=n{Si-UGqeeZSRwYi(Xwj~4;S;e)eMri&_^RZ?V72|2KYGMkK!s}{ZxFPAO;-uBo{;p89gTFcEds0Ynp z4IKl1#I>VK-Hl_#RXLwbHJYC-EMGc$zNJg0DD}V+=2td-vQfI+&hFnW9-H)Bymj;H zZc9FadSD5!*QK^Lt~(f~Php~0rybbtqZI?K%-<>UJ-SyY7Qn?_cjB`0Uu}uj%u!Q`zwqC0vGDSh> z<|f|hmFTT)9n~qKka}PV$JTp&zazRpoO3fbTeIIQ(du{K3!ZAJbS+pymeYQt$gVHN zxzWo5UtOvccXzC4n-X`Ndf30#>WBf~)^T6SqTinh8@@cO=U016yfY%CW_Uq4Ik1H9 zbe@~;es)Xnavz{I_IQ=}=I)iY+p<%s2bK`Lb>tteKBYop?R4`AAuq+eXJMJi)^*ea zOZfJaoZ&k2Dg@^-dv2ce&J!v!GTt`!E~Or-yZ%vQz$cr>_8U7VNpxLa{sQ@tWz9}Czd zY;D%WZ_6(a#Lf1FE^iK3Qx7bmdDx^qiR15!er8R#yFAGfV+w|NM?TLGr5;$q^yTgH z`d=>)PH7#SKK{XOG4|LGSGO-cO+B!LFFfAp8OD@}OFECV=#f??EceyE6mKfuo4o?` zY>oJrV~q3<-YpQ0mxdNh*S;jE+`MuB%g^u8dSD3`!-X=VsN2Gw@aV<;W1a{P9W0h@ zGcP9xme6itk#;BdE#hL%vVUj2d%`C%qX+glpHJ6 zSVHmDZ38|TnIK;7@o>Y>TIIs3dka-ZSzzolc(2u}l_B zj_>MoP+NXJu!Q}Ui`s44oG(QBo(#M`AxEf8x^vei>K^sL5?V~sSn+LckkDXbVhX~WVYLZYkj!w zj#w^XLI0RXuRFXFTBdE7`{ScSz~F z8gIfM-?4brjer92tg3CVacQ@OL*d_l^{u`9yU7y1icsrS;PzU$s?*N+2a~(vMt9R6 ztp`4*9#{fDc7)!g=`X~oW51c|7X4f>SY5t*Uf^@;fhAninY!kH-b10+kNsMOe3~cp zyFKgkyw3{hS{y1nd(ecp&$|5Wv@VYY{_9;=+TBVLt5yy=oqXUDIk1H2{ncGttav19 zpV`-`XwqfDzWJ6Xu9VYNL2F5R* z*sfgYzoD~M!H?C#gv7Z=$N!K`4zD9ezB1uUJN7Z#_pnNA_Vb3dcb*grnz0s5W;qE2`snP1n<3i&lh+*T78yKkN-46EBZ=v=GEv4y!#g?DuZ8GIQ`@%(Q z5C1~+S()8EI{Ja|lg{TY9{0XPJ+OoYBd;B+xRx(Czde0#{qM!Xp0{mE$2}~g9$3PZ z2j5I_56BRD&M2#G)8-Gc%kk|g(Yx}g2bR#c`($t3KF@{wqlQ}gop~$tHg3G)`+Z00 zT11Vti!|eFxrBJ{2zC3(31;2*X{3usjhYTAX~iW-J+Oox9ol#s)xHqNG@ct*-Xvc* zImUCS{+XxL14{_3ifPe$NT#4OyQ*J8_FLgrVPIW+O)B-k5|&lPKVIECU8w7+nwe-- zC>Zzm9&>HUbL!z<;q@dle&T04cWBNi7xg> zn3Mb}j;lx(S4=xM+5h!(>VYME|I-g+%=g?A4_S8I_Vnl_G3ABEw>Qrhic$|O;c>gN zqC}(fqH6oLTCNFxVw>A)I;|LzMXsW`X29Ac{<{jxU1K(Bk6li 
zQ4cKP=QiSO|AN!v4+paQg`RyVzB=;Kebu;Yg46>`xcOk-?U|M(;(hNK+EM9`#XX7n z-7aw9vs_8xX7L`}~9om{H{_)-J zExOk;#7O^ztJkiT*Cb2GdV8Tj-{h_ERo^i`fBO4V;g_DRewsEkpL$>kh2f#&`JK6f z)3X7*<=t1p$m7FCSB9mN14|IzRN5q%UJ;7+>Gsolk|kIhM*pyxe@Z=^+p+R&Pu{F` z`-e}T<%sK-2puir)5Wc?Lc4wyUqKEm!E~u_fBm&_V%70hJ7Rt>6S%o1hf}tfQV%TQ z*0Xl!+69#huQS(lx;-vK`0cTJctk`o^}rJL70;>+n^P$+oiWeD+qO|cK+wqXCi(Jj z)=`h4`_1{TS8TfJPkbRBsXlnV{f`-frta2NF)M1R2bQqXe$APJ_h$_P+B+=GT>UEnH{LaI)ZM|BDslz9E&f1YCj9QhlA$s3iA?wz; zn~&me3;)3Xm4GQh00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_ z009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz z00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_< z0uX=z1Rwwb2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb z2tWV=5P$##AOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$## zAOHafKmY;|fB*y_009U<00Izz00bZa0SG_<0uX=z1Rwwb2tWV=5P$##AOHafKmY;| zfB*y_009U<00Izz00bZa0SG_<0uX?}|7U^boQ~Z72}4zRf8UtxwK+Tay9%exX-NM< meS`gc!+j#=hWg9#7Sg+>^iq+(aop@cKYyPH--$u&F!x_eLdmTF literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 b/pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 new file mode 100644 index 0000000000000000000000000000000000000000..958effc2ce6f83dcc62caa866b7a7de7c98c4a30 GIT binary patch literal 293877 zcmeI531C#!_5bfo0t_L6fT+QxPTh!0LlVMft1}4+CK{5M37f6IFi9qmGMNmM2?Vqi zTq?C{u|@1tqWL1Y^h)s0n1aaY_ULW<`RVc$9nW*Me{~ggmE3^YA^bijk~RF-KSr`Zl!JY#yi?roepGnG7L)47p+LkJ`q zr$Oz%2}9W1;P>%^`j&dym6yn_dpk}&Rn?_7Zlq+t`5P>bfhyK1*{4I@)+|?zL%QFT zms87(YyEML{9V50Qls@|_@l3zzsr>VF6a$3dE4v5i-N+clwTdfQ_@;<$lK~OnOY`O ze$&Q=ne#93h1y%%0xG}gI_1yLE8zdles+k=Cym#rR{CdQGnCI84ElJ>xUtvECS`&bf1KvP= z$QO*cAM*1&w~#0O4&wU`=Jze|3C7}nrpCTo%8{OobcE8KJ~4nxi(Pw~I-SP(h~pEv zdpVh>l8r1$u@MvL?r<3s=Y*`2|A@Tga)vuy(Y6!~VMTlRJ(lLjbF;W0-ld{T2*cJ3=gkAkzXS_6Z3W( zr$hEe>9#$xn=v0bG?CI*gtHM_)ZwT@e1ZT7fB*=900{IW0(G6GfsP3^^TsZ`*HiKI zdn@eje=Q1+>Ad(O*S2Gx3*C3&eIZ4-{7h>8(dSbzxKI9TmR+yQ{}Bk zj4Iw0`Pui+sXA=ov#vQ!XV)~$Zw!oIbowE`{MQZhUvh2ufBAo1F!{5{s0Z)u8oTGI z$ocbzKC<|`yCU=3&YWLRd-kNNf|WyVx+Z)1mdM4MR{W#r$<3~t9%(%5{b7>C355y&)q$G`v%v5CCj%iKkrl5{Icc}Z&UtQ-=z6n&%X1kXa3{* z`DyKImU~}tP5Z~?K3{2wY~Z-92yO)Uq?1L(^w2?#L>*BmMDrUE@cr`@H;; zk6ds5A^nNZ@?MEFJhd`??c*Cdgi(-$6UP2wY6o_Nn7u}&t?1W`R|u)W?g@OZOXdi zs<*l7YR@StYVjhFVIF|3TKKPhGY@O@H9MMgNIB?6~^yoty58JQK|Pz`1Kvq-tz&<-CfL z(7drVlm0mGqUG1#!olA5-GUt;f;r%^|GsEj{n@b^QRWg?|lEl z&`)m~yC?F4A59tC^oxgF6S5z<(*3~)5%0Q*S30&0?3&d!XUe?NV()^65zm})>B%oY z6S?i_KVN(4celIVeEknsz54t+k-uI(ebTzKw?%?;&pmTdVe5=eU;Pz3cQ}9XTh|3w zef;Iz4cj7VPi%MW9QKmy?kDD#t-17VS73a@-02Gn&#hVX`^#^+z3ZBPyWDsA{4d?K z!*$dN8?PUG&4-btd28E#6517MuN^HLdB?~dD>Ei`ZjYQ^+`g#em!Cy$`dv=NgKaNH zp5A@M^&egJNu;xCMs>bt!mQ$slOEZ%>W*b^xn4Z%#~W{rL|muca@EU|8@5NL-+W?q zY1$UoxlmLFJ9ak8MeOgvn}7+?V5brYv14Y_G6Kw17lu2>5wlX zp0g(u&L7(_rMTdWOGp0Y`PIz*RNGNG?*DFd{paSNZobj6HS+kTJ5PFdM_SkX&gyB! 
zK2KS5@vS$Uxpl?2wz$q(^Tp3M4f)j7^vh>{SI=&fx>{;ct#NZ_n-RV8PZH&2{3 zXU^j9|L)a2uH!ov3_Nnn3z6rJ$h~0qm=7Y?|MtnV&;Q{^u72dU(G?H?0T2KI5C8!X z009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH z0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI z5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X z009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH z0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI z5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X z009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!XNErbe z8^CP5v>aVxHp`ZKYmX3T8Cxqq8LO);D`Q-8k7d)BS=6EQ zBLAWKD@1#I?as!wK-k+7@P(YsZ6Rma+u-*($22t;jFG#HWwKn^LMm@^WwFPj%NuAa zudbr1w2VVArXSu7v$yZa)-tphbxa!Ee+bGm>!4rgyS&hI3@2^ z;%rHQI-aPHk@Rl@G!9qmZqC(dO_8z#^6zY%I;QcO%wE%Z8A_TlvR5lbPgQlPjZ2aL zUw$Ozuu9@M83EEyaEjt5ohr|e$e$C&}8Zi4KU^5dh-0LP~K+J>tNabwqV%rTj2A{%={w}KXK0yn~`6&KjdrbXbiWsah1z0 zQGV)&jh7(z^@#zTme{qYsncotSll-=_>Yb5gLW!GRwHuA9hx4}@etlWA!*b1FS{(y z#@gq0G&lQwvP_Xtcn;w|ntaWEZ`ddEi1RcaW*@KW>k=pVHiFBGO`N^q+RM%{>@U{7 zhiEw=FK^GqwY9~w#F5xkIclFV<;MnSYi6wL6l*_8Ql+{2Fz4jta>5S9<~$*}9QkAS z*yM6WH&*?c{ohcN zyr(S`_BA=Dd;J~0c3BVkr9pChmCdXxt(dCDRs3?8vd>KdL!Pkddo8D zt=1cu<1>a?pY^n)$9l?#WCtZ}9f2lqXb}ya zzWO(e8+HFBzCD>QnEjY1>M?=J|P9pvj~S=!51@`%@or z?kmv&vhV&XOZmVa-skDLZ>C4xM zauc;&&3BT^b!+T?Jm~#=$}L+*Eu_amzCc3u(P&pp|f2*P%e-Ul+IJgIoX(6 zucr*w=R?x}&ZAX{#M`Eyi;ej!JyEE1Z}#1j=Rr{;w{Q0ux-&cvW$QM7?TdX|*G(i; z{-JO&##bN3LK>q9X{+8LN8;5xi*?$nQos={~IE0^iW|3qHjc2527o7_%je=}p%F(3c<(g%w^`1p=ruUUBB%)dYTz|H^M z?0fv^w;w$EvVv^3wBm^));yMV_(kizPyMRl!w+38Gw<7R?Tb%M^Vo(vmMmFgFTefD zamPJ*O4i9YZq9jg+Uj+8?%sa%qYIbcwsq2tOPOQYyc6y{b@+qd{&4HLukHGx^qCWH zxO(m5o?{&jer@w9PS2=sf6?{vGmqXf?K$71d$Jave%Q3b#-4cHnh%2m?ary$ z@ARqdhy9}9k?_--HdZZs{&yqlw%IRnv-Hmn&3$9{;{z9NSv=>SKTIA|)iUajCq6Xc z*~-qxx1WC5SiU?o{#O$s?`}Bl+ix7T?$;ed55H^7Mfd-9MehH)9qDd&{nF3xT0iQs zUo}q{a>lNPaX)_M_GO=3amkpiZ$EgGs(Ra?vOTSZ0IP&|q+e%!L-Y^AZ8BG{R4d;9%?V6~FxYRC2sBtUV?Gj-pcI$eEM z_mQu|@mYOKJ?%N6oS{OmAu{sq2J zy9^c;`OUw_ke^q;|BH6I`4!RMAu^wYZd5DlJyc$oV$2&1`gqIuuTRvscKE|BvK{=A z@%tF692uQl&!dy|!ndgZC{sDP{$N2yZbg2EXS5U@oe?a|2nWYzc=KdMnQXR9Mf|GZ zq2A9J1&Z><@*mQZEYCAqi^(YA`g~ca*E71o8{)gXY|rTCwl=<7MAh)p=jwVCw?JE< zUbWrJe{Do@KiyhSZUt|nul-14x-aviG7mWEm6n`~(OkO{c-{f<7y^^VHLHBnc-4&+ZI-Shra`=xZTHjvSaHos6S8Q%p zG=vrH;TL#|V2=|7yI=k09$DY7(fcNP zUuD>PY&@RXztt6&$lnJum}|7fhl&o?$N|Ah<1qHM(wI-bbFNZ=l20>h@pp!XgSf!bveSv zQ9145P)lG=QNCdZs(x-?J*hu>A4*ieC};2W7Zw@qR?nkVJgCRR+nLCI$^Rr;58t+j zXO6QiIgfg-E{dCq*rGQUb%;+8009sH0T2Lzen{ZTVdF9aXKamJedxeZcWwT+Ye(n* z4SN5%ovyY3U)SN27Mdj-dCiEQ{dnQ)k=AAPPoH(lGm+Ih{&CTgs^3S#3;FXbm!qf2 z6{mlBL*193MNXOOob%+CXI*ng9R2qJ?w4HQxgnpA%hB`h?`>!2JTY*CYx}(O|9NZC zhmotF`u)$ZF5jl=<+C~c_{%~o-dQ(f)aFR-_Q_eB7w?F?_Rss)Z1}+%m#mk|iDzoD zy&B%QF6Z)hB2V49xU21hZIPxsKYrYH&kmAXtfx2Z%oz<8-~ZG4$c1;LUvlW|HzObY zW<&mW#=RAh?Q8HZ7IRXu-8cD@dz*jziRHRgWixI;cITmPYI-~5hFdOr5v zyYV;dxEEbl&wBUnm%2W5jk@#Fzpb;srt0N#gH>_tq9ZpvebTPouJhgJ{demHpGGFU z^4k9mulZcn%jL8f5)c3Z5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI z5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X z009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH z0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI z5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X z009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH z0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI z5C8!X009sH0T2KI5C8!X009sH0T2KI5ctXo*w_GOSBm+4x4zOJ^c zjB&|5mQ7z~4>u}#2c1CrjUrH9R$L=GJX81Dsna*=m0jpV))Hv)b@GD-Z%CeMZfldN 
zutBj+R7sCNhiiLXd7i50VO?J)E0<;1*g)ps|LQ90D!Bo(L-@}L{D(_rDYs~ zAunoou<2#Bo{H)!DWAsWsQ!Yye3{S2hRSm}@`&+Xsd31Bs$W5kQH?`;!g07{t`c-p zISbWsZ@KF1NDeXrG!9qmcF)!6&PB=&$gb{Ktd6;<$}O3_rgI6az3Oby#8XvWYGY|+ zwQTkks{E*cWrys%{GsG>bU)dt??;wm`rRSvCtLHAPL=mEQ+r18KW2Y+7{5;Xk^W5j zFzIQzU-Ngg2F(5_)_)B!=U48l$NsA@uR!V`laU{D@VpK-rJ}NqyF8Z76{vLRmU7fi zn@#N*{Te3owRZ%4p$30j0`x z)zSQ_CZ{~6_A?egf8*SafNpy7{HjpiX62b}Y_M#9TQKbRE%5neX8w_gpSb6U&B(9X zAM!PIG=^K+xXR_0C_nYX#)c@{(I*CQZHZlbnmSGPxVA<0_hDVGjqZnbDoJ0^Txgm| zoQCjz3Q3zD?si#^jkVA1Xm0lVw2cTm7WLa$ldswD4f|vsQL;nNP}A&bRJ5#^W0dAK zLYE*;{^i=s&N1vU*4~GRc8EgxCU4ipwY9~w)Uono1GF{s`N(|TV0Ag>GgqHk)|OmO z*rC{#{4lv3`Qy!<$>rR#G{)wJlgnwb{ho+BfnaR4zuw+wta>#2y`j8((xay>6!tYa zr+fV!zIJ}z$TvuitFoDOr4>_Ed*Y90g~-M-WSOgV!PfB*?H3pCUXHQ^TZ+bmb`QKQ zy&exTYpTXW@Ac)DAmiPqM)518ud0sL20qztYj#SH>1+w8Nw>_*Kl-Y_ReRGP_Zrf1 zFt@BSkw2r+MNWr~zXE?I?scMg*{prl-{j)Sd=|p@gL{;a1fU#Zls%3>c#8_r}~;& zya8uOs3#SEFjeE|Le4;J1e1Ac70#W?eb{vVh0NF!HTSuVD8(Ks|sjq$yZrSvB&o@qfh3+?%-uAGw zwr!!aT|JmCkPoKLQ^do|L7)y`>sh0?v*xqLX3?M(b!sqK8j#afj- z4`utx>|8?S9|{)}?}@}h9HWVBtKK08u6q2X{oN)nymlS;vckSslHmJaSQ?=00JNY0wB;I2qc;> zoRpw*BR`8jUx-df6Xe5b5C8!X009sHfrFO-#rHbBY98R$>tV+F@<18iQCd-3Cqj6{ zI*56FgVsTa0Ij>xIw`H=IDf9n?`fRvs$b~#=z<-$sAF0OrTUQePXdYJY~Q#|*#X(r zvfI`1zOFZ0=~1~mA5`VmXaQp#FZR9WNG?O;phCpE%lJ?pfgg+glqt4Ead0#qHvdwMhb}4IyZ3SC z_siCJP+Eo8#YW%9skeKKmEWN1rR&yI(O_)Vb;;$jb;dhZB$p#Uvi8>Ga&BGT++Qb` zv$l7C9VdzBl`1#&q=0NcS7FzV>IF=*m9& zH5y&?-iwa+_8xX`el3pPuH#%p@iOba>g`~S4_(q@J*k;J#et|%V8k7!PA}28(6M8+ zPDEX?dPzL5IsoxG^g6xI@oaq^Hx&O#sZZ+uweKC@TS%+ioC@2Oc2|9!w~ zlgl}DrsY>Bmn+wDJ8w=dH&M&Iabt2hb6fZ4ekz|!oV?+5ABo2A^)bG4?Ny2!@!pT| zoonyaD&zL8mdttoV&|{>X5ZFz6Vmy?7T@XCPGF|)K(ck#p2T-r^-J;Tlp5u$^~SGz zoM#Ow){>r{AZ{b-NG1OX5L0T2KI5IFb=^fu1($^`R;=*0BkM}i)J00@8p2!H?x zSO`#@=Q91hhj-|8u~f%-I{r^Bdi6BUbJ?Rx-mTla?OAn9>z{50&m`X?s`wgQ5$7pvFrKJ@`IQ;w+a? 
zdh=c{**NLFzLbR?3~`>x{2GleidUoKy~P{s&9B87A9tOs;yh*6ebrlkj`LJ{%8?#y zU*kM`u9s+>=d@|sXNihq^+NN#uf_MU?bkR@rH}aY0gCISaYM&n!MG9rP89#O9XI0M zPw^uMBF+=zru*N4F~?`6`uje65?2U)bk~P@-f7bVXUSrxtv4i>3k%)T)CFp@R#Ir6SZ8|)5+ybP2I1%seCSR@|M$mI#qF=s~=X} zh_^KkkiNf@(!JR?y%$fkULu|=w0*CBU8{`ep=>F$@BR|!nX8?^Ol`HsdSL82>o+mZ zb8>>w{7{0p4dJ`7fdB}A00@8p2=q?^y^ZtSm0-RQotXAdOz0H|fB*=900?}e2~eD8 zQ2*WU<$7H#)p4HN-qVX-dLcs0Q^cwsy@#Zfl;QYL$Ta(j zJJQEtZv+2p)r{4*)YGoKLhHq}POioGn$UQdxk1@qr>@Uf&yHPx6m?O3RDo6A{C-0G z&Na!OSzA4=h94X9vGJ*5$dJ78eL1RUHu-J44i@zCKU>*eA6^s`G}QM$n&00whrF#m z6Cz`L598k$8O^`I7iyP(tXzD5r1+cWRBwJ>0sk-BN%anq`6P6sTKT<`=J)E}V9>`~ z#+6hrn~=%Wio5tb=b~Qz=azh(^^I+Q{fCo8y+@hq&GiQhGIA^OGd!cE=;(}KVMaJO zHp838r9^w&G9?b`?|K;pit@w{^+T5D8C^0u!|NGc%s*8zyd0qgA*Xl*+5+{eZQf4t zhg(G%-f6l1zwylsBoAXStq*3>vIXS6R%_P5C!_KMbZv<7&os@$HnFIR~3_9^=a zcmwqzUr@HoY#;o*lr!|J{@$PbBlf(;M1Plx_}JKCl9fZBh>bg4C9faGD}9;QBpJyX zlYf}&EzzRn+Y{v!TtwfBwSchaWSfv*s|DyiWzA+aj%!3gd^5d=S@kb*5%>JmZxTKO# zAo%#|J}#!0#s7P=g?Zy8b3?wTew^MjtMNOfqO7vCKEJ-Ic&eygjz4p|XgnC>&z!HI zTRveKH10}d>qLLz)mNA>AB_#eFK6*G9E{pIrJ}M@6m1@V;+X0c`(lenRwYuhebom-_zvD*i`E zJzPGK-e}w;TJO7P56>KDJ9c|_eKxe$Dy^&e-(khtp5a$)DY|#FjQNGvQAC(C@wTCYvD2u!+gmpI7auZXGsg zc&;VO(j7}}*?g|U1z&k_Fw1wOy_k2(PM4KV$aM?VI*Q z9{IyYH(Pq1eOURQhW$^E>mnoJf4_zQYA+of)G19;`%#x@*4vM|L<Rkpz(TYXaqE#WkXXGj%VWI?dE} zqz_q3pvl+C4@&fjVx0;b6kBiYWw^H2=RZ^Sx8-w{X0mcwo{bG;4*svMqOOt~Fgt|* zoWOsmz6#MEU%RuhEfDs$1biW9b6dz6_BQx^&M{5RW8@x}&t$p$Q{+?SO|C5Vc=)j> zZ=gKSKloNxQB_*TAsF(ab_bhYR_m##u9EVig;ak*UcSs{V?*V+9C^feuhcj!+O7H% zRKtm>acECC4jTJ=(Q&nARV`VL@B)X^AjY2zxFTcZ5b4;vezY)79Mz_lfI?P=;X+2d}_`(a(K zjqZnbDoJ0^Txgm|oQ81w6_Pf6AG6DPY^;55M{~2^r)@;ov8dn1ntaWEZ`ddEh>{)J zv!vP6sAyR+$0*Hfgf2mx{L8hMonzQzti2Br?GT0XP2R4HYio;Vsbl5G254($%sbM=`=2PT&jb||(7Y{})wA0L&ST+S^^W9-Hu$>p@zeoxZ)$Q6vO_Sf6{j8%_j zzc-YZPkQvUg~Glj=X9^X!`IHw8~FywaaA_6uC!vRYL6o3}jQgUp($@z8sHxh2YY_o-2mue{8vI$9g}WV@}|DLtmMC7>qV z>T2Khw`y?gGjXpI#mi>xtNtbzPv%oV!=LqDfBBk6 z>~m8Vze3}!)*G1PGlpBA$J?@A>nlgKQ_myh0so+zL+bQnry`k?OorDJs~+T9rN*otxYqeMJ&#^7OwEv&i^0h^n|?l|b%mw+Ih4{= z+-F3S((Ly4N9f5pPt+8WpO2jnbe~6=*Ne^bD87tbS6e;9G-HbvDZ0hzdDJP77(0G| zu3R5m6E!PhES#2Hj%vC6%jD~7<+_5RtmNfQ)N<(=$>pr=-CxIKH$_@`^qg&spV)PJ z(UaKwhg&u|-t&#qBwy)m4?Any7CPJ2gXseKVCp=DoENS4Qw53ae1(3mMtZQ?Ijyfy zx;Hx~?Ie4y)OJ4dNG;40%J!AnxrE9;6fP#-6N!a5Mibdqy+aPht9KUbv{k1ne`PI| zr^|_+Abk=~lIv|s_H;#zJ*klp&pw9pN~r#*eNabr1ET8-e1P)*d~?fB>pEY+=l8Yx z0%18h;S$z$7OE#h=H`Bj)?@gwQl?X0>1(;-eUw@KZ?P~U0Yjn+wN9jER@m8++5wy&I|t@NneQ+0f8jTSK0@nYX=j^Hvh4k|>vyNnMl ziN=PCB8+wL*!M_?4;B1WeC0PUJ~UI=jy53cdx|u^N-ZuO!hgh@q(pI4^nQZYhv|54 z?&Z33kp@eblsZf>JzWz zCzs3Ca&PH=Sj(Yf@*}$^B`?pdGu${Xxg3?dztbe~yi(q*V* zDGr4E8tEr>dWpt`PMfA~eX^=py{J0?*||9m4lC{r=OU3 z(68;d5%+#=-mE0W@t_ztYCVAX9_Tnw=%e@gFwf6Z9sdb^^j;rve&0YxU-cy(H+kHr z^*NKC`$m3GJ>SgMH~LiEKZLK~ah}iXqQ!b5J%;w`xiVRtC$;|){T$b++fRu#?z41E z^5-};>*2o-_?_f(6le8hZgRPD-H=l!B$u10HT1f?+j}vFjIFR**a@a;ybPS zrTBD8jq=rcQVA;t)}A})G@7l z>eER2DS**dP_j7BwfuUj<2*%~V%uAsXZkcX&bp-Z{)_W$`hlufR~wV4 z(azYT=O&jEb||(dbwAWm>?iq=_ZyR!=hj&-sZTB!Tf_c26DN*M4n5vlsq)^xpHpuf zD4q&&o=T5NI7`Fm&3ip_yoEI$y7&5W%Lc~N*Vir1GnrqbxK4^!qvO5B8|=-m#nIbs z9GECxX5Cl4_2)QGrKcQaSNr(=pFP)0G|ux(?zkvhoH&r&Zgy^dPu}0-Je59l|1IBt zOmUqwZs_a=DdTz&zprGDUrF`%eJ;_8-upZn z`XGIn=N$(k&J*LK``<}A&~ctZAG8j#Td%Lo=I6wgiu;EcqCC#?foA>aM9(Cm{NCI@ z%zniBeV){QzkZJE6os2MYn3ikMHyBpN4>5fdB}A00@9U5&{(GdAt7p!YB2**uKVjcG~{EzQD%v9Z$ONx#*HF zjydh3%OVrkzTiK>o_=fk7bC|7u4PLXk3K5h(KPsiTQ?2;k&w5E1=~Nb+ELv)Y|!xB z%$)q$T}y0f1J%yb9ZPNeMV*ZczFfdpdK_s5uiXFMg^lT1%NMSHYg~KAaObwOR!#ri zPmVl$iTj@a9zN8b?#On!m+-%Aj2%FLC4A+hVCnhxtZxq)R?wTFynb*Nb7rQm{>Pjz mvu?Zi_G{KZaK@%LKb(GY_pu*!0;Zk%q2GJwO*Wy&mHmG_5(N4H 
literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_hdf/pytables_native.h5 b/pandas/io/tests/data/legacy_hdf/pytables_native.h5 new file mode 100644 index 0000000000000000000000000000000000000000..a01b0f1dca3c00cf45e93fb69862077e681eb8ee GIT binary patch literal 74246 zcmeI%O>Y}T7{KvoohEf@Dx4P4mh!S74oFC$Nz*qZRBsxGL^5$?yQN$RAu(W-IFaK> z?TH=`f{y_Q#E~O#;D|VK0zLsp)C)%x=9w3Jv+Jf2AP(d|RlK`1^UUu2JUct~^~EmcLy^aGwf1MEW7_MT-OYDf z{b#8@qYg?M8?Lppx@jMI7Y+(AW!k2?Y$t&ha>tD3nug#w{mf1DS1OLBe&km&L_j;ryd%gol^$aoj(8c zzWC$jIzoTLyfQrQ(nI6IoH9Ia=HNIpuMF?!<$ZDKb#ZoX8Gha?`{K?0+V5wWUxuIe z>b|)2d9(9OI(|I${mw5|tNQ)w*a-8z9tUUL*EBqSY9$@-V%)@$k1yMHA~_{b^v^%t zj~_PwYe2po_hzH*4Ek}Y-%fg7KYaYmKa9V;FFrgQhMzy&pXqUar5~@)3GH*f-(_vi zZd#GtHexKL_)dz6V9-?BVx{u>F0{)ysZvpWmOO z40`lGUVdIGO5G zBKL1d!T$!aZGZm=@2a}ZRG&80%QE7Ae)#;`vhL`6geHA;&Q!l-Rzis!^Dom6%DNMN z$xYVKu{FP7h)arKKKgYA(L zSg0h++TlCC*RkLJaKViWe<<7C*3Qmuw-t^%tDAN|)VdZ54!NE1zr&A;YivuXygoH2hB<}aLY*GgW;mUetlU(Z&Q+m(7_v9@IEkC^gszNyKJ8fK19x;$SnYGQP^P$<}(5pzfC&P05} zlxQl;%t@Kq)5c7Se7_?f@3T!^7dH}p|GX*`AAQ0+di?(E$FI9z{AJ01J$?_GN1n9(a95^NM?A!G yaKGdo)A!f*z1|0X{LtC|&GvfPb!FCV?=O^>SG&FT+D7Zu*rv;w<13lttA7Cz{LOp- literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_hdf/pytables_native2.h5 b/pandas/io/tests/data/legacy_hdf/pytables_native2.h5 new file mode 100644 index 0000000000000000000000000000000000000000..4786eea077533929868e41b46764b039b2c10ed7 GIT binary patch literal 12336 zcmeHMeNa?Y6n_h->qiK+f`g8($`sZlj};NrX%~fp8G{uVL@QiAmW0>^m&HI0cIXfA zqx=I6kx-FI)GReKKTvcOO!*MeX&5C$bHIvELCPm9bnmOt22n~$K$dKQYv5b^=fiHu;7#t!dS{)M~p;!0+=G$WbGM1gqy3J-~tlM06 zE+`Fvf9P3cWyO1bFdE+gJdP{hRpEhWD^Og)r$=qqsAqdWOZ5)F_wfCb_#ko%&S&Ae zSr@?w{$WAyUT}@76ak6=MSvne5ugZI1OkLts_<$^IS@8u=;aN^E6+|q^GQ87v^RRy z(hxAmD~0@!tXss4F@U$Y@LeZhpacixC}=a2VeF-H%o7xh`&h=k4{%fw<~IX8 z2zef!_aI)yVnPcwc)xKB%R7M2B?f;X{vnIOh92_@IADp4W&vBBIw>hxrjj4 zj;C*k4_Wu~Y_a3eKBf3FOFCxENj@IJyxokM#S9OQE40sM2Y^772mvOH9};U%8b1ac zKX4xO`=s#`;b!c7PN-?_?rdN7$RoZhJF9AcU2nMWnTBfD zYuj_WtA8u<3atNQL79rZ=$gH~{ReK@Wo>cmdCB>R6K9V%b$Q6Ei)xLYWh>pw{Zq?Z zj&i=GORI8bUbv&vahb9nOM8*0{`5J807K zi`P6{wBh=JZH_&%cQhrN9<+4c-xe1$O<@4rKQ>}JDeX`YXe9ibEHHk0c03p0H5q)- zXrl=H_YfFdC&B+CnAW?n>IHQ#E`0xKeYDr4K7yWxs*lFO`@oxl77(E^sjcCN3Z9-@ z(?>+ctHIsK^GMCy*_yeHA zsEdtW5sPZB!IlFfru8pBCl1wR0~UjOrgLJWwaJ=DwT@VgB^@&o&V@A}$e8Z$C7&MlNkmz1^?T(S#N?%7q~ zZ6z5EsIK*cWhvVi*zS(WySLbW-P--_el@4>8J#wltSU{5={~%3~XxIs_UEHmBnY?Jsh^>Yp%Yt)0+D;6j;@| zHZN%?3Y(R~H7>2_%AefCg|`2+E-5$i=<>#6JKwAxmzjF=_Nk`g&HL7;P!>(M66k)) ZQ)NM4;4Rlvxhp{xOB+RiBJeyA_zS3>0-yi@ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle b/pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle new file mode 100644 index 0000000000000000000000000000000000000000..84fbd0d989569135859288eeb525f483414f55f4 GIT binary patch literal 4381 zcmb_fTWnNC7~Zp&UQW3cTS34I736T~%B3J8LR%`OtSp5xc!9N^vwPaJvX{=DS!qkx zg0fp%rQi(_K`Z*=n~90Om>6SXq8Lp~6!ih)gU0BChD5DT`p=x1-E+2GsK(QDzdf0M z{`vlI{`u$MW9|%=6y%F0B6=T&7mtFx5tZoJ8&0I-z5V`hGNt?D=@^Nssg&9;cct`j zGGUTbI!t8j3-_vth-&(Ysi&g4>1U4QR`5WgalM_om2p|Xl|_;^ybqTv0cli{vzBu? zLi&4k8CM|p?o9z#Kmvu-42N5GX)kUL=pB>G2kL>oaJF4spteOvbB zT$ihMAc-hOs=RMdeEa;_s}$Y-hUTvT_rCwZDT?DA2d+^3_4}V!Uk2Q24jrX<(p&S% z<+&tS980ph+ti5((e%q$UU3{(W$+9Im3LVKcxIR?MdOs|M?{YiRmQV`qbuNoP-Y|V z;Uf_f&&ePu5s=baa5$$SPc%VrwUJkj32e#Mo*)|=xn^_on%%%PGeD-Okf}xqcqYrl zg>kV*vcTo$=;7$)=;K(%v7BQC$4ZVQn9XTvRA%ikuFc>ZA!FjvK0Six6~$rM$ek=1 zWIR6~#7!&+UzP^3M+C8e1u^K{>unH32rpFH`0|hzg^h_tHF-a1Z4x!!E8|6@Kz8i2 zPtTpG4hBK-n~|qGnZ$K%O?juS&p=F7w? 
z|4GR-sx*7MctYKl!OIPZLF}^GtENodFH(uUiG)(}3AKU|sLUEmN6=2rnM0`<=I~f* zIL_y&a9r6$6i2$OWNXaM8W3cki$B*(m+2lWnft5aID_L%jouSx`Ud^cq0_#JCqFGq#&u;Xb8xT5gfdj14#do%tH#WS1Tz6rn|E}iM5*ce)MaiY3()6|kSqY2$KS@RTxv$`jP zI}}uzNT%XyEIOd$y(6fMcT)R|S+r+jT~nvw=$G1tCM?;JPoqT)?(z8GHbF zQhE=*4-_isc|TJKQk3fXfD*zFD#)X$DSXg4NWTo=L&hPa&Zt8kobJORBczA$Ltv9k z=(r1c$dM#|*yu9y|GG}#s*R>hf>mgROn70{O3UD|V$2ieitVvgtZZc}0-@^;6X^f+ zR6>oJR#o1uCe$7sig-6z_JpuO@1R07QG0egGIe$o8l1z-8rrOXeJz6qF}iZ0g2$OJ zXxDu?ddVtz^?Rq8`au7e@xM;jKYo{gyi`IS1gZtKx_kw*08wW)O1$H=p6&3Gjsdv%nit*xiqNySL zq~lbK*a?n02&y!^dKTVOEWD>f_@pzuG5hROj@iaC2=BBbyk~6T!8yo>XC-Rsb8zi( z(Tvjb0^vRD2=6&N!RH+WX?SVmgP+}c48M@UXA1oiD?n^O?x4dmei3%}EED0S5I*M= z;br?8=iwJ!JJI9K_=5GN1;4`gdzmo?lDz7WWIQj)sN^1|)nQy5TW8e8>bRP$aq+W4 tU_lLQd}7GyN6Zr_<_Q$@1&S75_~9zQ=1}>fU4hpXJ#7F0 literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle b/pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle new file mode 100644 index 0000000000000000000000000000000000000000..f0787f30c14533a5fd96d1fdb0b2707545b169a5 GIT binary patch literal 4338 zcmb_fTWl0n7~X3yQ*Omp5U@f887^H*DTs*DmP#ojOQ9URz*=W^r=69(^z6(^TUZO~ zZmr728=`_%^u;$56MZo;#>7NJWt&zeoZx*-&X9n#v{zheOeHMhzvi2_r6NGV-v{mrmG@=8M9^ex1U z;XzfPRm8VzljM=SU{Ebe0pi^f4u^OC4G;T}JUNm0QaVh9KciqME54lX z@zjr`31X}!@D}2`=gwY1^o5$*zXIIz;YX(sCwlf@M*Q{rpVwXjyh|TFf_NfO_t~ZS zMz}PUbbr6D8agC1EKp(9aaxn3vqVzaXAaQWQ6vh}6!0UaCJb4ib09{Khea%hqwbyF3OPjRVzG;_4x+HPB@tJIeZaJC$jL#0F6my^vaw}- zb8EBX5yD}h{zl^OPp4@^R~z@E8d^}ZIlYO;Q#VdTYo%)u#)t?P3W^51r`fqJy^60= z-`2Z3nq^nB$nfzZ#CSIL_ucha=nSGSyg*$_Y6m&^6j} z!cXqxXkjm1YcbhpQYmyK($Pb_wlqm|fofLX1t{LEtkY1D)>|=WA3Ln^R$i8y%W8+M zilG|{FlT#N$GtRv_j`1U_(ML z(*WEn@^&;8=td~Ww~0BrNhIEOtw|DGtKW{Rd-EEs9>B_165M9TFb2m0yTj#S3z}3i zacu$%$1`(r8A^e8w@nG|PRHu#waBXNJu;Ab#bUrf=&YZQR&BUb~ z8{1gW9aDmC$0b1(xSu!$?~RhcI~*oRI$kv__-(G>JJIa_V77BgkaweNzU{DhX@J`u z!)E-T$1YcJJJ6(pG24tD?C09d0&`irhPf_w)&_MKkJwIJC6Lz^!DT0*fQh#QltR3n zDm(2ElHD`igT}Dg&NgY+%-f{T8$NCSz1HJtcqeiiG1-3QGU5kc6q^13tcfi?ia5F8 zn}I(8yWUQ|hj?bQFE|PK!^Jbbh^>(|7ba`Vw@DpYJ)Tl^owZA0IGww5v_~Y>sdOeO zC*mV2-7`)qc<;1Vn}b~v>yvslSBKO!I%$$lf(jNby3c?j;Y-FT$D!JW`}4*i>ML$AlRf+D(xeF<8Ycj zr1fdVe^oW$R*h$LgUZnS=5-Z*wLafdV>o?aSuA9Dzep|ASaeGD*=z;*3txZaMiK;`qmnp?bit z_q~4N0 zJ#T*Lps(?LUZI@?I$n3tF~RAOd}CM#CafdsPPmj9Pn$Vz{ZO#bCc+$_9CiD#=2>XX hv(TDvq1CoqemKEzxCp-B$o@@HJFoG4kzPF;`UmelFZlof literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle b/pandas/io/tests/data/legacy_pickle/0.11.0/0.11.0_x86_64_linux_3.3.0.pickle new file mode 100644 index 0000000000000000000000000000000000000000..e057576b6894b010a78308041957610d51e609a8 GIT binary patch literal 8978 zcmcIp2Y6If79Nt&2Bis#4HerU5*1JpTPTW{CE^7UOT?LE!b?ayXKtj(h^X@-D2NTa zVi$YwioN%Sx~{J4>RNVJ*IgCe;{N}A?@cC=eB!t6TIy|WQZ+d*K2y_GO^q3=Embxy zB~{JI#_D7yXy)|9CpFqVMops1xO6R#<)qvAE-C5WDm{$!Oq6#ikLM&FXpdLhdClp# z^qNsbv!i8$@>qG7R@u-$Yu)u%pvx7(ZjW3hqrn+_vZ_bR!Hc*n)(e@!eZ9X|Fg;GwG@J%hM%%Jlb%Uw-)c z&TD}CWfmS!Tv{^Vxi$S|TPxdDL~|Fp)l^fHN!v_(0;XBEAG%0($jXihHwldbhg?i% z&#BJH(5&nD|Br;&%gPw9#B(#d zH%;ziWZ%%_e%_YOb1!j~gcZZoQ>Z%SG@3TE7hkEWSQBgcehW`--OX0#j|tf-`s z6gG>WtSBp0SUY*=0;j&(KXk4vz#bv4T`1L7(vfpDv^vWO5bd`MWMTXL7AbjRYH_73vT{**Q7K zmvd9;N?CzHg+~gGBX>~AdGz&s>c7Cqg<+5{YB!N#F60wG8heCtv6V|WR;TMy>blIx z%23zk?Q}(bI_k{F;R-8PrqUIaa#g}x;+>tgapG8*1mKt7TP`BG=R6 z4MuJZE#BmvfQh&{F)=by-lc65udhwHoibyS4b5@6WeEC$kcpX#{6?fwx?;-ehcV@4k5z;_8w8hy4KT{p24E6v;zOg$@$pGC!8GYIDZN!r*@Ql8cfb-Mm}HnVekc}!P(samgUb@{*qtVf2GKJdpJo?P@{2BuPq(47kPNMfs<|s8cd+} 
zrhs!uV;q6@L5)=&PSLDuKQ=<|{y?J}frh;YP*9iwIuJEhHwI0$>LviAgFxLCI2GiA zZtB`>2I}kPpxFCp9(!MnCL7Ya1?sZ8C8&2)hg@+D;=$&JS4Hwbc8Kd>v>m!gw*n-x zx;4RF#wX}D)bdezJX{w#ggji?7HD)kw&V_!)9opMEAB0%JAkvgBWOMd2{u5_p+KWM zfrdSYQNW(OtLx6-tPTf707?NycVUCzWYk?*hL+u&WCSR+bj#^T3b62Yg_45b&u`W` z3O%jv4w~|H4}f!hobE}D1+I@K57T=AjgA2gXK`-|q8xBe?1LJs`-0}(-;a&Zdw-zO zGSIO1SPIyi#~h=l<4|LDJSaji0buk1HVQ*r6<*+2n%@|@5{a8{c@5t?RzQDK8HG|DoxWSqo;QVX)9g@XSnJC1_} zt8+jRg1LZ}wzJ4RqJvjKaG2^mG{7OG^MM@D@j#;sKslh2oMtJ&EmKcmT^$2i$2C{S zY^&qOleW4LfL;4UfUk={87^L*^dvCUdos}IDWKGg9LP}+eGE|_oI9|!)ipfTHC)G6 z5nW6PKaA)S_CY@5+lZcqvahFuT3renB-gKd#nNR?{TZMY9xg|^9Hp$D2^w9I^eix5 z+toNU=<@8Dlm=Ra8&JA*sO*f3ytHI~lae56{RXUsr*qJg>ma z^{Anl8-PY{1P#sHL;=m*%(}M3gUVY_1C_S|e7y}6Pd>dJ7%i_mSccL&fky8FbxO;- zmKU|^-4r50RtCY3Xteu87%K}UAx2*YB8(mqYZnIXBK)QOBR$cBB+|2Xq27a$TD=!E zm8{hJIPSLjxt}b2d;nV-?j%p55mN3cKwX$lPorF^&$!-g!+jP!hx;7R=<}2Y>GT2x z9PW#(Yd6m?p$5Z!8Q|+Hpeb)EX8Ki>qM3e;WvG1}X!H%xaHf5IlOkMc^QR2>cE8T} z=*k<(JM#J#Bv#jea$epB7R<{#WHF+5fkuA|8anzO1$6XxtZV1!`>27V9{_y)Jt*hp zLn59MbzzME06vQGM<`e7A3+m!Tr-}BxO~h)c))zZGDh(ypwUl3^Kto%qV+$0qkQa` z*FRkc`Z+XP{Q{J6`7`abCBR?EVnlxh8vPP9jLTOPFfM;%UAwq^jT*$|?*L!_0m`_1 zLri(iQCz+SAI0T6lq+>DXyI1&J$Rb_0ciA}lm@x?FA8Y-->hqA`ah_F=^p{U^4-TH zji1l73m9rE0vas_r8WpkD4@2kC|K=ANu_ot6vUwiTcLYTAf7Ok27~EE0lRO&I`?fR zl~mmjT*%DV-e9l+_F)}gU^TUhf4%eto6~-vzV-*jQ1jdPM&M~~0MO__P?{^w>Bbb$ z+$OAZ7Vtq%2Z3{{t*7atIms##;g5 zwfj(}TZ7>kwgDO)LJ`K$HK*HBfF+t}w=TgT>h9MObvyL1x;<$AMz#Z$K>LnBqY2PZ z`%nrBwS#Z>i}=Wj?t~hv!$A3APIpEjQOD@k(MPyG9N=pyD4pB|n0InlFdW8iK%*ll zf|GF5*O3%q#dUDA^A)9|&;zd>(7c;_Pzm(!2{bwyG_QXziV7W#974VOb^JD`V<54* zHz@ts2UwV*`+}#b{eVXIr%U)*?^|ftvZ>Q@>WIby#jo+-VZ^!Qm23xj{i{bG-`lG zr&5aX<1#yq0vbJxb#3dt-DP$Rboap)0XsP{-9o;sk^i#$G>f?qTN zmFQvh7*Lp+3Gnq;P|ijYn9r&d7@DjC8m*>?Q^BlCQ?&kBh5JMedRUzWn$IeaN}#_M zXtWMAuYWd0>vAeuLs6DRyw;3@^cGny@>@7Zk^WO?A%;rsxAN;y%021brA(rbukL= zfry2D3Cd7)DZtmuKnqo^1W#3$1M$)T4OLx90aaavf?K0dbv4RRbq&DRYeBh3Tt`f` zc+FW|g`%4UP0t#k*Mo}=?i*N!og0BhZvv$qtm~U8pq*P#uzD*cm3kYYAb7X46}sO6 zGhqUh^GpoLL- z7(C6d1{!??l;-gsdz1p2e~fkcgZy!D`IGwzFfj5Yz#U~deF{Zip9W>lJp;_|InRQj zrRRXSs#63@J#zX21=tbFq7@%uv=c;#cJ3EpbpAxBFG6DVB~Z8+ToPUeL#wXKmZ$cz=^N+8yt2p~lxWY>;Uw{Vytj`!fIl literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle b/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle new file mode 100644 index 0000000000000000000000000000000000000000..f0787f30c14533a5fd96d1fdb0b2707545b169a5 GIT binary patch literal 4338 zcmb_fTWl0n7~X3yQ*Omp5U@f887^H*DTs*DmP#ojOQ9URz*=W^r=69(^z6(^TUZO~ zZmr728=`_%^u;$56MZo;#>7NJWt&zeoZx*-&X9n#v{zheOeHMhzvi2_r6NGV-v{mrmG@=8M9^ex1U z;XzfPRm8VzljM=SU{Ebe0pi^f4u^OC4G;T}JUNm0QaVh9KciqME54lX z@zjr`31X}!@D}2`=gwY1^o5$*zXIIz;YX(sCwlf@M*Q{rpVwXjyh|TFf_NfO_t~ZS zMz}PUbbr6D8agC1EKp(9aaxn3vqVzaXAaQWQ6vh}6!0UaCJb4ib09{Khea%hqwbyF3OPjRVzG;_4x+HPB@tJIeZaJC$jL#0F6my^vaw}- zb8EBX5yD}h{zl^OPp4@^R~z@E8d^}ZIlYO;Q#VdTYo%)u#)t?P3W^51r`fqJy^60= z-`2Z3nq^nB$nfzZ#CSIL_ucha=nSGSyg*$_Y6m&^6j} z!cXqxXkjm1YcbhpQYmyK($Pb_wlqm|fofLX1t{LEtkY1D)>|=WA3Ln^R$i8y%W8+M zilG|{FlT#N$GtRv_j`1U_(ML z(*WEn@^&;8=td~Ww~0BrNhIEOtw|DGtKW{Rd-EEs9>B_165M9TFb2m0yTj#S3z}3i zacu$%$1`(r8A^e8w@nG|PRHu#waBXNJu;Ab#bUrf=&YZQR&BUb~ z8{1gW9aDmC$0b1(xSu!$?~RhcI~*oRI$kv__-(G>JJIa_V77BgkaweNzU{DhX@J`u z!)E-T$1YcJJJ6(pG24tD?C09d0&`irhPf_w)&_MKkJwIJC6Lz^!DT0*fQh#QltR3n zDm(2ElHD`igT}Dg&NgY+%-f{T8$NCSz1HJtcqeiiG1-3QGU5kc6q^13tcfi?ia5F8 zn}I(8yWUQ|hj?bQFE|PK!^Jbbh^>(|7ba`Vw@DpYJ)Tl^owZA0IGww5v_~Y>sdOeO zC*mV2-7`)qc<;1Vn}b~v>yvslSBKO!I%$$lf(jNby3c?j;Y-FT$D!JW`}4*i>ML$AlRf+D(xeF<8Ycj 
zr1fdVe^oW$R*h$LgUZnS=5-Z*wLafdV>o?aSuA9Dzep|ASaeGD*=z;*3txZaMiK;`qmnp?bit z_q~4N0 zJ#T*Lps(?LUZI@?I$n3tF~RAOd}CM#CafdsPPmj9Pn$Vz{ZO#bCc+$_9CiD#=2>XX hv(TDvq1CoqemKEzxCp-B$o@@HJFoG4kzPF;`UmelFZlof literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle b/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle new file mode 100644 index 0000000000000000000000000000000000000000..e6ed07d75da64251660e5e7471229b899047b758 GIT binary patch literal 5822 zcmbtY2Y6J)7T$z}vM5auY@pZ{m8ggsv5*iHVI|^#8cW2z$qjcyvdK9+ks@nE-5aqW zHtdSM_uhNOUcidIzxVX{o*G}{{b%mXf*78U&%59E|5MIA=gc`XvorVjZaIo2YD#0V zSOce;eJaXPv8^|?q%?J_YfWY9Q`xe*7WT`GOniJwr}h)+IF*c;Po=*SNoTarnN(9Y zKCY>yZsz1vCN;z7ICY0)k6P-PsLQl9x6X<5HMceCbc#7OCqAA1x|U2aHU*^9RNwJR}ATYC$t5^&Qz;;%e1 z8hN@!6f3bz+i2Pr9wtH>;rX?+9lSIVjw5X*vb4RpwS%}dQc;-+SF$vu9j#*x<+M}e z*v^VZ!Li`CF6{!BrdW%Xz8`A~6%qHe+}R%0*Vf;!AV;G$?dqj!X}6$PVA8s~I9aY} z51Z35!3N06o{3;^Aeq<;!>eY{i$jrBY;ZJX1x{d%ehWF}SyGRwx*QdPof94{L8 zRkWXIEXmUZqj7(u(d4*_CK?`3lVBkU3kQTW8MzKjHml|R0~JlN**qxNa$S<6G);xd zX(1gf$vH&)GU+%}^wuakO!Q)&gmic?YBkOOJBFR@bcCiO#Y>~DRO@<;BEO)m!A_OQB6sJ0$L~qHMt( zZIalQsF6j`(9*Iq9pj3lXqGGPLbF|QG|h3vU1=`ze5_<+USXEx#Kdum+Kq`k9d8WG z4+bMHCm??(hIEqnVDYjgD%0`mwf3Ng}&=Tx?YT%nSO)F->B#&k;lGubC)w{t=rHonr?;4 z+d{fsVta?1nsnWWk0>r|CTmW`wbh+r*S}_+?$vakY$o@+JUt+mA5`>^ z4Zy=D0FSr;JPL0g3+Zv$OrG$(wwmsUXx8VnpQR^r^i(3~w#J4Zb;hR}nK(Ty+v+o7 zie9y9Uo*bHo|xcPY_iDxL+Wf( z+KW$1H~U$gYHp3w8>jS)#TLH&;)N&mudeQ(Hx<2QW(yVyEt^nbHDD1ecd36{ti7Y? zU2E;VbxZyGnm#} zGA-rl5m5}uHID>Bo_IvD-j>(1yuRf=mg#3rok-I!A^pQ!%ONaVmugDku=dYjRF$=Z zznXuA^lwG~aT9KO6y)f9XHPyWlYKF2`7dJmbHTv52&}jmD8nz#b2sU*XQ9ztf|~g& z;Zh^m-3WGZ_TV1q!M}&`M+V0|(V)4PG&`elRK#d+u;TTA*68{uHi~X2kO)e&xew}^ z`vScV?gvubA6O#-UNm=jfRWh%81g`%m#q;GF!jtKmcmy=+{2LSEMlTmap%L3qPG-y5$i1AMWDLx43#;>FdMN>`BG@u>-!P4=E z#(xMjXg(B(@yn?^n6&!%50e&HJ{+w02%sIpktn($NaVQ|4VsSv7REmv6?yq+u;LWZ zzU-msUN#e`4h@>?ff&CJQrrM^bUNZ(Q4Gmde<10m5 zHl6Xa+41-W!Ffnxiw;3K?f@ptsW6h|lhIIDZ=)8M(AE1*Iu+*y=s+y+>)=yRR(vYZ z991$c+?;Ak&-M8u zORf^W^WyV=5H2noS-A{+(4&u^A-rVDAuEKx`}mt3mV@`nwjV2edda{SmJi@7jryyA zYfY|6>eY}yQm+B!_*!7X{LxRhS-lP|jV6OdQ3kI^gW?;cIm%OGl%5;WV$yRH3L)PN zjE=c-*yCGJmB`)-R(u=KM)r2;=<3GvM-P5{2Q+KG6Nm+W7bxVrfjA_~*%9CE{3gB! z4LQCSxayc5o zbjVKhn6w$w6Sd!5$&aI6sE*?&P{scABna+41q%6Tpcf1a`uMf{49aegeOAga_8eI8 z^FU)PS(+^F;1{G5`|OfvVK{XAtm9aP?Xm2>PdV;q+uipn$LspO)8lRhPETh$FNLR? zUj*V}$}frEc3*#4${64)V8yQjZ8Tq#4vFUL($*+3Y_ySk17$?+O;E^h0daHXWy077 zWHL3Uc{!>%ej8}6eC*HQcTjO=-<2}Vz6VzPKG2%|KsvqZNi-fwG$tyV!+c-A*TXDff8qa2{-ax8RabY_tLpCRevcE&a#9jc zR5rCWq~}R#PRinmiiV_3OL=`;N9)}ARrT$fu4?UQu}w+Ueu@vciQ_?B2!wbO)mfQLNx#2LgD z(`J4|{Q8qGcYGJPPj=Dq#HAGjUwn6f46?FO?V546H#TO|Hj7B-$7JK-C&}QPY!XlO zoE0i$(|Tr+If#)oq+4tNPYkm7l(|@ z$tWzg)~0#s1{u9ZaABX=bhnOTvKu4YossQfWQ>cfGB0~FvV+IS*f6rajEr-UEtI{x zMmFBcKB>iZGCn>zdL}HY)sxuLl#0#3yku>%b#6?mmmq>=ue@~8sr_p)1rv-+Oq4ga zx68hhtE1s*E0Z!68JF<=_*nZJnd~3yfXs-+iMV(4pRy=ZtQ?5!)C6)+sy2|Rfz+ll znCy%T`e0h0X5l35w2P!O;(zHY;)+g#Yn4n+xCBpt=-C8#3q3#W6K6&Q;D+HCh72j9ZL_#8JXuj z%+EBg^V9aLmIYRhUt^VrTX-t#X|#nGa-rww=>#K-=n3+PnQ>UWCs{c;UMHu-eaNS> zTi!_E9;hrYoea6(nhQ>TVP$cmyuB?gOY(ABAg8Cig{8Ex%*YweLU~@6)575HHoBZ? 
z<*Y=xJ+58O&dWK0oSRD5$$5C7sB)wDyJJDlXBZdI`h`X=@(H`xRie>yNw^X&HF8`JP8+-W1KyKn}-yFy-m~EStTi2MOs1C6@_*!%uW4oPx?=W(w^NYjlE@lqLP5AS^ zyW2|ZTsrTzat~s4XXd^1d7qK{z0U_Seg8j>xCgB~RGZ;3_Ha%fiDx#=6i%u~jXajY z8TEL^A5c#O@+4FER3J~IxcAP^ofYOxp2^9x@eJyMFSOaYNzJCKqLfZ|l27V$OzQIt z@C73;x&V9UWhHmm0Y0fAc2^JjRfqb45e|+p9^nqv@{*O83%lnPI(yZ~YtGpQd3l{P zI4?93E;sW3?F_~ugMA`gFT(XB>>HuHVdYH}ptk~fJJro?D_fszNoq{q$uuT#MDd3z z_--KY8F}Alc~+R_*u<9h`Z-Zm`JiS4zhQpKus<~ND;GA3#z)*R&Ke(0KDP4f!tVNn zoquEGx4!f5TxTBiZbyA;<+D_${NBjtiJqxw-oLQ&WguUrGDUl7ZBt*or~bg{|0BKq z$;j7!`oH<#PXC{+e9P%~o9Qof_E#g{`RV_j#R(VAAHw3a%E*t=h4ZI1ic?Ph7G602 z?k}AG2;`quen!c{`gwK83lqNUUs5vU_PXW7)(PK!M11&-V5jeaeHu1fNL;n?`?G%l zPQIb_UgCLU%PLm^KV5m=bmHi`5myqo>i@;kAAxhP%za3FY}9}eKLPtbbM1U$-ORn7 z|Ea&SKw2#Y?e=;)u_N2mmd<8*P1Q2=9Db6P19Dmc8c+0YYuDCfOVfh1R-$2XqDOdt z)gF*g1aMi!bya&pGTMu6Lvq@i7*zv}K?t-DXe!fXmFjvil30kaf3YzNF%>YI> z2Tk-&*=)Febqm*J2xy>Nf}*PRLRD+jWNTW7q9LbSfo5j;9TzQb*C7^`HN5g9_ve6^ z4ntq$d22vCr`r$`r4w{pn)sk57VS&jjxvJS9%wYqZhS2&&FgS#5JUx!W!(Xi)e)eD z_;+Ln4BZK6vh|EU#0kK?({d ze3*xIPeIs!^Xd?XMmVfmYhl^y!JtUZG(e(sq8>tbMX52A@nF+|MrVM!2P?~Kf*Qo< zDmdT7bS7G?9tv7W&tdF@;fDi_9s%lyA4v^|b8#`fdK6l$)`22LM+1z`Vy7rY$B>7q zq|>B8ohhu6dTRcwRnho|?g6k-=1@~uB`uJw zwt^PY(8dlJ+72{27t{|GYBE&^#zJ#nVv(2sGRYH2MorXBZn{F*WX_)Foiu zVX3FNZl}9${6?co!LE5~>N4v2rAE&H7oDBD9DJZ>f?7QbH2ebbzdc0tY-j!)(AwrQ zP!!~I0XaPnG~QeZy12Oq=;~AMd`LzwU>i=k3yJ=ey9h#{7lWoIFIp8&bfPlF3zuGk zR;!m%m(6tK^fGXX3Qf-*sh5LAg87Q4E65|kR{)J(3CaXRaTOJuxU0cfy@r}Py_S%v zX!a9-9eZK;^+2OHP|MQ-6MrK$9DfrV6XmI->dj!uEznyWhi@5rtK+0&p56uyi{W-Q zR5ms#zUJKlKCgFz26`7L)-*X0-Z=FEuEjUS>dU>B3PTB`-?50rs5;_!$4*`Mx z3Y3}T$1nW|0u6r*H2P~$XSg!2pHPE0s(6${H#y`vIs9>Q`179=^*0z{^|zq#^*cbI zpMr8xd|1tP`S3iLQkt-9lfl7#t5t5f)+gdg+}1vuRx>UffjlAo=UvhBZjY5{J4A| zF8O0eF8i3c_^Dx4gMjbNxNjkG`_F1t4(hKzV1(6Gpal;<(g-~K1T^|L&>|0ir;@Y& z54I;NP!V?3e}Z@HNd4Kfqm=XIT1RWCXLr*wuvp{e0M1VZuz0;yLZFVN@)px#zLYG|uJ7p`_6wv5s(9o{iRJ&2Rw(nbh ziDS8R`FR=R+^;)XcSjGadw`~rbvlNAR?qyN6!A=BfkyWN^|6klhOzF=#xAk$gBHX( z9uR0XXe!yM6NstIET89zkbR!_1z)F=K#TLdA7uL8A82$kwNaiApoYGuu(6Bp1JMHC zHGn`50!?LFk>{!4e4cB`!|uUAqtif(@_Y!D>;9IpYEhWo?uwiLB!(hqVA<+)Q093C z{jAP&f+C)2CeY}ipgz{as9~&!v$0F8N1z3<9tjBaC{X6Pj+n}H_&gsC+2?r{_&PlX zv^dX6$n>288m*@`%5wuX^qpp77vGI&f$!OXKr^7MdhTA;1cBz7fkx+m(i{$|7Ak11 z6^zw3YU;F|P?U+e?1kY1G^*4_Pm`sF<1HJ*dsPS6@LqK+IPA0I0PYTu*Lh$Aoex@E zGZsLm{o{c~7lJzb<$29fL;EMNF}yr20-LC`8BM44MDqOft|x(muag1ps*%@Izy^9M zC~I*Zh?v3)Q~-evJAt^!Q^kOAJ};&QH=VV9WY;CoT0ISvsdaa;(>Va9mI93~1NElP zpvId5?|wvXdtDCB>Y1SKqI4FRt{0`V(Guu6u0x%kOTG7b9t8S4ABf91Reb#LsK1cP zbwBF+RqI6zQ$Gm^ z^eIpj)MgfAB=>26UnkFyhpT6SMxO%>U16o;Z0ywMsmD@T5zSi!zHoXhsg4v8`hRQm zxVoT*{kki_-i{yV(U14&|E5=DghGSKKNpgyCoQp1eC e#>Qr*vN_7(>)?^YHvoaY3CcqK7BQ9WsQMRGdHVAJ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle b/pandas/io/tests/data/legacy_pickle/0.12.0/0.12.0_x86_64_linux_2.7.3.pickle new file mode 100644 index 0000000000000000000000000000000000000000..3049e9479158173effed407441fb6b50274a87cc GIT binary patch literal 8768 zcmcIq2b5G*wyjQtLP>&RLd8^wv?LJ`r9l+2rNs+{v2ClmtEp}}om00ZNQtQRdQeeJ zm=!VSoCD^BIqNt&&Ww&>W^|lU(Fy*&-@C7wIySo@4>DeK@jP#CG7F9-b5)FJr z>+GVoR8;!Ji)T)dO)4XmMIExKk-o9Y(sXFhws$6Es{why7agxkgRMOi}$Pv z6tYz*JQRVf7 z|LGOZQp(D-bZOep`~mFXfkvjg4jz;qu{<6NUH(!!Wrmf5k)}#d4oOydGSic)WEz8> z_M<VI{4tJ)G_*qlk4Z9p^F7iwOX&#ZlhTtggO^!3IaV#F#KM42WwV?S zkIjm6(!qfHcb#hTD=W+6C9N$fIV~rrdvZoHG_ZmO&NOls4V2_$B@GPeZmG-JR?dl+ z*afX}ZcfhgR9@;^to8J*LZR*NBKHWuE!|bjNH&= zXxt{my&>F)Ze&FWg}>*%XrPG02jEevc#^0V|5`9k-VZk9UmN+rZOFg*UmNm=*pPj{3_?D#^4t8f{Fr8c zXXN+J>>tt}pe?TgOrU&X<y7;2?xR0;$yZkX5!^@r 
ziA?+V(SLdJZ!13`i`7Vpjb z9z6Yq=6lJjCKi{i2Y<3|)ok*(>Je+mI}G@I#Sh@NHQ5iyYsU^8@gunJ)7LH{SI^z& zxgQ58C$`lhV7Ir|sqLADmQ*IgTdWqNnT0VdO(CbMqhYc>TM?F&qAn7Z1J<(?hN zbIs@gU@+NqAQhZ%TuYprx*2M$4g!WVR0p#Wp57d6bPJ$+dI%MTvsJf5jn%Dy$qwBb zWON%~ym!)Og1M{P`b~xcJ>3q7jjtCrzIshGr*wN%WOWB%x+YwfZesflqQTCG*P+CL zd?2dB&=y(V5fsbnPNaCz6y2F7-l~kc#jCrpj2LzW8;!9UXI4&!Q-K&tdCKT+C|Ml= z%tyaF8{o;2V58+g_v9W_@JX&!9fgwB(Lh8#24r+B8w7{1j-w1Kd-|I3Kw9aU)4iy` zRy_em3Vs{ERqF&;u(~%e>FGpJyl8*jhh_`npTsi!?+Z4%AJ9cHnF>Zh=URc1)hWPy z{8QNgPwo#kIt}QaJb(&5$&FeMM9J!OAmTp=WON1_xcCpI3@eqs<`5vQL~}Zm3dEm3 z{evWIeEdQCZ7KrZ-tn*st%7B%hXRq8S)h2)R6UID3i4uDM!#l*jm`nGU&T3%Q-SFG zmBE{a&P9#Y!-4tS9Kl9-`bev_Yx4^LBc(y)9XJ!sc04ra6;;dV0L6F*yu?>cBC|?%c#IBgr3a0 zMs6^ToK1~fYK_ci%IYbgs+^t*@-zozns^$jhXU<(fQ|kNNV`}7%c<~BMm-Ipe>m#t zeycP5R{Uh6DOCV2Q1|~BY75}pF3QAdh6&Op8>$Ecjca4Hu#%pMVDgHXBQ$u`JOleD^ zDHFUj%uKW-=A|^MZv^e%1Xb%>z{0uqHparXQj}yB~p#{uW5PWjXzr3gkH!CKqM@*L4)b50m;kw6OYnAYAf*DuvC@r2bs0rOMm&ol!2e*qi) z3YfS5HB~$(El%2|FCipUIZqN zIrAg(@+XIt4+g(G=e{N6T|cc{H+X=4k0-3I2j=bnKqIjKBiQIafO-4>q>6+6FV@CO zv5Absen4lcKNU1J0D^xS!qh;>Ftr)T(?P&O zQ-e{asm;N7mjF6bL#UvsEg|^FDNJnz8K$-ddAbd-(A2gl)6`I~(d~fF)b>=+)D95* z;~A!gL58UvL7wgeOr|T4TV%tPV$+PIm(v9YHO!irsp5 zD(G}1>o%-ij8Hjh;Cc^`r=x($^x+t=(U8K8c?@Nk9Sb%(4(OZZI@*&e%=7#O8S|pk zue&my^M9efYgxym#_C?cWTILpfD6WaZx+#`iD0Aq09~Mys9>P`vaVC0`=JJbP6l~e z0Zb-3bP73{t_gEI73DC;`$MkQX~4o9AAmAl9|$%&o!TJB2T?)SGg#Nj^}(ot>q?NP zhX9l5X5@G#q%g--lwtNzu+dq-e2x#JYU5u*)-Me*+udFkeow=mW?Tun}<+ru0mi*lIb8pzdp9I!CQ z36$wN2{u|wZII(SD(E`Jx=ya^Q3KcWK%S<7+~)ktR|5*P)(AE_A4qFBlA5TXwPpxb zTd1klR#HJ8+Sm$@3)rYq8+0Z^1s}Jp3oc*n5QEFt@sO~*7J&S-KBo&Idb$W$xKS)d zndVOb8(jjV`I4Mwsi652Sr^=}mO_k|*|erodJ<)RtJh_a;ObQe=OIER;0QFvsz6`DV)!lfFro`xr^o(^QL{X5(lGy>}@z(&sm zI_qaqQD`0IPQS36x4sfJR?i0dx2AI-biOs6iyBYQ^BYv_`P7HqScL++aRC^&da4kl zfAU{M)yALv{VVihJYn?`U_Q)CX$1B!0~@^@n74lgRfU0u0ioXig_qI%b!Ig*R<8td zu-1SJr{7g5)78~rqt{T&P;z=L6`UW}LGYIX4wCC3V-?&0^7KYv;Z(Z`WtzGfZ1fhO zGj%H!G<6$<@T9yQGECh8^7Kw1&!oG^_$XF}{|Jz3(z{X2>OH^>U&Q>kS6>~S#YR!R z7kb=J?*oPF{(i{S`T(#n)(25$2Ok0(eVAIriu3CcD%inCSr;#xVN=ahbS(;jJXIfq zj5Y8$2%UZcA7%hP+{;4cc|LF^6`-=Ryvj4>|k;NN5Ke`19zg)fa%=z}JBbHm(<0M2s(i pjlK+Y*?ffxX7g3nH8PEjE`P53!D{I8NYY;-hB)#u*)JS$kWFla?=HckYsly0dHJU7$Eu*ggkF)d3`Gt5f!QE3#q(8Ip(%sDf2E_ZqOlV1%#?l&{% zJKucY|9r1=_UbB^L{%-GEo98K1RknYEtWAHlf*g;rF>6sqO(ZNM81@BvIeC_uhK!y z&SJrKXsOdtNW8PhC}a#f;n*h4ns$N-QWk;)C!05ypjd^}BqM!VG`;ysQmZDTTcS#z zCtS%my*;Kv>a?n*bCZ!|6gcYDWK4_BPo+|qy$TomNk&{%t18&ZDE@Q$o%DdVEE2ir z*~e=hZi+<4tSf3rGs9vxBE9v7>kc5TO3Z9~1n3pJ@4Oc2&Sfk2BYp9y=g-{-^kRGC zdZb%p&G+qVa#H2C6uY`?)3L#tUWF*rH<4j38Ln!z9o_(G=tQK@P6dB5X3jAbG6Hmr zBqLzTNG-N{O~xjpTrFBiMoa0`=Ki!+oh>+|(ejaF0DG`4DL5C-=9o44n2qL`SpZX3 zz|^cJtB-_<0~29a)FY89POCYMaT@2ehSOS3>o~3F)JgUAr%PImx58wMOHKig$!FJ^ z88TL2#;ou-l;zA76fzDVJJltp0e1%=Wsva_H{f8>f|0Y8<=H}a&TP*WJ6A6?3P!g{ z6*2)PpRSXMR@@4pn8cun_gT#Yn#$FS9Lr9ozoC=KfI0JN1w5AdGMVCoJXJ^rkS`Y? 
zPh%i&kbQ9hfUJ`<)J5X^$*3^Pv2)q9vI2a;c8q+FLeAU*U~YTh{_C%4Or^jh&eE#8 zibZnvqEZu&rI|4tpsRpq0U;oD_1le(!x}q<&xP#^$T=zp``kpqkV5EcNNPx zS5p0rLKLi9msq!XLk5TMG@cbNE#TI)dbEzy!Hvwii>x`mbyfK+na@mnlS?iZrY+D( zYsFAriW!rKa+~jE?Wk>}&zd%%JE~4DQ6*azGFz&ccaIWq3pkv##QSE(Tg`aK4Hel= z+SJLV@<$eh0L#wiNq6y(fHiY~Sd^ho_`gh-B|_xl0gdqs-I1dHphrq4m#JEHv0##< zR_&}QlDC*nKRW+X$*z=;H7^w5AOfzT7Hd>1Tdd2i_F%Cbn=E6Cb%jf=6pOW7CoAA% zK{e9Y$CN&+{-o3s3v0$om%LTA8myo!yv>pd0zmL~34(VBtstE5Lul)iQN8fFs;KS@4<3tm(ozjiJ>)k zGlaM_PhFil$$*7H8aJ7RBboEAeZ4JIxWHkY4(GIi(-EACe9{%fFE`jL4$GOnZC;`1 z_CwfWy4Uwv;Gm1SC{BiUP>-Rx)uZD$RXIJU)mgN0&>IujcX+xetSIt4l1+}39Ay<# z1@Z;N9n_gf1WhfA&DCKZtngkuLM_Vmv+&DHD^vLYSs5sZN>?_Q>$u9ul}u9fA+n}x zwQ26LJzA^-eUJF<35Dvp%tH6HUE-*gp*3-@#2!t!W{JDzRXSM%00b#vtpEV(iQkj< z9%oMqzWm_CJ#7aYBaw->=-ZLz+YanUddEZkGhYTeEHi!s(!sHhb-x01(arhWknWgQ z6(0op^xhrIk_R%F@wu&sfcETipFp~MR@02bKu6qtQ!i3|k6P>S_Qgh=WZGsFM~4 zR=KmkFYL%eIQ#pv3Oz30d;-`m|$nFW+nt3;epI z)j_WAqSz&Vz$F7mWNJ2~c!>+PV8*4$s*o+P>>qT=hcFn1^I-6#Y_*1J$>nBz8%VfT zC)=Tw5D7}!wy<5^hy5J+5e96G^*Q9Dsupe2qU~Dr5-qyWWNBAghL16BsKVip`}lE& z92CgcH{_g@g<-P8C7)0&cfbnA#C38^Ky&p;Y_5W!xSp9(%ZkHXbkg8D{^Af60|){I zd$t^QG*;fvwVqs%h(z7!t!RsfvgW>60t_si(GLBxJV@WRf7d=&!L z5OEstd@aoLb%^K+Jl_cO+yWtAf#;iHo^L@;@OVr}fJCdug90x(-KsToWXvuDc2XUy z;edpETZ^ahjDUPc8|{yTT7=vD7NL~3t@$pwU5in(yM=rgYz;PH-^1?*o3QW8`=cK) z?Rf6G1GX$_gOVR2Sx0{4ZQplm)ugnR{1`UxQgq(Qrh?leh(F=Gox8l<&QF0EV82`b z*gb4U9gFA5!Y#1X7dt2H)-X6K#QM+5=ZY;tCFt}L}3vIqMmILFJ7`*0g( z!+Ta%_PXSL)sjzj*a#l!iF)#L$d3b@KZK=}@=TcLPkt(ZFvsI9^r2^&4?U-o z=VdKlU^96f{h9j^ZyXY^@PCmO?5`d-TlY#kX0b>jEXtq7j#`YDKeNMy7bVkP3M=?G zNkM4i(pC#I?*KFJ?>hO1Y~I1}?3bn4)*LYJ70JB8ka@>n{{}aDW*)1+@Pn>Hl6i;2 z3jR}4keN51nfEVd-oJJ7AKAQDL$fI+GD!e@$j`(yilgCVOsnus*nV2gd7_>_&=`}5 zS((P+mL+Cd19xTFnAVDka7YHBL+iYWZ~{f^abn!Ds0k;{c#j8CbQs*RsG-A=RnnpX z-{7AvAXkY~E<_k!`~T(8r6Vd#5x#{Kr?Rw6M`U};9s|!6GYdVXjB_mG3fXDz3!D{I8NYY;-hB)#u*9J}Qk;7kb$DojGS7m%Ff({A%{c{buHT z=bP{QpYL@JmsdD6s%r5}K5cHI@KC90v9w{EG}f6f=DK^DI}6Ng&K0wE#$e3oQ978} zS;$*9D|Xrnjdyk%`Ltm*+m^{Prq#>^Da$~DoynQYQLI9%5|KVFn%Z_LtyUA!gwp5A zR?>D)x2ez?tztz>B9e##L#>*KY0>kO$>hba!Nq?P5l7W3@)k^mKfB*f4QMMPk-MIK zyz1eGNTg|VK|`7z5xXAg&DULX5NSp8?6yaMuG)M1)kt@*TzdfNi%&g&)_$NDSiM`2 zZjUwIyT8FsmYP!N>atAR0%Lj(rXwxQ5bpkpKv z0aHe6u?-v379HczdM%nyM2o59w*Hh>naSIXJx>pp}Bg1dR(?C1|yvHGFr2lOVWu@>*j77UxtlCMya`7uxL=QvSW7VMREYILWUb_^aUY+(N%n%E_LW?xlU_zs=>GX60{b- zrSy^76H-J77K*nx^sOqXBYrJ-8-XhD_COT?2;LEb;GIG(NNrHt+)E-VU16y8K4u0- z#C5V(LRHjit?mwy&)?zG<;zzEq*R2Svxw&m_b;x+Z zyB<}q?IYlx7YRv%4DFsSLu0E;#|x?oI#u1 zXJuNf1KrN@+7TP-3zFaBb~3$+JkgT{d3z70qx%7Jb`rYyoOnafsVTKh90E)+J$!?t{=KrT~f3%dDF6Z z&$<;raUA*{RjbVxSkB01Hk-6}K&uiB?B$T4pjG9~>J^xv-WyI(?*pX#Ah;5dI$=RT zN+yh`D4mo&j%C>}yUqvhePF>V$ zAV+sGYzjZ%&<`Gwq4|*FW-Qom85bg}Lbt=B?{MhV7zraqB)C#`lA-$VaOeFYknmxh zegyglnVLdd6uzbVsFxf+hPcLfXG3?XTC`1zF43Y(wdgXFr`ymXd>omf_y)u6nqh_; zl*QLK-0UR55cz~duT@F!013v!Covwb3&aD|vrl11@p zOLhMea#eBN&qg8sb!^>_;saj}d>loV==PPsunRr-z*j*xaMrJ(v-(l`b-@bz4|)>> zsUc(2XZc2e<(m-1Wms+wuzU-GybQ~?11z^d5^z~eNO+c3i>Cl?GP+f(>qwhj25go( zHo)Nr{f-t-;Q;{st~S;i3G@Ts^ZJ2e$|8##`h6|N%yo0=ZD6Or&-wv=-rr~a5K-`B zYA=q8T=gT^n4}CwZ%4A3{@C4|-=S5~;wJhN*qVoC-ib58(-FVB#Fpl#?w00m;03_% z34iKdoKVBVce3;b?C0g~2-`A@hccnPuVkv+29%@jChI;pV(H2-%cl2ZV+03uY5D-} zv@CefOUQ!`{h3O_$2EKe&+26D_&H?5hY05XU+DCgVF!9R0No?y)4bp7??>kUJE|0uNTH*q~@`S+F5j1&W_o1DQ(335gyW>fu z19wbqZUXw~np?Lbo&HQ>UsHXFkS9DL=t;kZ(^fbVI)@(MbS{08(|PnaoX)3Dae6NO zt?OgI3$MxVxiHr$pDrKN&jeWhP(G-i4X`}tMJ(Xwr-tVPEH8L*4?&G*Rp>Q;>$S5d%mYZZbUi!?>3;r51>u&+= zUJhvo4O)uK} ztU4^Ph6}_r%xdA5XJV$nT}k?7Bg8y7_=31$Bi(s$uEgqaUfd?kG}$P;$73co8t!<= zFg#B~qkJ9e@eQUFo}7nq(#0B%oFTmnDU3p{fWs~}mVfMi!gH^wvqt_L{CJZPb(|Ru 
zkGn!foD`H(E~!ZcvA*CXPyMl0v2j8#FT&Y)DbEG?WOyu_P}nJ9#tEEJ^a_{|idr20 zOUTT!i7?`@Q{mU&y6}AjA68Z(lfov!I5^;G$cCOc8E$ko1%AVxI2GUco_IPwYVqZ^ znQLRy;K99L!p?wu;hQbw5SV%3!D{I8NYY;-hB)#u*)JS$kWFla?=HckYsly0dHJU73ekTL9pW1BQ<+6g8|SqKuGY~EagVii)8jPz>J^p?v>t(uG`m0nM_ zl5u*vO@-8HRZHh4BgrT*)T_yu7M-6;r7n2|F7}g*xT;oFuwg3vIekvLUt1Q5-2KdB zH4imKB4gGUHKdtgu^W)ya{aXjkX9vTwml5=vfX!FgLLPzmHUyt@Z@u6?*n?Fy=eo| zt+D2N_cb}Ga#M<3UAF1iU`&rflP-v!tKN&OU7z!ByI!2Na zFlD3`TeCJ}lToe~EhM9*bZSdqTC2_$9MWj{z%hJ1)Rq*S^JjC&ntaGcdyrXxQdXeU ztR}0EM2X`PVOP{6kt$BBIgN1|=d^~?T2AXYt>@H9_4cJpT8+29WQE5H$K$H;dpiNybb#2Y|T zVEQ>nktB>17>j47_qkRA^tMoAE#8$Y8V+e;iqCS%Y@zshIyqlbe9lomH<+i2C7dfM z{>EU$$Vzmn_P0CFl>QNS}Vrz zQp}V*hTD7xYe!`xz1FmT-BERNkt&(85J#(+Uyl-43ox9t#QJ7rt!Av_28(DXZR+G= z`4fu*w6em(7Y~Y7Ge?I-7ixt6$~0NRB`)sQ7`sp$DJlC(=U~bN^w#1LU9ce;2Lb5Mzykay3}e9*2%HSGPX{ax#V)OPRn(&0=^X#APv8z z^jh^Nq=Z;lCRV!St*X^v1(o1!mQ(`(fVWEkyn`zRi4E$ScL_X&BMi3M$IRb|_d6|3 z09DfJ?d}eq&EMq`UA0<74zs%rYMTj>$k~%f=kEr~I$ZJ|EP!??4V9Z1T7x%3h)eT) z)v1#V7#L)36Ac{6+;-ioZJ@#h4&!tZt@tY9})pD7Io@u+pQ7uDj;tq)onefUI_sT1DvKH(Qa=|)of4Kkb8DjqlZv4Q+ zJ#7aYBaw+W>)Vm$+YanUdi#TYGhYHaEHi#1(t)v$u6h~hqMP!!BHb~sDn0=8sl7Xv zBb}|!*oAaTuhsD8^}UdupiO0s>ZT*o)0Kcz5CWSKrxO+eR=Jk*^Tq{Qd^irS1UMdKO>(G)~RuSf+e+JjkD?(5K!2J~vA9UoOLh(_S z=3`j&%ToOHp<#4=2e;Lq2Dg`V{LG2f@v|sCREn?(DbsUjhy~5SeJ9iPbLG13LQe1u zsIY$b1I%y@9Z!#cLaRq7b2>FtX^+79^Y~egGfpeh|Ax>owirzJFO=(kBXZR+-OoWG z!F6oikKzMg41F9$mg)AT(69$R_`pq|8#wEi(OH8S{R(G=od@|Ugs34RGhq2zh~?`L z!WCG)5n{O+Lc9XYH$yDnf)wDfn2_#>R*(CCFBRRQHFRXmE(5kn9c$p&gM3?yr*Z2~ zzN3xyM?$^8t$r_1O54_am)xessJW_zd>8Btc39uT&j&lK@5|etAK)m@RJX&HByCXg zLnQ0TkGyU9ZmpV>){!5>rd*1~J8&j=I^y>ezL~kx+synFcmeRcj80UJpF=LZ-@@$wfKGlPJJ2sf&^=f=$p?M@uaNnFIiUd@e=Xzq5Q+j}eV7-H zWBV?x-ow6-N@qslhoPKkgx@@stX^<{9QS&+6nkS;OaXA`hECbFbk| zK?2tLFVZUgl`-Q|SK0uJl@U=;{wy}gV!ZsBofNzvne}2wyT3`=L3@_AT9|1Em}!64 z$v*PPOX|Dv6DJCpwK77Vcw={~Qu*81GrB(PQY&)&yJf0iS7!!!8m&W0iWnWqYcV#J; z*7A99%muMR>%4hzibU&iUfi*$31`H3kB3Th7~HWip~I0?(xCz0U>acwc?joPbi|P} zghwHTQOFN)v_(gx;a74)IuYOa`GZcvM=id@G4pM7GCX+q?Q{y<^Q1DzItZpt zg%=WX({L(EO2(5_TdPJoVp-xY*>JO@FlQ%HWW}~OV#VGS zvG?A4@4a`i_g(`Jey$pYQVUzH{cxoS8W@b7t<{9Nr&M@kMFfxmkBnx@)4ms61ZK@t}=( zwV69B6W84;i_!Yzk$R2tSb0&iUQ_AriSm-lBIJ(JJt`_HcK;iXh4NTl_e_j0Zpi6g zFou6d_xAN#M)yfC96u!%d+hVKI=$LE7VA5&aUXJZw~|Z9H=lRbkL2RgEylhE-s}Dc zPa-d#vhRE3@0NbK`Rm}(xdn%kPblg0)a$)6niGx}Q<==6Hi=}r0~PN>e*nN(c&M~7qOv1Yx#(i>Qx&zxPI(*yE)Lp$0AJ+Qo} zDP7U52l;w%y0QZ-);1Wukrz4K&P(9!NeyPZ5uvL$rmHvc;O9-19uoRF;RUX(9jS)~ zuH8)OVTtk%p=&o^#kEFn5jYXe;-UZfaCZ2xH+;BO_;3yJ=@d|mQpjg+Cb*a%? 
zRuuGBzTP_BRLSDOs?o#qdYfP^*4NI-RO=DW({_|Wq4-q0!s2>cIC{Ih-ae>M1^=e> z$dxL<9_oxBpydTSKDwX@Qr>8*|h zJ;ochaC{={^S=&Wml-`4gBs`S@#$1jPrww}ofF5tuscTY!a3QMtxQyUx3HB-;hgL_ zLhl~TNx9N{xH*}xC$BswV~nmyFHGt^6BAv}%8T5fO|Gss9p4-4D`!&mv*LQnvGDzg z&pmU_(Y-3Li1t!i*)>$t*r=yYEDJ4UjNY4F-p3oAYS#Ov?Lurw)o1j66-{YxBafc0 z%y#;byJ3BYm}Z5n)TW= z>s;VgrjvE8(OKG?=}p6s>e3!ZVuM^wU1Q4VdON0uyl!-3nw8YrjY-&ePH7W1-qdE} z?QH{nh|#k#ra8Wz%RbC=V>*;(=PR8L%^ns`ea%R{Aej2Yl|I5v{V09p%2OXR`Y3G2 zXhjBt{eDb1=(2#jguHBlKHBI4JK%c;PG}CDz;4&q>SL@27UuP_&I89K_3>@SmC20y zVO%+ZYxqPCC(kJ@q}y^u0#kM|<~s zi!k*Mw3>R8(+}FIe<-gXc2oaIQa_58?fLu|TYg;WC)hF$=O?}H{~rhGQ${~s;qkD1 zCa<4OcRMQZfvULL6)0U6&_meD`hheDwD5*cjpzQ(m2?zbD(w}kAJqr4B4th>F=6^k) zR`+y{hX%Wayhh0EBcJX8#q<|Oe~B>nm9M|1kG^S%9iaaW@7AdF?d zl=pD!=~s5?H>H=cO9)lVxtVcx#$p`4USag_(Z>ISW&c$AudwXjwk$6@TyMgOFj53e zH%l={Ne7@WB^EnUkxEJ@a5%Bjnd#2W(gm!fD=?U0iBs|4oMh>S9vE2zSW%EQLB4bc zrc=$*gIvkwQd3)3S9M6Lt|=o7?~(Jp{p=k$9qa|^1)aTHRAJRiZ%Qq9KUoVhZW|AJ z*d)>iLSFg;6W-EFT)(m3?DpX0Iou<%Hq^m3k#&$vN0FpfxWPmNWp)DA`Ysf%ItDFyc8N?!d{Nt=@p9&xFNZSTtD{5_sI9Wa@Z2`pk7~`upB&VN&X%3{oC{&wgTM!(aYwNll$)U#EM?BsqMxPV7u=s&sVeap24owiF+(H3O11@ZhPoDHJ1YK@Awjl>MI$4DEo-x7 z_sb4!5IxxutYjx(u(f3r6|EzI?2H^Eqk%3S$QTwv?J}^EvB0qQI4W2>iVfqDV`Ksl zv0)dGl3jtmOtiQg74Xs|uv^EwGY#wIU?qD1!*x6v$t;)ae{-Q#pnW5I0xPmyJr&I| z1?0(U8 zgKrI~U0ES6`=TtC^M1A>-)$L07|6pQ64>3J7TbGaTK{8`x1_-g( z1Nl-5RYW1xC4MhsDV8jQu4nJ|U z%y(UBW&>#X7_gFsK(<^|kYlMp04d=GR*r*Y4x*!09w8P ztmHx7*ujvOOb0O5`+ zK}xOy`f|0!Yp8JUxE2DVQquZp1j`egwR40&*w+z87? zZUVOHrkmL?dUOj|$*n;42&?5bDmdq~D7Qm0atAPS(VZ-Sns5e@%YNJgFmM#F!e1yJ(^u#y*nVa=DQU`@NWUWR1k6(ENH zDoDv|Kwn0Jb*uBV?NT39RI2U})+WDro9gB(gOCnEDL_Q_DcUEC;4D z&9Z`QPkMX3bkiK^2?57@FHp2( zdqZGTYk`&Y0kSEC#J*IpskNEsJ}SvNklc4A=?4J@*9CFGuE&yY4E-U{@cLjS8&Czq zT?#UQ3hae}ZqLGnG5$xs{m&?BkE0EdV`LyO>h2&2v^p59WFuf`bz>@=Ra)cI7Rx3m zX=GC%KbXl7+6cbe)(U=XNwG4%fF>VI(Wf(A>3rgFYivNIL<1`vJsb$&y>eK;F@+|MvF z8X6;GfM}`=cBKm8J8Cp&qP_6FY=KNf4I{e& zqvbG(Mqqz;u#$3MWPcB;+8R{QPRrtOvP?z|BNf2N{+=`f`%}P5_5w!s6;=3diT;1J zc=@6}cs1T}_=)e67d||=v@iILy>FjS9`b4V(|vo%RMarCH!!ll4~@Y7zF;N$0VDhT zh0;#;{>-&81{->7IRK)QhRcDiq;2G&R?-Mbwvx7$X^^lysz98kgTbv2mlS07Bn?(F zom$RRL8__XqRSxR4wo9FG4nG(zIZ^4C^}qfA=6YAtYjuIG*w3hP1Pge4wnX`VX6`2 z%Pe4PQyMZ&30O%E7@9Iv&{Pu=?nF2QX_%S~@?{RNwW+y~X=)x=$)Ui|)O;#vDvyLa zrw&6JrWSyFIULBt}LcG<_V%m*atHZyF}-1f*PSJ&|dcJqfI25zw0DwmO+AEc@uv==QQ(=IzO- zXCJ3uRhCnbW8_pIKOme2ZZ+qpQ$&x>04q5Y7!LF-Dmc)ynYYqF&p{3bdM?P9^MHJt zKcAfT>Ya}-fb4vHA<{`%3~cS=iy*VC2FVrY{Hias@E$ z8Tj~0q@0hhVj5jFY$FTz3dMW=7nN9rxR`Msc97;j{ zqJs1NHxhOw<0XR^Q7qyjkT1nRbU8ZcIzXl=emqgq5g3~4LEnNc*=bRvn+hLUHL1JuZ+vKyK&uL^>(GfUSqy z8#4R278v(?YB5~i@2OxP*JhsWz$BBYA1muXj>K`&43!D{I8NYY;-hB)#u*)JS$kWFlvca+fA<6DaBHp;5F!Y?(xqD}s3-__-%v@oC z38{8zEtFnlWkqC`JuORnSY)P&RF~s_o@9Z}6X~S-EY?Ee8yNNMUmI4PSlQWkiTZL35BE4EPwc`p>ttO%grPt%F zq@A8_Qz124#fpWANFoXZwQ3@!MHeTN$;)1bi~UJNTve;c+prY=Iekv5Ut1N4-1GFK zRSz~qBI7m}G{p3X*o}y{-f;b4#EPcbZ4UunzVFWK5cjNFdkFFQC!Rh3AmByzwk?P| zV~zJ6Y;cmLsua4qY}2tpnI45GGq;lwE*Yt6)g9goQrC%wLNyiqlQy%Cp^#A^VV#w=Y%HsyzRaaV|Lx;w6{aWTwe@ zVKSzKN1?1{fuN8HV6oF(at2U$fT0XBQKAMjm^5!>ZDn;PzbokY%{TFM&KOL*QkZxK zGx1hA5SM_7b#j)vOng2O6-qgFHj`4;Kp@zTk?U5-*~{lP&u^YLr)5s)Qj$rCjB~Wg zu0nyFyR4P-F`X31s9IBu9VsoAH*zMKnJlKPdJC<1o=aw_)~$`=NGP^MC#_}E zc?A|tp3ZH4thJ-CkzQ*?zwW3yxmcA{S&DBfScs1iwid>4QWELwk+hPLP8ck?os_AQ zOXN>1&u5>1-^Jgz5BQX|)H=a%{4aIct?mE*H+aLMN-?TR}rwgWpnmt=d!4QY>s6 znoHKIR-F}ehBsPLPXGhFNix8jxlrID=yTpB>?`a!n9GN(=V0fW&frBWrISujFsS5d zRB$wl-A%8yf(mCig5yYzbsR@=6t%<*l9%i2^@r6=-8QdRbou$a4n?o+wIDbz<(xPS z9-JOQW2=V~II0}aYju`w8}Qx<%XfI9D6A{;8e%0zOHq(Hr}cxDJ=WLJ z7A3j7fpb~kuhrlTfbr$vsFv+xshN`;m;b+U95hO$E0fK3TxDd7CMozPv}S76DIU+= 
zTC4*jllb!sg|52N!r--C;;7b@S_6++oZ&>^7J1-arIU@Ie^4$qas9#O{>a)h%#jr$ z{r<`O+m6&nB9m{?cOmB54j)3ic4?nM>x0l+5@ z>|Tx7qR-lkIIaGfoksz?_qvZE?wi*z>lomuyKm}2)Yo2k@3H#9N89$IoynWF%|_hK zV3aK`*{W)_`2x)u+015>xcyocpLgE@6$~a@HrsAHu7X_+b_%j&JKE`#Wr4$t!)uL@$v|Wo{tVNfatR72F_z)69Zw@EhhliPL&^KS* zWOI@hCdh7=d_=X}ek+^}*UQ-eGuKCP<_gl`W2i|rYZ?nNK7(ucO(SR$5dI3V4<`4I zBl~Et1vNZ|b3sXozajJ*L!XuRPe9m9GJf*Z%J?Z{58Vp;462WFN{O6%LL{gLZdIAA zpDvYkFH%BeKoj-{Bp`-s`1bVZC$)MwmE-hKGd|AFpTW;^JL9qv`ELxp#wmo!{@GI5 zZ$hdnCi@&@5?n*eeu5bITBqo$QqSfM&fme!d)#^IZW|slmsg4bB zQbN9=#Z!1%K)$Js_1}bMf!q99pqR3)#V)yBi&1l3Gx-+i8SKQqjh_#8V&9RsN#Dh{ zyi(l(yOxwe$@dU8lka=m_kCI=DQ+S^fX%y);+IJL z-y8~oAATkK;Xz~t%=)mdoWwDK`OtC=k^@k)d=(4qULKK zN9@0Adeb<-hu7S`74h7s5(mcBmoRzQXM&OV2psIfq0?OQD8pv*Ylid4V+>o!Zx~)c z4tX*9xa^bPGG<;-KT*CY{4PZD`|?HM40iKl*h8iUOX7Vbna$qUIT$ zJS$8194_Q(_a`1Te5hywU;kP1(q9-cy1LvRd#KD1$tS#8`pga^o|n{mAtc>jCFx+i zOIgiKwZlxczv<-fvT8>{i(ix$TMIz7mn78&eAR~E9S62~Di76x_@UTQNws4k>HZ-} z$5dO)RQo4W?O!_ix2)RB!D5QXPttn$j9;#46yJi2{fbR1@RzXew35?!VL)SyA(n0$ zhg(*?X%*a+lx$kf*THcbCCWn9bM_}$?eLsUQnn$? zr(D4k0*d}5{`&#@ARWu)@&^mrDEN7f7zdAKiw!+ZCLGTQ`5c19qNv5;f2PbVod9oK zdOG|KHbrcQ!d@$@kVv6t!26lo=|m)hrbH*fjZV*mzf!o;$@q(3XXq4s)Z)t>GuK9^ z!h?6;PN%^=FFp&bBcSSZ_(8JV3|xww67fXE4(vF4xDPpqTof2-3!GF{8K2pm*~eWL*kutEL_md$zy`}gv_9BH6meoi;9{@ac4y}U8Wlzh}9u}EtDwbs_YKB>9J}Qk;7kcRToqNuG472P~@~he3Z~phr z{mys4_xGK%+bbLzRke5~pEfsBxT#dNSlX~n8tcjzb3MJyT?J+~=ZaZ7V=!j)DxJ*i zD&#Gj6}xPO#=ClqeA=*@ZOddC(`x37lx4ud&g9G$$X20MiAbLoP3>4stJOp_q4c@D zm9*X4V=A;pt5~@>5lKXWpjJ)9wCFj>Wb)!y;KRQX5l7W3@)j(GKfB*f4QQ((k-MIL zwCcgeNM!t$f`*u`i(QX+^L5uALab<>+wlKS4;}1TKpSS>gh#1Xc^I5m+lw5m+a1l)!p{c2bM3?C(z%wJJCI=tPH}4AGp+ zY&O&M6zNcIVwXZ$!^2delfa*o9eOHIcY+%YIweF6PBK~E$Xd#pOn!aVT$(L(ZCq*O zjrAr|=xMNcs!peoIPv{79W{;j5ztf>wu=ns1a4eIr>BEidAcc(Ogzo#8J-Ex6pSGN zE|CE+g9pIYaGETE0MO}7b-8?fA}W=#t!yTxtc5VOY$MmBP<6%pmPIWK=C#fXe3WDo z;&zr+*RowdfHYo)U^ovwp-`R(#D ze3#NkYEMYRBiPj5n$YIvA_Ut4H@8VLMSZhL)mjEmk--*(B3+^Od6`F z)mlBBqPoA`p`9vOC~KP4V=&8XmW7?)$@2aV(9CdX3fr|+Ou?K%yR>?Dg=Ci&MGJzk z8x-^_`Fd1v43FK-ueE}TU|1(`l)!p{qXo)ZLj2?v`nvsL4Oh3r?G+g>e>b4$wS5GF z^CH1Xz|g_z5;V2DI8mS~@T_)w`L;p#nY4VTD~iIqqNpJo6JtVAl0`|NR^ZsuDu%n< zkY$fg9cYV^T+RzFxdE*PClt(A8wbe(^nD1Go{;0{|Bd6IQ7YY;Y_{`qBU>~n^GrzQ zXw@kZ&z2VJ#K^S0`K1V5bv41@z1*RjRMMq2ikQU-QwHv45xBiN-2(di<>CsVKiJ$G zsJD)ApoU1lf66@_FE&IXQ*P3CA?7*`9YnnCf&RHK0oJ7_ZABcM@W}d?0hiyHy9IG~ zTSa^j@X7tV*C4j)v-cuSYj|eoVZfff&SQxC7BtR20yz548+sA-b!Xpwq+#g6duh?i zwxw@bQ`%6H#AtOL*ZSUvMmwd?Y zkS=O9P{g|#4vz10==+Z<;rx-}mNeKp8W$p|LU)48uXgA)mZA_NwPc(5^8){m|X27VXfYOSS0vT6CGo>v6~lA4Ov5&B0{5 zc7(|Wee<++010NpC&JkP?e039x%_naBx+L4o5o^{PyaLWrr|dU z2!93G`;+^pkbSg|Kn;)KTu@TtKOK0Cq0h_v9tiu8jGsBNGVVq8(5w zg&hk08lj zo81O%r#d%g@@aaj7Ej@60sXc%&U+G=1-|3W0>zX?mN@jgT8x?NTj=*dPk$$N8@}G( ziG3ff;OEp2@TsU&`(W3SG8p|K!WMeFyM6zWR!NJS=^e0n53T$$u7pTO``sz_J9oMJ zou2?L82s+=tM0)CHGCRw_oKnRlG`eYayunaZUZEi$Z~HP){>Qc)kyC_blAb|hx457 z46|%{9}ZP;QkkYd#ciAgzw-w4Gl$->i`pa+t{wiSl z2g^4Je=G4268|TMLg0r3VLv>KtbkcR)|KNpCJ1@|5syi5gFT#$OWgLT#{~AJAjxxk z4)1D$d9JPH>cLf$?rKcH>m$mz9>8yAbF~MQTStkQ2F;vV4U%wFY(qB0-y1LvRd$`Py$tS&9de09d z4u#bETR^(Mhopn?E=3k{)n4SPy`SDi79S2RCT*bFk&tTt@KhUlcO2a2syti= z5{F{{45{|7fOP*3Nyk-N!c}_(RbzO-q_Ze|hZJNL_>1R0R*ARnaOjPNUfpIfLF|=o z78eY?>dmUc468Xqtl_K%E_oSewQyBZyO|=^!MPeF4Xbn4!HE?ch3n!rVW!FI@fr`X z*l4)o`NGB^Y0wc25AaZ_lHybvTtLM?62n={8p?B#At$-Xd!GvwYu1R1$$f+uXj5n7 z_UG($|@vM*s1Uw0(Ac%otlww^uGsV>T- zlo)9j&jrd8Yl&mU?=q<+9`Aa|N1j_rK5h|UGhIb|m#I<_*JZQtb_3dCgKq?^2v)KA ERs0kyq5uE@ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle 
b/pandas/io/tests/data/legacy_pickle/0.13.0/0.13.0_x86_64_linux_2.7.3.pickle new file mode 100644 index 0000000000000000000000000000000000000000..0ff7f6c68f34e468b08a9516076f0e07072a8357 GIT binary patch literal 7278 zcmd5>3!D{I8NYY;-hB)#u*)JS$kWFlvca+fA<6DaBHp;5F!Y?(xqD}s3-__-%v@oC z38{8zEtFnlWkqC`JuORnSY)P&RF~s_o@9Z}6X~S-EY?Ee8yNNMUmI4PSlQWkiTZL35BE4EPwc`p>ttO%grPt%F zq@A8_Qz124#fpWANFoXZwQ3@!MHeTN$;)1bi~UJNTve;c+prY=Iekv5Ut1N4-1GFK zRSz~qBI7m}G{p3X*o}y{-f;b4#EPcbZ4UunzVFWK5cjNFdkFFQC!Rh3AmByzwk?P| zV~zJ6Y;cmLsua4qY}2tpnI45GGq;lwE*Yt6)g9goQrC%wLNyiqlQy%Cp^#A^VV#w=Y%HsyzRaaV|Lx;w6{aWTwe@ zVKSzKN1?1{fuN8HV6oF(at2U$fT0XBQKAMjm^5!>ZDn;PzbokY%{TFM&KOL*QkZxK zGx1hA5SM_7b#j)vOng2O6-qgFHj`4;Kp@zTk?U5-*~{lP&u^YLr)5s)Qj$rCjB~Wg zu0nyFyR4P-F`X31s9IBu9VsoAH*zMKnJlKPdJC<1o=aw_)~$`=NGP^MC#_}E zc?A|tp3ZH4thJ-CkzQ*?zwW3yxmcA{S&DBfScs1iwid>4QWELwk+hPLP8ck?os_AQ zOXN>1&u5>1-^Jgz5BQX|)H=a%{4aIct?mE*H+aLMN-?TR}rwgWpnmt=d!4QY>s6 znoHKIR-F}ehBsPLPXGhFNix8jxlrID=yTpB>?`a!n9GN(=V0fW&frBWrISujFsS5d zRB$wl-A%8yf(mCig5yYzbsR@=6t%<*l9%i2^@r6=-8QdRbou$a4n?o+wIDbz<(xPS z9-JOQW2=V~II0}aYju`w8}Qx<%XfI9D6A{;8e%0zOHq(Hr}cxDJ=WLJ z7A3j7fpb~kuhrlTfbr$vsFv+xshN`;m;b+U95hO$E0fK3TxDd7CMozPv}S76DIU+= zTC4*jllb!sg|52N!r--C;;7b@S_6++oZ&>^7J1-arIU@Ie^4$qas9#O{>a)h%#jr$ z{r<`O+m6&nB9m{?cOmB54j)3ic4?nM>x0l+5@ z>|Tx7qR-lkIIaGfoksz?_qvZE?wi*z>lomuyKm}2)Yo2k@3H#9N89$IoynWF%|_hK zV3aK`*{W)_`2x)u+015>xcyocpLgE@6$~a@HrsAHu7X_+b_%j&JKE`#Wr4$t!)uL@$v|Wo{tVNfatR72F_z)69Zw@EhhliPL&^KS* zWOI@hCdh7=d_=X}ek+^}*UQ-eGuKCP<_gl`W2i|rYZ?nNK7(ucO(SR$5dI3V4<`4I zBl~Et1vNZ|b3sXozajJ*L!XuRPe9m9GJf*Z%J?Z{58Vp;462WFN{O6%LL{gLZdIAA zpDvYkFH%BeKoj-{Bp`-s`1bVZC$)MwmE-hKGd|AFpTW;^JL9qv`ELxp#wmo!{@GI5 zZ$hdnCi@&@5?n*eeu5bITBqo$QqSfM&fme!d)#^IZW|slmsg4bB zQbN9=#Z!1%K)$Js_1}bMf!q99pqR3)#V)yBi&1l3Gx-+i8SKQqjh_#8V&9RsN#Dh{ zyi(l(yOxwe$@dU8lka=m_kCI=DQ+S^fX%y);+IJL z-y8~oAATkK;Xz~t%=)mdoWwDK`OtC=k^@k)d=(4qULKK zN9@0Adeb<-hu7S`74h7s5(mcBmoRzQXM&OV2psIfq0?OQD8pv*Ylid4V+>o!Zx~)c z4tX*9xa^bPGG<;-KT*CY{4PZD`|?HM40iKl*h8iUOX7Vbna$qUIT$ zJS$8194_Q(_a`1Te5hywU;kP1(q9-cy1LvRd#KD1$tS#8`pga^o|n{mAtc>jCFx+i zOIgiKwZlxczv<-fvT8>{i(ix$TMIz7mn78&eAR~E9S62~Di76x_@UTQNws4k>HZ-} z$5dO)RQo4W?O!_ix2)RB!D5QXPttn$j9;#46yJi2{fbR1@RzXew35?!VL)SyA(n0$ zhg(*?X%*a+lx$kf*THcbCCWn9bM_}$?eLsUQnn$? zr(D4k0*d}5{`&#@ARWu)@&^mrDEN7f7zdAKiw!+ZCLGTQ`5c19qNv5;f2PbVod9oK zdOG|KHbrcQ!d@$@kVv6t!26lo=|m)hrbH*fjZV*mzf!o;$@q(3XXq4s)Z)t>GuK9^ z!h?6;PN%^=FFp&bBcSSZ_(8JV3|xww67fXE4(vF4xDPpqTof2-_B{v8@F}K*b)4qC_GNmRQ!fOEzql6z<&@DY7E!z8Jf|y<_ja zSL_XY@4a`iVgJ8x-n>T$U^&0%{F`&W@6OzrJ9pZhnFoioD@ZI+*&!B-O;0ss>ZGI~ zr8Zep*O>C8-4tY3r}9ImG-_t3EfSx^f^liT!93}(l)ART%V=^hx4S9`OjXX4VMvK0Nkpc9tLs1511D9$?$)Lcc!A8~&UD`(0S=yzTC+ntL#wOsd{n2m! 
z*em|p1r%%VSl07oeR!DgWdnGawZpU_eb~sz#%_#;WP78RLlbrsM$0Ca$H*}BZd249 z?#qaDCMhG)Lpw`xOuL;O*^FJQcStsmOyvy462&Xeuasic&Jqd1z~9%E#z z8^v*U6n98u`(?Qsvd9bhEo`j@uGrUb?s8!}RDD|#~;C4-7JTq08=SF)^K zYb3|oor^LpyLQW1H>z;P{leuNa=45`4wrMtQs+s1YUR6_DS66jls;(&`2alh-IPkV| z<<>=})xw3jHV&m3P2~_PS5?Ef%*!&S9-5THI2U%mAI@I|?iJXkU(L716wWMr$+qp=cPv7M;j=we=adLXYK&JdSM~Z{!5F5iiPr zTmYCjN=^&{z)40>tUG40QT+-5%u4tA^ zeYuP~=W@(e=$2bIL5vO7&FCQBUcsThl0$u!k*hh>or-czcAJ*5qk@GJV%xwM2F$hr zu78&bP|LJM7}3VaAoYd6o=m&e1{a06Yq(a247p`<7`ho{&`xh0tGTaDbd{BVCe z=CwUHwA|szozZ5#ize?jvcQ?VCwm*pbz-@DJ-IL4EcY9Ez?TPod5D$lP&~{^j~IE> z4aH;s+fXd@dR}sydHJt4OV>9$Xl$4E%Y`!gB{TN_xI;DsI^8}zylo*E|YgW zd3U*;e2?wFZ{!2Ek4gQ|&H(GnN1l8fb?_6G`_#y1uH5IgT%6^;@Z?K$?kgi-`|^!1 zi=xhb%SzuF`JRX#Zy;zc{CUUE=(2p8Os;{|Cxx z3}=JU5@1kTOCj0{oK9_rl&9^1>1OQ!GTIU7YbT4HsmLU?3pk9i8b^YryMm3bM3o&y zU6~5pue2M5O;VnAhfq<}9w1+P0_hwBwf15W80if*+6U;2^reDER-rHqM*2a3k^UfG zR|Q5!R$~zuSsiS24WKi!CKWWY7KIiV82|xB27-JY1dNOfW)T=!8*FqPpfj>A6*RIQ zg(G2PeF!j;0QtHBFr8`E4at?lVmcvIyset=f0cl84UZ+J>ohnWdgPItn$> z_?DogZUw~5^Z3-QDPz310UO;G$nox8)a|GUo(L01>Gnw3_^M^L;AmTLIcKVlp@zq* zj%A(TWYuwyecb`*>5f2z;+g;U5gUsQ>iNkrKjFxoetzL~@`4e)Hu@Rd?ZJy@k&}CF z{m9Qfb-Z; zHsg(1Q^@f&4MgCY0y0_+^fhC#h6;?*RB*WPvXo)H7Hl*JbPI19#2nYx|8t4dp?y#5 zffYF}kBVk(0QuSo#LG&jlXEt<^uh?Z*uo207%BuMayGC8_jlf7Rgu}0&bG_BXpl#` zpdK)hvq6AEnsyn?L%*pS^d084~$pbmFgTKrt>P{`h9#uUkMdor4xVEdUX(4+i-< z7sw@X2sqq0^C+VShk}hB24oMgaSo^AU)?xGSoG8fE_dTJvq5-r1lVW`&@KHVsaR^` z9EBWDj|N5?=NJ}3?PI}4j{~~e$5XM?#yJ5wp8f}jjdLQ%=t)3dPqugp6&R&c!Qnig zMj6&m2OB*D=;rxMD*mtYd=@tzuNHy1c1J?8}}ercWr+m`G&JjTSP7$ zI%M>V;N2G7dmMTGq&?pxfA!9%8@~kJEU_h z1Pnfu)Ys0;RO`b?FLfmfzJK7Of&22IkDvsG?NJavR_J38lDZHWU8f(X4n2MXZ1hR0 zFm~;W`VjqlUHEMNPsjJcm2Ruy$G$~wxqtA`zTlU4zhw@2{SV6*_U)-(puDGF z0;49rLV`_v4L14>Flu5ERh;8*nQOxhK2&b1-ys>MhU@pOQX}+-R;iKtW2@9=x)>>} zke@)Dsh`2oLi+^?y74R6=x|cS7#ZoxA~3QN*yzeYXQUeyG}4^{pIkj4z(`M!uf2eg zk=`rkk1&Rt5RG8j$a{)ye7Xgq+)>Yang+=$a7las)<0 zH-I{{H4toc5LFmDTx$nY!M4_B9``6le;vqZZe5VC>jCLAKW^*#NYG9KY;*%4?Le_1 z6|}Pv1Wz}nCaFV6;YJ(EBB;9w*yu2-0v|S|f^~;8&qh^z0M`*nxx{Q8i3B{`3`A$_ z&fOdd8r%YGbQF*V+ZA<7Drj&k=2;u~2(DWrWsgZOtC@^$!z91;;Ip`Hiv;X%2jaTh z9vnGdh6LLg4K_N4Dzt;3G?t3i`#0RDScdl3mUSFzc)9~H>cx&U0_)?!Mt1@_>pN4? z+B(up{lawEicUa|r{zF?6W3iJB(hCed;!;qkbSMNs-*5}HEtLuAwdUr0~<9|p?m0{ zual|z`vI^=neL7np6&sRI=Cl|!2Vufqx=RC+25P0)?Ey{f_(c6|1lxFXX`%Dc)Bl; z)3hHrTH{G1*v|f7Jie%6KZ<$)68uS885V(&8nDr+KxZUN z1&!2FV0g$ufRSk+U+aL}l=Wo3R@|m+K-zA~MhHor4oqY>&JKX{LY12XrHp3tV51&O z!S_;>W&BxvVYFdh;>+QD8uo{E38)J{MRJT!o4>O_#Q zCjq(cP6mflcM4^+cq-WFX~1aePNxcOMpG9q?_dgI%bf3_9`)>RMQJ?)B|JS7$f-Mv z?YOBsn=;%v2W<3Qpc}~fRB#~AW1btx^C6>m7l3@d5Xh;!h>Y7~t4H|7$SUY1z(j7D zOJndPiBEQ4?Jh+r+!QafRW7$xHbX}$vh%{1_!Y=Z>XpE7+ODDuudfChy$0x}?OH1S z(X?HM8lGMcM8|Fb`FbOe<8%`^oVJ@Oqs3dmMsEeW7H^}1<8(Xo#)nf?UJ_oEy?kA5 zI=(x;Tjae|{q(rrfr3ZkrGM;`kDqaP&k8&e?nI8!yMXq9!Hr1oMq+%Kvxb1Y0BQPk z4>-pTW8m*)UQ;@^DTXWOhVee+74&{!@VTSaFg}3Hq&^5lr|hT6hbW^{4}*<90%WJ~ zuzM7uy9r`3{>x9W$6(ykg}{oOduBb3L_wbbM$fD#k;pCmw0a6zMxO@Sr`0nMe0>(k zfq4!bPS*33(XSW4coqO%zh0tZc{>dG_Sg0R`MV!#^krDU^8rX_UIphko|vWAn1?S? z6T&Ha9T^3E1Gvl-y@|}Ez6C^A><#K|%IL~FV59E>*%jQN-lKx+?tKWJen3r9KO}|2 z@DYok?#E!GpHLM9$xo@kF!X2}F!VEMJpCLPoyT9WFsytDHu@EiR&Z_lnhIFyzoJ3Y zZ&1S1ML2rm0E~S8nMPp! 
z7qHP^fzJAGRJ67pT4~z~%Gsa(4$0F$fcBEa|Aw>VidceFbSErD0z0)G$XEU|C|5!U zaM)8X1Vo1 zW>R|svjbqJ7X^32=uH{E_5tIy0d&5uLdDLkiM?U`UBwTYRo|NvPZu2 zb#>$wbPeD#zOIQ(yi$O<3!D{I8NYY;-hIrnz%GlRAOb23uN9W%A&~5@B;t+hDhxfRZSLM#=E8mKIWt#S zWJ06eZ8L;kWMxHUmOU*?dst+qsZ^Gws2OIZ`KUBXUF@OXcjlaVahF~A$**RAzx`(B zeDj^}`+tw~opY}$vq@A|;+cF}-%8-2TvlRf&C*G%GhfVg_cV4EsNR?>X041ysn#QP zP`$H|H!WK1v?LPm?AG#W&1|$xon~~ikqMGkfdng))7PL_iBu#ay-GCIe>tg?6VZg! z>&TYUR!_Gskt(HZ?b1Xf5e0^7IT2H$E0W3NC9lH8{v{%|td!+V_!R!EJ}Wh#v_~R$ zKmBOMgY}Wf_-zFRX}TtM1JYZrzwQXqvc`F>4*|XGz#Y4h?rYz080qs*JUjOg&I>QBy;p?EbCT}fn=8)C9kd1R7GXSNG zK&e4alpl){$0fq9s6!%UoR)JM<224`1*esqR&iR*sg>*ntt-00y@fOx?YDw$@)4bM;0(b9OY0hY=fyU1`(VBl&hIU8VQ=_Wxku{0y+xDcMp z1w#N_Dgs~z3xMsxG+7A&ppu#LYB7BxDvYwsY$heGhcGoQE!Qm(dCh{R#Z3$6H_!K7 zQZfl~J4-3=Dip}<)h%3(ZlyrSIRR_up0u_7DJ7QIayprpET)V)13l4blk;R_ih;n+ zSIL6FR81#6l}Z+ZBoE2v<46*i373gyX8WRmLW_?hhKFV(oNkV%`%?y5t;D;s1R3cvW~bR?BMXiN{&a1L#sF)+`;f^ z3FzE<(mD%#%5oXf#=yDICaVNEZ&b-eUj6YE&b7Ea@zS%}&6|r+jYzLCV?ecJm8_8i za9o0;Wh`CC3ZDz-S*ZYP0$Ix$>%_2MMH#hHx=P*@oMLSW95{?_We`&f8k#y;LWE!q zi}JMBlPbAXR>})`owO_E)|LXfOt<8n|e*r6OgwuL!-J&5s;R#1$G z^oAe|Z^qG)A^jF)9SPEJ4NmbkZim12hT##g2^vEB?IlDA*087`((jOscEiEfjbFBO z_E3fol?B5#6BvR<0E5h_$6%99HU}}thA`lMD=O7ycL(n!ayH4!#uBlDn%x>T^+wS~ zuqV+>6aeUMn-FXc&0-3=T0)gt=L;duiC_Zm<>F6NRr8M=Hp3=J&~oye)o>FgG3 z^^QSjOd#Lks3Nf*j92+>i7|noy%IHn8iP%8Gv7*VA98VHJuYGqssT2avVB0Q!hRU@ z+`?_kyP?a3$lVce+*SWKj)Rs;>dIuZ9am`CqE4=MAvESGl_?(2?@?kM7@0fWZuz~? zf2jrrZ?8@IWP>X8JZ5n}BLcUd2k!e+G7!kz_jCIJb9ei8%?P(|5b5_%xwrMjx=3Wo z&FUVcxz;0xk>375-@KQA)}$wGM>;s+k&Q0{U42vTR-}8EmBj~vK6!BOI;73&oc&0r z)jhN8DA4Zx_G3s7EUcgN3eeGa-PnUv-LU+gSL%k|=Cu{gOkOulw%z*xfU?Ub*UCzD zzCd$YHnUA9y9bmCzW3V$1r>IAY@heRP%%--hX5xpOFoP^ow6)3!m02PUl3y=m5B3? z`d9*SzSoKKj|KdE9ryD!1IquWe-r-~zGzcJMQF}QR5 z97wo9C7*}(MU*#RAIpo|jc$Sc0zw~SEiCz>tVCOtXqytfP>HV6S;-Fg=_X``Zo;3B zUm9UP!b<=8=A)Hl4*RlAzH*Giz8Z8GG|peccFxO~ucIZEY+YH3A?#fvUst?!1TsVd z@!mG@W)vUoHK5GLu#*X`0^jfrV|K6=;G2*c0Ud8SwK{$a#lwPzOA(rnomEKBZ~Iu# z3_RIky51Vr^)}>$$bfa)ZRvm+u94&E@K0`W=u}Rp`_}7YaQ+TX%W=lfLi&H#H;j!6 z)BSs4-M^1q6-@W}D8##l*Znv#aJz4E6dBU(2fkqkdguav2)aSA9>8Gra`Z=>6}Fk= z4oFc=WTwaRV;{?%kisQce&S=f3sSrU%iTVfpF#<6Sac|NJC$lY!*WW|JxXmyTJO?e zPu#INlTVX-m3Rt|ugHDMICsR?E*x~*g<{Gy+HCSOB}Vm)P2}fbr?<)b1y1j6^6p0z zyqx+aj`B+NE7*FbG)f*ovW@)OIZyhHQcj9n$%Am36!`KX{1PG^@jJvftq(h!)<=LB z0Dm+%)o<~GD%LQ!c+ue8i5;7uVy7mk*tiH9(&e!dqJ=0i)kvQ~b=Zdg4h|T*GSsxl zVeGr$=sZmx$Gx}-@7V(Mdz(BV8^Oa$HiD<J z@OB$dBlF3eQi30z3Hsq#6a~Wiu&$iIc~QXo&$&YUd-vfm4!P|wt`OKsgC@`IJG!R< zTE1mX*FKJP;LhodEb8rr)A?WL9YEhGx9f*X}qBdZ@VD&ZT?@EUyd%Q9Ag z;iKSf!0=k}6_583uA_rH95aVotDJLGI@+1mGklDg)-fDUDg(+g!`qnQb@(kBUJo}a z9fzy|!yE98XZU!03>faK_VhH)>=t}FfeUnNKAjj8csdh^m7ksgx2*WnNpKgc_;fP= z4i5St)##bdcW_uqr{H&S%b>bW&%%2=yQEX$j%690hO7aK$z#?1Ee*KY#Y#LqhbeW~U}YU*=Un#eTO8>OF8&0IC7!hgjGgJ1Q>tDHg~jqF ze(b|#l*(K$FS=$4d5$Ay!y^R5v8@N46J(sr7i4%==Va7I1>|c fcuhFpF~sxQGGU10wdMG@3E|k};ed6)EH=IdbIz@n literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/macau.html b/pandas/io/tests/data/macau.html new file mode 100644 index 00000000..be62b322 --- /dev/null +++ b/pandas/io/tests/data/macau.html @@ -0,0 +1,3691 @@ + + + + + + + + + + + + + + + +Traffic Statistics - Passengers + + + + +
Traffic Statistics - Passengers
Passengers Figure (2008-2013)

             2013        2012        2011        2010        2009        2008
January      374,917     362,379     301,503     358,902     342,323     420,574
February     393,152     312,405     301,259     351,654     297,755     442,809
March        408,755     334,000     318,908     360,365     387,879     468,540
April        408,860     358,198     339,060     352,976     400,553     492,930
May          374,397     329,218     321,060     330,407     335,967     465,045
June         401,995     356,679     343,006     326,724     296,748     426,764
July         -           423,081     378,993     356,580     351,110     439,425
August       -           453,391     395,883     364,011     404,076     425,814
September    -           384,887     325,124     308,940     317,226     379,898
October      -           383,889     333,102     317,040     355,935     415,339
November     -           379,065     327,803     303,186     372,104     366,411
December     -           413,873     359,313     348,051     388,573     354,253
Total        2,362,076   4,491,065   4,045,014   4,078,836   4,250,249   5,097,802
Passengers Figure (2002-2007)

             2007        2006        2005        2004        2003        2002
January      381,887     323,282     289,701     288,507     290,140     268,783
February     426,014     360,820     348,723     207,710     323,264     323,654
March        443,805     389,125     321,953     273,910     295,052     360,668
April        500,917     431,550     367,976     324,931     144,082     380,648
May          468,637     399,743     359,298     250,601     47,333      359,547
June         463,676     393,713     360,147     296,000     94,294      326,508
July         490,404     465,497     413,131     365,454     272,784     388,061
August       490,830     478,474     409,281     372,802     333,840     384,719
September    446,594     412,444     354,751     321,456     295,447     334,029
October      465,757     461,215     390,435     358,362     291,193     372,706
November     455,132     425,116     323,347     327,593     268,282     350,324
December     465,225     435,114     308,999     326,933     249,855     322,056
Total        5,498,878   4,976,093   4,247,742   3,714,259   2,905,566   4,171,703
Passengers Figure (1996-2001)

             2001        2000        1999        1998        1997        1996
January      265,603     184,381     161,264     161,432     117,984     -
February     249,259     264,066     209,569     168,777     150,772     -
March        312,319     226,483     186,965     172,060     149,795     -
April        351,793     296,541     237,449     180,241     179,049     -
May          338,692     288,949     230,691     172,391     189,925     -
June         332,630     271,181     231,328     157,519     175,402     -
July         344,658     304,276     243,534     205,595     173,103     -
August       360,899     300,418     257,616     241,140     178,118     -
September    291,817     280,803     210,885     183,954     163,385     -
October      327,232     298,873     231,251     205,726     176,879     -
November     315,538     265,528     228,637     181,677     146,804     -
December     314,866     257,929     210,922     183,975     151,362     -
Total        3,805,306   3,239,428   2,640,111   2,214,487   1,952,578   0
Passengers Figure (1995-1995)

             1995
January      -
February     -
March        -
April        -
May          -
June         -
July         -
August       -
September    -
October      -
November     6,601
December     37,041
Total        43,642
[passenger statistic picture]

Movement Statistics (2008-2013)

             2013        2012        2011        2010        2009        2008
January      3,925       3,463       3,289       3,184       3,488       4,568
February     3,632       2,983       2,902       3,053       3,347       4,527
March        3,909       3,166       3,217       3,175       3,636       4,594
April        3,903       3,258       3,146       3,023       3,709       4,574
May          4,075       3,234       3,266       3,033       3,603       4,511
June         4,038       3,272       3,316       2,909       3,057       4,081
July         -           3,661       3,359       3,062       3,354       4,215
August       -           3,942       3,417       3,077       3,395       4,139
September    -           3,703       3,169       3,095       3,100       3,752
October      -           3,727       3,469       3,179       3,375       3,874
November     -           3,722       3,145       3,159       3,213       3,567
December     -           3,866       3,251       3,199       3,324       3,362
Total        23,482      41,997      38,946      37,148      40,601      49,764
Movement Statistics (2002-2007)

             2007        2006        2005        2004        2003        2002
January      4,384       3,933       3,528       3,051       3,257       2,711
February     4,131       3,667       3,331       2,372       3,003       2,747
March        4,349       4,345       3,549       3,049       3,109       2,985
April        4,460       4,490       3,832       3,359       2,033       2,928
May          4,629       4,245       3,663       3,251       1,229       3,109
June         4,365       4,124       3,752       3,414       1,217       3,049
July         4,612       4,386       3,876       3,664       2,423       3,078
August       4,446       4,373       3,987       3,631       3,040       3,166
September    4,414       4,311       3,782       3,514       2,809       3,239
October      4,445       4,455       3,898       3,744       3,052       3,562
November     4,563       4,285       3,951       3,694       3,125       3,546
December     4,588       4,435       3,855       3,763       2,996       3,444
Total        53,386      51,049      45,004      40,506      31,293      37,564
Movement Statistics (1996-2001)

             2001        2000        1999        1998        1997        1996
January      2,694       2,201       1,835       2,177       1,353       744
February     2,364       2,357       1,826       1,740       1,339       692
March        2,543       2,206       1,895       1,911       1,533       872
April        2,531       2,311       2,076       1,886       1,587       1,026
May          2,579       2,383       1,914       2,102       1,720       1,115
June         2,681       2,370       1,890       2,038       1,716       1,037
July         2,903       2,609       1,916       2,078       1,693       1,209
August       3,037       2,487       1,968       2,061       1,676       1,241
September    2,767       2,329       1,955       1,970       1,681       1,263
October      2,922       2,417       2,267       1,969       1,809       1,368
November     2,670       2,273       2,132       2,102       1,786       1,433
December     2,815       2,749       2,187       1,981       1,944       1,386
Total        32,506      28,692      23,861      24,015      19,837      13,386
Movement Statistics (1995-1995)

 | 1995
January |
February |
March |
April |
May |
June |
July |
August |
September |
October |
November | 126
December | 536
Total | 662


[image: passenger statistic picture]
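Pages like this one are fixtures for pandas' HTML table reader. A minimal sketch of pulling the movement-statistics tables above out of such a saved page; the file name, local path, and which table to pick are illustrative assumptions rather than anything this patch specifies:

    import pandas as pd

    # read_html returns one DataFrame per <table> element found in the page.
    # "movement_stats.html" is a placeholder name for a saved page like the one above.
    tables = pd.read_html("movement_stats.html")
    print(len(tables))       # one frame per statistics table
    print(tables[0].head())  # e.g. the 2013-2008 monthly figures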
\ No newline at end of file
diff --git a/pandas/io/tests/data/nyse_wsj.html b/pandas/io/tests/data/nyse_wsj.html
new file mode 100644
index 00000000..aa3d470a
--- /dev/null
+++ b/pandas/io/tests/data/nyse_wsj.html
@@ -0,0 +1,1207 @@
[nyse_wsj.html page header and navigation markup]
 | Issue | Volume | Price | Chg | % Chg
1 | J.C. Penney (JCP) | 250,697,455 | $9.05 | -1.37 | -13.15
2 | Bank of America (BAC) | 77,162,103 | 13.90 | -0.18 | -1.28
3 | Rite Aid (RAD) | 52,140,382 | 4.70 | -0.08 | -1.67
4 | Ford Motor (F) | 33,745,287 | 17.05 | -0.22 | -1.27
5 | Pfizer (PFE) | 27,801,853 | 28.88 | 0.36 | 1.26
6 | Hertz Global Hldgs (HTZ) | 25,821,264 | 22.32 | 0.69 | 3.19
7 | General Electric (GE) | 25,142,064 | 24.05 | -0.20 | -0.82
8 | Elan ADS (ELN) | 24,725,209 | 15.59 | 0.08 | 0.52
9 | JPMorgan Chase (JPM) | 22,402,756 | 52.24 | 0.35 | 0.67
10 | Regions Financial (RF) | 20,790,532 | 9.30 | 0.12 | 1.31
11 | Violin Memory (VMEM) | 20,669,846 | 7.02 | -1.98 | -22.00
12 | Citigroup (C) | 19,979,932 | 48.89 | -0.04 | -0.08
13 | Nokia ADS (NOK) | 19,585,075 | 6.66 | 0.02 | 0.30
14 | Wells Fargo (WFC) | 19,478,590 | 41.59 | -0.02 | -0.05
15 | Vale ADS (VALE) | 18,781,987 | 15.60 | -0.52 | -3.23
16 | Delta Air Lines (DAL) | 16,013,956 | 23.57 | -0.44 | -1.83
17 | EMC (EMC) | 15,771,252 | 26.07 | -0.11 | -0.42
18 | Nike Cl B (NKE) | 15,514,717 | 73.64 | 3.30 | 4.69
19 | Alcoa (AA) | 14,061,073 | 8.20 | -0.07 | -0.85
20 | General Motors (GM) | 13,984,004 | 36.37 | -0.58 | -1.57
21 | Oracle (ORCL) | 13,856,671 | 33.78 | -0.03 | -0.09
22 | AT&T (T) | 13,736,948 | 33.98 | -0.25 | -0.73
23 | Trina Solar ADS (TSL) | 13,284,202 | 14.83 | 1.99 | 15.50
24 | Yingli Green Energy Holding ADS (YGE) | 12,978,378 | 6.73 | 0.63 | 10.33
25 | Petroleo Brasileiro ADS (PBR) | 12,833,660 | 15.40 | -0.21 | -1.35
26 | United Continental Holdings (UAL) | 12,603,225 | 30.91 | -3.16 | -9.28
27 | Coca-Cola (KO) | 12,343,452 | 38.40 | -0.34 | -0.88
28 | Arch Coal (ACI) | 12,261,138 | 4.25 | -0.28 | -6.18
29 | Morgan Stanley (MS) | 11,956,345 | 27.08 | -0.07 | -0.26
30 | Pandora Media (P) | 11,829,963 | 25.52 | 0.13 | 0.51
31 | Barrick Gold (ABX) | 11,775,585 | 18.53 | 0.00 | 0.00
32 | Abbott Laboratories (ABT) | 11,755,718 | 33.14 | -0.52 | -1.54
33 | Banco Santander Brasil ADS (BSBR) | 11,587,310 | 7.01 | 0.46 | 7.02
34 | Advanced Micro Devices (AMD) | 11,337,609 | 3.86 | -0.03 | -0.77
35 | Annaly Capital Management (NLY) | 11,004,440 | 11.63 | -0.07 | -0.60
36 | Alpha Natural Resources (ANR) | 10,941,074 | 6.08 | -0.19 | -3.03
37 | Exxon Mobil (XOM) | 10,668,115 | 86.90 | -0.17 | -0.20
38 | Itau Unibanco Holding ADS (ITUB) | 10,638,803 | 14.30 | 0.23 | 1.63
39 | Merck&Co (MRK) | 10,388,152 | 47.79 | 0.11 | 0.23
40 | Alcatel-Lucent ADS (ALU) | 10,181,833 | 3.65 | 0.01 | 0.27
41 | Verizon Communications (VZ) | 10,139,321 | 47.00 | -0.67 | -1.41
42 | Magnum Hunter Resources (MHR) | 10,004,303 | 6.33 | 0.46 | 7.84
43 | Hewlett-Packard (HPQ) | 9,948,935 | 21.17 | -0.13 | -0.61
44 | PulteGroup (PHM) | 9,899,141 | 16.57 | -0.41 | -2.41
45 | ReneSola ADS (SOL) | 9,667,438 | 4.84 | 0.39 | 8.76
46 | Corning (GLW) | 9,547,265 | 14.73 | -0.21 | -1.41
47 | Cole Real Estate Investments (COLE) | 9,544,021 | 12.21 | 0.01 | 0.08
48 | Dow Chemical (DOW) | 9,150,479 | 39.02 | -0.97 | -2.43
49 | International Game Technology (IGT) | 9,129,123 | 19.23 | -1.44 | -6.97
50 | Accenture Cl A (ACN) | 8,773,260 | 74.09 | -1.78 | -2.35
51 | KeyCorp (KEY) | 8,599,333 | 11.36 | 0.02 | 0.18
52 | Bristol-Myers Squibb (BMY) | 8,440,709 | 46.20 | -0.73 | -1.56
53 | Companhia Siderurgica Nacional ADS (SID) | 8,437,636 | 4.36 | -0.05 | -1.13
54 | H&R Block (HRB) | 8,240,984 | 26.36 | 0.31 | 1.19
55 | MGIC Investment (MTG) | 8,135,037 | 7.26 | -0.10 | -1.36
56 | RingCentral Cl A (RNG) | 8,117,469 | 18.20 | 5.20 | 40.00
57 | United States Steel (X) | 8,107,899 | 20.44 | -0.66 | -3.13
58 | Cliffs Natural Resources (CLF) | 8,041,572 | 21.00 | -0.83 | -3.80
59 | Newmont Mining (NEM) | 8,014,250 | 27.98 | -0.19 | -0.67
60 | Altria Group (MO) | 7,786,048 | 34.71 | -0.29 | -0.83
61 | SandRidge Energy (SD) | 7,782,745 | 5.93 | -0.06 | -1.00
62 | Molycorp (MCP) | 7,735,831 | 6.73 | -0.45 | -6.27
63 | Halliburton (HAL) | 7,728,735 | 48.39 | -0.32 | -0.66
64 | Taiwan Semiconductor Manufacturing ADS (TSM) | 7,661,397 | 17.07 | -0.25 | -1.44
65 | Freeport-McMoRan Copper&Gold (FCX) | 7,622,803 | 33.42 | -0.45 | -1.33
66 | Kodiak Oil&Gas (KOG) | 7,543,806 | 11.94 | 0.16 | 1.36
67 | Xerox (XRX) | 7,440,689 | 10.37 | -0.01 | -0.10
68 | Sprint (S) | 7,291,351 | 6.16 | -0.14 | -2.22
69 | Two Harbors Investment (TWO) | 7,153,803 | 9.79 | 0.05 | 0.51
70 | Walter Energy (WLT) | 7,152,192 | 14.19 | -0.36 | -2.47
71 | International Paper (IP) | 7,123,722 | 45.44 | -1.85 | -3.91
72 | PPL (PPL) | 7,026,292 | 30.34 | -0.13 | -0.43
73 | Goldcorp (GG) | 6,857,447 | 25.76 | 0.08 | 0.31
74 | Time Warner (TWX) | 6,807,237 | 66.20 | 1.33 | 2.05
75 | Synovus Financial (SNV) | 6,764,805 | 3.29 | 0.02 | 0.61
76 | AK Steel Holding (AKS) | 6,662,599 | 3.83 | -0.11 | -2.79
77 | Boston Scientific (BSX) | 6,629,084 | 11.52 | -0.15 | -1.29
78 | Eldorado Gold (EGO) | 6,596,902 | 6.65 | -0.03 | -0.45
79 | Newpark Resources (NR) | 6,552,453 | 12.56 | 0.09 | 0.72
80 | AbbVie (ABBV) | 6,525,524 | 44.33 | -0.67 | -1.49
81 | MBIA (MBI) | 6,416,587 | 10.38 | -0.43 | -3.98
82 | SAIC (SAI) | 6,404,587 | 16.03 | 0.13 | 0.82
83 | Procter&Gamble (PG) | 6,389,143 | 77.21 | -0.84 | -1.08
84 | IAMGOLD (IAG) | 6,293,001 | 4.77 | -0.06 | -1.24
85 | Safeway (SWY) | 6,268,184 | 32.25 | -0.29 | -0.89
86 | Kinross Gold (KGC) | 6,112,658 | 4.99 | -0.03 | -0.60
87 | MGM Resorts International (MGM) | 5,986,143 | 20.22 | -0.05 | -0.25
88 | Cemex ADS (CX) | 5,907,040 | 11.27 | -0.06 | -0.53
89 | American International Group (AIG) | 5,900,133 | 49.15 | -0.30 | -0.61
90 | Chesapeake Energy (CHK) | 5,848,016 | 26.21 | -0.20 | -0.76
91 | RadioShack (RSH) | 5,837,833 | 3.44 | -0.43 | -11.11
92 | U.S. Bancorp (USB) | 5,814,373 | 36.50 | -0.04 | -0.11
93 | Eli Lilly (LLY) | 5,776,991 | 50.50 | -0.54 | -1.06
94 | MetLife (MET) | 5,774,996 | 47.21 | -0.37 | -0.78
95 | Yamana Gold (AUY) | 5,742,426 | 10.37 | 0.03 | 0.29
96 | CBS Cl B (CBS) | 5,718,858 | 55.50 | -0.06 | -0.11
97 | CSX (CSX) | 5,710,066 | 25.85 | -0.13 | -0.50
98 | Carnival (CCL) | 5,661,325 | 32.88 | -0.05 | -0.15
99 | Mosaic (MOS) | 5,595,592 | 43.43 | -0.76 | -1.72
100 | Walgreen (WAG) | 5,568,310 | 54.51 | -0.22 | -0.40
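nyse_wsj.html is aimed at the same HTML reader. A sketch, with the path and match string assumed, that narrows read_html to the most-actives table shown above; read_html's thousands="," handling (its default) is what lets the comma-grouped Volume figures come through as numbers:

    import pandas as pd

    # match= keeps only tables whose text matches the given string or regex;
    # here that is assumed to single out the listing above.
    tables = pd.read_html("nyse_wsj.html", match="Volume")
    actives = tables[0]
    print(actives.head())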
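The next fixture, salary.table (added just below), is plain whitespace-separated text rather than HTML; a minimal sketch of loading it, again with the local path assumed:

    import pandas as pd

    # The header row names four columns: S, X, E and M.
    salary = pd.read_table("salary.table", delim_whitespace=True)
    print(salary.describe())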
diff --git a/pandas/io/tests/data/salary.table b/pandas/io/tests/data/salary.table
new file mode 100644
index 00000000..ea780333
--- /dev/null
+++ b/pandas/io/tests/data/salary.table
@@ -0,0 +1,47 @@
+S X E M
+13876 1 1 1
+11608 1 3 0
+18701 1 3 1
+11283 1 2 0
+11767 1 3 0
+20872 2 2 1
+11772 2 2 0
+10535 2 1 0
+12195 2 3 0
+12313 3 2 0
+14975 3 1 1
+21371 3 2 1
+19800 3 3 1
+11417 4 1 0
+20263 4 3 1
+13231 4 3 0
+12884 4 2 0
+13245 5 2 0
+13677 5 3 0
+15965 5 1 1
+12336 6 1 0
+21352 6 3 1
+13839 6 2 0
+22884 6 2 1
+16978 7 1 1
+14803 8 2 0
+17404 8 1 1
+22184 8 3 1
+13548 8 1 0
+14467 10 1 0
+15942 10 2 0
+23174 10 3 1
+23780 10 2 1
+25410 11 2 1
+14861 11 1 0
+16882 12 2 0
+24170 12 3 1
+15990 13 1 0
+26330 13 2 1
+17949 14 2 0
+25685 15 3 1
+27837 16 2 1
+18838 16 2 0
+17483 16 1 0
+19207 17 2 0
+19346 20 1 0
diff --git a/pandas/io/tests/data/spam.html b/pandas/io/tests/data/spam.html
new file mode 100644
index 00000000..935b39f6
--- /dev/null
+++ b/pandas/io/tests/data/spam.html
@@ -0,0 +1,797 @@
[spam.html page head; title: "Show Foods"]
[spam.html page banner and navigation, condensed:]
National Nutrient Database for Standard Reference, Release 25
Basic Report
Nutrient data for 07908, Luncheon meat, pork with ham, minced, canned, includes SPAM (Hormel)
Nutrient values and weights are for edible portion
Nutrient | Unit | Value per 100.0 g | Value per 1 NLEA serving (56 g)
Proximates
Water | g | 51.70 | 28.95
Energy | kcal | 315 | 176
Protein | g | 13.40 | 7.50
Total lipid (fat) | g | 26.60 | 14.90
Carbohydrate, by difference | g | 4.60 | 2.58
Fiber, total dietary | g | 0.0 | 0.0
Sugars, total | g | 0.00 | 0.00
Minerals
Calcium, Ca | mg | 0 | 0
Iron, Fe | mg | 0.64 | 0.36
Magnesium, Mg | mg | 14 | 8
Phosphorus, P | mg | 151 | 85
Potassium, K | mg | 409 | 229
Sodium, Na | mg | 1411 | 790
Zinc, Zn | mg | 1.59 | 0.89
Vitamins
Vitamin C, total ascorbic acid | mg | 0.0 | 0.0
Thiamin | mg | 0.317 | 0.178
Riboflavin | mg | 0.176 | 0.099
Niacin | mg | 3.530 | 1.977
Vitamin B-6 | mg | 0.218 | 0.122
Folate, DFE | µg | 3 | 2
Vitamin B-12 | µg | 0.45 | 0.25
Vitamin A, RAE | µg | 0 | 0
Vitamin A, IU | IU | 0 | 0
Vitamin E (alpha-tocopherol) | mg | 0.42 | 0.24
Vitamin D (D2 + D3) | µg | 0.6 | 0.3
Vitamin D | IU | 26 | 15
Vitamin K (phylloquinone) | µg | 0.0 | 0.0
Lipids
Fatty acids, total saturated | g | 9.987 | 5.593
Fatty acids, total monounsaturated | g | 13.505 | 7.563
Fatty acids, total polyunsaturated | g | 2.019 | 1.131
Cholesterol | mg | 71 | 40
Other
Caffeine | mg | 0 | 0
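The hunks that follow add binary fixtures, mostly Stata .dta files in several on-disk format versions, plus stata6.csv, which looks like a text twin of the stata6_*.dta files. A sketch of reading one of each back with pandas; the file names are taken from the diff headers below, but pairing them this way is an assumption for illustration:

    import pandas as pd

    # Binary Stata files are read with read_stata; the CSV reads with read_csv.
    from_dta = pd.read_stata("stata6_113.dta")
    from_csv = pd.read_csv("stata6.csv")
    print(from_dta.dtypes)
    print(from_csv.head())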
+ + + \ No newline at end of file diff --git a/pandas/io/tests/data/stata1_114.dta b/pandas/io/tests/data/stata1_114.dta new file mode 100644 index 0000000000000000000000000000000000000000..7df75d0d0cdede36e72cdb3145fa7e9322b5cc27 GIT binary patch literal 1130 zcmXS7Vq{=tU}S&*;f7nB_v;xL1fcXq-jfa>ndct7U>d{*0flEL9KZ}H!49QO7KDPu zGb=e97!)iO{F6%*j0_Bo6bucm42&55{r~;vZ(2@%Vo7{%W^pmH2}tae{L-YHR6Ht@ zDodc!IE={5!)pNpLr#8PIvxv<90O#kTIw03qYB~YsTvxPrbE?&H1!OmYcMn-(F*)t z93--J&H|w1s&JOo8RQ~#8tlpk^{G$mpZ@<3RL8)^0#*kW$3ieNj2cLJ7(xRU6y87p G4}Jh1&UZfm literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata1_117.dta b/pandas/io/tests/data/stata1_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..18014cec77a9109d4fec7f0d3b90ec389f7c8ab9 GIT binary patch literal 1569 zcmeHIJx;?g6gHqDF(SdjfC^Gq+>}aC^kl@+rKk%Bh@93fk({QoT_jk_HMjr=VPjz6 z1}q)80JJQ@j-9p&YNd*W0T227e*Av?-g7`Is;J*ql)yEkZ6*i^#u=52c-{%%jDX-) zu~-;|%d0xz83_$iK!6(rnq&3VRuW9;j7?r`F8Z{6|_H!Nn~$AaBGJA8LIcDTD~h-pPZ& z(wTg}W$El(reFRy`1?DYdu3dipmj|JZ7C-7z=kbJ0RP~$^*e56d1xE(ZOBH$QIZS+ VrxK7#aC6Y%Tyq$1j-34!z5zE}`_upc literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata1_encoding.dta b/pandas/io/tests/data/stata1_encoding.dta new file mode 100644 index 0000000000000000000000000000000000000000..b4230eda73e06b49c18e8c685d15ee5f5199e021 GIT binary patch literal 3507 zcmXS7Vq{=sn9cwM^&o-)$YWp-n6cCm#Qspi2BH`g7C`t8(`PYG2daCJ0}%(Q1p(*H zrVb!FLG~n=R(N*8fx*a7!Lc-5!N|bSSi#WT%EXL8D7z>%v)ItW#FC*nJ~_WMuLPtC z6)-cX8d?;iic!i7C@L*3&P>cxa0b~~soZLfax(MM|>mV4!jV02;b2 Ap8x;= literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata2_113.dta b/pandas/io/tests/data/stata2_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..09c90dca943d1cdf84bb15b884958ed941705b58 GIT binary patch literal 1490 zcmXS9Vr1Z8U}k`T316L{EFh&|sNkDeq+n!VX!4K|A}j=z-FrN=9K=j;Hgp0>9q{gU z0MPMPq+h|%*viO+;s5_XfByXb{U;@{B()?nH#I&PtQwPmD@e*r$EpaV z08<;1NO@{%c1~qHZgsi&c_kV6R1}sb7L}wH;Z~B7n3EP?nVN`G2|hU_ry?^|4Ndh7 z(jftg!cJj8h?kTzz}1xGBC!jR*cnLdN;n(Yt`W`dh~o$QzhU94H|?cPS?-!#5lmJo zx7b0VGeolwg6L(F%?^rvU;g*V|99V*KmtJUpdLm)tp~x>)TdyPQ3FTAkc?ylNlY3y M-vSdM162|u07r08-T(jq literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata2_114.dta b/pandas/io/tests/data/stata2_114.dta new file mode 100644 index 0000000000000000000000000000000000000000..c60cf480ad5dd82db28475872f08a5280e7a80ed GIT binary patch literal 1786 zcmXS7Vr1Z8U}k`TuZEwM#2Zf-J2W&hG6IDfv<-ps3=9Vt6HsUdi{;E9@ed#O$N#Tq zVAxmxX)=h0=`ZV<+WFtmSiv{3NWsVe$S^dwGB#oO|NqaQKfizfNl7e8Ey>JHjZcPH zi$TH_BxR;!RRmIip$l1{JT)~tr!pS5y4?J{k_>z*3QH4Yn(E6GUANsF&cO~k1L zpB%DVk=Uvw$qWYRNCJeIswK{(YDi%q#R3M^l5$ctvBq-v-nO#`7|AC(^s zfzc44dkARUeA`~?#L%$t)f*t4<*vyU!DN+kiyh>U8KT(-LG-f8W(UQ-FaLYw|GRHY VAORqF0IZuJ^wW9}Oig`S4*($EbN>JU literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata2_115.dta b/pandas/io/tests/data/stata2_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..ad7dda3fdc4b38fe4a34a2615414ec13bae8389e GIT binary patch literal 1786 zcmXSBVr1Z8U}iuDMnDb|5QAt4a5jd}6N*2xfLI_|WB}5SP|d)g29h+gP;g64QZOE@qDw7+1VLGCaI);Bq@QI(TE8G zf^kMOMm+BZ;6i~Qm#il=;qJXTz?BLeGK~Ot2rQ7}2q|%a0H*=tM9EeFnWRijmG;RQ zuidyc=r*plylV}wJ=4AkxUvE+R*}*^QKOmXxx=%^tZ>I+FlF_}4lkW>LGXp#f*U%=;`O!_jc`!@RqrU4?+h2-;zpstuBeN~SmChkq}m z)Jj5-boV7`Ldo7wSYvF7=l*0~+FO?KL5!6F6_Sa`l$i2GKWB6zjQuN@9CM%+UJ-LA3mu6 zq+-;+Tjd>^XfM?7`7eBD5AVVKIi}&=`~Uh2pFVkk_Tj(#PuSwg3l>fPhtDjSK6%cJ zX`z3IXBhPZQ|HfH__v=>KQMFNlD~al`-U0wmn{0*PpIE8Ep+DJzRyhkr#DsDT+hD; zYws+YGIP#?$&04{?Z+0+n?Cm+vcNE*^Z)T}hM74}n)A0`ufFkTqW(S&{|IK7?t}Vx 
z%=qgw|I2Id|N8g;2VVal!ua>?|GzTu=Co0DcupmGwjf?OZS_b{ZagA3Rrk$Yb`#l*fm3_Z0qTT>Dsk`9XIo9lb_g%>71Km-bYU>Yosfdk9EV=HD;;H;4byMr)RT zJLP{D2Q@AGSY5(&7pnYUlN=$pBjq`647fMl`VvA5rl0&@0s3Da0%+mcZ!My#7J89q0)eAviZ<4pU&#~zVoC^;Ll{%jlDv+<9&e%eL-qR)tcp` zCXmb!UXs%I3%dS-O;t<2c2uG%oz?sF@T5wITV!#|?E=Tu2;AwHM3QRF9A_(ZigMK; zB?tP$v1Yy;*A-R_6GJ_z?0u6k9@~U9ho0o)P>daD&t>9zRMsv({otL1Lx$zMW znFZ;rvD&3Cm7;Hw3%6Gb8wb`2T*XT=I=9X&J2piuEf9VwA3kh=;D1{-As*9yR_EQ{ zgGx(ok}57r2z>llVC-L$+sjpmo#2q!29WFgi$4ayN?AdspC`Yc)jO=>s9bu7eAOvl zXmELox84z#peJz5PP2lIm%o=aP^j;fZ*mFx7*-59(g4ccy-Vzzz7ZN8)(gte4;VO4 zV8tGFanP)Ej~;}t3NQu5p8~|_?=+Zl3m*`2;#(oh@=jnn>uK$%r^r}yL6UkPW9Rr6 zKHeHcl4?vDtgd>Y4^?V^BUxScpbH zpOCH*>B8>s8xdxc74@-sj!F0@L?ukt=pm>P3y!(MF{8fJOu4Sl@$I{X;I~#p-rtCJ zih6EV0vwR!ftr;uhM{nJxCxf{1FUY^C_<%kFUas42ZXHSdcvG%sZEt?rIjEjc_U+a ze8je)fU~CBQEdUy_bd^?u~#JQ^GxAkw2sKl8^g*=SZRY;%*fbLWXuN?_26$W-{yOO zH3UWtr*imPV%IcB=$U6tq@6aDVd}x;wnFWw@tR|6KY2Ylm8dXjXE4OZ?BZc%uZdyo0Wk)y z980+hD;n|Ql#rOl5J_x9!+hQ%mdnjNV0GP$ku#wm01r3Rr6*J9mL&A7I4={>P#&Yx#y4K_id@dR~ggx6j9Ox93-C6qGP9O%FakA! zfa=Hbhgn_KqeWC1)0*C6YJ@b`&P0mt4J$@i>4Q#Or&-DGFGBj-A;~qS!>s<^>qS(a z+?I~)cSDG(b|OlxE38;qWXFCWOqd1{kBWfLfWY?gJbSCm< zcZA=_A}4u)@EbMA&X4`TvFW^z%{W*wMqOG)x%Ta7@vQs8TRS**74t^2M#faROV*Nn z{h}YN?7*-X$EV%-4or%)l3mj`|VHgY4GEIZ{3?GJ6Y(~a0DF7C>Hir3EXW3h8}1VS)`n_;wa9 zTLi~WAeISy{HbA(j!lrh1E%j-ostqsrHkF^eOHEz9odmcM==Sw=79Kv@TWA$K$qdj zUL{1yq3}CapJ|Aq%8eehF5tc3^Fw2E^EAa1@nwF0{CJy&&hcAyTFf zGS&u!8HthfLW7LI{t-A<#%KENG9AGVV}z9^;MI~kp)%J2qT~y;YzN0iBIIj;MDyuV zfN=Wdd2PNNVfBF)Kq`9E-Cj?Gh4%J@d2NQ3_DFIdNK%PdlK8-;BVZ*GR-E}Gtf9q% zXey6#qt*e>g^f>|5-EWI;)Gbn!2dI_lExn|9RVHK!{>I4f@4NUE5vf$mA>Zg2`Rms z68U!lkWQ%nIDG6ISjpo9mX3rz?@~Edkj?6xv({rI^`%u2Iude-CCU{)(6Tce3xY(< z1jq9ERQr!X9tNzgFlDp)-AiJqRNz6|ue2hyc~*c72c!!e3qkma8f4)9k3pWDsEY+w z46*SWsMMn$P32iK-o=V=Cq9B>U6JIekcb3utcV{yJ`mv##p_$JVsv`80aMJ2hFKX% zy|oVH9|4YaLsv?u{!$H6{97Ozd9d1UrW{t+bKXYEtRG0f7-ma`$LWaF=VR2R2l{0; zXgO62zkJFlP5NxAx&orVbbb?+8xNtGHywz$6Z6G+6bRE3(l-yhT8bne$IPNH0_NP{PKA?42=3(~o%DCGb+wi@hZF#nTyIeZMr<7dw8 zxDi&2(s@{^r_^&(FCwF%+5dG@%Bq~27bX_ASlHU5IAUB^3E7T&T zMWZVlH1enX^eHSlnf-6SEj-E^TmllQoJXnN)q&_gsz)r7QT+gm)MGsqMvwWQ%`aX! 
z7IBTo`fw0djJZd4Q00n+CTwj**0n}mib7yzl!be~9<}%dj#cnu>&Bu(oy|TrenrPv zUC>Y4(f5RQ{ia^%JL07v{W1ww#(rLjo2>KYH7y zPIx%>xuA@h1;}`de0?K6b_B85ncKV>hwz_3`d$&n{ZbIiXj+-lmN=4E0(Tdy12YY!+=h>(BmC!lz4b(p|AKn3f)&G7`&}3oW9ayY zZHeucdVy;ahFE5xUv{ASyEH35r%u%3s_&Hyi1Gc9-LNu_hQDznoxEP5lncP*StzB9 zjBNqO-tZOfpFsx>!^-T!JXYt}F%^SvB0U)AOnRE%2+ZpRa4Zy7cB1#UB4bba_J==1 zzu1DwQAK&I-gfdnD$o3kHfr35)Q|r|Q1TZeV+&Er6b!pvh~+Nd*GHAX{g`Hp3-VY) z@T3eXul}6Aj~qzCl=p%%Cmb1Df+X*OW5H5 z_yOUU@<;wqAgHP4u5IPA?wli1lVE{3f6n~F?nMoaM z1Bh)u119oXSc$<7lMRm9qpkA!u%9O($wo7j-*kf2B{n-sIm2u$@Bt)dvVm|PB4K4C z>XM5VYpp??Lqjl<%;^2zMJI3;8*-E~w`NhFSpg(okNILB2`gK`v3v|Z2OQ&$@(y;B zQKL@*Sp$e6bHgzz^_fe*n(t4pUo;ZwM{FBXf<=-FK*gS#6+2}zWY8C}WEGxZjrC1( zapIdxhvyJte;$xKQLvH>9V^%7=V) z-c+=k6%?kbAfGinjm)9Us#$bh$8IERiw%)aM#1WSXmT00qwWa5o-dv?4gJzt)o0&) z)|gp+j4~;6sl&zoWVCe?qO4jENG3|Dz_B@qrIx?>WE!eJSk*E9Bw{&q0#j@rtvD-^ zjrW@p#RGc)cLWMkiGG=+L0o4{NBE7A@yv_d2_i z;!`b&axfN-6r+%6$SMdeNI?j%z~{vJn|OV`2CQsJIT0j^=xp zzd%_#KnHT*m{GqBjukASQ777yw2JnGJG~LHoCcHYK>qjWeOL4R(KF%LsKVM&w&p@s z=Tel1lSMeS@pC3Ok2w+f_SfkBvw+lVS;7Wxep)gUj?LoLjl7W6zopsmg;S zJ7DLpjR&LzwEP21ZUT}xn-fRQfUa z`F$JS7(FO+tvFqK=j28 zWt5u~O)I?Skt1F`2=`8gmCF{{n@2HB zzqH2%-mg&pTl|+ej|)9)&q#-n+nISBmB-J>QT7{v2PoS zSmKH}uCMtNVi|XX%28XW*NQnLq;d#hI;6p|Y74iuCm_X|W9ez3py&{k(jQg~^3q?Z z+-f^@ahOj0f430nZ5piHu*iAtfb7?-bQi_aTSppgd)7bgX-=a#(EZgANGC=eWZg;74Y6q^>=8(-N1;Y5JftGhcOK;TWdu_Z{C=1Xp%g`_P z05L4>b`N(-Ni@nZmn?oRK>7D0{2Gg5>jTHmY2(#>!9vW+Xei8kK#cD852=hTHh0BJ za`$JEDCq|Pd7wdRN9Dr*e7@_ug(&6c*!f%VXK+R)zfo>#G7WbOBTJW@a?hant)PeOSMQn-9XINw1 z;om8@CxsSNMG%)W$k?c{St$9bRK_})m6NBPNl)SC|9_K z#1xwdHyeYHtH(lO!Bw;k6ef+|n!5z#FX3^01Bl-F{7cF`-$fkqDsmhnl7y*vtCeKcjO?Jw-|f`8GDUV?&o_pU#1=3 zuv-FR+)7_ld2TA*S@I<@*2BtRY#Yo6Y~!d=*$dR=AfJ=I41~FYeQdSq9IK1{`7M=> z@1+rBB}sVXN0da!dJTi83d4xHimFz|jtoa#Vi8Lae~#6c-grkPF^$@cTS@Nq@+aIS zY#Xcef)qU*vD9mj3(m`-W8Z4zKZj=))$b_tAdSu%6HY2a;LKBX+prS2=K&-r{x_}q ziFOeP-xf!n) z`qJTGavn-)EifVDNT%O+h~+x(wc;xbi;p!Xm#~JBZivNtAFU3bMqF1S7Te>1G{&Li zGxF!n)eSyq{t8Vl)H~P`oRNDqV6>*uM&5IYXQG900v4128;sVe;8-LayUUjjSP2~q z!0Uyk^Q^9}_B~~;?5EG3%^(v~0clf!j5R^VW~2JcP|8|7vsj6pc{mn$FF^F}<%}p- zX3*flOUc<#3t{34Q5Rc;9|{={!8bqW=Py`=-uDN`a(&OUh6UaAqLO=%rk)Qc_tFVb zB8!mZrm(Ucz3;0*yy{lL(Fq{T?Sk{H(LT~3DpxY8<=QIZc!oeOPJ@yM6vGAa&{9_Sd$B>3V-C|XH`kJkJapx>B0!qK${Nhd!J5Fn&sh!W>jzP?@-1cc zCzFh#GUz)x>p>)OvKdOGDW?&Oy})c*hq{=xjHPy7gHpcMp7NBkhWPI*BlR3Vi7*CmGb9pYtYDM48E=>OIc&fW~?Zm%%SN&#*%f7h7cv|9O}{n z=a`Ln@Grnh1;6{{8pQI_gw8bm!0HZ7vKE!1qtvE)GnqfSKT&=?i|V(;g~ZqB{a_7p zHgGM{(E)^61BiZrkB!K5$fZG_$s{>|!*W%MQaT9C<#_Z8}uD7LYntSH;0zoyR^DzU zDo@W*ckw&2^ROF{T3<&gI|+=(I`qqbGzgv{VgENnU>i&d`o+49DF1$*9-sL=i3sXV zI2JoiO(#4ZjRYxlidk(@CK}O79=qW za*+aOQuzEue?rs^R$&cU{o0F+heCJHJwtLXyWyK}qAt#GEDEJmA3!AYc8}G|qCT4R zDLB7O?I0@aFVXx>CFK5AEQ$elkg+a;5)h4rM0G5IcMgmJWG5cjE#oh;y6m5wL^-LP z&TUgpoci@7a@c)jtc$>{jn=HF_ZJV-VxVJheNVsK1c<)FuuiCcC4HE1k(6A(23v&H zf$IuQj>g>o4;+i*bL}=@yKsUiIRj$oG1XaAtglgV%@yJm+?z0iusYNLQlut19^r4| zr>xolD?gdkN1hj1Uo* z{3TX*EVa8Rt@)XjnC_6JHK@zhr?Ao;mn$)l@i>$+hCl8d3y3#1r2^9>R^RVw50U$D zi*8`A-$Q4$YtN9!5#0D`w@Kzcla5Y z+)I#4W1wS+NOCma;1~xckHy-P11mc=Ad9As=2pHLBpi5!ZDDNjt7 z@$;yMeMM1yGNK3xPsxU>XzWZ+VLed_?Vh^ksMtFFN@*mG> z@a}tL<9`MaW{VMy;Rj?zn=vcaM$sOHjhs* zY{HFV34W{4%=bsukn8CsGAZxq`^$GoivyVZR~m^N=Y<_60kK@vtawk`gp+u0m2a?O z3>e}iDv=FTzwZvY`-dNqdSG@Y4G`oBNwAWnS-JcDCT!zWeK1s5!Rq{8d5O}Gf6%q5 zcSx))7R9|RtPF&}CWA1Wv4iFF^YxqI*gg|7SWv<0D`pK8xi*YAw*D?wxKV^T(ij;7 zq#*@^Z!sJ@$2(2jjEu#qzajv{aP6zXqI63qT87pVv1SyJzik4J4Z?ic4TT9csf%KG z-e%OL2gYk995Yhap(2y07wz7>AYSu6A@WRHBzdsF346dXNsA?b-J)qSe&SGch1IQC z;Uh{{4PpdePa?XFB{GZIDGh;+rD;~wy7-UU0xPXy<@>@btX{106_q%n=pW8dm!NTk 
znPVr)j`(fDrhS+%G91h1Gxu%56q|wDo_4Te=;F(Z++i#6NLM{gZTlIKCt(nl;qhkY z3|P60j2+|`TB|qM)4{hluwrz6XvU_*it~Ro((clDB7fc-5EFhElYyPz7EC_KUk~02 z!dR&*oKGdITXanjnW5HVX(~(YMohrdH`R(6j|lgp_ucT#`}w*1wql2g#1KDTSjpn4g)w2j#Ja3eY} zDHsb$b5!3Ca&Z9CHyOR3!pBe929C8b+f-#2RkFsietsfz$3|?kwh2wUJCP`ZTOi4P z0=M-b$iE7)B=ecaw!uo7FY3f!#oWI%3_4~j#>6(IIh`jH?v(=^^GA{oV(@JOdlMk4 z+qLTB0fgx)tG^yI96L-?@x>=i=@G9iCy zi;9h%=svLtb!H|J=51?mY#7#_14weR2DyJ>J7W2vP(7MnWsS#L1c=g-X5!*WEUiw0 zmFI26XInlJxaKQD3STLQ)^V^BGT}B~8eS~?DJIOn+D3)Ql8dr3U)h(%pmDVjp_da@D z-p$4@RUE}3A3lX6hcJ;#z~p4Uv}Om6qWwY3U4_?Jz5TbNuqU?=w+8-!(&5bF<|Mi# zPedbUX=#R{*pdSk*c zk=fQvJkshYeyO&SFiV_7`(qQ1}9OfoV9 z26r7fC9>-P4^NM-1Gu3By_u$W@Bho6$Qe@>nn8d0Ct}Hzg^OqW&1V+Qm@;?f^zgs@ z0>=D-X$$5r`pZw4KQL?l(!YFP{f3zfmM;FwPnf@9dibore4m;2cW;_u^F03=tiH2& z>a4j7r!1cFmmga)f5yDOi2}ofFZkQH8D`deY3^Tsz4?tl1NGNo_**c;^c>R9W9I+x zng90o{~qA|zh(Fb@ZTc-pa0SG|6-K>b144#-+v|J%#`OP5W5w<%8~2I(m(~nD>>i zF7v4%H9jF6_Yja4LO~bGbKD+2>WZh3bh0g<5$C| zLX^tij7+o*SQdk2Kku0}KyV4^&pX8?0HP;t`cY|6E!li%yO3r1Ops^&PENf9gs~Aq zk~V-PkH0*X6C!*z_{Kh3&1bQi{_i{~6ZA8gePgc>>2P0Q!e5a3;CiE++8p+W@sgCu zpVtf&tZUlv^}#8IELQ8&%abadZ;>T!whJ6rCvc};5=p8za-5Ce6zi%&$`1^LVJ&?* zt~*$C6T>~J?0u6k9@~U9ho0o)P8jMn%jo@S5UNy{hotL0gy75u=ImKD5zSgBb zmEvxa^S4(E8wWQCT-8f5x}d=*JG6i=t>JzJA30)>;D1{-z#qeYR^vU;gGx(pk{T{n z2zvZjVC-I#+bhfvTfsi3Js{Wl7k>-_OJ#A6pC`Yc)!IjMRIa!~zHmwsnp~dZt#`yF z}rHVVq{56J!#u-G9l_9{!y zxFPteKto8CzHzl8)Nv{BD$_*23t5erm*uZ+Z(gd+hj{Jx_hJS?%})17$8f zChueR3f`vysnXM=lfOeSt%cY}o;<_k^2g(cLT?tna;giT&Fad6j8rLlLb^v~3A?|u zB+MoY>SO&JviuWb7EJEwVaO2+jk&@wy|%(gx$e*L?Yo80w-!X+Z%LhEpBt4xdjxr~ z$}&bb97YEiz{DS5HPeF$mCn8(0XGf^xyQAHImc4#8Z%2n+$d`oOw zV;3ggf%~IqM7uSEWY-T;kYNt2*gmTSwH2H3kz$NI2ysHgu?DHCs&O(%Zs9G<{cMVX2WVpSe2OzrD z_dkVTEa=*>GGT2rQkl?}o_hHjf^mZTQ^1nW$M537ve&>cc7W)ER*j`xl?9D@aZ*Uh zWQZiTrx8AH;Y(K-7Oq0tdmB-)I^O^GPliLRMt^z|RVuA$(w3{j8Rk8jQU_%J9oj`V z6mk@=RHpH(zcZrt_ZZBAImBv1NKg7~|mcP)W0$^7P# zCe%(hvtMA*C6Q^A$+e-?Lw*qwLJ+ZsT`0%>0hV6>W=Z6SeK3J#8tTOa7QJ`r=TynC zqn!*lg<{r<$j;qh*axtr|Gafg)O8M+CXfaG2HJd%c*-Q##O518xYhwT?uo zcLj@uNp=_j!9=JK@u&#+Of=YjzK2=e;Kxg;?BYPZW9|s1u&zYj><;%WO>(Li1iw** z?EKgd8k@oUSdRmXKK9~r%5~~U%Vysf-rB;jtC%;E6(VMCcWG*nuV3&3%MJ{SA-=h+ z#_Q=9Xt0jdVR60Sd954a?qU+8HUrC02xhP9zQN01oi#}(s|~rxi_>bV4D3qB=o$t8 zcxS>~8wf~q6BF%=kL5ziVSG`(KU7`7yM!ba=d!x)+BK9J(w$aVHV7fTyAoxf7c|zw z#4sjw%S;vG6EFgy*o=tv0*ik97C>Hir^TzD3R(S}Fu?}FeOnWkCqiQ<;L8L)>EsBs zj!kHN2Mph`8YMl3N*8+4`>qTbJE{wjj$#sUtpM?d;7_WM!7c%aUNxGMeaW}1Hb)mr zl^eZiL*RSC=Z8*2`f>;$Z2%boDPB||L5BhmvBS{X8nEarYhx*Qpf|N{uO-9o12Sqj zAZ;O-0JwY$_Dg)w$d3^4gXsL10MWM}8cSu33oYy0D9DBFiIn4mh_#1cMqwnqP$A>5 ze*}$H@;QFH3`elT=)uw)x?0*GR2SHzDfyyXc7$Q0;PN#<;`l5nP&oDSoH}2Qu-YIK zAXRRK<1qjIntwF-PLY<$v! zNXY~cNBA-Z_MZkzCV#wQB>KP}zMxAi4AVPUz?bW;^fh-+NblQ%$iEAKI3fGv@UgGJ zQpg7`8-@P7%dD~DJXYhJyB;H{Kdp(Rm#U8ZNDhpCyXaPt7Al+bC7~D@$A%pjS4DswlUQA%oB_wU2Qm+9tgJ;Qj z7Yo9j_y~q|N06tXMI=LGrTpmeL2!RKUf%+X-tpN6OffGSVWA_9RvL(ZBsA6oRVg9+ z%T!3&Z$T*Jq2_Wk_`|&bbJEtPddi&~4k$VN5jZYLj ze1eGC8)F{@mwv};H}{aKbd}Q0FZ&ShnAZZgXDT2bCZ;qRv$76(N#pY$d;<43n+@Z? zV|Dt`sgz3)Y3RgmEoQaK2Qt%iCT%>PtgjvNE=_!$ejYy^v5ItP|UNvOBedXxBUv4bZCr0=;gOeE z81n`Z+r!Te`P96P8?pKTqFsGCmC6qZU1#Y?vf8~A<5~pUhC0ew3z>S{E|gu;n#Sq4+p`bFF3M; zDpyQ2d23s;t{w7H8U~hN6Zd>Qa`6ictK!Etj75by8-1+(N{_Lcke{}r?g{PwRin^< ze4r)QYkrr9(T|X?Hp*#WhUxn60-j#`q(*S-^wT^ zk4Ijr)Sdqr`dH#ulsgefduHm1P3yM;6Fm#QOh$=qf_k%%eZ6sXhY2X;OFSyGq=40o zYWoeEQV>m_+M48_dnGVW=OQm*CV9eExcoH?Yvf(mO@Lok@WqoaV6|KQ(&5W!TAkj3 zIFMHYcNePzGaaeihL2^z{pWn6)kKK@y!l`S7Ts36T^JT)==g^nh|QKpfomQCUuL3S zb|Cw^RFy!oji=wZ9b+d&+k@JPGw;10~0n7P4BKDf_5A zYZA3=){iuf|3grUmLOt_kjiunyIt_*F5lnB+y?h!nk^|VWObpFv#GrLGx|PeFo{s! 
z3(DL`L~JR7ya$FIN04vu^X`YJ`wY%zhCU2wdTIIHn_1W2E$)O5>>c=q{2;B?W5*eb|!Kib6)aKC~-@`oAnOikr+ zR=Y9dTPnSmsO@JaGH(aEWz=f8zZQM$5Smhr+AU9Q4~3Ckj83s1l^$nx($qW@@=V$g z;!lcfv_x6I2JXk9#B$ZaXYR>ibTW+WXsp@80HQB1`wsOoi#pT?5}N@!Oysp-iN_9; z2aVaGtcv)EpC=>8dLuf&;RLHmX?c`#x;a?j14;Z89pOI2fMp}{Qh*X`r$U^=!!VMJ zsQtdBCvX-Uc9b%=W>cTpfh0+b`C=CXmMzd&5eA<k{tfpvB0lsw}4fGR82-}Rv#yF&M7a%1ls}OX{3VvtH zX%NgTyq-`}gnjI%e7OHPZOrl{{*g9B2{XID7w(^dFHK10L%z0f8p_QA9j2zZh}Auf z$)`;8Y`U&X50bmZn#kYBf^|Q7awWE-o^ZdBFPl9b_0rYcpM8s1eNOE$%B0Vu_7?_{ z(N@ig61^Uf9Hdf#VRPY2J%97bbYy?1xsUPR!75af17XOGVj zv0iZh2SYKdYaD%ya-X0Pz35AF#*<j_mx7`9r1qxY%`bdKIPyUDoELlV@Z|O;t>0bl#qe-bZ zL%ORPt9>&UJ#8^>9!sUzEwe9C#Uq|>ELnhA=}MSiccNaZkjm!pMT>|%NIF=qqw}{X zjeUN_tgPc3Ixj%Pd~gzv1&eNsPSPdeQZw3sz4)?o~*hiODh+T(p_>x%4as7=a;mf!i zRF2(3y;ja8Vb#M3(>W7{)tb1iy#Xmx4a>?5heU@VmHuGS$;*DBa@+0H#eN3y|J_8S zx0ztMVUi2o0okvz^b{jdq1_=E4?y%cH{7J$yX~~of*Iu5ZwA8H?}IPDppXX-J(kAcA6bAmt1UVBwVK~u-rDuZ@h_9Y@Rw^*YXSD%QhIc z91yL&+g&Q1+<|Mgxn%SA0%80!AxQ9EXR2r+B zN0vMn(E0bn{W_Ck;{(Ias^ist;Udh+ICPlzfau-r9#R=wY{AM^l8jW;_Aj_5-iE`->eEH3UI~3e27a+(-`TB^( zm}qMun9GLK*kSx0s^%9SDbUuHE)l|KBPNku{C|9zE#FrTeHwS}|Ys5lg z!d0|2I!q?NwO}d4U(VzD1`w_DxtElCzKc2xS&chOf|Gb2e0hWU0`W*%6|!;MG7P>= zNag2%=z5-bMV0hDG)r1VX1}1g?#PEPZ!!1?BK8`o+|T!JwOl>EVYdWCzm>kG^1KYX zv;1E~-w2kW*fy9C*v65g$`{DXK|VigIRtYB``Bv3Syq$q^IIw%-%F#&Dw6!jk0>c< z>vasCDvTiJRn*Kfc2p$tk^o;q__M6G;>J5FiJ8=T+$wUfk3ZoqV%vz;3Q}AEd}&l6 z=bcxekA1C*|16$a)V`z4gG@SmOeCoegE3Fd+lGa}Jr5)yNx!MtPqB@H`!+c894|S` z>K|slp-kFddNXMjPOkn${uU!D&k`eO6qH;I@t@+8??%D><@Ht1XchWcAYLyroMSZ&_3tTj zWj}rPY$lnQ0Z98|M65X?HV4^Xj#Sp;nZ+vX%mG;7y#Ud=UuHzPI-7tvGw*bou)V{9@ z@oI>Mp%WmO+r{TtyRIcPu)3s>gaGIc9oPsQE!4geyg{~U*aO`Szis6EIcm=EZ zy-X*{@rUV{n`=pSA*ynEDIhJuvIetqsM=uP=dVWV8-S){;akCKzfaYR%8+mA><2N# z(Rw(Mrk;W?b^^0$9r9vSBbL#14O019each8>J|iAh;q+7y6932S!X+x$jT|SzE*gs zk^uL4HI+YaTZ2M2V(@kUzJk@aX~~N6_xUvI#{{y@au`u^&mu3aagN!Dr*HyTs`%Y6 z*T9#T22`fu2Uc@vvX!Wm9;Mc`o5_OF1BvqM8Dzf=E+oE0?T4z6GeK(+j?NIw8bGvz ze5^&La{&#RB$Kp24$D;qQfV(Rmy%E~ad7`EfAQE_Y_mb;vlt+{KF-ZWd3zyk+_r-> z4jDqYk{?FRq|3 zLzW%!5Hb<9zgP8TOq)0~SU|b^w_o|FK{F+W0FS#T8RjB>WP;MeIpZCqf zqW`=&ut<1hv(He;YQ36Ti%Or%2;Vt*Cg(os-s5+KXnDkQ8r4nw?++F>eL zU5sEaDr3&jyJg>!dHvl9cfSVN?}EITWw}djeW3yC01*IZV3@x8c3V+-dX~D2-;$k& z-H6ogI#TH*FdpkrFW;yTJVV0%Z$yJ_GAO7QtM;P&`#E}i)^{W-q%YxE>@;;wcsd#b zS?)o^GWm4t^$<)to>(pA6;?Y?bP(mT3OejVDUqkR6XxIzuyhq9F&6D29mb^dMT`H0 zm^WC3)#VQ8Br+Zf-97I#$-m@=Z@!7VIK!}5q|*EVB8|6wY`!e&r?x%?=a*@nMP>a( zTC}O0+~0~tG4Ku|)=f|X_%i3fJe`3tNj?`KC*PQ6U$wZBXp2lOU# z#C=4po4~D&Q(4US7Y{Sz(Z}BUo_e_n5N+oXPRM>WeVBZKl%K~2TZ+|z>yDlrhq?a^ z3`^t-Y&T%La70sb21M6unzN`_U8CZfE5s|bFJXpYb*KZR)Ewj_xW9>?8odE5KN-xA zJTI{Nk^Opz%J^DZKCYI0-VDCHXn@9g2(ouP*3m?j#p&J#2~64tUy2GCdv=%XwT74AQ)8T7mes+E_ei;hz^s98d=LX{L2g(1b93C z6031fbVIxNjh4NAM5;!*67B$odR<>+KLPINBVy5fa#bQGayTBQJTY9tZxtT)7p0x` zv>~X8Sh@@#oZ&ruaTAzZ@mP8b;Y%d%@3|3Y*3A0-xN|J}5ge0hWQJbvzB_I#f2dS4?t>zF| z>Yq_%+i&FNOfRB{W{9UCE&UR!!c;Z;?z_zwl6Dvtd%&U_>ES8Le>|t5yYG>W-wYzm z7Cj8ZuQ^ILV^;QqVX3^{G6_Mh^Ud*V#b0LiYcGN2(kp5sT_Y2(3?YoA4ypCT4@0(L zSp0%Vv}j77C7~zJ0i?|EBdduV$B9z*Yq~7^CP^CVgKH>DQ3>Rb%5BL0E!6u_zG;6F z1TzEIgO5sn#LoZGKv8OVLmS55A&#<<$RDvH!wkfg=XR`c*YSw9kWbca!i{1%eyh;Z z_eWM&;OQkY>F?2&;YXxin4PJE1bIR#SW;D%yWefXHcrhChDxefjo<QTp)@x;EnuNwC49xR(XX zU^Lh?2xc>Oup)keb~6myXFvptt5|K-?7%$MEhFyRLCq8M4Y8F}f2@mdYT z^wf2@$fRgR+cz(W*Zfb2Jj(__9x8Cc9%xKbeF_ zQ9Z^InZ@jshM|vTsx0Qb_y=zROFOW9S8|2biZ#BXlBgH`BN^%vGLA5FZAIAuzfIV* z5A#KaVR?MczAczyGjZF~5iGiHzP!jCwh)hW*V2p*lZZSSgRl~hH@jwo4xeqA0~#i~SE<(!r^rSV&qS`+jH_ z2hjSap!U=Gq^aAWvDQZGn!M6#R$n>5Ph{>`i|yAor1ldpEwjPA|qv1;$ zpL1**SSo#yC;lqt{>2gKV>V)ZLJOMTbqe8L*~2h@1ouC?giPm9aDIrycDgE;KN 
zr!eFYCUQBHoW@tw?Z8oVAY{3#u)0&Ma<@qD$ID6mqWGEzL@< z@<{>2%9!JK9V|L&K#-_huoK;{z9!`EGQt%)h{<=yW4;ua_hdBKQa)*S3i1+AYQ9*x zhWq4kBSr2|OR=}t0}^>TipZKyVuITk{3z@UATNE)k2z}Jq?nJQWg$tn#_Oym!w{AfyJ)QhR+S!uvotmHbY)hCY-nQbk_BW;i3 zmujmBv(!$O7+dB%^XGpy|N8fH|Mk2EuXA28TJxX&{r7*p&8)v^VWGZmGO7Q}YSv%<{J-D# zT@8F!&#ogUFX%eJ!*gI)5AUG^y_mOe-u*BCMNXf*;NO7%_;1AG$qN=u|C`S&m_B*V zjA`Nj5v=~v{DG18z zkJVjV>Q3cXKas}NO@jZf=Yo>(h$t1m8JSf!5Vl1Sw*9N%051|!GW2u?AsDx~bdKseUYm*cv_if&@KCzZW#5XNJhux{)_L4Ng|FifLSspAJJiKqwbC1pE0-$9txLd*kC9%nxOc-&C% z&B9ksapALAU1^Y!Dus_o_oz%^_ji_r*MWmh9lwq2{@&%Qqd8$2ve=(`x?|Fo2?#XwbPn?M7icVnsld7;No>e-qnUS z^?3UjTKKN$|O z8vW@{s8V4?6SrIu&M@z=2X#R8-#{+9p^>9_r6QGI{hblLzsF!E%pq1AQXC4ou%^Qj zuL!P7-w8}+M_RMzEy&*;+O-(5B=MU^n$SDl%yGerE|E;7OpXn$9`cJ2AA*e4ccC2j z2dwn^mz4y5*n1PKOhvz#U`6j;@)=dq?Pw>%4WWp&BC>NgIQAY^T;P8^>av!98z-pb z@xOK4kj-k&ESZP8w5G$Ho(Ru+HYZX>cbeZtV^S(y@y%Pou}Hq?y@2X}1A+a%D4W&3 zHO{9@pElGx@1amsV@tT^J&+DS0{fy9x53InJ|vN#E|z%RozG@p721p zn`Tdx&b_E4=}a8w3c1^-Lf#FefT!}Oo@e@Iv-;8J7C@BR(v`~_g!8xB5>D3#gwdI# zI#;ka4N0EE&nl!)n9itv0)Lp*)I3;3l`-w;U8YXRaP3N@*uJo$hn0Tl#C58b!hs^B zuOpINXE@Ai@4j3_<;flBr~%i7m|91oG`PZwg-Lc80K!D55b>x8_zVbaKi|WwZt%k; zRCaNo-qE)OQ&?9bZ+1ucmL@sH3xwaOLUw-W2aZkWeXPg9iazGzGRk%8NK0qk6JFcG zu`8H2k`*#$mb+9n$=5FU!O9K{iy^)_tj6o{7Z6xS>aeIm@VwfMaCa~XQkucaP!MLX z8ot5HU!66HC#wxPsEgBTstoK($LN{_|2SvDTpb8Va}%@57az+3lf(GJJb$n{pLYpK zEXrYZ-L-2dGo(8$w`>$bdUqwtKre8t1#YlRP|FMz;uA0eso0E+^@0`s_$`3E>`seT zJ{B_jIbnhgLin~OE>{G{P9T;EeB#Lwkd94|z5|ADS&fnwO{EJx={;A5j2+d5NJlXV zxK@DpgYYL+$Y7TMWUm^cWMBL(tIgKMQ000r+8Fpw@cE$=k-i)PNE<*#fQlDYNYJ4G zWb81wwgy&ombEdIJJ6e2x7U(k_W&6+9FVpkOaMZ@3I8QNXygY-_(3TDB|!A;hsID@ z<3daOHVJY;dm?4~AY<)8m{Ax>&s50xYaf7P6@0eeF2fP*FnU;N4qh#36sq&>Axgeb z%Z_kt6hgiVNGzWz1q!F0pHt_{5mp;y0;H-h-R<>ASZHTQn3qOa>4YQ)fh5(4C4~=a zF%njyVa1t0!s=Qth^2C{8?_32Dr|hzf=EdO5J$u^2L7Lhl??uP`AF!%9zMTI3>?!t zSRj^buJk2$S4ivIg2=xMfH2E=3WP|IS3pT@_`{s5?)V$qdvRr06(^f4?t8vSp*E=zBuDf7K-PANgF6ewhR-V?fId_~slybjHw$;}Cup8hNts zF;*M*J_(DWAH8MWDAbRADkx)S0W#hsU)zX}9YHL%#`dqqA^b;>zGub9SY6tN6e=YJ z(BlpoqMZ$nxy(Sne1htK2^~9!>RTDbr17XrmAdmEgN`M9MY$7!v}cB%*tC8vFso)F zmPu%_O<->(s;@VW?l1w3e2GVA7U#2?QEk70CFFAcGbrqyX3hy!^caCfjeFw;=VZTMIw!hgy)Sxp4_&zlcc zu%g>)w+q8!3>{zJf!J(m61e6Oh-C))We2LiOSSTI`b0IZraq~F=-&<74J+elA?hN(%bk-U|ueOW8tu} z6TQC`8GFokI{XRx#Rg1{DJfvJHk0>JdFCh7vROaUH2x1kDO`+(kH02+DW3dFJ;kbRK8vnt0NmwyV1-Zm4a_&BTac>qYCDb#e+i+J|= zL*R6)kg=7Z<$lP;4+y`UKk|ne@dCNg&i)G}%{!e0v=I|Nax zQPuL)_E0$4#po3CLCJAeCr!ykBhR3XA^xP$MoX0SYY=`cS}aE$d}d7!qm$rdM`O(% z1`vH&>38Utnbe^nkk|~+VIr@El{oA$x!{-`+NzL`_<0hNtT#gW4JTMla?7KX)6K>L zA4uXR>j?Kg8df%nuh8RHR~Av zJz_a@0#j@rtvVx;jrUp*#RGc)cLWMkjeeP=LR@D|NBGT<eH+59HA4y^NhGLS?{;-3r6=1E5kwHxSE*uwr9$+B*ZWIHO;B8H-qLvQ<9iqQY@W z@LH>8>eOKfB z(KF##aB)Mhjj@>3xRey&WD!ZN{hY~-V~#|=^(A`$3?NNvmau^vAD7L9W3zbkMqbQn zpEZ1slf?>pvBra>+GFQ$NCcz|wEP21ZVr+-8j_mx7`9rIT~4m`98H)SC}MG zMj3?nJ*-p_I)LixwHH=Pxdui@LM|d$xQr%owNj0DN0MVAtx{Tvn0$qD`Hi>igB=X)}uu_AJwIS)@ch%kxTx#Akd#j}^ zWp(`vYp`7;(lDQyB&N+^K+<648kE01Y3lPUW@R1U*m*uO=7W=X46NwKPr63AvP4?; z$pVt48BDkjF*_@2(J!5Ff%hwv|0X|X&-{PYh2c-K8rvncRC>LM23(IIjMX3_`KH6M zYY4w9iFr_|LhL$(BbI~`j_YqciCD&6r*g~|>a}7H39BAPn9dn+tk%SB?F~q&>R4t* zI4C*{rSyjtoxJoHD!1KEUF@e5|KCkSdYu6)*G+PPJ0Sa2D?P;sbZB=F#sd)jjSV*_ z_hvioG=DmI@|%G$_WKaaFKFaJ1Xr8L*fl=yTm%N+2DDg$;S_fMb2q6{vYlo{%pn(B z8VT1a6IO1SBLvTL)FXX$wS%^}8ik-hTe;Q|G@*Cx*rqW1< z2(omEktmlAA(r1vxI@9caz2uLly8Vwgo(BmgsC)~#t!565I4PF)B2_*B!7j0NDB_b z%0pCNz+LP`IF`eQSuDnU*@)dz4=cLNRgF|Ry_4E*Tt>26`Vyuh2T6Wp;;IDn%RLqH z*|&?;bD8-ldYaWY9{!zjd(vo8O%!oCjf_n@0>_@1lsN<(GanQ7^W(jjpkLy!j?UxH zu$ro8PpLF?7v+lAkhoF<;bvnHa!puBOt^}+hQegQ}FCdSkRUsS4Eydv5gi?M6h_2^}7gR~xLo=n7WY#l^>yA9c@*0DW 
zAY(64%Kd!rR?F1m8+J=T^jqmmD$h-)JIg*N`X*Qzifw~=k8K<^s(6OF9OUydmw_;s zv5&1boMkofKfk8Z@x3&PtRzVf{D_haS+8U8RAB@$ucBrvV@E}zF7b#Zgg?t_%dfwo zl9)lQ$E_rH`}h;?BDRfHT0x2pKrBrvl~`+I3bi!gkA1d&YnRCZti84WAP)qY7${~VBHgZUz_ z3_E|9*HoUKL2Vy~lKwOROfEnvtpp})9Le_k7O`C8y;gjIVez5LEw4 zDmWGm$L{dugH}Sv0`YpG;T)@JYMR;MbSXI#ZX!%VG3sK2@WUbFVff~!{QL#0(EI-2SibK$R=1$1 zR#fs2()4qYb>tgOMT9I6WJ zyS&wqz5x&=3*T~9`+bUDREB&@XWfq`j@HA8H02axu@jh0>rfY?nz8h*Yf#FU>QkO_ zRyRM;LX>;v(uJ3z$vWGiL{?5g`dZ9!Qj5 z&!GBka3S#}dOuW!oC#WsbaVz`)&Qa%*L&9l(!eqrfoY&(~u#AEB*nHws@qL2##%6HF;gHXb{E?uNnRa zoc9mfh|;VR)IMV;X>{WV6Rsd*?F6ME5zqR}hxc55cTBWOOY>un^Q@+oMGH~deS+RH zrIDX64?JtnfOG$sSV*!|Na%4h zq#Xp-7Z82N)h$J3UQLMEX1_o}gsX%h>9^}}l&U%_fhKerRP z7p2tU*EG^}$sN_NLhpA5a}$X9yl);B{pZHQiiAft`wSJV*6VF+QR#D+c5d!1Z@)xd$1`;M1(vgD`1$Vzq=9F@D zM4s$Un1k10rK=!`F_4QiIFrT~F8UK<-e470mouP~$apAp_uSJY?~)t7`3CCZ498+n zO7jDVRNnTX`Ld{=Dt!vhFH<{<%KD46a8nt%w-t+G;B92Ao1g^7Vj(d*mc%;;#R0Mt z5A2rl7g$a1&yJ#;Qc35wuOyBGdJ{R~9x~QV;MT^fR?PPo^%-%{vDdz*o^Jv~+j)c& zs$Wg(lP-|5^VncZusU$vp~|lPx?WSAMaAkW71vxQUZH&n zGX$$c9UvvBL`THOJC>iqe{&X_?_RSz3p>Y<&zXJ#o1b2N_R5DdYI#-tmBV zV^bG&T-|*{niN?TuFz#y-0@tmraZx`dfXRIXxjYU!mW(9F@^2jyz~r%5d-7mKuXk`0<@!3>bMzw+23`3@ z6Lic4kDwEvFd68PwS42}X2>AG+wqrJjeCL{8tF>70~qRceNp{*grA3u zt>Tla5-^d&@i66);SzoxRo`Efb~ez)ptr=*WdPv}?+}Zdz}$?((p!L7B6)w$jW{F6 z0&)=$-LjS*BKN3~CWpQxjd_SAk`bjkcQyP7pOdPU(a9S@eor|05ggOIck~n)o8PH* z)Jx)3=T4+7OtqB$f;1=&u^d&ca8Ea?8?1fyJwqj{IUMaFO4=vXaqI(fKE;zTyEURD zc?e2KyjmAkWW_H5xvxG*tz@;DL$K2Dgeu#9BR6Jv5k)jZJOydVmsl01sMUAhZN8AS z!?4%`E4q;$o}&E6QyRMaF4_3aAi`|X!!i7TtYkB0Wj{EU!s{&)k>onxY`<1~C97Y1 z305w>pf=J~GV$^d!dU81T2K5IZ5xKgFL*=?QTj9ynmikjQp1m|CUP7nN?9-I(ySXK zai|Zjp)5rukV7fAq53z`???H!`x8N!>9`(zQ2Zly{tpI6@B0^FHwnpOSSuMlRJO-5vdnuXUZT!o{$17DXNt_-)+J+PR$R7 zimOGaTDzKn9DdSZ&p;!6Mh5 z5yv*&!3q~lm?O=QF+kp?Vel=6V`q8CiJOtJc=N9a0MT9jVyGzH)QG0x4MeO9Ci2(K z!LcEjFT0^I;Rf@f7+J6xb?JriS`Eka)OEPXBx^<6SI>yoypM=H(*{W%DsaLca7% zYkWl|K`;78GSnqx9AW0zin0TKo3Lpg=8Fu+a{26iTQJ3D;I^kDtmwM=@*;QGLOjx4 zOVc}iLgYypgcW$a*)I8|>-e+bdYnJJ%brDY4@GAN91SG@i(xwgSX} z-^FBM=eGfq5AxSSw}LPh<`vGTn$;}2Du~Q*E3rJCrM4p{;OU#$iV=?p_oMgS@Xh=A zx%;+ahl$1zKVDqTYI|f8QI2mWelggBR<4;qm~9xhxhB<0%^n`jy9W>J&5#DP*fv8o ztNYqb6s1^evHxL9Iyfa13rQPT2;IwKiJUC_`H#$$kR2^&rT<3bCZ}*~hlQN`)`##9zVOzc>OqW+TSMx1f1lCll_4 zJsk5#k`H3=Z326fAgbHd>f-@~;R>t078igWriJ+IqZagt*CfKxc7Oy3Qm+G8NMsda zGi5t;Yz0y=0T5l*jsQ`ywiVqcHmAmaB0k zS6R)HT3Bh{8Gh19QOWWIC2ORxeQEB<;&}K;3zr}wA@vEmDSpP9gID> zwYW9t50nmP7B@%HC3PYiIY(7XGn1=)k^!+YX8T=(6`eF7NK`J^iSAck5^`rL;R+nY zq&wp=U-Hd+G6c4SPu!i1x&)M%FIKMNK6%_okvr5%-S<^|3cN>Erg`EN9 zxsUlVNA0U*^HH=kB+=G*jn$+-A0aBgwiGX3JV`=ZM-thglUR0Oj37IdnVF0qO=*mJ zF~vM94LFMxUt_iUq!A*st)+OR?NR(vZ6#rrIEr@5#-jQc5Ze&^{;iF%F*OC!Hx6|P xEV;(&K79pRrrC*w6$PYn%^F$O7+dB%^XGpy|N8fH|Mk2EuXA28TJxX&{r7*p&8)v^VWGZmGO7Q}YSv%<{J-D# zT@8F!&#ogUFX%eJ!*gI)5AUG^y_mOe-u*BCMNXf*;NO7%_;1AG$qN=u|C`S&m_B*V zjA`Nj5v=~v{DG18z zkJVjV>Q3cXKas}NO@jZf=Yo>(h$t1m8JSf!5Vl1Sw*9N%051|!GW2u?AsDx~bdKseUYm*cv_if&@KCzZW#5XNJhux{)_L4Ng|FifLSspAJJiKqwbC1pE0-$9txLd*kC9%nxOc-&C% z&B9ksapALAU1^Y!Dus_o_oz%^_ji_r*MWmh9lwq2{@&%Qqd8$2ve=(`x?|Fo2?#XwbPn?M7icVnsld7;No>e-qnUS z^?3UjTKKN$|O z8vW@{s8V4?6SrIu&M@z=2X#R8-#{+9p^>9_r6QGI{hblLzsF!E%pq1AQXC4ou%^Qj zuL!P7-w8}+M_RMzEy&*;+O-(5B=MU^n$SDl%yGerE|E;7OpXn$9`cJ2AA*e4ccC2j z2dwn^mz4y5*n1PKOhvz#U`6j;@)=dq?Pw>%4WWp&BC>NgIQAY^T;P8^>av!98z-pb z@xOK4kj-k&ESZP8w5G$Ho(Ru+HYZX>cbeZtV^S(y@y%Pou}Hq?y@2X}1A+a%D4W&3 zHO{9@pElGx@1amsV@tT^J&+DS0{fy9x53InJ|vN#E|z%RozG@p721p zn`Tdx&b_E4=}a8w3c1^-Lf#FefT!}Oo@e@Iv-;8J7C@BR(v`~_g!8xB5>D3#gwdI# zI#;ka4N0EE&nl!)n9itv0)Lp*)I3;3l`-w;U8YXRaP3N@*uJo$hn0Tl#C58b!hs^B 
zuOpINXE@Ai@4j3_<;flBr~%i7m|91oG`PZwg-Lc80K!D55b>x8_zVbaKi|WwZt%k; zRCaNo-qE)OQ&?9bZ+1ucmL@sH3xwaOLUw-W2aZkWeXPg9iazGzGRk%8NK0qk6JFcG zu`8H2k`*#$mb+9n$=5FU!O9K{iy^)_tj6o{7Z6xS>aeIm@VwfMaCa~XQkucaP!MLX z8ot5HU!66HC#wxPsEgBTstoK($LN{_|2SvDTpb8Va}%@57az+3lf(GJJb$n{pLYpK zEXrYZ-L-2dGo(8$w`>$bdUqwtKre8t1#YlRP|FMz;uA0eso0E+^@0`s_$`3E>`seT zJ{B_jIbnhgLin~OE>{G{P9T;EeB#Lwkd94|z5|ADS&fnwO{EJx={;A5j2+d5NJlXV zxK@DpgYYL+$Y7TMWUm^cWMBL(tIgKMQ000r+8Fpw@cE$=k-i)PNE<*#fQlDYNYJ4G zWb81wwgy&ombEdIJJ6e2x7U(k_W&6+9FVpkOaMZ@3I8QNXygY-_(3TDB|!A;hsID@ z<3daOHVJY;dm?4~AY<)8m{Ax>&s50xYaf7P6@0eeF2fP*FnU;N4qh#36sq&>Axgeb z%Z_kt6hgiVNGzWz1q!F0pHt_{5mp;y0;H-h-R<>ASZHTQn3qOa>4YQ)fh5(4C4~=a zF%njyVa1t0!s=Qth^2C{8?_32Dr|hzf=EdO5J$u^2L7Lhl??uP`AF!%9zMTI3>?!t zSRj^buJk2$S4ivIg2=xMfH2E=3WP|IS3pT@_`{s5?)V$qdvRr06(^f4?t8vSp*E=zBuDf7K-PANgF6ewhR-V?fId_~slybjHw$;}Cup8hNts zF;*M*J_(DWAH8MWDAbRADkx)S0W#hsU)zX}9YHL%#`dqqA^b;>zGub9SY6tN6e=YJ z(BlpoqMZ$nxy(Sne1htK2^~9!>RTDbr17XrmAdmEgN`M9MY$7!v}cB%*tC8vFso)F zmPu%_O<->(s;@VW?l1w3e2GVA7U#2?QEk70CFFAcGbrqyX3hy!^caCfjeFw;=VZTMIw!hgy)Sxp4_&zlcc zu%g>)w+q8!3>{zJf!J(m61e6Oh-C))We2LiOSSTI`b0IZraq~F=-&<74J+elA?hN(%bk-U|ueOW8tu} z6TQC`8GFokI{XRx#Rg1{DJfvJHk0>JdFCh7vROaUH2x1kDO`+(kH02+DW3dFJ;kbRK8vnt0NmwyV1-Zm4a_&BTac>qYCDb#e+i+J|= zL*R6)kg=7Z<$lP;4+y`UKk|ne@dCNg&i)G}%{!e0v=I|Nax zQPuL)_E0$4#po3CLCJAeCr!ykBhR3XA^xP$MoX0SYY=`cS}aE$d}d7!qm$rdM`O(% z1`vH&>38Utnbe^nkk|~+VIr@El{oA$x!{-`+NzL`_<0hNtT#gW4JTMla?7KX)6K>L zA4uXR>j?Kg8df%nuh8RHR~Av zJz_a@0#j@rtvVx;jrUp*#RGc)cLWMkjeeP=LR@D|NBGT<eH+59HA4y^NhGLS?{;-3r6=1E5kwHxSE*uwr9$+B*ZWIHO;B8H-qLvQ<9iqQY@W z@LH>8>eOKfB z(KF##aB)Mhjj@>3xRey&WD!ZN{hY~-V~#|=^(A`$3?NNvmau^vAD7L9W3zbkMqbQn zpEZ1slf?>pvBra>+GFQ$NCcz|wEP21ZVr+-8j_mx7`9rIT~4m`98H)SC}MG zMj3?nJ*-p_I)LixwHH=Pxdui@LM|d$xQr%owNj0DN0MVAtx{Tvn0$qD`Hi>igB=X)}uu_AJwIS)@ch%kxTx#Akd#j}^ zWp(`vYp`7;(lDQyB&N+^K+<648kE01Y3lPUW@R1U*m*uO=7W=X46NwKPr63AvP4?; z$pVt48BDkjF*_@2(J!5Ff%hwv|0X|X&-{PYh2c-K8rvncRC>LM23(IIjMX3_`KH6M zYY4w9iFr_|LhL$(BbI~`j_YqciCD&6r*g~|>a}7H39BAPn9dn+tk%SB?F~q&>R4t* zI4C*{rSyjtoxJoHD!1KEUF@e5|KCkSdYu6)*G+PPJ0Sa2D?P;sbZB=F#sd)jjSV*_ z_hvioG=DmI@|%G$_WKaaFKFaJ1Xr8L*fl=yTm%N+2DDg$;S_fMb2q6{vYlo{%pn(B z8VT1a6IO1SBLvTL)FXX$wS%^}8ik-hTe;Q|G@*Cx*rqW1< z2(omEktmlAA(r1vxI@9caz2uLly8Vwgo(BmgsC)~#t!565I4PF)B2_*B!7j0NDB_b z%0pCNz+LP`IF`eQSuDnU*@)dz4=cLNRgF|Ry_4E*Tt>26`Vyuh2T6Wp;;IDn%RLqH z*|&?;bD8-ldYaWY9{!zjd(vo8O%!oCjf_n@0>_@1lsN<(GanQ7^W(jjpkLy!j?UxH zu$ro8PpLF?7v+lAkhoF<;bvnHa!puBOt^}+hQegQ}FCdSkRUsS4Eydv5gi?M6h_2^}7gR~xLo=n7WY#l^>yA9c@*0DW zAY(64%Kd!rR?F1m8+J=T^jqmmD$h-)JIg*N`X*Qzifw~=k8K<^s(6OF9OUydmw_;s zv5&1boMkofKfk8Z@x3&PtRzVf{D_haS+8U8RAB@$ucBrvV@E}zF7b#Zgg?t_%dfwo zl9)lQ$E_rH`}h;?BDRfHT0x2pKrBrvl~`+I3bi!gkA1d&YnRCZti84WAP)qY7${~VBHgZUz_ z3_E|9*HoUKL2Vy~lKwOROfEnvtpp})9Le_k7O`C8y;gjIVez5LEw4 zDmWGm$L{dugH}Sv0`YpG;T)@JYMR;MbSXI#ZX!%VG3sK2@WUbFVff~!{QL#0(EI-2SibK$R=1$1 zR#fs2()4qYb>tgOMT9I6WJ zyS&wqz5x&=3*T~9`+bUDREB&@XWfq`j@HA8H02axu@jh0>rfY?nz8h*Yf#FU>QkO_ zRyRM;LX>;v(uJ3z$vWGiL{?5g`dZ9!Qj5 z&!GBka3S#}dOuW!oC#WsbaVz`)&Qa%*L&9l(!eqrfoY&(~u#AEB*nHws@qL2##%6HF;gHXb{E?uNnRa zoc9mfh|;VR)IMV;X>{WV6Rsd*?F6ME5zqR}hxc55cTBWOOY>un^Q@+oMGH~deS+RH zrIDX64?JtnfOG$sSV*!|Na%4h zq#Xp-7Z82N)h$J3UQLMEX1_o}gsX%h>9^}}l&U%_fhKerRP z7p2tU*EG^}$sN_NLhpA5a}$X9yl);B{pZHQiiAft`wSJV*6VF+QR#D+c5d!1Z@)xd$1`;M1(vgD`1$Vzq=9F@D zM4s$Un1k10rK=!`F_4QiIFrT~F8UK<-e470mouP~$apAp_uSJY?~)t7`3CCZ498+n zO7jDVRNnTX`Ld{=Dt!vhFH<{<%KD46a8nt%w-t+G;B92Ao1g^7Vj(d*mc%;;#R0Mt z5A2rl7g$a1&yJ#;Qc35wuOyBGdJ{R~9x~QV;MT^fR?PPo^%-%{vDdz*o^Jv~+j)c& zs$Wg(lP-|5^VncZusU$vp~|lPx?WSAMaAkW71vxQUZH&n 
zGX$$c9UvvBL`THOJC>iqe{&X_?_RSz3p>Y<&zXJ#o1b2N_R5DdYI#-tmBV zV^bG&T-|*{niN?TuFz#y-0@tmraZx`dfXRIXxjYU!mW(9F@^2jyz~r%5d-7mKuXk`0<@!3>bMzw+23`3@ z6Lic4kDwEvFd68PwS42}X2>AG+wqrJjeCL{8tF>70~qRceNp{*grA3u zt>Tla5-^d&@i66);SzoxRo`Efb~ez)ptr=*WdPv}?+}Zdz}$?((p!L7B6)w$jW{F6 z0&)=$-LjS*BKN3~CWpQxjd_SAk`bjkcQyP7pOdPU(a9S@eor|05ggOIck~n)o8PH* z)Jx)3=T4+7OtqB$f;1=&u^d&ca8Ea?8?1fyJwqj{IUMaFO4=vXaqI(fKE;zTyEURD zc?e2KyjmAkWW_H5xvxG*tz@;DL$K2Dgeu#9BR6Jv5k)jZJOydVmsl01sMUAhZN8AS z!?4%`E4q;$o}&E6QyRMaF4_3aAi`|X!!i7TtYkB0Wj{EU!s{&)k>onxY`<1~C97Y1 z305w>pf=J~GV$^d!dU81T2K5IZ5xKgFL*=?QTj9ynmikjQp1m|CUP7nN?9-I(ySXK zai|Zjp)5rukV7fAq53z`???H!`x8N!>9`(zQ2Zly{tpI6@B0^FHwnpOSSuMlRJO-5vdnuXUZT!o{$17DXNt_-)+J+PR$R7 zimOGaTDzKn9DdSZ&p;!6Mh5 z5yv*&!3q~lm?O=QF+kp?Vel=6V`q8CiJOtJc=N9a0MT9jVyGzH)QG0x4MeO9Ci2(K z!LcEjFT0^I;Rf@f7+J6xb?JriS`Eka)OEPXBx^<6SI>yoypM=H(*{W%DsaLca7% zYkWl|K`;78GSnqx9AW0zin0TKo3Lpg=8Fu+a{26iTQJ3D;I^kDtmwM=@*;QGLOjx4 zOVc}iLgYypgcW$a*)I8|>-e+bdYnJJ%brDY4@GAN91SG@i(xwgSX} z-^FBM=eGfq5AxSSw}LPh<`vGTn$;}2Du~Q*E3rJCrM4p{;OU#$iV=?p_oMgS@Xh=A zx%;+ahl$1zKVDqTYI|f8QI2mWelggBR<4;qm~9xhxhB<0%^n`jy9W>J&5#DP*fv8o ztNYqb6s1^evHxL9Iyfa13rQPT2;IwKiJUC_`H#$$kR2^&rT<3bCZ}*~hlQN`)`##9zVOzc>OqW+TSMx1f1lCll_4 zJsk5#k`H3=Z326fAgbHd>f-@~;R>t078igWriJ+IqZagt*CfKxc7Oy3Qm+G8NMsda zGi5t;Yz0y=0T5l*jsQ`ywiVqcHmAmaB0k zS6R)HT3Bh{8Gh19QOWWIC2ORxeQEB<;&}K;3zr}wA@vEmDSpP9gID> zwYW9t50nmP7B@%HC3PYiIY(7XGn1=)k^!+YX8T=(6`eF7NK`J^iSAck5^`rL;R+nY zq&wp=U-Hd+G6c4SPu!i1x&)M%FIKMNK6%_okvr5%-S<^|3cN>Erg`EN9 zxsUlVNA0U*^HH=kB+=G*jn$+-A0aBgwiGX3JV`=ZM-thglUR0Oj37IdnVF0qO=*mJ zF~vM94LFMxUt_iUq!A*st)+OR?NR(vZ6#rrIEr@5#-jQc5Ze&^{;iF%F*OC!Hx6|P xEV;(&K79pRrrC*w6$PYn%^F{$E9G|0B`c?H{+^ zLkxC|x7!ed_b|g*hQZ}9gLg#egs=z$uHRpH$A?FTK~!Y4p|MAw_NEz=+xK?&;M#li z8PLbe+wC8J^>+J1i^$Mu1Aa~YFXO|fgXaB3;D^sXy7%{!KYVtx{_x<3k6hdL7v zhkL%i5B~6(_~CQOhfi-ei&XEKp|Rni6C%RK$0tOG#Tj0`d;RXk|M1V-?VldDfTxGT zSVKZsXzX9r`KP;>8KJT9VgKQo*s#!u$&;f03HGO3%L5amrpNv1tG`}a9tfX4^RHk2 z`A!uPH8cLt-~Hv<^1!6%@V|Wbmz#5rBtBR&4|{J-ABPY92Q4UG@`*S~!DX2$fe z$p7F4hKY{)kAG&E@aa;-|M>HkKfHI;|Lg`Z`R{@Cj*E(oj|h*8|2seb{ZD(l{ry32 zDJnKHG(OJIxqmPB$^ZWGfB)_O9^m!AmGED{zjys#zU};f(M#TLfAAJ%En-I4c(t03 ze_!AJ;-3HB{D1BNOMUz22>+uIs&S7qFk-pZVyE37#?Jgq?Yv6;A?i2tKm2{dg!lK% zhrhR-`2PM6+#k=o|Mv%(KYa9uPyXw}zq{nWv|;{W#s9@j`O|Fuq5gPJoDv!vIuS!N zrmygcaRzU@GY^v^>xC0G@8*}>baz;KM z(%D1!qm*rDwdK2AsdVTvc^3Jluq6AjAk{r09Csg(hC*Ij%5&USKJlW5kanyYpB3$6 z+|KIG&vK>ms~<@1#+8EK=I4U4{1H*gelaogZ6PHdQg-kjIlTp!AUD2Uaw;Ht(zF+q z`c{)wd)5g#)=vaE{8w`PIUtO!5R|qIQV#It6F4EpXPF`S!2&*q)wuoUL79Lb$<(Xc zgapUC0u%j=)CAU;?=HQgq{@J*ez;jWF(Og(agN3i7Mxgkfq;(j8mD z$EJmPF#PjtvF%whG_E^bs>e2tvFu|QbSuU6nHo|6%IwI>amhl!!-oQ6|B~F8V}aNS4!JD=xxzpDy)UGc73TVS z@H<$o!+eg)r8mjv?b3udE|2l2--t`lBPeF4N@1Am{Ox`Hgt~48Mwg)BkfJ*%^`+dc zTg1-kOX1D^IzjP%hv+|o6nn(QL6y>Zaew@jzcDE77$AB-rva3UyGO{;uY`T(-vp+; zmevG5hQ}HUlGFtrJH8Yy$(A$gm$P4GGaNQIuJ9s3o8X(A*)@ZcF{AAcle z0QhEQkTYEPovf}Tz(kdTN2Ft7jF3L;8+hRrq`C5 zDA(~RetnA&{Mw4hJFIED+s)VNpxia^DtGUO+KNA=fulb+UGsLCA`B;vI6A7RR1H&HPF%YTLl7_sw48| zrnIixOF-HPi4MVPzNFtWf(erG#+*)o=oZ}l7=*E+i$Y3-Me`BL)MoVf^Oqn@JD48| zDVcoAW*$(jJOMtB@96CSPx~&<}Hd+OGN)Sl#32Xv})@ckiR2p*9_Q_&aWD3M(%X5*aa!NG%}Gg`)p}t|DS}^Ab9M4 zTgq|2LrRwqQkL@r-%|GghI9+k)1n0v3HQ-0{v4Fmqq-W z6hS4A-;E>2Tvl^(=5)lR2_4w(iSVp*10rR2qAy zn9FM4n4&1ttts_7_)sXUvLjrBPH+bx{@sy@Yau0$4@x75i#4u0^0}<8{9-g^E<4bE zV;=~&GaZQ1rVEuMotfi$pxkX&A#eLoz&G;8pXV5IS^bDJu_#K-=)5^K!r2?m2&d}? 
z!syJ>ogQFsCY&6>Pc5LRFl`Y1<@{b&Q}rO8DkGcI+sqvyyGMH>E$$8}dPwPoOkAQ$ zDd;1j^tFPM?-=*8+S@PVsT|sp4(okYNUnAwN=*+)u`QuN8^XH%|qD_S!3 zuJGCpid{m#k?O%?mU6dIb@G*SzL2sW&7!|yAFJ_v{5cA&6LpNQ5j-w;AlxlN;u{Ql@`LT54S@1a)msls=x|SVJ?zm{BdWRfx~vL2$(?c&rPg=*O%E zqct6gZV|69T5yVgYdqCX%_0T~1; zo>w6Odj`W}d%?AZkfO7$PNv+huGFT5mJGZL$S`j}nt?EbVe)n8FYy6Ghrr>xQTZv*Dc1)cYXQOxLrZ$5LdIMf0*;mOxxSl?`!U1lA*BI$HM3Tz z%yU3dGN4+vf?~s9@?}64^Er~gaQyihwZH6VwE<>8D!S7xo{xk$dwasXG(k#hI5_|$ zse~;Vd_cpYkdg!`&isB>*Cci^l>>Xydj3y^6^|McDV+e~1Y1Tz{}Ygs%^xWpiaM~B z&ug0u#q^F=u;oe*`jWdXWOi>zTVJS$R@Zw1I; zKsrFN5SYI~h4kAo9OT)AxR@bDmzuVWN?m%>ES@D}T&xIpbO;pd2q#ZOiAV>>iun;^ z0${#3uCGIi-s#CQbTLmFW2GZ?^)w*=P;jghvQk3yXQ_~qUjmTG11#xgJjiOgPG3Qp zrTyrXLAGS@at)Ds4M$wMAYZ0|mJ?O;%O?b?rO&3SBOu!IXI4_V{y>_0&4GxU&|jPb zL71*6ebd3KS#a_pKIC>F?COuoKL-$Z-HVvc$YUL;IVXg?3m#j zRx>hm17#jlS{?38$h0>CH!TE^Uf|egsFp=AKa)?M_7ObhU`l=vSo{sEUDZjZ(j`h) zJ?}=ml3ohj)(L>Po0;PI=#_U6myP_vdmq94RTjngZ&;mvL@bd zL@4`0u?1i+gZ`hv%LyYv9$!;l+ZB+am(D;+9i<*CyAiQ@tss3N!D9mv{Y5BB&s8RG zs2zzJri&>n+O_x)`hEnY3?ww~oD*4)jA>kNk-{S`$x!AMJhqje8uYPc8aH9|0Ytl? zJcG*j30-3CL~@!x7v#pFkV4H;_F`n^8{PA0z_uZs?`Iv4*9RL-YV+rxoq zht$EAP(#1_lOw-piBTJeiF2x~`5@_b0 zE=4SUf?^f?sM=A;P-l~mjc@T`RulBYI^;c}9lxv-+=f0Eq@Tt^%1F?18Gd;mAUad< zxY00wClWc-aG2Gmyi3QR=u2lfISq7ET2h$^t8ls&Bj=4-mzI=k{e}Ou72GOr)64S>ZE*0v`e;9Ra`InSC>Q6go z>xpfX*8(#?9JY){ime2Da}a&KX++DhNaPDVGP5X;)eLL)Hx#7+nmM5fIe6xUz&wsX zTtdw9*flWuD=1dS_gFF(cGZI|9(*3FUE`YxTSm~z%$CHFyb!os7#)~N2<2KlmIL#j z@^$sbf&6DJ3oA&`t+C&XW-*eEx!;o5uC5cf1~IT@GV*0TqQ6;{@?+LG)vmg38v)V3 z9k>NjM$?2>j-;LE3xqNjOrDBR%JA4~aO@Rd;ra>cz+On1R+P_b9NT80@r|RqmphZL zrdI;-p$8S26v#F%_z)gb;09zQhC9r^lef<5~I8ol!yd)Y$lw%6^b2!ldtlT zcZ1Y<25U28K6>S-ofvyUXz8flq+~D>Ic_d&nFGjnl)h6c$DHPT3X0ye5T@t|tMPdN zNVf^peBG0Hbo*W4bo1e{d7$MEl#B0Reks5IcMIf+8pyHCv|ay_bL z;sThz2z6`^ic*zYEsw7YhLUYf?UIKSA7ORUgab(A$+R}ej}+KyiL!Jd%wLQY+ov`@ zOHB@;h0KVX04d?{ zSdZcYR#UJw55F2o{e1-z#MURWX)!{%8IU5RRS>FWDZi=FL=Yw%*JFzcFpvFk5av&z zbvYiyFTs{5Ar|wu!Tgi3XC$ev~4UxZ1 zhSVLX$z_<1I>Y=rzGUhow4(Ixi5%n4nW&Tn?auG@eibcSd z8vfd&Nr?UcOC96Cg)MuIqKi$Z6(>cq;%-BtxML3B_M^g7B45Ub^dlgiDC0Vl zN4Lq~+fqKp=O$msYIfQn`kSKY?e?yu`sMZg=S^>HP|v7Qfy7_woQgD&d8T8 zrb1S`ptZ9+L~@(n1ZmjK^-^<#q`?QP^@q!O+MP1WLLB%-0>B# zSW zGtOZku_AsqEV=)NFTFGktX^l?9|J^N@}`V(;}_Ek&*@~pXBWc#CPT^vv+TtZADecN z@|bTiHUfR$vhP<7h_3MUIm%2~Lj8;4Ncrl{M49vzAm5vn8VjVO%CYKK5vXbLyrnG_ zW47FRfhz7PbVX4VdSws7{JaVIQh`u5fGt{h>^^^fRwUxm7?7iU39I3@d{31m@apnp ztSovGX8tA+<`Uv!hut{63X$$aLdr(ViVqNN$<&KjzAdMVE>9yV&K^YGwi!~Y;IXD8 zEBdyY`~I^mb7ogHlqIart)L3iMH&t92`9--`vH;(DOXVWTadbLKciRP;cMGO!DBvH zi6=vfe$4nQlslb9Pk$0ic53<&Za8{pSvB&dH8${mM&-ZGkK7vdAubGmjMdo9tftcI zm2~jc7{b)+OC&=U6uScR+mqx6s&ivZF!Kfe57^r0C>XKT)~aI_lyOM*My?6X|s}q+B)2`L2NMP^EMh zW00X8K^S*H^w*YMqug)nXzQpj^5hpIVH~!@mY;mMT3aJUqgZ>44$KT=*%1<%# zH{nlUjZA)_+{BGE!7+x+nrR}+g*~w47c=%yu&>+` z_bYn8ZYIf_Ya~+aUPyU}=nL43JrBk9@gY_-&|g+yw$ww4E@yr%RZeW8b}MF++(rh% zl@*+{v@lZc=nV^12$8xXdy`{F%oVX8X;GQfy9ihXd6_RY<^ANOpyOH zkL?>kw9aRqQ|{?z>ezn)_ACij;s;>MEA$tTN7Aa06{BaN@vTHCrvReseDnoXGPlwk zX&#ySjAFavAZ&S!#z)|>mk8w!zH8&z>hcYqzu5c!MwvX zju@3aLtJ+A2XkhFFc&e8Eij&9HK{+orqYpZG?C0B=?{E~vH@lN4ueyLLBukOTBM8` zmVmgV!j>TZ6ss-0`WuzRY-%%l9=YAkk8tNPZOqpS(&E9erA~#Mb)JJd_LWNhQ#iAz z{*5yCvgy>338XRv$~?AA8&(4M)Sm>U{h~&HgIyxbx5bj@NYN=)e?R*bWj1c3*V4kV za`hwf*Jx1(tkHsofytF1|8YM3RwB%w4UYBZPqUhUn3q`ZZ=*2{L-6<@BAN53?8pK# z0#c5s`LZ$VGe9;NEgN~KG4r>5P35p`YWFahxY5C2ay~*?PhdhulU(1gVapZXbMEJ8 z7QbOcx*5ePQZZTZ-dr45gbc`Vz>CxzVlGW{Ba#;Ji}^gYu-}k;tu-c$z(Dv3y>Ct z@K^(QY#O3J8=EYEO;)x~z!ib~#Ynsp|D+|D6HNh*eu8$!w)anc$3lYkf>M2httBdlt66MYZXxxP)vczrxk(J{peT{Lbk_z*AHIzTDU5G?B 
zq49P6wv^R3ZN!T5w+Cs?_o-xw^+2NRJB7G3!8&FIPT>Sdso=LfUkF>C8b!*p?y8itMSM(hq&2Xfa z29B*$Yx0sVNgzy5Tr>Q4Snuz)6{V?1sYCWAQrnXwOtb=zH5ZhcG@SKW7Vii6Ey+nL zEiK0!XIV{StA?Vq`jyt=Ma|`g1jRQ)$*`fEwip$qy(WRHwHwzdqhK# z>G>^9e)=WO#(ERE^c6>P z8g&7RIbz-a1qPCxDkS)b1=1V^)&PjU)q+N%(x;fFmTf{-cE=WUIcV7mhmgyW``c7o zMmAlH0_%%w9bd+3NP_%+$Iw-?-xw9-OT8AN{OcKdB>Wqa7}T9`EM}TJ?QlAp1X^x|$FliMo24L3 zCQhto@(Qc%BRYz5Nhuxpu9(Q7u7ufr6;j#@l9-HgkqKoo`GWZO5X%Isu)2M{TZ@dl zLbpVoAO|n>#4le%T%4g;GD2xNfY`{}J+y2V^-@cpg7wS9HlnihJS|vxn%rH3LDByv zJk~)_{1;;&u_%_#I|rlyvIz%vv-xwZ=D?3mqMT7qBU_Xcr`}zO9CH^Q>mYE87OPS$ z`xp1KQ&7iV8;(C;35d4MpmvCUCB2`1j+{P=3APxc1J@BXc`^F_-=NrXKF@9$rVA$& zC1*f%T_!q%faMP z7<&#vieB&7QQ`gYejVKD}Z&I&w4)Qyv*F;9Z6LZlbiQhSmnWA=WOv z31@r@TY3u2^%M-f`LHE{_w!hRHS%IW&I6*G-N;?!9@Wwf!7oYeLD-VOh|(Qb)%+Nr zW2%%98&-h)9#HZlD5iI9)&0;I0=!Uv`i1P1GY4DcYWX0e55@xj?is5aJ z;#KICy`WeIueVNvlkXUEeH-)TtbWl2NV)KW+DeznxQqP>W35AIJ@8`4S~QEFa72ru z^l2Ju@-#q7jNh}GgwdQR?R-gR?Yu_P2KZnb%34(XIfQa8qJJIveu#gwBMpQJ!}j2V zqVF;D59uRHwXbOHn482&HW7IUD>6(UYkiQCX>4NrJ3B~lZhquUV z(291io)ORK9}zj+7ET@@aKct_Oj2#}XIHB=8Lv1LUt~3N=lY1!C7qba*OA0dqlnC+ zcS-|M$Ffx^mbmx@u7;H6kn&B@MOG_T8AN5dUi3>~s7ugj!bI4KvLoJ2Sh*ejMTTMr z_}uNQ(ZwcXx2F}P=sFm9k=tt}?(e9jSuH;y@_01DG8}KV-w7!f;IZ9&Ts_MUb{P2f z3R3jW_f43TSTX8*J?$)wA@Zk<0Wspen4OsUZNcQ-{FUG}AdHn|g!8FnHSw1Pk@2o4 zmS(ZkZs=H?zFDN0a74HRx!)7Nyn~P2z6LW)5}NptqDofVDVK z%z!xYm(cgm4?-QY6;o0h(u3_o3HQPQiuu9GyV3Yog1zY|s_WF~;{d{ViPc_78H^dG zq4?~hhIGH@c*4=$kxyCTcd^b)H-+{9m$W;POMjAv}MBonW!VL43 zT4CzCrDIh1NUfGttmezTfSj-w3kO+I>#u{b7IPGn9}GoN+6`K6g2z_!cAuwXPF}4J z@Kvn#-tb`})2p!<*^#AN_Jk1SJ4bQJC>&|>yD-A7S0R(C(=jTztJ6gltBdXwAWA zX)t*sUwUUfmZE(?%gsfXS*`t7ftZtFFf8Y#fI~-ltYe3zI8+HULu3 zl?e;N zDsp=oiCsPKk%aO@B5PWUsXa&HrLdELJom92b5y_DU|EWm1f|)TuCSV{=YvG$=SJfB z^T$YVlLR6=wiZwC8Y#$*r!7pzOH&$?Ud*ucN+Z@{MORp@K7EkLtZgLjZ*~Z8s?8(J zOefKP_9#UE9Bk{4_urbDYBy$}^o>Sb{EM%!x=&w$mYMcqL0LX2U$_vP_HD$p31d+C z%T>kX3`f&Wjf|D}M9WIN*xPNA<-J4ixcJxz%WH`rKL7MiwdLD?y8Xxhd-eYSAwt?D literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata4_113.dta b/pandas/io/tests/data/stata4_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..9d7d5abb1b92156dfa03b076bc88c20cd9afcf71 GIT binary patch literal 1528 zcmd5+y>1gh5S}Ci5|B_78j2Jvq)2J7Y|904le_eJ26HB3d!_YVxI5b*A>lS9;stmS zBo{nD9w0@U^eLzyi8J5cIxP4HnTnNWc6N8>o7tJ!eY?6s>-3n4G5Y%Xa7DyAe;AOe zsG}%V)pDgaUULd=y88CX>d|;S#ujWa&dzt9SOGYmy(;h5Yc=|Ldvkr;>KXICVWK7* zJ|;I=BgcMIs8XTlE9m%5(mgSpvj^lWIPk_m=bc7ZXPJ)MnXk|?Nt72cAc)0d_f@If zb}GC3o?NNkFXrjsE)CN(#FkGx`m4gP%g#Y?_nUJn9`9XDdoOlEWmB$P+EhPEb<{Lm z!f}AohhU%}ctCUkMxbG^PBa3e%a24~f`^ucaguj9T>!f*;8bjCAP!h%NtcVWOhvMG z$tckM<|{!ySxZCP+DPjN#Q2o!Jxbq%J(olz#oC73QtAyo9b%&$K(T$6QbAO z5A6(#ij|6P0sd2=XF|_~c7$FC?FqdU0tjw~Mnmn;R?<*}Fky>;fSJTh`5>XZlQL5) z>7~@teWpymqm26;6YF*-r&vQ#mUS`u;~mWm7*4l^Fd$C?_5+&^03MJM{~^kYpXuSh GNy#011gh5dIPfe+7!5p#W)x6p02azDOvf$z4N1!7Es27~3nY?;_vXh6=Z-sHovl z+#?#EAPr9eM9g=$C+^}ObOjPCeX~2e^S3j*lkM(c4|mWU-@bm?;m)JxHt@0BueO(7 zuHtX{#itK1f#8{WA$=1Zc8^T=_#`;OkIP>_F9%cS-t}DE7bkDvr^*25hCqN)t}bAc z*Xj7oNiH=*bpda@H@EqyH?~=3lVMgD7`Qaf%NQ8M_Hl6BIUH^mX`UWDZ>!J^+G+gz zkXVGC>pMt6n-?VaSAjoO+F-#KX2WUS#j5vmCo;ubF}~hr7Ef&4chbSDYhHpg5r)@+ z3lb2Ea1U6L;QM>vb9kd5QIh5(nQSx*L@f=j#AfE|Kung7r98__th`S~r-*J_gli1y zE3fo51#T)?2h~OkgT<%NpJL8Zo8+}*wlknbFn@P4FyZE?!xM%F~|10qlLTUi pqa2B)pt2{bwDuoe37AgrDLCo@*8^J(fF78VwO{#)hyNxezX9{t7M%b9 literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata4_115.dta b/pandas/io/tests/data/stata4_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..2c68cfb393b9ec5defdbf78959bb9ea2b174a942 GIT binary patch literal 1713 
zcmd5+%}yIJ5FVhEpF@S(13hrCDj{(pN}?PJ7rYP9S0LKJB(db|qV*;LaUfR?eTPcC z2#=BrPtg)FpR))@DTx*d2}|FsXZ_81>=}<+U7>Z_rgCGopx>R|U1PdC_^nq07k;j4 z>Q|hpdbQS2)uZD^gZ@q)|4rI`Yp^S?%M~o^H4za6p!FxBF$4i4y-suwK@T@X zfAnWBq9o0`oGox$*5O=iYA749a%qpJYfi!Skwd<$OZg=5quMT7wids2o3~?0vr-2L!p6> zvXX)#gb7;&1fOZbln+zNyBRaJ(|$&6GhlM&k{k~>AvT?EPKg0g)MGJP`i_KyO}`2- W!hplT<^#YExRRw$SvVi&N}d1$bO#*( literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats new file mode 100644 index 0000000000000000000000000000000000000000..2c68cfb393b9ec5defdbf78959bb9ea2b174a942 GIT binary patch literal 1713 zcmd5+%}yIJ5FVhEpF@S(13hrCDj{(pN}?PJ7rYP9S0LKJB(db|qV*;LaUfR?eTPcC z2#=BrPtg)FpR))@DTx*d2}|FsXZ_81>=}<+U7>Z_rgCGopx>R|U1PdC_^nq07k;j4 z>Q|hpdbQS2)uZD^gZ@q)|4rI`Yp^S?%M~o^H4za6p!FxBF$4i4y-suwK@T@X zfAnWBq9o0`oGox$*5O=iYA749a%qpJYfi!Skwd<$OZg=5quMT7wids2o3~?0vr-2L!p6> zvXX)#gb7;&1fOZbln+zNyBRaJ(|$&6GhlM&k{k~>AvT?EPKg0g)MGJP`i_KyO}`2- W!hplT<^#YExRRw$SvVi&N}d1$bO#*( literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata4_117.dta b/pandas/io/tests/data/stata4_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..412c2c0d7b3569148266f331338aab76ff4b46fc GIT binary patch literal 2185 zcmd5;y>1jS5H>#n0ul;81wsg|B1H-Xx7iR05?Rh}6Cnkq;Q>~@JKvop@9t^sONan* zZ5{#9Q1KjTDJZCD)9?%snq$WH=9asJ=n5n(jc3Ll&wQS-y-T&>hNo@A8SM$)7A2#l zP=ag0qUb3ND}^@qjVMYqc=2+JhGUg(vpGV;ZAN$4bwWVgVN~&^Pz+=zP$M$|p|U=^ z6|Duypc|~NL{EZfZDVbnhT~ZpRwy#wXW-4Y_nzOq;Dk-jk3GMC(Ty)&@`j#$nsdTu z&(#&DzxMp?HK!kVPD6}JcX=s!Q;F2<_l0J^Mn6VBN5|09aAFEga}HZ$oq-Z=X(qLnc~{pBI;!BtVVI2W%q)I~m+GQ4 zN@{bGWHh1S$uR8{WyXzWOV1iB-FlewN%8hPD2y!MN6Ec<|*tV-_NwgqV)*MzGeCz{e-so+Ztaz#hxMAFZG|-VL&Cy$rX$ z+N;%UI-NX!W3Sl5mec*cB46g6zLMmH>51}8x3fE(!WQg701@aS6K+li9CV+t200Jm zDGuR6RqG)IkTu;P)xZo6?frvG8T1R8n_4nwb69(2e3@vg$4lNaqJ#@8Tbj{uYNQSC zID_{vNbpfa8g4@E77{F=En;RV35BBnh zZ;wXcqB?drK*WZwBP%)Z2;?Tv9iV$a4}cbdmVq7vVKy)YxNqDn?h-!%xF1}L$kAt! 
z3u5*PVtSMt!f&1WyM!s!U4R=z8+;ba2$t*=p1VGA9ieH9hxU+ zK9c>lzH0b6`)kgb`!0nBv3?$8g+Z75mpMn0Hi3 z0{0)818yiwTU!kQ=#i*lcK5{MiDYW&Jh7+z!u)@^2ocp#1IWDOT+o(q5iU9|!rG)w zbdjmZI&)R{X8Ax9V4d%+IFw2=EQM}}%4@=QiyLrtq;tf#F# z$Vwt?s2G`PM1a`PQCuA`<9%@y(-To_pp>4Dn;7dF3Js!mtYX!J{_&*V$h198YowHO zA(0yScsp3mOJTRTn7RK*S`yT$_CKx#I?A~V|(tQD8?jgoP8Z*7z8e_i5`=r~&XUck5UV%saKf~8Y7@iI0d<4GkhnrvQW7lIB zPyGhlvB{!rc+a`mPPiavMZiyF2 zj6aE7{7F3cm*_=L#z;JCLc-Al2NHp+L`_5l)|uHESXx>%CPpZev~S*f-+Mdr?Rz`B zN(l!_!8Txyg_px%4*fi@hKV6*FI)0q1NNtXT;t{fV=1ezv(jn9zR~Az#?4F!3whWg zDA4WVz0l=5;H?Hf=NA@@#cy1oUz08X7tf$MG7{I2zW^Ybh=&Vi2t^YD%8`GFiBuqp zR*`pz0$QXd=Jf+rQzCd3c?SjL-zESRB`c#HKBp^8O}{m_fE*%Nj&*yz>QWvRusHyh zfR&fdr~QL~0wk}xDu-ovs{*o(4&O2X*}hHvj~UPf6v7n>GYkW}w}0)Z#kh&tAwQOn z?bw<-HWTJ@0ya5Zt855qe^EP4W^o=ICB-=iK5W4mIjXZ8FhO?f#^Q-& zb{yMNZey-rE&`}g@Bkq5j&gw|;v!gdTm;2Q9o0p~K4UC;v==pHSKb<+Rct_9h{aYH z;sBSKGz=YaanFHja#h+ep1P!<#+s7Z>t52^{M$$|zW3PeCnmS9XsM`EcM zj7ssaCc~tp#3Gs|i7?O)+cZHFobW$S>JClo)3`@UDHj~-k&ky8%V{g@Wd}3WpKw_Q zYHy~}Jf8tb&ijdFc2}H8b|~MpjgWI|>-23R>+jk7FrYRqllPHZpRbemgX9h~?qT2& z(OKSS-6lR$)vTg$i_*aunmjpK6!O=lr(n_-fk3J6toQ`*e)9|AWA_fhK>a5z^}#MOZL(aAv%K8Ye1A- zBMO^(8sYvm0Le@`u_i-2nUOJvcTq9}(Ij4lXHjIlNThi70}~lKS%qg|0Pk%AP{WmE zB-J6mnD8WArDIbJz$&kA(}7^nTty5|S9Qo--wd8irPkq_+E%<}OU6ETz867k+a|u9 z-@EC#Fq5AM6Z}*a*dlbq+S1U`q2TKsHU+Q(;~#`M61yWmb z6aY1z(DdHEWGbD>t{lhrRGgR_SBohAok#&7_m*ivYOF=9vlfnvT-AcTREu6U zRE)l{L5I?UwNOeeE|f7LH*4D#*5an4#b*m#FV>>pFwwx^(D29&mMt`vR=d15ZjPub z0CQLik{8Y{XDqPiVWcVCe7q&n+IHgPsnhLe&UTzT-`UlD;o_ysJ)2jHIGD4Fw0M1` zRSMJw>kic)4vC;-w2;&}77wntSa=-i(sUC#|6iacn-D*^VN8<^IfhijJP6GUM4F7C zZa@q~nhI0$u$~%7!K9i_APr_!Bc&sxDlj?%JCTfJKm5;01Mzulp5%y+X(8Yo`Sc)C z$x`6}znGc*Tv*{?{x(+gH$ZkVOg;I%X&ik_0!RIRaC@;rDHqZE-35utkt`EmI{B-#FKK8$G@WgJiYzr^S z0w0+c`>+=H?5JTprKCj<&JlKAu-!XaNd7=Aag>r4m4CJ1=LV6$lEhv)ErseuVz~;B NEtiWnH$#s2{{X{R*%$x- literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata5_117.dta b/pandas/io/tests/data/stata5_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..afbd3b0e0afe31bef669b8e9398c623beff57d74 GIT binary patch literal 5366 zcmeHL%WoS+7@s9b^`SyS$^p1^qtL2IDBg7fA=PMUQ{t4SgqE~C%E64iWACQBUUzrg zCLpoyt&tE9i3=}@1Fr*r00#YQ=e2>$%Zx^`0GwDT;(W)%b50K7{G5a5 zoxKQgb>6`NDbv8=x>7LnL(01f8_xaqxOu!;Hw%_MVcOOqi+9qg#d$r7kTe_~O_ujS z4jW3D0a?CB*()F)z88QXZtYRZNKVObhLyYu@;8t-K;pdU6<;8+Ps#x;vE>F$oWHZb zvd!#fc7y-@i3~oe9)M_2CwzjAV5NjC8~WcW{5s?HI{<>HUfVH3HHZk0v3+r)rWpGU z0Kx~!-0cAgi+wSQZ3_UlpQfZpQG$~wW`UnDzM2PMopfNFZ&%`K6o-T*&i<(}tEK^W zBKx8NUyj{Px-c=3+zC4xvwtetT@N^~SA?f`(4-}~lS^t*b<{zZBF}rd=^P)DvBW_? z9F%zvhZbt0nQ5V0Kb<)V)>{k*8=eKxV{yu15#O+R{k`gX-mj1g6tSrlfcHi|gGSgQ#S7hdYIy$Rt=c=?X zqwnR-ZGerS-745a-_<1k4_y!8b$x4d8a)3o9zb3B^}LLi`zRU`E-m%-bg3&|J1{83 zFp!#a>q+I>5HclYglM(q`HSa*upTvf+vz#=6BlK^PNE>x`WCnll59<)$hPuImB$pIGz z-y=W2TO5I{@QA*c!T!wXae)6ANc|>h{4*^*n-A8d)R*#GLs#UzwP*buDUW_IdQkul zFM0Ak^7ij<$oD~GShf4O{E4)6+%FatzP0b1jH~e`^mF;m6^WOXUH>#5-&FDSJPv-Y z@WMWlW+@+3ihcwx9^zc^v*Q5786+;w@;w6Q1 qZcj~G!s;qIx4K+X?T0-whV$~X1oLw;5MM1IyFSK+x5w=K)Oj2C0b$Aj literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata6.csv b/pandas/io/tests/data/stata6.csv new file mode 100644 index 00000000..27a1dc64 --- /dev/null +++ b/pandas/io/tests/data/stata6.csv @@ -0,0 +1,6 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,1960-01-01,"a","a" +1,1,1,1,1,3014-12-31,"ab","b" +-1,-1,-1,-1,-1,2014-12-31,"abc","c" +100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. 
This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","d" +-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/io/tests/data/stata6_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..2e4795b167f266cf86afd73033bc1a43ae9afc5e GIT binary patch literal 2752 zcmXS9Vr1Z8U}b=S1A8YsF#?%ZsWs&c3Wf^4iA4%V28Jez%uoeT>cf$VV08@t|NjRG zf%JgDf~Bh5UB>f5omR2Sf48Q;U{rCUh7sjN@lGJzx14FPW2~Zx7f}H%k zbi5{{<>V&<&A<&(@=KF)fUdv^QWAlVEJ?vB&rn=alnHVLP5@Pa+nWpwYD}oQRW0=l z(xE~mvr`xldW}p>is9xIGr-uWwvAxk*QG32e4hvugQaCG(hRWl&i6Ug0i^E1k4|tt zSgOo0Y9KVgM#B&@PP?<<#M!}c3Y^v%p?p{(egNe&pG*g< z{{)kVx&S8M@WTo2CcK#gs%r1!La=!pbMwK~28@Okb&W{o!3~Bo5*Zm8Aa*p^Lxeuq zJE-mPNK9g2NKH%6$jr*l$<50zK-UNJ6J`hx-LOakrWz1nU})G64U7E@ALh6vCMA!! zut-jMQV&cQ4ay(>|1&OGVj<5ELH$kdkV=JiA9OYC8KPJJT!HB#1v!rt78RG2mX%jjRzY(MI4vSl}3&VdPD7`tI;j1s#T!}4{8B2iT%}=H?lmoHk zWTVysf7Y_?{ERX=Kz$eaw=e_%_%szPpJ zMP_bku0l>~UV2G}LP2U#Ze~eIYKlTqr9yB?Vo9Q&!XP&^rIE4W$^ZZLz_6%;gvFhD ihJ+MXV7f>_&Lf3I#U-U>Zp6cc4dZ z9C!c_XOQ{^Jwz%G5S1T++QzKyH8vFvqNh@r%lwaLKh5r#bt%vXy%15qu9Zuw*$z)W|04R8&iJkx7(@COOUY1LLk4bdgCiAp4jA!y&v$g`wg| zPEXXkrT-#9pnShWj}{8-Zfuf>p(4BA7n0ORE}d=n+=2iRw%~opB;WS~-@wUy_>MF$ zHm8F4?Ll}dC_#I|Py&%+*<+Im`cwRSAZMcC9RC^)X9JGHSy!YEvYV#VyASm+{zvSW zjl_QMB3XtVFNe(A|Gpa5>-*w#2d0?EE1{mMCsVE&Q51;2;Y~!w*2LuS-Yii8v?bjz zm)~1<*;%15E_;s5L2eL^ri&tw8yebV&9o~vT3T7>2CitXZE~(r zRI5102`3}?j~AtN#XI)>n!w_7gvGZSEYfK)7c_lGR$ay`A3m;C*X1P%XX&=(^>FWq Fb_Muvm<#{_ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata6_117.dta b/pandas/io/tests/data/stata6_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..bf4f3838e8be7d375d83dffe1ab3c06493f791f2 GIT binary patch literal 3490 zcmeHK&1)1f6wg@2UK9lHM*4A*&Td} z(2L;BKcUi-pn_*F9t!pkC@NY#bj;`OvSs^mleU53_g?bLOY-val0l3UPKrJ!7zLF0 zG{q>T5haY`eEu48t%AHdr%5Uaw`Zr2+fvaq?ghx5#%Kng0YHc|7)8XR5f-u*5D#N2 zhKj5BLVg6^s#V}{Ejs}2d~Fc9a#XZP(vWx&E%JJmGW<9DlWk|evtQ!-2^_Jg4ub4ZA!318 zdZh?*c{BcQ2rFE)??H%?N_kHUb5TNsjqb%yYI5#-5Qvy0_q!7qPeVD2uEh|#FH^Zl zy9BW$oOk-Bl|Gcm7zXeARot7T1!|x$DdpUkyA!rfI`V_z znPeq0jV|2Ld=OjyiNO;en&3D@*tFeSCNfbeASo$vnu`1!mKa9vt{Pvw51JQob5Jc1 z0X9lInzWi_nH0|dxUV98V|!Ia{+qrVTF0N2{tSFlH@9PSVESzsz2{?D@xR-4Gnj2} zY)_h+edc3W@4hW4d%drc9$974UQa7+_L_!`KpY3?H8;j})z~ ziYrOAv!7klQSyZ+_DL218tcXv>tO5U6!A*O&KD(rb6Lbiqi-wQ8R2iyr6+3{xZ@Y@ z1tEj_!2|=spt!hU^N_JCkR^}@gu^g}g2)33W3m*+wHQXUQsx1y(lib^r#|%Ra8^Fo zhv0v1)L(HLn_0FjeDPHK;_EVa;1@((`0Zz8A!WR__-LvA*qjy>OGhhTP~@r~nwMF`qz3{voigalBALcoA2M2YEup$?c3C&Q3V$Y5q@z`-Wn z)-K)EmTp6pny%Q>?%H&N@w@kafA9Cc-}`&-dmWzn*|Zan{?lbARmxGOveZCgqRMf!yKv84dw}InSwGE1tOrP zpY9scyJbi=s)%Y)VRaX-5nM-mT&&zK%B{g6lljDO zr-fOzTCaAh?f9z)9=p|A)vUH^oY)dmjks?f0q8uePg1Bq#9@S?gI)7*ca}nrBU`P; z3IUBn5G%z-iO&+OZ)neSeZ8=?{}Xee#W~q(w(sbKJmpq7*vyl-KH>Qx&Pu(AvRUn* z?r#i63)Tg0X>4u|?k@OaALSZ{dKYn64i39e??lOiMPVEGj`Jn@{EvFn4`eKQs@3lj zyP~n%sQ>g(Z@E_LaS!NoA2jg# z2V50LN|$^1Sy{u+8g{LNSYcQ+Jm3W}B$6p;c+~QtT~~PC4=IB^F1)&M_Vu%8FQ~ib zL!0NB%I0gX@LUCp58G^-(7(wSY`8OdP$3qB@iv2s79_(#ouKBzX%wc8N`Yn#y%BaD zmb;FqTw7o;v#Oit>4weMe8_4joL>mM>gKJi!zxAU8vJ3ZbKbQD&Pmr6B+Xa>=K(k8 zTudlPUQeUuSaQV*d(j)~+Ak_~I#B?Qnd3+S+sQ-S{#It~ zl)iN1hifO)bmK4Nc9$w3dDbEOoDJUTedteZzh;>B>xXH-Hf=j2rMGDNQ~1azDqVXv z;;MA`Fm}@T)1~R-NWmA~9+lUgmEd(+dyc+d-0soWOWG&s>-6@C`g&=5uD+^vrLQyE zvlD*<8!ncR?&(*#IqJ&qFj70F&c2zg-pKv5Qr~)2?c-_7!CgVlpDq6nN&YJHsO@(=&r=!y*py+HQ&yI zEVw^-^YcyncRky*{~_FaHtsw~)EJJx)pW=H{5^@oYBvgwZ{Pi>O99Sf_r_E|;M@{@ 
z)P?(lgzdfuV{oM_KGCtj!!^0K2lL9B@7UkIcIUzFJr88N@>JgR+v{id1LpK^T((br z^MfAFWr)OU0D%<`J_0=a2>PF&Sxfi3v;G6Pf@aeczgl8+4so)9Ya6GZXd?tqDcQ^n7{X7>DPOLMee=) z16q^Smbo78huMEl(UCdkX$Fu$9uN* zqyMFpe(rzn!O7VV{j%|D>eXKc?$x+(ZO$iOh&S;gzfW8Jg!xevBQG4g_vf=*Tc%HU z-G_U@9s3Kj#If5ROC087xrFa|N#XlsH_f{ol$o3HO#S(Jd~|2XdlAkJczoymG@OZ7 zt~&dT*MGk5uId$ay8T++ew{x5g>Grr(kC$IXpQ%{K8c}YC?#tN!$FTZu|Dui#v#W(HP(G)B$UcF&;LG`U`*Ka5QjN;;(s|pJWx5Z*RmKGPc zwzd}4b5v0vvYi8?#nmm59l@5^?ppzDJ|+}3#2N~*j6hF92dxc_fmqq3iOa&l-DLn_ z0o&2i*c>aWt`7w(A}!nNal39!q^0qmNOP>d$rp?+E9Q7iv!mOK{m#M+iz-Vso!k_t!EV|VWL557#Qa!i$H*M zQ&~yrf(6Tp^(`Tfh*LCH9}8AA)rSUg!gy?BFcfSl0~B+sQ7kq%E-N;OFEefD=EZF- zjX`iC2Dvhe4HBJp_W8hxG(>U7&9;>!@8DQr#ri;f6NLG~VM7W*wknK4J;3V;NcIJ0 zkQk7%}sdi9CRjCbUg8=uL0QZ)h<9AbqMw+3UeAfgXD zvM}>g;~mROrgP-BA4>7$69?TkRvh%@j#dlP4ULgdz25QjcejV3!LirP_4Av?z;>rH zHYwt}so`Tgqp@HE8-2k*?0l5`cSax?<gTLg++MvWR68nTL=Jp zl=p_@0NSY|@4XskxNR_t7r0=gQaDsHH23B&OyQoz)#2JpO1%pfAPq2hl%|yl3`%Nu z*v+vZ6WE-!aG-eRJ+JNcP+=d5FmAl%A;+u{xADKdx&DH*;u%pmR9WAJ8NYF1TD!gY z#v9HL6?IAicJIQm+v!CimBgX4AcnhSNfP^v-A>yLULAQLQi8%Q~x*4r|mJ_PU>Q&(rCW22@-sD~oAIS6c%2CW*pf zYFnfofKaLYC%+;lCM-Q?wZplb7=OQ0c=DKyx z2cO?~WNn<;#!6}%Gck4>GhTT*p>4OxTqf)6wuiDgE<34B<}zh!k9$Oqj#(8*V#(SjG_I$$8CfYd`nKVM66D%v4(}sLWAKRsTEX-k+~} z>NO43jS*8-;CStS)x7`f8ipGfO;w+mdUs*@p{J}CLcMfv$4k#F`i~WaN}Ok^b_;5j zhWgoEjp>8BEgTA|NO%5f9db z{2L9Gyqu|ySWvSy)bH(m?d${Bzo7BR)=(c-FD%-$_C*aNTkk;C4NrzD{{4(KhEN~* z)UUU{y65{EDy3njDkuYy!(G=M+AtkWVr)$(iz#=iTM_#JOTTXR0%@A9tp3$8)CXQ^AZ# ztQSMckM|?_%5<+9Z@@t#- z?28)1i!rVg<2lpR%r7;@daxb~FBGcR4U=!QP4h@5HPvqq(q=!4r~T|Ky}7sS^iY?P zlu@%Wv%JQzG&uHp%)V|nuPE?#Z9HcxpB9pBCb$~UnL6bn^~JQU3S(YLZG({JvsYX4 zQz@h@BxlpdbEX&Wi}-ZO;Pb(Vr%xkYNMF8WMBz|neHUhYJZCy4f~%o+JH04mYPi!U zN9U(+9?zND|3k;0B(xr$`{OxN>uJTAiA=*I6YW@9vq67BJf1WCljKa9b75{$a-4Gi z$I6-RdsjJA=3AL^vh%1WJId@AkIeI^94VPo<`SjGj2ZVqYRthgcPBMw%9wMrGo>q& zGNsP^t7J-@`B%x5I`gmiUy}jRnSYf`>6WBSsWbm7nNnx|HET%zRWhZ{{HtV2+mkY- z&it!nN}c&vkA~{Zzj}t`UnNs|Fey{&%)d&e)R}*kOsO;f3NB&?ocUMDlpe95N~YAA zf0azBGyj@1B>yUzQfK~EGNmT}Dw$Ge{#7!i&it!nN}c)F2}AO)ltVlJnmZ)_nmZ)_ znmZ)_nyaDO`PUih6A*%f$iJq^!0OK#R$|q2XJGXpU<8dGl7W@J@$(qi0WydmT~P7) zJ6TjJUdN9=j~y7m=S*3XgO6LGj8?flG$+)n-NGUB|h zG0+l;Mz+QZZr&9LHWlb~w1zCHSX<5yzI1z$ZZDGGSs~o>UXjWf4x%lpS&=h=$ij)P$$o-QRs@cq42i=+fm5huc46NPoPi^et^PXn0X4t$#*a+ z>Bq%T)$%Z0JAb&u=F7VJ0N!OpqT7T+=v#^M!i;+#Jihp-i}&sS+@^nbPUK3d52Fp= z^=U3vuU)m_D*08QN+3SXow!IP2GCF9Sa`69#}y!ZQoE}ipA+JP!Y}hnz)$K9YlJLd zX5wd1AZawHdD)X5JSzinHqhky5fzE|M>|!>f4a5X744L3cvH0V_pROTXlEcl633k@ zvL_tph`_A6eAYSAj0i174f8UnCy!IojFU-P&Cl zD=&?7R_!mF7D3%w!Ce_GFAaDJ(O#o!e`Sm<)*3F~p~#-M94q(^h5Y`0dxg8K75sgR zp9%TPODkhs&#X|F-M4zpsPy*9aEw*6y0)t-ZiU@bJWYE$t61#hJuMI}P!mYbf!Dxoo`G6Ku?foWNSUfrbFbj@Uo|Y7l|eZi@NKgZoBt z1#qd)1UG$ciJGpEw>&T2>+h+GmgfcXtD?1iZ3*7Td&!0JJbzE6FATK<+ttflvG98ABp=q0{ImsVSiP$q9o+4D2c?EpAv$x+RWO%@;u_t_DUal?Cap1iW0)8 z>8kXFyliD3=>mr?ZSAfo@pZH%!hX)GD1nW@m_Rnl2%d2#RP|`bZ@EA#~U%B1Ixal>9w5k?~$a zps3b#Rg@6vnyyHkCJ*)n#6>%aE{U!vA(6muMj*eok2APAT!ZUYl#s-#s9;#0$JHxJ zNG4`vh5dvH(*E1xX>Gs1-{=DOnrIR&Y9NUYd5Jp7l>2yZB+j--ob5s`wSxSWks+`2 z*L0D{ke5jEjwKSOoN=8%zQ0H2&;r!<5njAE4%`470anGZ)mBWp`JS0h#wfVKpw zq4Ws`yeCDxhP-5$*%4}#j3SmIdxlwo{6N0Y5cU(#crPVZ=2O&ax@0Xfk*yF%KxR$) zM8e~})Ee#$@#7wFXKMQ>gv6@0PxhQ6K#j-X3{wBcVt%xfo)Q;^fZa$EjvSO7@y)&=*BJ zc_p_gSJ)s(EjNK%K~<#XA_oYA$R?+DcSH=xoLqwN^iZIjq z(I{}9ST2szM2OhyAvMzjLQWooMYQ-o{N)y^RZ_1d1LA zog72QW84(irCf7w$sfumXR}5+|F10pDG&SUb*ZL=$GN0coSlJ0w*{^t@S>D(4aArd zU}xmKPFei-_xXW`@E_-yB5{;9Vw91jXLIjG(zS+6p&23-wA4a_P;AS&Xf-W2b-*dv zq7SqK)Bx7FFT_mrfxAcj@pTYW1`Z4gs-ohoMDpn!WOt;MxS3GRjo5CzAs7N-xiAJv z1ZcY}Bk)69*d!VqLoON0Ie=K(H&N7twmyUhL8`xTU}6d?N^1MOPm6#vv?Fo!7_`>J 
zK;mPKT^3u(f&pJgHlIis3@(1WDvJ69;zkoFY9pk|+L~0uTgXOFI4-;uQfX`H7^pd! zyDwRWo|$vFAtamH%sW~H_Bn}-cJir$PKt;M_a%5{*c0}14nuvzTei|m$P8k~5uEMs zS^PM+LLTLvXxvWaS4e`Gk0S|nMQ`HY#bM_d&p{Fin?I?IZ}GTj3t@N+z);J0QC(axd;o)SS=B=ZO$F6x_t9<)t+}p`}8oQE;Owf~+VY!IN>^`sAlE+INxD zcrTxh5(}w#;HEJQ~*~xeNp;T(r0n zDpE9Pj`%dlM_VD+q2z|I;Z1lRd(z2Qc^)HSv>yaRgeXTG8fWvoLB5jDbZo?iTEwR>*2F>Zc~IsFp@K2nM)&IE zDY~_IuN%A42h*n?+Gr6p)EqHVq{5BtA4dbq1m}FKO}~?}7z$`yOAAD{ z8Ef}}M}rUqx*mx)PFzLb1J5&n@S5eEquu`F0WaoVF1#W8X?dt`be+^MUggb&m|=OA zpy-j;z{h%nLLH+(F%;vj^W_A^oSuhoN$h73tT&UdA&8Sa%~X_dBqf5O94%I*k8=n~ zUM_GBaUq_3e?w!>IsP6Bervb)X}S#tYqXEvr@1SbgO^X`dAO=qtsu>J2pDT>sh(5< z`Z=MS*Jvl-`iL4(euy3Sw2M zE##nh4QeR$!)UYp-hi4zn=647`{_^UH$)qyMlChe8ttWl;waHc-bqkOU8eUk+DqQb z7zui5A-Ez7y&%^Y7cVtyhu(2$`fE%2{l}~J>$itm9%Q#kCxd)3m6U0bD`HIFM4Kd! zS(F^wHfg8qvX59(dsq`X#PHH;i6J+kDaVp4T!U-V!!axs`^B0q3u5poKior(7wMp` zh}6h<2DIEA>bSAr{{F?UMX3veQqLD4c666c>Zt)qQa?pv9f=?4Lg<_bgAc2)h!rK- z)TiaA1|Pv-pNKzkuIWNd2?*pSSEB^es(7E~8w>lD3bl?B8uHR+@cpZ3r|qXXT3+GE zJ2h>;`TiQ@xhrGgO||__LWzJ)C>cfZk=7%d#d+tOb6@~!|4GY^O^5j|)*HR3G zkuO^69PQTcP%Kj|;vwov!Qp5ME$7+qRj8a2?1`{SC`du2D6x+Ql{FDD#vsn;OHp?T zyu?kLFpNQ#sEN2(!V$woQQ;d+F#{YW^pUk>Fh>$gQ8=M-RF9TL7Q=mqY7T`rf#Ik# z4HWNvNuPP&+AU{lf*Bu}RddMSm-+a}p4xu#nj^y>Rqmy9lE{mscaZi$Fi(jXi2%;w z&M_BPb11abcS!azlQ{uz2${zC3za(T&1{{(tbjNA6@Op!E9YKkRGeFoBZPnmf9sRw zrRU~_nL#oTm&eXM!)#Vveqa_}*{+YjaBd>Z9IUdmwKp(JupD`_Mn$+;fP}+%9|}PG zsR@~K5Sn&~0bj;*z(HpB{aQ+>0 zk-`|>G%BS0{^ONVX2?gwMUIsKeq?ER$3_S)0Wr(yj8Mp5H>>r@Q4pdzk53gc5c{l8 z!elFczVT;&xYR2A`-9u@FZ|txe}nO3quvkxJA4VpkqH0~25rb>V1{EH%l}DudOolJ E|39OKD*ylh literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/test.xlsm b/pandas/io/tests/data/test.xlsm new file mode 100644 index 0000000000000000000000000000000000000000..4c873e55a5300797bd9055d29c7820e1c0526da2 GIT binary patch literal 45056 zcmeFXXINCr(kQ%eh!Q0S2}%;lk`xApjEDq5K|q)dNEp%}afE?Uau5+wP!LcN5CM^l zqydItLXaq+zzix`W|XjmVYrLE&)IvQ_nhy0-{;<+_bwiK)pS=^S5;S4cULbP3m82w zzz8q{0B{7z&p-O(8XW*=G5`P%zKQ6#6<}c;4kr*{=l(^zyS#V z|2_W~YoPnQg*8EitzD8L{_N7ZylR-Ck0_I%nf`$o{nMW`6qI}DnsFI zU0(S4hNaW-O7(VC&a?Z>oj!8hLSLirpu0^}P6F5~zWpx0zE>Q*EYoMt1mAWf%5`8Z z4W{(!4@)>`oqzw7kGJt?iJ#kRoi#Sx#uduK?qgxA$_St8c&pg57YwKG@}1be&y&G; zOr~xpNbBL9%vV2GL4n&d85Qf6MbgKV3l&ArCme~jx&JQr*?<&cZNYULf(%`P*$`+KZkKgTY=?L$qUpRF4uXvpUF_F#a`zc_C6yYmI?kmG)a{E`)NTx77P ze~7Zue)<0#_Wxop{hR0&w=Jy*DqIO$r$2S(M$gqr#M|Tta@j^OC&13GqTIRnUWFfN z(@iundcu7nwpe#=eyu-m!%Iass+6m~_CU4?%Z-ZAGgGD=e!F!|O#D1f32n7fv!@Sf zj&6)@Cq$n8b}7K|Q5_{61&mfuvqdiXS%O}H6hFJwo+sUz2X zAHQxmqyO=l$Zs1T<_q^8_p?i%zjw<*z=vog!)orWT+usd!~O69_Wd}erRdd2#{lfJ zE6113D8>-+U^2^s55Cs!Pzp zm($Dp+?M1kxu=};rpjE(Lt*peIa9tUrT6@`xvD-wfiXk)yq4E&jYprLsw^+2C2F! zaeTCip&e6fl&bD~h1_2sM^NKR6vepp_w!TwflxCVE^j z^EW#eWF_*&_ye)b)dMMaNT;uRa>&cQzIa&OQ}3ZgR&Sp{$nEO826=6P=d!QTB<9p% z&uY(BSMqfSM6AA^&ZySU|CTE4rWgGw%jC;U*aL;zNv~Jj-ry4DaGH_> z?YIs0|71Ak)aRXZkTFO>hQkS21Y|gWH<=r@mgp%Jwl>a>IId9ZFUK0D7^>40P%i+2 z%}SAO`3WA@YAK`J-U0;Lj?c?Q?e!WSx!aLnqP}c<4YL1Yc`xsGu=$?sooks5w%^VV ziqCA-`kldYm2jtP*bK^&*GsT_fuEftV(K=}a&>^QV@K-bds{0ESP_kig6|Gj4j-Kz z&cBBiO@1>D7--#)HGf}clW4^(zl9Lo#GL02w+|OSdr$wc`q5di-M+H)>-JOOrb59( z9~zMHy)TmzZ<~Ta&Sk&_12ycV7>%Xr`R>Ou##*?!z=N6}319Oi?=rstZ#{X+ea3c* z_S<_A5C7zT*3>sCN*^mnUk&f*NE;=+zo-B3+_NZ#w!O4ddaub3&wB2FyNh>c`p9w5 z4AbW>UyUW(t?)dx6Aem*zg89Oo*Tb%;7Pp3$CNf-zMkIf58OO=K}{tITm2~gnuV6T zhJB*XdXmW{xf$_=D8IW={;@wI=Z@-GGJEx1MFgHoZr=$CxDt*7`apVUr^5iU9P zm*{lv+-75Bp?rE?8X`X)c`@KY1*c98x|>ByozmfG)AsiAO7QGgacH*xJ&@@32Y=m! 
z?7@#703d_ew}+6>tNxxL`vK(69m|Y76|Tg!yW8RlGg?-I4U@(l)3-JUFBDsryRr+q zS0ZldJ{d&LsLp&48*=?^<^EOp&l55+-|LP#5L@eH9wmJ);C4Ice8x_8z3)^M=5aSn zOWI*9IexQicz*q7=|`ig?kN*yiDVVMM*`Qrb>CA~G7Q1jrmHNJRSX{vTJ-%UB8yox zx^1s8xrRT{U;Idt_#_Q?-RJxC)VD`3nu!u2Wm4;~G;Ws%uD%|k1AN(&qGDH7Ym<7I zTC)>sr|n-D9j?6~%}S|$BdK#js`!d0JI40on7*N;aT1T!S}#r0G3e1XyYs?3h%ayW zzOTqxco6CZ#}NtgFNtnh4AZAw<_fru|x&6j8!j<^{# zoX;hq$ggk5Xx%fbBM~-yvhjKM)%%{zq8ZMp3S)+kIawRht(}_SJ4GNhuebP1y~d?S zufN@R-4TB9(TMZCGUOoVIqO)vh~^DuW% zq~@(K%MxQQSZEV1i?+w@=es-!Uzt7B1kybN>zdBFFQ&w!&Mf$ywm+F*=OTqJK7h(T z?!b4;F}(aa;k03sbR6le+%dG=@#*6ymEXrd-_$zy#$G(<>DmupH+`wl`o5=qkBT>x zGilHf>fa4SKJ~a{24o!D2LV9vuYeKaUi2(>^t@vkUMTdb;e%kVe$-9lUu$J0=}a$p(#?L{dzP_Ob@TbNiPCYBAbh#` zwdEL2UZP~nnz5l5_{^zvYO|Om$P1m(|>cfUZ3>lD;HEy^qM*h4|3!5ZOcoFz%-w8YrR)b`Bku- zQ?jLlVeEug47Ige&%Aq`c-^9-NY$ES+f#@0&`1CEbKXk420!NqeX*)7_uBTp-vmE- zEk5E98#b+aM=@)!XFA68G_i8JxU^`uzm@v<=Wb{09mB(yPL@nH)(4(=6{piy)NU2f zd~fud^tE{Rrv=BuzFse7;d=KVh{YCDe0_eNA-88hUruM)K3Y_4X)Ib)Xh}0#v~9tC z_R0~Xz}s|-32ZH`AGuvN!d}}neqRP{8W)$lO|&Fn2CX;W;?T{$rzA${hRxTtX$-@KQ+8RYHy*c?!-lmW3Yw`D+pu3O&)v@@z(kx zf~6d$V5K?mTl4p`sLxNsYnLixD<7-N+Z8k_Yw3T+?&Xf{wED*uO}~68%;KX~U)W0P z%zdUo@u|=-+4%`X-A#ObP+F)3(C%DA@$_?{vi_$arox%*cZdl9rbYL&M8B_cp+24gp31+Ms{8S4#nBoq zV#14;q==eEp*>{7h0VgAeXMr&88)qU9+|`FZd$3e)aDAwQrPRwq@;`uF5POnqOV_U zPqNjse8`+SQq_@n|MP|wJn#XxzJ7>LTP=R;TieWz8g6r`e(v5QqoYz^&DEZ|KBrHX zIwC%F`4ijs&6XU^15$ieiT5b+>Tzg#3ZH3=D?DbACZKN0Vy zp#1ZW6gjZlH4jd0(;BDEODY@=QRwiJo8vWlO5HrT}`AuwZ3`bnqc-pSM|8mU##{G&!5lspG_`f zsF8m8{MXwrC$1AapWU~+!PN6)G_dlDXXzjsINLv`op04*r?@Tza2SzCCT+(Ymjll` zzkJXAiuPdq^YamT4FTj3Uei0IXw-L9 zZtH_JB4&xnkfC28lJYAHS)hBB($j~J`S}qx!;i5^+(Mm|YPP&NsF%^HWpu;ulu+_d z^L5FyqAZ862WBkYOg%iiD(9tlDXr1Jiy>9f2W@33mAqO(f#6+H<# z*YYBta}##WPi$H%$#{O!>rHBZId$@i;ne*ngBm}dgf^kgx>Z9mh{i-!^JK$qQG`VI z+~?>sqOK9BV@+Qq6TaJxbd6Y;7jir>&Sm-hP_^50>8Ft4YEW$yXZ^{aSKEv@T%>sE zy~O5r;!ml&_VFZm6u!~B>3E;)e9=TtGRp2y_7%}Gi6M`!lS^zVcl7S-9H1&P?8+p@ zd}ZHJ%Ij6KJr+4Te3doC)$}_#Eva@f-D=g8g>RD+adU5Gs|zi)>r=n#GUdg)-H=ZVmVcBmjaSvrXI!ZVUt|XzZICt@vanXuA9E~v8`d^ zY9FiCwB(zM)|NsqrTeFyyAG(YbUn;gOR};(+&#n`vvekrm<~m=@LXLc-fL28OviSB^iK>^xDUs$NxNQ*AtOc@dwg8JT6LGzk3Kri)S~|QWo?F|>=zyrV`X8nM|63~(QHH}t9pddabi~&Z(Yo_ zDUJSyt6w+ADu(Z@)v>+5>5*-EayffDT&C(ItJdz57oG6C?_e{n4I=aJ*HWK@X~D%0 zdiT^E)`I~pWs(3^45rPtxpV@@NrMSP$%!s>%k7Jzw-kenSgWpMo7^?`Sc9ML zHEMW=-K$BAdpKF@A$3ea_BeO^ih=WqiHO7+--h*zcNOd|pLllp`Ps?|Sn%nK$gPgB z&v7tbgJ*exx9jQRbn0~mgwNK^ZinnbO zli{}b#l5G@_jg?%35@8?Bu%Hz{*3#|X<&~Wa$!GlTI=+VAFv~)@H4f>1B&d|JbJGc=`Z@E9WLUdKI7m#^q^ zSTb;+4c18G@xoiZ<}ElY`Xs9B^7u8*(g|ys)mXn;t3o&R+tNFi;}^&$5jftnAGvFg zr++Bc82G8?`y|Zft_{u}^uj3mI=nT+Z1@umyWX)Wws2_*`06o`qZRxRmE%()AL@SM&XsJSx{NAiPhFy1Mf8@J0=W%$xBD~2WV3f^?p9mctRsJH^-mss#Ii<-Jtvrb1S`SRo%6#mma}gQKczV!e zSZ2WKzAJ0BvBBf5+mH9Ajb2Y*ta0vU^iP3n=iXO)aDSpD=LZ-3eu<*L1}Dn6tKhTq!lKz39kDs5v>Yx19H+25FEb85n zkQeyR0%1{J@;?h7$=80HsxJ7_P+>r^Iec(yGBeTOzBK}F-|$+DgS?ZFs(=WI&~jM! 
z%+zYA*Acj`^qswJ)3HyRaoD{K7u$jzX|M+MeZb6AMTcqTBH?P$7{Qz40&GP zR(X|CS^M?Zg?u+%1Mg|=kO}v(kMn>WP}1-oYxn^btMWDGO3H=Smp!|~vF}K-nC!w$ z`<$rIrpES99)!03uVT2VG`GGy)s<}3h?r54rF0@o=8sk4^u@6}p~X8YDS|=8PkPPR zuat}RN&dzwTQ(#`QPGXBjQSXFeCeR6=NayjQTEwy^iK|1){*2g0nRJ1`G>1vPu8eQ z#UYYKIjtca35LfkrFj%Yd$6ODGDjcZ7`|g+a*X2jB=7a<*bSC5M~|b<2$`P}YC{hj zo$fwn5&!MH?=RR1ve~EgzAD%4Ea)u2D+}FnWZijkD>nK4^R{JoWataU z?+>TozhAMDCyMn5nsLEZgt!AECDNO2v8_GPezYga&+&o1dY_k~5yx%VdY!(ZN*A-G z&+&RK&x(&i*&V8;>IZZ;pP5NqU^tiRCvDe!|Au~^nDNmv?!+oDvLD!d-StO_Ma)K* zw1)Po#l*{aF!lT)#ip_38O6>>iBOyKQXN5$x@}@NQqoMhbAuIUBV{{}-N;kpljf2) zF&|z>1lxRGe+|j8e-_9O=1MY2J%9Z8*UB?5wCC>KGPGqs#n8NPw_sUK$v~S`=B2es zhDhme83adD#uB!T*r*YcNVfGKSsfQV+kcklw5Pkbo*jrh$9yE^%t>!V?0EkD3mFu% z>p`)((l)J(;p=A0Y=m>?&Y%`lv{gmfTt;Mf=>t(;#E(~`M>5Zd95HMR@jmoU8BbT)~>%;I(oe-g-ytooFt?yx_>M zYWd^=s`_}He$9oh{*DbHQT;zvbNJgmd>j6gZM!AlQWoFB_oxb7lIMh*i@;AC6-687 z`=-jVs29{oSlzdtXn&V2@xvnUEiDcex2JAdIxnTNy7yGhFR&cvb9u)1;q)Ec`w**? zn|i{_fgGniv-`eDYkjuK*);m>Zr^@(4H3hlH=!7t>_c=!7E{Y7YrcyH~<{r_6jz=@CW_@c%;vH2MFee>Mx^0&Cma%J~GhV${K>9z6T!r zp)L05?Bl%h2r+_m2^@vsX&+DX{W*sCfdQ92jZGj}0fNPR!cPB#KI}|LxZQsJ?{p%4 zjI1CSq5})|bvt7N!NL%n9E`Tz9}D6c_SnKCCw=)aD;Nz!n1it@g))1DdXbF}DtB1A0Uc#DT?*Iu{2doo@hxNj$U@d>r{Lgd$2bDkP{&#uz>-MGHpM`&p z_jg@;0{$F^93mW&9L5~t9MT+T|HP8u5a3YdIL#powTS#VZ~lL^|BEarz!B2bKehUw z$J+M*6mS9J1O-GuxWSNC_ca^{T!pCq-B13(DZLoID!uuCvD%aSw){^n7)=-Rkyjb-#Bw#J4}h zw`Zs)23k7+zz7|5H5ldX6Dn~+MMX=(09wjCCCmdoloTY~{QV_5(e)AZinPbgk7-3EZ&%)Pyx$9sFbMG*Z@0MOyTM+8Dm zFeonou(aHd`p_3-^mM`ibnEduXL@uZf5N78$DjbdUw;4ql28EsKXCSA_5QOTU+MmQ zA9#4+pBQw&L)brI;K4rX-V|W8zkB#UdiII@9}(E^yw?F7Vg?@20Wi8lz=1<_utRiv z-2ikgNXPI8?R|FlAG!lDdIm-&W|o7j5JEi{aDWa5J3tR(V4#N-K^G0R1N4U&xR0nB zF!G#pV>)_;_r&$gN6eCj)t!8{AIMT_?!mDv2l)jAg@mPLWRJ-mSJ%+gI(bUl$k@cx z?6mnAyYm<99iZv;xa{fW?St|S2@MO6z+8=ti%&>QN=`|=optAKcFw)~xsQvU6ql4f zEqhi|TUX!E*z~fwi_qQE+xP1Ao1x*6(XsK56O%;J+?TK4<`)*1C_jF#Z)|RXzqa?~ z+L!a6&wrE)QUcupdU_Z=)4p7E2O{li5rB0Nl?2GohWdAe4V*j@!`%|#LPS}qdh!O5TG|^?v?Y}<;UsoE%zsC95bZ8Jeb#boQl$s{pnX#x-s7*@?sKW_06Yp zl`3C!>5}!)9yYE;{-J*L=Rui0+>-FyI){%R&2|r>Q~zZ9S($ne`GpbLH^5>+eoL5P zp3mu`3D>>^vnYN9G0u^r{AK*bj~A_Sw14@yPOGuyG`;ES{KLARoOJ zlMm7O-xK41Cajn>MTeL{sMK--wKxiAGgv1~DzJ_XL-iL|>ztYG&J8jp$mwUOZ9khA5M|IX*`qO zdw}F1-M=Hn=*M&-_^b4^Xde&)G!d+25-}rLJzd-!tgHZ$1kC~%czH7W4xDM{3}%^l zS&iMSBizPUjCVKyE>{ZLzNTtWyaXsYsT=0ssYqgZ5_!bI`5FSb2N12W$M%57A+rNi z*}4Im*+UqOcfKoQTUL(^(qz=xJ>W|u@B?uZ$JQ>oP{2Eyod1pm`x&?S+eoVZ!1ZuqzK2hXz7It`tr5!+)OH{Q4yuKOU>Wgm)4`bn7u}Hq$9wd6$!jSJ?P^Xa{@m{wZqKGr)@0Jmxd3gEsnDjIud8Ny zvMVtGqur)oru4n!@o&zqa;&u=wP@}&EA4ZyBthjlaUNB)TBYG}UktLR>N=(%Gx{*htcBXz!s{=Xcu+mE= zx}D)$0o1OyHM#=b;xE=xSH6SAvBdCLebh~%wklP~b zRaB{rJwRO@zTl{1&j)}p|qGGE=e@S)S*_QG}gR3M3VXn^I4H%rOWerx=W@LcN*;;8x@QzHGDoMoc2z{m1W<$l3=X}Fphm$C#W{+~{A z*cD@sTd+4Yx|pG6*;R28XS%B}h$FRG&71>8C>)TBZzVf-<2fXU0^nBJ5s@VsoEcbH z7-!nra5tWP1I>%^>!ophXgxTCSF;Qv*1oVs8! 
z5zW1iO=o)lA9ip53W* zKdQQLQ{x>of_M~OB<78)(AdTlGP#|3Bt~&DqH!`KG3M~J&sGkO0kR$z5Z%WvlR+%J z(S8(6CtASDFaxI0WO_J&X|o81MKRUu=t4roL-`b>GA>ESV^9){%t+0bV~^g`NI2I0;)qB8Ht&tQv{ zpYsgIa(JT!X9Bh?cY5w@bD|q>a;qTTco&)qwGZbXYo~ za>^dSQwn;QP^AakkAcIT?et_hvO`x?RdTLoA&wZJ)uT173^vZL9l~VU)sgeZ?#>~2 zov|m$jW@nyb%-uV^U*LJjLgWY|6$mQ_Zc$GlOewLnG=RDrt|fpkDFJdzoAo}@VspSdPZzwE{TQ(R^jp*C z<9$M>KRX)b%%kXG6SxMr=N@3_Sw-X1#8WFFm-XmHuFE(H#?oClP87!tKJ(sFYv~W>W_Mbb^ht?}9ah{$* zg_fTgX-C zES`d362AaiJ}TtFTGzY;@x)4PB!CH5Px!up=hrMod-JE2EP5@CiH0xzI6X(+_&Qpj z*cv*ulU(@#RzyLp%FUt~7POgCu@|a9m=Spz7?PzoTNlYv(!PqV=x#V~0>mYl0UEt%+|s(<3i zZY#q3WUJ~Nr!b;d`TBfU0YEV+jBWLN-bYCzH1a!frf$e7KOR<>FRY!k(CGPAvh7;6 zd(Zz!YvV|k$+%Zd0ZEk8;r1(7D;T$9gFOja6FJ$j2H^cU&V+U*vTVA{Fp->e18YKY zAZ)sdP)?1imu)Pw)G1z?(k&?S)Vv-LMiLmQl+LkgAa4{;w?#xSW}>b{aWS+pZQdel zrN+o$(q{V-sV3xk7qFJ7s1cUIgVi2tms!YZSNx#fn`V~3rd)T=e=_vuBeb=2B(;H?MC$>Sn8~kV_3-9kLCshK_dsSp4g`2 zNN$V^O_!NMj={~UIAm%#d8_nW< z2v&?%FYcfQf(gI2J+Sb~)5R-OOEX*w8tKR$H4{?&?Kh5Jb;~`7Q^>Xb&3n!5(oWA%%PC2be{ZyEAxOb6li*f;_(& zvH4LGfU%z4t=3~(cG`-?suoerK*Tdl9LXxdlY|BqBXK==PM3btXC}M)G3C%U3$tui zhx!6trvXCy(dMDwZjM{=wiUs|%-YyNf^Pt_V<8Q%k#FCh-~^=4eJniZm6 zP>$v4m7^r-XA#OsPt}vLMw(e6WyNU*o?pS>Z^PXKPp387Z^2csdXns(;(nW+Tv7wV zn+H}s$W?^NzKlBdklq>A^?q-nYk~&}nPRVnC2$nMAq?nwL zGVtvyib6YUvx}^C(%IW97M^p0N?6&aglP6Blin$n=ukxi#d-_*mW%q@zw{JPcpj|K z^Q9R2CbIX9br*r&Ll_IEQK~qF$B?3%m_x#IUMW>#N}sV~Y1 zM0!qOl#kxT6iCqhY)n9Sh##OTf#c6&hH_Zv(%ycS?86DRYoJ`kDc0RM`Y9?L#LXdZ zIGKDar0DcChvr%nJ~Q=1gl|PMe5>{5D4fFyfu)2=_JX3Qu#&}wh+7w39f4?3^3-M~ z*{m1AyhPIj9jmAa%1j(w_i9urnAF|O)E#hq&VszVnO8FDr!|&KtwoXbQZyRUo}#v= zl*&X)c6rVXZ#=z`>$02wW%lsVm`@1c$q-z$s!~G8r1=vKYSozUASfB?(w2E^4sj^y`u?+>Q=0x9Fg6*=>q}IU>5HUV=H~ zViBTdt#VR~^c^tMJE)y`e3l~J7xz3gAjTp<>~h^mCDsuYMtu9$1SL~D7Vz9(DO+gv z?)d}O>@ib#I=DwA#*Q$}2bPn)UB_A>ya9etyUz-2eE4Qsn*2Q!?miiF0?V1xG1-oP3L>p2 zXK{jXy-DNk&5RPq<>sEW_X=r~C0P%BoxD>tEKN6cqUEi~@5`Tnxj(kqVHbusgKA?Mp-?7<+)&6C} z2dcrQkdj%_MoXTHz@-44^Ubki`kYz~yLt62F|vi}(;O6m9(>|tX?j46z>+vGR&rK3 z#g(g;EYS0IEipH`ppY1C-7D;nq9=yQyEIhK0@v~`0nLyt(7x50-UXA_6=|RO7?T)a z#eSmyF|aj|f)G|8h->F*qs;V}k_)fFMN2;|jyc2}r66wlhwkd4H_{f=`z5Us@+?tR zVY<$?%?-orZ6|s~@{P{5Hzkg}nY{Ytyu@Jv0_ROU$2uY@=HQ177Obyw-2mIdrk48v z(YhzVcJPw(gvnvN*GL_nVH(v)UbFl9`QztLlkb~u3q3IuyoMd0-2=F$_W;60Ov{{i zH&WPV`V@JzB&!mGB)Vq#kvTVS)>81DdjQPu+(MScBNUfjLI}}h1dA+Rc>N6g*|Q~V zFitfQeV+dK;-|%F@xj&54!RBR)EI3nd|W+ZDD)THcf7GVF-5eg6||!`oNZ8Qp3ay1 ziQcegnE2?-p4l+|@@0zSW^cviP-CV6dNj+5o?m)Nm;;}KZAevkvD$?2fwXs|IFxHX+pKNIKE0K9XcXIa@O{$Y5 zp0*|1Q@}jds(f{JU@C`zqpw><619534c1+w%}sEIh`Twhn>D9UWQiZHQr`m-C$|#q z2<;q={UG^Ig#yD6H!;K+(KyIet8_8OI*4*HDxA1re=M-x%q9LrR8_fik z+kdFL+eFy|`oxo6bt$^|o>TRoWi1A&YFDoMRe~bcI~@6*M4xMNpvWd4xJ+J}BWk7B zS*IX4!f+|#wOUDf3|M(kpu~5EiFV2f!`h2v)nmkJ@0RNcIQroPY~P>SC>KwTksQSN zvEv+@tXZTtuRmVaV;Qcj^lGc7*q6%Oq+3GrA>0SJeG#?uD^=t)qE@%OCIz`d+(Cd- zo4&1MG162iF#jqM&x=tbp*!-FV_g*}@!HBstHL{$6w|~>*E|xM!)MslWziu!pd6#O z2iUzxI&T@%4a*`US9q#0;yv2ytaF^~m;s_g5>{@f#mH08s|K^; zNi+@$M;zr%fTjp5wB_%;t{hu0DJ90!Ry|I&e6l zoJ>XXJo`GHZn6C^xd@YYTKIc4dt(xn-0Izcgv_!db(I13KOAJ6;eN}|C?Gpn7 zdr0D<;P!Io6^!%d?U#cw5?~dHsW&45C(J`AfRlHYVT5-28t@X9orFuovBt<_d55uz z6r>@!F;&kQ$}HP&%M9>+lqCnu%(Yb+Ng#mG>`SNS|iNFiU_C7L7%vuqB~67}EILPD(QQsw&-sX$j|vzFKX6(`jlX^TwgSV` zm!o~wsk-3~YRETx(@D4T%XYA%2Agn!nl~+ zcwH6$w9%6VSD}%L7(uE=134$DRjc^(3#R@8D9s$}i$zjqsFGTfrI@!CWX?pa0C8s0td}N89#4;v z#$KR^-^2#&m{|}DlbdfodW2D%Qyqd|D9cmkpsLn=z_M3h#_UMSe0zZ3gv{M13lD{G zhjH$2tJ2h{lAs+iJ@_p#Vr78rOw7ojiGnA63b;T@CH8chHJG%c3!Nxoi@^~6QMQpZ-{R~pHi%40^F>8WIL7@@jRbUGWv z4`HFCJd0~ml@msXxV3s{c8vw&Z9IziLL}g8p_39PyYDs(HLXnHqvb!wYK~xy=9J^w 
z)uQlmNIq!66YQBqh<;qMcRfI!Pf@z2cW<`<=K<|nurP3y;xXsiGWQy8@EFV@h}XHK zSGMQPqSG2eFy%Jj8g%fT;dge%lLPC~A!Ph){aV~(^1B$`F%l?Q4B{-n33x89n=@ss zug6tv_|vtRLLV{p50Zm*7bleu1!kBJXEs2mOm3yGwFy=el7sg}gE>TnGLzm_7|>tC z`t6uf502`|_)Uw0dE_aQaz;is5=KRW&XlAsy#rv;6ROQv45LyJdHKx#@uSEI6JkOh z-^_{2>uwjPXx2uAkra<;M5W<)4T_g}P+DXo-8&D(-LWDA)Z@4$$M-lQJ9!U48kOps zxqj@`t=1P8a>O2^yxjvjjaKwjd~j?U?2^RQ3<5ft#!Jab+fd{oj>jnkYUGFMiC$W8 z>P@P|D0e%=XoIgqNFo{RpPeo_Vb2eeYQSiehdNEP(7Jhqy%J>l)Q)&fuG~~BY*=tX ztRRZNUDG)ZeF^lC^K~I8=g_Hq6UQHhr@KT|CYxnoqtPgG#9UHByQsyiwa8)&yEx+> z;5)+$X3h4@cN9*-dKAkrQ=LPaaS-oN=AgSSt6Te$n8GMrN31BScnb8eu@>wq6=9(?LXjSsIyMb}QB|l( z&(f}1=&*dccJ%v~k(KAlfvsi~VPz)28W8fgBV(=RvPpo#BBC~_?=w>$vYzjqI7_=0 z%E$>s?u39LJz`2^Tu+qcG3?bm60#T1yo`I%kVQ@m(3#`NssMxcfLVu*HXT>8LUZCH zCW6tdNK^_{+3o4l*kncHS-Ov))eOm@8-2jV2{SjV)t!+#S-O(2!wWLaj@9m*M1pui zyD-JO$9mF(G!ToGuE3gosLv2a+_a95@Ii25b!M3Z;Eu|v8iE`dl&C%v(??HJ)bnm} z7PgA)D9}diBXd^v00n5Xqc*whWYX8!W=4G9h9j+UfVfV4O-AwxE|61%>a+VyU5#i! zdE323rXHnet;~?DIjBOQxi|YjcY0KhD?rxEpA|ne0bW9C6ayiyEW39QC`)<5v>-AW zUyJWU@MD!eJV$uH!ug@&g0L6K?~6&e$(%WKf}S{SJkqS!`lc%@il#a7(N9c(A^@!d z$>OyFH@?e`V=ZPaEqeHA2V%F82hFqyZ*}z+V2k3D9-HC23q>}bCV>s6 zloHmiwM&yTdNU4!k|fuhC*WiBq{aE&2PAStS4Qe~Y3c@aYoj^d9ggRqDYDzH*-)G? zi90q!8kJ~E@UC|Wvj36D?nxLY(QIXB#{;80x0TWAM_LsiQE966WZ6{>*=}fa%tFBv zRt@wteQ3wQDDus|%|Nns=L|cz)1wu0xBz zRxYBug(O@OlsLw7V;L!k z6>lOUX$HE3@D9)vMfAO}B;u1badq{HWuzuAB(qPbE{YLxtR9E6kOtyu^c1F5yEz0OmK8La3qV_qwOWw-h#bYUtQO7)jv2Ejn8Myj z`#5OHXhZb1YYf9x-AK^Uk@XD4hL9NM%${yByg&9&1FBk8d9ffU8oNx^mySiZoIDmi3N|)SU+4*0bX)jXO9;tnqGg z*o>HYi8)z-SeUM-usE62CF)1oO3muQv*{g#a*YD3a&E;aasIkp)r~!X4eLsYNUT%l zr;36B#HeJdA4$iW-jW!gJ#5l&Q`EY*UGE;Ozg-#EiJ)m3kvW|x%3TOLFs}>A6C;Pc z2$ol2aGP!)G<5PqhehRxP5&E@YX$5p->`TUm zK>KwE65@vY5S5Pv(=olmjKw?0!SXKQzOES&KRwx6aCf#m;{X^l--%?0Yf;L3v}Ty~c)_>C z#-w&_0}rC587U`G`UBj2=fVtV%PH)WNkxpb8q8X=E;Ki$!j1T<2)}WbBA~@GZl->` zD`~eX5a-+PZA^i#2fC4I&?Of(fQS@YfR1R&X>|iUzMSNr#O$QGUsN^X3>PX=9}Jic zKy+`bEf(;R^$JTToEHx{A)OVR9@un}3J+b2skC95^QQ6&%%WG(6sx{7r|WnqRfdfp zaHv&TVqszogNrLwd^F}T^&+J&7TPq?-XQsCM_j<+xl8s1lI@@cN#0rr=L_x-#v`3T zr>!1gT&j9TlBoq#^Ki!rd;oOR-vjEIdh5x8gL=GJhw%*@Ggb-wOm6AO9Y~^v%~ml_ zfm)kYEfmgeV|my1tf@9it0Qpvi>JX@ERd zRY5&j4Z8Rdty#eG&PF`X#w)l}Q7@0;_flLH*jbz@kQ-#BaU+EMgp&v}Vi;TH2|{=CCFEg$$V+DfIM%wyV-&!& ziq0ZTG5p-zMJ$5{aZ$2vC=4Nlx+V?>EoL41pp7*K%023Ff^<73U_@6fv?VabdEwJ% zSYiYh#7>c`VyWgteC#ySh#rp%)^VuSb5>M*Ghu|va%C1&M<2wzHFwsv=q->L(No5} zC&I5*ftr1GBmQuMV@%~#dXRI+3R3IRZV?nx=Zz>>1S{O3BxaN`z!^EZ&h(*x(4%(> zbc-g7DRY7u=9KArl6dN55Lq^!YIda<^yr1o32Za9@souqR8Hb9x;s)6U-~wZZbyvFt8tP(7ZU9GD+!0w<={ydYk7T9VX64V@CLAQ!v zYp261j<<_p$~E|AAEanTBTI+WcTwl0?@dK3mk^OI6mD=N?Br(9GFAjYcx z=;1zg5YUqugqslefI~6U*|^$~8FqLyQ9L1I7I!0j+lQixM#2S)qm{$jq!aK-whWT87pDUHJ|xBzf-&rZkr(# zx?iqdR;npCM31zY!-`2|N7;w0dw~yWlQczGQ~IO2r-?gvFsoY^_P3ahl0QRD;l@3^ z%CLU$h1~VdfnVNM1Ys$fDfE^yvj4#uU8&LS)ytpO{Bf~?x$(%sZ%m#%!|5n!n9=qV z?p}Jg(K4mm7=GkmDGflrIJ5XyVBOz>KX0}In2d%}jo0QW=q@?spV-}mJDf)CjER~t zT>f=m9eb^U<=6jBFDFjIl?ZY~K}j0-q8*=b!M#aIBKxQNpG5ooYIv zQRy&ePRw5z?5w$mT`p`-C6oHk3$&KZ%~MjV$cS;jsQLVv>hh@TChw9s@v#W)I+=hL z$T|pSY@7JY%I7(H?!HAugUeiX9nDme?}i%33PcQza#JLrJSP< zo2uutNEBEKNeU$KoMD-aBM2+-Ar$nX+II4ioLrvI+%M>HeG*gCsa!1#P6}-ySjNEYy8G;br2DK5 zBE&bJWR3Vj;yS%I6Auw?Y@6d30u9f8O<}irLWLcACl#^zyKU;oAXux&uVBx+kC`SX z(jy~t=ELaEEW^ORHUlB%0JAO8v5vhf`bJ_M$zcWW^c@Yv#D1e|-J?3V;@@L`?Wo1jY`Sl&<2oVyhtUCX&MkI2G7#7Q2Tot8i$&N~ z2;(cH;tsNyv5Z;UPB4`ua_umDxf(A$i%@b!_6Q-QhwMBN?I}b&OQ84rWaKYOr1K3_ z^+yPAeKL|MiOM9bVunV>d(On%nXR?LhZvrb0`V=>43{lnNHX~D<+e8%H={>Y@7+U@ z`f7C=-;{ZdQy{SynZZw}PRu87W$jJQgY&s8bA~2J6#H}nnJQZ&=@+;c?(%8gutj;S 
z0ryX(lq2-1DxH?8`Gl#_RYJ51scz%hvA40D`UTYMO{wj#XPi1g4HXcI*^5zzCRUy) za;?F>M&wyy{A4I|4;HLRqETs1`=S0Ab%4FJk?l5^B&aoq@%bY6Pq?KH=r8G2>|beC ztYEmZ?Sur+4#0Mx)HJk>?SuZQ9ZVEj3fgOX-@_;E98p(f`0@iNH(RX&c!sTs>q zzJ|nyMl8bG$=0ZuS~9$Y!7U|kgL9=_G+j5$xz66$LDyWF77#5*nPHlo`=go}hQk?(3<1Zd@^`E28cH!AL8(Cxno!!2CW`Je{GUYEty@Uz4|=!$M>^bU02 zBja1g`ewU35Y>gb7;nZLQx8xZ4`ZJ(oQ zC!2X~^AXK&+82E!ztV>od!kqUTjul!Y2ZL|X@;UMX@m_tD21>Ry8=1!@@X%OLd3L1 zX(^&0O;dq`l0o68PZ)@E2SvbXqGkI=p7X-$_~`_;~Db)9P@GJkDm zX5?633pov`o{=Km&{(!N)oRj?W`U)uhz`t;y8j^f>+D^qKlJBc*MJ>;r}39j3;3bL zMih6CwH2A4$+|Zo_K>qKA|^+XOBgYop#H5X2W*K}IXR%0gjdH6Jze)4H_sd9>7FCL zvTlW!S|asGZEIS}ZM!4dqCX0@vgzE2vwyoHYAnxNl0kD65#a*!yOc;kg5;$XoiHR} zf$rGx9`;611-3syshuW)FVM6kgxFd-&))hHXE@tKS_&m3d)S5be=z^$;Y8mq$Iga( zFNfQH+gtx&)#4y%x5+)+5z}^p(6_S`O?rYxLVPLLEKzKMFm8~2v2`dQ^&HrqOp0t? zvV`OxgZRCP7#u^EFb9%Ut{4K7YpM>O|9R_M<^pFG>A%?KES zcp1hmG>ilv|4@p^BYS42_$$Xh4#&f&IW#xz7c01ZM6h#vG%dbw^!=t6);7OKz47?1 zmG#TR1cxFVOAm1gtWyXL4CVN13%Tu)RQx0bKWjMPVbWpT@EnbeL+~ACtChK!h1w(L zBuvO4ifto|qo4r9I8W|utiD%NKP`x_SG}8lk(;ysOmFu2${qi~p9ND7r(HQqo>Zcn zVDRv6`Bz;0fANxuiZ&9c3A-jfrq0d8)w{5XnU z;WtDIg7uK3(768Ymq7X8;v*PhUi>GEsNb)K1JLO=Eu{x-j&yd6? zrG_IG+)TrfqQ65qb&F9y9 zVli;OhpGv^LXXp=e-#y+Zkc&+wSb}kdMhsW1<3>qAMfUm$WxSxxD)7UBnFbS)=}GN zFK*o~(FZXjHTWE(FY6#ES4sz3bT}2FSWeAAyy0qfw4^?PV|99f6;qXXnT+r6==kft zdSl5UobD5jOAzAQB9#DO<#8XOF=~o}2yh0Yi;~@=v=pyNtXPh&D~+~CufW~VSpX3R zYUpn$F7LXaB>xVV<*>NAdcx5&|Ka;U>& zGc1S1LPDDIRb4t$2-`wnWGJjy4`gIC>)$7gRz0G0I4%tR zrs9B{EY_{o-%Ss+kC^!IjOojVHGcD{i?b=w?Ypz4W)#{uS^Qf)v z!2uDd$pmLu(!LOPLu5~f7-r0!T3uQb>Vx#R6PA-`aH4b%8tPK0A~=9gE9v|Q1}6U$ z_+wxroWp-eH##E?S?aW41G%r~5G#;$^)@f*sp5TS2H_wpyuV(+&mmZ)P;7(Lg#3!N z{g|`b{e2qrC-*wrX)3kCr*}s1`i_vea%9}Nt+7w!^(6Rd_`{*#82;o5zmVSYU&OBp z-oI}#?RNAnCRwN`!}vhZLoQVnU6r=^ai*o<=_qI-NR_!tJbUH%DES#0=5{>>Esupi z9;CWSN9|s6_X!3sd^lWapXWi^mGszR?_@{S_s4y4EY=JQTEzE9ViD&PLkvfGS?L5tm4|MBi(a*B4~DoUWLsaz0v4{L>m>aPwp(KpUiNPJv8r{W{# z3$~H`q1S|hvRn1p>X9g0+TBOJPm^azi5U*t*}WVS4=i239Hj-KTkcPAky4dXDk-2W z#v;{qPP!!Y;v;MgU@_XK>TKj+p&XTa^QC=XzDDy`MhJXuFbW!*mXubzNn{l-JyM~2 zyJOJ|4ToJit@g)rfzFYb?JIc+WV=+g{k1blW9Dp>ud29Gr8h9tXRi`(oELtU(tTaQ z1lPyh+`g)ig(9+C;?a~Y2O-{48U228ftn|{dtll`8!o`x+o<20@GXdq;YjtJI_kXUYmINn6!!>c9Z^$Hld5?m>G&a zj{9c|hd~@J=}e`(7*yV1S;OyHar~);Hq58AoxR0yOZeVicNOMlr6_aAc0;Vt6vQRO zg^tf~@T0>>OXXSn@T z5Wb0EUf$_fb(^c-hBh8X{{b)nVU>`w9CLu>?GEdE>x%EH@vCX^iuv{iYCb_>gNC&# zTx!+nLHn@+RBdei<}@{mT0@`ZmQIeU%>KRnw!<%*)*R++I6uq!mVIK^9_^# z{tKJ5?iHKHyDaL*yb!)>8UNvXY+K1uB)z+}n-Hkq8g!54564NVSv(7)S3-pc>?u@l z@xT_n>i5UDdeN|DqN+b|63>w7Q(A_|_HGKOPzF6c{gWgP8GDKSa0PW=q z%|GERhbQM#lOV&s!+xh!gsbCz_aeqGIDA|1wCTpeqtbgB=@zgo3Y{>ON|s0; zuzb;rkH-;aTwn;mm5!iw^%JG?+wQSe4X;G~_t2;(`(iG6N1iR~^EfcK?q-?Ks~~;KR$IwO=-SOq<;KcBblSS<1}N{pv%D=E4e6l*O0EwLnFsfjY~pf*y}&>hx5! 
zwtlkoHUMstCOV=m$c${o`LM5Itt(xXBQpoCnE0>DZwV-wVfzC(4iAuN`03y$+&@M4 zO#ZK4!gq5=3coVk5k`eVboP*6XSu5O{V{1L+p@Pr`U$rRV6ETb*I-1%$E z{iyju{t70TW||S|%xZv`EN051Q0*fou}xwjvPFEqMk_VJv?ZWVFxn(6Y%Yg^zc7A^ zVUC?0>W7{-i5z8x05$aw>APUxsz`doU9)WFmSlPtWZQn&wkD^?<-UG9Oi|V>6 zZmMW@;J^o*#1-!!O4N6UY9A*a%ThaEo>!kJhgJn^Hs7rN*|ZG3bkLCLDcYm=QtUz8 zkIQ{h8K%r&NS9TQMWYZSMop((C%E`5C$Gc;&?!g?HwN{Ic2%=B86IvEf~VXCW^~XA zOPYD@wEct5=9_TDt|ufP~6AL?F;c{L)r+2l3vSaC>dt3|{}>uvHfnBUGp z0|}7FCtUFyFSlYt6GFKbE9n*17IIcbp|xR6Lm9ic1r-*CYbp35pstza6*hgs<HNrmcESg`* z3TV!iZ660RYd`|#r&!@1Qexj39a>?{fdNk3OrX&=-6O6*WSat#ZlpEo;v?F_W0GO*H!kNb z_jT^W?KW*;Z-*t*GcZRgHaFyp;t;Qn3g<>N7Y^KQxQn@gSV%5!&71>-#yvsHrrOBX z+)(*b($aLOdkKBib1;R7eqCXW$O|tdJEwKR5yO}tY^-}a>GxfDJ5ITp-A#B|-3G*I zw8a2`2@y1E=<|alF&C#_N!&hOa%b~zLvM-%u zS|viiH98B?T0oQf&9`>Z)faB5cBM$b+0xzNvbM2Ys<7j0{EH~zxT#r+rSTVE@=`+) zM)`y@&k7f#j4D(PZqPDL96#IWZg^nyI0=9Pu^YAmiJC7E*+T?p))#}AMHO0j3Ls`w zJJf8`PO?|u>}w^ShPq1^0*h2le$l08RW4lp7r+1o&f8BoJa!rNg-v9QG%GjsB0`kz zq}b2mSF^4lypE#K7K#lixF($W;e+z%AZs7wdI_{taRs24b&E>vOqG{)*4ERMZ`PI> z-Xt~<&*RARav!a2e3R$%r}PG~SOf_+zKo6AeU4)D~l{+MI}78P5sG92~o6 zOpp=jf-<|_NMuGCKVtw%sdUPsaitY|-6YL^V-M>}E!x3KL8zTgl9|$kTI_ruc0y^E#5xM`xlIOn zi{7M`#sn8(G~;uo+L{Y_F!tBV09LL3l@`x`M{fRs@?>6?G?Ua z{0m6vj-rvr`p)*(3yXG+oBRc=&Jr`p>7r1zD0~>90wA4KH=nq#7XRhMIw?;8d&g^WWq$W$$;1r zEj3!xa>*M&=TI!#0&Ah-tjhR^4_qj|WPdM4tQ>uh@=h}OSXf$3M!J(~De7@aABZJ1 z!E!)7sU-?qDGx7pjeo(6R0p=xifktccrYoliM^XqZZ*rSM|2UM+6Sj7uWdxMIhZY& zF_gIXC^xu=?@d_>lufk#{kB2Ru1AJ>to(Xr@n^GLN&y=ynS8?H8|LX1-BZ)bUvf@Q0g# zu5iYL`d{4oM5{@!Nx9Vo-Sgg}1v53b$T9lm!=0t} z8OG1w{c`2;_{FMGKK!x+h`r4%E*&mO8NGkyw6pNV&aQ%wezk|j@BFUL2q-by9;A#% zA*DuJM<5+_0(k=erx);9lJnHz%1}e~oO!U)k|}8?SoEJl2}rpv9=%=0Z#6JFUrh0q ziT&xcl|Fn&!m?jX3ov*4>&6Z5ByD;pf(5hWcq4KmD`{DCO;-&X_Vv#%`&X`8Wn9O8 zyHHvr1d6PykLqft$Xmf)7(paZ0O&D->*87!AAV%{>bGIG#h}*t>r&w2a_7;Pt2@i< znQg;=B7eTEPN{5FM^X!wtLMft67k_SPC({KYN&I}{mP7@uBdn>}T5A zyy2sYnonei%r5FvF&aGI`3YC(*VvwZOsbCMV%yS^?Au433uI8&qx?7+sS{CBFB)E_ z@h7)S**I^~QrJY3dkx#Jb^@q+0^o*fAnI@`#G9_x?_!;Lj&4yqUpfRI&Q%$8s}_p^o`C1HB|V*>oI%4>UEeKypeOJ}yUs8AbHRSvw(cF?Bs>J! zJDlu%Sgy+Eca}#HRB`S0L*i{0 z2GIf0EPyy)7h8Nc6$$I^Le7WQoW-poIf2t_9-5U{uuenD;s!oK&|T~$uVi_ncs`x> zvNYe5WfaNJk;14bwC@Hyj?jeqk0c z_7RGZ`ge&LE|myTwX6ku28tq+tU-XGRG2dt4KDdVR z4#OMCYvC^ybKHWtyXL15pZ18&<(E|sG<%?ntmp;Ym|I5($dnFX9=XfVE5X z{V+90NU&k{e=X>p<(Z)b>4}j!;%fTU;@ZdiYNtx57fNeSpIbE7(Ooz_*6?#ql1XRp zK5v_jZ8zf`J03olZ9HF$M791&P1>+!TdCEXRnMN;1a9!l!4b-%0}MCL`|#JsSFD2> z?bQ4T-!?Eo8}8PT-Q#ICv(y5L6YGjP$yG-)RHkSZh0t$eYqS^4SNZZX;K~^}hH&#! 
z?wWL5LZ9GWPVzG!y=Z`|Bsp5|cb@MSGhB?FCqT!$o#kw}CkjU@H92>Y#+EPzxkeoo z*b;41x*w(#&T=h*jki^0u|w7tjYE9KivC17)#RrJ`WA)xoF_T;2g@0;3z+uZyjiJZZEG!p|@+XI@Oc z8`I{P0mSBfr%9JJ18o1_Pom*$?~aF)J56icgq&kR+|W>(j*{177Wl4_`V8;dVrO3S2fd_mDRvmX)^ioUjw}nSn; z=4O|7G;s@4k{qQc1^<#=-MU~naGL8T3!t_D5fP*TH|FF3+oBL}$^3+)`pL2N)zI{b zh2sNb?U!{8@(NcuDLgGbwRXyHly5V+P&8ZXS#zm2@#ScuJcB^TqWZlFH#lZSH*AA^ zmYZkiXx~B2t?)rF*3l*^9GasJy$M2OIdhbiiD6gMs7s#J%?T@NyoZXszx&ed;BR>o zx7HEXuv`u0|XW`8}2f<%mf-9tJ;c0sv;u%v}xW&|c>%VZ|- z)U2uhkTkt5v@OiV4Xv&I9nH}jZEHw-YMFui%UhaKa(Eo8WpQozcgC?c7fUo<#a+ex z7=_@aR7+!zoFlkU2iLX$V-5o#$Jt@_ik(^aq;Msan=z}j1qR$}wP-M8FV*H^ME|A1o7`A2t@ya73-LPG0edD-wIc14rkp1 zAU{XV$Q5sc57)uvtu6$ZkTt8cqhAGLm(+W#eZ6HPtz;KuTZ3Qjwb>|7jstLRASahR z@2eBE;2mKtW?Qe^U8=j<<4N9vir)mG0kE%>w}iA0A+*v~VyU1Rc!aisJ`;06WNobw z!wQS;C4a`lyGhfwuo#qLk`MGJ8e7BUww8&nLFjC^U3vK?d4}naXt~jh0Ih;&I`bi7 zq0;(HG9v2?ptd%0Y>hTb^a&-$=!6c#FCk*vgon|kI*SwzH>Km%snR<@U`J~QlZGgj zez_qc`oZRYPi*~n-`8R52y`Gr+=V!`a2(h>u`_5X;z=#CVV1WvD)Fpk3T`XI9^4Q6 zHc8IV4n<Bf08}%(s>L z3A^4mD|0I0{>g~nZeOj=Kk%Bd>Ai5=bda-5@<$2g2aqNI-G=0G3mog8dn_|>QsnB| z5x$JFhP4^6ILX~Pie*LjSgVor;=8I&P+AB!Xty+t{23OESXNhF5pNB?YkSDp1&k(9 zz8ysRri#0!H|E^^>E#5Qlcz>9(m`^Kaet(C!O$ZC2W3@{Y25CmxK)e(8Plhs{_5QP zImWwBxKrqH7_TOF0?8eBCQ^mZXK#816N-(im^&#ZOv0Xqfv2paHB%=ztJ$Bk++QFR z-L2ct*7eWUxr}tke?6~_wew^B^(*}z{x|9hW0yog$tM3BzQo{T?%=2$CkLQFf&4<& zLAjyxn*?-BTft z5X~HEr!Dab+0fluVTn>1^rsz~y68@4^A5IOA8}(=_&~k)A>2Q2T9nVnW*idg*j*Px zKu*)gwySSzYOrmhKGtbd9-&TMoIV04fxdX5=OMuT-+J%1Eb9auOVMzl+_H*To2}CeGiRN|^p8w0>EyJ9L zByrkhtjmV-B-S37b4+3oi8o-|Vd4x(<5=CPN5W<*FNtUtQ zFj7B@3O#y{l*?!@%UBy=ysU9h>`HfW?)Qlzs^;5$o0|u)XiufAVr?&rnvU4FEgY!g z$8*oMtd&Tq!=bur7(Zi%T#6b5vyna;p=BHG+5Jt8d6ESF64q`s4I#mVc+KJo{)^UQD(_xzOk&#N~)-vNX#9{$Y!TB2DW^MDp00gP_ou)&C47R1=22t4M7F{m; zb}AySlaHTt`C`aseqSR>gX;VbzeKqYn3+=@qg%_r$}`a_-(7;n3Q4B0{Kxb=I36VJQ4Ag zWEJMBq7+20K*rh$*S{T|Y_u!e+Kt4ukR1W)l9wdfmlgxn{=+(Q+hwEVMNhrZ{O8>i zpA_J_QwkdH>%l56nN~!Xk9YB20H~@Pl>}dH|SC4nDU&wCGQ83&CbexW_q7Seih2 z*;xZCHKuKNQ|tiZo5x8k*P-_IS)}by_iARE=53*hmuvidkhNW16sc+8B9V}Atev}p zZpT~3UKMyzzooueO;|mg@cq>264*EA>Fl?7(?&xf<-%*)aDPj*6E)Mtw*){~7VOVX z33;vV&AvLqpm7sArm+}n`GV|3lxYk19q%~@MCp7L5PVKLi1wf6vknO*h6j8yf$qz* zo(rYqdlM}uZhu4A%erF7kubVf0dO!A&C&$?yVIODqg8s3c(%>pIW$JcU7C2i(Z$yA z0GJu~HlQ9r-N&;QyCsU3=+70xAHPHl-Pi3qE396iaY$figrJPmIrU32!Jgi9YTeq# zHtn#ft_S9E|8*7E6LQGwUN`i6@@cIKM!_t1_<^=n@-nAHjURDS$I0*oiZK~KXFevn z0l;YHYU;FHLtMe!*A}d|O`LcHDAZb28X2n_&WcRmA{lWwhfnfz8KM<;*c!Q=OS35UriK93^&85?{Xn zM;pCEyb-?5uVh}L>0>j4(i05ar_sZcl*E@sA5?nZ@(V>f)~zG_KzTvTF#2Nz)E{}< zMft`#?JJnrF+#C<^T7CPsnQDI+iBDuA@REMg)9Tms%0*(t&oQMD((xmoC2&YY7fGf zL&+0!9u&RTk!NTna*;$bx(}&^xPM04IM01&s)2iqV8$2Fa&(-Rx4`$3oG0W0AI=TR zS&734%j+2&#k~P~Za7dD0Sbw27m$SYk8VbF zXi*I_h`kY&w!BQhIfu2;-i_#{OQJwJ522wh79RNu%~x|*(^x>bg1AaK7Aeiax|^l# zG)Gp9CJ7%1OiJ2J$1$MF0h3>mti=G#lPPK~dTMkFw}WOO43C9#ThI2**6kc?@5zW^ z{|5Z&Gcq_=Gd6vv9-VIqR-1|3^}fa~V0`i|xXi)>j}Ry~yp?P__S+_{R9PrlEO1g< zO;>wy1WAQx9Z6mq;*1pj*b==2t*QYBy@Qy5`2+D11?UoJHl(EheB2FmykLJ59HN;1 zVW{uD#HO9MRJ8nPGdXqzsckiG7a$F}4?ujyHrDC;XxX^QO!sg<6xI>tq-!-iW^V+u zgYF>csU-M4u&e`c$reb~mw@9r_ezAEF*&QeaDF3 zr{?Axm-S!)42WSuLK4L$Xbi0FFiUuf9{1KW3dEL?NoPKGHdU2&%rK&d*eL%?C0ecHpj+;ZYoEp|SBlnGC*Jle&iA-GSAT0|s@>n@#_GM@wjTw8 zV?hpcVX?cG^q4*el!q9m7*<3R0B9do4Ok}t%&nfk_6awB40P7U6*Clc5JZvkMR<^g zRtnY}4od(i$YT>VZ^TLMa#K&45LgB9&j`jO&$zr)(tw?;Xs~ z04V)kSf)vBV_YM=q~v16d-1w?MO*=y#59L~Bby7*(uZAYn)N6%le&y`5INWU1RXoh z6!<%^C@QT@t7Yy(ixs-hfN}1C;W3M2Uqp$&Yp0PZUpD{Qohf!Ojyk@BB?V*$=DA+U zxh{t8APPbF>zt)RZ9(pSq%>#lZWz^2g}Lz)DK;QOCqSD$qpiQ!PzrUlyVwewGrz^t zgoY{s-jU@6hjA~EH?j!mKKNs+bDw^uVX#zKcM?zPIRaS@tK~d0wIK)!f!SHB2BfoGugb 
zOi~DV=4PS^YsV{~a6Jx5ia+5R34kYz8@aolCk5_cZ>|Y;65XN?NQhnrRC4}pycHjO zA~{R3{YdN|Y~M^-+to|)7c(wZEHyNcH=)wQuuCZ)i#7ZtvaV!>H5w{JM-jZ{H{J@d zOLZ@?R;Uf-f`L8h+r!FbbdTfwOchAem(hYeAlYBY7Pv>Ez+Po9S&s1S^nk z;44FNrIMwBW~Yv_qKAHGd8=3_=7$=F8tvy@ycjFdxbPm#cO!e#Yi`&xTqmHrV?eMN z0l=ts2^GX%@{)1gWwDjvu@A$B-EG{_56o|ujBhW~aG-3hLsGAiH?Shn$4Nd)TQs$m zV9OV!KEd0dv*dy-sBU ze7INfEz4lLoApzp4nVao0T44qz{vqKPF)zSU+6?|U}m*k&22AYbC!;Fc=D>5UZ>!p z$_EL4B%qI*VelClWW;cjbV{;@wMBgi3;{qrG(yRx2q`zjHcYApj5J%IX4Bp1FRD6x zi(hJ<3h}Rb z(7dNII2@l1?@O=)A_H zWt2?dT-nj4?$<@WerC99+`%-3s#nC)z$2lC?>OYt{T#0*JM=XYXVlisUY1 zIsxce6;p&-IO~@s1Nee+?ItTh=2b0mxTOwLr z%Z%&*VnW-*Cgn`JnhOA;23aBH9REf?Yc0siG#ne*JqS2Lf9ONuBj&tRG2a%jb27ng zaDxafr#s&&uB`qq(9FS9uEJj?5g2bG1(ZTXpfv3l0d2Vv5P{a12PG?+ve@U=Eq({z zfpzG`!0br+zLtgC)xNoc$I03y@kfU-A}U?R1u-7BM@rf+f5O>3{qR&sw21!iBE9cL z6~RQdBbc->z>l`KZ;!q7$%EEt!^I z0O4#)3j~{>tc+@e3zSZ=$Zo6}Cu) zu^W;X&RLj1Zbd#5WN@*HG2MAs+t%e9h?L5lqSAB+P$@FjAz2HY(&Wk~%m;F56SciN z_k_}Nww7-ek*Kr~TKMiqJi2~ZxYb=C^0@Rcyo1OMqGgFU(yaNJEIL$VdkoF3o-mTo)24ZRNn$gV z>UfKr=_xGlwghNx3xQ*Nt8^BBhV2I(?sgU)1*^0~$>bbBX~-Lye5k?+W=MYoB27EC zGkPDQSs35;oMbjiUHlMxZtM_w2Wtyr*Qi!Hp!+nOES8yuYRQy?5U=W-*>~PiUdKT= zt04{;9#l2J@DNUNzbE=W!7urH|AgDwcSqmaojY^pmvd)=+`bR;UWDYf&~8X9u~5WU z_cI8-(XL^?E#<+FC5aMq%ob(X=WKxIb3M<1FdR4)vM*$O~{BDA56FlwrF+4Ao<@(QHu`NfpgKCjb~0{2}>t~-p+sX2YMu5P0DpHvSZ+J4r& zysG_4(gzh|hxhNlmG}P3vRbF_Gv;yt*QAUF6wqIZAV7Wp9DczW1MxBG%(zduJ5PYm z)qa37o{De`a}dvX?y-sg5i~ui*1*%#XhSKC7r|o@oWz9Dl>_Www#&3L>=9^-vD^1o zNptR5-d61Q6}T2L>JE7XhxH2e^S8&MV4Zk$Yi~-vH^sz)dKmGVaQu*wf)Yd9%)d?H zOw(-9!mlF#Ir}~2k0`3v|^@jYiFBY%gx_f&vFH>yOa}FUbxWB0i(geSD(%J*-Ldy`s z{9gN2?=*>09r84?`uY-kZ^YCyzs5ut(lgTkTlymc}f*zjedpo2u z;EyJ+i1>;n4M~ns2fQpRJ^$0g++}%ZbEUa=*xx71Q=9Y4{BPe{lZktQectB=%I2`p zeOT~5Q5Qmu=&$M}nQh8>S@UM2ZE#JZj-mRiVPMW-lTdg&cxt@fvm%&&#qkXM<&njp zQe3k=k>j5p*50J6WvuQJ0 z6STJADbAIGuPYifAo97A^d}1G=_g8O|JHN!Uq5TK^=K}xy3E?7E7ZHEyn1!g!}E`_ z(M>=96zG`$@ONBgi&dR|@{QUfDJA@TXmIIqI~NPbpPEh`Yi^G?Tb2RyMmF}JsR)lN zZvAWL&F*a_8#e?w=b3ucGB*d<8m&TzL5OAJqlstCNB(pV%3%RqQGU95eGhBD=lN{_ zvRxUWi2Tl(CA{ADWnYQ?k02o`l>!$|5@!8TiE#l2Y>U^|iw8#ze*7LSiGDqh z{ODDZ`;n#>a7u^q2l=yd`cOad99;SP#LSiRA;OXHH|D2CN|yX}uvme6%66KVNr}Ch zQgm>1qUO(p#_Aw=*X#Z>2I%kEsY(b>?j&_>8K!Rxz7zY!Z^Z`}Pbe*)3Yf0KKA+~W zAYsmA+IUiO9kJCfMc^wx6Wb&%7=6?B`F`h_zTZ!IbSXCcernstr+r_P?QX%Lm(CLF z=eE~PR)$Kh)d6?!v}6hBTa}8YeNKu|C8y0*SNl4ySTX9``19?9(f&;9|2#dKjXO`V z5KLXFEKjE$Z6rGv#FI$h^pWspFy?CqsGXm4_xz2mlf3Ia6|pFRbd@oDj82=qvikl% zpPAf2;;yNBMLxA}s_9J~_Xd34yM=;;-{dJ*U=4t(x-#=Kg&KZL1J}$1)Fzk0cTYrn2<8abt`8TW`dzO5==fAFXxR?l*SKqWp zS5;08Cni3mb?5B}-)l)8L9ZKwgBy10jQC=k3o|jrL(@NWbqOwxygi%08n?8Zn5$ce z%@&tDVrE73Czl8CV)_I#l-ApRvAj}WMT~wkj}CqDbVl@=H=Bp$g~qnnKHBup_~I&g zo55wlBf|Xs-VZvwRnSTWW`C8C@!_=mQA6=(Nsr_kc&&YbPeT|ht@bTxD){}El}Gli zJzrz9HRF^E?i!*GY;qJy~d$!&F z{I}cujobadjPTyNDAc$*=cX>0j-PSreioh>6Y=tyQ3?~@ou9jZY18;n(s>{ov()>D z>xrAo;Kcv5m$pqNTwwQ(9sJO;YL2nEqiVduz3CaUOc=MbD&|R|U#%jXxwmUBt+$#K zlJW#(@W#7InwaYgOkcmc9bCC2Va<=?~|9Pn305(=Vl6a-Ny#YA8|Vc@i0?%Odj_7sd+P z%sdAF@U`tGg@uYrtwTtC4+}TG@WL_evr_n61-^}nr~B>X!ji9HU5TpJ$>8^^HDdZT zlA{?!@GpK&wdtNv}oA3x^Q*@!=77kb!FgQW8p&O${GE9j(I#7jb(W@v+1W=l-0`V; zTXrG&zfZ@)8(X*f|K^%@`P_kxb-1)R>=cE%jD2+s{WoO69^c=}t9WHxh2wNcXW_W9 zdQLWufP8BpIs7xlgTODz{D1fEwbl6?h@@v^i4e$H3E!$7#$MJX0TU8LKrBh`rDR&^ zUj}9QZ-0OM`@=m;zR{CVoka<_g#DY2T9`Yb>h3-TG$%NU8d_j_eUww$z*+h;yJ96a z*9`-djQfY}U)_;@V-@owpVc}m4<7hwW>4h~Te@%b%|0k^hdw0*W z@qfC;|HF@zA$ZIUxZNDb%1DGiIWd#$bJ9A--^LEIp*+WT=I!suPjLmf%K+2?%%je( z@K3zy722?0YN=={QkP!rTn=x4eZe>8#@N~;%Nqapf|&kR;etw@j#HK5;PW*&yvuP?9T!PT%i&#N;EJm`TCg ztGIyrSlXe{77i(*y4yH;loyg5M?b>a~{SM);%D{33Ee8#usXq@+P3 
z1av>?G!*XRG;)Q{cnKN4I@SIR$F5XjdvA}lImTW0)@Bmci2N%jG;K|uyp-AmHa^4) zWH>bk1~tA2uFWH*rl%k+xv|d?d@*b@IRtfll5Z6w^ybScj|5Smc zw5k1sTd*OW{)A&O8Q>QZJQv* z)7tV${*Gb0h>xpcJ~Dl_FXb*^_*uFY&tX$JBZKSx#p1*g@(=#T(U>O8WJk}EM$#xQ zd-lTTa5?>ltcfkj5;37bE5evYGE4w5GO1Fk3BEnVYw5YXQf)#%!id%L%Zv) zA_pIj25jEdDG!kK9Vk}xkn-e;St!hV zQ(CoC;e7J>-l+<9r=;#7DGe3myJB8U+Nl~sZb52ijM!(fYNcT-U9{fs=^`$kIydtG8XNCZgQ;_-K9XMM3E%s3JNry2;m%OGFRB2f{dVl;K(OH{hj~b zpYIKi^gel_xSHKW|Cs<5&2fy&jTPtv`o)L=Pt@_i7m3!y7Xh;xDqs=Eg86ZBkTDA8 z0ZO9fAVlu#Rh||BS|czCM6)~skpJT6a>FP`Gp`J;s?%?en8##~E`cI8ZE{4;#HR7r z^Hc{0IaKx~|6*ct*9yN~=_h=yeOxe@Xkh$Ddox#A#@`a2BMT$NJ2ech(4*zFcizmf=2(>Q%M8od%XF&xtM_?zAgxF0+JeYT? z-w(u~^(`ouhZ;p~?5;imVl%qO!F*J~mR?eBP+k$mfL#u>sbCN;wB;Mo<3e!L#)K&{ zgH#t;1DGLJ&;}H4;b<3$9n{&22HueXnGG1LOO2VZFsRSR3v`|uNeIluRD>%Nv_Y9T zX`rS5ZGA$BjNuqDW}=I9^d#xg6%I_fTpmM?L%T=>?As*CJwg4RCBxdiw!al0G58@X z0QfI|XOC%OiBq9iaYWd)Pr;2AsK`^etU%$LGWTL=Gx&SP-FRFA8*?NjI(aT8P5?m( z|9eSCCmTT$8Py^w7qy|n6cyw5M?aF6x36@plz*q!-rg{{+vW;A{{&l{rwR)Ja0B+x z{^0uxuvKH{Vy;8c=OMAw*RT8Cl5DM*9Ll`3E)IJO%9U z1}Z2BP8e5ievBMU9!Xwsc>vO1Q_m_%z46lZZNcw;0oI;%pc{B~iyl9Ick#nVbIh_% zU3vKEx!_Vig|8+T`jz-Gc#vDkHY4ucda89m=U$qXoTf~~6|hfX0Yg57weITyxu8|= z9{GZfFu56a2NOSM=0WA@vTCH0a;9yPA5TY!Khz!xt;WJ?0)Vw+U!h?GM-<5jFmjk0 zD4T}5T}Q}!s|;_2shGpbVahzXQQZO@(GUn04|0>)AMAN;NYkf4W8dUT61Bli0;w27 zWwOC`bnFS#E4HK4|cs6OMtD!5XAM7|00P8%j4}gfs~Mb^QASe z&2vSL^5MZNzP>tI;~cYL^m)+^N6n;^L5D~DG~34-6ur5 z>d}|`nBdsUm`wNCEDhdu&ew>xkSOSU&W|YZOK!I_tc^7+1p~Y(kPUHa6M+TRI3%wG44Bd()GuqXf+OFz2sLiZBwr%rXh`{;ZG8r# zO|^NHnw7q2x^in3eDc^mbb>mbL4yVq2@iA`Psp~GK$0V!!dc{ao-ibPp#s-o4h?)U7 zE7Lk#WRL|l{O~^9)l^GeO{iH6Q3k%x?aHZUT)+@I%3GN>-SZSt59Hd{F(&v#tVAQ4 zcy;hI@`lN4vUpToJSG+I@Hur%VNZmLvIz*J^MU2^NVVJ!0;+|VE=jEVq@%Rx@Gg>M zWohq1R=-p|&DkVAxwq0haqI34%ra~2pPJ&a*&iTZG)b6G5ZD@|8Sv8TLUHRifDQOx z*0eYCR@q0tMJzA6UcOpGnNw5Paicg)8>A%w(JsdIg9aDfij45)J&Cc-XR8{Y({0t1 z1!d+o>2D5@calUw{?m3~aDP?H1lF_2sRwqP5g`~CUjtzm)%_Qx(ag`OE6FLdx#~|^ z*Y6~r&J|$nm2sI1?q(vYYrp6o^Y&{fDOh$VawRd-O{eivum>D3lE#6%q+5< z@-e~c{bN1zh0=8_aOxkX-DNsjAtq+N_avar2gJl;=DTby~{#U-cj^* zXlQpqbPzqL0WSb~69d{lf_IZt5V#8jVVuQ1?JYfQtPQ+89In}UieX%wbANlL*CfFl z<|B+3Sn7Q6Awch72Sa4sAomE>{iXp8#|8GPmo+B?t+It=H%j~|WS0ucJ&olUbx8xJ zfiybi3loVCT}QusB03^8r1TkuILh5l2iDY8y9qizO3DpAR)!1=FMov;-^0D;e0&7j5G8G0l z4p4SL5QPFfn4id@V|Z!)0U(7v4Fs`5^iUZl2C;o>poz$aIAWLKVEp^7iJ)628UjmBe`+uPw8A#NxjIb+BfW{KAG$uFytj`BYhIoI5I1!5cJb)WiOW5ehg{}Ls_ zH!RTS<-6of|e;DB(5Js5aRXA!4`Y{GA<)0*9^K%VCr0&u@DPZm&t#llw3()|7_9YOAWPbuHRM-S_m?CO_WY*A-J!Ns{nhWf5?+8wlxOj1Rn{ z=TLPiA@luvtoHMWhQsF-n4Ef4g_HZZ2Rt9dmeT$-$D-Bb8I2ida}cjeHZ9{2!=%5VFhW~IQ=7s$YoVgeD9!zp`%v1k zjb?TF{bbK9O}!KitGQl1L{Pwx8QtSflQq#3nQzc(;i%0=FUrl0*|o|p z#OJ)?eYAU1KFv)^I*-CVz1(4lbx~2lrDH&Ug_zZCbl7VoBn?C*Qhw-X^uVj0^2WurNHwMiXFwS z6T03XW$1KB{kP;UN+af z(1thMY&^}k_*!4*X9Rw)4KsEE1RsGQF#Uw!1kCg5evUA2Dq2#K{hiPpbgf zi57x({sgQ7zRZ^S_PXJ0^UZE=2NjJxEEEgCHW(p@`zKIKfC+$Z-PhCJ#>UGNI04`! 
z`B&tVKt?#RAs`69Z#(_>vwy?lWpCqRBmV96*k2)_Y{X(xKsPe~OE>I!{<0fCU;!ZC z4gu)F9f^jfa0uG^1N!cVLxvB6{?l%I*?YTOy=Lj)EavEDW4E0m+r6LfSj6-GB<6Ns zXgh|V9ft4l5TumMDux1ux7|yD8NP~%cYeee>D{Egv%-$J>@J@ zP$GcxuznUBD3SEvZaD+m8-q91b316`}#UuGLw6%=D+@O z)g^|t-KtW`dXfB*7?6IE+4w3+_fpD;vIzgYmSThOCmY4647OMw=TqYCJ-T&bz9$HY z4ljb?>%k;nBArd$wo(+5IOKLO+=I zCUc)wZ#4CBiK`0#F3AWIx#^JL$%a}q?~n_)MNgIu^8C<8$Y8lSc*jhcKJG91rvW08=xLc57z}2>`d>UX;{AnuZpc%oAV=haJQW$>?i;A0yj%Z25B_onAFg$3glN$;-AdHl!J8U8c$_?`li@g*&$-@&FkTLepC95{8QU<%<2fX zyIaouas=cb|L#44;PT7gkg80W0YD7c%Y^Y&`O|?y(E&cLXtdAnPWN|bVuH3i$kYDs zF3orQy&+?YTZt#cehycf+<6lB=Jh@O3coEt-`P>N?nuoftTn<&-Rvl!~o?ZM<%Q+n~pp_w&RQo)5MM6R3&Q)6{WI6KX*h9%X%X^aV zkxirBd5+1yy>w9B9sbxnv#0k=;N4gE&*ZiEpU=8Umzq^)epY?%Rk=`?Uj&z{R>BA^ zv<{v7VJ1^+g;|?medyE;7tWDaMt0Y4n^eCMIQphDqlq92<|1#5@0sd++&AQ_AYwnuwJ#jq#=8$U9-x<<>ER~ z=Yhq9H*!l|Yy15}wM=40l@X%hmHzWRy{UVpD;`wDX$ynf^=>bF-e^BlnDFlAqID<{ zREVv8?0(7v4oE)WsN+n}Yk$FnER8ERO@k8l)@8MKM$UMBc(S~&^uFK8&?;`T{YnOA z@f~U%Jv82g3ZHh+x!O74n+IQ~vrk9xS%!a02RP>qo(>jH{N1#PD&?)~shVXt>vGOwnGH`*P=Bv&lC0Wz~N%QT~t? zdri#bcF@}IK6RFtF?_l4xZu?=1AFwHU#{3kAv2ew^q7#8e&Z-N%lmUX-!*bhwi4JF z8@XU4fyF|^Pt;UHlsC^1Rnw7#>JD?e>ZbFwS)s#Z^KIh^_S7(GPFmpoJ6~@vXGUG? zJ@VNj@n}*u2S?WYoon;yx7DRvE9MUOYHc27Rhw9tP!YW*MRs1SFODB)6cHe(} zP|LA1cUV%wF)Tr*U-#+(>lN+Gm=@|UEoiy_J(B442YlOr48n&603eImHHg5Vo4)RW zyAkBxJ&O#;91>RUZ^0CaT9$)#6NYWmw>Jha7F#@b<`!`+N8Hvc8blJ06F*80IbU7M zxv9H)LN@Ye&0%{|bB*ki#HoA%7f#2swsLE|r@}D>UCdfC_M=I08y&-QYpbQ74Jx}P zjrK|*Ple$UJ%%`O_yepJbogc#XE^i6x?Sg`7eQeR{VI*4#7O5}1ci z)>m}h%yHiQAXVaWP9J9Ay8xm^={uLPvz^%Q@t)wuHH?e|-6+d?>>j5t0hLw@3J zdY*&|E+G}4J8J1U=&Ska&f9}GXAe{AYwonP8(;dxa5J-4qU9~^l_N!7G}k6s6mE^# z&2_kUe%tG&CYfUUDzK?0(N81TXrf z?xRTV>F{*J-z#M$Y3wfr(@cInc$TqHnO^a1ymX8#qPy7m*5VZJjku1}**W9izeP&Y zsoQhO`(k@azG>>^+6;7RO~v64AQ$M0OG@39h8Ith| zDN(lJ%2Pe)fx`^(;IC)% zFNDTT-V332f}Y9d$SbH)AHV{)N5!~f59dXWXO~#QGG;c9d7wte{b3dFg>TyB^(^_a ze0Q~4oA4BRE~2R9F?mq;x(ly&OI~6qrt!Q>^MhLI@BGE=l1&{9Tf6Ryp%#J_@%`I` zX!Ewh<5oOd?mE2upMBTPdn)fgvpP5EjXmD9sJ@k|Aa?!*tqgN)r3&Xew6)b zLiu!YY2i+PGp%5Cr#*y9Z9o>nz$uNJuV>Mo5S*Xhul1{BWJQamc{`F4KJczNY5S0#56} zZ>{TpE`rwe^NU?ZT2jnsn$zDu%4~f4`pqB<^$tm9{dnpc+d~a81KWr48+t$P zRgF0;zyIvPb50F4FKZJmtPM{|KQOaTOx&UkTjaAaLMr7(Ld_$A9h zY11>0d!28x+*2^nx0Y$>%6q%FhAOrH#vMM;mW3uJsr4qiVg7FB#ub6W(dzx>=6pz5 zR-0Q2tkkCas=RT!d*2JRS<4zfa*L>om*zQDoD#k0$7$^HC8$I-=Gt4951owjvHpC> z+94geI0ln1*sueLR(xALYooz^h$pN*R0@+GlkM!SS3~(Nc5ZYU9sysHvwR z)eGe@9Q4C+-QHRG*!Yl9PK6-B4!f& z?DH!}uVLd?jw7=ey$wsXrs`Z#IVyLp$&n+5XRh3CxN-V)u^rh4Zt-|;%1C8fUe46I zrLO-&fzzi0y;`bUH@~+Kx7BbP3$?Qko){cH^36=`sdEKO(vd^3p=)3E{oH8E*4%SM z$THypHBLQN{!V*sc_{I9iq=I#*O=4rJ9!PFhG&cOjvYG@=c1^xdhZCuzsu;BkZua~ zo%M|Lpm_EDPiNgDWO~#&tYh4)!#&jt4xNc8Kk@n9{ZJ8I*|Q7Lx0CbjEDmzSN=RPN z(6@BY;|NN+tQK`Mf%eoY{bH0z7N@g%Y|3vgySj>snf`N0Wvo>)uPc7{eLWFPYJZkv zdyBoh=#zi>4foPPG;pqeR{N1*gfrX#EKZ)lk*_APk<7-dRjNM!6S{7!w-;fsrK?k3vDd$iM+chfm<>&pLu>Njw z>n^VV)x=Sz%v4}B8FIgX*OIW*Nnh;a+OtcyA`ZOtrU_a0ru}+f@Tc^E!+vzFC zi#vI9?Z;Qrrd_r)hFNa5YNoFe@q4F__aoY>HS9O2hH%cJB5<1@tq_q5?E0+ziec2> znaF&-o7C>!*2vY*%)|p2>x50zxg(7h>4Wf$b}fTjKBq*JR?XI=&q;6`jP}o1NKZL9 zvn=lczmnSUbxLqh7u&X&$(~-`VtQ`>`g^Qeio}fa+>7q`?5Mo33W3fYvlAOu%ChcX z;5~_i*OMo2=uhSp4Qi|w1vQ{ex{e2CkPJ!3&64!DBoICL!}B~e%;lJRe7wtjVO3Or`FeFYZ|%v|n=J-BPDl2$cu3A}$DLAl?iGx8!@q;4 zU(TU8UNVA9huiXJ-H<4g8glD6xv($!9y~{94^4@6M>Zky8~3(yUXQZP(Xg4}n_Pj; z#y=^sA#t-KiOV04D7X4w!EZ_JRyAn{Zzoyp3B*)bmks%=W#ZxP$DQM||x;fvD z$6?)9&6W+Btu6B6F0PIHYJy5_d)03`O?vF#s(VE8oAhU!e3|q+Wv-{pqh;@=itk4q z%-tparzyW>7W@(;C!fSJu!+tW`hlPQiIz48yN32gE}Tu^IQZq#Rq0q3dziJ*`r(HigcdO`8MmA=N%MOF9CJo$ zOP|WAFE{NsYA%n>@lBpOV%LQAN*r6e9{J_yh{*Gv@8|NYYD$C@b;c*|pOXhSWHT?; 
zo;M$`()dQynl7HPR}n0Vm7TAUR@`$E@84Lj`n%mr>^3A- za-9EO8+@e0z}4XEzUSgZPAxI@l+>bKz8K$^ z%E~%>?}kFrMEi-V`m$gud6dI%Y79zGE@kP#|@5?5~N8n$+m8yZLQn^u_SKm70AY z(%rI*PcCL{g~(Q(N+0u!o^$zZL*!?e#H`h~`rc zWi@WJ$z>8&AP>e5B_%j9EfVG>ZYy0k;Hr$qHn?gsxB{Ls>NPxrA55Suv@z8`S@4B^1_Z4lgop^Sw;#~PSbAah3O8{z z4758F`)a!I-eUj!weBI5&&k+}x{rY01;3fn9$Kc8T$HZ_bYM6X6j! zF>yL&W;OO3?-@JfkQ4U_Q!UeNA7EQj$1Bt9dDV!A4y@mSa0y8MjiBU?!J~-82W<~{oxY*ZV=2I%7Up`ophv6a+x_|H zB#Od2u8l?UmX2G=F30#(TjE{R@5*dni~B)2iNNhY_gSC{Y5Gg4>Wt6vM_%zWxhsP+ zoE{h@Z~H!d%(^d0zvKNrr6xX2VQ)AqB|_2X%a#{AX7XD3SYty#$)#VOM;$vb2>0pC zfvS7sv1A|Sb+&tDvtzbd$63Gj%SmEM`$f{7^@edSA7AD8+k`CcUJLwt8o=1fz&wD` z0Q~=d8t}(9ddqGyeNvMzLS+!gx8|R$b*I2Z=Mpl8&2-RcSa!f6$C>Mu;hBQXy9JDC zgSXR{svNu6e3Nyxb92-l=8QLG|KiilDN*v(;6)jBv|V^Tb>P#iE^TpOGgEz^L-xhV zrK`u(c##(31V`Fd-_p1yaiZn8GGOrOLuX6ix_bH~7X9u;ny;V2(vq*O2Ty&!+n3R_ zw{=qn-wt`iy=i!#qOc@d(m!zh^Hjx~{)z9sLl-LKQ11@~zHFTm4i5J?wu*mptor-p ztNgEZFV0Y{4{~l!+(|f-V};PQt9vWSL)nf`QA7lWYT2*3-_fe8)e(+X{>k04ak*EU zZP>L=X(pOCeDJ}PZgb(Vg;-DOWnsKZ{7rwA3g&D*tR{P=PIde_7Ya+8?u%-Z)#Kr{ zCcF$Ct8>hem!9Uq~Zy-1)Ukn!YreCpv#mHCg0(aZ!&6_l@V0z0z0r%asisp{nY| zm4|WkR{Yez{k1mYMg8ZpS2jm^kDg+#_#UcEP6Xo-7LLTm;6dxJe#++NH^CS%qmkA_PdQg18#%SkXCFYUq9Wol)%jV;+){7Ma@ zNkpagFsUHx3rE_nKk2fLSx-(i7RU`ynhBF@KYA-qO-P3Cn337=8X~}YYV9o~$8Ik0 zD1a}~DCL5J!ng9XFSTbM+}5|@KE>Mj<9`04n(`TKF4@;sMj7IzS2H><8?%+{TaO7J zF$&wa_A9gPqB~*rc{U-M8u3e%DI?9HHp3+lPhE0%oeg{vbeAF z{Q0w}AFA5NCH6Uu$nCKBqrSovD$oA7T@m_xB`spLZ1aKtl{TCL%;fo z4J6Zizr5n{wSC+dvbt}p$?r<0(2t+tFK~(O<1S9ZtJbPY)_Ze|Rbo&tX<^JY-@7Ax zoi ztg|-^rd;g^=T;Dr9Pn|am?ST{b53~N2VTDRu>sYFrZof3_e(e(x7qrS0aTTLie^yg z`lB2IVB!yiA0`68?w>H!b`9-j9e}+YV6&x{#>@2s(zF~G#c z1eih}fI((&Ff%YfTH9C|o1HU)IskzEsmt~2zI&tqz|TJ@z{W&h>e6KgDfV&bS7uS@ z*DW!C%f&75`Wb8MbG!5J;_g2GwQP+3c}YL4^q>3x75l$@|Aa67fpfve0K(bj zmB-D?F8r-M%-_}03WA}rdkTEe=DT!uao)HE8bG=P z4ny#?m%G{S8bkar1J~RQjUZSNf+f9zP5(xp`D|c_?QZ)Ybi%w0EFl=8!yMr4a@HDx z#UMB-0By577sNAjfxEBC?pzRk=AP?8)_>!H`G>!+Ba15#o)dC?MEomp6z$%b-hukqHDtbXb15`(NDE z8w9h8Tyrtp)vYoF!vQ-c7r-4r1Flf*2KWOzfD~W@m;n0F^Ewa!^>_d%z!z$9hkD(i z78DaV-~+Y&XFTgaaQ+Ns5Dd+{%ZDAHe_Jg0=wD-`VgisT zw}A`_#LYjXME|27qVt!;5YOm;)&LL)Y2feDDgCpada=zSfAvUxVrjTCyssAvSp9eU zW&O(fgY^?skF$RH$9UFB)-TX!95@55fbSn1`9WNv{#c2?zwng)MTT|=GYD{jY7&Hp zhIsuur~k5|9{#t*{?W6m3HLNS_7nT%{Fwb!wQyw{}NBpmK^ZiHvzsQ0DE@An-JB1_LIMH$|A{foW<h9b0ASCG^RRaGsiGtgG%E@kHLrmQIC;_EB*M^~U!fP0{OK#2P_Wngz7+#Lr1 zt2Te^Z%hXU{?Qg>0RVbGp_n!Jk2Z&30C*7yZOiKaXgdnUL?Ju?G-$d72Za1d588P9 z0U*#r?ZhA7nfB>Jv1yvYScT#h`&|Ioxx-*=7cdw*g%JHO0MO>kAOS%}7?cM9SXk^v zedwJr7A7$O`qkqP&frYqf5FB~N1*_|+r9??5>Wul-*9$g^={dXuS|bE_dMS7cMK-r zG4tPH;NdPRV-hgfJw5y%m0cqLM+9~UGui< zhnCmvn!AUm7s@*@C^#e(b2BV9EIGezk^Xx7+1P`Bx-n(D_&XZT|9MWnw6d~7ukDLdN zsFP%j$4OE4kM}xOP{k;(Lg1mdDqJ`DW$&ZOsko=z+eiNoy^a8p@;h9~~z_d*gxwKaI}G00a$EoP>!gzee`8X58Il=XWDl7 z7(o0fN+CUYAQnN%A4dF^EMlhbpXaYS7H3#S$3U!uF+SqCcRmY67GHwO3Ux>*#DzYYS9Q32XT&h?VeulmfGe3 z3-yY2=`_*LQ5=tX`*}YBz~!26s|ala(EFkp%f?obiRBBmL$CFhI!RiKpvU?eA(%N%#>(D zP5r$xuX|Cvm;P-fqIf}DJSXDELr{$Ya4bXBogcuzVMw+yvQ9 zN^h5#aYhl%ZWOCR%Clmko$xb1E6Kc5tQ$Uc^j;f0Z>Ze`*9ynQ9S8_6P^q}dXR|$a0Zf0b_DTkIFeoo?w1~L z6V{7JK=Q0vB{WtP7J||pTkbT6m1wE-U%f^z>t36pnv#yBX8?wC?Rd}%g{NhlX8(6Arr+(` zs@~50U^`W=cMQi*IE>YxB&K53sEe~&om{YSF&*kdym_Rb%hC6u4$jwP0Y zuPedWu1K-rwUt!%u3W7o#Wk*FEt_<19W#1IZNmv5G6S;u{|N)VG=0#D@~*QDezLWS z!W&nv;@b=(`!tz9kKnRQAx|t!8nxt3lR0bvs>Qyica0A7y;*os#K#So&8Wl?_o17) zW||g>0LFOX*J?-P!N#V>hQXkD&)2UTJ}!OFzgZj}@XK7i;+L7Uz=x&xU3V>jVEn%= z2YVNRXXhTxhm7aMgmz`&M9bAvEkaFRe)6j+1#h6A);;20ZnTofM7E|XIZ!5#C8f33 zMfKI=?&VAXx{-(&`f)0vmncH-x20X8R>rNj?!%s-RCZ6ICA{dO!!#Wf8O8%9c1`pq z_>l*LdD*cSGrG5~jY5j7HMck`aZ%l)DgDdukWGi+F552vbT5u@AqtK6kyiF&>NnD; 
zD;bX|p($8BGPjlqRX1N{)bH>};U_=eA?Otu92x%F#V>dxye~B{AZWb!K)tYP*I;vp zR~y0rSJmvp01P7ULq>6j0enx0;$5l~V*rE1VjSrR?)Q_&%-hF$TGzDZk)Roj?&Ss0 z-%%@fxVP--0wxR~R}4V>8wKfXBWiq?O*uVTc`c*^yvG!YUF|_4ex!7ROUoDx#}; z_(Cg^14HO`{^sa-?~CJ3*4FYlg~9sVkLGi((g&j??I~Ov_dg=M7yuvEn=;nP0E8Ao zwldm?NudmYw^@H$2D~Aei6RG8H+@paV8ds`pJI`b`7$8-Ii|4FS&Nd8M$TAMS>_eM zu=gCe^SQk45&C8r(;EK2VK}=K)Nvg0XlWR#&EP_1F4#sCp2u5Jgv#%YZHk7hu3W1d zoeeLmH#@ZIiGnH5g|tzAeaP{ovbiwjE^ zCyZRH#$00lIq<)>M&^AW?|Ep?c~aSY5PPnU=!)crC1%MvIqC{+OMUuKz4KV zpWS%@Mko`HUK6z!A(LEep4c)kigRm!&z= zhuc@ZAw{S+VD0GpslvC?o64OqZoOjPHu56KLK$}ML-DooPuWK!<*21STvIaT!5J~v zy&8Qd(v#}L9)piaJEr50N7ptk!#mN%CK^5Sb|>E%FT6Y0s$(i8*|+m*R%J_22}dcb zWdPs&0NAAD3CLYfj6s`T9+GYcx#xeuxM|Y0GNIEBWA&umukR-EVRb;iz}YEJIzIzo zJ2Q(&4pWX-Ne;dt^YDbjJSyB%G)w2^y2Od;jo-8u?T)z%L**+K%X6O;rC$SGLpUEq zyu&&J=(ad!Os(vcFtG(C+@3GsMHHpLoZ^T|(&+ui=Fc4$b6b;VGA+~1^`uQ32Q?;@ zOzhG;hs!m%#V-thZ!}-zdX9;bpC{JIwWm2=0OJcj)6Bq=Ta_Jt`@lSM#QlM8=!Ae8 z0+~Bs6{&=lgRg3jg`cc((lXbwd%0L|f7b+-SgBn8LSwds_{GR~Mgo@108m!U7?m@F zy0+wUT$>myfp(GPWZ&y7I;IkNXr4IU$N*y2^p07LY2buVTGsPb#Xn=+ioLgtzE0dT zD$faykQK={e`MJs9>rr|xl;>gUD(9$Q{Pb|C$>LdWB_R@H-$U$j>46}n-$HWH%b?6 zF&-Pk22m4=FdUU&vaOFVhP883VW5jVs}J$Pb1h3UTBGHLr%Hz`8|^O1fW%M0I6 zVNX${3C${nAZ|HJG*k^@?aRy}4&*$!vXeh_xk5!lYiJ#})ak=It5~!75P@(Yh#2JoFNg8(AfV0IQ-9lRuOdTUB1Nc;fwi zd}-ZWzJ|-El8;b~Qfv&aPb^nnUudfk4!SEhBZ@jbhKoW5F{z#P!i%sR%^dcmW<7YZM3KYB?ajxc z4)x29lflNjHz|`pHa9|tql}(l-3RHXK*x518hE}8bnD6IAE7xu zLpg+|>I%(1QoB%Z;o50PHNIbV`9*)Rsrr4J3V=d6iu`3v%B!rb0u+i>KdkH2NDt0t#$1$0TBn!ECxPj3bu2dy7$BEn`SOoj-O^^sm#`8#Q)ECE zt;u_2q3FP+qSBeInJ*h*Of6rFCZ+bN2s|>_X4@D$|A4u66z2nrkCYtMjd<*nuL&Yc zd=Q*#$6x}qv#%OKd?7nUIqexdM>6{cImf1ZxhA~){CW%Twwh^EzMju{r$EEIeU|xp zZWiH>_nk!e;i9u0vOCMg#>mJH#}+s8%pHOlO0!jPt?7|fkF!a#JH=^eUK^`@E!)*m zMWj8Dq-QMt!%uct*TN_Ki8qJev&&8s9ZX|yscsfv-b{a}Ek&j`R@bxO2SGKp!(p^? 
zh5;ny3MVhAw+exQU0Y6duSrdlEwWsBUK%Zu48SkpRjHY3)39=1`_27&bq0_;tu$OT6e;-w zeypj!X;4KY?_>}I5C;dk;gW;C>*AB;S{BvKF&)3&F#lLdAl78Wz>k9__SDUeejJGb zq|-!6i+xjwF1XAfTpDFBhD=#L~WMDNq9&Ea-^&>YOUv7A9 zXk}?d4j>6`kU&=1J2T4>a0RR+y%S7)M)aCy8-(v zp%d13&YGWbDfG72Y!jtMPUEWM=-k7RhdQa5Ggt!Kr}Z6^j;+uXH?W_p>W`RhhIP%Jd279zoOcLK2zHX|8p|H^605+MU5N^@!`FmHV#L z3O`Mks+G|!THsSjCUSTo`PU+>oChDjL@zyZzPWU@s(*GUdwFV1GFedC-N<2P_pp3+-4==4j zp1u&NI@D}NS?!wme*dW%sYy?(Lwkn^*^U6_@M71r;80DPV3(TrbySugA3FhS27WBT zN>CoHq|+6&;%H1W%kVkNswS#%GA+=pbbb-{0(qtcB^IPO;bReAwlH!3Qyso|K1=n> zf>_#mmu8*a@WA0GzzjhI8w`d%q3KZUGM<{xo{uQOc$&OwI)LMhJm#GhL!N$j^xL*( zaWSa8YSEe)hFAXC^Yi;?aM$hr!+c%L-&@(igiX3wXE#wCei$4!rVb>Ot3UGTfDR68 zdB`YSO)x%<&I^h=WEkc;ahXQ19m8`f7m|07#uoOy`6>-VJgwxT%RznuJ#&^6dZJGW z!24X+mO4!O{<@HmS^9R?uBBx}-E_Hot^=Ra=>95rbZy=$5BChC{GsEsZ(3bT|NFB8 zbsBEc1uxmFa2@C;xB(Rb97$*vLASPsa&`_vJOj{eHiPmR`&$g)_ZcXqmfaGj{i4dz z`JJJQg>S4tWLV@RqB?tbyMJ`{-3n>tmLg3FO-4rH58~V_li$uFAR7BMN?(EYR0iM- znpa@*dh(ep4s`SgfvKbn@FKsXm$rXXb?4TLT=KNHLwwGlRNY9FZeee0HLM4jjF+Gv z1{pw)_CUfl3an=_BObSxE(PK`;gaC&O}&97DrJDo)=JenfA$JJ*zdY2NZvb=mW$GhA{Suw^*xYj(L$(8cu{vTKl-Dar8Ld4<`el6l;GBQ#iH!4T))qs*6~Z> z+b5U40Y3=hSkav#ER3XQ`F*&{nQf4+PCL4Dmo6UpuC>}eWd#nyUZUbUBV-wX#qip) z6*;@x!l#`9jBDF9`A;7oT|)yX4pQIq;gg8HYm?(&n}&iDVy^OvQ&yHFs3$tC*d3iQ z!yS#0Ib=By%4=(!c3Pte+L-Jfg48Ghi2Uq)e5blRQW}HkcBo4$D(}sppQPziSCW_M z^8FR_+RlljywpA?blU{(bI_s+mm!l?8}VgjFEY^{#?kCo0C`2=bDHT+PV*_sF%q(^ zMa>RGP~Q!=IiCP8Q`AUf%ao<5&Tz=dD8@u?VlR33Q6O=Fl2Z5EE#e`)lQFbG4si`{vf8IPC(T4XK0p7zS|l zQFt3&e(W()1mn|#<80O;SxrR14|ox@KnrTfsv{Mdiu2|LXb32L1i{j6Q7=C`LTk&T z&Vdyw6P2PoT25~z`^h;EznDhzPJzb9rmi!q&DzrhXS)%iSQBdC**vmG4yiI8&lkLs z(t4m-M=7y`H>H01)JK9U?flN&X1(pBS_!p>)a|xp3n{UkTal@uv+C6`-(mcV`imBn zYtX`u=1Ti)JzyV)RWpF-W+Teofo|=wt{uC93t$?xa^nG2xECgN86((+uSie=5SXNWSf%~5B-wJ>*t7R+FA?o4` zJ_#;?LD~~2b1AJPS-vEr5Ij8!6KF$>yWMseI%eBd&pYvl&kREq+y z{KHLyicxUOK?X3B+1PWRjAkvL>qYYUXe)}z=0_d|yXi36wQ6cfG5OdcdgAOHnYFk+ zCe)dxNsWUmP?5{>O1Rp`#VG$+--*<+610A!zU3p^kdmD2z14xkBDP1NAVaD3B zh=T9~%|ua?c48v(AnlYVrG9oL0joBfF+MkgZV!jWP2i$y{K74URj}qHJvZ`~ms4M` z7e{=R+#9PXnvdZy48q|hp(eH0CdvboN$2RKGTSvl;iuG#C_HQc#UB+uaHV(5nJcw} zSabhFES0?*#|NG9j)8;u2dQ(3#7mMA6q)Rf*%`DTO$~IUI?j~jk3bH0UnXZ9xAd*(1BV|z`4)C1W0l^=_*$%Q5x<;f)bI%dQ zYs2^BM<0D_kI;Y)(+y_h2Tt?Wt!Idzp0Iq{xOap+=4eT9I^g8imaqGI$n{fNO`&Dx zh5D+G%V@dbyOfTx!3ihFFMYl;rg8~(m-jROS~1zVM^|2i?tOibanovGJgMaXabu(v z{DhfiObGXE(wVZ!a&4_#*dv1;6qPjHn-Q>E&1M3N(~=ET%a&qH-7g6_eaeSo zH|U1ZSj>ueS4VBa&P)8l}Q!m7=*^3aFiT9n>|W*_P?a%B=Na27qG zjoe2w8VTm+YR8MF-yxTu8^?r_(n?Zi{M>q~@cfv{ZRfY@<#I+qA2_>xzzKU9Pl9#e z_xWTYxmrDNF~ogX9A%}w(~y$bo906{bij2X#uv%FZ4!=WTPcLnY7)E@|^5=@131fE^f3ma}H z^0s-%V9C#KjmV63s?O_nNbJR> z5G28>i75sU8~i*R4;=$oE3p#732u~&)cuVbSbQ59bAF3k^hx>ZTmE} zBv~@PpywkdH7LyTVOn)NLG@0k(nE!iw~}#h7H?nw@%|Rv-F5E~=2_%~){I>lRm~Yn z7P^QDM1Cw53MDW|)(+w=I6oA@B;?2m)s*AGutWztC>fa&8}7_iNocKUJ~%A{4N9)f zYpUc&aXkOx2o`XwT*@oG=$^tO4lq6b0k;jTFb4P)ct#Bg?$-Jk6qRH(o53Q5NR(N7eOw+9MA)QOA1B z@qAw}{tH9oVRA{LPW`U+4aLP?>^Z%OD|$QsFL zYDTw%WVO$B=p>eD2d^tQn~_M+q2B+VJ@eTrtrdkApxMeFa^AHaI0oUGnT5%QHkekkmjdd`9NqlGi zQEF%`{n!M`VD>#xfUW_WIPS#P;N#R#M3JCJo}JkYU=M+HequA)eMDm~Tnq+Gez78j zbDP-JeR6NS%G28s0-fp!(ink-;a(UM;Rx356XKf6=h8L=TP=PZnxva9K^aShD?|>% z4qydIN^WF$+6&A_S^!qM=?oZ7A_4{|@>}frr)@#F)M;%64Q@lTqAyX{of+b|=SD&G zNUP^>SMV}5f6QS>8P2_c;0cEAn}ji;opkPGG|ZGf46BCje#p3SxeZl{RkE%em3cNh4LWtQQdQz2 z)iL(%FoCrexo*M8A&1hXIpJ8seysf`UHcc9vGzKGbS)T4_43XROH3K=(wa!PQ|=Rn z?s~3wg6_AMINv6M<3xB8C&k-wu;z$aq_9`3@EkR*G8%y@gG>Yd^j8dgG`CqM%O%3PdApM@>iFY%UNUe}wkHW-H9gXh(Cr!2Nj z^ryd`u{wa2qsS#8yz!#o^BH7^invc$NG%zefY=+X0ysE~gCSdb(p_}5p&AhlD+y>0 
zYDEqyl~>UFx|f4}%F>IZ;qPWaO?>rNWNHnCfqB`b3|C)n`>!8>UiVIFNk#TIT?yZwsePp>Mg9`AM#R;pZ z)_pV-)zfkW&bNpZZoZJO1iDb{$tqE;D0A*(reJO9d`2bcM&3+q)_8)cpIPZdbA2(H zMe=}ETxs7 z=5`=^rcwyHetBea8rWD#!Lu>|U%k_jlaaiXiI?W!%2onQs(+0Gzx`)x-C$uwKCkcW%NtK@_Pq%~_rRo}rZXAi10M zs6$*ZPH)8i=HtrAWUGk^Xal;988*xy(TmBvdoW0&PtE7aBIS|dK*XtS7)^4gg0LU4 z93HrB=c6r`szN_ZLxMW9XijioH(A3vm02xsUjB zl*;I}9r!z6jr zHHyN*<_!HgWscmNt}8@pQQ5EoE2*3-J$Ny44OL>Z1Ih1z>D#vNotA7G8tWJ%937^e zFT!e{xTH6l&vbtqO_SSs3fHKN>WQWlk_oNPK?CU8ZY6%2WfY5;tz^X*6Wd#BBZx_n ztOFFsjg}@01Ud4KB7#@FU!}8ke}#IBUAU8Z4?(25X{5${An`m*EYIgQj|$mpX>mK^ z%R(!~hC8sg{~6Z24qVwjtP(@#M6{vPB#%?lD@(EGs;MhIeYgbZzc$ztY9l;J-5DK= zNKTAZw=;JOh_5(9{v@fIgAUf&!S!&zSeTX=Y|~5Zo@NCzW?LufHyo+W_-giEUXJ<5 zJ)nD`4;>B;7lGMrNXi|s@6rf#kFeTtC-Q@&%n%uzb%T7A*()?swu$Q5<>bLF-Yrkg z5bK;d2IDo8D9J1X`_2Ks=Q@yk2nbNR4Swu31=q$qq0&hZ!qYj`wl!$}pgE=YG<6^z zD@?ZHrX8TbpHOla$sg{|ssKNrq+Pjm#yo?)y%1yD<8XF-!C9TE`GgwY6Us5$k87{f z&M$N!bDY*g&d9alCPLd=>+P3eRQq0SSSks;Om0!zc&g?-L9cuazqKJLPaWvwT8W}! za9v?m>FbqzG`r!%Xk9t7PMS)mq#77HS(|19u$c9f$EiFHE#P z!F?8oP8UN*k224|CVfby%dAvpv!O$&=Y}GcXv{+db($pABn{3&JNa=P#zk|aws!P$ z>Yc-EcTPaJvsfdl<2eetOG%9`(3ioD>DCqYE5`77H?k-Eq#>X)L3m#LaP3uI6zERj zBKfyaR@)s$%L*24y|pV7YErl7U8uy2CD*$P-#^HEV3wA2t^C}*Z!Mi|_d)OD&&WD@ z=d?8wRWq8b!V9KutX1#imw>3*mAy1Y95c@7K&J7n? zdxfmM-}iZ+=QnNLmXR3yMPA>0!5z`{HxJCZMV~qR=w=56-|vE>Ac#*3HMP++cR_UL z-t=Mmt0|AQPx@&}0g>KbvoKL;L8ucWUOm6dx*S3O`Aw2H&i;6!(Ic>}#V}v1KgWSD z)o}j#3+QYDbrxT6cN2Pbrm?-cps$eGQg9XL6COwaT3>)a-J5V&w^q!FAX>2h9xK1a zfZq)OsgGfn#tTc(mb%Ficb$cEsBV21?Ab0hh_Rzz6Y24PKr`fP0AOu{s;QYHk4IVg zeoUK_k$DSYS&V2|^J(WJNOcPtu#HiijIfmmca{o&?z(D}OjL-tJDw3|%HZkm`&NKn z`dnTvHKLHNhO|ZVQZwu*Trs;1u}#?bF<^)U*<3_?T6G)ISU9l5!|WLnJDgJ?jdZXb zE+1&w_+DHe*umgiLg9Vwe7*UKmpXg46ktwCuFHNCt$uIIm(b1F67ed`5%%*Jp}qHm zAlh(^o4=8=F3j;6@~+@q1?y6)d)UgPv60>VDj_YJJT|l1LBh~@lQOY2Ut9J7$_B7i zZhAk_X4DbK`05n&nUY1ODX3|HYK#?RG+yR~!?m=K6S@to13`5KO|y8VY191^p=gcs z0e|~VnOLMPtP|vjO*lW%(ieDbq>Keu zv4iV?l0-z$(?%J-aE>7T9O#RZIS-&+B2g0@{cEFXK^X(*(2EqPTahOjns3>EQVBvE z@YK_YCfIR}x9$bSQ+rQ%_&yep*Tr4~uje{Rn4@jP+&JQLS4IdaJLTl$6eQ2~~uWLZu+jxc_g84bI zREsw(Psw)dINB_ur5<68cLZDpRMDJ3=qU@~*!nA-!)d&eFFSOF#V3QFSAQ@+xNz$a zqaFu4?Stwq;`3B1767>6GC>C^hhmA=B4=b1h4-L!1EO_+D{`YO*51V~!2}7%&P8Y? zx2Gv8a)tIV=SV>tUO~5uTDXdikxUDS=I=%JO$enjy6gl18vNySbVF~a-|S{^Vkax; zsSYdZ(Rf59&udGFcs<5)R3@ZlDQ1~K(S(pXrK>Pz39D_o%Oke|?vfDog?{kAVBM!! 
zwDV1{GXj=G{(Ep&wkv+c^tP$sbuaSuZpn$`lK_WwZ(*84zJr)K3;)aCc!@ACy&m=nzGYMjebX5p7PkEBr}X`@{A zRUz{3nFn-+vZc8WCR6P(nv$3;S_F^=X_C{Pr*KhfDMUefbA>w}M1{F#FZv{IP<1kB z>7qq&s5phP5^eJB(>b2NT+u=$5&vOgV=W-y=VF=T-HPy;i*hkR)r^k1r&DZLfFGbq zMq>qw!%!k`gQLK0$4U(8zJ<=7eg)M$YJNYJH97uT)g*aI*`&=SD5Me^G0 z-PJ!2#lElWUCR2{V#f08qFR?CCZBUfOW%x70|+7^Tp1!reavDUc={Nc zluXm<9gPMl&e8XyT8PE;RRz0HADE`ZcOWUs`W(PU()L~@uRTAfKV4?{_9 z!Qu?(mZl%ymqPRUFnA=}#m zMJsC2Sj4?`$Ha!mXsqH6IbO39azm%xM@7Ubx=n21(RKk`qg0qbL7YX`5z{}X!_2vo zv5pxXp>b#~Y&HJQeD4@jodj2aO2-@9qiGl8=2c6l`S#UyfDE~hhAVrGUtcDpt-&Fr`{^n9izb4K}QF9?25Y?@Cb%iP%nfR6#Ojb;)`<1d>0)=$2!D>Z#aN zuo2?q92__!57p<8_M?vOS)9{(q9Ku`>&&llVV;#uxXe`L|0V(~h zaJatOLm(9{8xzP|*)J((NM0I+@IGM7^77%Re1Zh;5J_;XgukB(AnUVWHY->)L?F2fd~Lvl{ap_3mLUP~GjuSq-B1^5syXupTqo}4qpFr*sb26 z=!NxZt5~zUD`-n`_P9lfl-1e?B3H2_IW2W2}*O4_Dj5jyB2qJV;Lkh7Kd- z0KjD@lLRs1+{OuRMJV7#i@JTdEp)u#)PQs8)6`lE2+-oxS;-&98~^PMD)^Jy3Lg4gf2b zlt%rR26)i_m5+#vbYoOop$U8@0CsL@=Wv5Zy1D`BOX~tf=S6W$kp;Rjfz%3g(2HZ~ zdXWy_pUwl;2(e>xsJCPxuGOBPi0h(eip+;a>mUox$`4XJK?bE&Z;94;K0a~K$H9PU1ZOqIJyxmC4j#s48$Rem8YKwxI$$3Q% zTA=xn+xKHj6egSO!#nZ`N#gU4ciBC*yiVp=Rww5%J`Fl7GYQ-`;7IunRsn(|TZ$L@ zUyM@{Q+$~QBa-eAs@>G-sgU&t5Ts>1lt*65J(3b6p?X8kNFojXDi<bOiSl@ye!l1fPkreVcCRlmxa-@2E=Q z-XV2(0Xn?aP0aBzwz8N6n7`wfw| z-w*z9I5=nufOCFe`8o}d&&itT%UFJ`4V>CHBdvnmqSfeb1)vZDZ1HWseWK&gUK*TI z{JfcX`;^$&$;3jycI7BW6PkeXz~Lz zpJs;r0Kek{VM^Gv6u$)mfMkdeso_PJz3zFyIxqD>&WUcx3y*}J?pgmh#GZ5HQU8VP z3IF-O1Lo#%6jq<`qKv7h`|~Pw7Sz_oj9~}!fiJl8L_Buh#IKcJ4U$rl!EB74-pJyU2e_ae#zb44E0VK|y$dl90T!(PXL74;9!cj!_GvawQ!HR&7cXt0CiUUo zGR8gSo)O{UQvkhvg&iBLjv-l(z0b#HcjTO@h*TcKJJ^Y4QCt1-KX9_e zhB|xfFiKEJtsJ@;9c7u2f2erRv$l>3jwt37vFzj|*b+D%(r zTb0t1@01$!PS1DSrst)lKlLYctnesyTK?BFS3NkdO=SkHFHj#sXtU-radI#(5!(!l zr0RQv9QnauYz=B(&iZE{5cLQ2AqP*{7%_GbOmNgdxXNKI%8iGWM<3s|-yAH=3I!XQ z_IQ|dp?1@Kqvw8_E9}3Ewm;L}uEEa8cxls0FAjZ}Ag>ZRBD9(6n)l+`krc?T2HOTd zsZ|sR(*KG0cmk-*ym;wCrrh7?0D9`)?o*0?>z&tDe~h>t(3mijXh&ShiKHdd4akQz zekfh(K2v}0-t!IU$%$=4Tk1HY;T?@}UA>mjk7(Hg)JN8vCR+X6H0P%nl^N@yX5WZt z-D7CK?qyz^$n@nr-D2thY4T;Xo6L0Nh_;Nf9CK~b@^u@c_)8%7GcvptWP%hG!y`&T z+dJ(Y_G+FXN|%dUc|<4@<;h4!kvhx%YA8aw9J8167Eg<8hM=SJr(4JlMQ$U)ZSw9M z+dlGPtC=4ib_4KasTJ(G3pVA;`8gb%Tf%yk{A@liYmRNoTA+>|@z|}Z7PF2_w&x`h128xAk*wF8 z-7mGLSVquI27pFGpfmMB@{Xa^B_G6#RQ1G6Dt=Bd`-!xCifYD7sk;xw$_r`jS@+jJ zl2sEra}w^;(aXJoO-)Svlqr)&$pJ8(C= zKPdgso}|A|DA(S+xr?|ATA?k$@S5l-$`;f|K{sYyhlm1}sZ^1dG|S$^3YX)vu{B7g zd^vrEkOnV|cRES7eQ5kEf@YKrnKk~(wnh>%r+)rm+nZ8fc5X0y$hCQ~{(EMchpKBo z#U5R6Zy^^;a(sNUBpx0BS>kp?$NeA@J_^~8THo`o(5P+ph@IV?d*f?|uSdB*Se)Hg z{da5JfA9rD_nOzNN%j3<^6^q^J2RbXrL)Fcu{L@A@=@`*(>_kt(B^#U%w9S~foH}N=xe<(a!gI(hLuwunrqN(ef zP5wUL6B@=i9ief@gbh3m^WY{p@+0IjaJuo#h8!CQoiu9upEje559-bE8HiMigPef5 z?L4rZ{SCDD{|le6A@=Lw@R^>6Q~YHi>PN$_QiHIm+YZ~|%h^Rlv;x31|6z5>IqCC! z?k~_$IId#{OuBYq4frSK4;f?OZG&h=6>B)uqIGQVIO{_9QDk3kzGi&tF460Q$Vi*z zOxcPK9PWqwt64|e>9zB!OiqM=h+1?us=5-kx3aC!Xc<4Fv7&=$8;m;opa; z8Hybbr75|cw52RMDCo%)M|5B};xp^}09#kk;h^^zZb+R2zhN6IN~P>ZZKneV!Q5`} zNoq0;qqG2#t^J=ydr57)QE3a&xL&o4AvVL#RAXM+67hM#5^xR^#z^g|tVUjYJbjcY zL#M}8%mO*e>F)Jcu+(QRCFyjsX3l6;>T^H>Fb zw5T7S5nS`EvWex{M#tCL$0%|W~h%BFpVVTiOX`LYL3+{9iZL~`rTZfiZv6wA%eOICZe>uBXw+20qTvBw@6eqx( z0CeY-3Y5QhAhwq)S~5aDT#2n4W*Y{*3B`7g%v-o*l%oNi2-|R?FSKq`->_1 zFmdLb`>Yk>ClQ-KzU7fb!Z$FreL7xX$_i9+Iyj~*lDtryP2LE1N|e+rk#W@U#PUb5 zdn(%z0CC>|c_I^ws{+zmtC0+BnX@jZ`eX=tTGbKUJY1*uH*^eU&MdnRG0~AyEOZ3( z&}{|GS3AcH3xeqx6j$uWS~R5G?+UChp;)00kQKvn$c8CnP;(g?SMdE?bdHTLh2Bo> z{&w=&8Ol!H-ht8)0p@opZZ)E)h{sx!@HR=;4$37L7)n4kK;@P!!tdz>_7Al*^?{$-9xJ z5Y)?gQo@C2fsh>Vqaz|y2Hhe?px0fEf-{A^0e&l-aClkIv3VW2@TU>I7>!f=K7&a! 
zOwe$lCvAH^OA<((A_BTy;{}m`RF8l}bpdbyp~`^ZRd`$`Hb$rAodQ`jsXcfSz+|k3 z7T=AJkfOY@3G^zMr^AT-WTru1|EDG=+$YYcq?Kp{B+ncrVAB&Fyc*GKR0%JGZSx2f z-(^)Qd$V<0qKfp;>x$3=-Cc2)7ji2V#3goo)Q=npHipNvXuC)_>=ycqPIuzJBXc3g zAI1Nu_gVBzz$)0=7n^e?v>raly%*^bR{UVvl+7SLwfcpTR#}2 z*6<@ON8$*zjou7P61nDn_AaAI>Wq$G+s5uKazo^6SIuqQ zzXk0&Yrv|+2ch_V001;Pf<9ZHtn8t8ZdM91*=|FC=x*|-E6?XCo}hjV1q?H%}=xGfmOc{shV8d}2yXNq-p#oi^%Fjgb@wZz9T$vG6Pr9*TvXQZai)GlBjTO?FC)LxFouF1Y5(GYtBWZ&ls> z6@tJ&=CzI8zPY|Iwkon=7ttM{mHwLLQG6>5Ujn%d3(qLtFchR2sXW1Dq;-MW-aED( zXmY_}MZuVT!dRvM4raRh2z8ko-SGVmy50FfayVkqex|)>9sHycG3xw*>|ZlJG5uuB zyG_pyqiB;Zh|H#m_)_#*0)VC`)@ttwboQ);E`Rh5!dFOhbl+luK&S8pcK~)MgX7x8 za3;1BA@HzRIgQI%9%FB}XC#KR?E8Edo;<-pt!iJ8bZ`_+Z+_5!FU65s8T>J-(tjjP z2-qeo9Z+tf35ea^n{Cn+ly9&I{(dmYNkb5JhI@O0r|e)V&_V?hlWVXgPCUq61}2IL z)KJxaH7xZ?F@R~m;KmkQe#<|2XM@q3F#~C?Vd_2OPo$-FXZ8<3PdHXlf6nmFa2Xz? zl(olBKizWAXQ{8=E0cD8*&81j9 zS{1fA(vU$}yBpYXpFEcjX!GE_BS=>^w)a64XW2|~^B>6Bg&My6PF21A<^%AHjPk?; z3%FZ(@Ky;#h68RoCF-`(pH~3zt@g&}{ouTuJ?huU+EScTF)(lEx5SxQ_1|;O27@ce zM>_+)U+V_iDR^8iZKHAd^%J!SV>i&1npaL>-e?!%F)0@n!bjV!>6Y`+@k$yuCLItI zR68_@WJZfq(9$OCng=QnX|iy`gyDe7HQ~}B?qPnwQ?6f7EDryYW(h^~&lj$#dnzo%UPzW(H=z&k_1f#1KFv;4wRn|Iw95_I>M`00QJbP7H8JkMYzo#jHb0UmUR^Ub_!R4=&R!fh0y}c zd&n&ssA>W+7ikVCJSyF{p$v-YNQ^`-9u1fM{tNKpNNWWtF|X$4f2(nQbE(t1m&9|4 z^{jTluNZ^EP=SDK-RF1ry?Nr#PpFTnSGG5Lz0~q>I8r;=ETgvVhF9$AM>S(v(Ew7bkHk}`s6hk%o;P((!DMCNDam-Q4 zOtgYy%Xk@KPDkT)7SZ9p8bX`ckdZfXDa1cP-u)w<^iup|;e{5K?O)7H>+ZW#m&J>n z*{)*%<<_UH7S$u+I-Aexdr07Hyqsi>GCM>Y(IG`o{5~`T(N|D&gn^{a-j!(nM+E^} z2Q%`5rQAA|@zf#7YFx2Q`f7WZ7+;1mlz3Atww{v?L_@h5@ELhUOG13G_IIK0McErW zd3Y;P4-94|uKoZVNUPGhyrKhOKEDg}0a3h_J02-`rs&NUkCPs~T(V?%M>r4r<+GGTBDfOL&c)d-@bm01MGl+3Ay4xoUkrg@{p z2ToboW~4;HodM?UZUSWu$Lu6a;lKG9O25nEc#Dv*R2X9&^&bpJ3@cr(CnzK9oE=^a z@4{E=3|U4eQ15d!Q=)Cd^Tc!q3fimrewwq294BCf$cyS_K(`&upqj>j-w?S#N`l<( zqc=PA%gTABD6K_j`$YPHRT(lE8@k7yelkBkra5HUR(y>ML=X3>Z{Hr_up`sv?(L0d zvvv(Tugl%(cs4F`4!4EH4}Kt9tLSncs1U z^-l+6<#wwf)fscYQr>#G#e(%&{cSyHbhBitSp!9H4sn&MDo3sZo5aAyfJq3CMsM#`O?0+L$^Vaw}=$u@G@B31$xScT?)(tm)&&FG&|i0vVt5uvvt zo@b8j90c4PQ4t7Q%_2vXs(eG4ZUb-8h^Y3Hk7PjJ3Zg9phN%wX5|nomp1ejr1w7Z31%+6u9R;fv_p#U+3Zb7 z>|Mlp?iyloY*4fol`5Vz6~Dh4o?EZ72SE>884H4}>QAGwH-WaolEn)GCvgh2Wx{z7 z3aN}3w(YnJ$gLNg$MhuX*(;QtQ*3=i+=aJ88H&yl_@@)BOI-_BhQ)FdD*kjse@z9& zv7!En5OH{>=pLVLrUwnrv+@E5nPow-euW{&i#`SjaT>Z}?#^B83`#?&a%*uZwCj(T zqD>GPINoyr9sL8`Gc^?uOv@#^1AFq3rhzfmx>RO$)x2+cU1Z6PH6HuU`uy+!hf3tc znNft$>S5F|&auS2fxU7bHrwFwplrq`wE`OG{*W6OlWK$WWoP25f~V|RU~n89%pVB{(3%2%bcZCP>Z zv?gsyEHgFC5ZH1!cx=;CQD2?CAu*M>oNQDMve#~R^s zWQCU(7K>|#uU$lF9qM3OtLmC)6}m=2OxG<1jIS6p1MZ!8Qxpv4xfE;sG8R{%&Op9L zuTbnp{ghX*i*f;;Bysj@XD=W6gvWQ4v+7=tx(IRPu68NBAHd2L+zYN?ya_|IEGlv- z+osqiO(-9}4Y-TLm^&=(>`mn_VH13r@I2W@(VIH(o?{_e^&IB3>739|Wm8D~C9z|Wy^JLWx5P|jh3$BB zd*BE>lE}*pQLvljn|mm;)su)*djeZ3@8x1N#J#ijgXo)~`4B1|4R0&5=)+N7zl6># zzG|Ii18!vYs;<%xe+B`jqH)LgcY(ir$6H>VU1_?~%qJN4uaCM2#fKxGWuIpc!;@_e zj=&hrv(^nrjUZnz(-*BlLd6}unV`yqr7Bd$uUYN^L!hS%0J1GRn3jg^FR~e`xiCA2 zxe?R&M~2VV4v`PIjbCw@73`u^qHbSsYx(jKZN`|;ZPOy~S z0c%qZ?Re=LtuGT~&BUk--80y$(Y#K!-6#Se!}M4$yB_4o$@lqzYGxaM=?Brq_@FJH z-jl;01dX43h2Cg;_ZpC*>BN6FNjW2P;P9-_x+HpffE3cZ_+IL`v{heoc8Xmvl(ZJ-6qoFwVX;z>*T3d{5MC=q!k_3*Np2;TeMpW%Z&f43Qjac}jL2$_U zXb!!C!nGx$Z`ggndp7$MM5>(9t!_w`kIZCMfPpfVn+Xg|x>X1vjlBjsPz$^%(KSMM zxp6(vPFfR3Y086npfru!Zsv^obr96EaIJa*?oefsZIras3F{_p?SHiuA~%)~P(nA! 
zdB#zVdF>qiu*gUa3EJ|qo}Lz53bJqNd%0@L;*b=$K@#$Y*|u~iZX#Ke=bm3#?#5d@ghmqFo z<#1eEe0=D6l6{nV>-(_H$nfJmPd`~_GhRWnaojISCPeVtUpbD z?%ho#-eO8xTUb~z6%2sX0SHjL+Hdn8qvbHN5X2wCH zMe@QeaMr#>2y)&vVBIo;_Gur+_=U!GJP|EHD21RKwZ-Dmb%?{rF|?-=OcliRQ|Qr$ zQdY8e9b$Bij1Pa9nBc1kWIqQlmHC33Nzj0UXpmz4MmHqd0JJaEP;@(}{P#~9eWcXc{A|4313H@%)@!!Yqz`fPk zvj5`OQxiE6w53nxQkYYW#UX1F?FFz<{(c6R9S98D`3yeQx%1|e2bEr`~u%K6o zBbqpip)JF%L`zgDq1r&PrMiKIifqW{6cd1tbaU|;x-q5?$IYBdsEPFWG~vk&9-z3f zIERqVKk^#w#b8i0i5!dNwUb69A?ED1t=4)wE%QDC6q$P0Sn+d=R|V0$-jIg1oJROi2f z=1^Y}3YuV-vc;<%q?%Yj`|vw}0#Ol|EHsceAf3$s>=CR8g*6HVydij+T)b=uphvG+sU^!}|BVAk5Qdi*Q! z44uk#e}{R(!x{IX{mJTrm)NqqtSp(ADw9IQwhwWlt7%IAhM8hH9uP)ISP3i+0aafm zE3CPjX> zuN59;Q=wqnnq(l3%i$Q*33!Kk6q`-r6~_S-KWlXb)&vPr-orbXZ!s(Ghg#f%g|c_X zO~YnYe1!!G?L?a?0A-E3caTOI-tg^8K=aqJHXo~h3iD?zfPKTNp_Q5HUDHYQuaqz(We={7|W^yT4+ zF&u;U`NTw>CCJP3LyyYlu3a&bG*$K*N3k~#$HZ4N72F)R%4LyQy51$0NNH1~3(1~(1Vsce@~^mZsg zeN2R21K9Q$JM{syUVEFY4=azooPWj5smCu)xT$`I0t<_I#KQX83fhB6;Ly%b<#rIW zGUSs;ScBDlqZbnv+6-|t)fzHG{S)qC?zP(A1=u>xkJwINzd5B@hrSq>7jlzkp!3uS zK@!Dd7}Sv>CeVNtA?}hMPZ_sOkQPkJjG-;Vwj%o;L;gUVNdw6T?a_X1euz`2gF(T5 z+>~gAu2b4@HeUszdbITnQrcK}JIxYssR|%f$#4Lja`hCwE?WdrA(TYBh^y2#5)m>R zAsc9RMR1m1pcqXr&?N!_r|li`1A7x1{ji&qLRq#xuPm~GvOY!MPJK#yLwtdnLM2kq z_QZ9G7SYXsl5|w8X2MPx!lSh^PCCG}SD|OXxfnYFhTTxq>h1)}JvyxW4)s>@({&r^ z?;D4c719)k9n4gs*%mi}Stx5f(AI}r(*H_78%%7;ngfupHdki_k@~;lj9KEPy3LV% z8_oqH7wj#k4riJ=8tnl+lw1nrN+2D^;al`NaQu=KMj04YvmgCyx^+z4+N^)PuTHQC zI{+UPD|>Uo@JLQYPMwzpNU=DhG{Luz>>^%Zzu+$wZF(<#f}iz99D#i5u}>z&0KJM& zqqv}S88h2ho1lje>!fTOP=b zQKeEAKOXtCS!H$-6?BdjUdL%8;7KfNjG(<%C)R$Glt{G{p91qsUDwRNNPL%v`k7`Y zdSh%2Aw_2hj{Hz@j%a1jUX2f{`vmw(Bt+qqE{@)|R^*}^MDSeU8hE_x>Z&+N$L*_} z4lVuAaJ`s2=ANqBf*Dox{Gzwu26w61sFUDpbVx%Lpz71gDx)TB`c(7Id!`R$h7^pC z7i>89wDIYGet+)TzunidC@8~K@81$wbp%#rfY)b-ImRM$s$xLV?*au3eQzwG&xD z#@ag=Ri2>l(;ylkA4ZfdbkkFCXhnPTdL|GqeQaOB+OF`)a+?MIn#zviSKXi}8nJF0 z$9S!d2z&(Wa#!7Fy_KjC*wfxOI8d~ODi_EK?p4FXU8&e%xLdL#N3(;$dF<1LH-;Qh zX90A~gOn|FubOJ)%#!3@@rThH3vn4Lh{Q~YHK-Gtx`5b;>KTi~t6IUV^Ig_}oS!6& zzlb=3Qq~4GjfplD+OF|o_fzeCl7x;*(h}6Ki$nOSZXD0`xXdjNrCm8I9UYliiFPUM zy(QLyxCBZuLuO)LU2i;JJ}{5N-zJkZXR)BBLCR7SzYe+{^X$-nQM>zSeCsxro%~iW zXl4W^Se*7L#?(+=e^{Tl{Pwk0CoqOxQq!)H1OiXag?MWWsw{ap*6)~ zpujs@|96-)`v|DaZQ~L@HFK1tnYXeWJ4Lg2W?sQd=Qx~2kAvxLr^Xh&sTh%n$BjC5 z-vV`8(3vXqcUCZf7LT^`YsC^8F6a$(xUmi?=exJR@zJn{Lm4OF?=QpF*si%n*69d)Xwk4@4~h0#>-h;{{j#%n@;ZeYH8!5UBje?Y_>l?r7}~0NRJ0d}`}q?(FVL z)=f1BxqYV;^5m;8Jw^?_7A&JIkR5EHkal=S)}VZe$QUh5P&OZ80gLl`xNMkP9&vRf zZ60s@;ifos{;6<}-gDhc&J%G5HH~OVjW8zL0pu{0i6iDv>_A?IdtzNqTo~M&Lb2C% z=*)qx!(DScs!>;X{}tNZ?w~(NS2Oaem4MRp(e!2Z5u2Vg1h8NY4jb!Q0lZBAYz1MYHyEl!8dJX~iCl!~|!BDwmF5t*r#Qb}3i19O2Ept-@@zcXgIn91=J>TMVm96auqpKBzGA zM2~J*#{zgKx%<< zJc8oPTBK16U=WRC0WxcaR8xwTu7_^LHpXaZ6H?oDs}J;RN3ss4B`_(N)=UJ01%528F8jb zvjrS!__QjJ&ww(Hy%dHcv@WUz+a5X&dUA#hIrhODe;MESc`=Ko$bE1jLssq7=71AD zsvDZO3_VN^f%HCBq*5>7Z5c&QXmn`^XqxbAZc5l|L8wP{NZZ6Fpd2hXJ`C=)$`bO? 
zJ|PEb+@SLXvdIh`aq9M%p+HW&CdJ2uZtoH~jEKxXG#vG%Jg}NCq~g$KA< z62$JC@X}U%;$XAq9?BEgzx@umi)h5Wzi5^d0Ic zdHS-Rnss{OpWY4+YC7g$dLoo{`X7l;e+!w{N`w#bvp=gjrb7Dt2;vfQq`()}$g$7V z+Hfx0I?RFt042(stI5N3J~EyF0A@8 zItu_O(rgGo-8t=aV|2YnvfvH|;M_2Zs1BAUvsZN#LmO&gRR?}4%2Z!Yl5~hxA|vU- zc$Y`WO*t*SUTlr_wh`?xI~mQEb*WPXGQ7y{<*oNV1O}z<)f@+H zBVWQ2ZX~eu`ooAH$ap6KX(+Z6%A9?yhc?WY&ZC3Xhz+D0&n)-qlj06 zGJOkxYhX)@SEW2pLbROX;K^3^$AFOc!M%_$G~DM|^8Nv5z>O4^?{&`Ja(jm}kY5jy zdaD2UW`oJ>{F~=F$9gVqew%J2(b=FMmCID)E7`sIlUcfT=y}cx)}+FxtryRD%;#Rj zet61BBjT$m>%O`xu10sYWQDHL5m*}E=wFC>Z$1Fu(1q0Dyj2Zm2S%wXsPDSw;#ey_{Aqs%z{E7!~4ot?=3 zO0s565kGmNV%-AV`vtfC7|2s6-RHbi0i;JWEumsPU)`<7qGAxREzbIaQwac(_|a{b z%@|iQ9ZjG4N`mWZD-&+eJq9#K55nh5)V{e?4nr+nLQ`K69oNoyuKhRgo$QdI!RJ=` zlBi%&jzB7G@KAB<)kr{Am2WY+GU{qv27muZjJ}}tYLQt;GB^rOqx|;Z)6(v>c}u`V z6p>p#7x1IW1B%@%|MiuwV=pSUjD5QNi1E0)6#d3G_Rp%*5nJvJmvW4^PxX!N=`kt2 zntXCy-4A^&=1Y#~WtC!9@a<&xjLE*R&`mVk&{tB{Q{YxGY{+WQyLz5?O0M;YKK+Kg zdbLPnm{ArvcCPf&_kFt~BW-&oZFRX<&fZzGV*kvp zogkZS+j~Bo6nT}F9^35hpWLx+JScJ@wj$-zxXDh_WW~lyr_NNMr-%E;|1bOX(ioMS2dbj>efFnq_HXu} zUF(@BAgJ(p6`@4qE9!0UA`f;4eRHyVW9Qjlb`cJuvFA!!?Q5=APc`~ixgV8!Gs+^_ zO9rZoMNJmYL!ScFVdhC2R2Ob5h3Bug_Z#AjBn4{{;vVEMkJ}bvCJfF4 z&NqE!Mmxqz&=Jv;Xknxu6)YI7;^iQGD6(NPI#-|o?p20Im zcgvNA*JUgkSN^m?v##9(WY`+$Ix`h7m-#9O*niy&{(>v{vl{<=*Q(wuGTtypUL6+d z89MB2SPZiKipuQEcqvsDM}|nT+I23mp}Ril2=zBUAN{S0>8$PUzT_#vkFC9Bwg#ud zr!s%DV%o8u-MY5C3flTt3+$~gzhUf{WV+ivLf>{kUX*w0Li83lKaJg<3rp%A_X(Fp zM)|*K3=VzN&;Ua}-vsX2%MEACFGQE*PFz~M|2}TZC`4@-*y8-DaC^J$a6#$ljK<+s zMWk-|qk%2hGtmHX)Kx=Y(h(m07DTp``uulKfeWs18>x4fm5Ye*{&Qc{%djaz+#jr@ z{pUwyO)oK{OQy|kbr-#(cRaJyncna^&0zhQv~t~Gz`OmYlU6Fv?Z19(kqlnEEP?qz zkRDq$IJ>2Ez}an01-dnANB-Hzd$%-3SEDe0Y_{&hx}+s%AElkGzp^G{?3=#gLreK^ zl`HRY$O%rwL_$*cQQM(Ht{OmClJM$fcS!qc8R4L+#f?_aU%cWvl6)*}t%z{c?~RQOUA)CVh{D4sp}nDtFdzwm*7!D6GG`&6-&`k+W05s19IIm+kh_4zw;mvZ!-sh39u--Fx^aK!^+^+6gnj)k zxy3cF_gu4<_($&TP^L#ew(-Y-C>f0(%l&(xoSzMM=VG3U9Hzouf+u2!;M>I);}K0K zDDC)UTPo%lAho8fCE^&3w~OVflp?EvVb1apv{TvI=wH+Hu&gaOB=YK?p$AtcQg^%c zcq=>0M_vjHVIXrR7ZoKx} zZR;ZR$EgbW0&Dxn5o#L4!r*-|F2ssts@ElK1Ar2;xF2M_y(fWj1^0jU z^0#C^z@bczL-vzA<2g{Jswus13mHs4SSiYyl235>WDO@3NBo~&ZM06wLxG)3Zr7+e z1oN(|XP3p@h!udB&a3adJ@a#?)II!;wEIoNyAE8e_<#Bh|Ns7V9TjHC+7Clh7^z z_Zot%C$8A;Njtdg&$|vPupEwm!D*K`Z2f{0k=?o<1ECnxD zsRLc@aSlKzeX^-4o(bq+{lIqp-*wSS-8YEW$M?1hq5=A3>jI^`+}A=<;AnF0I~NUk zpX$-2XHP13=(XK!#a+JkKiYfpcqrHJ-)UsBHIgk8W2=-hNJ>P=c4diD_QGTxV;M%m zP_~q8MJ3tyE!l1eF-Ry|wid?HawRiS4aWFA-0%I~^y|Ohz4!Cm#>Xu0%sKD#p7*TJ z^PF?WFHUw>oD|`p5kd$lO2;DaZ66l3Jam*WI({IqIJ)EEsr%Yf2yq}+lsyf=Kayxg ze`^Cl@^U^1=e=eEcEil1|y#yQF{zlsAR&HxPw}3IT0V9`jkeg!Rilzw02CC z%0YovQZV=H4>kgHxrr`RYx(qH_eU2d;xcd55pg)h=pyHvK82_TT*bNfp!l4_$#{@ik#rIL7I>BWr`h zZ0R}y+O@Y9?s9cz?uzb2n8?Eh!CLHx)Onu!D^OzxiKzrS_mqGnWp!yJgR^=Cnz#Xx z{%haETIowBeOc5;BRkXWKKdY$v4uo;y|&1ToLwT0XW3L6??4~_Zd$j{cIM_(NBDfP z)+U!b)#O3eFXQ#f?m~#$?Ypue<>rx`J2kA~s1s!@PK#$nU%#B(0$Q>%=kks?!R>#% z?+d0!=>}A@=`q#haSX5D>Bb=83-#Mu@~0cPcT!_6WxtrW8^NRoj)PseT9oDUk4uXh zF65ye*)^L~&1KHH&$!!*M4_i{%aVS8QJ;+4%QoUYcB=Nx zv3&<^sRQNdMib9@7`~veMc9Rl_61ysU+#_S_o@o3<8$B8OU-GL{yxQFDQR^yOgJb4 zRA){lQbZEcRP(bKvUFmui(>T%XI0Jh6i^Iy*t&7H#5u3m@|AjSR(DeLYc@TY)W^`9 z_OVFd$A_vlsm)9KfpVfOcYwnwu@*~{LP_t;$~_q#r0fGzGoLSEmFf{0#PkesHVHGJn;WlrUTArcFJzezH1X6Ifbu zOIeGo0m|^iUY+JQVzF(a{6mB}iRiCY+Vk42Q!K!9`G-fmofM-YP4QFNPuwdaL}i$8 zEVHml^i)iGNYYr7P{AE;n}xTO0jl@YL7LjK8r^8d67yUb}n;B1SMID0&LQTSl8jM2LYB21MT;!od0S<)P*KQFi)r zlJR9hyosd%@)KBb@{ghvNd-yekC~JdU0doIO>4h%c-V25FJ5I-NxK^CWzd?(%_22fTGu+_EgT* z4A(cR)Z*t?f_AzvxqERw);3R{z5Q*v?T+W)5z`S9nR zeUhvmW9V#<;;hViGn3@nM$@0zS(5l|k+qj!KX0jCM} zb*{?R6ViKoS`w3Y1T0PRtx8rbaNOD^FpS-mNamhZ!4dY&w2Vyx#X3 
z8cDxoK;xHe+x6)y-s)iD=bavJgAX_SQ{&vM-=@kFKH5w)MRNPqXjG5zd`uz}d&TAs zEX%9hw9}p%eS|8gDZG9(ZFBWML0veStcnaj0=G(F@}LEkVs!dP-R?;?5O7tdRf8sE z`Mc+CP#a_YyFs*q(A*i8@gzWVVk0K&!}CfNy`a3>A_^`q{eHpzuoI1f8&zE^XS*Ou(40WwXHCIj|W zG{wq{wi2*mj;HVIrR1hQKhx$}Z1!{z#@hQtzY03@<%orjVCt#|q5LT9~#^9CRf4 zW4#J+r)EP*)Efh&h4M7`|J(zcjmTf$C<@eb%~h6iD5LqcMdj#@X4fP~sWJ4>h#Lp5 zV&z+=uJE=-b0j&C6!s{1fsU70+Ph^$0I-yyIbd5uR|^P9hbbAl| z#h-zon7vUFK@T3U84VMX%lz%UR!q}klWseR>I;kz5w0j)kP!x`-7SED|d2{=Ku>21^Nn;}?%_G`%KfRh7?Uk5yQ z_h()u*GRw2@(k-QGFKUjM7%AHQh_fCwJga7@PK;l1_G@?JE(l`!e#@+med&7K~-r6 zYcInxpDsCl2N?zG>Ni1a#9uTwfUyko7M2Jp1RCPAP>wL5eBymz0!a!Hk8um z!1j25;hp$k=AmwdG9SwHiF_Nd)$!(woKzfZSDQ!k=4f#ps9gG?@lG^pMQ;*kFpMi8(A0auy2H8WfA6JCIb=K@LqGEfr_WtGHq7v>fdw(Au*BSZIje| zS`Txn@*?5j;uiV#0y;L9Z*actl=!~$I-NxsJH$4bc|6l#JqsHoHK~td7P-HaGEj67 z-Y0<=>hC%Niy7Z{k&H}pjbb7OB+Pog9B@XNQ6HXpQIyO1ajOM!PgqQZ!tEQN z8MqCN{k_GW(vZsAd?2Pb17)9O`1;M8;CX>KD^|2DXTgw~{N=c#LZj4JzpKp9Bd(y^ zBNHrpPfbB*14Q3Zkj;!akWOQEWiWLiT=OV8I@Z}a19Z#p*&!+TRr=)>NWA#na;OT}rW368-s z$b;H0Gd#78kB)@cr%N1MEDu^?;;L8fYR_HQk~ifTOlIXYnY@vYr+k^BxgfL^jM*i& zR(Tu?dDKwnA?{L;m=&h(Q(tK|9&8G_tm;;JkI8Q=#(pU7+^883JCWPtp8U4eP;Be1 zCmhaSUSBeoYS=fF(zv-U;bY38vf~ToCF{G5%R&TK>V4L@Hnalrz~#4etIiO;R1+W8 z%(jQD5}$4bAz>h+4{Htw(BiI@jS#f50*BUkfID- z%lU3QSbUYEL5_WTm1Ja5+{eL~syOoS=8e7*3nKSaO&`<@R~rndYX}60y)CzL7{f`W zjPi|Rmpr~`I_S10<*AhKQDV5@O~&nU$1xRvjGLpt{U5p=yfGA^XDCc4aXfSZ&Z z#iNqb-Nj9M@nVncxsK*ZqFO1&hL8G2 ztfyPfj*a&YA+Cw;wGww1J`k){SgAL^jn*jp`TC*cdwA6;<+#ngk^8mKRgbwz!nN!; z1!iMYzP}e!*mhj;)wW9$IhY6+(`1$%w$DzeA^RmTN(Ta;6etEErh4W|yBlH$X$QOK zrp=lA-btr+-cV8VTSokhlV?83+&c~U)O7;{ZTX&)?eSjN|7Ybt#Wx_{+V!%mVAw2T z2^oLUpaPwcmKfNbY$({-Q&t;Whig{&CBAE6Rsc7>7gc-wS$nfmgk)i9!Mv(E%Dj*_ z45lt3Wy6vGb+`ZSfuL(D_P2S8iukhf;FryYznp49J$2Q^Qet$V%80LNZjFy^!?+q9UjQ}=p zdyEIhWH-@b1}~L#rXpFf<3(9BLW3W8iBI_%QrACWBy{XwR;eqa?0#j41$CiQBMRzI zE}LyPPTXYpkoCN43hwnGrBU7NG=1u^_d0uTmpy!WahLilpW&TU?{*~TFSAvbSyp@x zeKwwqL{J_&&7YYpa&!EKY!Sk4Vbe_AyDTM>5gUI?CJ`SPeDzyk|AdxL6irR;bNj1Z z2ekxE<#VqNhx)NPXnyLLjSB>t^LM=p+prIqls^oON@c8*C*a59K<{q-lluR9XQ_#I zCt%$PwkI!mV7;wZ`O{c`J%Yf_llQm)g4ckdIo2U40H?I|!n%25WLBSl1l&rWxxWAa zbimJq)&ag}T{C#U!xHgcu6B65>*~3Gibtz`LG=JrQW>8L#tF#v3l9fAbcftmtL0Ce@d-cDF7!5i3a z@FM>+^2s72UAF*D@kaPZ`U~=`VIeqSaahdSv)s=RtSrX(-2%f%{ZGTN*!I(5e1`>q ztUbgaH0qYX-3SO;{TEnwF;XHB^lx)ZaPq;~xu0`(lX3CDI{rwJADxyz@hoKjPRt)& zm>)5$xAT9(Lm+$hyJ7sx>i-D6UblV^-JAa%^dFkmSbrm6Gzj7Ze;j~$&K9kH`VTX7 B7yJMK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/test1.csv b/pandas/io/tests/data/test1.csv new file mode 100644 index 00000000..4bdb6294 --- /dev/null +++ b/pandas/io/tests/data/test1.csv @@ -0,0 +1,8 @@ +index,A,B,C,D +2000-01-03 00:00:00,0.980268513777,3.68573087906,-0.364216805298,-1.15973806169 +2000-01-04 00:00:00,1.04791624281,-0.0412318367011,-0.16181208307,0.212549316967 +2000-01-05 00:00:00,0.498580885705,0.731167677815,-0.537677223318,1.34627041952 +2000-01-06 00:00:00,1.12020151869,1.56762092543,0.00364077397681,0.67525259227 +2000-01-07 00:00:00,-0.487094399463,0.571454623474,-1.6116394093,0.103468562917 +2000-01-10 00:00:00,0.836648671666,0.246461918642,0.588542635376,1.0627820613 +2000-01-11 00:00:00,-0.157160753327,1.34030689438,1.19577795622,-1.09700699751 \ No newline at end of file diff --git a/pandas/io/tests/data/test2.csv b/pandas/io/tests/data/test2.csv new file mode 100644 index 00000000..6f914115 --- /dev/null +++ b/pandas/io/tests/data/test2.csv @@ -0,0 +1,6 @@ +A,B,C,D,E +2000-01-03 00:00:00,0.980268513777,3.68573087906,-0.364216805298,-1.15973806169,foo +2000-01-04 
00:00:00,1.04791624281,-0.0412318367011,-0.16181208307,0.212549316967,bar +2000-01-05 00:00:00,0.498580885705,0.731167677815,-0.537677223318,1.34627041952,baz +2000-01-06 00:00:00,1.12020151869,1.56762092543,0.00364077397681,0.67525259227,qux +2000-01-07 00:00:00,-0.487094399463,0.571454623474,-1.6116394093,0.103468562917,foo2 diff --git a/pandas/io/tests/data/test2.xls b/pandas/io/tests/data/test2.xls new file mode 100644 index 0000000000000000000000000000000000000000..dadeb7c2453afa8ce6be9c064fa210647a316845 GIT binary patch literal 5632 zcmeHLO>9(E6h8OOOnK9mcBZAMV9QtnEwmaN7h(){_^C<)LQ0K|F_Gyoh00(_852z; z4k|lIG%VbZn8t35#-EtLLKCMu7a9!-3tbg9hG1kc5<^1!{J!(vv(L#)nK2CpGiQ44 z&w1y*bMCq4=f3{G zU3rFK9AX5>GhYEDE$a1m(U$6})?k5QnZz$A2^o@?(ND>1IPvOXH9RILu9}s>d{GVm z?Re_)7cFo*Tk(0&{>`x{u_Z!z-Hh|U<mRp>|^S!z!(z$PCTUrhm}ag7}Cu6`9$;!JXfhlLnQ* zE|?_Rq;y@U4N(hh)C3#3-u@Hi;FfIU%X}Qk9M9o8$!-AT6adpPf`gi3%P6R!a%z_KfWtd{OCnP<+?`QyOvPB%aKM zH|vT%ZQoR0zi_>!w70e0i1pWgBm6Sh6Bzeme4*zA{;myf3XG?v8^!VC$k49g#x=mATD!ObEBE-r}K z+!$h4TX}LV`aKirsmbG$sAeB3P??wXog?7hd?2OfT?iHSb91Coy-M)pF7mjWEs4Jn`dVg4ahvx<3Xm&a%zH!GQHmX+mc zgnU=B+m>Z2A$DTUl1GThet8A^KZ;u~zx{XNzX0QUCZyzW8=eHEGW2@#neeOKKXcG0 o-YX-r6B5#@Lq`7tQJX$n{>Au8Z?H~O9{;%c#1!{I7VE_OC literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/test2.xlsx b/pandas/io/tests/data/test2.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..441db5e55e666571a94e9966fe2b184bf702faab GIT binary patch literal 28216 zcmeEucU)81w(tp|w@8sHO}dCE(xe5XixeAGK%`6WK>|UMA_yob2uM*tP(cJ~(mN_5 zRjQN(6%m0%+7UwXoj7x++?n5-d*|Nw*ZU5V?6b?-YpuOk+iRn_F`Swcpatjw01yGx z43o9IU;xlb4FDVfJ(aDFuTPMRPmsgeFh7?-yF(~%FX0E&R8si>71aO#KmUh0PB}upc<3+1i=bIU*PKeu$*Ok+J`5%-Oxuoe4dGtGu4w zX%pR_y;xv(o{RHyi>6;GyLYxmdSVJE-QvQl?^rbO_V}xsC@-a^#=8e=1P>Q@C5n4G zt%u%odUuu0N(GU=znV#3cbkP}>w!%IEx6$x)FI<0I)1muG}=TGVV-y=d15}cCRFV9 z+sd8k7jGXsw}+Pa;v;`>XjM=cvTG(Npg0KO^GW<7ikCyol@ z@|!d17H&N})Vf&vOep1$+Ljo#$N`Y|X0z*#f8i~SX?*_F{csMkmx8{zGx>|$(EcKm z;eh$?gf!yK*<2P#NT(qgWP*g$A;85eQ0@@r^ZyIy|6z&z4e6z+CKTa}*)n|JmK8By zC49|1$7j#!Q2H47$z_j=?BtgrB25~xdOC&d=dKoM%rC6=WpB71)(9`&Q~jJOU60{L zY0!x&{Z`N2syaF@4x5;!=VEh)Y)bDo-fhQRKKcEWS9Qw4C*xB2Ca;iwf-c6iPFVGc z^Aimh4u5oVuQQl%)Ipz14^=0QL|uCDs^)~&_+$PH8-ojZPRpd zxyhAw516y(-$TD0BQzAe9BJ)`PusJ8*bJ9l9DEfmlKmF-NkObKVMLw%GSZ-mX!byQ zO(pOCg3zHLwT1d)>F+)sY$1(<$;Th1=SF&$ z&l<$4dZ?8TpogKn<`ezX5f6?(Y2a+s^xvvl8|U zE|~HUl^p9Gc~#P<9V(^l@?C0p2>#`$Sw$`5(9BG<`@_3w@f-`ZYb7tI`Y#%;?ap&f9Oxk3`j zfHJnR>S61vOG&p5zqc4V&qUnlJvMpa1N`XTGo$fTE%Yg?y|um}O~JKpp1$U3Rh3exbs3XpYx$iA2{iTKxg%9#$N~ za5SDHfre*RxpJfK{kzlGdotNBdhguwxEpTHBmJr4QL6S?numKP?TL5%>~@}}F+LV6 zV(9Y!{`y;+$m}@jqEEP!*WK;CJe?Qqp3yzIlC_0Mjbpu60bmS*1&coK`rbXF+mWVl zXIb0(axE-ALtH{E`dQv%vr0x5by%i-;A%{x(*4%3be%)b7R=`NwbJBjwSXzHX|D%R zl`l`SqXqAnUC1eD0*vv~7FXvFos$s86A>f)zza~S$qwbD|BdM7vV{WH}c-d?ti+gq!e zYsp&N?sUZOh}~P)=6eCfamzD=PXW4aT~G7XGf}Mv^Lym28{c30&hckUU&mJ6^AuWg z0`$xI)6xgJI|jHop9u=^@No;ID2wyS+KlZKPv~kW{qwQ2r`U}4@2?9gqKjX4hxduO zK73GHi_<^fycLG;+vHW-Xl)m{6*Eyq6?3*&VxX`6d(hod?F!T-bOya_<4wnEB0g7iFNr4FFfATM0ChL=rrwra;c~QLH*cN<9dvENp5>n z&u#|y95bC;XU@$ocYl2hxRl(0nGamSJ8)rMT+6dUWK8q_8GVNl@t$mS05Bs!arl31 z?I3p-Zx^{ApGPR6JvZFru= zD@kCfHV_lj)jof-&RpNdO=Is8t|>%I-)gHa3!QzDpnOjE z;#DopTiJEIx+jXVWn|>9IUbZ-yDf(I>D0TiS3QC7+I&uQfdBcOcPCsT#Jd$3%&$6` zhr1~}5Ydh(lOKP5CzMBB;>4FLH?QYfnCxSS77#qEc-+(_n;|IP`e;;GEV0Ba`CJrF zI-|Wpbi#Kgi<+lT=lV{@KccA=fAREt?+5uS*tW-+XKv7S6~6N+^LHs8@C8ox&8y~^ zHk^@N=LJ}G@FSC_N3EB9&f30s%l?vdZ*2PMh>Rk)!%&NoTVTPvOYitIWi1}TCi1F} z5%pb~YMQ>xvAo1B1%fu74b%3acv@!y^s;iN`*L3wzj>#7z~Q)2*mB6xWg6$s2^Kjt 
ze2)JPv!;%=66ZuwF%Lmk_H{K6a_#{Wz~XE*Nd3s;gucqPgFf>pVX0A$Ig_i#zF*Cy z7Gl@W8^&IJ_?Wjb?_x=OX0v5~OT-3I`tkO_B#RQ`(euZg?K!jbAfb7dpo0#45*`9m(u8x z%bE;Na&EjqA59RLJG4;R6_XK_efcSS$F7n5rr9A0m-m|8am_ELRQ!)mWfl%7t`!E= z`5JT{2~5H2VviWbAKw;03U|&=N1PC_5A~3$`yd*#a%QA`#Mmg0^`34P!*u?UPM0rh zyvLXQo`)lbU$)!qTiD98dO82eq}33_%8m%J1%nSel!^! zXMeVrdp5(ZjHkY8=H_@)%|uualk$w{Yb!Go-WTG1Gq&wi3SZmv(~riPp5E6vL?8L( zL@dL;_f{7~qp2;0%=d2Wzt`T}#OE&Vl=(=|_;vupg5J@u@5%_zwk$WRtj8AiOdS)m zXh6Hijjj7dzLy%|dD8vqWVTt=!@UR9CMNHklm<5?(#}<%GVZrfe3<$0^Bar0QfZ}y zfc!>37Fs4HN4Mw;Na};-9Tg5{EDHi}UYf7@INgrx(#4iDocdH9EY_}bQRf45#&~Jg z_(y4lg}jjdPcOfZAGCrGy;W*2?7|VPQ$L( zeZQfYJby;*w99hw6Fg57kBfVa=2|B{G;|YGCR(41=}n}enZNCSpm4HHzVe7dd8K)U zZol)gL)lnRrQHp9u50DJy`KCx_ml7L-E&E~%2{Fm{R4Cj3J+d9PqCK#z@eur$0v9n zmK`6#jHNTJM(P~EwzqRuMMh02_SJ-a+#D?(zP(z-{5IJsU0-D>eLF;=T!l$_xA0k; z`kgoM*~S|Fg}18-Pr<~1qI=!sqn7IdfHJ+Oj=+^YheB!e>rZFFxDVh4Vus>l?O;pI zivl-g{dAbhub}HLDv_B2O31Z}ZozjeW25sYi=D)z4oV(izxGwzR(>Khw(?TVy49V7 zXPo69J3l>HHUSSXv~t*L4W5pMb80`%_DQXVMXTjtRst=wqbp_#ZZGvMI(H3uc+RcP z)Qw-HEjC>i4>tC~=sO|z8)UcdV87U#rdr3)^?v@+4;wpz}aL~f&y?q?m<0=9;V zx|>1Q+XtkOad%tqBJ4i;oJmTlOh z#C=JV1|MSWcc~dyQd?<6uRm&*_jQitG1B=kOpJ_G|@m4uzm2GnSt{grG1lmmjV{VVX(e z)Zsm`t1%?H`#btZ|Z<$yO0O9``19Hw3Jc)1Jl2u3t>iFEA*# zPf93m;|oX^xwLws-nvZsmMN&;P*aFZiHnoFtvmM=+pT6jbV`5wYy{C^ut`?^)fps=)Z_h)I$2)5i&5+`SiVZ_+Ct@0kI$fE;bd?A!kDU!N!3;kTp9 zNlmcz?!K2hYU27f*kXS0^j>4tPY#`WlXvyrDKWI_p37Ggj#itCdPDI4>K7izoq6QU zwAwrGxp~)}MORq1Qfb0u94-ICmmcwRd-V@!-Pf)yUZC&sl%6K4Jq8^@MrkhU2GW{3`!hv_g!ft}KRc}bX2YE*{gsv-BuIkE7 zr3KB3@8;S^x4q3|D_p6DEw5Y|Cfu4}wqzPzq!kVh$!!`>HdQ!Hm}<Jh-SxlHi}7 znm)aulc#YqKGlBy{D#VOzbugQ_ zR_x{Wb@CjLE37l`TJ^2jgg3B~O7{JD)xccdkd>@@R!G-*Pi;(?N61L4f&?`cb$IsT8K(o+~y(TNw-x+ zX~jxJkjRHyYCJ-pt5Zi+&TJnUs?Z!vQFM6j8GAC!LaH=BEyqgAEl99v?QW$y({|&5 z@fV?-BNeYT)yfTS2gK>GqwC zBa;#C`ZY#&$-Z(vnuHCcB0>Xq6pEnK8ayct%2&mC29YcBm+z^rA@P z%cGtl)ZvLn4HiL?;wMDkScJSic9{(~F!M>>$@ZiXF8VHqpFF+pDgzG-yg0hmBYp$% zvP+n{iJTa7ocV0r4AgqhS)I~BWtVAut9-0xuMCPkq=?Vpk`!A<4}*Sn1m9p|l{~RwcZC$+@1HBfO>oyzQk~hS zB|?d+pNL!jD(j@iR|Ci8$BpTg+sIejn| z9&7ccB0SD{Vo~!o`8R5b5h_z25nxR-VV|%m08oA@$B53pPUZo=eu0oOl?=?jtg5^+X&N5KzRUO=n#?*U~+T{^wTysKS`OM(wFl6`)70Xr?Zq{**|apeeZvI zV{�bOHbvB~Rh&9O&c?l~Djd%mkOt%jza|S>NLgmAhvfEE( zT}t`lPi3ngea@cNf%;HzWp#3QbcV`ksC@9^Z|dECQ~n7Ds0Recz{STUz{5$H!V8GA zt{z@4KN^1e_>1P>^!%TS-d@2F%|9MN=#x3n^Nca{Ek&W-J;3nv&*gk4N4+ypSq!Rw z=@)dIQZE6OzXp4r(}v0i0Dy|!H9-H|PyMOXFT0<$gvwk{y|YJ<(b=EtFZ*0HHG|5~ zSgHq}zQz0{fxzjJt~wWsMt{zlA2$cohgAIR%4?htYa)wg> z1I}f49cT+u;J^beIi4_w%6w2cKEU@hWiAM3_yZR&1IkcAEQ50Mnkjwd7qzVSp2z< zFEuyyVd|sMuNw79Y7J@?>SKT~HIiD3T9aBAsy#-n_6r8)zrj-&a03=0On-vo4+KLq zQ}D0=j^E@9milFk@YMhZ52yYVJcP}!RQmqf4#D|_VhCs7-*Ny5ggEdI>STY*ryg`8 z&o3>)@2Km}4>R|m0kgm2FU<#<&ou9#&k36MzmBJwqInPfOaR)@3V8j%$Q#1S1W$uk!<*nQ;Pvn~fH1rj-Ue@hcf-r!4Zpzr^R@p8$}emGgI-D< zg<8rg{A0e~^4bOPVm;2v&nn8Q%PPbw&U*5fUc#*0tVdW4StX$s{$JM3>(}<*(c%GE zL%jM2SO0u2iUfE7=O9cxfKaG!0K`=ahkbxB2ck{LZt%A&I6$SxerBGQyz+b zrHuLMn<}5`A3b0|KKvhLAcNACJO$`byodkjp#b@h08j>#TLCtDAOi-#VQc^u8w}0{ zBXhOQ&GcdXs97Zz#^b_fSQelUF3*1 zEypQGy8Zr~@>g!%rx!h5(YE*WAYSa~#ek~}j9lD2ynNykl2XzK6cm+|RgS6Z=<4Yk z7#f{8bM~BtCA7Rw&MvNQ?jDx{gMvdsQDK*(uf@d1#a~ZIO}l+3J>za>)`Nn=qKCyL zj~-V(ud1%8t$R`5-qG3B-ShI*>!IP1ccWwD6O&ln{D+U97CtY2A$(g~-`Ly&zi(6M zqR{z|=TGT^m;j@qriN3~QRsqEg;EG-qoxr#Ld&jwiq6rWW54_rdd}mw?pL%ih#ozS z-+M7&kdaGF0V__RkoJSJe~qxKe~YqT2>X?;Nq`v+ZAvyc8-N5zMe0a0KwrMeb!)SX zPA_+1M|HSA%abWI-5PycPVDJR>*>3xZ%mCoJ$^N1pX>0%A){}F?o8n#A8P}KHnRX! 
zJ~2|wR6NB}_At%&VA*uevuvk?BN^#o)*{zoVRmOZ^fDro))yBgXqv*?M(x>m3o&j& zv024&>eBv2;7o6ipg4i~l*gBkD`+IP&wgf1_l`B!=$5hNi1!BXjS98qM~)t1#``Q@ zu9rcz=Pnq zexhVh3rRm8A=?7F;1@L&!7~PCaL_vFNqJ1mo?x|{V?ieB)@CK+s9J1J*`2R7?Fi|W zGTCZ=PKP?U8e{q^u1q7zEHDusw_H1N-t*YBqZY*ppD2KMU?J!Xd z41eO#=UGDPBM=Sv(hOX%Q<(UA%{-x`3ytY(xO1!V)d={6sx2B>Kk(pP^|bPt=fJKD z#8)K#3F-P+G!maPL59A>GbZPhH2*6H8Ngx#6_6yHDV5^qa7`-V`Uc_La%upEMh!CLNe zl+$9cYDHw|<_4e>h)=|#Ewu3UwgkE`3zH>rs{E>^)iSQSG5v8Q zpF)mCsn5>!bRgnXd)g5(^)*lC zd>Ox}<`(1E+G@9x?}y!A+)~aM8;cYiUZ`)VuS=&*4^O=QA?#t6zFVq++eIUZC=Izk z+x*`pH$A*_{Bt?nJ(?VY3Z6}L7HMrW+#^HnwzFIy=0wvAPYqwg zk*W~3#oDdH{zv2X*4AX8=VK?kflqsshVtfRyhD3GKUx%D9RH~jFVv~lJ2Em#20Ay( z@wHuq^Y(2|W_2`Dh++=R>e5&w^(F5jFFVDDgJ@(i&5m*gD$Sv&ud`Kf_)qpRN^aR0 zztn2ax0~Ip3dVKJu;OQ~KmFXohz@+T$k&}wFg|g~#3x2aa|2#@d^I^?^WbA5Z%_M$!Wyo6i_TtNsI=NkO-X4*&2SDe z7!Eg3kwAAUhQxCzQnO+JEs1Tp{>l-{pSKsRT-6>Z9#PlE<~?^X7HoN_*h*Ib3T;+{ zM+kB_vyPS+5*64n@5||#$4QhXD0fFPzFVJ--O9tqec48|d%LyQ3nj`q5rX@#3J=6Q zu?JgDs7e%Ub`&lRtVA3leS8}9p&8h&C9y60eGuYbatoay1J`|HkInCcc1BvR%0ewC z(1K4{nq61-%mWYM`^kW`6r?4tVeHAk{rU4`pe-~O^W8{mpWfKDU^?*D{Y&2SEdJFp z7;02yH&aUm3dQr)S8p6iIqUiS4E`o1? z#MB7s+Sex_(;_t1@)NT3Z#9nYUcVP^P6e4G zWI1Ml@DwxG(19ct=EF&ByWym`q(Cy{&q8+cqd4G~#T~u6Obnc#0@WSLdLl|}K|SA< z$R6yLvW%N!iGe$_SwYp2Ft*T ztapg%=>3@V=#$F8mS&flrN5d1oGMD}Ujl_i!?8>|OhRl$afurZXX^;FN##$^T7#Dz zl(x4zmB*B8#rg^wMC$5peLNz5ygwmhsR_U zc7*7`khibLzS;}kwVkx};JP{0Dv@W9>Y;mTe{rTl%E9bV!#5vNVH1B?82}8oQ-`+L zHNpJFW?l&XYjI#d7<&Fx456l`U@)kk8P+d(<^I4Q*AyWOV&|UZ1kNpVvGAWK*uN%cLOvH7z@q4m;fH%j8Wmvp zT(*qRRZWFauVu8{1KY_jqq+M*e_zi>+FW^5dJ(>~1@kg$pyLZIiDb22+wf}!ObUi` zCBQdCZ0NQ|T#rK%hP{z(ZZ*F7Ab4w@Uj5j8^cj4q1Ft467~9iP(C%9@g|c}&-@zA} z6gi8{PGa+^?$dlF=VD~%tH|Ti1$hCsQ4tLO^C|pavKFHAN19o|+75gVPP#aG?Gypg zu@vu?qppY(vsJK@>#D!Dq;Tt%*z0fY={w#Z^fO&hr?n7ijYd1tk#A-s>cG(XrGA9H zE*W5IG$SO=v&P_z)P3=JcEd$OBu%12%DiRg%MB%hA3^Y@I`31o(eT%5eiqAVILqgH6sEk3?WNMVbnqjBT$QOb1NS6+29=+zF+8W_yhCQ&5wYqsnl7*RGt!a6r^`l5@39 z*qqMtUx4>75dR0XV6Q+On=z;A(QA<`?_61M{hVr?@6=Tq1U9lCqO6RbJd)#JztMYl z{OZd$h3v4=KN9hGCI!aE&|0U~=8_y}Y>@3|-H@K8&|zyfXLG+VJZxG5%KyBwe<1Epc>YIp_aVCM+|FlX zi&OSzw`~^{%A?HejeH76h#F!cpC^M`J#tkPJzRrtxMW^T`f%ZJ{2MJ{UJR3_1X^`C zk{^XI8&MC&%E&As8Qhh&LNe@yhp;bd>Jm+AjV|Q!9^xxmRk0pnVveJEc*2c#;dhOWQCnQ_fm`ZKDZ;o9!V3ajM*L!Y%j<1Y|p8xiJEz3M}E;9H@@=`q(5a zCLY5U1osroegKA>HUqnZyII3sGgKfQKBE=GhYrWeaih)g%2yQ}Xuy_k)k9@sqxK5T z_@cT2*+%#I(?@1(FWJit2QA65ds=Ytc4wD<$=F6)*5W67!u?>&kmTM*1(KpC4M@zg z7;qU7`iu8~3_(%+cS3iX)aZHvA9$woR8^jD&19u!IW|_;b?q?kyL}~+2iackJd9Fp z0Z*tyMnZd86jCBwEyNHgBKz;u9FrC8%oTFrLbdCUuIH!fVOx`Zm#@AsR%_ic%Z#10 z^65FQ?+!16%yvOX-n!rr67T2z6&O%&We;G@ZlBGD_B9rAdHPj1E$sz|BhocJ!P4UtM6bjOyHeKiBzVuAxXIkA;(Xo>t2-4k7C`~ z;WvcV&LW!(L@BipFGV4~dW({-*!YrwNr;1w0c1s0$~KKh3kZjHS9b|O>K;OV-usXY z%&|ac1$_CeplGiSy+FulAf6 z{JLD7!ZkYq3fe`2;~pV|#ZhkdOR09*iTs8SuD%XQpOh-l9S1Z={~0s*e@vQz;ZRTk zZk9)6SmE0vMC|6H${LYS>lv;|^-`8jD*v%2Ab56s(qNaWYL*DACIjjb4RDN*usI@JZtEzX6_4P#Y>mSlmO^w9Tv;@rlf{MRF zM=+e=d143$IufX?s;1~an9-xI4Cy|vwbbYHZezYt5oyBk|W0Hl=Tw;zjAa zfiUilSuTkp!U8U(-7HR16EyQB=o}vi>gOdIjiEJmGI^R}TK$ z&WUuUB79H$2ir$(**$y-ebj#9?HY9nN+EZ|IIgfInSVZ8$#HdDIYi8$e!QV8Rw_V-WC3wpT%lXXPhaeZU ztUozqr+n&p!(I65G}W37!8GOpnAg#)hX3>&O9t2$RYRKYd-7povp#j48{7z231&HZ zy0R{%vrM3?XyZ;Nx5aRGmw4>iTSgYNQHm|k8=(_L9n%!ZU=+?0hoH@xGcUKz7la{- zOOSz2sZmmwnt8{nr;nNs1~)l$@V!lXVtszG{%P7{P()GS=$YYi$MEo8mgDWKog6kZ z6LwRt#JH7ej~G+q8@djt9+d{>h{xs-ptfQ;0%hCAFqcB-u8N5vns58WVa?)3?FI0p z+zi#NI3Hh&VVxnxaAO{^#>RWoeG1ZC-*5EqHx^soA9dMbGxSEwXOBUO6^E2m78G{> zCEkdytGt+j;Q}&%m4+lu;WmJAL-mqkbA+HXQlpmdh54*zM#DrKv?XpXS9~b_6n7kY z0uk%toTe0-YyT1&CVYlX`{qQdwung7$vv$CZvRV2wDVyS`NnJxgBd4oAh=NXoe|hf 
zmudMW<>S_zDSHem#Qf|ocJQ=KI;Zi!q_h6#<@swe;X7*G4(ke#cdD6X&a zd_{xBhTWrY)uj)sw7o?T#d0D(!?W-fdAL!0Xp9Yleeu-s8G*qpmAnr2H-!7cWv>gM zvvdaLNXJuc7+GZ}HEtC^0Quiw39J=;?is-btw30sMx_|n)i^MC_NlX>F4=}j#Cwd< z%p1LJQHjY3cyNZffnmC$;wr2Stpt{rEMgFidyDn(~(I^{&$Ts z#r^nU+@TwBwhg!6f2rSZ({>0ghf@pm=8T<-4p{OVdXUTP_xk;E8vx1~KuG`_(wl9w z1fmR19KYKAD5#IFl?=epyk(%~RA+#@y#S%Q|LYP~Bd|RZMu=>l;g1lqt|gp&6@e}D z?NqLpcv93+xkSP0ojHSwbKfK07>bfpnV?wYuU%@XJm+2^hrTEw7{` zNLDqCW^~k>H|KTa@)>uh@+yksdt%FQgH@YB>EGAwSY;9?d?b^cZ3 zRgSs|S^aIuZ<1+*;$Y*>7LWl!285sktAFnDaYfxE2<=(e2W6*P;-lqah}zt#-WS(+ zG|R9V@^Y*AQ;al`*h146Dipu$FTwqtF07U(2#S+naaSdDj3bAWknh5N?XIZ^yA>9a z5fv8QKOY&8n-95#E&of?pJ~?=ua6zGhPRVMGB63mQ%8g5@ns;~73^945xFy!6#Ula9zVW`5} z1q8L=@62EZif|4@>NqlxG5~`%kln&AqrEufpcrDnT_`sCs~&z1oJ6DGKq$;hU&iL` zu{}7$F;?Jtefi>^c7BsOT5GG5C%W^(#;ML}tEl9&_+Go*oyOfh8&_fc*kiC)D7sNS z)j)is(lEK+<(W&6hmMT&OP34u>d@KxcelB?_Gvfl?D9E6Zx5G^L0pwtj3#opI>&YX zJc0H8oQ6)1>O$wTZ+C}fV;z2`0y~jyABSz+reCwwA>7)D%zM&P+y!~21v>uww1jNI z4$GQ5M0th8W%;$!CIQ>V{Fr4K)R1u(J~V;!GVNq;SZ%Cv0dv549+y2&R=*XBIsuV% zAf{aif3BL)a;1?CClKCab8h8P=zL2&-`>8IPP15>lxynrUXa)Kct)yO3KidTc}4-B zlU(G#W0J(*ZX@1Ro=7yf|K&+IZHgna(&(fx97<%QkQf~Zp&kB2y$XW<3+ybryPznT zgDa0+CFFHB)8(!if{fWv`V+jk!<5&0>*nC&dK$2`*Jx*Rqmg z7OqQHr z<(5^V1MwUn_EPiERM8D1};S%OQD*vLC~NK`Jeb1#IBob=N{JP3$7EsIzcT+uf9+Q z1Uvi(ObeUs>TR$9?FbAOpQbY#g>V3;39nd=(mVi6h~-8FxWnH@@}POaGo|Qbc%>(g z(VF-&Qc(%YQm3)G>BW$&%L^#sH$0afe*L5Kv{>cKH!EL_Tp?dyh?ewfa<{m1HAM(Y ze-aF^rqQlW&EP_CK6tseJy$SMAFmmMZ<$v|vNozY;iUHuKNyp(p4|%u%%3D^#s!wx zzn=tC%<4wvzRnwm^M~Xk2ATq~p&dvDJ222{fC$Gv;jgQX(CPD(m6(+F@qU0$xxOkm zn9yx3Ryo-|Wq2$||4aS=9d4n^7nD}+Y?8$Kqm(pDrt7#LpY&NWth3f144)xUhSkSJUS#^D2P@(gYB*cP|t zV$pVvrC`Cycf|?)Ce?Uk;=^c>(~E#+Gi2v6#2`Y;GOt``xNkx;9bO{I7h6t4>JvPG zzhUm4y4Wl~a!^x_)Nc=fY2{xuC5DTt;*67dh>lOeXoIS>6M|=zCruwcd|5JyGKX#J zLFzs(2J_i&um$hn3B}((EQd~e%le_uv!EO$#td`il`^qz}zOFQU+;dYPYNtn7z=615a4%%oL-nBp?g%J==clBi5R7DCprHUW z*E|LVtrV;9m$O1QIz+Gz+uT9EL~hDKK|SzcJ&zIP(rAeTv$co#R36~5Nh~oHXXUMD zxDTi|vV5Q#|Dx3g1n1%7QI@2bX?`g4wb?@Sfh?b{x(pdOzLErSZ^wb83>_c+EhOQo zcuWjhQTAz#%Ro>jBdWIRh)6h-tIR%|z9IeSBlwc210{2?D_R1+e-A#zX#bw|z$?h= znkPdU_QB?Pq3?7y+d|uJTKmSXLYe8zMt-c3%YedlamWc%NHyH#db6@xZ2MfTBIV$l z&gVy7UHlPB8~nF$fh|CKmMDj9@Qsbstg9WUDv^wCw^@JIw>PQYi1T{!d3nR+idahv z*qtlXY&W4l%WdXIapPoHJ637zJ;(qT@l?@-9m+{Bci5!10)J*|-1e0tT*m`G4a%=9 zKe#GiQE-0x{M+N7j6PhniV7=_Z9EnJ80|ygZks{~5N!uY`}H1N$`u=_jxfN^Mg=wT z+YdCVJ+}HR`?w8fNZiwNx;l#toJyY6PyH&=Wayie@8}jF3T#Q@Et3^?!`r?=d13Ua zvI!F$!*W;YXe4vIo*L-cX__H5nv19k++P@=Q)8LRvF{sKOd*>Q4;@Cb!*0I^u0Ct~ zlkJlTYeO~hqubn9u-*d2VIOj;3WToRUR*vpeZKMGK)ZAJ^@&$U4Xxn%!>t2Wrdt+# zpUPYU#fx(D#Je8mVVOK;I?UGeV$ymMF{%1h6&!C4E@$3Qnh9z?A$j*Z0EMd8!p$Ke z>**u|Tm4WLDVri>%Q~buAr3MytH}!m*FI6lY?C0dBXLAX$brV{`c}{pnmj`S7}{rE z-xAB&S%(J`dv87qo#r>HR~Y}&@UB(095kp#Z0|#xkpVOanZe9@$WIp>L4NipAa~cG zLM-y!M!Ab60#SLKfi4SJj_uqHn1hSJIijhjG*UghXDe9JI% zelCjj`5&qI6NrBkw_|d2pDl-MmUL0iJN>;0z6nnI74a+k9##kOS*)oeMb^ZRg(=-h z@z=E3kJX~Hg>=>Cedrh?%Uz>7CXk7)& zBYme_4mcVFDKL!^%)mSjv$FuDk^mB>pa#7vuCAbp-tXVgWv8&4G6jR5#AF=zQ)sf zg3D?mverIGmz9&nr-iGJO>4#HldK;<4EXJ#M$;u+ zIoB&_`;J||+shpn$Tw-lD>~s30=p@25)gYO9qSe2HD5<(3596RdFwK$C(w~`)hfy6 zN6_})sG*a@A~+f?>Ib0k;$Q>O8h1T#Z&5V0yr;71#fZtQTL+z&Z!$-NlgaJC`xfYIBh-jh=H&N7H zBBK|-awFHvB7|02X>oKO?NY6HF=^{|<)Z`I2N0aFVNAQVr^U3+LEbgx^sP~aw+k}-&&&-^qw-l& zuGysL2|*$3tydW5RLN+x`qPwPs`_U`%ax@Mwa#6cy9%R=?gP*?JBNuJVD+5P6k;CJ zg@JCc+M?8Ygz$A1YS)Pd4;`%AN&CU_0#r(CPg|My;ZeDR!5I4S>e)nZb$bb42Jq@& zP&sWcYcwX%Cts+*US#B;#0ZhI!om7wN0(`*K_Ju3g7z6nns`ew=9}`USMW!iu(K9=g&_;$0(r%c~v^lMJqYB8{Z$@AOl{gv-7LLqH8_XKyb?? 
zC}PWtN3O18C^6pOd<)>y=xg~A-=X{*@uP^WyDF89Z&_FTb0e!@W4nK!gnuF*)&t7p z-rIO)=CbASpq|}$eJ{Ff`ex*o&WPZA7VuT`9TeXp_h4ozr=w7G`0u>eLsYLut$OZgrs&BH$B)1vHfYDBPcfbQJp6W{M(G!*|D z{1pHG8?SUZ#PfBvzA>|V?zqL=1(BTq>;Svb@F5j=h>=Fr2k2HJG8Rb_$+;xN+K71k zC6WsNaiyRHwAF2DZmfHjY_#%pB?uJH#kr|BroMi&((~kq0t{cN*0bVboF+>B6^ zn^EdPk{S#)){q#nva){+M8?jbML-wqu5VjymfTe|^5L`z?pW!^tjTtGaN`I|R|-Q(A)AkOFjzL4Jl6K__7cF_1+kisVAqxbtZd5MAbv$goK_KYqkw|U_d4z(3}#h{BHcdvYu<`!|J|;tqQWruYDQ8SPQetp6Wsq zhYtE={bMfHVvON*zBZIYL1ujME}+mtq)8!4QIz!2_Plifx>k+nLgFEhZBIWVW&mys zsT(UuE}%F?M_n4q-Lc3)2e;7OAx6mg`Yi-_+7Ohy%^)}=KJZ4ExMSlwa!>PHy)Mr5 zK=oHSCB1=ZLe~ZQ{{qP0LE}TC5LS-4&$BEuNJH^8CG3&*w0dQQ8}&(5*cUOPwb$SR zkk4BC@6z)JRA^t#%`#pKP+*Z0ff&+L7~vOD0;LViuAz@0qZ?%uCIz#_D#1|wR3pJr z`L{n}oqf*F*^MMqjU=;vI7Yu^*B`+D3k+X_JhB-M{JZiFq~QCh&sjl7+pw~itz45Q zxC}pD!#*-Y1u`&Nc16=eqUG+PA#4^OI7TtA*1|ME0lMlB!iEOgp@M0_s|+ zJ_hO$$~$NF4@WYgku@o;GxXphE=R6Shy`sxIOGc&3O9yr^mBb#D(s_;7P{`U4wfoL zSP-0s55~Ni6Y@2|`X+>^E?g5IYd+BS)W!(M`yh4{N$sBHsS`BAPhct<%cFk!mR&4Y zlu25DZ3|C$l#e(|>LEDnveJ-a`yQ0oHro!yfXCuZp*-sdq zPdV~!=dzTD0Tm>p{~^h+@~V7}U|tijs+5R~KA|y+z5LAgJf=!9VZ7 zb#R(M7t{(8cMvANvaC5i78-Aul-jD z@`BYR+u|)Bw62%^e z(r`&9;3ZvUi#AE?AvHGV7RFb~0HnboO*zo4(`F(1>-v(=J~2Eh!Lx@n1P0DGNk}9h zSx^aFR>dPkq$l(6N)F?cfv73rx{K8t{}ZoaUmWn69+^7s&4PXIw|(?-01jfXt7WqZiWod2;L+tPD<$Wbwn_PKT_Kro9+s z2P``sh-LY#7V};89)#vJ<#D1;HChHopODAAggf2;1axV$O3wKt596o1b@ss#Jy(|f zv=n3W-l$b#|b#i6Y5Io2F3vJAv7TET~ zCrn}6r*M$4T+g`bhy3If2&wT$mR-F0td&uW(0@UYPZf1c1_}wF7$Mgm^1CvwA6w2_ z&SP94ENTgOL;pFTIZTTN@ZUxflSuMgaUBlZdf{3EF~2mBn5b<$U(n_8fK@lPgr+jk zJ8O_1{sS%};G8Kn7jQyJE!B{64~dg)uAKfVJrbYfjuu3ydROGLKK=;@geurT2L4}r zR~nUM+J-?*Tv4+$xA5fBlv}2iJGMA#W~7zj?qp_SglQ_Jf?JGB?v_oZ;-oF6=~ynA zgBz|jYLS{WZkP`4Fqu*jevj3cMKk9+XFh+<9}c{n`{w1@?(4bV`?)UlN(SP6{8Ftp z&8N)}`f5!rjH`fsXNF7$X{>gXf=jrr#isHqwp3hrKsZvj*pE!~yN7WhBr>wsJNOz< zCwksSbd&n)kK!}Ipc%?24g5j?3M!EZGGq(*&;gerA5x&DB8=SJ_O_Is>mto93pi3x< z9Dg@f#7s(_`-|V*@FahQD>xa>l{f3@o|1S{C`$7k`a!(wZ(?ig78=2W92H7wF`{Fi36|0LiKrvH&DI18_QRy$DHXF2N~S1946q z$x*olPVKpaSslqi@uC1^5G+UV?8tDJkvg9I`fX2(n=gmo1PKsLpLeJ7GStsik{sZ( z*-_7m2|>S0Puck|-A`eug0EzgpN}m)Ew>B@0LAj&A|#5QvIx;w zT7p-gMFMkVSk@$6o!}c;_S)WT&jeM}T)!nYLFM>_QSWHA0-YQf_&Cs3eZ_78S~8?3 zCKUAk65h#d5Mv%?wg@Tsko6Gcc>Gb%ldHPAPe*%T{gZ;U^E2d!`4JZ2c&;vMhyF$g zM05xWp4SiZ_2K_+W2tZ24z`MYu--$l3;=;g53xoqr?JE4=T01Z=ksprlDWxnTE(w4 zHLMrB*XiJsC&vCSqoIOg|cfn zggu%vmCJ}fgA?6BO>*jW=cfQmAbh7#F>97aaF3@V3_`*g1$)Id8CGWVUuy zcid5GQH9-($ZheQlVgFUo!$Zp!R8<`_2&*$9|3+j1qqXwu=dz$m>bsPBJ33 zZ! 
zTv`|O|NIOhzKGdzY;f(yL#zfUIQ}7O)-QfI^?svs^{Z?#e6sm3hCk^y*JsO%J+=A% zPiW5yv_GUxhZ{6X3C4-OZw3!R!q9Wb*9J?|>arOoTxIy6@a^Fj27|hN99Xi3&5MYyP?oTNbVV7~gf=^Bp@^UB zpCBltKLgps9WpPN=kj+*NzDhhTrT}hFYc6Dvf~4hS)sz>=qIJEst7R*vNCt7*oQ@$ z9U3TM|9Ft)c>&9n9)R|26QL+@TBtFW-ADE+THIpQRV(?q+^+CGt-P-i=ck3e=&`5H zODaog1T}RebZEBEzm8rqZC}(sPAq%QA%K%vrl`IV7aBo*u%yxMz0Q5p2(^|prmQDN z^AXhp(KVMV!u|e!SFwtj|M1+ilYd~1x7l_>kd|jqFW${Nb8;Ct2_{8?&C4sqn0VX_ zW{5xcVI8?J_d3z8qU&{%iGP&t^*om*HJyPwK8OsZn#UTx=yq$h?_f2=Mih4)*!bm4 zg26v4{SnIff%5=}IfH)UusK8OBaoIvvb=euuoCXZnw_6O`Awb#v8xmGMeITx5`bA4 z*AVPvQ{F1f)UtBeuq99~m~(YLm9vYdO$)YSYqZ^qX{7hZf|@U{Npf-bKmpm!m3(2G z=M76}A5Sb?gVGlE>WGRoL2y>=20(Nr<#P1FF5A49Cp)0}0X7F+37tl;5(cRr>d!RT z8twE6eP*KH#(8*k^KKt(kV><|nLo#Wd1<+$aHiu(fKV3`N^p(H`@{wvlflEuv(zm8^4@r}-M4lL$K0h%y1sX0jL~-;-0xK|oQA&s zLs$0Qyr?zMrz_ZVl0CG?J-gxz1zF`+S15A7LG9=IU97 z-Q}4nL-WlY6Z+)IGLadbBCmvFtF3QtHL{h-G%?Wf4onv@gx0QTW1X&*L7qs7rGuOz z*ZXHb!iEP`&&qLR>}|*Q+l)}3rJv95X&XF3pa$E@yJKdu4MK$rCNqkn^|n+L9VRj2$dEN9R=42_ z=lZb?YsHGZ_m&nqrREu+?|0yEj(0beNSa-Bi8-zDye=cNz|EJ*ymn)qD?zC3eHsIC6BYk7FONCm5f+Q;I4y5ccR9%HgdrLc{Vi_Q@*5*J5!zh8B(i@dF z-$~m4labGO2YuF%nq9VPw&_Vk+31OrS2Qu{x};9TNaZxmt`dVToOe<_kR!JL0dx$L zf!yVgk9RzgazuZ!g4bas%9t_3-<`~%HO>5>Du9Tc|3$Ce;sNnF_JD=j>DB2zHe(O* zJ2Td5yjlpED&cKMd!{zBh^w8Je*xkOZwP198OQqR+@YYt_8Yv~`TL`k!P^9YH4C1ZDeiKwEVZA|A%k3!Q!eC=;|=as3)6{St} zrc|Ug_l3}t5PBXKKi@hfm92NKm3j0QHP9hJAUdgB{M}x&S&B!5%&K(Ja<;yuMsg&> ziO3~yH`Z)@2@$UlLC0VEt~h_si(S3}_0x6PnqwW(H_z&M(~QIyo=}3NZ9K9@eMMo> z2X;qX9FS~!Sv4_Ew4Xcyvk31~krnh0&PosMk*;+_8edgwCM9h2b|=SS*#V_P7q2~9 z6LhMc;$v;?zS$Vn-J>YwzdlcCL?+Wr{+Zh4A0m`Swqm+zg)lpPc_x7^P-4ZVsY-ZV zWZH*lN(;ivHYs@Lrk3$!UTIZ8V)vGLZB8C`GfnAF@<zS@`izN%*q1_E&ohXt$#@7AK2tynIvm8VCB~<0ys-=zt7> z@L$SnE%)2UcYh(ab-?ax2VYD7eCeP7OpPyufcOo+Rz3OBa76e6>;K=~ zBq#cfFb2PI6`z>GvayW#2jg#~75F{y%M~npV1M+$FJ8d!fS;{-*?|F&KLElWL7%ys z`Azxxe3wo4t^NGhzhU?N=;iNuS(m-E1De*)0rhK^R{mT0Iu0+p04&si`|Y!n|3+ng k^RJiFFU_+~d}01+aXnzS3Jj4YsVQ+t6A1k7{!5?!0}L-p4FCWD literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/test3.xls b/pandas/io/tests/data/test3.xls new file mode 100644 index 0000000000000000000000000000000000000000..f73943d6779517e5448aec76fe1b7cfd526f37ae GIT binary patch literal 23040 zcmeHPYiu0V6+ScGwa0NDcH%fs?6ni;;n*SBbwWZOo5VbY^C$!*70_g3uM;cAj+`}3 zB%&Bv{uD~Vra>icDW#|nDDM(bX%i#_t>m;7C8DGeg(^ir44tNpQ9^@{x^1gI3qcM) zNoymsDGDCca7q8-hB?i1<}6utZA=F$v4YuMCUAo!>&`g}5bD{`+}|2F+6KJ*{^&>!}pAN8R>;zNJPhyHCJ z`mH{6KY9G{pAa}TP>b+GTMysypuXs`+>9&0>e>NojW!i$%I*h`EoaJshs$2ANIeHAIlk({Af+~_#ej?6= zJhpx;|F4qtmuV+XXG)w2l5WfSU{-HgoRO-knu;3e`5r-s05-m*XG{O3=o7v4Tqesa za!!@>moW&WwYDCnNV-x!3c83h4m;jxRMU;DP&*~*rKfJGJ7l!bAs3ws=rjTGzXWGI z02O@<`i+U`t@Zp0))=*j3f5q-hzcx7Q6fc(t-vZPiUaqJMO9$q7FB_bQd9+Q7DZLy z_EJ;@Zm307;K)=|1#V(RRp2&UR0VF7ffd#(?SZ3ciGO>Dh=!8xVwaP{szZhf!Xc#q z5Oza<5RTggfUs-&gK*F&0EFGuAEdf45O!gI5DsevNWyOI55j%=m>{F=!PdbPq!5r` z>r5{UB-lDL3IhqY4mzwt9w*p3vkC(VwhrcQ4oPs~?n|t;6Hf$N2XnhZK!UAPTNp^N zbB`D70~@IanW$z(E5C?`z{#WV|=NDAeo z<&R?dr9$AiKpiJH7h9pF*b0k_t$^j9>@%>n*?=y^<_}AjpqSNras|vT+!dscz3_LZ zkEvsur8Wqgle3E}`wkO>!$`ak5HlGJk|+d3?QT@NwgM#Q2zif_@6O9mA=k!|$s4?t zXpVl%lnwRMZ)2)amfNcEZ1}Q4n}eDyRK9y$-~t&*`?!bs_J{*SZS5;;^{_7U=9ALL zOmox*k?MK;@yGL8yOSncJJWzIl5FixTE5LzqO8+qNpG8Z#l_@j7J@E>P&v=b(^P)U zTIH&sx~g`_xLWBBR-;11e=fs%SDUJ{`O5AfUy?PpBZJ_c6>)IU@D{KvlI}Y5<~wGT z&sj}i*T`!V%%)0Z6VGSU5WvO?M1oB;-F5EFAH3O2R@o%-*~|-IV}&ZgrZnC4_B*e6 zvzel@Y0YQT6u^dqSypLf>8`VX{Hr&cYL(5hd^U>%*l-BUvN6(Kr~md3Z#Gj^Hp}zb z!~@t^K}{&FJl*x-UtaZQQ^RapgFUSdY+4KOw9)pbN6&h*nWnPw@U%GrY@D7p+WzL# z$Gq80SJ`-Y+WY`EPEQ+c|NYr#z1hrA*?4%`q5w8dPaAC?J#pHb%}kYzho_w%z{crm 
zqwU9Dd&`^6ER~IirzHZ|I6ZB&{ex3adb63$Y?cRmS`2KK7vO2}#5bRK#bUz&4atsa z=c#NwJZ)|O8>gqm6Q_^d<;|v6W#i##3j)|UJuRMi{*|NNZ0b}t9-h`5z{crm@x)`_ zebJjuOl9NYX-fjwI6W<%7&>{zn@zpS#>3NE1K2n{EuMJ)_&2=RjK`~M%v>&PWu zpaw1AVcCXM=a8||ZFy~$Swb5yaqF~u;3$v2uqeQzBR=PqgKQ+J#Lfh);kTuFItF@D zneM*cA>$@Dpk8BEkhR&Ug9fn8CxxL`H}zo%#v^9#dl{4m#*zWADhLLpgf~RE$rDcv zg`HQ1>#E#!HLOpGfkGk{8|c-&rUOJ!Y2sjMq0m31|&M`+Pti8*A%+@eHFz@!Z-xgMVsSP^mA=D|{w zU7(aL#s0-y*%)1ep5QvWPRn2fb>=B`BBp^l1K4TRFUGzdbkMG`c(4pL7ArN%%<*|O z+`Oh@ZwTIGTzp_(PpUVS>FYlj+nPR*88R+)L+Q)RO1hX1V7ndLzE{}44}u(o&bLCG z1Mml)rbz3vv9s<~j%ULJ{nEZZIO-}lt~S$*(4|mXpW>>#wji;5&Kl| zDi3fg{`r8L$sZi&F+77OoZyIoD)?j%a4Qb_fLlWYU4XYb!4V6UfKW2U1DxZbWwA1+ zT88y;v;OYG$KrmER`x;JjlhtzU zSp>wUzD#<^*yYC1$DzG2Mg~F=o=-+zUMGn%L>z}fZi;0kZ*Hm$ma;@vN;oSl2}(#66;DzVqI~-yl^lI-OY825RJkRTky>N z_Qao6#;&@GU3F`BrU%+u>aJBX3Fg@f@5sP#tgn1;*BD~8fauGx2PkU~B{KvBE8Mbv zL^zdeeS!5ZHQfK;{b!A5C^KH4x(5v~jNW6^oHLRzfMV4}=Pn{em32=khX{1s99tKv8h zrMpCWGszoP<@o{|<*npG$N7D{r4@)8IB8HJYat=h?~?L6Ln9brjO4^X4R>12osURY zLkbi`AI6EF(K3#qW9%T#n^~zKP9mmR#>>$@6uK2#uTUte8=}NoYSLZ-?gr4R;Ye$- zO!pMO%cFIC$fjFisasuA>e$vhiAtmxx)5l$F^zURib2UdS1!3I$gY<3lGB&q$e$mZ z@&ig+%5RM7Kc-Ek4zQ1mtspMf<%qC=j}@#kmh}l1_{3H?i}6%wh0oFo4zvDGpMHeS zu2ghIZZX-}UccqubMI~5Q~BVX1}&QZ#H-vZJ&Me4@%jEbWR9wrA#+sNi98wk05bO= zcOYYp4zpmQDab!W=74e*m3U7GM+WDQif}n5ePee=f8YMTu1xIW10CreD~dI(cxLnY zC$)8`KX~2l#xLL;W9=}78VEHIY9Q1=sDV%ep$0+?gc=An5NaUQK&XLG16d6O&HwdN zCr^zmo>e(~H|GD%?>)@(|L2i;uFvy-p7-<5ceWt&&(pRe^PGMsGSBz-AoHs6K4e}8 zkm~~X;asi*So8FWqCUvJ-?aedvb^?SEtKa@KD$Vcoc^=O@l=!b)c&=ZC%`!}4ER@IO-U zA4>4lybd{rT#wB6=iqxTaw9VT55Ron1;`7L`TmE<{2iuGR|8jKlOpfCjA7$B|I(8; zn-`O6C@S`Vu~h6wF@ujpC&ZC1vA*0#wxgtM1NVOXs9=|nW3MC^{EdHrwfVCQg;w02 z-FFwItHhQ~{xtzVJMX#f5;8F*_wx%s+y&szZHU`Xq5SzjVnpM?SClRSH~zWD_1Hkl Q7AQ`h6(8gI8yx)q1OB=L6aWAK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/test_types.xls b/pandas/io/tests/data/test_types.xls new file mode 100644 index 0000000000000000000000000000000000000000..2d387603a8307bc7b7050a36fc82cedc4fd5ceb3 GIT binary patch literal 26112 zcmeHv30PA{*YG3+2q21zihC49R6sUGsse&iMK+^FMTKGr!Gc5yiZ)VE+^T)mDlRCk z+G^dZRur{r>b|rhE>+wSm#THCyVNz`Id^iiT#^|6Z+YJL`Fq>BxpVH!IcLr`XXcKa zI$6JJ<(6jG3FitXR^+|fj@anIdGL&dxts`bg$Jy^SF6<&NC-UsbNvTN;2rSl3(B*C zB7kBI1$AxKqM04osfOZokY+ddjIR}mFr6@%6V>3&DR8y!9-57NIDq_zjFAk z?bU$rd1$vR50>;t4*Q4Fmeyn-8AnFLf62hdMADCBl2nE#gk+OZ@SJG|sRz)fus|5sNS!$UF;dFTmu9w`P$eTXLr0jl% zwAn*j!kZQ3CpAe4_^2DS!1?JpFO+HbFXrBsxz~j|kL+OjTw)-;5S1Q7dm)*J>g1D? zw!Cn#fDEvJkzg64$iG_)yAN_&mu*3ekA+79$Q~3V6jLqiKG1>>kzOM)6F6{=FxSEp z{HL2)DuNHTxB}WVj{|iZY?cpf5%9qlJwDKaM?%bwCC$iKMr$lASz}{7utM{C!8zHK zOj`+BKvaSGx2;=SS5H@0pQH}mhkLM<;T>8D+Q1ACb1@5o_rt`}lo5uF+CmBdlQrAO z+s6Ec`6_6dqxzN!#PPlxeT! zbE!sd8hCO2`i&hLgPb=oa9qQ(_^F(!{H-@FgDbxgKcd)pi*i@0o}b; z9neo})d3y2Rvpl@Yt;c=zE&MD7SyT(1_QGldU5;#qlA@de-JQ^B27h;qdBKYlq=*S z!H{AB5;Q~8NHES=fCNp`G!hIx79c@$HH`!#l?6!9giRyCux0@gG;7mHa4Y14NXnwj zYX>(REkI&kJGf(M0TT1t!7WbL__H`~Xj^ydnr&=Vo8pL^dd1{g75*mTo>A{vRot%IA-ClcpJy=j`YF8hU3k-3Y z!f!H^DNO9}GCH!=t5?@xDkLPN22=VlmMMgBnL?P3DQ?=^3D_{37jQe^7o}Qtz|T*$ z>VV&>YSjThY|##=ZB(FvU~8r&KG2K?H0%JY4#N(tjU9QUZ)4ot1~$6%!U(F=fGXM! 
z0?h|`*xmau5_O1qB>t8l;Z8SPy#@dhoz!a#r=~e~fetz!3kex-T!`wjq8jG>Kj&jy zrF7Mb!l>i34z14TX_=~N9)Si(ilrvZLw&zk4++=X@mq?{+iR>RSsPQU%d|+?5L8l9 zQp4NzVXU_!jO*qq zfQa;h@23(%zn;=S#)Gp8wgPGDmD^9$*7%If!GPM~?gx^o&n4qmlT2GPWN4^k$XH8L zpI*6aoJ<2Q8ULDO+|7`o!H^+iBTc>kq|!KAkGBj*5qp_2wzWei{aWajOOn`Yy zYYk)qEU>h)sGViEjFV}?C1b?WT+NWtx3sdT+dFp{C*#5;W5m+hn<1lbX=PCtZtXEn zrYV<<5lic2hK#8g zdV6l0aWXBCOjq-kCIT{DEwD5{|Akx6Q!*IP;L^slmRvGMEX~af8GTFh^S`vH&^Vb^ zTrx&1&BF{CeM|H6KXiVXaWbvBWQke(^Yfp7 z;)-!HZMbBNSXzJ?GWwS0=l^=|Tw`SFz^8av8EfawUYvQerP|iyTuDgmWR-EGt~#V( zQCy^LI7s!a2tLNdMH@%zrb7z$!$sPTgH+#U2wAp!sBxt2bV$KkxJWy2km_3qA^9oa z8As}_LkhORMe4;ts&5Z)%IwD@#*wzyAq7j|BK74U)z>~D)r%sGAr-=TNyx*(l2hgj zgA6s#Yp%8;VX!nz(aylunBT#+17=6~jC&KBNa6%%E#NCbcyfkhTt;&CD0$|5;bcRE z{KjerLXSt`(X|Xftvf1b%VZLW6jN`O0Pb5b1Z?2^D89sX6A#3oUlSVC>tkQ5x<-h6vPLLo$T#6c% znJ!Jy_va#(H_@l_sRJ)_84jPr60n+j{>}6FC)H4XIBO4Tl@~Ua=JVeohK_SmZ z3*hWMST$u(punm21&mP{>@T3)**HQ+%hu4u>XV+JBQiKSD-%N&4X+`FjPe~IQV#|S zS>RV})G`)UaS~@FJV!W$%+cE^%siIEsvC zEXa6^05TsAf#U`s(1P>rSA{mD^$usKoh5IdS(B&2sf28}N01^(!EfaA$h@&1!O8#C zVE6fC-@bkDlTRe;-;=9eenkD^D-v^KFpB)bKx+GT@_# z6hU2GR7N1i5C}BHKUGcq%gO)_d}1krl1fNwY7tqqXc2)1#J>mt5JUZ;6fp&0{?Hcy z!=374`UDq_60c90nG-{A#LO1_8V^xt#deFblL z@ZWx(5DFnxZTX}E+EwVa!yUJ`FgfAL1}xxx{W7z|B14{xj}Vx`fMTLYLPDTm*1_+2Di zRA-SMQvf1y{jCJfzz%}jkY#tr#J+{67Qk;q_#GB5&&-BRx}-r9$4C_;yeEv#`0?xG zpB?mZ>Qg=KeZi^i7qWJFtiDkG-P{qaXFs>HnV2#Af@_=Tu&326yL%Tl;7`jGoGDXP zm#rP%`be*bzV3I{o!PMGl>M)-j|>?6=gAZO$8Y{*l?dIcbr=Jv)8QdlAgW|WIZO{-8JbzeJ1^b4BXfOnEgt6+yC@0qrW+PxFkjjhC*sId~&HYY0{p!)8F?v;oE$5-|apN8`PV5F?Nr5Q&`IBU!BfG z_PHD|(k*Yz*JG2;w_fsX)2gmfzrScKJ|Fn$rnzf>Rc}4$Q}%7_{=ChefAkS#1{Ow* zx;1j;+Rirzs^+bgu0PxARO!xF74KjuH3ynLaZ%#}7~m`@cv>JkO*&fIS??YV$)rr0 zh2~MGEZY8{$SL%C?#i5b7Xu}`J^G9+IlgK4OA=n(tk=;MXO>?+|HsXO!18O(=a%(% z9O_wY>o`k#!h7b9$CEtQG+%qEZ;Qj88P|_Cy8Y(_mtVL2tKqeN#}or{UC+&%wz2f8 zsG_vfxi*hHuO|NN_F2!_9}m9!dv%8!eeQR1x94{=&b&{(nd$YxJ1XIbLStjh`y$&9 zhQ`X9_i^6q`lMxqQZ8vfzS+m))?e5%EbaG*EyK>dWCS-+0FZw6*CuWf%yeth`b^4|I)A;~y|^pitx?SMB~PXX6m_z$!~F0uBV!BU3e~1n%nM3fL+`aNt{v^ z@Lii;xf}g1{I)R1rH6auy~*>w{5C$ig+qJQCKFN1k zWSikn z-^A=5==HoQsT=?Lu<(EFK#hW%XL;Mn`(^H(>W?bM}7(wV(oWs2|5eRi&SkyrdW z-o@pmL2o0Zs_m+KRgTAQ+*kaakiPqBV$NfmlqXR6YvGa4PaJL@cPK5(XGR6@;duXt z)+vWBI(D99Rn>f9o#60ZU7atkUgPqlx7Tl_E+JLlIS|K`A*TiFMny(+o9dZ)bArgD zqG46^l>YJuaUOT3KOf$A`jj8mH{G**z$Uz?*07L-~J(qi=Kq!RiBOx`Qn~!nDq(I&!4#hU-tQ?1_$5u z^LrWV?Jm6%d}PKiKVR-PHtcfIs#9H_t;*gh|M-xvV(#_c*M0lV413)&M0Dup)!fLI zi8-S@cHVNG_Se9(2hR5Ix4dp~@1=HESNI;1-h0+G?BSTL%6g^Uo=sGR*B#;JEJ$s2 z^Uc(tfW)H>r%9Io7CdKY(c@u5`UJZw2i9LSw&gm}8Oece_v(CE7+lny|JKX;o%{4J z{&0Nbz3hm$*yHo6-zPdK5+nY4Trhs>&9gHeo_6afxwgP|-ghn!$No|9C~HU9bN@rU z_6`Sg=Qe16MHV<^clLuN0sWocrMB&{Wlzk`1$$K;nkSFkRcBlFnt?|HCXTqA>il~9 zH`hmB&a}Dw)67@1?i^W{suEUr&N$tD!u$vI*Uoy??%wDbUtC^Upq?}%anSK5*A~k* zy6oCkIq!mM`^AYz9Re#|e;X1T-Sp>nCn|>>;0HW7uws$_j2Ll?LuZ6v+>4xU*W${M zPhATHgImWndeOG{z%Qz1Y3(FM>ss|M%(A=F$N#~(jZc269{h6K(DQekE(f{A@1B%4 z$2g;e(Ap8lhOhGPsVroq3DOZW$`1q?(s6DF!bgVRsl(Ph)@uyz?0-wvJqLtj~|z_&v`nRa}(we`)yu^m@k z+V5B58?d<~ZcFczDG!r2p3g2B@s-uLsU^jY(wom~t6tK`>9fFXDFJO)cC_6c@ZHZ_ z=MHUut6`tsotw2<$y+uvx6XCjn9`8&j@J(yaNhPs{^c$wwom-yWyO!b7e3lnr+kiN zQAD@*UCfzY^4a$1Ll$}tOz~TvvMFlgMXRj7Lz1fZjk{W4<=kuivdsCV zyn^oE(huH zNv_A|+->h+3GIy1u2uWw-AH|gX}tL|wF`fVsVo7%ngiG*K< zBt4EOT~h6Q@luKOMs7^W?C8So-wd4Hq~G`>KOHP>IjnD&-Y>JiJaEb*WX6(xOX`jI zLq1^AW5QQi?ReiXb#KhC&fiA0T%|maaz4M_nv1dS52vJWjai-?Fu&cKl&N!8_B-z?-A|pCid|5U6NI|edL1TQ?mxW z%HP!O`?>MA+J65-uVcGU^{pP4yW+_*-lW5Bs*z60bEN5_xTWf?@47YnQr5Y3;}P!% z^io`$X6^Cak+Le+W$U8^zYW{9>(Djr(0P0>_{wf7DTZcxmyr}XuD>AyZlb8`?FI7G 
zOB=ZzxrvvqV`E6r^%5|ejOmG=i;0^B=`j6n3=@+RadB{sK!?Kxc5U^!F=G;O{jFhj zgIgKaWCV=9A!g3F>#O;f%1#FVQrUrz3sHJuw^TU8a47`;N&)`+-a2qkf#noF*M~d) z#R4IDyY9)E=#g`#n}!vAmHx~@Fv5X{jd(Af8U^hX+4#> zuY_+dJZOOnJRTA7bo^8SA9z)T{Ql)9(AFN#(ZS{n{CCbCUkP{wb^-BX2F%=reAsfEsL+YHHT-FJR$&CsDx~6Z>PtcpQTF=OH{|%@+a`vGJJpJirsoES(V-ND$tj zw!eUV7-$W+MA5fG5f_dZv5-)_^Ft{vK7)(W<}I6BmVmLxQAxF4{2)dhB%y5K~MO;pyIGQvKE`c5{0fP%x7Yr@| zg9{BBapBHU2-zpT6^b?F8gapXB3K1(xlJtz}_KqhQLWXwz26TAgKKwI^u4oZRl-2bB_fF6pTg?@p5O9kx^fB(7u|B%3Y zHR#@+PAs5$?Zw!|xfAD#y_-8NSS%!+JYQ|{fwzaCV64aYaZoU34S|CH{3ZnoMz;(o zIFDsR!Ck^hP;hHyDioXnfl?$)Nd;wf_z|n zrCYP44uB9PqQh>3s+VZEbV$+;9d=2_cSN7QFBcUaer5_W8VgUYm!nEgfM3Gg`HEtRi=y~%1`e&1mHZr) z#D!@IEx2{)$CoSF9_b0d;s`+gT2Tz$WI@mk4p>NBVifQTaJci+6U2oQm%jn7tOP>v zn^-MZzGU7*%}QCBP$6+)TF@T6_)>~Tu7pO(z{MN6(ke%lr%q2`UhrcSF$&;KkA%b} zzEqz?LIszEEXa^VjtW_2NXUT1Te(u8C`Os5Com0xFS$}#DGSn(Q51^{8M5q1_{vIg zVL}BRE3{+k@uj-4BU^OL3?Z*K=?U)>Dqw&Tm-`I&aA%NdYajxIG0i=y(-W``x>6Lw z9W=-M(wYW_Oq0zdNA(`MDT)&+-sPzsa#V7qT!|t}td<2OR49t^IiUiiUJsvQ6mn%^ zH9qC2l$EHD_|o(Q+U8ZB+7ZVkF3eLSate(vO@v|Ps1hn*oPa;E8g-_u#HWM`)GXow z{Elc|D1M|bSH>uERN_Ke5Yl1#!XSX*jll3b_!Sqz9nhsG#Fs{jV-#Udk>WfxiY>lW zT*$H;RGyv?qlgqoiqW*foFaTtQCJrjQg*|fg39Ad6Dnd9$T;3{kcg3G7i)55&qpx| z6hvY*Dvq+NDDL@4;*wC&^HGi}Mv(|Y&r?VEB5vqw$Zm{6$F3|0Nkxhk#VAdDX4#D| zRaQodkr|36p+Xjf5=OQtyJ*1Jv*#mp3d0 zd4BrmvWbd454AqP*n5)3-sMVJ5Smer>LGX^+F*|AX`Wh$?t^B8_KzKum2UtlaY0K* zksxb0yw}jH>qhSl>7PS?z=E#Fup@&CAO=rmPvXVY)?{xScI8~%4 z+|N;A{K-*4V1dX3HF~Eo8qxp+lSECVAXodgiYrFsN&^++O3)1zmspKffpS$8M~YEB z7>>|EG=sH^@Ws%EUP^-(N)}Yg@84ip*9c!q1m%k@v-k7g9Oe|}gmOmTjPQl&hrxp}QAY@UkRhZ|AVh3^Q>5V(M}&TY>cfzTHqVZTdJ6g) z&P?bJ%!oj3SZ7p7r2}K4p&9g+*!$)GFJM|5nhbSvChCj+vTjLWJ4LdLi zF;)czL4Bf?Eaxg6t$|Ox2fI~PhB;A`XlBYvovF%|VNMuFB78A=YfV)cDen0w!WW|~ z^@{jXB+|b@Qj?%^AOcFtQ_Gd`M6Hh%DI%eSfMWrMkqBQ_dlY$xNSqkZD5(3PPeh7? 
z%4I?NgwQJ(LX1zKpJ{zU4nl%4K~AGH6C{QK0w*@s1<}Xptc13zBPR>Oxg3e%j3se_ zaRCvW_E;yx=~s_nCyu|MbW!$N2Ly(+67Q%<02s%+$GRUs;)(kxh*)eLoD)15BNmPX zBdor`V@#$7FAG9-<6ChdN}P%e^+de}`Nok@0|*9ffkt37*h?0q6`7P0f-H`;FH|Aw ziV91vq*FMe!nsRB$d*Pg%p>p(GZXTY1}lnvzoKN&ki>;rGU=M}P;OylP-pE8Cq)eN zXaqP@q4c$}LP(7u%n5x9Ct4;3!#Ko+y3mL*I7Xo*#*PUmC~9#JKrc9z(O{3W2%^PU zf$0Gi9nLm%&cUe-$A_Xrfhmev(P2+58Ium26d2o3zKk=;u~P$%Ltmqq(&eaLz^qOq zM`p_CpKcrm*MP0<$%K0Goi5>-69Y&Vy$keE^}1R z^VIgJIpm)Dk6tvf!(^$;#-MT<0wR3T*C0eQmWrdMtfN8dkz^em9adkMkz%xC93jr8 zSWuWW(bUEjJDRLu06>YL!6j7CSpgGLIz~(h^mN?;X3oE+}bn0RQ(+AbX zju@?(buNoL;mlQQTf+IvLc{QYChbb*AQbt?0|&Wez( zfy9Ni?1qxaQ(N4Pl>_{hp0yY literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/test_types.xlsx b/pandas/io/tests/data/test_types.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..ef749e04ff3b5eb1282f4908476ebd215b970580 GIT binary patch literal 33769 zcmeEsc|4Tu*Z8fmBs)onQP#2+QkEIAC7}{ROp$$=vScufvWz8ELdcdqNp{(dElKvh zm>HEVnJLR1gWv6WzTfA0zTfxxyubH-e}BHe+sA#Mxvq0v*E#1p=UnGH*LBGdMt=}s z1oi^}a1>BGAE)U~2LSE#0Kfz6-)F6j_VjV^^g&(qzwO{{BZu*D7kN&mWb+$Sz0TG|uZhvof04(@31Bph*Hv&qJ`@!Tq!5nOZjX_Iym9e>(o9BL%VaVaJ- zZeli~+E47shw`mYZ$7-ZMF=c8!NvT<@A7mSpXRJ4dGZ~sj+XW^>%CZ9$q&q0O&cEn zwM8rCRpmjE7kqUevcztAbCD|)+CQkI;_6;w{X8vMlSj4Z7p=@hY^qNAxG`)3=j>QG zW90^|4rhPM@NYp1>18BQQ?%+fA5*yE&TH&b5891?cq{I(-AC6JMT>eKjjtc0fgj3@ zsreN!+Ndx&t0{$P?oqy6%RmI?|KB+O7hB|CSbzE0XwNvq zH_m@-O%0l@6bZYO?#X$@Z+|%K;PFrc zb}(eLC#g?dpQyi~@WtM_R)6BAHr6c3Pn|X#;`+S1`hw>83&9&}19O?X&wJU#uReWf z$mdMbmS8b(l7HFRe~CNm8TP{%wLa(Va7!QTlP&w_^+4H$f$q?wX&*3OmBh-Uht;_Q zkouK0ljp~Op2^IfgUk7-&DEVv8u=pATuL}>;mYCYba_Mc?eRj>63owGY)6Nrp=A(c zAOF#M9wc>m{)9wT2h|Dy`{^+5^1mC9AKJ^!4vlu(Tjc&}Omxsn2U*(x?&sy>KG!w{ z*3OmC#?XPFJA<(&_0nxSv&Lit0ek5?&US)^TM0Fp-C0K2KFQZvGsBu!B(s;lI~{4~ zlUNM3xWp^Ie^p|9+V21&b*x^JvwhN9gFgO%gxL5FH4C{HF=K8|)raf>`FS z-a37J$Hb^gpXqdROz|&?CVVMepTB^3v)OcG=b5u#mgSBoeQ{G5NmZCRD)jc~4;P^> z?(R3HHlY(QncIHdpwOQ(Jlm|#=zsUlnmjbq|50TsZ%MmNLTa!BsSF2%0jbPimF2+| zBlI%`&PIzx+-s~89c=0rgU;0UC5so6QZ5M<_V4Xvs~>snyF9!*eKIV#FxiVA8K07} z@%4*Jj{AV3in+5H5yN}4op&hl(%e4W?p@Yb3gvMxqN}Z_`^&$MG+TMBu<>HK*@gNd)IBYkNwvly?n28z>8DF1A`hk1Vor!B&59Nrry+AW3! zGc(3Z`X$%Y*%9353Yzb&wU&IBsVE$b`M7ZTR=}x%lhj~Y2 z5~dT=9Xj0`YJkWx*XWfJ_bVNFK=O{5WqbC8__EgGiKwnyyX8-5TtM{aWLk=p+>=ak z`T6RM$nl?EC8=yx`eI@_TGt=e-qF;|Gbdj`7-j8`9xiQ3OZl{BtnT@YTT|29xv`>o<7?y0 zmLhI_v1;~dw)QcxF9wQ*wy)?T#g4)UZ+&F_zFwbvdY>4tal}(H<(Bz#r*oUub1vrnX@Vu*hPto*^`-|jpc`}As9TA2?u*nHZ_J7>goL@-6xyohch zv+68O&!MrpagoUOmb~N*TDjGKvh|}d-l9T3z9)7D^mYhdYIpWhz>m@`-&I^@xYa(v zCXa;?1ru2j+FGX%PUICFrs~MPt2&G%K4$^gZe1IoFLJq{cP326a~2~aHOfOYx^Ia7 zc1db3V)gp@i2M03_!~0q3L{gREc%**)@a8A)@a?`ZBCLLxHJB%^Pfo@=EIfh=D0P3#wWF&!_~$rm<`Cvm z#D>d7u{xu;enfn$iuMDyv;2`i4OT@j3NatK=NZ2k7kz}deB2RnJ*M{aC!T(FY|Byt zQ(Q^o`HKhF-eVP`g@|%=$K^6iR1j;b?1{TsE}ZNv`JI||Qkg`HKh?cxcy&wPi7vrM5_U(-d12AXmu-4QAh z8MG&yS!9iRf=E%@N0VjPmWT-c!nP%s)+u*IDu6iT&*E*X_ni`YH*9tozdRsz!hR-! 
z>EZLk%Xhwnu6Wg)TUBhQlVoWNiszR6=%jML-sjs>B}2|_$7AYc4aT(z&5g$eZ`#$d zR{9i}b}9MWOgSFhtWGC+OnEX+y^j19J=a;}*tmZ~-hDs~^Y|NAVe~KYMGt)NRCeet z7H@mChi@j?spN27<@Cex#_9?GE*6z((RbHOjQHP(_fA_A_9=ZMWF;x$jjtSOAKV|j zcp-xM$j56pL__H<;FoyUjy)qZH3~S3+ou!>8$R)3p3_zQHc~=;yIdxXd zydLX}A6va0{84K7aB=6?i)kj6`Mfe}6O)M-kAv$H31(H74g1WM^HcKY-<#LIJbrr4 zE34r)8zal@vFU;bLT0(&v@MgA0FF%%FF4hQh7`1SoR5~dZp)YD&j3AKlAA7$O{N( z)1^fQv67gKtB~3!NCd*c%HWVG<=$%lFYBW(hn}odvVMrOPtrTHl(czQqVx=l%69JS zR`ta9u$hKx!MP7B(XYT5ue@iSyNVX8UVzGeH*KMNoN|5)dUaP)>G)2N`@;t#BW&oF znihl}%HGyyDZPiSwL87b;#Ig?qwM7Sv^*j-YqG#zOiD)b1b5gsE$dSgei7xa)vMPM zWlV3KdU5O3#gYk_*ZFIxjTYZep|FEmFVZ|8SJ8#4rQ^PPn`?!ZP3JsW>Rq_iG3eq( zT$!#Nw__|YUKaP+`AXxHI8?mlzE<%=PaP%N6F1LSVRJ>c^ner9O{u(SxOj9SMl1%Gm+$mE`@8~oUYU|o!f+8+b!ImvTd8qH;kCUPe&7fhJNADGDi*Cu$?-u za(>GV*b-KAPB17gA9hrO_gXbwLs9gqo^BkRm@Ic72N{>g7(G!-H;v&lxZC-t$p z^QO{c@vU27^OQ44+`)_E+~uhAKV-|b+)k!DhZ9p*`iTb|F|w`}J?AiM?xb^s_pGw@ zoTvF*5e$?d8Mlv{&P4d)WCljG$H`p5ElcL!%)1;FO zpL->Rv802C<7_pCa5o3aQarERUD}Hl|KkdpSbWr16^bP}832IoAAy9Azq`Y45k&42 zBlMI4Ys3m{9ZvsAP~6g7CS{pjiZct(%aFRS{$2Iw{UdnE?vzFp@`9HTlZKD3_j##k ztWMMO$DdX&Jm;w}KAb)~Vi(0Vlvot*=|%GyM+LRL^?%`ZBG&P&@Y6PxjyaQ*pP{$n z^*hz~h21@i2$=c4|IWyx$A9 zIdzx4$862?s)l2(oG0Pm63&~|*Xs!y8Zzgt7qgYvj^CjkDAjR@Rs1Lq=(El}w<|6F z_*j2Q|aqjqo2>Y%)MfqTVu=_UM(rgX4Q2))iSrmaWVSAg9pcE_4+PL znb!u6evO~Tm$Y+S*8RHkoF|cJ$)x9yTlO~eeF%ynK0i7y()xt1i|%c&7hy*pt9|WA zT?zh;{y5&=leB;#f}cwRURt``s=w3uM5~xQe-26Wgv1y7r^6`P z{ZBt95?kNnzqc(j@Cfktwj0K@WDao$Mr`l1YlO!HAgIWPvB7?w6-+&~^Ia*mS+Sp$<<@aT1u(pBN%gMDeXan}~PcVSukbx_kv)5>>uKd%~juetOC-I%k!Fpf9u zvdkuyZ!+qdapLLOOXtIT4$d{?nr>+31*&XqI+V|njXb5y*hQXrEqXqA(b**59dYPd zK*MwKYp3K6yu5K@wCN>%N70=VUPv9OA1v=Lj2(N>;iP~`I(g3CxSByWk%3F8>j2op zR{WW{L?rW`^lQwNxey`dfQY$_)8Y+Hn*xzpyl*#y?U$@T_^QI$n`zy=*w#qmW2bkp zGc(+cqv}8O5hA@3$6*Yr%W+u~!;KuB2A?-8qPqK7a@71!-aQDJ?K5s&Q>lcd`PI@h zxFJFK4Q9IMN`gwqB%&LK7B6(q>lb+t!V0H=@x1rDy^ITy^ zvrk@0C3(CG{_#oW-KZc#anSjw5AV-tv|r_xipt-owZD&!tI4=EtL)fY$?3G2Pt5&- zceRJv&b2&T%t|;S5HLFO@Ie`&B8sTSlnT4rb70AwrIYKecmP${{AhmIuPs(1nqR+8 zT=lO!S?9?^$GuKf z@TwCWeARc62UrG1PL^4eG zmIzN{_kVqM_IkR+?ViELO}WHhzSZ}QJ~dzF$iCgrrn)h3#CLO(M-Mj4&q}y~xR!M} ztiO3k&AcV{V?Z?;VaFhW_{8HXyqq0W>6BoH4Bp%!tN%Fo$`d0T{A86`DV4gU*t)5w zf~Eua>2E9F+TKLrCvbmp{Rbd;?>P#%nuds;u>*j;|Gn()Ewue5FZ6A1X!GRjVdr_% z#a+(zwu95|_%0FPyr5^G2hh>c0q3DVV3*8PYoM)-x^%@v&)}jiQ~>}?g*R{CcHbug z03Mz`URU(biCnX^5@DKvj%WCx+^_(^a?{@Xw$`Of7x#wW!`=J->$5)kdx~pF_V4Te z75m?QvD|X-wg&*Zz2hVFEpK}d2*v;agS-80FCPG4{0x}c?*KE5G-T&mvX1Sz`xS~VL<)pJ9s*Hx!8;B z$pVtBql>%4Zxz44{x8*k!Tira4|iXP=ieS-=*{ZwW@-q1rS^Dt_Bwy%A8?laO^Bd0PN#-^wKl?9e>bD+S$QiZ;l~)V8AU09bE{PfnZ^0-}C?IA9lg} zuIXO+Z~X)~Ya2r_)DO(d_2z|35G(+}kzVL4dt*T~!=5|1>+g*P^$+X3?Q`iLG{ELP z-HrFgg7~Fpckup=N2pKOypQu$NKg0L(x3M6y84f{^j3~8dItZ%$KH1TO&1~B=ySb% zuk6tV^-te_(@R$$f}wuse>ixW?a2*-84lmNsk0|rIS57o=5#j!2LKJ&L2rA&6W9hs zfGdDLa1Q$32E3peN5BPehe{lvS_i1ag^mqygUbFM@6vBLzqk1l!|Auy^LwqKQT`t1 z_YoHUfrF;!qgS9;g#Oj&FVbt!pP@esh|nYHHR%!bI#B6ZdbK|&xbzn~MF1yY0iyKx ze(nIi(8zmqm;>h?3iwL>(MIIH7s|!HFP;KX^Cy?+KkK1>{@@s*8U0rY0N#)U{>h!} zU**(=Z8-c#jmQXn?e!tnE-Ya3f0Hl6XNGx(5$HX^@bS;~3{wmrp}z@03z`A<-zf5c zsB-yjCcOW^ll>PSS|Lmy;3o7YL3n70)_+p^FEi@s`B31`nmtK4;EmUff%`IlmMV-X zEGvvCWGmz%|A?BeVf|G*Mq=VL#~ex6+tsuBES-rWDJ z{}*0d082<#|CH+AkF{q3E`S+Ci3{Kd;d((*-IK5<;1BisS3CI!rS!t|C+Q9TO>58W z>GD6RVAN&QVblUd7*8;sWjx7vVGsVNlfPvNy0MJI?_V;pea`q89rJ$f9q6IBw4k8Ag_Hr^JH{IPu zeyj2p@pABX@Ve`8OAgpu2lv_mz|Skct#5RP`u;5QF#-UM?~u>x|Fg`>7XV&*L(8(# zpJh^zPvp%6fZEgczFv2K?+;pddjY`PQSsDozv)=dLB45vclRgcub3VK!1klv-L2=l zyW2TX{~rOM#eJ6q_~>F>909<{XwU0Imo4b&1ORCF_ZwvhI>A3+Jvu4Kf$x>?0|2}W 
zK>rV%Jzu@|?DDXGtMa`W;F3X5KpS5#J2*VMkLBeb=5bauV%em6KYJTf{qJ~2rm&wl>$b#8uP zk^1B3>e~7S_-k{IuRT8h`TQ+kkPzth(bL1|nfCai+vm5(I2S#`(UXkaT9=t_-r+fR z>fZi?=N@I3wK9t;UZL>Xc?}#mB&I|Xr|z-#8)yGD#_s=*IQxUKKlz#jSYeP-a>2L& zB(RgGj@$+Im#!arv|hrbn=!YgI@Fiy#^RS`iG3n3_Uf(Wr>BqK8ykFm(LH6GfhtBN z_kL$G&0P>+ug5W_>Ef0-u7+MT_ ztnoi6Q)?=^=^|!$#Qg0~leM{R?B=}<6WZ}zARM)7+`zvJgndum1>Oz=I6@|9whQPi zwPu3G3piS=COjpxa;1G20L@U9PHpMP*5>_6F*6~VY->2!c<``5M*$%SZZ(iy0}d6M zkV+0%H}e?)O_Y~2%sFXoNG{)}K5l`nCG)n&BiM`$-2zV?fvxuK9NYy6rJItrtkBob za2NRO3Lr@)v=h6)b6;W~P12`%M=y&*wi$GwEFhFYQ+*Q{Xz5d28s5z5tCoJ&$4K4M zq;MQlLrO16{8mliIR3pvwn`9`ox62Pmv{NSbF1TRO%22U;xOGgRFyfyaf zduk5iLa3&2DbyQ|WU2WzQNr}8EQ@D>0&L{MsMHC({oDX zNKz8~ztjHO_w7zl4$T;hrG^rI zegZ{jR|qD|pbGK(8&doOY06BCm8ca>ml~cNod?EHUM>YQQ0DLrg0m+{YBrLDx?Nz( zCtX5AyRD$3d0chv)0g49UpYDYaeqTK-D8>vcvbXaJU*C}Ui2Z3Bs8S>Fn8|Q6mm2~ zeTLZ8#(`JuB5;J)RTmS{2NqQ`3MfBYYc}Ju{j(P~RFcQWf`y0X>gwxilNgf%W1>F$ z=cnp9J=S-!Gmr?;kpDa7|0TJ;JhY0wzYCN*b)utYSQ`Y1>3m%Xo&hDdMke!w)@i=s zL4x(nArN<=@r|1X`uuR^U9N?ijoiMXaa&8vU7+hrJGZ_kAw)xEJ%EBD^a)}`DFu;V z%PH`7wVvVO(OsZ@y_8bZLA`F@A?aN`^4x8d^9QiV0){P>WK04o zueZHLc<68R7$d)INLgyJ71&N|QU&pC)9jS#s8{pN2e95n3j&?VIpY(qR*pGNq@`cW z@-NY{B!lHw%lXhj71{UkabBBiT&cj%OorfY1+ai1%uhWEzL{;5TB;LHDHb~2V?=RJ zN@Cm`ln!IXDn;Q3!6TTrnwK@Y=Aipe*cCLOb`-{t~%0 zDNVIr$@PAUd69fOoUE8Vkb2`_;_~uvdo~3ByFOF6x-&K8K zP|!>7XP$if9NlU%O>}c(n?qo3hou4Xp6@O|iv@hFc^)%{q}Y%$cdlGw>V;RL;#T7) z#@d1vx1VD@$nyM)bw~oo{bsI)%f)S3=PSwB;km$WHAzPMR3LZ4-TO?acSJwC3<>_3}|! ztUYPwFj)S5!*B$_B_CgDSi;h3D@bw7E&*pc?xt4FgkzQSP1vT(UXF|T9phx*>X@nS z`?gK&JG9WmIJ|QPL-^St5Z?i3#EPlf##5%Z%u;*WGdbO{K|v%Tt1l&3)^3%1Ro;}I z&g$Ek4HLH&X4!Roko#tNV^!A^x?YUm`d82e*(S>|>(n0PO)hAa4@V#>hkTnurfFM4KHu*YASY45f<^`=gG9;@8v1h6JsX?H`@gKcdA+Z$~`YNpN9TvgAq0+d9IY~otpd*YG1 zIgqIV;2r;24N1~2(8i`uF73?Z!_*;RlQaZn7r3WXiWjggB}4RKaxQJ>E@Mqd*2U;Z z`_|Lb48uDIErWYd+`h=jxvq^JA&1x?^fM!=KF$f#Ec zX{)7~0}U!uyP-b59728Jb6M# z^@~QF0RxGW?eY;QX8AP^A)BR4c+)Z3jk*E!w>;1?ci#LI=+q$~2)_9Gbr6tbpFRATkYnl6KNUFtsggEUg zbtY;@dLGWEyb=LLJC&Icqp9$A^g+o^4)}GNBIxZRTNA6UMk-~;*ff~TvZ-CK7+RA* zjET`2a2mp@kd1=b^gMWUo0=ZLO2C9IFPc24Pr~u_Q29D-Pg4pyN*S;xsp$4tH1TwS z2j%-pXZ26ev^7H(48bz1L}QgXB6ki;#;I!&XBgb3r76?*;IW>{LQaIc<(P~QgEqo~ zX_1U%q#a1m-m`bqh;0S3ZexsQ?fz%xD+iTsH$>H0)&!yT{p**zJ zFGGmhluqR#uC$fLS}{e^PHJQ%h{fjikXE8p%@UL+wH0S9Kh+(*_uPwaW#->y59h<6Vb1|{Bg5#wW()6`Hb^GWM>T#I#GDiNV|1FtMd;-|a{^EgC8h7g-=vPsqS{O8F=}M_7=##xz`^c}I!Ke|>omF7 zbhL~zPf1%mJ$d(~XLYPgyiLjtC$3mn_v#6jd8<2*65*QD;8hY zDCyV21p4!}@q%ttRXCfqvtJ!s3Z88KFg5WRUuK0$DDGBYwe@tmQT5{KW@an{sc_lT zR`MFn&_%9Z>`J${>>Hp4x5b1-Hq}yf9`$bNxg=#Ar)I=Z?L6QYzs-+oc`l0pF!ht~ln8n1ajkHchn+qki3FHk>uJG+JGY?4jen zGcxMy@?GaNRpyc&-Kx|2MkH=te*`i>4`^+_-VYO_ve$E~mxIz*JFk%SA?J zRjAdwX_;N!XZbV)5PNos=^pJGB@>Tyo;k2M!>(>gVzX$6vp1cdT|7qLtow)`f` zrL0S-_;8275_!w=QOTMujrL=rLhrA-zcuq{uRSHvk z-+mC^p{f(iJBAJx#%fVk!e*N&LVclCC^}3s2`sN}$mkp*1S?TGU#48WX~pt*)xJ=5EtLpQ zS!MsKQf^C^0FwyDc(TW-Il?AeZwS)}j!~V9*cY-XkwYma;qJ+=1|hcs+h;hxP>L}% zgpIK}1MnW%!`iALb#luTx2FYxxAX{ezLLxV;7iAJ=b=q`QU{?gWU5=!H=PgWL%s5Q3Dd^}? 
zf})=$5ocREN^+YJd{#d~C$%B9s-PoXtu>g5GM(&F`f0IdQ@5cy+3J}s+{V;=q*OlS ztXk#-e*Nr#vXE6E?g*xofa0Txmo7{zj5f(E%v74RrAyB;MbP|n!J1izuZ^=dw(B02 zm1!DI9h33Pic~{FhVqcF32Wv7S1XlhpIv~hgVeLXGU$Ms2>TECIDXx$j;cisUyE;_ z;WpxnQ5T){P&6KGQpR|j4fvp#qmtz*gWaG}9p(+RZ#P` z6vur1g0CvDedaQa*mhj#{LbSx`4V{}K6KJAM41EMk1?6$stOV%evK!^^Qg2hGpIXJ ze!wS|$Qp%DQ^o`8SJbs`)!lbR>I9vP zC&z=5_|_Gnk8TOzhCaA9>p*qH2fEFwp}4Wv`#rzWP_t@%;Z2gDMCPHjw;~zCI6}|R z&Oxx=jAEQp=;DzNdiELO^-|R3e1(|wn#zG~MjP+;{V#G_Hk)|QJy_h~{{Xb~(Tttx zW~Qc(?gH6xi(R0#CIa`%Tazuwublyo*grxstAIjL9BY6ulzdR4qs+CVCYVSY%c4x- z>RMEk7FbHQQ5_ z*rOgEm8UjNr&f*O9mafm*z|_!n#8y9G+wd|vaLEExy+D@a)t}ymw6F2x!8-8V<=1} zc@txLj*3n8>oM+1EN&(oU^7ChKYPeG(Bm|@eIJ{}IxmH7 zdA-6n&u~i*)O&}E+HCG@X29T}&BbA?nhU8f40(uVQn~~sHk0Fa_*_($i&}GB5vNA5 zmuFSRd}5tJJfRpp8_zd6zXKn1fCt_dNBural=Z$RNQ>-LWgK5jy|7*sq>B!DZvCV}ImBCbCqj`CSa~J53 zFWv=+Q_$Y!O%^*7`BlX$Bg#R69?- zFVp*>a9GDtgp;*7Jd+0@! zB15y0jI3;AvY=p_4i4x*%eX|2_Z{+#09d`%p}N%cN>)jh$7E==*g&od;3!!$iEN@9 zS;H|-E!aZwWR5yQ;_=sO?p3{~zn)(nYD0)c<@-dFQtT`xqECHh;5a7dr+S}m9!YtE z@!koY)Zby1#evHJ{J;6z%T{B!LGcwR+A*m(0jhOLzW}{0Un5wk!es@MkR#7@eVqXZ z^?iHNVrkvlY5-eqhED*aMQa~!&&_nPgt?8WbN z>v)>DFci0Etv}iYLRnBdYMM;{g%UV@CLIVVLJhKgW>UVji49b|Ly{NVfi?zg2wodE zZ>(W_+q44JB$6ghgJ0IaUv<%aYxer8QmF()BEFQASFVdQS~mxupRd zVd|H+xM-GCkK|4y{}hIAJplBj&UNC1Nniq{rac%2=8z(Td1;FHHb|2LDQTqSNM&5a zVCHO(-+YvovvXQE}mH8fAsdFrWzW&pni52 z&{`ISeE*?l+T9Q&1Ubi#%U=MysVcKw51_;X?HJ`c?l3lRixrH3^Dd-X+;^fjCFk$- z&G0G5N5c8g`7!NHOjL^qUn95nq-52ro5}@IKAGIk8OKH{LDP=Yc9YW*SsXH;$`%{# zCTQ`9^(;#D-|fB*PW93(8Bl2Zw?W zQ1WvMx_!wuhv0M{G5Ge$>q(P|$#^Bvl8~UxU4WhpYQ2KC;uZ2?hy%3qwG@xc$OgWf zq-Aao>vPoPFm+vfGU^+O3FEExW>hwxveJeO9~xiS;M+8tT1dr3Evs9Q>jRIA+BkJu z4cD`ZzyolE%cQ?VebwYx8q=1;LZ%Sb zkDSSk$+7~2Kn`sw})C9qpmB>upeo$0|-W_VhdhyJC&6r8Ch{Fze=vXZfM%XX&^d1w_G z)x_P8bVAZ&5ko#J{203_vQunH6`D_cgRl~{DVLqSG{f%DSO@wA-m#}L*?>;uSRR|o zA)gY*$)2{4R4SNC0e2#EDFIuSDWfycH;$suTYtPsh% zzCLsRJkQ5ZHkiQk8Dz4H($w9@E^|q}Uy>{jC;5vx`q^1ViKPO+v`LFyZd*RI^Ppkz zM~p>>Fx@DE`6Jedk{m%Aps|tFaIA}|$o69N0k^V6t8n~1nrI=Yx~M$r?o%R2;JJ>~ zG&1fn8m@vCo`VYku_;4UoE(+PI7Se8aioC{kw!9MbNSg#_80MgSt%~E9HcV2L=V>F+tm$^U8H54$Ee-zVtT-@u62n~niG|U z=+}ZfLbGTZYA}KFW`fu=)S8&j!=6e)@bhDKl)lIYzk($s`zjRIs_LcWJJ;blI9O?F z!|L*+ndsB0!kKQ_Q#5U;@u7&=+!zBfV9V*>3Fi3EIff3rjKG}R(5~{5d?bqP<&Lzo zO7j7P5RG%dw=a5{fhzonC~X68ZRSRZL4n02x^d*xFS??FB^D5B zBcJrrs7T`H9qu8Ani2W@N&rNmz+cqv2A(Zrnj;0jEJ$i}BC34*McwKgJ7w^~5-ScJ zi1fj+s^CcqzSEm#L%|VZwxCpz8iga$CtaKTSpbnvsh%+_tEL`$-NZ=^jv}Ll=LN#b zwj*KdJ%royn@ts^t_IJirdkVKY&nLHRyGbTi`G>cZp)EBYv1918O8hFPBVG`W7U^0 z-U+p^YO2lrn=p3#5y;@tnJPE|H4)R<%%v_{v?zQAtf9uQ_iShH@X*+yyk-QAiC8Lh z9V6L-tiW}0OzM$$SkU-t1Pdj&SR%0$)FyUC1PfsBmcuj~>Vch36#o?XsxwIOs-efR zRoJNA_NxXQwRznOT%kIV{5m-BIF7W|T|h`(9Ll6#92sj>Ma7M+J2Y%32eZ>|k7AjK zQ)x5IG%X4ZQg+W3?9ErF!wZ+14vqN)S&^aWr!BtFxxd$gY;+hlFAqxMI^_j+2Ce$K z;-hTks2fb zQ9l>pl^e?@TSlN0DLzd7I9WrQAW+lRD@j7$lcg!m9R84(v$#Hqk6fMMP-!!PVe;So z?AX~KV=|fz;-Lup!@WY3^R!^YNR@Q+Q^M_6s0i}Kp=ifNJ4G47nSiT7xv4oJ+4@ZS zEN5;m6rHK7jlVT#-Y|CR!FHnZ{<_D-KdvFzdlVYX((VN5oARVOUTz!AUbepK^-<#p zyC8%4Vdvw@Cu&QJe~y$Y_Z~JH$UA)8JQ4S)^j~Lx(|bw#ziBOWr8x_9M`ERo-kOVU zGYmBkE5yy(e-%VJ({J;_^%H#n+ zskfO!5_!})=s?5-k3fJXXsY(PexE*mn!y-WV@snae_^sAHL0q|1?^CXsq5+Lmyg04 zRumz2hPU?0ldmZ?mZ)ceoR9KMgdQ*8Ltd&$|pjPVfoj=uZ-;a)Ze6eJjI#ze@<e&gdC1lC48v-M4Q ze)AyL0ATgZ`s?j5(^fb);^+dBod)+Mu7snwXc~UAji?LT507GeJd77-p}Q249fpzF zkw(>>ON8l@_hofDpx`MswnZU7<;djs)VfHf#k&bN$wTB)K64Ij&fI{m zqT%GX_#JMBlLxOyynpN;7D1<=4ftmM`{coX=5)?f&k*{;g*Ct^;-|pYCbdDxS)$qz z+=FOkLyn1h)1BV5c6g|~#oKyNUAK6r$M$e-!!XjJ5y8JEJu`U@%`%RquY_(5sbOAy zjuIPP74)+nU&p9xa(}zVA|t}ou=B3z4CuXe4&$}49s??pN_YqZm`ycSaKjuFbRw4W 
zV$b!j_@s+k4_5hC@s1uH^Nkh$+PP#Lc&FItYnyCchiv63>%vPMFL4zOP!3bu7}~pp z<<8IsFEPzJ#p9gig`6|1BC3delRWFbpug{y0;ocs!_(*}L+i0reS(o8nZ3%7YESNk zVwIz1m6Xv;Z-y_K5tVsRmOpxxjmM12kdae;e6`kE<&<-3$%oh<;KiQElj?>kMDaw@ z{Oui^BkDI(&5EqfJn$6d3eAc1p;Y>7Je$?avpiR28+^XdYi6=r7aPZ+M80Uj`*p&n z{22ls@#8Hihjp^*!a%ztFZB#7LUMw*;I*Y19qu}m7g^+M{LH2B(`cp)?iS>F{*^C_ zVfgj`fo-}iOrIS8MKyjDTk;HZ<;I=RXK>I!b*rE05vbpldXX>V0C+uE7GxYE3uhB; z1lz4}ZNI)Pi&3!|a8+!YvMyxiq~r0IQ(IyYH1Un`*@!zg1E^GwGY?>6&HqN>CA#hO zS@}%?9~@0wVi))dogRK$;m7q4t5O2(O|U_xvZ>MifMQ+N;gHBb=)Auxvcm-BRQclzKQw3Ay1j3mjAfCLt%700m0 ziShK{hc<)e5#uy}fB)^0Sq-&zftaaXz>&q~MAwCi5nt&Z4N2|65)YqJc9-M%^QvZ* z0>SBMZXihZf506!yO<`}I1D8Spg4{eG^FlNb<(6F>M0da^v5zoP&H2>cKb965`y`u z3E;cYK(nTy6=f|%S@Ctu7f zl28`BaFV_GjsqFa-N6^3Zg@;QGs0uwR=C6)o$A+hQ37NGIsVW72N{0~rF>RP>9#Wf zzTrsH4bTc|*xn2>bL;}}ERM(~8E8WbG|6L%$&O`I9^+A82QZXuGEvcPt47Hv@+G-7 zb9_|!);Q~--_gW6ux2E9g>U~RqNdo*)TK$fe3(8a$dziX=^^3qoo+6eWjnly7bATQ z#QA1}X|q?GxF}bff$!)B4&OqI)sikXzj*7+BKLIghk%IYLYx!ZO$mxw0ZlaEDwj*m zXcRe3em~~gOq64cX@Ui?5kI~~;t1ab9Qwk{;roMy`^YGMP_50&WnJLN3TYRBxzq%B z#62Qy!!;|*N95G`9P+H)QtPiRoKA4H+9_k5{9bhXYv57}&aV=-m)Y_VpcJgxI?ib5)>c#xd_+%5aV~ zS48ZA?@BJlvFbXKSDGcuqSIA}KcY1pl;Kh1zZS8k3FhPjGaJk)FZ|4dV+Y-0&or?OQ@%2LRvPD#=z z*<*?dNz+26OlGq0Q-mTWMOsL9A!An*vW6@(B)b{Q^ocQkxAQ!|=bSp{`#Rs}oc{dw zhu7=#(S5tO&vm`8>w4eURXXmIPO7lF20{vPAta^2k<6~>56 zr#O$_N_r5PM8Z%t;nS3xrQ)s*>!4z%g*~o-FDvA6!=#ob=pzHE*BA(h_sYgqB;z>@kmY%n}v19J5*iL?(F(3FG@I`u2XEzf-S8y=U zfmuMm{cZxwzRAD%A2Jfv7!msu94*%8VuEv(c)gDQ+;A7e+uDCyV9Nz~tfi<(R$A{> zSd7^b!A%G|x?EODHM4*|erz<}<8p2P6iVUt+Z*SjC!KKhY)3ML zH}KVOZ{nQx=}Q}Bjj&v4!%d%zgpeP7*0<%pn!~J8)s69gr+<9XQl2DKQ!umqD>f{7U-yU#DdzD#y=mtX*H zRhz-t-I#9+Aj(H$kyyv!gMG5Olwi4S&~6s0f`%kIgN<0pF=3$tGd*x?KuGM`=N68h z_tW#ag>CsCt35BrwO{urh#J|En$c``?qS`Ko88@wQ4+9kSJ*3|Jp+xM-TkDBMnCG> z#z4UF=`Y4pE_UysPpus_P;EKMOnSuvMru{>>L9L&?0Kc9Pfi%oO`S}P9U$qH)wk`> z7ex{rVGA=8`J|bz?n^Kd@OvIw;L2%h z^_3T{5zV1KKo_(PEC6OXm}w5RqAv)`5QAdqo78}~dE0v1ko39t7yd-tQ>+4X%CWe0pHL9i0&i*c3pjlpsP(9;2opM&6s z7jqOD;?5?nDuk?{q2dN)!F=#t2O3yhMSFtt35YdNkCG+e%M8%0yB(T1;js7OjkVV& zgHCFy2uFQ|!|YNCHJP|=dXH}S&Jf)<9{f-&FOvQtXT!-uSt~{0 zUEl4i&sLvi5)%t(C6sDxmrivO?1Mg>eio`6nED84izQA9*LhMe@!V#%49Gnlz;`Jh zZ6j)3dtzPv-00cJ^9OyS@dD%Q4}@uVgWfu;!l+fN;Z;QO#b^xRF2ikzS`a?dRrr%j zfzNsn|0P0?YS2Qd(5uvQ>?9!619jRp=`0na!tBB}fNm7q`xuFt+QS| zUGv41b-c(My8g`?F@b`Vu;nCap9X4npTCa;(Sq3w;TSwOT_4yI9<#k^L0%?|2xlzQ zD{wZ##4f*;gUlGibYIOL)pcj=dqH?%f)gM0(~-(}c2?;mswp$Sa|zbiQ?R%M1CgE3 z2dLW2u6p(!D29cr?iFNBjnC)+&Iz}Yod|ief*3uK#V_3CBv{H-7*UQY)X7Za3gd)E zbtj|U37%9G5d#_1NwS@`(AIfxd+jCI(RQ^t6Fa>{$A&Jx2HkZs;?iQ_HD;B~X<_da z4-o1`e7*_|9}VE$a@mWRJ7OO0w94iAF1z?+_`pDWa>ang$8iIx2-U03TXxhk9BHHw z!a1SJCU&%YeN>FpMgI+Q)+)QEXX=iTUedn*F71OZez(1U_1Pj6Ugt~?F2PLl7XbEG zV{k{7VCTq~jogi#wTq7p;nW>7{v>MpT&{_>YxD9u>A)b z6SwR?9$xmr@2jBAZXND})PX=6@PN;#Pee2H8hvRw%GG_LXu%TY2PdG~3Lm+=SIuW% zF$b@=y$COupNvj9=d-1xd40fOn<@vLa!!J)Qu30w-pl=w34)^5Mb`hq)?yi?vGA2T zwtbOwApb%Ko4D(_ACguw8za+8)ST$u>LCuRz@oY9RQB?*25r@HRnA=KN{W&0d|e&7 z=w}vRapO=`g4>`1ViE=YqOSrF!6bm$z6&_64$!E7m@q183FdA`6XEDW(q0UbzZZbi zvWx`0Btc5$y>qptgG^`>;Sg51lNO#n`?h4iyLJdV7_YA*{zx#e>yA6T)TI@na$q&G z2AzYPz%r&nsp1Cey>7jz4lOEzfG@`(<}!o&)?{$e1f4@r+*4c5!9^=Wm#!yx)nd>FayAfNUOkOzmiTqa>k{u2W8vAQV)prPjijUqVZ;{cfht zRqBOTf?kmjoufv7B+^7jZs%f4OM zP6%WBKvt|{qxE~5ON>q}-hRS4K(%9Cczq@{@sy?7-Y=%FnLWiy&s52x>$7c0s57m=VEu=r3G_*e{7gZ^XW4kTB^q_PNb=USPvQYk91>(V0&W0C6iI+uT zmhSTV{pd`u*I{HqB zwL23u8RxzQ>WNydd&w;8R}8H(Qm=n%Ue;7$b+O`EXuzS5_gbDFGf&h`Zul%Vp}1MS zr}4-6;11m|BUNmm7=;R`V{7x{{GxO@{Ns0WzB!2~OAK#w$_1hZ+J z3uH-UfBh}OjU4$Rolj0nu=Qg&U|bv?M$E+5vhNhPklZiA3dgdt{8!Uk>7b 
zCj~H2CY>bEP29nnt1b;l-J>rj(vceIlJ4GoFvV1t9`%c7`1Ycg{OPOLhlBf9?#%8o znlRo6P1rJng$fuETkFOurBvDR?yo7nNxc=lCaa~S56VrI$9cK06^@CJ8X6tCEU_Ic zFH(hgPt;P2+{F%{_Z+L;<8Fx(G3pQ0 zjY;pA55GFE8FxxxbRW?j!KZh79r9IyFXu{{8&@#F4oYPto@>AfVOMmUxb}Y}CIm7& zf=@?|>f|tQB-|5rjFQdPzjgP^t-FVPH_OB38X==R)VvH4y96Ub0D|6I-buMu%s|GqW?#!7 zRaV*^8pil5_)Exu$G(g;w;#nz(xWR)WwK19w;RiZz8QX(&KtEFDT}8Ir$EmQn>}88 zTeX~5)8CMpnwS?r4M?9D&lqUqY=79^+}>Sz$#&2A>dL##pTt~R>Q9?*EJOkjy_%g2 zJcg8zCD;@d(A;O1<+*%U(4wG$@)E3+xC0Qfdthfrdsh{7j8jcJJYQScwxkdSqbe<1 zO3+U_jTN>k1RCX)Fms#qs@J2Ww~HIW?*zh{R&L|$Zkd>K)f(0^fwbPbN7dP%l~pdk zJ=)w^leN*-_(@uTC2QjL`kNv=cdMTB9}-&K1mpkaBT6`jpHScBPwUn(>pREA2N99V z?2L)L${xGkb9S8dK0ThRTBA->Uh*8O7ptOd*M0M@YQ=8ly`^|DLXifeQ8+mgEjVO> z75o-8l~c0bxVUmcPQHB2=M?9aQ(G?@B5nR`!fOYZ_&Y2UiQv4#&o`$Od=HLu6e#@h z;e9Txa>_VHwbENT4M^`6k zr%rGj#7v&2;I&|Hrd}!Z4?9PH2&G`$nZQ7JJxX0~m3Wl%7u` zr6R>eEiaDbT+HsZKG91!%kY+dPEnI`JwNDoWlMm?ehV{{;h*&PCua}baQLfJyv*4* zl&3<6iP(y)&-y!tuLFBZ$5002ua9!O@c>fBQ7v1gt+Sb>apH+j@)PKr@w4^biQ2+i zFQ}wT^m~~Z8G}}*qcmSFjz$mN{NV<{_d1dt{q7Ss4GdXSxGpd za)tIBu|9qZ|6x|l*@!jkVV9HOtgQibOfX=pp6eA_rg2IV4|I5_EFQ)YNR?Qx{ysjJ zUUYVIiCTYzZZk@(ge|oyo#o~uPyS@YA0L^?Fm;%#O1Sj|`2@?tE_k0{-gwB-hhkFK z5%+6#P}7I6Tcnnh;sQ(dOBRG4%v*@+P*s{arhVKpbaTQx{%0@Xqa>r1JIo>LyhiJj*#L%T!J0u;MlTLOf2nBDot!3L0PsuCqDQ7y7&3e!Kj)iL+w$w1SyBk!=q z=EoGn6SLvB*2(M;bU~zpPb?cdFIHj!f+)6LzzX}7SM#@C!7~}Jcz1#h?W#{aM?upq z#eTpPwS|E^iHvI-n4|zl@H@Av=5Ic-AEMufHk4H>_2_=Whvz>|KUz8+YoT~haSwy6 zi-$}UfWEP-#zCvnNYm}$2Gf82VkbJZRMDFfn50jJu2jV`@N33C+?!NWwOs8r(!n=$ z_`~Qi-I{x!K096IUBioA3(sPP9cvkY?3mNL)W@TKVd8^gK1@EavXrvVx483$_ocp& z;dOS#lb_F1J=1?cqCzfE5TcP2@^b9KeiKBg%g{_3q$K5t&GP~ z)t@6X>C)oGEcdjqs`qq^knPr({?1cP_(}Ix#H?GDZh(Im)i+jUY}Y!%(~rTEr!KE0 z*j4ps@ebaM;=xL;_uNR=P|%H^miV1se~nvpd+(a*ulU^s8D2RP_;Cu z6BOS2Hzrej2U22ylqN7tLz&u}7u;)oOEwBr!6*VQ_js(QF$_ERj-jzp zUB3=MY$$C5AOs^s>RS%G1OMHB^Eg8bixZz<7BfWBnK@`AM20EuHP$^-3%l@E`K3&> zp`0_bwm-8X(TU!1bIu-H<{CocSUcUw;bb!E<{!(ZZ}Y-|!g&Ye{v z8;C<<=?;W6b`i^~tPjoE%oJ!3;JL{vsERg9b}3!^szo1psvjE$n4*-5JijVk!RI%g z0WJ(*xo%I*^GsK+W=`{51g09KPIpe}3ghEzGek9Y1L8G*2_^&5p4rdD2F-RqEiux% zpJQkJ3q>)#RFWl=ZCHE1rn#KopX8%(;n^lc3yIFpOk0F$(m1OqoK4HVfgKgpSs3t+ zGQ}}W02_rGc}YzO6A&=71_u#**!%Q1FTo52)PcVaSOvU5`u)BKL`)%*5{QhLy~`8u z_?symCsg;AZOzUNc@CWtep9RZT;V467vmBs=`lE3$HH)331XFS_~fh9MK@$5Eu;%ka$E8ua)TRvdb)sOBcA zB4hi$`G6m#D0sz_aoAdTJ6O8^ayCBf>9M*uPR%Uty zoK4u*ayk|#INWS3-&JGNGdp(G*(p5M(O6;HSYgIc7W0R-`l0u~SZg^F%dWylDi3lV z=uWsHdL)pbub64V|Dk*1TbZY6C$=99=dp+(pE>nLUK7rUKTi_lC^9ieSUqV~3Ey(n z5cD}}up=}7CUdN!D8I1CB9J21*^>Ls)7|_~0j=1#S({I7Ztm{=nnu;ScbDNa#J)W7 zL^>Wa^juI6F6zmkRyPJ=R+nl)301cEa(oc`10-FJmen^;h2-g(!Cb}nAW*bYf@1KB z(PQs%??ICjORbkG7MS_0SM~atTd(dTCPqrsSVwA4F#U_ZR$FV;8|&}WH0!GEj}SF= zxsXulv%#Ziraz{$;+N>EQ)f2y`f%m6G~uJvk6hq?N-hHjF6=M}0Jy-owJhACAQD6- zY=Pw^3P?Ga&?VULMGUmD1p~zcrY6S<*f=+V3q|}M@W(g77by(dybOxFI=7LlMBe2V zp#n@YX8gz|qRl-@K#Ad)4qGWPCc+cXA4IvJ{qpub^rd?BII`i@H%RmE+oHxvz}eT- zPh#$WOk!!mNKQ~DHwXcIV`{*33IQxobTw&qCz>PdO68cs&_50yai`fMQr zIDN20Z^&dZBn_>pmDp)`JX}2%fr^`QR!)U^l74LY&!>%9J>sD3ZSPjcvI$lX(>Dvp zWNEhdZ5-3tYiz7?d&mHX-@kHGzZufzA5B?nmLj&wk$Cc=7_c!z1n&~c8&_TQ3G(YD z?`m8H1`u5#Cr%(*JBF0#J<_~HIyI8hgC2}N@TK1ZBL>AF2D zBOac>_~%u%PAE1Lk3Kc@T$lG+oi2)C5f z7|NfV-g4)_ru6a@!!n>|e$L+o>gWtmJ&kc87-kGhVhN_=f{1P=+CNqg<=Ah|Wx;J6 znZstKfb+3`dVC6T?RLb@w{n+xB`iev@2CZqEi0Ziu|U-sH~^qrVO53WWB}#*b5?2D zcIeTTws+A=3tmS9w!@}r&W_Bw74_*ox`7%a??em*6+(iewCi-Auh{gf`A`q^IaOT* za3&b)T+3w}_iqhOio3nJZl}0|!dpx9^cE=WDUzF<3o=}&R2Q9>6J(~VJ2OOB1g`@J zG$5xck(xsz?frLRckhb&WoKQ{4*Bry%hAOaupB?G>@p1bTSEkPIZkY(11F&G?JoPc zG#g7-yRP8M(RtBbkv{KP;ioz?GQzLndwKfMXz=!TWh-0|w@X``+i0iXdeqyatEA>8 
zA2HAB>B*ui3QV0CYJ2C$DqY3=rA!&4@$z<`UbmFhtF!Zhg@cqlK}J_%$U_2dU^gw0 z)E`$+M5)Aw;~9+su8&-(Xjq%aJqe?h@zNC#C`j^3Jsdz!`YtI_$f~KT?nZusZ!^1B z?fSRbFypyaw`=|0o69hV^bgWl+nGm@ocECg^xPKcktV#|3~smz)Mhb5DqaFPy=Jd8 zVCW(&<=r`pIrJ0U-J1{@4_1(QLE_Q}f*2!^m5Ns5z=YhEUH&vc?T@_3peDCR+JGmK2Uqvn1{TJvcgwij4reX$ra8(`IUb$&DVst z{c8;HFE|O(U@Ql?fV1?fF!SeM0SN0D2jr8V8qGO~)f=5`V?3aO4XCZwE|A;6LPes2 zKDT9Pq9(N6O|Tz`=8x)BhX{$beb}Ox-G!=H5i$%}*Ix9Q5=`8+AnGP3)Z~L6?bTd- z$dP7x)dGI|UM*^fk@ZU^E^ORGz=T1qve&6@WLZjMkpyg+<@JtE&TgJ!`jl`lA+uT_ z4La1!_;R#$4anfNn%W;%i4$;3LcRVlt5WP8MiF}Nw8NBv5yi=On*e55!^mKtn}mkS zwS4Bw+hzxg4+Zj0u0yoa=x`==aYZ8+c@D#g0|INkfhfSI;KMDO0E-(nynHH=6{J}{ z3?%ymn0*ee`xil#QH)ISiY`};a*3dD6LycG`}Qse&HG^)?+eL3j+eS+nvejv ztVa0*G$T%8>Kz4X-HL$p{l_aJXkKBknps`_BDZ-YHQYyq^~)M=!V`h4ed#?$Qk;T) zddCHi*hCsD-2Yhv(BiD38%C0{8+mIJCX8MxQ)i2oof-WkzE2RQqC?a*AT^<~OEY`h z>sAR5N?LbL{;FP{A%%VzTpzacfndTPcJQxPc-b`hVdeTuw-8h+Etdg+mwVc}NwpH? zOjCa9dD@MrT7Z|eJBm)br^CBO>#lE;Ur#3Zkt%9)7O5Lr^|8tore5?LX_a9nJ$9Ax znH6|B&YsrH5_R{K>Pw0sINc*rZH25V*!Tfnl%Lh~&PtH7vCD`zk&MQiIENGP2t#$o z)eO*U$3K}%{%Uf9#pijun~K7dGrmu?FO|LH!IGhaz(pp z0U34{_7h&f-;Qh}v~WcLKfFzUtCBFmtKn=@o@K<%oZ@LtuAIE(-bY?{9gKCNc>Q64 zurY(ify{Y60u~H?0+`J4*I_^F4bc3^@b@nY21k?D8S2|GZYn1vpUW1%>!#yyB^fJR zo39kDwDaSRhjd%x)xCzgjX!B4Bo%3q#DazlI;OC1M%_&=i2L|D_{v0)5m>*HEhoX6 z+@cB&*C@vAX-kQ5(!Ehy9rX)K;ozhMyvHfRCc9in_5mVo0TW3ZR-oDxU zd`6I==g4rQN~~PMYi#9qTgsZT8iw8t)bT(%o!d}2BQBZB5~#XVXSG4LKgQyvt+wP! z1l`n#J+l~-UMkeXk#6H|8};F1MbLbu%XDj!B`PuNpj$3d&qnqcwaKlM*ypZdq5bZH zd`RrXgra-)^EY?-;nM>|*Tv9Dj_iUOLE2*RXb+#QERU24h?DL$&dWuEcxZdP*rT)vqCtGb{fRWCr@8=T3x+I^eCh#1$dsNa zQJ#cp)iO0QX`L+D(|# zKH>`7L19(#{$N;k_6K$gyDM-M-(7(} z`TGd26sf;&`HA~(@&TI;_Kh175s|Si2(k@~t2YaqIZ1U{raU(rDaiaU+ zA=h_nOSY@5J+CX8*Sc*ZSr_~Bwho@TfF-Uwa|Ueh!?B@bm_Y{?@}kiq!Ue$xF2hkO z1k@|#WLxn1##dx}0)tX*dJbsHUfm;Gofj^A*!SStjt;&B2ta#Q7-(bPfD{S7wc zl}*T{&ZVXA3k9EpOb_e>*`jSgE@@#<>hwRwCY&-}ZV_y1Km1u2^Sqp&7b+k~p8 zeEX6IdzD;2?%&(JDrR>upRZW8-NDDj)!J&txy60GvyJU_ymC9H-s5}ujkRlwcuUJZ zW`EfHPPh* z4;5_4lW|f(nejH+oSh;jO-d}UW`4A6wo00g!Wyf zG*&)4^>o3`*wE&oLL863+hI0r|Ly+$$ruO4^zLtZ5{gf{TLxdHdp8Jq``_dx3K`TG zsAP^$8uUF>Rk{;!$3J{fV}KQ!)8>s~TanP`Ra;w_q!3L7O=bTaE$8!l^by%Bam2uJ z=|3wz)M0*q+bwYNq(Lroq2Hf8P-5i&4&J|#f-nEPAN5nzZ2H(KIa?t({q}eVCI5C!=hdzVw7P@A!zUL|iR#zi3_Culx_mTrg>OV2-@+mN zsZxg<>E7#nv3#N|QE8|_DD9fM zQ_D`iuOCW1g$_EzeRf`h^flXeBc@lNda!VGgk(DAf;1p@%8DY~JmaFg+J$JxFuM}u zYXU=+o$Mb3yR$C7?2ox$zxK+7^3pR#M)q5FVcXiJP;PR`(gVVG^d(y5@1D9LJ+S>; zTgx+~iH5|m4{Jr{epbtR{*y7ordO4O5Q5p8C7r0Zvbn-DD)}(~L)j;{MAA>_QFpSrK7Ul_-ybUU%L-ytLk-A;c&!#H*rHkl6BWS08jbX*TGq?$x*c@;beb z@LNk5SGuab_g%F8a^|SJU3Bsgdgol@s<}4_?|in|6ex`+8@O^W@H;cN_Y^tws@7o@ zm&Vc`4}N<#hX;G{uR3k;6>7lH=K(4DVXzG_y@Qd4=fOFy0_o?MYn^`o&u$<7g2$N) z9=PMzE}pq!zkIOGOup~n`&#Y4Z=MQ1jo(#n|FPlsWvYMQaDeC6nyddq(dr-j`LTBK zZ#{{x`9me+9~=Kz@%FdIvV6Zb{zLuSAA9(*SnO{-DDeMj>DV7T__0>rZylTk8v-gZ zU-?hvydRrZ{7?byx2E-C|C#z9>Y@Fsm;bq9({H_;lKda*H~l!2?=|snUBF-s-oNYQ q|Lf%+n}07(f8YF|&+nT5TezB;tO7>_2I@cY!K#7k7reRr^#1^2a_5Br literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/times_1900.xls b/pandas/io/tests/data/times_1900.xls new file mode 100644 index 0000000000000000000000000000000000000000..e9a62b2c25da968e90b607a72efa4e9547e98048 GIT binary patch literal 16384 zcmeHOdvH|M8UJ>(Nj89x1c(R{R^E>ULjuS{c6o@88pKf??0|zI*(E6F2v-jTYjx$bm zJehO%-tV6C`yS^z-+A1Vr=O}ga?{--Ulk)orAR(bl}U+>9>6`bO64M>aKq=1Q>m0D zis1H(^ck|iN04Ar$!N^09;Q~^DT!}msc^L9= z2oXU1=5MXq(o$?bl|s3wxWgdPgU?3%Q!()i-dU< zIjF93rvA~_OK-S6dDfbz0w?3KM;DWo;NB(cWv!B9y}fu|_mvV1&~Ggn@~K4D@6FI9 zi{(P;mJRr8Qo5f5DculLgY%fjfzCB<^4iED*Cn9uzpLN-fxL$KP3VZGwfkX8jzX&ln|8g 
zZ~XwqM5onFE*vraF@hK${YwPkh5d0U^TenQjX+$+m^}98vEypSjjNl#y4Jd_4voeL zkFnVz%wKguyyX&mfocr@7`Uwh)9dW@@l+ie2k|hJ+e;WIs@N=`j$b4!;LMQ8D+v+ui=_o9=VTlvp3Js=UPr&CbdlapL!hkMiX z(2dIex%4|c=s)+M-|0c$>p{QEgMPaQ{RR*E4iCDQJYM+sC>&1Wo%Z>W2VKjPTVBhP zOTWbf=WY-By&m*md(gk*L4VMLUZ_03R5)A=IqjqUgo`3Y|9(Zqo`HJ?N@%w#&qb1^ zClZg0d1Q=)?pO3;1RX{H4+^Mc`R5c5dW{Etsw&n{pGz(JGjcv| z$6)HE{5Dh+U>vxFioEfZGF zT6jAxSEwltp+4B=VQx2}>{j_?mNI+eYzU=$Bdwn!l*KI76UbO9SF@=Z&FKG3t}eZQ z|Nab!z=0g6 zvC}LqNCUP?mebg2xl<-A?QL4n4;-f57QV>nkEf$TC=*_Dw4mLoo;|41?iNk=Mz@eV z7PYc*8xOgSY=L;kEl!xTAUV>}#zhiO*rm&jp!`hJFGydbKM42ovmkhIQ^yScX73r7 z4~YIYKCN1(H}&APqU`mKyKgf88Bpx98Tb;K$T(I=3 zPCUW6wKvF!d_ejd{XzKMFDnTi+->3yG9e!j{q53Q9WIKql6d+Fmk-_`{H)n8NMEBr z2-cN05Infs#2;j0J|OyAuD3c|@b;@tJi!%`HwZuT_6yS2=noQcfZ)O1CjKCk@&VD` zw!PKi=2gGy#1mWudV}zYM86vRf)s#7OOJ@)h`NJMIp7vbAwv*<6EdPx$%j(2G&p7_(z zcT>f@CofDJCU7%@Y${AP^|@@u`moW97R9DGo;d!}VRtr_CYy#_HWPi==w*#!QxZ?S z_3lgVY=)X_X6CY)>cfT$pCU|?V5rh~;^>jTxw9E&vYD03X1Wg>t}oJT!f2a!{^`zU zxXEUAE}MEEHhLbcv?+@xK6>*7cQ#dIGt=MG#(>StJUp$h@qxbA-PxRIvT^aWaXxJ9 zp4QiR?7;`z*^DsRxOmzmA2xPR>uWsp`a|w)PBPiJc-k}{Hg-?zYwSDtqC1t#*-93ji z8wNC_OiUYXvT^aW@jh(qo>t%R!k+8g*;JcsTs&>E4;#Cu)i?a+x$E88j4|1`cv_7Q z8@s2~HypU>33oOTlZ}g~)%mcods=-%?~~~WZMMxRCL0$|o9V;G?rHT69~`{KO`Ab7 z7OP&#jGK#$adUL4)N`y9S$_G$?x5qcpztUY^b`Zs?nU^9a{e-R(D7MN_>T!X!2q@U zjL6VuDUFKW&!cT!6#ZdjFMq|#58RQk!1NrEQEFE>;uNL02uzmXT%A7G z2VTOmH9aLLJJ~2(f>X}fvJqK@k>Cf=kl3b?u zl%mFTqef|JP)-deuixI(wH$%4b64^OOagfFtuvmGsk4+U1E@oT~- z?EdALAR#U_wsowJx1{_3xrQ(!5SJQNL$61pw>Tm|QN|lHvB-)(+!HT>;ReZRkYCP= zcPG1=5MGSH95@+)`8)tq(=PZF1B+hKsCbI6E0(_d@ChdE@NlUfL3eC$${G-`JP)4U zr4IfQDsBA$ZnKNkG8`%c#Ce^`_^$9;Cq{4p`U^3V5Q>|VT=ghXu(NH2H10}g1Fe3t zLYjWELYjW+4GxsKf?crX-AUErUo)5v2lR4fgWNW^hkQmzN1Kz7l|(;TlISN(68*$b zHm|1`wj1vhp|}_gaXxhJ$}Ie8GP>$~y6TFyWIL=i!&$3n2%2XFyd#N*qrGyyJwS-l z6+~x}9$@S}6rH3XXmH*7A>~w#{RP_HiX&QE>xnK_Aq&t(va+eGgCR?Y*O*h#;6Oy` zmEh2Yeq}{1x3J1VTv_<)@WpbqZIk30Xjg&16#!YQw7n1_Z&i28pgV&hkDSki729O9 z@}O)9_MmkbMUx1Xv*l8CJo!p!wVB_b;TadU#^cEu*M~>Q^){Z`ZT07tn{lN3X4^Z~ z+X*)g;Ed#>fiSYjx?=(P5C?Wxj~PzBqHRMwg)u%XrCWQ@y%unhKm#6D(qT87;cEV zufUc6eG--rZ+-U<%hsMBTchYx6@8kzep69;P{v9V;Lsv9fy?pc(GYnJNB)*7eTBMC zMCpV|$6}Lg_meXyZ^Wk9?r*}JpNIj-7k~+Ag$9-W7_)wpvJbO|eOWP3!|?0gZ%!(`x+Vn(lg3jocG^eEXY~IZA?Cv#)HEvbt8V?>+e3aDBZYcKrE`Dx zE3a*TSW&2DRCX`f(f*hJKA|YoF)DLjyyMLj9sL{@ice z{raI76ouMF<=Sr@yX%$imlcJ&MJ08~q?K=f`&C7uW>J~_^;OTk_57a|g?dG$efRe+ zdSlWtMWI$vx%t3vpEU6Hgmpm&E)MAkH3(Sf*IQFMmxec2bf3T;OO{(lxNLblYJ&(w^+U!>2c1wO_w zR;&gYG_)Gx0c^_ZCIvFy61&D(0+`p`@8l`_NlXx>1`JyS0c9}Gg|c^ zWA_b%kN*F_AK;h1_Btx@dm!uZpNjC~`phV|tgX4Lb5my`8CkleIo_^&!}GH%m*W4( z7P8{=*Qoy~OZNa$3QB(Z2cLcN*?rSTmG|y}{x$F4N&O!~)^SWQeCwD*z4_s|1)06P z9htIoMm`kzGGvaOS0G#VpnYycw%Y?{QAz7ENMVscOeXWU8cf(uY!5vxkU=03lICUc zm7QJdH*v@(Nj89zgb)!Wth^ryh6Ip@>?){e4d5sacEG`q?3$F2(6FJ``O`7my0%O5`EPRmek; ztC3Ga9)^4}@^EDO2y>0?{xf8z&#kBzOBeo<5|MMI6TjWE0WFk&s)ol{#tEWYB+RSG z0d;knMF@;pLJg!63^a?8AO43@F5>8!0{gyZa8<||37_?YLzZR?Nx5SzKmWcLKA|Nrt9;T!LnbuDUK?(oX z4`57mTHWNr5z`+di1E?CL;zmcAD1#ujGE9e#AS@hqjrxPT|0Vo{hZ}>)@@B_1V(s_ z%@$$KvWw%bm)i@}VED(tZ55bOZ?BK1n$T#7hoRhF!az~QrpqL$7g<5(lOjrz6?Iyc z>|^@vsJc;jx1#Q>6?HYCTA3*0SutB|x$T~GNBOx@3paHzf&d0DP06y0#j*^i$z(a* z#?Y#r=5JM1Z0+@i%Wa(VEE`idvwqg)t*y%=tC5yRHXzl6Dr665O>J$nSFfJEb}dlq zUs2N}k$QJ(0}bg|BeU!ArE??^88Zaa53bxkMFctH|PdmE*@3+jD58KV+v~FT!CAx7c~wSoK89KOxrC@ z-&IjjJ+K;f-l6DRm|FPS&f5OP7_t@Jurn9Bif+hRWznC(>`$T@J8+@vR$dV1Rh;3> znh9~{s&JKDMZ;xMOlC}9cJIb;oosZ_Ss;job(LB~a12nx5{8-+F^C6qE3imG74#gj z01jNu6vTmw7gT|vwx9}ZtAZ-fnG34GMk%NQ-M^p;>~{rKV4x|e0((qB71$*Us=z*0 zPz46Xf+|cdv_fs673vDDFs0B6vkR>-$5Da(&mIH<8^dQCQG@#zMVY8!RtSdE;e;KN 
zQ|bs(E=WT@AgP!?NMk-A4UJ9^GYC!tNY+bo)X5bc%ZjxAjBbe2DZ9afA?VK+kN^Gm zDR2EJI6!tui91N@2|tiCoFH)zNm75^Ws~GaqdBJoq*OI0rH;kcc}UXt!wg7iTE$J9 zHl^6SOl)=n*lA@p8eXY1z=gIkYzj3CW$i-6+=V)R{CJK~u~;leC_Bv(inO#$ST$?m z?X+B>COU-rV55h*-Gs7R<&#;;?2R)Zlxf1U>Tlv8fuXx#ooPpbDYLb zv$P-$*eY30W2fa#nXt6CX+b}5n090MQlmefi3*`ic+Js*cBgvwphmk}G~FBBLhdNk z%EoOx1*-_;dj5RBzSPQi9g8Ld_eTKOK)|!D9%da=_g!1c!Th>X1^ePP5vNQ zSK2`E;BFItka78d=x@2+>Tto^uR8GrS4iF<{LI@gNMDmbNW=kx2X~wJgN)AyM1R}% zR)?Ec{i+jBa1rPY!Xpy>g7h`{gPi67!GpU^oFL{n9!0h61Utv=w*@QlWeuyrhWfFQ zy2#lrE%DCe)UDx#vX!<@r$DGCb%HctPk*dLqz?IPe7kHB{pC_GNdnmME)B&Ke>(bZ zs+jlWg=xcBZf1~8g~_HNm(3_2HhR&b*c8VT$6q|+&Zg32)0oR?TlT3^%sFTUZ<<`k2Si>EdEu(5kuU(<(&_Pet=m277Cds+l+X5`^%4UISLdrq@q zKtsyJv=Js77f&1G!^ZAu4UI3{e3Lty8k3ESr%mu-WB0U%#@{@9FZ#Z94kc@U;T(X=;$mcJjw(;%>cD~5x${Zw9p-NOcoUWV}gz~K4=Mb3_R&wa;qBq_(a z2W1a;2cXe(JU+n}8R;LDvK9){V2tt1gD<( zk{cian8E%Oc083ip0))C<0J<@U(9cdx2%fHk9Txz4R12!Iw>^(an9RqyI(@DW zyo6E{vr<}88BeD!5!R4%?>eTgyP-mP`CzJ}KP7lt6b*s5=OZwC#xlHLT zMU5#&jndShoElDEL*;ays6BW6nvUkq=44m*hRBlm`sCK|0w+{(da4|UHP_=%9^1ZC zwLcG0Zh*~~K%DjP2hR1a)@Ng<-Kz|`VLxXe=$#fvx z7CaG)3>!FNp9x;!0^8+tI5rQ|`!4U&Z@JbhO9S1$Y_0%A!z^B>45etog zP&C8^obgb%SgGAC+kyjRIvP?g=v>nSSLkszS+ENF@WkpG_#zuU+tE_^P+&D0zb1Ub z?q7}x65>)*d*`ZnYr6kmXb3Y5aj8i)^eQxZt0MvwWxO#Hi>&CwJ@FD4ZjhV-`DIbO zC)wSM@L~k!z^Mq#ivUbbyWvv|EP6$w;wiqaSo-e5Cz!Ou!=-u!-LcUrYe2wq5j?$H z9sDI!+WG<9W*4hvI8+9RMP14G*6>OvMsNW73o()qikp*M^(axWvu%Ym?n-9^t$wmX zntrlEnts|A94NB{yI_lZlB&hOW-uKJ=*7wgxovI_`HYZ`HYXt~iGH#q(NC5n`iY@z zc5g9kH^wPKaWNX=BIw+mS@_drbk#X@)g|r84p?idvsTeyG|v)vM-mN3d*yn2fDmUY zh^{0(z}R~zI$1%`;JWog%BdXt3$(ixN3^!q6J4xA=Aw;cX>)fcLzWJ&F{hxxfr!*g z!J!-d%8FWUVU>fpvhelc%jA07Cdm!Zt^$8c0J2hPdkI9|pzanzcLqZqIiCe9w#z!@ zLD>@QL04iFO(ImzkSo;jNbqBY7|oMi#m9SU^6+fgRRkhLbO8UmZ_jj1NmGSiwxv60SzZO*&Ple}l3eEVkR; z%sU#8ouKWs(FnT*v@JFo0eA>PpE@shoH`#Etm12x$bAdG*{sKKaROY0T`LH~4OaIR zxbnYG!t&wm@BU%o%8Ozv6n&DSPgd7&DoQWPSZORATBJ5`HQpQyk=IbiGPsOvbC zPN;M&Hr{qWA%pTpY@+S{R?PW{7=U~Mn2=UzROyc~>o+O8FnidQ6$3S#=}ZnRDqRXG z!m?4>UgZ80s4Wu4D<;}RH&!^v??1e+rIB$nSV+^2>7)_CEQb*$heq3&<fZ`}9# zb1x_gwTsG)-#T{h%RMhC3U!N0>hkeR-~RTiibBnzGUMyZo_*{2KPw9Lib}`!?_K)F z_+yGft)jAf?{A+uy6%nGb8@OY1~rbLYgfAbMX^zOKZTLActJY4PGmFs(`JV+y|!)y zdIfbw(%yjH9X<(J(|AP(;%-IJIlA{{U+5~d9TE8dS+oe*Qfod_GyZ;&KA#r&7{gex z8f4JWwnSHY<=R`0zrVPx{LUN0GI{*I7wSR#39|0*+B4aw&PS%VU4~qV+=k3()r*YX zHw-@d{{w%3U;5hXsKoDqtiyjQ!jJ1Squj#wmhP^#U5RAm-1RN-4&57`pHX=({*UY| zyzd=*N1XptmhJ(d6qfw-4<35*(5@+`lyAEk`q#dH5A}ZxS;sNK@U3GK_2!4;R%G__ z4rI#C8Tk<8tB^T%UW07egZ8-v*=`S*MJ27vAcaK&F`3NY8Zcoyu|4#(Kn8(CNSc?$ zmv(ipTFV*3TGhj?0D(8@6iBaq*I-t;Qodr5^Zw<3VcB~|J{8&S-T4&EvrM7<^MuX2 z*k_r40MWe&gB5i!|95j7ERgMgixY|VmbmVQk%s#E#z@Aw)QB;!J&~A+(TrOd=&4hb WM^2e3@6Fo#)x1slMZr(m0{;U(y*23o literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/tips.csv b/pandas/io/tests/data/tips.csv new file mode 100644 index 00000000..856a65a6 --- /dev/null +++ b/pandas/io/tests/data/tips.csv @@ -0,0 +1,245 @@ +total_bill,tip,sex,smoker,day,time,size +16.99,1.01,Female,No,Sun,Dinner,2 +10.34,1.66,Male,No,Sun,Dinner,3 +21.01,3.5,Male,No,Sun,Dinner,3 +23.68,3.31,Male,No,Sun,Dinner,2 +24.59,3.61,Female,No,Sun,Dinner,4 +25.29,4.71,Male,No,Sun,Dinner,4 +8.77,2.0,Male,No,Sun,Dinner,2 +26.88,3.12,Male,No,Sun,Dinner,4 +15.04,1.96,Male,No,Sun,Dinner,2 +14.78,3.23,Male,No,Sun,Dinner,2 +10.27,1.71,Male,No,Sun,Dinner,2 +35.26,5.0,Female,No,Sun,Dinner,4 +15.42,1.57,Male,No,Sun,Dinner,2 +18.43,3.0,Male,No,Sun,Dinner,4 +14.83,3.02,Female,No,Sun,Dinner,2 +21.58,3.92,Male,No,Sun,Dinner,2 +10.33,1.67,Female,No,Sun,Dinner,3 +16.29,3.71,Male,No,Sun,Dinner,3 +16.97,3.5,Female,No,Sun,Dinner,3 
+20.65,3.35,Male,No,Sat,Dinner,3 +17.92,4.08,Male,No,Sat,Dinner,2 +20.29,2.75,Female,No,Sat,Dinner,2 +15.77,2.23,Female,No,Sat,Dinner,2 +39.42,7.58,Male,No,Sat,Dinner,4 +19.82,3.18,Male,No,Sat,Dinner,2 +17.81,2.34,Male,No,Sat,Dinner,4 +13.37,2.0,Male,No,Sat,Dinner,2 +12.69,2.0,Male,No,Sat,Dinner,2 +21.7,4.3,Male,No,Sat,Dinner,2 +19.65,3.0,Female,No,Sat,Dinner,2 +9.55,1.45,Male,No,Sat,Dinner,2 +18.35,2.5,Male,No,Sat,Dinner,4 +15.06,3.0,Female,No,Sat,Dinner,2 +20.69,2.45,Female,No,Sat,Dinner,4 +17.78,3.27,Male,No,Sat,Dinner,2 +24.06,3.6,Male,No,Sat,Dinner,3 +16.31,2.0,Male,No,Sat,Dinner,3 +16.93,3.07,Female,No,Sat,Dinner,3 +18.69,2.31,Male,No,Sat,Dinner,3 +31.27,5.0,Male,No,Sat,Dinner,3 +16.04,2.24,Male,No,Sat,Dinner,3 +17.46,2.54,Male,No,Sun,Dinner,2 +13.94,3.06,Male,No,Sun,Dinner,2 +9.68,1.32,Male,No,Sun,Dinner,2 +30.4,5.6,Male,No,Sun,Dinner,4 +18.29,3.0,Male,No,Sun,Dinner,2 +22.23,5.0,Male,No,Sun,Dinner,2 +32.4,6.0,Male,No,Sun,Dinner,4 +28.55,2.05,Male,No,Sun,Dinner,3 +18.04,3.0,Male,No,Sun,Dinner,2 +12.54,2.5,Male,No,Sun,Dinner,2 +10.29,2.6,Female,No,Sun,Dinner,2 +34.81,5.2,Female,No,Sun,Dinner,4 +9.94,1.56,Male,No,Sun,Dinner,2 +25.56,4.34,Male,No,Sun,Dinner,4 +19.49,3.51,Male,No,Sun,Dinner,2 +38.01,3.0,Male,Yes,Sat,Dinner,4 +26.41,1.5,Female,No,Sat,Dinner,2 +11.24,1.76,Male,Yes,Sat,Dinner,2 +48.27,6.73,Male,No,Sat,Dinner,4 +20.29,3.21,Male,Yes,Sat,Dinner,2 +13.81,2.0,Male,Yes,Sat,Dinner,2 +11.02,1.98,Male,Yes,Sat,Dinner,2 +18.29,3.76,Male,Yes,Sat,Dinner,4 +17.59,2.64,Male,No,Sat,Dinner,3 +20.08,3.15,Male,No,Sat,Dinner,3 +16.45,2.47,Female,No,Sat,Dinner,2 +3.07,1.0,Female,Yes,Sat,Dinner,1 +20.23,2.01,Male,No,Sat,Dinner,2 +15.01,2.09,Male,Yes,Sat,Dinner,2 +12.02,1.97,Male,No,Sat,Dinner,2 +17.07,3.0,Female,No,Sat,Dinner,3 +26.86,3.14,Female,Yes,Sat,Dinner,2 +25.28,5.0,Female,Yes,Sat,Dinner,2 +14.73,2.2,Female,No,Sat,Dinner,2 +10.51,1.25,Male,No,Sat,Dinner,2 +17.92,3.08,Male,Yes,Sat,Dinner,2 +27.2,4.0,Male,No,Thur,Lunch,4 +22.76,3.0,Male,No,Thur,Lunch,2 +17.29,2.71,Male,No,Thur,Lunch,2 +19.44,3.0,Male,Yes,Thur,Lunch,2 +16.66,3.4,Male,No,Thur,Lunch,2 +10.07,1.83,Female,No,Thur,Lunch,1 +32.68,5.0,Male,Yes,Thur,Lunch,2 +15.98,2.03,Male,No,Thur,Lunch,2 +34.83,5.17,Female,No,Thur,Lunch,4 +13.03,2.0,Male,No,Thur,Lunch,2 +18.28,4.0,Male,No,Thur,Lunch,2 +24.71,5.85,Male,No,Thur,Lunch,2 +21.16,3.0,Male,No,Thur,Lunch,2 +28.97,3.0,Male,Yes,Fri,Dinner,2 +22.49,3.5,Male,No,Fri,Dinner,2 +5.75,1.0,Female,Yes,Fri,Dinner,2 +16.32,4.3,Female,Yes,Fri,Dinner,2 +22.75,3.25,Female,No,Fri,Dinner,2 +40.17,4.73,Male,Yes,Fri,Dinner,4 +27.28,4.0,Male,Yes,Fri,Dinner,2 +12.03,1.5,Male,Yes,Fri,Dinner,2 +21.01,3.0,Male,Yes,Fri,Dinner,2 +12.46,1.5,Male,No,Fri,Dinner,2 +11.35,2.5,Female,Yes,Fri,Dinner,2 +15.38,3.0,Female,Yes,Fri,Dinner,2 +44.3,2.5,Female,Yes,Sat,Dinner,3 +22.42,3.48,Female,Yes,Sat,Dinner,2 +20.92,4.08,Female,No,Sat,Dinner,2 +15.36,1.64,Male,Yes,Sat,Dinner,2 +20.49,4.06,Male,Yes,Sat,Dinner,2 +25.21,4.29,Male,Yes,Sat,Dinner,2 +18.24,3.76,Male,No,Sat,Dinner,2 +14.31,4.0,Female,Yes,Sat,Dinner,2 +14.0,3.0,Male,No,Sat,Dinner,2 +7.25,1.0,Female,No,Sat,Dinner,1 +38.07,4.0,Male,No,Sun,Dinner,3 +23.95,2.55,Male,No,Sun,Dinner,2 +25.71,4.0,Female,No,Sun,Dinner,3 +17.31,3.5,Female,No,Sun,Dinner,2 +29.93,5.07,Male,No,Sun,Dinner,4 +10.65,1.5,Female,No,Thur,Lunch,2 +12.43,1.8,Female,No,Thur,Lunch,2 +24.08,2.92,Female,No,Thur,Lunch,4 +11.69,2.31,Male,No,Thur,Lunch,2 +13.42,1.68,Female,No,Thur,Lunch,2 +14.26,2.5,Male,No,Thur,Lunch,2 +15.95,2.0,Male,No,Thur,Lunch,2 +12.48,2.52,Female,No,Thur,Lunch,2 
+29.8,4.2,Female,No,Thur,Lunch,6 +8.52,1.48,Male,No,Thur,Lunch,2 +14.52,2.0,Female,No,Thur,Lunch,2 +11.38,2.0,Female,No,Thur,Lunch,2 +22.82,2.18,Male,No,Thur,Lunch,3 +19.08,1.5,Male,No,Thur,Lunch,2 +20.27,2.83,Female,No,Thur,Lunch,2 +11.17,1.5,Female,No,Thur,Lunch,2 +12.26,2.0,Female,No,Thur,Lunch,2 +18.26,3.25,Female,No,Thur,Lunch,2 +8.51,1.25,Female,No,Thur,Lunch,2 +10.33,2.0,Female,No,Thur,Lunch,2 +14.15,2.0,Female,No,Thur,Lunch,2 +16.0,2.0,Male,Yes,Thur,Lunch,2 +13.16,2.75,Female,No,Thur,Lunch,2 +17.47,3.5,Female,No,Thur,Lunch,2 +34.3,6.7,Male,No,Thur,Lunch,6 +41.19,5.0,Male,No,Thur,Lunch,5 +27.05,5.0,Female,No,Thur,Lunch,6 +16.43,2.3,Female,No,Thur,Lunch,2 +8.35,1.5,Female,No,Thur,Lunch,2 +18.64,1.36,Female,No,Thur,Lunch,3 +11.87,1.63,Female,No,Thur,Lunch,2 +9.78,1.73,Male,No,Thur,Lunch,2 +7.51,2.0,Male,No,Thur,Lunch,2 +14.07,2.5,Male,No,Sun,Dinner,2 +13.13,2.0,Male,No,Sun,Dinner,2 +17.26,2.74,Male,No,Sun,Dinner,3 +24.55,2.0,Male,No,Sun,Dinner,4 +19.77,2.0,Male,No,Sun,Dinner,4 +29.85,5.14,Female,No,Sun,Dinner,5 +48.17,5.0,Male,No,Sun,Dinner,6 +25.0,3.75,Female,No,Sun,Dinner,4 +13.39,2.61,Female,No,Sun,Dinner,2 +16.49,2.0,Male,No,Sun,Dinner,4 +21.5,3.5,Male,No,Sun,Dinner,4 +12.66,2.5,Male,No,Sun,Dinner,2 +16.21,2.0,Female,No,Sun,Dinner,3 +13.81,2.0,Male,No,Sun,Dinner,2 +17.51,3.0,Female,Yes,Sun,Dinner,2 +24.52,3.48,Male,No,Sun,Dinner,3 +20.76,2.24,Male,No,Sun,Dinner,2 +31.71,4.5,Male,No,Sun,Dinner,4 +10.59,1.61,Female,Yes,Sat,Dinner,2 +10.63,2.0,Female,Yes,Sat,Dinner,2 +50.81,10.0,Male,Yes,Sat,Dinner,3 +15.81,3.16,Male,Yes,Sat,Dinner,2 +7.25,5.15,Male,Yes,Sun,Dinner,2 +31.85,3.18,Male,Yes,Sun,Dinner,2 +16.82,4.0,Male,Yes,Sun,Dinner,2 +32.9,3.11,Male,Yes,Sun,Dinner,2 +17.89,2.0,Male,Yes,Sun,Dinner,2 +14.48,2.0,Male,Yes,Sun,Dinner,2 +9.6,4.0,Female,Yes,Sun,Dinner,2 +34.63,3.55,Male,Yes,Sun,Dinner,2 +34.65,3.68,Male,Yes,Sun,Dinner,4 +23.33,5.65,Male,Yes,Sun,Dinner,2 +45.35,3.5,Male,Yes,Sun,Dinner,3 +23.17,6.5,Male,Yes,Sun,Dinner,4 +40.55,3.0,Male,Yes,Sun,Dinner,2 +20.69,5.0,Male,No,Sun,Dinner,5 +20.9,3.5,Female,Yes,Sun,Dinner,3 +30.46,2.0,Male,Yes,Sun,Dinner,5 +18.15,3.5,Female,Yes,Sun,Dinner,3 +23.1,4.0,Male,Yes,Sun,Dinner,3 +15.69,1.5,Male,Yes,Sun,Dinner,2 +19.81,4.19,Female,Yes,Thur,Lunch,2 +28.44,2.56,Male,Yes,Thur,Lunch,2 +15.48,2.02,Male,Yes,Thur,Lunch,2 +16.58,4.0,Male,Yes,Thur,Lunch,2 +7.56,1.44,Male,No,Thur,Lunch,2 +10.34,2.0,Male,Yes,Thur,Lunch,2 +43.11,5.0,Female,Yes,Thur,Lunch,4 +13.0,2.0,Female,Yes,Thur,Lunch,2 +13.51,2.0,Male,Yes,Thur,Lunch,2 +18.71,4.0,Male,Yes,Thur,Lunch,3 +12.74,2.01,Female,Yes,Thur,Lunch,2 +13.0,2.0,Female,Yes,Thur,Lunch,2 +16.4,2.5,Female,Yes,Thur,Lunch,2 +20.53,4.0,Male,Yes,Thur,Lunch,4 +16.47,3.23,Female,Yes,Thur,Lunch,3 +26.59,3.41,Male,Yes,Sat,Dinner,3 +38.73,3.0,Male,Yes,Sat,Dinner,4 +24.27,2.03,Male,Yes,Sat,Dinner,2 +12.76,2.23,Female,Yes,Sat,Dinner,2 +30.06,2.0,Male,Yes,Sat,Dinner,3 +25.89,5.16,Male,Yes,Sat,Dinner,4 +48.33,9.0,Male,No,Sat,Dinner,4 +13.27,2.5,Female,Yes,Sat,Dinner,2 +28.17,6.5,Female,Yes,Sat,Dinner,3 +12.9,1.1,Female,Yes,Sat,Dinner,2 +28.15,3.0,Male,Yes,Sat,Dinner,5 +11.59,1.5,Male,Yes,Sat,Dinner,2 +7.74,1.44,Male,Yes,Sat,Dinner,2 +30.14,3.09,Female,Yes,Sat,Dinner,4 +12.16,2.2,Male,Yes,Fri,Lunch,2 +13.42,3.48,Female,Yes,Fri,Lunch,2 +8.58,1.92,Male,Yes,Fri,Lunch,1 +15.98,3.0,Female,No,Fri,Lunch,3 +13.42,1.58,Male,Yes,Fri,Lunch,2 +16.27,2.5,Female,Yes,Fri,Lunch,2 +10.09,2.0,Female,Yes,Fri,Lunch,2 +20.45,3.0,Male,No,Sat,Dinner,4 +13.28,2.72,Male,No,Sat,Dinner,2 +22.12,2.88,Female,Yes,Sat,Dinner,2 +24.01,2.0,Male,Yes,Sat,Dinner,4 
+15.69,3.0,Male,Yes,Sat,Dinner,3 +11.61,3.39,Male,No,Sat,Dinner,2 +10.77,1.47,Male,No,Sat,Dinner,2 +15.53,3.0,Male,Yes,Sat,Dinner,2 +10.07,1.25,Male,No,Sat,Dinner,2 +12.6,1.0,Male,Yes,Sat,Dinner,2 +32.83,1.17,Male,Yes,Sat,Dinner,2 +35.83,4.67,Female,No,Sat,Dinner,3 +29.03,5.92,Male,No,Sat,Dinner,3 +27.18,2.0,Female,Yes,Sat,Dinner,2 +22.67,2.0,Male,Yes,Sat,Dinner,2 +17.82,1.75,Male,No,Sat,Dinner,2 +18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/pandas/io/tests/data/unicode_series.csv b/pandas/io/tests/data/unicode_series.csv new file mode 100644 index 00000000..2485e149 --- /dev/null +++ b/pandas/io/tests/data/unicode_series.csv @@ -0,0 +1,18 @@ +1617,King of New York (1990) +1618,All Things Fair (1996) +1619,"Sixth Man, The (1997)" +1620,Butterfly Kiss (1995) +1621,"Paris, France (1993)" +1622,"Cérémonie, La (1995)" +1623,Hush (1998) +1624,Nightwatch (1997) +1625,Nobody Loves Me (Keiner liebt mich) (1994) +1626,"Wife, The (1995)" +1627,Lamerica (1994) +1628,Nico Icon (1995) +1629,"Silence of the Palace, The (Saimt el Qusur) (1994)" +1630,"Slingshot, The (1993)" +1631,Land and Freedom (Tierra y libertad) (1995) +1632,Á köldum klaka (Cold Fever) (1994) +1633,Etz Hadomim Tafus (Under the Domin Tree) (1994) +1634,Two Friends (1986) diff --git a/pandas/io/tests/data/utf16_ex.txt b/pandas/io/tests/data/utf16_ex.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0b452a2bd5ff25e752f015f0f9a04da7d4106f3 GIT binary patch literal 11406 zcmds-%~Bgj5Xa}bsmeR-6T|_h@^O^0aBu;kfHD=|WWZRlEQkb7;kWanaP)ygDmNdJ z{D0G0c9z}QmC(e-DQb(=bWcxrPfvf%{`2>mJ9QUs?9Sc9UAoNe>209r*gbG-Zm2g# z_x;;QqnUd=gJ*(+IdCJQ^||jB=%+U+=b*`|c0XKG)x|C?HACZA-EZJv~X=*LvG} z+wyNKofxa`Yxk9VCTe3n@4PNX;w%%D2cnZn;)x(rFA-Sh8e4JS3i^Yd6}PJ2KWpBJ zG?cqt&3&oo*h{|VUg`aW;h|Gw(){y4ByteZd|KtZW?`{DWv_~ zYc%&3cjia1sOd~oN8YNY-tLg)|1s+uzrS6wWJxS7mb8ELS=3x-qXoAElNs z-Qgp{-=$|o8{gg70M*W=C87&Wsc3MVujWppOHd_p6u|Y=i5%> z6X}7-@?j>;JLx3kCiXrLon>$=PEkLlkh(vs9Vas%w66W4Mx+wO3Sv@*<6Shm;3?!F zOSu&1WZQNk@hrdT8OUilz7la>OLF>_WOfpo>nTY`d?=>^u z1OIZd(rT>U@1{pVol1XvU7F)0-3P37bwRC1QE*2rU~j3Y+c=DOx$kz;p886VI(E~a zFY~~3I^&S=WG#!8)&fdycBIQcwe!}Ap{6y7u5_ZfL90WXrE0?LUWF6cyh>VmHznbP z_Y=9YYixZaTc?ZMJRZsz!&!7Ztwcqqhn?G!MdAZrc*%-R&$WKlVWDK*H0*YznSL9r zY3;P~Bge6q??RQWb!@B-`_MRA+^8%%ubJBImD9?$tDXBO3#EFujyqAEyYe_B+*hzJ zI_CGhXRXr5TIh|=Gru}BCjWEKhgF^PeIc#}j%)sx@^Mse!ohWF!{l z?Vd8#jGopkIUo*HIUr(Hac*;2h2Ex6g;wXL(({Iw!EVFJ77x|S7)yV#@zA>JIOLzz z9!LzQKF+$Sqze6+-4-u$%WOWYd7k+?gAHYk>{;Pw?+R;LFO??3o2DJ#wY93bc|3RT z=9!Nfcn-ekDLgIw6Ez>} z|43Z|Tj%y%^L>W<>0!%hflfZ%G460D%A~DY8Yy?+H?^eY+vo+$rf*1Yh<}~>_bp-C z*9}%-Hc>3`sblLaeoFXq`r?}eDoeCL@1C!~$XV+*BKBp=M4J34ZP^zJ#mqYP=Yg}w z#TgAHtMj*<)=GbZ-l!<)RFm!7eG>76P6uagNB^-li$K+xC0-WzC}@U|-i`N40f}+2 zoJQa=JY)2&xDDiMNGz-D+tBBuZ}a)^+gI;`E~h7AIF%Y1y`4Ji{#wRY>IwgB5jckZ l`Jl^b0L#WRTWx%xckYudzlx2A)lCd%k#Qu*8vgFv_`m7kfOG%= literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/valid_markup.html b/pandas/io/tests/data/valid_markup.html new file mode 100644 index 00000000..0130e9ed --- /dev/null +++ b/pandas/io/tests/data/valid_markup.html @@ -0,0 +1,62 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ab
067
140
294
370
+ + + + + + + + + + + + + + + + + + + + +
ab
067
140
+ + diff --git a/pandas/io/tests/data/yahoo_options1.html b/pandas/io/tests/data/yahoo_options1.html new file mode 100644 index 00000000..987072b1 --- /dev/null +++ b/pandas/io/tests/data/yahoo_options1.html @@ -0,0 +1,329 @@ + +AAPL Options | Apple Inc. Stock - Yahoo! Finance
 

Apple Inc. (AAPL) - NasdaqGS
585.54 Down 2.45 (0.42%) May 9, 4:00PM EDT
After Hours: 585.73 Up 0.19 (0.03%) May 9, 7:59PM EDT
Options
View By Expiration: May 14 | Jun 14 | Jul 14 | Aug 14 | Oct 14 | Jan 15 | Jan 16
Call Options: Expire at close Friday, May 30, 2014
Strike | Symbol | Last | Chg | Bid | Ask | Vol | Open Int
330.00AAPL140517C00330000263.00 0.00254.30256.9062
400.00AAPL140517C00400000190.70 0.00184.35186.408735
410.00AAPL140517C00410000181.30 0.00174.40176.456810
420.00AAPL140517C00420000170.80 0.00164.35166.501241
430.00AAPL140517C00430000160.75 0.00154.40156.5037622
440.00AAPL140517C00440000149.70 0.00144.40146.60901
445.00AAPL140517C00445000145.55 0.00139.40141.451061
450.00AAPL140517C00450000131.27Down 9.18134.40136.90452
450.00AAPL7140517C00450000117.45 0.00133.40137.8052
455.00AAPL140517C00455000134.70 0.00129.40131.40902
460.00AAPL140517C00460000130.95 0.00124.40126.903091
460.00AAPL7140517C00460000122.50Down 17.50123.30127.5511
465.00AAPL140517C00465000125.70 0.00119.45121.701724
470.00AAPL140517C00470000111.95Down 5.76114.45116.101519
470.00AAPL7140517C0047000062.00 0.00113.40116.3011
470.00AAPL140523C00470000122.85 0.00114.50116.3011
475.00AAPL140517C00475000108.00Down 4.23109.40111.7018
480.00AAPL140517C00480000102.00Down 8.60104.40106.80431
485.00AAPL140517C00485000107.50 0.0099.35101.853023
490.00AAPL140517C0049000091.90Down 8.5594.3596.90118
490.00AAPL7140517C0049000097.53 0.0093.4097.3515
495.00AAPL140517C0049500089.35Down 3.0189.4091.8554
500.00AAPL140517C0050000085.00Down 4.7584.4585.957518
500.00AAPL7140517C0050000095.89 0.0083.4086.90241
500.00AAPL140523C0050000085.55Down 1.1585.2586.00103260
500.00AAPL140530C0050000091.45 0.0084.5587.00855
502.50AAPL140530C0050250080.20Down 6.5282.0583.7011
505.00AAPL140517C0050500087.73 0.0079.3581.25243
505.00AAPL7140517C0050500092.71 0.0078.4081.3015
505.00AAPL140523C0050500028.00 0.0079.5081.9038520
510.00AAPL140517C0051000071.58Down 10.9574.4576.0028104
515.00AAPL140517C0051500067.00Down 7.0569.4071.00150
515.00AAPL140523C0051500073.20 0.0069.5071.8011
515.00AAPL140530C0051500074.24 0.0069.5071.506310
520.00AAPL140517C0052000062.30Down 5.1964.4566.005116
520.00AAPL7140517C0052000072.00 0.0063.4567.00127
520.00AAPL140530C0052000072.00 0.0064.6566.3035110
522.50AAPL140523C0052250060.20Down 5.9562.0564.3012
525.00AAPL140517C0052500059.87Down 2.6859.7061.0015380
525.00AAPL7140517C0052500060.00Down 7.3558.4561.85127
525.00AAPL140523C0052500057.70Down 8.6059.5561.151117
525.00AAPL7140523C0052500060.00Up 46.0058.4561.4022
525.00AAPL140530C0052500057.02Down 8.6459.6561.103343
525.00AAPL7140530C0052500056.64Up 11.6458.6062.9011
530.00AAPL140517C0053000054.86Down 2.4455.1555.9021326
530.00AAPL7140517C0053000055.00Down 5.0053.4056.45522
530.00AAPL140523C0053000057.00 0.0054.5056.35481
530.00AAPL7140523C0053000063.18 0.0053.5557.4013
530.00AAPL140530C0053000052.05Down 9.9554.6556.90114
532.50AAPL140523C0053250057.40 0.0052.0054.151651
532.50AAPL140530C0053250057.86 0.0052.2554.506610
535.00AAPL140517C0053500050.35Down 2.7550.1051.05228769
535.00AAPL7140517C0053500057.36 0.0048.5051.25317
535.00AAPL140523C0053500057.75 0.0049.6051.4023
535.00AAPL140530C0053500047.52Down 9.2349.7551.90258
540.00AAPL140517C0054000041.80Down 5.8944.4546.003199
540.00AAPL7140517C0054000043.37Down 6.8343.5046.35515
540.00AAPL140523C0054000042.29Down 10.3644.6046.30233
540.00AAPL140530C0054000042.80Down 10.2044.7546.95213
542.50AAPL140523C0054250048.55 0.0042.1543.901561
542.50AAPL140530C0054250050.80 0.0042.4544.3577
545.00AAPL140517C0054500036.72Down 5.6840.2541.05177901
545.00AAPL7140517C0054500042.90 0.0038.5041.40144150
545.00AAPL140530C0054500045.85 0.0040.0041.901255
550.00AAPL140517C0055000035.50Down 2.0035.6536.00131701
550.00AAPL7140517C0055000030.50Down 11.5033.5536.3515
550.00AAPL140523C0055000034.45Down 7.6034.8036.301920
550.00AAPL140530C0055000032.83Down 7.8235.5036.80342
552.50AAPL140523C0055250038.95 0.0032.4034.202141
552.50AAPL140530C0055250037.64 0.0032.9534.80691
555.00AAPL140517C0055500030.67Down 1.6630.4531.0544132
555.00AAPL7140517C0055500036.65 0.0028.5032.5592
555.00AAPL140523C0055500032.80 0.0029.9531.6523
555.00AAPL140530C0055500038.30 0.0030.6532.251011
557.50AAPL140530C0055750026.52Down 9.4828.4029.85110
560.00AAPL140517C0056000026.00Down 2.3025.6526.15142439
560.00AAPL7140517C0056000025.50Down 6.6723.6027.65426
560.00AAPL140523C0056000023.11Down 4.8925.3526.8510127
560.00AAPL7140523C0056000042.95 0.0024.3028.2012
560.00AAPL140530C0056000024.12Down 9.6326.2027.658102
562.50AAPL140517C0056250019.95Down 3.3523.1023.80111
562.50AAPL140523C0056250024.45Down 2.7523.2024.55296
562.50AAPL140530C0056250031.29 0.0024.1025.5043
565.00AAPL140517C0056500021.10Down 1.4020.7521.3564247
565.00AAPL7140517C0056500017.01Down 9.0218.7522.80310
565.00AAPL140523C0056500020.60Down 3.4021.4522.3511145
565.00AAPL7140523C0056500029.30 0.0019.8022.85115
565.00AAPL140530C0056500023.00Down 2.6022.8523.4015119
565.00AAPL7140530C0056500033.80 0.0021.2023.8523
567.50AAPL140517C0056750016.50Down 33.5018.3018.95102
567.50AAPL140523C0056750020.00Down 4.8519.5020.2084
570.00AAPL140517C0057000016.60Down 1.6416.1016.65475581
570.00AAPL7140517C0057000012.80Down 7.3814.4517.05984
570.00AAPL140523C0057000015.21Down 4.6417.5018.1011146
570.00AAPL7140523C0057000016.60Down 4.0316.0018.7012
570.00AAPL140530C0057000018.40Down 2.2018.9019.3511295
570.00AAPL7140530C0057000018.90Down 2.5016.8020.05663
572.50AAPL140517C0057250014.15Up 7.1513.9014.4022240
572.50AAPL140523C0057250016.00Down 2.1015.5516.10797
572.50AAPL140530C0057250017.30Down 1.2516.8517.50344
572.50AAPL7140530C005725008.20 0.0015.3517.8533
575.00AAPL140517C0057500012.15Down 2.0812.0012.30705678
575.00AAPL7140517C0057500013.90 0.009.9512.55113
575.00AAPL140523C0057500014.10Down 1.3013.8514.25101277
575.00AAPL7140523C0057500014.10Down 3.1211.7514.55617
575.00AAPL140530C0057500015.60Down 0.9015.3015.7084449
575.00AAPL7140530C0057500014.00Down 4.5813.6015.954179
577.50AAPL140517C0057750010.25Down 21.859.9510.351,15446
577.50AAPL140523C0057750010.99Down 2.7612.1012.45125130
577.50AAPL7140523C0057750015.07 0.0010.4513.40215
580.00AAPL140517C005800008.40Down 2.058.258.455,0972,008
580.00AAPL7140517C005800008.45Down 5.807.708.8050597
580.00AAPL140523C0058000010.60Down 1.4510.5010.80395418
580.00AAPL7140523C0058000010.00Down 10.908.9011.65218
580.00AAPL140530C0058000012.21Down 1.5912.1512.45224544
580.00AAPL7140530C005800009.96Down 5.9910.5012.701188
582.50AAPL140517C005825006.90Down 1.406.706.853,52898
582.50AAPL7140523C005825005.60Up 5.60N/AN/A082
582.50AAPL7140530C0058250041.32Up 41.32N/AN/A014
585.00AAPL140517C005850005.25Down 1.845.255.4511,7854,386
585.00AAPL7140517C005850005.30Down 1.824.705.6539208
585.00AAPL140523C005850007.85Down 1.407.657.95854484
585.00AAPL7140523C005850006.50Down 4.435.659.0011535
585.00AAPL140530C005850009.60Down 1.309.409.70348234
585.00AAPL7140530C005850008.75Down 2.257.509.85426
587.50AAPL140517C005875004.25Down 1.534.054.202,245305
587.50AAPL7140517C005875003.35Up 3.353.504.40510
587.50AAPL7140523C005875000.05Up 0.05N/AN/A2159
590.00AAPL140517C005900003.20Down 1.353.103.208,5796,343
590.00AAPL7140517C005900003.20Down 1.102.003.2586380
590.00AAPL140523C005900005.50Down 1.245.405.55786809
590.00AAPL7140523C005900004.50Down 2.053.905.901476
590.00AAPL140530C005900007.20Down 1.407.157.30196649
590.00AAPL7140530C005900007.20Down 2.805.308.10173
592.50AAPL140517C005925002.35Down 1.102.302.402,326493
592.50AAPL7140517C005925002.14Up 2.141.702.57126
595.00AAPL140517C005950001.75Down 0.931.681.754,5794,449
595.00AAPL7140517C005950001.65Down 0.851.541.867160
595.00AAPL140523C005950003.75Down 0.953.703.75652747
595.00AAPL7140523C005950003.50Down 4.603.553.906104
595.00AAPL140530C005950005.40Down 1.025.205.45377867
595.00AAPL7140530C005950004.40Down 4.105.105.451324
597.50AAPL140517C005975001.23Down 0.861.161.251,237392
597.50AAPL7140517C005975000.94Down 2.410.591.3921
600.00AAPL140517C006000000.89Down 0.660.880.898,02413,791
600.00AAPL7140517C006000000.95Down 0.550.780.95331,692
600.00AAPL140523C006000002.45Down 0.832.352.541,9971,364
600.00AAPL7140523C006000002.43Down 1.602.322.6121225
600.00AAPL140530C006000003.80Down 0.953.653.901,0265,990
600.00AAPL7140530C006000003.75Down 1.303.753.95253882
602.50AAPL140517C006025000.61Down 0.540.610.66972286
602.50AAPL7140517C006025002.09 0.000.310.791111
605.00AAPL140517C006050000.44Down 0.410.430.442,4766,776
605.00AAPL7140517C006050000.47Down 0.330.390.5131351
605.00AAPL140523C006050001.53Down 0.681.501.64626582
605.00AAPL140530C006050002.69Down 0.712.582.69155872
607.50AAPL140517C006075000.33Down 0.300.310.34432261
610.00AAPL140517C006100000.28Down 0.180.250.281,7964,968
610.00AAPL7140517C006100000.23Down 0.790.210.3246272
610.00AAPL140523C006100000.97Down 0.460.971.06335897
610.00AAPL140530C006100001.85Down 0.551.801.85208728
612.50AAPL140517C006125000.19Down 0.190.190.2412860
615.00AAPL140517C006150000.19Down 0.080.180.191,1253,790
615.00AAPL7140517C006150000.73 0.000.130.2534328
615.00AAPL140523C006150000.69Down 0.260.630.70123576
615.00AAPL140530C006150001.28Down 0.431.231.34127264
617.50AAPL140517C006175000.14Down 0.070.120.1644148
620.00AAPL140517C006200000.14Down 0.060.120.146963,306
620.00AAPL7140517C006200000.45 0.000.080.202970
620.00AAPL140523C006200000.45Down 0.220.420.47133476
620.00AAPL140530C006200000.90Down 0.290.850.95213910
622.50AAPL140517C006225000.10Down 0.070.080.1383174
625.00AAPL140517C006250000.09Down 0.060.080.094653,311
625.00AAPL7140517C006250000.80 0.000.050.1621112
625.00AAPL140523C006250000.30Down 0.170.300.35139284
625.00AAPL140530C006250000.57Down 0.430.590.6915443
627.50AAPL140517C006275000.10Down 0.030.060.11479
630.00AAPL140517C006300000.07Down 0.050.050.105813,159
630.00AAPL7140517C006300001.40 0.000.050.143177
630.00AAPL140523C006300000.23Down 0.170.230.2874281
630.00AAPL140530C006300000.40Down 0.250.420.5136206
635.00AAPL140517C006350000.07Down 0.040.050.06691,251
635.00AAPL7140517C006350000.37 0.000.020.12484
635.00AAPL140523C006350000.20Down 0.100.170.23210201
635.00AAPL140530C006350000.34Down 0.120.320.3917377
640.00AAPL140517C006400000.05Down 0.040.040.054542,284
640.00AAPL7140517C006400000.15 0.000.010.251455
640.00AAPL140523C006400000.17Down 0.050.140.1962197
640.00AAPL140530C006400000.29Down 0.180.250.3119214
645.00AAPL140517C006450000.04Down 0.06N/A0.04155633
645.00AAPL7140517C006450000.90 0.00N/A0.184490
645.00AAPL140523C006450000.11Down 0.070.100.163193
645.00AAPL140530C006450000.25Down 0.200.200.3025178
650.00AAPL140517C006500000.02Down 0.060.020.036315,904
650.00AAPL7140517C006500000.46 0.00N/A0.28172
650.00AAPL140523C006500000.10Down 0.050.080.1544243
650.00AAPL140530C006500000.20Down 0.020.120.209225
655.00AAPL140517C006550000.03Down 0.020.010.0427491
655.00AAPL7140517C006550000.70 0.00N/A0.22965
655.00AAPL140523C006550000.08Down 0.060.060.14371
660.00AAPL140517C006600000.03Down 0.040.020.0321582
660.00AAPL140523C006600000.09Down 0.020.010.12562
665.00AAPL140517C006650000.06 0.00N/A0.068316
665.00AAPL140523C006650000.07Down 0.01N/A0.10155
670.00AAPL140517C006700000.03Down 0.02N/A0.0710841
670.00AAPL140523C006700000.08 0.000.020.099123
675.00AAPL140517C006750000.04 0.00N/A0.03113483
675.00AAPL140523C006750000.10 0.000.010.081344
680.00AAPL140517C006800000.05 0.00N/A0.05952,580
680.00AAPL140523C006800000.10 0.00N/A0.135050
685.00AAPL140517C006850000.04 0.00N/A0.1138236
685.00AAPL140523C006850000.05 0.00N/A0.071817
690.00AAPL140517C006900000.04 0.00N/A0.04109430
695.00AAPL140517C006950000.03 0.00N/A0.0353188
695.00AAPL140523C006950000.11 0.00N/A0.083266
700.00AAPL140517C007000000.01 0.000.010.02631,329
700.00AAPL140523C007000000.05 0.00N/A0.081010
705.00AAPL140517C007050000.01 0.00N/A0.0157457
710.00AAPL140517C007100000.02 0.00N/A0.023494
710.00AAPL140523C007100000.03Down 0.02N/A0.0410282
715.00AAPL140517C007150000.02 0.00N/A0.0136293
720.00AAPL140517C007200000.02 0.00N/A0.0114331
725.00AAPL140517C007250000.02 0.00N/A0.0112599
730.00AAPL140517C007300000.01 0.00N/A0.016146
735.00AAPL140517C007350000.02 0.00N/A0.0322116
740.00AAPL140517C007400000.02 0.00N/A0.032151
745.00AAPL140517C007450000.02 0.00N/A0.031313
750.00AAPL140517C007500000.02 0.00N/A0.032213
755.00AAPL140517C007550000.09 0.00N/A0.045879
760.00AAPL140517C007600000.24 0.00N/A0.04039
770.00AAPL140517C007700000.24 0.00N/A0.04099
775.00AAPL140517C007750000.40 0.00N/A0.0405
795.00AAPL140517C007950000.20 0.00N/A0.04055
800.00AAPL140517C008000000.04 0.00N/A0.02147
805.00AAPL140517C008050000.15 0.00N/A0.04010
Put Options: Expire at close Friday, May 30, 2014
Strike | Symbol | Last | Chg | Bid | Ask | Vol | Open Int
280.00AAPL140517P002800000.05 0.00N/A0.1226
290.00AAPL140517P002900000.02 0.00N/A0.111111
295.00AAPL140517P002950000.01 0.00N/A0.0838
300.00AAPL140517P003000000.05 0.00N/A0.09123
305.00AAPL140517P003050000.05 0.00N/A0.101020
310.00AAPL140517P003100000.10 0.00N/A0.1101
315.00AAPL140517P003150000.12 0.00N/A0.1101
320.00AAPL140517P003200000.10Up 0.08N/A0.1217
325.00AAPL140517P003250000.05 0.00N/A0.01185342
330.00AAPL140517P003300000.02 0.00N/A0.1255
335.00AAPL140517P003350000.02 0.00N/A0.1256
340.00AAPL140517P003400000.04Up 0.02N/A0.1256
345.00AAPL140517P003450000.02 0.00N/A0.1255
350.00AAPL140517P003500000.01 0.00N/A0.0160636
355.00AAPL140517P003550000.01 0.00N/A0.126392
360.00AAPL140517P003600000.02 0.00N/A0.108167
365.00AAPL140517P003650000.01 0.00N/A0.10228
370.00AAPL140517P003700000.04 0.00N/A0.12127
375.00AAPL140517P003750000.02 0.00N/A0.121536
380.00AAPL140517P003800000.04 0.00N/A0.126303
385.00AAPL140517P003850000.09 0.00N/A0.054331
390.00AAPL140517P003900000.01 0.00N/A0.1233239
395.00AAPL140517P003950000.07 0.00N/A0.1210270
400.00AAPL140517P004000000.02 0.00N/A0.121431
405.00AAPL140517P004050000.04 0.00N/A0.028284
410.00AAPL140517P004100000.02Up 0.01N/A0.032400
415.00AAPL140517P004150000.03 0.00N/A0.0222401
420.00AAPL140517P004200000.04 0.00N/A0.0410489
425.00AAPL140517P004250000.02 0.00N/A0.031863
430.00AAPL140517P004300000.01 0.00N/A0.023673,892
435.00AAPL140517P004350000.01 0.00N/A0.121956
435.00AAPL7140517P004350000.90 0.00N/A1.7111
440.00AAPL140517P004400000.01Down 0.01N/A0.108803
440.00AAPL7140517P004400000.69 0.00N/A1.711313
445.00AAPL140517P004450000.02 0.00N/A0.02101,616
450.00AAPL140517P004500000.02Down 0.01N/A0.02103,981
450.00AAPL7140517P004500000.64 0.00N/A1.711116
455.00AAPL140517P004550000.04Up 0.01N/A0.028487
455.00AAPL7140517P004550001.47 0.00N/A1.71156
460.00AAPL140517P004600000.01Down 0.010.010.02602,133
460.00AAPL7140517P004600000.02 0.00N/A0.542138
465.00AAPL140517P004650000.01Down 0.020.010.0211,617
465.00AAPL7140517P004650000.52 0.00N/A0.50272
470.00AAPL140517P004700000.02 0.000.010.0218,005
470.00AAPL7140517P004700000.73 0.00N/A0.43361
470.00AAPL140523P004700000.16 0.00N/A0.1555
475.00AAPL140517P004750000.02Up 0.010.010.0223,076
475.00AAPL7140517P004750000.10 0.00N/A0.341142
480.00AAPL140517P004800000.02 0.00N/A0.05793,648
480.00AAPL7140517P004800000.10Up 0.010.050.282147
485.00AAPL140517P004850000.02Down 0.03N/A0.091802,581
485.00AAPL7140517P004850000.05 0.00N/A0.2314178
485.00AAPL140523P004850000.10 0.00N/A0.0733
490.00AAPL140517P004900000.03Up 0.010.010.03344,959
490.00AAPL7140517P004900000.10 0.00N/A0.2326511
490.00AAPL140523P004900000.12 0.00N/A0.057594
490.00AAPL140530P004900000.11 0.000.020.2120140
490.00AAPL7140530P004900000.25 0.00N/A0.25163
492.50AAPL140530P004925000.35 0.00N/A0.22445
492.50AAPL7140530P004925003.95 0.00N/A0.42172175
495.00AAPL140517P004950000.03 0.000.020.04234,303
495.00AAPL7140517P004950000.05 0.00N/A0.082267
495.00AAPL140523P004950000.05Down 0.07N/A0.123220
495.00AAPL7140523P004950004.60 0.00N/A0.13427433
495.00AAPL140530P004950000.10 0.00N/A0.227119
495.00AAPL7140530P004950004.55 0.00N/A0.43154154
497.50AAPL140530P004975000.43 0.00N/A0.2288
497.50AAPL7140530P004975004.65 0.00N/A0.4411145
500.00AAPL140517P005000000.03Down 0.010.030.0418710,044
500.00AAPL7140517P005000000.05Up 0.04N/A0.051400
500.00AAPL140523P005000000.07 0.000.010.1312356
500.00AAPL7140523P005000000.31 0.00N/A0.131106
500.00AAPL140530P005000000.15Up 0.050.020.191189
500.00AAPL7140530P005000000.08 0.00N/A0.454279
502.50AAPL140523P005025000.07 0.00N/A0.1310217
502.50AAPL140530P005025000.24 0.00N/A0.232549
502.50AAPL7140530P005025005.90 0.00N/A0.46114270
505.00AAPL140517P005050000.04Down 0.010.020.05383,196
505.00AAPL7140517P005050000.05 0.000.010.091217
505.00AAPL140523P005050000.07Up 0.010.010.0946659
505.00AAPL7140523P005050000.16 0.00N/A0.361549
505.00AAPL140530P005050000.16Up 0.040.010.23131
505.00AAPL7140530P005050000.54 0.00N/A0.462305
507.50AAPL140523P005075000.07Down 0.020.030.1419152
507.50AAPL7140523P005075006.70 0.00N/A0.15109527
507.50AAPL140530P005075000.10 0.000.020.22165194
507.50AAPL7140530P005075008.50 0.00N/A0.485500
510.00AAPL140517P005100000.04Down 0.010.020.059210,771
510.00AAPL7140517P005100000.08Down 0.03N/A0.10101,109
510.00AAPL140523P005100000.08Up 0.020.030.0896512
510.00AAPL7140523P005100000.17 0.00N/A0.38176
510.00AAPL140530P005100000.10 0.000.020.193123
510.00AAPL7140530P005100000.62 0.00N/A0.211253
512.50AAPL140523P005125000.10Up 0.040.040.1528116
512.50AAPL140530P005125000.08Down 0.040.020.205107
512.50AAPL7140530P005125009.05 0.00N/A0.235454
515.00AAPL140517P005150000.04Down 0.030.020.043383,916
515.00AAPL7140517P005150000.09Down 0.06N/A0.111470
515.00AAPL140523P005150000.11Up 0.040.040.1480441
515.00AAPL7140523P005150000.86 0.00N/A0.39319
515.00AAPL140530P005150000.16 0.000.040.22207298
515.00AAPL7140530P0051500012.00 0.00N/A0.2344
517.50AAPL140523P005175000.13Up 0.020.050.1514106
517.50AAPL7140523P005175000.70 0.00N/A0.40430
517.50AAPL140530P005175000.19 0.000.050.18232244
517.50AAPL7140530P005175001.18 0.00N/A0.2545
520.00AAPL140517P005200000.06Down 0.010.050.062029,047
520.00AAPL7140517P005200000.09Up 0.01N/A0.123357
520.00AAPL140523P005200000.10Down 0.010.060.1362387
520.00AAPL7140523P005200000.25 0.00N/A0.195114
520.00AAPL140530P005200000.16Down 0.030.070.222330
520.00AAPL7140530P0052000013.10 0.00N/A0.2726
522.50AAPL140523P005225000.16Up 0.040.070.17686
522.50AAPL7140523P005225000.83 0.00N/A0.20112
522.50AAPL140530P005225000.17Down 0.040.090.201279
522.50AAPL7140530P0052250019.90 0.00N/A0.2911
525.00AAPL140517P005250000.08Up 0.010.060.094503,526
525.00AAPL7140517P005250000.13Down 0.050.050.131471
525.00AAPL140523P005250000.16Up 0.020.080.1952670
525.00AAPL7140523P0052500016.00 0.00N/A0.217386
525.00AAPL140530P005250000.20Up 0.020.110.2369348
525.00AAPL7140530P005250001.25 0.00N/A0.3014
527.50AAPL140523P005275000.16Up 0.010.090.166318
527.50AAPL7140523P005275001.50 0.00N/A0.2314
527.50AAPL140530P005275000.28 0.000.140.251518
527.50AAPL7140530P005275001.46 0.000.110.3111
530.00AAPL140517P005300000.10Up 0.020.050.1053013,138
530.00AAPL7140517P005300000.10Down 0.06N/A0.1416436
530.00AAPL140523P005300000.18Up 0.010.130.23121589
530.00AAPL7140523P005300000.19Down 1.26N/A0.24138
530.00AAPL140530P005300000.26Up 0.030.200.27284,407
530.00AAPL7140530P005300002.43 0.000.100.3514
532.50AAPL140523P005325000.22Up 0.050.150.22656252
532.50AAPL7140523P0053250017.60 0.000.030.2912
532.50AAPL140530P005325000.33 0.000.250.363233
535.00AAPL140517P005350000.09Down 0.040.080.154253,948
535.00AAPL7140517P005350000.12Down 0.030.100.19127656
535.00AAPL140523P005350000.23Up 0.020.170.2686358
535.00AAPL7140523P005350000.31Down 0.010.040.32340
535.00AAPL140530P005350000.30 0.000.260.3726221
535.00AAPL7140530P005350001.05 0.000.150.5213
537.50AAPL140523P005375000.28Up 0.110.200.2956148
537.50AAPL7140523P005375001.96 0.000.070.31116
537.50AAPL140530P005375000.47 0.000.350.442774
540.00AAPL140517P005400000.11Down 0.010.100.143916,476
540.00AAPL7140517P005400000.15Up 0.050.060.252415
540.00AAPL140523P005400000.32Up 0.040.250.3119321
540.00AAPL7140523P005400000.70 0.000.100.331018
540.00AAPL140530P005400000.57Up 0.220.410.5141426
540.00AAPL7140530P005400001.60 0.000.390.54132134
542.50AAPL140523P005425000.36Up 0.070.250.356478
542.50AAPL7140523P005425003.67 0.000.130.36174176
542.50AAPL140530P005425000.66 0.000.480.57102124
542.50AAPL7140530P005425005.95 0.000.470.5911
545.00AAPL140517P005450000.13Down 0.040.120.172594,469
545.00AAPL7140517P005450000.14Down 0.010.080.261275
545.00AAPL140523P005450000.35Up 0.010.290.3973105
545.00AAPL7140523P005450001.02 0.000.110.41481
545.00AAPL140530P005450000.65Up 0.090.570.6563324
545.00AAPL7140530P005450001.48 0.000.550.67168
547.50AAPL140523P005475000.53Up 0.270.330.435178
547.50AAPL7140523P005475001.82 0.000.320.4522161
547.50AAPL140530P005475000.53 0.000.670.74471
550.00AAPL140517P005500000.16Down 0.030.150.161,1325,742
550.00AAPL7140517P005500000.16Down 0.390.100.211619
550.00AAPL140523P005500000.45Down 0.070.400.4591241
550.00AAPL7140523P005500000.41 0.000.400.51422
550.00AAPL140530P005500000.84Up 0.210.780.8742311
550.00AAPL7140530P005500001.14Up 0.490.740.9025
552.50AAPL140523P005525000.72Up 0.360.480.582268
552.50AAPL140530P005525001.14Up 0.290.931.04158
555.00AAPL140517P005550000.19Down 0.040.170.206183,546
555.00AAPL7140517P005550000.20Down 0.290.140.251413
555.00AAPL140523P005550000.65Down 0.020.590.66125232
555.00AAPL140530P005550001.27Up 0.441.111.2252204
555.00AAPL7140530P005550004.25 0.001.001.2311
557.50AAPL140517P005575000.22Down 0.030.200.2536552
557.50AAPL140523P005575001.15Up 0.290.720.8161225
557.50AAPL7140523P005575004.10 0.000.660.8611
557.50AAPL140530P005575001.35Up 0.261.321.452071
557.50AAPL7140530P005575001.93Down 1.921.291.4512
560.00AAPL140517P005600000.28Down 0.020.250.292,3064,494
560.00AAPL7140517P005600000.45Down 0.180.190.369424
560.00AAPL140523P005600001.06Up 0.120.880.97324580
560.00AAPL7140523P005600001.50Up 0.450.841.1088
560.00AAPL140530P005600001.84Up 0.241.581.73230599
560.00AAPL7140530P005600002.45Down 0.101.561.73133
562.50AAPL140517P005625000.32Down 0.050.290.361,450160
562.50AAPL140523P005625001.16Up 0.021.111.19200201
562.50AAPL7140523P005625003.50 0.001.021.261115
562.50AAPL140530P005625002.67Up 0.711.912.052112
562.50AAPL7140530P005625002.77 0.001.862.17169
565.00AAPL140517P005650000.45Down 0.010.390.462,7384,705
565.00AAPL7140517P005650000.50 0.000.350.521493
565.00AAPL140523P005650001.42 0.001.381.48449660
565.00AAPL7140523P005650001.30 0.001.291.60313
565.00AAPL140530P005650002.41 0.002.292.443081,159
565.00AAPL7140530P005650001.80 0.002.182.48663
567.50AAPL140517P005675000.55Down 0.070.530.551,213260
567.50AAPL140523P005675001.99Up 0.221.711.83111194
567.50AAPL7140523P005675002.90 0.001.651.95114
567.50AAPL140530P005675003.30Up 1.302.772.9216241
567.50AAPL7140530P005675003.90Down 1.552.673.0518
570.00AAPL140517P005700000.74Down 0.020.730.744,8474,582
570.00AAPL7140517P005700001.65Up 0.750.690.8726228
570.00AAPL140523P005700002.20Up 0.062.142.27619788
570.00AAPL7140523P005700003.20Up 1.152.072.32615
570.00AAPL140530P005700003.40Up 0.143.253.50260999
570.00AAPL7140530P005700004.79Up 2.173.203.501157
572.50AAPL140517P005725001.02Down 0.071.001.051,653434
572.50AAPL140523P005725002.95Up 0.302.622.79405224
572.50AAPL7140523P005725004.20Up 0.052.552.92178
572.50AAPL140530P005725005.25Up 2.453.904.15275376
572.50AAPL7140530P005725002.81 0.003.804.1511103
575.00AAPL140517P005750001.47Up 0.071.451.515,4555,975
575.00AAPL7140517P005750002.78Up 1.631.321.642284
575.00AAPL140523P005750003.45Up 0.213.253.45417604
575.00AAPL7140523P005750002.52 0.003.153.6015
575.00AAPL140530P005750004.75Up 0.254.654.905821,420
575.00AAPL7140530P005750006.50Up 0.954.554.9010111
577.50AAPL140517P005775001.98Down 0.051.952.052,748232
577.50AAPL140523P005775004.10Up 0.104.004.20197200
577.50AAPL7140523P005775006.10Up 3.053.904.352276
580.00AAPL140517P005800002.72Up 0.142.702.757,1274,696
580.00AAPL7140517P005800002.78Up 0.182.542.99211302
580.00AAPL140523P005800004.95Up 0.204.905.10466525
580.00AAPL7140523P005800005.20Down 0.404.755.204528
580.00AAPL140530P005800006.72Up 0.466.456.70191560
580.00AAPL7140530P005800006.68Up 0.886.107.503100
582.50AAPL140517P005825003.60Up 0.073.553.753,184607
585.00AAPL140517P005850004.80Up 0.424.654.855,4033,487
585.00AAPL7140517P005850004.86Up 0.564.455.0038493
585.00AAPL140523P005850007.14Up 0.447.057.205981,335
585.00AAPL7140523P005850007.80Up 0.756.859.50230
585.00AAPL140530P005850008.75Up 0.418.658.9576294
585.00AAPL7140530P005850009.89Up 1.398.4510.503946
587.50AAPL140517P005875006.00Up 0.505.906.101,375367
587.50AAPL7140517P005875008.67Up 2.675.756.3026
587.50AAPL140523P005875000.15Up 0.15N/AN/A0119
587.50AAPL7140523P005875000.01Up 0.01N/AN/A0265
587.50AAPL140530P005875001.00Up 1.00N/AN/A020
587.50AAPL7140530P005875000.03Up 0.03N/AN/A86513
590.00AAPL140517P005900007.50Up 0.847.407.652,9144,498
590.00AAPL7140517P005900007.80Up 0.307.057.8020274
590.00AAPL140523P005900009.74Up 0.849.7510.00310601
590.00AAPL7140523P0059000012.52Up 5.679.5511.60119
590.00AAPL140530P0059000011.90Up 1.4511.3011.7037285
590.00AAPL7140530P0059000013.92Up 3.9211.0513.85122
592.50AAPL140517P005925009.35Up 1.109.109.35368633
595.00AAPL140517P0059500011.05Up 1.3010.9511.304001,569
595.00AAPL7140517P0059500011.00 0.0010.7013.0024140
595.00AAPL140523P0059500013.35Up 1.1812.9513.25353477
595.00AAPL7140523P0059500016.37Up 4.7212.7515.4018
595.00AAPL140530P0059500015.10Up 2.6514.4514.7519228
595.00AAPL7140530P0059500017.57Up 6.0714.1016.95519
597.50AAPL140517P0059750013.05Up 1.4512.8513.3585149
597.50AAPL7140517P005975009.00 0.0012.0014.901818
600.00AAPL140517P0060000015.15Up 1.5515.0015.502822,184
600.00AAPL7140517P0060000018.50Up 8.3014.8016.807142
600.00AAPL140523P0060000017.10Up 1.7516.5517.0092262
600.00AAPL7140523P0060000015.75 0.0016.1519.1058
600.00AAPL140530P0060000018.20Up 3.5617.8018.3036133
600.00AAPL7140530P0060000020.03Up 5.7517.6020.40129
602.50AAPL140517P0060250017.67Up 16.1717.2517.751263
605.00AAPL140517P0060500019.80Up 1.7019.6020.10251970
605.00AAPL7140517P0060500015.40 0.0018.5022.15166
605.00AAPL140523P0060500021.10Up 1.9020.6521.15305198
605.00AAPL140530P0060500023.31Up 1.3121.8522.753164
607.50AAPL140517P0060750021.90Down 2,240.1021.9022.5528
610.00AAPL140517P0061000024.55Up 1.0024.4024.9528417
610.00AAPL7140517P0061000016.04 0.0022.9026.9511
610.00AAPL140523P0061000026.30Up 1.8025.1026.4023134
610.00AAPL140530P0061000030.14Up 5.2125.9027.101132
612.50AAPL140517P0061250022.20Down 2,357.8026.7528.251010
615.00AAPL140517P0061500029.73Down 0.1729.1530.052156
615.00AAPL140523P0061500025.70 0.0029.7531.052046
615.00AAPL140530P0061500022.10 0.0030.3031.65112
620.00AAPL140517P0062000034.97Up 1.2233.9535.654275
620.00AAPL140523P0062000029.95 0.0034.2035.95472
620.00AAPL140530P0062000028.35 0.0034.5036.25522
625.00AAPL140517P0062500037.25 0.0038.6040.756130
625.00AAPL7140517P0062500033.80 0.0037.4041.4033
625.00AAPL140523P0062500032.20 0.0039.0040.802832
625.00AAPL140530P0062500038.85 0.0039.4541.1051010
630.00AAPL140517P0063000043.00 0.0043.2045.703246
630.00AAPL140523P0063000038.30 0.0044.0545.75812
630.00AAPL140530P0063000041.30 0.0043.7545.9014
635.00AAPL140517P0063500035.20 0.0048.2050.651240
635.00AAPL7140517P0063500055.50 0.0047.3051.7511
635.00AAPL140523P0063500044.20 0.0048.4550.7566
635.00AAPL140530P0063500046.10 0.0048.8050.80113
640.00AAPL140517P0064000050.90 0.0053.1555.654035
640.00AAPL7140517P00640000102.30 0.0052.6056.703242
640.00AAPL140523P0064000043.95 0.0053.6555.80612
640.00AAPL140530P0064000047.60 0.0053.7055.7522
645.00AAPL140517P0064500065.37 0.0058.5060.65213
645.00AAPL140523P0064500045.78 0.0058.6560.6511
645.00AAPL140530P0064500051.40 0.0059.0060.7022
650.00AAPL140517P0065000064.52Up 2.5264.0065.655004,292
650.00AAPL7140517P0065000062.90 0.0062.3566.7012
650.00AAPL140530P0065000065.00 0.0063.7065.7522
655.00AAPL140517P0065500068.00 0.0068.1570.652445
655.00AAPL7140517P0065500065.00 0.0067.4071.80313
660.00AAPL140517P0066000071.75 0.0073.5575.6534
675.00AAPL140517P0067500088.35 0.0088.5590.856566
680.00AAPL140517P0068000092.40 0.0093.9095.801,850600
695.00AAPL140517P00695000107.45 0.00108.60110.6512
700.00AAPL140517P00700000118.53Up 8.38113.15115.651160
710.00AAPL140517P00710000190.57 0.00123.65125.6501
715.00AAPL140517P00715000133.46Down 49.24128.60130.6578
720.00AAPL140517P00720000157.25 0.00133.60135.65115
725.00AAPL140517P00725000204.00 0.00138.65140.6587
740.00AAPL140517P00740000152.50 0.00153.60155.65133133
750.00AAPL140517P00750000164.50Down 21.15163.65165.6555
780.00AAPL140517P00780000189.60 0.00193.65195.752222
790.00AAPL140517P00790000199.37 0.00203.80205.653333
800.00AAPL140517P00800000208.26 0.00213.60215.70121121
805.00AAPL140517P00805000217.30 0.00218.55220.753434
   
Highlighted options are in-the-money.


Currency in USD.

+ + + diff --git a/pandas/io/tests/data/yahoo_options2.html b/pandas/io/tests/data/yahoo_options2.html new file mode 100644 index 00000000..91c7d419 --- /dev/null +++ b/pandas/io/tests/data/yahoo_options2.html @@ -0,0 +1,329 @@ + +AAPL Options | Apple Inc. Stock - Yahoo! Finance
 

Apple Inc. (AAPL) - NasdaqGS
593.76 Up 0.93 (0.16%) 4:00PM EDT
After Hours: 593.12 Down 0.64 (0.11%) 7:59PM EDT
Options
View By Expiration: May 14 | Jun 14 | Jul 14 | Aug 14 | Oct 14 | Jan 15 | Jan 16
Call Options: Expire at close Saturday, June 21, 2014
Strike | Symbol | Last | Chg | Bid | Ask | Vol | Open Int
300.00AAPL140621C00300000229.24 0.00293.05294.501515
330.00AAPL140621C00330000184.90 0.00263.05264.5511
400.00AAPL140621C00400000192.50 0.00193.10194.60210
420.00AAPL140621C00420000171.05 0.00173.05174.601571
430.00AAPL140621C00430000163.48Up 1.48163.10164.4537
450.00AAPL140621C00450000131.63 0.00143.05144.60219
450.00AAPL7140621C00450000112.50 0.00142.00145.8011
460.00AAPL140621C00460000131.00 0.00133.10134.5513621
465.00AAPL140621C00465000124.20 0.00128.15129.551461
470.00AAPL140621C00470000122.85 0.00123.15124.45156
475.00AAPL140621C00475000115.90 0.00118.10119.5519511
475.00AAPL7140621C00475000117.35 0.00117.10120.7511
480.00AAPL140621C00480000112.50 0.00113.20114.501267
485.00AAPL140621C00485000106.55 0.00108.10109.551243
490.00AAPL140621C00490000103.50Up 0.20103.20104.60121
495.00AAPL140621C0049500092.86 0.0098.3599.5013
500.00AAPL140606C0050000085.70 0.0092.9594.6033
500.00AAPL140613C0050000093.00Up 1.3593.0094.65212
500.00AAPL7140613C0050000082.00 0.0091.9595.8011
500.00AAPL140621C0050000094.60Up 0.7093.4094.554615
500.00AAPL7140621C0050000089.00 0.0092.2595.8037
505.00AAPL140621C0050500078.38 0.0088.4089.65376
505.00AAPL7140621C0050500097.00 0.0087.2590.65310
510.00AAPL140621C0051000083.20Down 0.3083.3584.702152
510.00AAPL7140621C0051000084.00 0.0082.2085.90102
515.00AAPL140621C0051500069.95 0.0078.4079.701125
520.00AAPL140606C0052000069.20 0.0072.9574.556315
520.00AAPL140621C0052000075.00Up 0.7573.5074.7542278
520.00AAPL7140621C0052000071.80 0.0072.2575.70121
525.00AAPL140621C0052500066.90Down 1.4568.6069.8510199
525.00AAPL7140621C0052500067.00 0.0067.4070.80328
530.00AAPL140613C0053000063.75Up 10.9563.3065.0012
530.00AAPL140621C0053000063.78Down 0.5763.6064.9528359
530.00AAPL7140621C0053000064.80Up 1.7562.4565.80126
535.00AAPL140621C0053500059.40Up 2.9058.8060.103155
535.00AAPL7140621C0053500048.18 0.0057.6061.10138
540.00AAPL140606C0054000052.42Up 10.1253.4554.70112
540.00AAPL140613C0054000052.90 0.0053.5555.1011
540.00AAPL140621C0054000055.10Up 1.0554.1055.0014440
540.00AAPL7140621C0054000054.80Down 0.2053.2556.3017
545.00AAPL140606C0054500043.42 0.0048.4049.703010
545.00AAPL140621C0054500050.00Up 2.0049.4550.555622
545.00AAPL7140621C0054500053.88 0.0048.5051.301533
545.00AAPL140627C0054500041.70 0.0049.7551.1011
550.00AAPL140606C0055000044.50Up 1.0043.7044.857297
550.00AAPL140613C0055000044.50Up 1.5044.2545.65422
550.00AAPL140621C0055000045.00Down 0.5044.7545.70392,403
550.00AAPL7140621C0055000045.00Up 8.5044.1046.605223
550.00AAPL140627C0055000043.80 0.0045.2546.6022
555.00AAPL140606C0055500034.53 0.0038.7040.009910
555.00AAPL140613C0055500038.72Up 4.8739.7041.2011
555.00AAPL140621C0055500040.75Up 1.9540.4041.3528885
555.00AAPL7140621C0055500039.01 0.0039.8542.153126
560.00AAPL140606C0056000033.50Down 0.5034.1035.30284
560.00AAPL140613C0056000035.31Down 0.0435.3536.452117
560.00AAPL7140613C0056000033.20 0.0034.6036.9044
560.00AAPL140621C0056000035.75Down 0.2536.0537.00202,934
560.00AAPL7140621C0056000035.90Down 0.2535.2537.501556
560.00AAPL140627C0056000034.25 0.0036.5538.0023
562.50AAPL140606C0056250030.00 0.0031.8533.001124
562.50AAPL7140606C0056250023.25 0.0030.8034.0599
565.00AAPL140606C0056500029.50Up 0.5029.6530.708455
565.00AAPL7140606C0056500029.80Down 7.7729.1531.80213
565.00AAPL140613C0056500029.75 0.0031.0532.15627
565.00AAPL7140613C0056500035.60 0.0030.4032.6522
565.00AAPL140621C0056500031.90Down 0.5031.7032.80411,573
565.00AAPL7140621C0056500032.41 0.0031.6033.506387
567.50AAPL140613C0056750027.60 0.0028.9030.10109
570.00AAPL140606C0057000026.36Up 0.4625.5526.0512319
570.00AAPL7140606C0057000018.40 0.0024.2026.90419
570.00AAPL140613C0057000027.49 0.0026.8028.05113
570.00AAPL140621C0057000028.30Down 0.7728.3028.502,4324,953
570.00AAPL7140621C0057000029.60Up 1.6027.4028.856655
570.00AAPL140627C0057000022.70 0.0028.9029.90510
572.50AAPL140613C0057250025.94Up 3.1924.9526.052012
575.00AAPL140606C0057500021.20Down 0.3021.1521.951439
575.00AAPL7140606C0057500025.30 0.0020.5022.9536
575.00AAPL140613C0057500023.75Down 0.5023.4024.201330
575.00AAPL140621C0057500024.50Down 0.6524.6024.852012,135
575.00AAPL7140621C0057500024.93Up 0.9324.3025.254384
577.50AAPL140613C0057750020.85 0.0021.3022.05110
580.00AAPL140606C0058000018.20Down 0.2017.7018.1512433
580.00AAPL7140606C0058000016.95 0.0016.7019.15114
580.00AAPL140613C0058000020.50 0.0019.6520.455182
580.00AAPL140621C0058000020.95Down 0.6021.1521.405233,065
580.00AAPL7140621C0058000021.50Down 0.1520.9521.804246
580.00AAPL140627C0058000022.40Up 0.2021.8022.553047
582.50AAPL140613C0058250013.50 0.0018.0518.7567
582.50AAPL7140613C0058250015.00 0.0016.8019.703030
582.50AAPL140627C0058250020.95 0.0020.3521.0049
585.00AAPL140606C0058500014.25Down 0.8514.4014.7019372
585.00AAPL7140606C0058500014.14Down 0.0813.4515.65128
585.00AAPL140613C0058500017.25Up 0.8516.4517.152177
585.00AAPL140621C0058500017.90Down 1.0318.1018.251612,407
585.00AAPL7140621C0058500017.85Down 0.6517.8518.6099304
585.00AAPL140627C0058500019.55Down 0.4318.8519.45511
587.50AAPL140613C0058750015.65 0.0015.0515.65134
587.50AAPL7140613C0058750014.00 0.0014.7016.75222
587.50AAPL140627C0058750017.10 0.0017.5018.00627
590.00AAPL140606C0059000011.05Down 0.9011.3511.70551,297
590.00AAPL7140606C0059000011.50Down 1.8710.2012.40645
590.00AAPL140613C0059000014.10Down 0.3713.6514.2538448
590.00AAPL140621C0059000015.15Down 0.5915.2515.4061814,296
590.00AAPL7140621C0059000015.20Down 0.8015.0515.8519230
590.00AAPL140627C0059000016.35Down 0.9516.1516.652057
590.00AAPL7140627C0059000014.13 0.0015.0017.5533
592.50AAPL140613C0059250012.60Down 0.3512.5012.852960
592.50AAPL7140613C0059250012.70Up 2.5012.1514.3523
592.50AAPL140627C0059250015.05Down 0.6514.8015.304012
592.50AAPL7140627C0059250014.80 0.0014.5516.5534
595.00AAPL140606C005950008.78Down 0.528.809.053151,315
595.00AAPL7140606C005950009.17Up 0.678.2010.154283
595.00AAPL140613C0059500011.55Up 0.0511.4511.60292318
595.00AAPL7140613C005950009.38 0.0011.0012.4558
595.00AAPL140621C0059500012.54Down 0.6612.6512.858572,741
595.00AAPL7140621C0059500013.25Down 0.2512.4513.3517317
595.00AAPL140627C0059500013.94Down 0.8613.6014.05760
595.00AAPL7140627C0059500014.24Down 1.7613.2015.1017
597.50AAPL140613C005975009.96 0.0010.0010.552274
597.50AAPL140627C0059750012.85 0.0012.6512.905236
600.00AAPL140606C006000006.70Down 0.406.656.903231,841
600.00AAPL7140606C006000006.75Down 0.806.557.1015179
600.00AAPL140613C006000009.40Down 0.409.209.40123690
600.00AAPL7140613C006000009.43Up 0.238.8510.001618
600.00AAPL140621C0060000010.55Down 0.5010.5010.608278,816
600.00AAPL7140621C0060000010.15Down 0.8510.2011.0068416
600.00AAPL140627C0060000011.63Down 0.8711.3511.80334
600.00AAPL7140627C006000009.22 0.0011.2012.6511
602.50AAPL140613C006025008.50 0.008.058.50551
602.50AAPL7140613C006025006.20 0.008.058.5033
605.00AAPL140606C006050005.03Down 0.274.905.10476849
605.00AAPL7140606C006050005.20 0.004.855.3014132
605.00AAPL140613C006050007.40Down 0.707.157.607112
605.00AAPL7140613C006050007.80 0.007.159.0029
605.00AAPL140621C006050008.45Down 0.608.508.604192,326
605.00AAPL7140621C006050008.70 0.008.358.657738
605.00AAPL140627C006050009.75Down 0.109.409.80817
607.50AAPL140613C006075005.72 0.006.356.7559
607.50AAPL7140613C0060750010.25 0.006.356.7544
607.50AAPL140627C006075009.64 0.008.558.90619
610.00AAPL140606C006100003.65Down 0.553.553.70648924
610.00AAPL7140606C006100003.90Down 0.253.503.9516135
610.00AAPL140613C006100006.10Down 0.255.606.0026131
610.00AAPL7140613C006100009.70 0.005.606.00510
610.00AAPL140621C006100006.75Down 0.646.857.004972,521
610.00AAPL7140621C006100007.20Down 0.206.757.0013423
610.00AAPL140627C006100008.01Down 0.447.758.10140
610.00AAPL7140627C006100008.02 0.007.559.1033
612.50AAPL140613C006125005.80 0.004.905.35345
612.50AAPL7140613C006125006.55 0.004.905.4012
612.50AAPL140627C006125005.57 0.007.007.402021
615.00AAPL140606C006150002.95Down 0.052.542.73105619
615.00AAPL7140606C006150002.82Down 0.172.422.838135
615.00AAPL140613C006150004.80Up 0.204.354.75551
615.00AAPL140621C006150005.40Down 0.605.455.552471,992
615.00AAPL7140621C006150005.30Down 0.015.355.70574
615.00AAPL140627C006150007.40 0.006.306.603938
617.50AAPL140613C006175004.15Up 0.653.854.20103
617.50AAPL7140613C006175006.45 0.003.804.1511
617.50AAPL140627C006175004.60 0.005.605.95121
617.50AAPL7140627C006175004.55 0.005.406.6033
620.00AAPL140606C006200001.88Down 0.411.841.97217738
620.00AAPL7140606C006200002.00Up 0.601.782.151846
620.00AAPL140613C006200003.50Down 0.553.353.7016140
620.00AAPL7140613C006200003.65Down 1.073.303.6543
620.00AAPL140621C006200004.26Down 0.494.304.402497,992
620.00AAPL7140621C006200004.60Down 0.354.204.459226
620.00AAPL140627C006200005.48Down 0.275.055.40176
620.00AAPL7140627C006200004.00 0.003.655.951016
622.50AAPL140613C006225003.20Up 0.452.933.251311
622.50AAPL7140613C006225003.10 0.002.903.3012
622.50AAPL7140627C006225004.60 0.004.405.4011
625.00AAPL140606C006250001.53Down 0.041.361.442,1642,256
625.00AAPL7140606C006250001.50Up 0.501.301.481029
625.00AAPL140613C006250002.75Down 0.452.582.841354
625.00AAPL7140613C006250002.18 0.002.532.8312
625.00AAPL140621C006250003.40Down 0.403.403.454112,307
625.00AAPL7140621C006250003.64Down 0.113.203.609230
625.00AAPL140627C006250004.40Down 0.204.104.35128
625.00AAPL7140627C006250004.42Up 0.223.904.9511
627.50AAPL140613C006275002.76 0.002.262.5224
627.50AAPL140627C006275003.94Down 0.263.653.95317
627.50AAPL7140627C006275003.80 0.003.454.4011
630.00AAPL140606C006300001.01Down 0.130.951.07139409
630.00AAPL7140606C006300001.77 0.000.891.1038
630.00AAPL140613C006300002.20Down 0.221.972.1441568
630.00AAPL7140613C006300005.10 0.001.972.1823
630.00AAPL140621C006300002.69Down 0.382.682.737325,019
630.00AAPL7140621C006300003.15 0.002.522.8521148
630.00AAPL140627C006300003.50Down 0.453.303.55439
630.00AAPL7140627C006300003.45 0.002.043.9511
632.50AAPL140613C006325001.95Down 0.051.721.8810925
632.50AAPL7140613C006325003.65 0.001.651.9611
635.00AAPL140606C006350000.74Down 0.010.690.7947252
635.00AAPL7140606C006350002.07 0.000.510.84322
635.00AAPL140613C006350001.65Down 0.301.531.664560
635.00AAPL7140613C006350003.30 0.001.431.8211
635.00AAPL140621C006350002.10Down 0.362.112.162171,188
635.00AAPL7140621C006350002.44 0.001.952.18157
635.00AAPL140627C006350002.94Down 0.162.652.85271
640.00AAPL140606C006400000.55Down 0.150.500.6439260
640.00AAPL7140606C006400000.65 0.000.470.66224
640.00AAPL140613C006400001.30Down 0.231.151.31141187
640.00AAPL7140613C006400001.24Down 0.211.151.331013
640.00AAPL140621C006400001.66Down 0.271.661.701121,451
640.00AAPL140627C006400002.29Down 0.182.112.31107
645.00AAPL140606C006450000.44Down 0.060.380.4933147
645.00AAPL7140606C006450001.65 0.000.200.5423
645.00AAPL140613C006450001.10 0.000.901.03489
645.00AAPL7140613C006450002.39 0.000.881.0755
645.00AAPL140621C006450001.35Down 0.261.321.36147848
645.00AAPL140627C006450001.78Up 0.131.681.882314
645.00AAPL7140627C006450001.91 0.001.172.1211
650.00AAPL140606C006500000.29Down 0.150.280.4014742
650.00AAPL7140606C006500000.35 0.000.120.44227
650.00AAPL140613C006500000.95 0.000.690.843124
650.00AAPL7140613C006500000.78Down 1.460.670.871011
650.00AAPL140621C006500001.08Down 0.221.061.1069410,025
650.00AAPL140627C006500001.46Up 0.131.361.572832
650.00AAPL7140627C006500003.95 0.000.521.8611
655.00AAPL140621C006550000.87Down 0.230.860.881661,192
660.00AAPL140621C006600000.74Down 0.120.690.721531,066
665.00AAPL140621C006650000.63Down 0.100.570.6062462
670.00AAPL140621C006700000.50Down 0.130.480.5049475
675.00AAPL140621C006750000.45Down 0.080.410.4231377
680.00AAPL140621C006800000.36Down 0.090.350.3625456
685.00AAPL140621C006850000.31Down 0.090.300.3112393
690.00AAPL140621C006900000.26Down 0.070.260.2718485
695.00AAPL140621C006950000.26Down 0.030.230.247333
700.00AAPL140621C007000000.21Down 0.060.200.211113,476
705.00AAPL140621C007050000.22Down 0.010.170.1825790
710.00AAPL140621C007100000.17Down 0.020.140.1612520
715.00AAPL140621C007150000.15Down 0.020.120.139441
720.00AAPL140621C007200000.11Down 0.030.100.1213265
725.00AAPL140621C007250000.10 0.000.090.109178
730.00AAPL140621C007300000.09Down 0.020.080.0938260
735.00AAPL140621C007350000.08Down 0.030.050.08116687
740.00AAPL140621C007400000.07Down 0.020.040.0798870
745.00AAPL140621C007450000.06Down 0.01N/A0.05171659
750.00AAPL140621C007500000.07 0.000.020.0521,954
755.00AAPL140621C007550000.07 0.00N/A0.04210558
760.00AAPL140621C007600000.03Down 0.01N/A0.03653,809
765.00AAPL140621C007650000.04 0.000.010.0352,394
770.00AAPL140621C007700000.01Down 0.020.010.022744,282
775.00AAPL140621C007750000.02 0.00N/A0.02502,627
780.00AAPL140621C007800000.02 0.00N/A0.025001,941
785.00AAPL140621C007850000.01Down 0.01N/A0.01362,514
Put Options (expire at close Saturday, June 21, 2014)
Strike  Symbol  Last  Chg  Bid  Ask  Vol  Open Int
280.00AAPL140621P002800000.05 0.00N/A0.091010
300.00AAPL140621P003000000.09 0.00N/A0.07144
325.00AAPL140621P003250000.09 0.00N/A0.091010
330.00AAPL140621P003300000.02 0.00N/A0.09120
335.00AAPL140621P003350000.01 0.00N/A0.094242
345.00AAPL140621P003450000.01 0.00N/A0.096666
350.00AAPL140621P003500000.20 0.00N/A0.083102
360.00AAPL140621P003600000.17 0.00N/A0.091020
365.00AAPL140621P003650000.02 0.00N/A0.0912180
370.00AAPL140621P003700000.02 0.00N/A0.09511
375.00AAPL140621P003750000.02 0.00N/A0.0955
380.00AAPL140621P003800000.10 0.00N/A0.032628
385.00AAPL140621P003850000.02 0.00N/A0.034176
390.00AAPL140621P003900000.03 0.00N/A0.054104
395.00AAPL140621P003950000.17 0.00N/A0.09215
400.00AAPL140621P004000000.02 0.00N/A0.032981
405.00AAPL140621P004050000.05Up 0.03N/A0.051749
410.00AAPL140621P004100000.02 0.00N/A0.055109
415.00AAPL140621P004150000.10 0.00N/A0.051097
420.00AAPL140621P004200000.01 0.00N/A0.0540256
425.00AAPL140621P004250000.04 0.00N/A0.0413685
425.00AAPL7140621P004250000.47 0.00N/A0.1655
430.00AAPL140621P004300000.04 0.00N/A0.0410412
435.00AAPL140621P004350000.04 0.00N/A0.043199
435.00AAPL7140621P004350000.31 0.00N/A0.1712
440.00AAPL140621P004400000.02 0.00N/A0.0411,593
440.00AAPL7140621P004400001.05 0.00N/A0.17116
445.00AAPL140621P004450000.02Down 0.03N/A0.041233
450.00AAPL140621P004500000.07 0.000.020.042732,123
450.00AAPL7140621P004500001.27 0.00N/A0.1915
455.00AAPL140621P004550000.05 0.00N/A0.1129245
460.00AAPL140621P004600000.06Up 0.01N/A0.111506
460.00AAPL7140621P004600000.30 0.00N/A0.2033
465.00AAPL140621P004650000.11 0.000.010.09271,123
465.00AAPL7140621P004650002.99 0.00N/A0.256498
470.00AAPL140621P004700000.04Down 0.050.020.1021,279
470.00AAPL7140621P004700004.70 0.00N/A0.251354
475.00AAPL140621P004750000.05Down 0.050.040.07412,028
475.00AAPL7140621P004750000.25 0.00N/A0.24579
480.00AAPL140621P004800000.07Down 0.050.040.09232,070
480.00AAPL7140621P004800000.22 0.000.030.25232
485.00AAPL140621P004850000.08Down 0.050.070.0816871
485.00AAPL7140621P004850004.45 0.00N/A0.25239
490.00AAPL140606P004900000.34 0.00N/A0.15266
490.00AAPL140621P004900000.09Down 0.070.090.101312,770
490.00AAPL7140621P004900005.00 0.00N/A0.30287
492.50AAPL140606P004925000.30 0.00N/A0.19412
495.00AAPL140606P004950000.60 0.00N/A0.2088
495.00AAPL140621P004950000.11Down 0.080.110.12642,606
495.00AAPL7140621P004950005.75 0.000.050.22322
500.00AAPL140606P005000000.21 0.000.010.18910
500.00AAPL7140606P005000001.45 0.00N/A0.253030
500.00AAPL140613P005000000.33 0.000.050.19412
500.00AAPL140621P005000000.13Down 0.110.130.142672,383
500.00AAPL7140621P005000000.20Down 0.150.090.24196
505.00AAPL140606P005050000.86 0.00N/A0.201617
505.00AAPL140621P005050000.16Down 0.110.150.17487861
505.00AAPL7140621P005050000.40 0.000.100.27334
507.50AAPL140606P005075000.27 0.000.010.22818
510.00AAPL140606P005100000.17 0.000.030.222035
510.00AAPL140621P005100000.20Down 0.130.190.202272,308
510.00AAPL7140621P005100000.23Down 0.150.120.303112
515.00AAPL140606P005150000.29 0.000.030.23150
515.00AAPL140613P005150000.24 0.000.130.2711
515.00AAPL140621P005150000.24Down 0.170.230.251591,634
515.00AAPL7140621P005150000.35Down 0.750.080.35124
517.50AAPL140606P005175000.41 0.000.030.18223
520.00AAPL140606P005200000.13Down 0.160.040.194128
520.00AAPL140613P005200000.34 0.000.180.31120
520.00AAPL140621P005200000.31Down 0.210.310.322952,777
520.00AAPL7140621P005200000.50 0.000.160.405125
522.50AAPL140606P005225000.54 0.000.060.2022
525.00AAPL140606P005250000.20Down 0.170.080.191204
525.00AAPL140613P005250000.47 0.000.250.371141
525.00AAPL140621P005250000.40Down 0.210.380.392122,729
525.00AAPL7140621P005250001.20 0.000.340.49194
527.50AAPL140606P005275000.17Down 1.290.100.20122
530.00AAPL140606P005300000.56 0.000.110.25372
530.00AAPL7140606P005300003.24 0.000.060.2744
530.00AAPL140613P005300000.76 0.000.340.45327
530.00AAPL140621P005300000.52Down 0.260.480.502892,675
530.00AAPL7140621P005300001.44 0.000.420.60475
530.00AAPL140627P005300000.75Down 0.230.680.8512122
532.50AAPL140606P005325000.90 0.000.110.2712
535.00AAPL140606P005350000.41 0.000.140.2940100
535.00AAPL7140606P005350001.12 0.000.150.31102
535.00AAPL140613P005350000.56Down 0.350.440.553108
535.00AAPL7140613P005350000.51Down 0.640.290.58101
535.00AAPL140621P005350000.65Down 0.300.630.651171,322
535.00AAPL7140621P005350002.05 0.000.580.75242
537.50AAPL140606P005375000.33 0.000.180.32336
540.00AAPL140606P005400000.29Down 0.120.220.3229133
540.00AAPL7140606P005400004.85 0.000.210.3716
540.00AAPL140613P005400001.09 0.000.600.70217
540.00AAPL140621P005400000.85Down 0.380.820.851443,077
540.00AAPL7140621P005400000.88Down 0.420.690.952369
540.00AAPL140627P005400001.25Down 1.931.151.33364
542.50AAPL140606P005425001.02 0.000.250.3912118
545.00AAPL140606P005450000.45Down 0.690.270.448107
545.00AAPL7140606P005450001.91 0.000.250.4611
545.00AAPL140613P005450000.87Down 0.270.810.9064135
545.00AAPL7140613P005450003.05 0.000.560.981010
545.00AAPL140621P005450001.14Down 0.411.091.133854,203
545.00AAPL7140621P005450003.10 0.000.951.217426
545.00AAPL140627P005450001.58Down 0.981.501.65221
547.50AAPL140606P005475000.49Down 0.280.360.49753
547.50AAPL7140606P005475002.43 0.000.310.5211
550.00AAPL140606P005500000.49Down 0.230.420.4926431
550.00AAPL7140606P005500001.10 0.000.240.6016
550.00AAPL140613P005500001.16Down 0.461.081.2069100
550.00AAPL7140613P005500001.11Down 2.690.981.271011
550.00AAPL140621P005500001.47Down 0.551.451.493444,118
550.00AAPL7140621P005500001.60Down 1.071.331.662348
550.00AAPL140627P005500002.54Down 0.711.952.121083
552.50AAPL140606P005525000.60Down 0.290.520.63933
552.50AAPL7140606P005525004.90 0.000.330.683030
555.00AAPL140606P005550000.89Down 0.120.620.745372
555.00AAPL7140606P005550003.55 0.000.430.7812
555.00AAPL140613P005550001.52Down 0.601.471.60102113
555.00AAPL7140613P005550004.70 0.001.261.642020
555.00AAPL140621P005550002.03Down 0.561.941.983881,057
555.00AAPL7140621P005550002.65Down 0.111.802.11827
555.00AAPL140627P005550003.75 0.002.532.72616
557.50AAPL140606P005575000.84Down 0.380.750.89567
557.50AAPL7140606P005575003.55 0.000.650.9214
557.50AAPL140627P005575002.96Down 3.592.873.10419
560.00AAPL140606P005600000.98Down 0.310.901.0041396
560.00AAPL7140606P005600001.73 0.000.721.06112
560.00AAPL140613P005600002.10Down 0.492.002.2315148
560.00AAPL140621P005600002.64Down 0.692.562.633875,780
560.00AAPL7140621P005600003.10Down 0.892.452.742130
560.00AAPL140627P005600003.90Down 0.703.203.502175
562.50AAPL140606P005625001.21Down 0.341.101.232154
562.50AAPL7140606P005625004.60 0.000.911.2512
562.50AAPL140613P005625002.51Down 2.542.292.573779
562.50AAPL7140613P005625006.45 0.002.092.491010
565.00AAPL140606P005650001.50Down 0.291.351.42461,051
565.00AAPL7140606P005650001.82Down 3.331.281.4817
565.00AAPL140613P005650002.87Down 0.592.662.9530191
565.00AAPL7140613P005650006.20 0.002.422.87113
565.00AAPL140621P005650003.50Down 0.703.403.452532,292
565.00AAPL7140621P005650008.32 0.003.253.605222
565.00AAPL140627P005650004.96Down 0.824.204.40114
567.50AAPL140613P005675003.22Down 0.783.103.401547
567.50AAPL140627P005675005.52 0.004.705.00733
567.50AAPL7140627P005675008.55 0.004.356.1011
570.00AAPL140606P005700002.05Down 0.531.942.12117469
570.00AAPL7140606P005700007.00 0.001.832.17715
570.00AAPL140613P005700004.05Down 0.443.553.95678
570.00AAPL7140613P005700008.03 0.003.553.8066
570.00AAPL140621P005700004.55Down 0.804.454.557593,360
570.00AAPL7140621P005700004.68Down 5.324.404.65480
570.00AAPL140627P005700006.10Down 1.125.305.60840
570.00AAPL7140627P005700008.30 0.005.006.7511
572.50AAPL140613P005725004.38Down 0.694.104.501893
572.50AAPL7140613P005725008.35 0.004.104.40111
572.50AAPL140627P005725007.30 0.005.956.3013
572.50AAPL7140627P0057250010.30 0.005.607.5011
575.00AAPL140606P005750003.00Down 0.852.832.9795265
575.00AAPL7140606P005750006.50 0.002.763.101263
575.00AAPL140613P005750005.10Down 0.904.705.1522144
575.00AAPL7140613P005750006.80 0.004.605.10122
575.00AAPL140621P005750005.86Down 0.975.705.855061,326
575.00AAPL7140621P005750007.79 0.005.705.953306
575.00AAPL140627P005750007.48Down 0.776.657.00123
577.50AAPL140613P005775005.73Down 1.425.405.85163
577.50AAPL7140613P0057750010.40 0.005.405.8015
577.50AAPL140627P0057750010.34 0.007.457.8011
580.00AAPL140606P005800004.15Down 0.924.004.20171301
580.00AAPL7140606P005800005.00 0.003.954.353346
580.00AAPL140613P005800006.58Down 1.426.206.601486
580.00AAPL7140613P005800007.40 0.006.056.60522
580.00AAPL140621P005800007.50Down 1.067.307.455133,276
580.00AAPL7140621P005800008.05Down 1.607.207.6024162
580.00AAPL140627P005800009.45Down 0.608.358.70127
582.50AAPL140613P005825007.25Down 1.207.007.45131
582.50AAPL7140613P0058250012.85 0.007.008.857272
582.50AAPL140627P0058250011.95 0.009.259.651384
585.00AAPL140606P005850005.60Down 1.855.555.95561271
585.00AAPL7140606P005850007.48 0.005.456.05212
585.00AAPL140613P005850009.15Down 0.757.958.404159
585.00AAPL7140613P0058500010.05 0.007.7510.3511
585.00AAPL140621P005850009.30Down 1.159.159.302931,145
585.00AAPL7140621P005850009.50Down 0.909.109.8020171
585.00AAPL140627P0058500010.25Down 1.6510.2510.60116
585.00AAPL7140627P0058500013.82 0.009.9512.0537
587.50AAPL140613P005875009.00Down 2.209.009.251099
587.50AAPL7140613P0058750013.50 0.008.6511.3522
587.50AAPL140627P0058750014.20 0.0011.3011.70454
590.00AAPL140606P005900007.80Down 1.207.607.85264546
590.00AAPL7140606P0059000017.67 0.007.359.40938
590.00AAPL140613P0059000010.80Down 1.5510.0510.5010211
590.00AAPL140621P0059000011.39Down 0.9711.3011.459462,582
590.00AAPL7140621P0059000012.39Down 2.7911.2511.60598
590.00AAPL140627P0059000015.90 0.0012.4012.801243
590.00AAPL7140627P0059000014.75 0.0012.1014.7024
592.50AAPL140613P0059250012.20Down 2.0011.2011.801017
592.50AAPL7140613P0059250013.02Down 2.4311.0013.6012
592.50AAPL140627P0059250017.00 0.0013.6013.904840
595.00AAPL140606P0059500010.15Down 0.9510.0010.3057406
595.00AAPL7140606P0059500018.55 0.009.7012.00125
595.00AAPL140613P0059500013.10Down 2.5012.5012.9518168
595.00AAPL7140613P0059500016.85 0.0012.2014.851313
595.00AAPL140621P0059500014.00Down 0.8913.8013.901851,228
595.00AAPL7140621P0059500023.00 0.0013.7014.50277
595.00AAPL140627P0059500015.07Down 1.6814.9015.15127
597.50AAPL140613P0059750017.00 0.0013.9014.65933
597.50AAPL140627P0059750025.10 0.0016.2016.50134
600.00AAPL140606P0060000013.02Down 1.9812.8013.4029153
600.00AAPL7140606P0060000013.00Down 9.9012.0015.10124
600.00AAPL140613P0060000015.53Down 3.1915.3015.752214
600.00AAPL7140613P0060000024.82 0.0014.9017.5025
600.00AAPL140621P0060000016.75Down 0.9516.4516.652182,099
600.00AAPL7140621P0060000022.80 0.0016.4517.301071
600.00AAPL140627P0060000020.20 0.0017.6018.1011
600.00AAPL7140627P0060000020.90 0.0016.9519.5022
602.50AAPL140613P0060250019.37 0.0016.8017.50323
602.50AAPL140627P0060250028.40 0.0019.1019.6033
605.00AAPL140606P0060500016.55Down 0.7016.1016.3517695
605.00AAPL7140606P0060500020.75 0.0015.0018.3014
605.00AAPL140613P0060500018.65Down 3.8518.3519.05741
605.00AAPL7140613P0060500023.40 0.0017.1020.001111
605.00AAPL140621P0060500019.85Down 1.1019.5519.70118468
605.00AAPL7140621P0060500030.30 0.0019.5020.3515
607.50AAPL7140613P0060750025.05 0.0019.1521.751212
607.50AAPL140627P0060750026.10 0.0022.2022.8522
610.00AAPL140606P0061000020.00Down 10.1019.7020.003473
610.00AAPL7140606P0061000024.10 0.0019.0021.852526
610.00AAPL140613P0061000024.70 0.0021.6522.6027
610.00AAPL140621P0061000023.20Down 1.0022.8023.1061493
610.00AAPL7140621P0061000027.80 0.0022.7523.652428
612.50AAPL140613P0061250034.05 0.0023.4024.3547
612.50AAPL7140613P0061250026.65 0.0022.3525.751010
612.50AAPL140627P0061250026.20Down 8.4525.6526.302525
615.00AAPL140606P0061500023.85Down 3.2023.7024.053435
615.00AAPL140613P0061500034.30 0.0025.3026.4046
615.00AAPL140621P0061500037.45 0.0026.5026.7513184
615.00AAPL7140621P0061500082.30 0.0026.4027.152010
617.50AAPL140613P0061750037.45 0.0027.2528.25417
617.50AAPL7140627P0061750039.68 0.0027.9031.2533
620.00AAPL140606P0062000028.70Down 10.0527.6528.7515
620.00AAPL7140606P0062000032.56 0.0026.6029.9534
620.00AAPL140613P0062000032.30 0.0029.2030.6527
620.00AAPL140621P0062000033.18 0.0030.3031.0020123
620.00AAPL7140627P0062000042.87 0.0029.9033.1033
625.00AAPL140606P0062500043.00 0.0032.1033.15133
625.00AAPL140613P0062500037.90 0.0033.4034.502020
625.00AAPL140621P0062500037.30 0.0034.1035.1021142
625.00AAPL7140621P0062500091.40 0.0033.7536.402010
627.50AAPL140627P0062750043.55 0.0036.8038.001010
630.00AAPL140606P0063000041.20 0.0036.3538.101414
630.00AAPL140613P0063000038.70Down 7.0037.7538.952510
630.00AAPL7140613P0063000041.50 0.0036.9539.7511
630.00AAPL140621P0063000039.00Down 10.8538.3539.352236
630.00AAPL7140621P0063000038.95 0.0038.0040.6555
630.00AAPL140627P0063000041.85 0.0039.0540.1511
635.00AAPL140606P0063500046.80 0.0041.4042.7544
635.00AAPL140613P0063500045.30 0.0042.2543.4048
635.00AAPL140621P0063500045.60 0.0042.7543.901124
635.00AAPL7140621P0063500047.20 0.0042.1045.0011
640.00AAPL140621P0064000048.00Down 1.5047.3048.60276
645.00AAPL140621P0064500054.50 0.0051.9553.102378
650.00AAPL140621P0065000058.00Down 8.9056.7057.852144
655.00AAPL140621P0065500060.80 0.0061.5062.702351
660.00AAPL140621P0066000069.80 0.0066.3567.5511
665.00AAPL140621P0066500069.25 0.0071.0572.4011
670.00AAPL140621P0067000078.75Down 9.7776.1077.4015
680.00AAPL140621P0068000081.60 0.0086.0087.2511
685.00AAPL140621P0068500099.75 0.0090.8092.3011
700.00AAPL140621P00700000138.05 0.00105.55107.3012
740.00AAPL140621P00740000155.70 0.00145.80147.2511
750.00AAPL140621P00750000159.65 0.00155.75157.2044
755.00AAPL140621P00755000168.60 0.00160.75162.2511
760.00AAPL140621P00760000173.60 0.00165.75167.2088
   
Highlighted options are in-the-money.

Currency in USD.

+ + + diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py new file mode 100644 index 00000000..3a0386c7 --- /dev/null +++ b/pandas/io/tests/generate_legacy_pickles.py @@ -0,0 +1,154 @@ +""" self-contained to write legacy pickle files """ +from __future__ import print_function + +def _create_sp_series(): + + import numpy as np + from pandas import SparseSeries + + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + index = np.arange(15) + arr[7:12] = nan + arr[-1:] = nan + + bseries = SparseSeries(arr, kind='block') + bseries.name = 'bseries' + return bseries + +def _create_sp_tsseries(): + + import numpy as np + from pandas import bdate_range, SparseTimeSeries + + nan = np.nan + + # nan-based + arr = np.arange(15, dtype=np.float64) + index = np.arange(15) + arr[7:12] = nan + arr[-1:] = nan + + date_index = bdate_range('1/1/2011', periods=len(index)) + bseries = SparseTimeSeries(arr, index=date_index, kind='block') + bseries.name = 'btsseries' + return bseries + +def _create_sp_frame(): + import numpy as np + from pandas import bdate_range, SparseDataFrame + + nan = np.nan + + data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C': np.arange(10).astype(np.int64), + 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + + dates = bdate_range('1/1/2011', periods=10) + return SparseDataFrame(data, index=dates) + +def create_data(): + """ create the pickle data """ + + import numpy as np + import pandas + from pandas import (Series,TimeSeries,DataFrame,Panel, + SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel, + Index,MultiIndex,PeriodIndex, + date_range,period_range,bdate_range,Timestamp) + nan = np.nan + + data = { + 'A': [0., 1., 2., 3., np.nan], + 'B': [0, 1, 0, 1, 0], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': date_range('1/1/2009', periods=5), + 'E' : [0., 1, Timestamp('20100101'),'foo',2.], + } + + index = dict(int = Index(np.arange(10)), + date = date_range('20130101',periods=10), + period = period_range('2013-01-01', freq='M', periods=10)) + + mi = dict(reg2 = MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), + names=['first', 'second'])) + series = dict(float = Series(data['A']), + int = Series(data['B']), + mixed = Series(data['E']), + ts = TimeSeries(np.arange(10).astype(np.int64),index=date_range('20130101',periods=10)), + mi = Series(np.arange(5).astype(np.float64),index=MultiIndex.from_tuples(tuple(zip(*[[1,1,2,2,2], + [3,4,3,4,5]])), + names=['one','two'])), + dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A'])) + + frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)), + int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)), + mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])), + mi = DataFrame(dict(A = np.arange(5).astype(np.float64), B = np.arange(5).astype(np.int64)), + index=MultiIndex.from_tuples(tuple(zip(*[['bar','bar','baz','baz','baz'], + ['one','two','one','two','three']])), + names=['first','second'])), + dup = DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), + columns=['A', 'B', 'A'])) + panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)), + dup = Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), + items=['A', 'B', 'A'])) + + + + return dict( series = series, + frame = frame, + panel = panel, + index = index, 
+ mi = mi, + sp_series = dict(float = _create_sp_series(), + ts = _create_sp_tsseries()), + sp_frame = dict(float = _create_sp_frame()) + ) + +def write_legacy_pickles(): + + # force our cwd to be the first searched + import sys + sys.path.insert(0,'.') + + import os, os.path + import numpy as np + import pandas + import pandas.util.testing as tm + import platform as pl + + # make sure we are < 0.13 compat (in py3) + try: + from pandas.compat import zip, cPickle as pickle + except: + import pickle + + sys_version = version = pandas.__version__ + if len(sys.argv) < 2: + exit("{0} ".format(sys.argv[0])) + + version = str(sys.argv[1]) + output_dir = str(sys.argv[2]) + + print("This script generates a pickle file for the current arch, system, and python version") + print(" system version: {0}".format(sys_version)) + print(" output version: {0}".format(version)) + print(" output dir : {0}".format(output_dir)) + + # construct a reasonable platform name + f = '_'.join([ str(version), str(pl.machine()), str(pl.system().lower()), str(pl.python_version()) ]) + pth = '{0}.pickle'.format(f) + + fh = open(os.path.join(output_dir,pth),'wb') + pickle.dump(create_data(),fh,pickle.HIGHEST_PROTOCOL) + fh.close() + + print("created pickle file: %s" % pth) + +if __name__ == '__main__': + write_legacy_pickles() diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py new file mode 100644 index 00000000..33e53fe0 --- /dev/null +++ b/pandas/io/tests/test_clipboard.py @@ -0,0 +1,103 @@ +import numpy as np +from numpy.random import randint + +import nose +import pandas as pd + +from pandas import DataFrame +from pandas import read_clipboard +from pandas import get_option +from pandas.util import testing as tm +from pandas.util.testing import makeCustomDataframe as mkdf, disabled + + +try: + import pandas.util.clipboard +except OSError: + raise nose.SkipTest("no clipboard found") + + +@disabled +class TestClipboard(tm.TestCase): + @classmethod + def setUpClass(cls): + super(TestClipboard, cls).setUpClass() + cls.data = {} + cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + cls.data['float'] = mkdf(5, 3, + data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, + 'b': np.arange(1, 6), + 'c': list('abcde')}) + # Test GH-5346 + max_rows = get_option('display.max_rows') + cls.data['longdf'] = mkdf(max_rows+1, 3, data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + cls.data_types = list(cls.data.keys()) + + @classmethod + def tearDownClass(cls): + super(TestClipboard, cls).tearDownClass() + del cls.data_types, cls.data + + def check_round_trip_frame(self, data_type, excel=None, sep=None): + data = self.data[data_type] + data.to_clipboard(excel=excel, sep=sep) + if sep is not None: + result = read_clipboard(sep=sep,index_col=0) + else: + result = read_clipboard() + tm.assert_frame_equal(data, result, check_dtype=False) + + def test_round_trip_frame_sep(self): + for dt in self.data_types: + self.check_round_trip_frame(dt,sep=',') + + def test_round_trip_frame_string(self): + for dt in self.data_types: + self.check_round_trip_frame(dt,excel=False) + + def test_round_trip_frame(self): + for dt in self.data_types: + 
self.check_round_trip_frame(dt) + + def test_read_clipboard_infer_excel(self): + from textwrap import dedent + from pandas.util.clipboard import clipboard_set + + text = dedent(""" + John James Charlie Mingus + 1 2 + 4 Harry Carney + """.strip()) + clipboard_set(text) + df = pd.read_clipboard() + + # excel data is parsed correctly + self.assertEqual(df.iloc[1][1], 'Harry Carney') + + # having diff tab counts doesn't trigger it + text = dedent(""" + a\t b + 1 2 + 3 4 + """.strip()) + clipboard_set(text) + res = pd.read_clipboard() + + text = dedent(""" + a b + 1 2 + 3 4 + """.strip()) + clipboard_set(text) + exp = pd.read_clipboard() + + tm.assert_frame_equal(res, exp) diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py new file mode 100644 index 00000000..ad6f071d --- /dev/null +++ b/pandas/io/tests/test_cparser.py @@ -0,0 +1,346 @@ +""" +C/Cython ascii file parser tests +""" + +from pandas.compat import StringIO, BytesIO, map +from datetime import datetime +from pandas import compat +import csv +import os +import sys +import re + +import nose + +from numpy import nan +import numpy as np + +from pandas import DataFrame, Series, Index, isnull, MultiIndex +import pandas.io.parsers as parsers +from pandas.io.parsers import (read_csv, read_table, read_fwf, + TextParser, TextFileReader) +from pandas.util.testing import (assert_almost_equal, assert_frame_equal, + assert_series_equal, network) +import pandas.lib as lib +from pandas import compat +from pandas.lib import Timestamp + +import pandas.util.testing as tm + +from pandas.parser import TextReader +import pandas.parser as parser + + +class TestCParser(tm.TestCase): + + def setUp(self): + self.dirpath = tm.get_data_path() + self.csv1 = os.path.join(self.dirpath, 'test1.csv') + self.csv2 = os.path.join(self.dirpath, 'test2.csv') + self.xls1 = os.path.join(self.dirpath, 'test.xls') + + def test_file_handle(self): + try: + f = open(self.csv1, 'rb') + reader = TextReader(f) + result = reader.read() + finally: + f.close() + + def test_string_filename(self): + reader = TextReader(self.csv1, header=None) + result = reader.read() + + def test_file_handle_mmap(self): + try: + f = open(self.csv1, 'rb') + reader = TextReader(f, memory_map=True, header=None) + result = reader.read() + finally: + f.close() + + def test_StringIO(self): + text = open(self.csv1, 'rb').read() + src = BytesIO(text) + reader = TextReader(src, header=None) + result = reader.read() + + def test_string_factorize(self): + # should this be optional? 
+ data = 'a\nb\na\nb\na' + reader = TextReader(StringIO(data), header=None) + result = reader.read() + self.assertEqual(len(set(map(id, result[0]))), 2) + + def test_skipinitialspace(self): + data = ('a, b\n' + 'a, b\n' + 'a, b\n' + 'a, b') + + reader = TextReader(StringIO(data), skipinitialspace=True, + header=None) + result = reader.read() + + self.assert_numpy_array_equal(result[0], ['a', 'a', 'a', 'a']) + self.assert_numpy_array_equal(result[1], ['b', 'b', 'b', 'b']) + + def test_parse_booleans(self): + data = 'True\nFalse\nTrue\nTrue' + + reader = TextReader(StringIO(data), header=None) + result = reader.read() + + self.assertEqual(result[0].dtype, np.bool_) + + def test_delimit_whitespace(self): + data = 'a b\na\t\t "b"\n"a"\t \t b' + + reader = TextReader(StringIO(data), delim_whitespace=True, + header=None) + result = reader.read() + + self.assert_numpy_array_equal(result[0], ['a', 'a', 'a']) + self.assert_numpy_array_equal(result[1], ['b', 'b', 'b']) + + def test_embedded_newline(self): + data = 'a\n"hello\nthere"\nthis' + + reader = TextReader(StringIO(data), header=None) + result = reader.read() + + expected = ['a', 'hello\nthere', 'this'] + self.assert_numpy_array_equal(result[0], expected) + + def test_euro_decimal(self): + data = '12345,67\n345,678' + + reader = TextReader(StringIO(data), delimiter=':', + decimal=',', header=None) + result = reader.read() + + expected = [12345.67, 345.678] + tm.assert_almost_equal(result[0], expected) + + def test_integer_thousands(self): + data = '123,456\n12,500' + + reader = TextReader(StringIO(data), delimiter=':', + thousands=',', header=None) + result = reader.read() + + expected = [123456, 12500] + tm.assert_almost_equal(result[0], expected) + + def test_integer_thousands_alt(self): + data = '123.456\n12.500' + + reader = TextFileReader(StringIO(data), delimiter=':', + thousands='.', header=None) + result = reader.read() + + expected = [123456, 12500] + tm.assert_almost_equal(result[0], expected) + + def test_skip_bad_lines(self): + # too many lines, see #2430 for why + data = ('a:b:c\n' + 'd:e:f\n' + 'g:h:i\n' + 'j:k:l:m\n' + 'l:m:n\n' + 'o:p:q:r') + + reader = TextReader(StringIO(data), delimiter=':', + header=None) + self.assertRaises(parser.CParserError, reader.read) + + reader = TextReader(StringIO(data), delimiter=':', + header=None, + error_bad_lines=False, + warn_bad_lines=False) + result = reader.read() + expected = {0: ['a', 'd', 'g', 'l'], + 1: ['b', 'e', 'h', 'm'], + 2: ['c', 'f', 'i', 'n']} + assert_array_dicts_equal(result, expected) + + stderr = sys.stderr + sys.stderr = StringIO() + try: + reader = TextReader(StringIO(data), delimiter=':', + header=None, + error_bad_lines=False, + warn_bad_lines=True) + reader.read() + val = sys.stderr.getvalue() + self.assertTrue('Skipping line 4' in val) + self.assertTrue('Skipping line 6' in val) + finally: + sys.stderr = stderr + + def test_header_not_enough_lines(self): + data = ('skip this\n' + 'skip this\n' + 'a,b,c\n' + '1,2,3\n' + '4,5,6') + + reader = TextReader(StringIO(data), delimiter=',', header=2, + as_recarray=True) + header = reader.header + expected = [['a', 'b', 'c']] + self.assertEqual(header, expected) + + recs = reader.read() + expected = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]} + assert_array_dicts_equal(expected, recs) + + # not enough rows + self.assertRaises(parser.CParserError, TextReader, StringIO(data), + delimiter=',', header=5, as_recarray=True) + + def test_escapechar(self): + data = ('\\"hello world\"\n' + '\\"hello world\"\n' + '\\"hello world\"') + + 
reader = TextReader(StringIO(data), delimiter=',', header=None, + escapechar='\\') + result = reader.read() + expected = {0: ['"hello world"'] * 3} + assert_array_dicts_equal(result, expected) + + def test_eof_has_eol(self): + # handling of new line at EOF + pass + + def test_na_substitution(self): + pass + + def test_numpy_string_dtype(self): + data = """\ +a,1 +aa,2 +aaa,3 +aaaa,4 +aaaaa,5""" + + def _make_reader(**kwds): + return TextReader(StringIO(data), delimiter=',', header=None, + **kwds) + + reader = _make_reader(dtype='S5,i4') + result = reader.read() + + self.assertEqual(result[0].dtype, 'S5') + + ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5') + self.assertTrue((result[0] == ex_values).all()) + self.assertEqual(result[1].dtype, 'i4') + + reader = _make_reader(dtype='S4') + result = reader.read() + self.assertEqual(result[0].dtype, 'S4') + ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4') + self.assertTrue((result[0] == ex_values).all()) + self.assertEqual(result[1].dtype, 'S4') + + reader = _make_reader(dtype='S4', as_recarray=True) + result = reader.read() + self.assertEqual(result['0'].dtype, 'S4') + ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4') + self.assertTrue((result['0'] == ex_values).all()) + self.assertEqual(result['1'].dtype, 'S4') + + def test_pass_dtype(self): + data = """\ +one,two +1,a +2,b +3,c +4,d""" + + def _make_reader(**kwds): + return TextReader(StringIO(data), delimiter=',', **kwds) + + reader = _make_reader(dtype={'one': 'u1', 1: 'S1'}) + result = reader.read() + self.assertEqual(result[0].dtype, 'u1') + self.assertEqual(result[1].dtype, 'S1') + + reader = _make_reader(dtype={'one': np.uint8, 1: object}) + result = reader.read() + self.assertEqual(result[0].dtype, 'u1') + self.assertEqual(result[1].dtype, 'O') + + reader = _make_reader(dtype={'one': np.dtype('u1'), + 1: np.dtype('O')}) + result = reader.read() + self.assertEqual(result[0].dtype, 'u1') + self.assertEqual(result[1].dtype, 'O') + + def test_usecols(self): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + + def _make_reader(**kwds): + return TextReader(StringIO(data), delimiter=',', **kwds) + + reader = _make_reader(usecols=(1, 2)) + result = reader.read() + + exp = _make_reader().read() + self.assertEqual(len(result), 2) + self.assertTrue((result[1] == exp[1]).all()) + self.assertTrue((result[2] == exp[2]).all()) + + def test_cr_delimited(self): + def _test(text, **kwargs): + nice_text = text.replace('\r', '\r\n') + result = TextReader(StringIO(text), **kwargs).read() + expected = TextReader(StringIO(nice_text), **kwargs).read() + assert_array_dicts_equal(result, expected) + + data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12' + _test(data, delimiter=',') + + data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12' + _test(data, delim_whitespace=True) + + data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12' + _test(data, delimiter=',') + + sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r' + 'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r' + ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0') + _test(sample, delimiter=',') + + data = 'A B C\r 2 3\r4 5 6' + _test(data, delim_whitespace=True) + + data = 'A B C\r2 3\r4 5 6' + _test(data, delim_whitespace=True) + + def test_empty_field_eof(self): + data = 'a,b,c\n1,2,3\n4,,' + + result = TextReader(StringIO(data), delimiter=',').read() + + expected = {0: np.array([1, 4]), + 1: np.array(['2', ''], dtype=object), + 2: np.array(['3', ''], dtype=object)} + assert_array_dicts_equal(result, expected) + + +def 
assert_array_dicts_equal(left, right): + for k, v in compat.iteritems(left): + assert(np.array_equal(v, right[k])) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py new file mode 100644 index 00000000..15ebeba9 --- /dev/null +++ b/pandas/io/tests/test_data.py @@ -0,0 +1,512 @@ +from __future__ import print_function +from pandas import compat +import warnings +import nose +from nose.tools import assert_equal +from datetime import datetime +import os + +import numpy as np +import pandas as pd +from pandas import DataFrame, Timestamp +from pandas.io import data as web +from pandas.io.data import DataReader, SymbolWarning, RemoteDataError +from pandas.util.testing import (assert_series_equal, assert_produces_warning, + network, assert_frame_equal) +import pandas.util.testing as tm +from numpy.testing import assert_array_equal + +if compat.PY3: + from urllib.error import HTTPError +else: + from urllib2 import HTTPError + + +def _skip_if_no_lxml(): + try: + import lxml + except ImportError: + raise nose.SkipTest("no lxml") + + +def assert_n_failed_equals_n_null_columns(wngs, obj, cls=SymbolWarning): + all_nan_cols = pd.Series(dict((k, pd.isnull(v).all()) for k, v in + compat.iteritems(obj))) + n_all_nan_cols = all_nan_cols.sum() + valid_warnings = pd.Series([wng for wng in wngs if isinstance(wng, cls)]) + assert_equal(len(valid_warnings), n_all_nan_cols) + failed_symbols = all_nan_cols[all_nan_cols].index + msgs = valid_warnings.map(lambda x: x.message) + assert msgs.str.contains('|'.join(failed_symbols)).all() + + +class TestGoogle(tm.TestCase): + @classmethod + def setUpClass(cls): + super(TestGoogle, cls).setUpClass() + cls.locales = tm.get_locales(prefix='en_US') + if not cls.locales: + raise nose.SkipTest("US English locale not available for testing") + + @classmethod + def tearDownClass(cls): + super(TestGoogle, cls).tearDownClass() + del cls.locales + + @network + def test_google(self): + # asserts that google is minimally working and that it throws + # an exception when DataReader can't get a 200 response from + # google + start = datetime(2010, 1, 1) + end = datetime(2013, 1, 27) + + for locale in self.locales: + with tm.set_locale(locale): + panel = web.DataReader("F", 'google', start, end) + self.assertEqual(panel.Close[-1], 13.68) + + self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", + 'google', start, end) + + @network + def test_get_quote_fails(self): + self.assertRaises(NotImplementedError, web.get_quote_google, + pd.Series(['GOOG', 'AAPL', 'GOOG'])) + + @network + def test_get_goog_volume(self): + for locale in self.locales: + with tm.set_locale(locale): + df = web.get_data_google('GOOG').sort_index() + self.assertEqual(df.Volume.ix['OCT-08-2010'], 2863473) + + @network + def test_get_multi1(self): + for locale in self.locales: + sl = ['AAPL', 'AMZN', 'GOOG'] + with tm.set_locale(locale): + pan = web.get_data_google(sl, '2012') + ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] + if (hasattr(pan, 'Close') and hasattr(pan.Close, 'GOOG') and + hasattr(pan.Close, 'AAPL')): + self.assertEqual(ts[0].dayofyear, 96) + else: + self.assertRaises(AttributeError, lambda: pan.Close) + + @network + def test_get_multi2(self): + with warnings.catch_warnings(record=True) as w: + for locale in self.locales: + with tm.set_locale(locale): + pan = web.get_data_google(['GE', 'MSFT', 'INTC'], + 'JAN-01-12', 'JAN-31-12') + result = 
pan.Close.ix['01-18-12'] + assert_n_failed_equals_n_null_columns(w, result) + + # sanity checking + + assert np.issubdtype(result.dtype, np.floating) + result = pan.Open.ix['Jan-15-12':'Jan-20-12'] + self.assertEqual((4, 3), result.shape) + assert_n_failed_equals_n_null_columns(w, result) + + +class TestYahoo(tm.TestCase): + @classmethod + def setUpClass(cls): + super(TestYahoo, cls).setUpClass() + _skip_if_no_lxml() + + @network + def test_yahoo(self): + # asserts that yahoo is minimally working and that it throws + # an exception when DataReader can't get a 200 response from + # yahoo + start = datetime(2010, 1, 1) + end = datetime(2013, 1, 27) + + self.assertEqual(web.DataReader("F", 'yahoo', start, end)['Close'][-1], + 13.68) + + @network + def test_yahoo_fails(self): + start = datetime(2010, 1, 1) + end = datetime(2013, 1, 27) + self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", + 'yahoo', start, end) + + @network + def test_get_quote_series(self): + df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) + assert_series_equal(df.ix[0], df.ix[2]) + + @network + def test_get_quote_string(self): + df = web.get_quote_yahoo('GOOG') + + @network + def test_get_quote_stringlist(self): + df = web.get_quote_yahoo(['GOOG', 'AAPL', 'GOOG']) + assert_series_equal(df.ix[0], df.ix[2]) + + @network + def test_get_components_dow_jones(self): + raise nose.SkipTest('unreliable test, receive partial components back for dow_jones') + + df = web.get_components_yahoo('^DJI') #Dow Jones + assert isinstance(df, pd.DataFrame) + self.assertEqual(len(df), 30) + + @network + def test_get_components_dax(self): + raise nose.SkipTest('unreliable test, receive partial components back for dax') + + df = web.get_components_yahoo('^GDAXI') #DAX + assert isinstance(df, pd.DataFrame) + self.assertEqual(len(df), 30) + self.assertEqual(df[df.name.str.contains('adidas', case=False)].index, + 'ADS.DE') + + @network + def test_get_components_nasdaq_100(self): + # as of 7/12/13 the conditional will test false because the link is invalid + raise nose.SkipTest('unreliable test, receive partial components back for nasdaq_100') + + df = web.get_components_yahoo('^NDX') #NASDAQ-100 + assert isinstance(df, pd.DataFrame) + + if len(df) > 1: + # Usual culprits, should be around for a while + assert 'AAPL' in df.index + assert 'GOOG' in df.index + assert 'AMZN' in df.index + else: + expected = DataFrame({'exchange': 'N/A', 'name': '@^NDX'}, + index=['@^NDX']) + assert_frame_equal(df, expected) + + @network + def test_get_data_single_symbol(self): + #single symbol + #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d + # just test that we succeed + web.get_data_yahoo('GOOG') + + @network + def test_get_data_multiple_symbols(self): + # just test that we succeed + sl = ['AAPL', 'AMZN', 'GOOG'] + web.get_data_yahoo(sl, '2012') + + @network + def test_get_data_multiple_symbols_two_dates(self): + pan = web.get_data_yahoo(['GE', 'MSFT', 'INTC'], 'JAN-01-12', + 'JAN-31-12') + result = pan.Close.ix['01-18-12'] + self.assertEqual(len(result), 3) + + # sanity checking + assert np.issubdtype(result.dtype, np.floating) + + expected = np.array([[18.99, 28.4, 25.18], + [18.58, 28.31, 25.13], + [19.03, 28.16, 25.52], + [18.81, 28.82, 25.87]]) + result = pan.Open.ix['Jan-15-12':'Jan-20-12'] + self.assertEqual(expected.shape, result.shape) + + @network + def test_get_date_ret_index(self): + pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987', + ret_index=True) + self.assertTrue(hasattr(pan, 
'Ret_Index')) + if hasattr(pan, 'Ret_Index') and hasattr(pan.Ret_Index, 'INTC'): + tstamp = pan.Ret_Index.INTC.first_valid_index() + result = pan.Ret_Index.ix[tstamp]['INTC'] + self.assertEqual(result, 1.0) + + # sanity checking + assert np.issubdtype(pan.values.dtype, np.floating) + + +class TestYahooOptions(tm.TestCase): + @classmethod + def setUpClass(cls): + super(TestYahooOptions, cls).setUpClass() + _skip_if_no_lxml() + + # aapl has monthlies + cls.aapl = web.Options('aapl', 'yahoo') + today = datetime.today() + year = today.year + month = today.month + 1 + if month > 12: + year = year + 1 + month = 1 + cls.expiry = datetime(year, month, 1) + cls.dirpath = tm.get_data_path() + cls.html1 = os.path.join(cls.dirpath, 'yahoo_options1.html') + cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html') + cls.root1 = cls.aapl._parse_url(cls.html1) + cls.root2 = cls.aapl._parse_url(cls.html2) + cls.tables1 = cls.aapl._parse_option_page_from_yahoo(cls.root1) + cls.unprocessed_data1 = web._parse_options_data(cls.tables1[cls.aapl._TABLE_LOC['puts']]) + cls.data1 = cls.aapl._process_data(cls.unprocessed_data1, 'put') + + @classmethod + def tearDownClass(cls): + super(TestYahooOptions, cls).tearDownClass() + del cls.aapl, cls.expiry + + @network + def test_get_options_data(self): + # regression test GH6105 + self.assertRaises(ValueError, self.aapl.get_options_data, month=3) + self.assertRaises(ValueError, self.aapl.get_options_data, year=1992) + + try: + options = self.aapl.get_options_data(expiry=self.expiry) + except RemoteDataError as e: + raise nose.SkipTest(e) + self.assertTrue(len(options) > 1) + + @network + def test_get_near_stock_price(self): + try: + options = self.aapl.get_near_stock_price(call=True, put=True, + expiry=self.expiry) + except RemoteDataError as e: + raise nose.SkipTest(e) + self.assertTrue(len(options) > 1) + + @network + def test_get_call_data(self): + try: + calls = self.aapl.get_call_data(expiry=self.expiry) + except RemoteDataError as e: + raise nose.SkipTest(e) + self.assertTrue(len(calls) > 1) + + @network + def test_get_put_data(self): + try: + puts = self.aapl.get_put_data(expiry=self.expiry) + except RemoteDataError as e: + raise nose.SkipTest(e) + self.assertTrue(len(puts) > 1) + + @network + def test_get_expiry_months(self): + try: + dates = self.aapl._get_expiry_months() + except RemoteDataError as e: + raise nose.SkipTest(e) + self.assertTrue(len(dates) > 1) + + @network + def test_get_all_data(self): + try: + data = self.aapl.get_all_data(put=True) + except RemoteDataError as e: + raise nose.SkipTest(e) + self.assertTrue(len(data) > 1) + + @network + def test_get_all_data_calls_only(self): + try: + data = self.aapl.get_all_data(call=True, put=False) + except RemoteDataError as e: + raise nose.SkipTest(e) + self.assertTrue(len(data) > 1) + + @network + def test_sample_page_price_quote_time1(self): + #Tests the weekend quote time format + price, quote_time = self.aapl._get_underlying_price(self.root1) + self.assertIsInstance(price, (int, float, complex)) + self.assertIsInstance(quote_time, (datetime, Timestamp)) + + def test_chop(self): + #regression test for #7625 + self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan) + chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=300) + self.assertIsInstance(chopped, DataFrame) + self.assertTrue(len(chopped) > 1) + + @network + def test_sample_page_price_quote_time2(self): + #Tests the weekday quote time format + price, quote_time = 
self.aapl._get_underlying_price(self.root2) + self.assertIsInstance(price, (int, float, complex)) + self.assertIsInstance(quote_time, (datetime, Timestamp)) + + @network + def test_sample_page_chg_float(self): + #Tests that numeric columns with comma's are appropriately dealt with + self.assertEqual(self.data1['Chg'].dtype, 'float64') + + +class TestOptionsWarnings(tm.TestCase): + @classmethod + def setUpClass(cls): + super(TestOptionsWarnings, cls).setUpClass() + _skip_if_no_lxml() + + with assert_produces_warning(FutureWarning): + cls.aapl = web.Options('aapl') + + today = datetime.today() + cls.year = today.year + cls.month = today.month + 1 + if cls.month > 12: + cls.year += 1 + cls.month = 1 + + @classmethod + def tearDownClass(cls): + super(TestOptionsWarnings, cls).tearDownClass() + del cls.aapl, cls.year, cls.month + + @network + def test_get_options_data_warning(self): + with assert_produces_warning(): + try: + self.aapl.get_options_data(month=self.month, year=self.year) + except RemoteDataError as e: + raise nose.SkipTest(e) + + @network + def test_get_near_stock_price_warning(self): + with assert_produces_warning(): + try: + options_near = self.aapl.get_near_stock_price(call=True, + put=True, + month=self.month, + year=self.year) + except RemoteDataError as e: + raise nose.SkipTest(e) + + @network + def test_get_call_data_warning(self): + with assert_produces_warning(): + try: + self.aapl.get_call_data(month=self.month, year=self.year) + except RemoteDataError as e: + raise nose.SkipTest(e) + + @network + def test_get_put_data_warning(self): + with assert_produces_warning(): + try: + self.aapl.get_put_data(month=self.month, year=self.year) + except RemoteDataError as e: + raise nose.SkipTest(e) + + +class TestDataReader(tm.TestCase): + def test_is_s3_url(self): + from pandas.io.common import _is_s3_url + self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com")) + + @network + def test_read_yahoo(self): + gs = DataReader("GS", "yahoo") + assert isinstance(gs, DataFrame) + + @network + def test_read_google(self): + gs = DataReader("GS", "google") + assert isinstance(gs, DataFrame) + + @network + def test_read_fred(self): + vix = DataReader("VIXCLS", "fred") + assert isinstance(vix, DataFrame) + + @network + def test_read_famafrench(self): + for name in ("F-F_Research_Data_Factors", + "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3", + "F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"): + ff = DataReader(name, "famafrench") + assert ff + assert isinstance(ff, dict) + + +class TestFred(tm.TestCase): + @network + def test_fred(self): + + # Throws an exception when DataReader can't get a 200 response from + # FRED. 
+ + start = datetime(2010, 1, 1) + end = datetime(2013, 1, 27) + + received = web.DataReader("GDP", "fred", start, end)['GDP'].tail(1)[0] + self.assertEqual(int(received), 16535) + + self.assertRaises(Exception, web.DataReader, "NON EXISTENT SERIES", + 'fred', start, end) + + @network + def test_fred_nan(self): + start = datetime(2010, 1, 1) + end = datetime(2013, 1, 27) + df = web.DataReader("DFII5", "fred", start, end) + assert pd.isnull(df.ix['2010-01-01'][0]) + + @network + def test_fred_parts(self): + raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') + + start = datetime(2010, 1, 1) + end = datetime(2013, 1, 27) + df = web.get_data_fred("CPIAUCSL", start, end) + self.assertEqual(df.ix['2010-05-01'][0], 217.23) + + t = df.CPIAUCSL.values + assert np.issubdtype(t.dtype, np.floating) + self.assertEqual(t.shape, (37,)) + + @network + def test_fred_part2(self): + expected = [[576.7], + [962.9], + [684.7], + [848.3], + [933.3]] + result = web.get_data_fred("A09024USA144NNBR", start="1915").ix[:5] + assert_array_equal(result.values, np.array(expected)) + + @network + def test_invalid_series(self): + name = "NOT A REAL SERIES" + self.assertRaises(Exception, web.get_data_fred, name) + + @network + def test_fred_multi(self): + raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') + + names = ['CPIAUCSL', 'CPALTT01USQ661S', 'CPILFESL'] + start = datetime(2010, 1, 1) + end = datetime(2013, 1, 27) + + received = web.DataReader(names, "fred", start, end).head(1) + expected = DataFrame([[217.478, 0.99701529, 220.544]], columns=names, + index=[pd.tslib.Timestamp('2010-01-01 00:00:00')]) + expected.index.rename('DATE', inplace=True) + assert_frame_equal(received, expected, check_less_precise=True) + + @network + def test_fred_multi_bad_series(self): + + names = ['NOTAREALSERIES', 'CPIAUCSL', "ALSO FAKE"] + with tm.assertRaises(HTTPError): + DataReader(names, data_source="fred") + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py new file mode 100644 index 00000000..ee537d94 --- /dev/null +++ b/pandas/io/tests/test_date_converters.py @@ -0,0 +1,126 @@ +from pandas.compat import StringIO, BytesIO +from datetime import date, datetime +import csv +import os +import sys +import re + +import nose + +from numpy import nan +import numpy as np +from numpy.testing.decorators import slow + +from pandas import DataFrame, Series, Index, isnull +import pandas.io.parsers as parsers +from pandas.io.parsers import (read_csv, read_table, read_fwf, + TextParser) +from pandas.util.testing import (assert_almost_equal, assert_frame_equal, + assert_series_equal, network) +import pandas.lib as lib +from pandas import compat +from pandas.lib import Timestamp +import pandas.io.date_converters as conv +import pandas.util.testing as tm + +class TestConverters(tm.TestCase): + + def setUp(self): + self.years = np.array([2007, 2008]) + self.months = np.array([1, 2]) + self.days = np.array([3, 4]) + self.hours = np.array([5, 6]) + self.minutes = np.array([7, 8]) + self.seconds = np.array([9, 0]) + self.dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) + self.times = np.array(['05:07:09', '06:08:00'], dtype=object) + self.expected = np.array([datetime(2007, 1, 3, 5, 7, 9), + datetime(2008, 2, 4, 6, 8, 0)]) + + def test_parse_date_time(self): + result = conv.parse_date_time(self.dates, self.times) + self.assertTrue((result == 
self.expected).all()) + + data = """\ +date, time, a, b +2001-01-05, 10:00:00, 0.0, 10. +2001-01-05, 00:00:00, 1., 11. +""" + datecols = {'date_time': [0, 1]} + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, date_parser=conv.parse_date_time) + self.assertIn('date_time', df) + self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0)) + + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + + date_spec = {'nominal': [1, 2], 'actual': [1, 3]} + df = read_csv(StringIO(data), header=None, parse_dates=date_spec, + date_parser=conv.parse_date_time) + + def test_parse_date_fields(self): + result = conv.parse_date_fields(self.years, self.months, self.days) + expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) + self.assertTrue((result == expected).all()) + + data = "year, month, day, a\n 2001 , 01 , 10 , 10.\n 2001 , 02 , 1 , 11." + datecols = {'ymd': [0, 1, 2]} + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_date_fields) + self.assertIn('ymd', df) + self.assertEqual(df.ymd.ix[0], datetime(2001, 1, 10)) + + def test_datetime_six_col(self): + result = conv.parse_all_fields(self.years, self.months, self.days, + self.hours, self.minutes, self.seconds) + self.assertTrue((result == self.expected).all()) + + data = """\ +year, month, day, hour, minute, second, a, b +2001, 01, 05, 10, 00, 0, 0.0, 10. +2001, 01, 5, 10, 0, 00, 1., 11. +""" + datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_all_fields) + self.assertIn('ymdHMS', df) + self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0)) + + def test_datetime_fractional_seconds(self): + data = """\ +year, month, day, hour, minute, second, a, b +2001, 01, 05, 10, 00, 0.123456, 0.0, 10. +2001, 01, 5, 10, 0, 0.500000, 1., 11. +""" + datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_all_fields) + self.assertIn('ymdHMS', df) + self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0, + microsecond=123456)) + self.assertEqual(df.ymdHMS.ix[1], datetime(2001, 1, 5, 10, 0, 0, + microsecond=500000)) + + def test_generic(self): + data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." 
+ datecols = {'ym': [0, 1]} + dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1) + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=dateconverter) + self.assertIn('ym', df) + self.assertEqual(df.ym.ix[0], date(2001, 1, 1)) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py new file mode 100644 index 00000000..96db5353 --- /dev/null +++ b/pandas/io/tests/test_excel.py @@ -0,0 +1,1311 @@ +# pylint: disable=E1101 + +from pandas.compat import u, range, map, openpyxl_compat +from datetime import datetime, date, time +import sys +import os +from distutils.version import LooseVersion + +import operator +import functools +import nose + +from numpy import nan +import numpy as np +from numpy.testing.decorators import slow + +from pandas import DataFrame, Index, MultiIndex +from pandas.io.parsers import read_csv +from pandas.io.excel import ( + ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _OpenpyxlWriter, + register_writer, _XlsxWriter +) +from pandas.io.common import URLError +from pandas.util.testing import ensure_clean +from pandas.core.config import set_option, get_option +import pandas.util.testing as tm +import pandas as pd + + +def _skip_if_no_xlrd(): + try: + import xlrd + ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) + if ver < (0, 9): + raise nose.SkipTest('xlrd < 0.9, skipping') + except ImportError: + raise nose.SkipTest('xlrd not installed, skipping') + + +def _skip_if_no_xlwt(): + try: + import xlwt # NOQA + except ImportError: + raise nose.SkipTest('xlwt not installed, skipping') + + +def _skip_if_no_openpyxl(): + try: + import openpyxl # NOQA + except ImportError: + raise nose.SkipTest('openpyxl not installed, skipping') + + +def _skip_if_no_xlsxwriter(): + try: + import xlsxwriter # NOQA + except ImportError: + raise nose.SkipTest('xlsxwriter not installed, skipping') + + +def _skip_if_no_excelsuite(): + _skip_if_no_xlrd() + _skip_if_no_xlwt() + _skip_if_no_openpyxl() + + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() +_frame = DataFrame(_seriesd)[:10] +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] +_tsframe = tm.makeTimeDataFrame()[:5] +_mixed_frame = _frame.copy() +_mixed_frame['foo'] = 'bar' + + +class SharedItems(object): + def setUp(self): + self.dirpath = tm.get_data_path() + self.csv1 = os.path.join(self.dirpath, 'test1.csv') + self.csv2 = os.path.join(self.dirpath, 'test2.csv') + self.xls1 = os.path.join(self.dirpath, 'test.xls') + self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx') + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + def read_csv(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = 'python' + return read_csv(*args, **kwds) + + +class ExcelReaderTests(SharedItems, tm.TestCase): + def test_parse_cols_int(self): + _skip_if_no_openpyxl() + _skip_if_no_xlrd() + + suffix = ['xls', 'xlsx', 'xlsm'] + + for s in suffix: + pth = os.path.join(self.dirpath, 'test.%s' % s) + xls = ExcelFile(pth) + df = xls.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols=3) + df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = df2.reindex(columns=['A', 'B', 'C']) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, parse_cols=3) + # TODO add index to xls file) + tm.assert_frame_equal(df, 
df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + def test_parse_cols_list(self): + _skip_if_no_openpyxl() + _skip_if_no_xlrd() + + suffix = ['xls', 'xlsx', 'xlsm'] + + for s in suffix: + pth = os.path.join(self.dirpath, 'test.%s' % s) + xls = ExcelFile(pth) + df = xls.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols=[0, 2, 3]) + df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = df2.reindex(columns=['B', 'C']) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols=[0, 2, 3]) + # TODO add index to xls file) + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + def test_parse_cols_str(self): + _skip_if_no_openpyxl() + _skip_if_no_xlrd() + + suffix = ['xls', 'xlsx', 'xlsm'] + + for s in suffix: + + pth = os.path.join(self.dirpath, 'test.%s' % s) + xls = ExcelFile(pth) + + df = xls.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A:D') + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = df2.reindex(columns=['A', 'B', 'C']) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, parse_cols='A:D') + # TODO add index to xls, read xls ignores index name ? + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + del df, df2, df3 + + df = xls.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A,C,D') + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = df2.reindex(columns=['B', 'C']) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols='A,C,D') + # TODO add index to xls file + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + del df, df2, df3 + + df = xls.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A,C:D') + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = df2.reindex(columns=['B', 'C']) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols='A,C:D') + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + def test_excel_stop_iterator(self): + _skip_if_no_xlrd() + + excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xls')) + parsed = excel_data.parse('Sheet1') + expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) + tm.assert_frame_equal(parsed, expected) + + def test_excel_cell_error_na(self): + _skip_if_no_xlrd() + + excel_data = ExcelFile(os.path.join(self.dirpath, 'test3.xls')) + parsed = excel_data.parse('Sheet1') + expected = DataFrame([[np.nan]], columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + def test_excel_passes_na(self): + _skip_if_no_xlrd() + + excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xlsx')) + parsed = excel_data.parse('Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = excel_data.parse('Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + def check_excel_table_sheet_by_index(self, filename, csvfile): + import xlrd + + pth = os.path.join(self.dirpath, filename) + xls = ExcelFile(pth) + df = xls.parse(0, index_col=0, parse_dates=True) + df2 = self.read_csv(csvfile, index_col=0, 
parse_dates=True) + df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True) + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1) + df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1) + tm.assert_frame_equal(df4, df.ix[:-1]) + tm.assert_frame_equal(df4, df5) + + self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf') + + def test_excel_table_sheet_by_index(self): + _skip_if_no_xlrd() + for filename, csvfile in [(self.xls1, self.csv1), + (self.xlsx1, self.csv1)]: + self.check_excel_table_sheet_by_index(filename, csvfile) + + def test_excel_table(self): + _skip_if_no_xlrd() + + pth = os.path.join(self.dirpath, 'test.xls') + xls = ExcelFile(pth) + df = xls.parse('Sheet1', index_col=0, parse_dates=True) + df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + df4 = xls.parse('Sheet1', index_col=0, parse_dates=True, + skipfooter=1) + df5 = xls.parse('Sheet1', index_col=0, parse_dates=True, + skip_footer=1) + tm.assert_frame_equal(df4, df.ix[:-1]) + tm.assert_frame_equal(df4, df5) + + def test_excel_read_buffer(self): + _skip_if_no_xlrd() + _skip_if_no_openpyxl() + + pth = os.path.join(self.dirpath, 'test.xls') + f = open(pth, 'rb') + xls = ExcelFile(f) + # it works + xls.parse('Sheet1', index_col=0, parse_dates=True) + + pth = os.path.join(self.dirpath, 'test.xlsx') + f = open(pth, 'rb') + xl = ExcelFile(f) + xl.parse('Sheet1', index_col=0, parse_dates=True) + + def test_read_xlrd_Book(self): + _skip_if_no_xlrd() + _skip_if_no_xlwt() + + import xlrd + + df = self.frame + + with ensure_clean('.xls') as pth: + df.to_excel(pth, "SheetA") + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine="xlrd") as xl: + result = xl.parse("SheetA") + tm.assert_frame_equal(df, result) + + result = read_excel(book, sheetname="SheetA", engine="xlrd") + tm.assert_frame_equal(df, result) + + @tm.network + def test_read_from_http_url(self): + _skip_if_no_xlrd() + + url = ('https://raw.github.com/pydata/pandas/master/' + 'pandas/io/tests/data/test.xlsx') + url_table = read_excel(url) + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'test.xlsx') + local_table = read_excel(localtable) + tm.assert_frame_equal(url_table, local_table) + + @slow + def test_read_from_file_url(self): + _skip_if_no_xlrd() + + # FILE + if sys.version_info[:2] < (2, 6): + raise nose.SkipTest("file:// not supported with Python < 2.6") + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'test.xlsx') + local_table = read_excel(localtable) + + try: + url_table = read_excel('file://localhost/' + localtable) + except URLError: + # fails on some systems + raise nose.SkipTest("failing on %s" % + ' '.join(platform.uname()).strip()) + + tm.assert_frame_equal(url_table, local_table) + + + def test_xlsx_table(self): + _skip_if_no_xlrd() + _skip_if_no_openpyxl() + + pth = os.path.join(self.dirpath, 'test.xlsx') + xlsx = ExcelFile(pth) + df = xlsx.parse('Sheet1', index_col=0, parse_dates=True) + df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) + df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) + + # TODO add index to xlsx file + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + df4 = 
xlsx.parse('Sheet1', index_col=0, parse_dates=True, + skipfooter=1) + df5 = xlsx.parse('Sheet1', index_col=0, parse_dates=True, + skip_footer=1) + tm.assert_frame_equal(df4, df.ix[:-1]) + tm.assert_frame_equal(df4, df5) + + def test_reader_closes_file(self): + _skip_if_no_xlrd() + _skip_if_no_openpyxl() + + pth = os.path.join(self.dirpath, 'test.xlsx') + f = open(pth, 'rb') + with ExcelFile(f) as xlsx: + # parses okay + xlsx.parse('Sheet1', index_col=0) + + self.assertTrue(f.closed) + + def test_reader_special_dtypes(self): + _skip_if_no_xlrd() + + expected = DataFrame.from_items([ + ("IntCol", [1, 2, -3, 4, 0]), + ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), + ("BoolCol", [True, False, True, True, False]), + ("StrCol", [1, 2, 3, 4, 5]), + # GH5394 - this is why convert_float isn't vectorized + ("Str2Col", ["a", 3, "c", "d", "e"]), + ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), + datetime(1905, 1, 1), datetime(2013, 12, 14), + datetime(2015, 3, 14)]) + ]) + + xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx') + xls_path = os.path.join(self.dirpath, 'test_types.xls') + + # should read in correctly and infer types + for path in (xls_path, xlsx_path): + actual = read_excel(path, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + # if not coercing number, then int comes in as float + float_expected = expected.copy() + float_expected["IntCol"] = float_expected["IntCol"].astype(float) + float_expected.loc[1, "Str2Col"] = 3.0 + for path in (xls_path, xlsx_path): + actual = read_excel(path, 'Sheet1', convert_float=False) + tm.assert_frame_equal(actual, float_expected) + + # check setting Index (assuming xls and xlsx are the same here) + for icol, name in enumerate(expected.columns): + actual = read_excel(xlsx_path, 'Sheet1', index_col=icol) + actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name) + exp = expected.set_index(name) + tm.assert_frame_equal(actual, exp) + tm.assert_frame_equal(actual2, exp) + + # convert_float and converters should be different but both accepted + expected["StrCol"] = expected["StrCol"].apply(str) + actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}) + tm.assert_frame_equal(actual, expected) + + no_convert_float = float_expected.copy() + no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) + actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}, + convert_float=False) + tm.assert_frame_equal(actual, no_convert_float) + + def test_reader_seconds(self): + # Test reading times with and without milliseconds. GH5945. + _skip_if_no_xlrd() + import xlrd + + if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): + # Xlrd >= 0.9.3 can handle Excel milliseconds. + expected = DataFrame.from_items([("Time", + [time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54)])]) + else: + # Xlrd < 0.9.3 rounds Excel milliseconds. 
+ expected = DataFrame.from_items([("Time", + [time(1, 2, 3), + time(2, 45, 56), + time(4, 29, 49), + time(6, 13, 42), + time(7, 57, 35), + time(9, 41, 29), + time(11, 25, 22), + time(13, 9, 15), + time(14, 53, 8), + time(16, 37, 1), + time(18, 20, 54)])]) + + epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls') + epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls') + + actual = read_excel(epoch_1900, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + actual = read_excel(epoch_1904, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + +class ExcelWriterBase(SharedItems): + # Base class for test cases to run with different Excel writers. + # To add a writer test, define the following: + # 1. A check_skip function that skips your tests if your writer isn't + # installed. + # 2. Add a property ext, which is the file extension that your writer + # writes to. (needs to start with '.' so it's a valid path) + # 3. Add a property engine_name, which is the name of the writer class. + + # Test with MultiIndex and Hierarchical Rows as merged cells. + merge_cells = True + + def setUp(self): + self.check_skip() + super(ExcelWriterBase, self).setUp() + self.option_name = 'io.excel.%s.writer' % self.ext.strip('.') + self.prev_engine = get_option(self.option_name) + set_option(self.option_name, self.engine_name) + + def tearDown(self): + set_option(self.option_name, self.prev_engine) + + def test_excel_sheet_by_name_raise(self): + _skip_if_no_xlrd() + import xlrd + + with ensure_clean(self.ext) as pth: + gt = DataFrame(np.random.randn(10, 2)) + gt.to_excel(pth) + xl = ExcelFile(pth) + df = xl.parse(0) + tm.assert_frame_equal(gt, df) + + self.assertRaises(xlrd.XLRDError, xl.parse, '0') + + def test_excel_deprecated_options(self): + with ensure_clean(self.ext) as path: + with tm.assert_produces_warning(FutureWarning): + self.frame.to_excel(path, 'test1', cols=['A', 'B']) + + with tm.assert_produces_warning(False): + self.frame.to_excel(path, 'test1', columns=['A', 'B']) + + def test_excelwriter_contextmanager(self): + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as pth: + with ExcelWriter(pth) as writer: + self.frame.to_excel(writer, 'Data1') + self.frame2.to_excel(writer, 'Data2') + + with ExcelFile(pth) as reader: + found_df = reader.parse('Data1') + found_df2 = reader.parse('Data2') + tm.assert_frame_equal(found_df, self.frame) + tm.assert_frame_equal(found_df2, self.frame2) + + def test_roundtrip(self): + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as path: + self.frame['A'][:5] = nan + + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', columns=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + + # test roundtrip + self.frame.to_excel(path, 'test1') + recons = read_excel(path, 'test1', index_col=0) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(path, 'test1', index=False) + recons = read_excel(path, 'test1', index_col=None) + recons.index = self.frame.index + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(path, 'test1', na_rep='NA') + recons = read_excel(path, 'test1', index_col=0, na_values=['NA']) + tm.assert_frame_equal(self.frame, recons) + + # GH 3611 + self.frame.to_excel(path, 'test1', na_rep='88') + recons = read_excel(path, 'test1', index_col=0, na_values=['88']) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(path, 'test1', na_rep='88') + recons = read_excel(path, 'test1', index_col=0, + na_values=[88, 88.0]) + 
tm.assert_frame_equal(self.frame, recons) + + # GH 6573 + self.frame.to_excel(path, 'Sheet1') + recons = read_excel(path, index_col=0) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(path, '0') + recons = read_excel(path, index_col=0) + tm.assert_frame_equal(self.frame, recons) + + def test_mixed(self): + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as path: + self.mixed_frame.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0) + tm.assert_frame_equal(self.mixed_frame, recons) + + def test_tsframe(self): + _skip_if_no_xlrd() + + df = tm.makeTimeDataFrame()[:5] + + with ensure_clean(self.ext) as path: + df.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + tm.assert_frame_equal(df, recons) + + def test_basics_with_nan(self): + _skip_if_no_xlrd() + with ensure_clean(self.ext) as path: + self.frame['A'][:5] = nan + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', columns=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + + def test_int_types(self): + _skip_if_no_xlrd() + + for np_type in (np.int8, np.int16, np.int32, np.int64): + + with ensure_clean(self.ext) as path: + # Test np.int values read come back as int (rather than float + # which is Excel's format). + frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), + dtype=np_type) + frame.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + int_frame = frame.astype(np.int64) + tm.assert_frame_equal(int_frame, recons) + recons2 = read_excel(path, 'test1') + tm.assert_frame_equal(int_frame, recons2) + + # test with convert_float=False comes back as float + float_frame = frame.astype(float) + recons = read_excel(path, 'test1', convert_float=False) + tm.assert_frame_equal(recons, float_frame) + + def test_float_types(self): + _skip_if_no_xlrd() + + for np_type in (np.float16, np.float32, np.float64): + with ensure_clean(self.ext) as path: + # Test np.float values read come back as float. + frame = DataFrame(np.random.random_sample(10), dtype=np_type) + frame.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1').astype(np_type) + tm.assert_frame_equal(frame, recons, check_dtype=False) + + def test_bool_types(self): + _skip_if_no_xlrd() + + for np_type in (np.bool8, np.bool_): + with ensure_clean(self.ext) as path: + # Test np.bool values read come back as float. 
+ frame = (DataFrame([1, 0, True, False], dtype=np_type)) + frame.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1').astype(np_type) + tm.assert_frame_equal(frame, recons) + + def test_inf_roundtrip(self): + _skip_if_no_xlrd() + + frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + with ensure_clean(self.ext) as path: + frame.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + tm.assert_frame_equal(frame, recons) + + def test_sheets(self): + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as path: + self.frame['A'][:5] = nan + + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', columns=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + + # Test writing to separate sheets + writer = ExcelWriter(path) + self.frame.to_excel(writer, 'test1') + self.tsframe.to_excel(writer, 'test2') + writer.save() + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0) + tm.assert_frame_equal(self.frame, recons) + recons = reader.parse('test2', index_col=0) + tm.assert_frame_equal(self.tsframe, recons) + np.testing.assert_equal(2, len(reader.sheet_names)) + np.testing.assert_equal('test1', reader.sheet_names[0]) + np.testing.assert_equal('test2', reader.sheet_names[1]) + + def test_colaliases(self): + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as path: + self.frame['A'][:5] = nan + + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', columns=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(path, 'test1', header=col_aliases) + reader = ExcelFile(path) + rs = reader.parse('test1', index_col=0) + xp = self.frame2.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + def test_roundtrip_indexlabels(self): + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as path: + + self.frame['A'][:5] = nan + + self.frame.to_excel(path, 'test1') + self.frame.to_excel(path, 'test1', columns=['A', 'B']) + self.frame.to_excel(path, 'test1', header=False) + self.frame.to_excel(path, 'test1', index=False) + + # test index_label + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(path, 'test1', + index_label=['test'], + merge_cells=self.merge_cells) + reader = ExcelFile(path) + recons = reader.parse('test1', + index_col=0, + has_index_names=self.merge_cells + ).astype(np.int64) + frame.index.names = ['test'] + self.assertEqual(frame.index.names, recons.index.names) + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(path, + 'test1', + index_label=['test', 'dummy', 'dummy2'], + merge_cells=self.merge_cells) + reader = ExcelFile(path) + recons = reader.parse('test1', + index_col=0, + has_index_names=self.merge_cells + ).astype(np.int64) + frame.index.names = ['test'] + self.assertEqual(frame.index.names, recons.index.names) + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(path, + 'test1', + index_label='test', + merge_cells=self.merge_cells) + reader = ExcelFile(path) + recons = reader.parse('test1', + index_col=0, + has_index_names=self.merge_cells + ).astype(np.int64) + frame.index.names = ['test'] + tm.assert_frame_equal(frame, recons.astype(bool)) + + with ensure_clean(self.ext) as path: + + self.frame.to_excel(path, + 'test1', + columns=['A', 'B', 'C', 'D'], + index=False, merge_cells=self.merge_cells) + # 
take 'A' and 'B' as indexes (same row as cols 'C', 'D') + df = self.frame.copy() + df = df.set_index(['A', 'B']) + + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=[0, 1]) + tm.assert_frame_equal(df, recons, check_less_precise=True) + + def test_excel_roundtrip_indexname(self): + _skip_if_no_xlrd() + + df = DataFrame(np.random.randn(10, 4)) + df.index.name = 'foo' + + with ensure_clean(self.ext) as path: + df.to_excel(path, merge_cells=self.merge_cells) + + xf = ExcelFile(path) + result = xf.parse(xf.sheet_names[0], + index_col=0, + has_index_names=self.merge_cells) + + tm.assert_frame_equal(result, df) + self.assertEqual(result.index.name, 'foo') + + def test_excel_roundtrip_datetime(self): + _skip_if_no_xlrd() + + # datetime.date, not sure what to test here exactly + tsf = self.tsframe.copy() + with ensure_clean(self.ext) as path: + + tsf.index = [x.date() for x in self.tsframe.index] + tsf.to_excel(path, 'test1', merge_cells=self.merge_cells) + reader = ExcelFile(path) + recons = reader.parse('test1') + tm.assert_frame_equal(self.tsframe, recons) + + # GH4133 - excel output format strings + def test_excel_date_datetime_format(self): + _skip_if_no_xlrd() + df = DataFrame([[date(2014, 1, 31), + date(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), + datetime(2014, 2, 28, 13, 5, 13)]], + index=['DATE', 'DATETIME'], columns=['X', 'Y']) + df_expected = DataFrame([[datetime(2014, 1, 31), + datetime(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), + datetime(2014, 2, 28, 13, 5, 13)]], + index=['DATE', 'DATETIME'], columns=['X', 'Y']) + + with ensure_clean(self.ext) as filename1: + with ensure_clean(self.ext) as filename2: + writer1 = ExcelWriter(filename1) + writer2 = ExcelWriter(filename2, + date_format='DD.MM.YYYY', + datetime_format='DD.MM.YYYY HH-MM-SS') + + df.to_excel(writer1, 'test1') + df.to_excel(writer2, 'test1') + + writer1.close() + writer2.close() + + reader1 = ExcelFile(filename1) + reader2 = ExcelFile(filename2) + + rs1 = reader1.parse('test1', index_col=None) + rs2 = reader2.parse('test1', index_col=None) + + tm.assert_frame_equal(rs1, rs2) + + # since the reader returns a datetime object for dates, we need + # to use df_expected to check the result + tm.assert_frame_equal(rs2, df_expected) + + def test_to_excel_periodindex(self): + _skip_if_no_xlrd() + + frame = self.tsframe + xp = frame.resample('M', kind='period') + + with ensure_clean(self.ext) as path: + xp.to_excel(path, 'sht1') + + reader = ExcelFile(path) + rs = reader.parse('sht1', index_col=0, parse_dates=True) + tm.assert_frame_equal(xp, rs.to_period('M')) + + def test_to_excel_multiindex(self): + _skip_if_no_xlrd() + + frame = self.frame + arrays = np.arange(len(frame.index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, + names=['first', 'second']) + frame.index = new_index + + with ensure_clean(self.ext) as path: + frame.to_excel(path, 'test1', header=False) + frame.to_excel(path, 'test1', columns=['A', 'B']) + + # round trip + frame.to_excel(path, 'test1', merge_cells=self.merge_cells) + reader = ExcelFile(path) + df = reader.parse('test1', index_col=[0, 1], + parse_dates=False, + has_index_names=self.merge_cells) + tm.assert_frame_equal(frame, df) + self.assertEqual(frame.index.names, df.index.names) + + def test_to_excel_multiindex_dates(self): + _skip_if_no_xlrd() + + # try multiindex with dates + tsframe = self.tsframe.copy() + new_index = [tsframe.index, np.arange(len(tsframe.index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + with 
ensure_clean(self.ext) as path: + tsframe.index.names = ['time', 'foo'] + tsframe.to_excel(path, 'test1', merge_cells=self.merge_cells) + reader = ExcelFile(path) + recons = reader.parse('test1', + index_col=[0, 1], + has_index_names=self.merge_cells) + + tm.assert_frame_equal(tsframe, recons) + self.assertEqual(recons.index.names, ('time', 'foo')) + + def test_to_excel_multiindex_no_write_index(self): + _skip_if_no_xlrd() + + # Test writing and re-reading a MI witout the index. GH 5616. + + # Initial non-MI frame. + frame1 = pd.DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) + + # Add a MI. + frame2 = frame1.copy() + multi_index = pd.MultiIndex.from_tuples([(70, 80), (90, 100)]) + frame2.index = multi_index + + with ensure_clean(self.ext) as path: + + # Write out to Excel without the index. + frame2.to_excel(path, 'test1', index=False) + + # Read it back in. + reader = ExcelFile(path) + frame3 = reader.parse('test1') + + # Test that it is the same as the initial frame. + tm.assert_frame_equal(frame1, frame3) + + def test_to_excel_float_format(self): + _skip_if_no_xlrd() + + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + + with ensure_clean(self.ext) as filename: + df.to_excel(filename, 'test1', float_format='%.2f') + + reader = ExcelFile(filename) + rs = reader.parse('test1', index_col=None) + xp = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + tm.assert_frame_equal(rs, xp) + + def test_to_excel_output_encoding(self): + _skip_if_no_xlrd() + ext = self.ext + filename = '__tmp_to_excel_float_format__.' + ext + df = DataFrame([[u('\u0192'), u('\u0193'), u('\u0194')], + [u('\u0195'), u('\u0196'), u('\u0197')]], + index=[u('A\u0192'), 'B'], columns=[u('X\u0193'), 'Y', 'Z']) + + with ensure_clean(filename) as filename: + df.to_excel(filename, sheet_name='TestSheet', encoding='utf8') + result = read_excel(filename, 'TestSheet', encoding='utf8') + tm.assert_frame_equal(result, df) + + def test_to_excel_unicode_filename(self): + _skip_if_no_xlrd() + with ensure_clean(u('\u0192u.') + self.ext) as filename: + try: + f = open(filename, 'wb') + except UnicodeEncodeError: + raise nose.SkipTest('no unicode file names on this system') + else: + f.close() + + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + + df.to_excel(filename, 'test1', float_format='%.2f') + + reader = ExcelFile(filename) + rs = reader.parse('test1', index_col=None) + xp = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + tm.assert_frame_equal(rs, xp) + + # def test_to_excel_header_styling_xls(self): + + # import StringIO + # s = StringIO( + # """Date,ticker,type,value + # 2001-01-01,x,close,12.2 + # 2001-01-01,x,open ,12.1 + # 2001-01-01,y,close,12.2 + # 2001-01-01,y,open ,12.1 + # 2001-02-01,x,close,12.2 + # 2001-02-01,x,open ,12.1 + # 2001-02-01,y,close,12.2 + # 2001-02-01,y,open ,12.1 + # 2001-03-01,x,close,12.2 + # 2001-03-01,x,open ,12.1 + # 2001-03-01,y,close,12.2 + # 2001-03-01,y,open ,12.1""") + # df = read_csv(s, parse_dates=["Date"]) + # pdf = df.pivot_table(values="value", rows=["ticker"], + # cols=["Date", "type"]) + + # try: + # import xlwt + # import xlrd + # except ImportError: + # raise nose.SkipTest + + # filename = '__tmp_to_excel_header_styling_xls__.xls' + # pdf.to_excel(filename, 'test1') + + # wbk = 
xlrd.open_workbook(filename, + # formatting_info=True) + # self.assertEqual(["test1"], wbk.sheet_names()) + # ws = wbk.sheet_by_name('test1') + # self.assertEqual([(0, 1, 5, 7), (0, 1, 3, 5), (0, 1, 1, 3)], + # ws.merged_cells) + # for i in range(0, 2): + # for j in range(0, 7): + # xfx = ws.cell_xf_index(0, 0) + # cell_xf = wbk.xf_list[xfx] + # font = wbk.font_list + # self.assertEqual(1, font[cell_xf.font_index].bold) + # self.assertEqual(1, cell_xf.border.top_line_style) + # self.assertEqual(1, cell_xf.border.right_line_style) + # self.assertEqual(1, cell_xf.border.bottom_line_style) + # self.assertEqual(1, cell_xf.border.left_line_style) + # self.assertEqual(2, cell_xf.alignment.hor_align) + # os.remove(filename) + # def test_to_excel_header_styling_xlsx(self): + # import StringIO + # s = StringIO( + # """Date,ticker,type,value + # 2001-01-01,x,close,12.2 + # 2001-01-01,x,open ,12.1 + # 2001-01-01,y,close,12.2 + # 2001-01-01,y,open ,12.1 + # 2001-02-01,x,close,12.2 + # 2001-02-01,x,open ,12.1 + # 2001-02-01,y,close,12.2 + # 2001-02-01,y,open ,12.1 + # 2001-03-01,x,close,12.2 + # 2001-03-01,x,open ,12.1 + # 2001-03-01,y,close,12.2 + # 2001-03-01,y,open ,12.1""") + # df = read_csv(s, parse_dates=["Date"]) + # pdf = df.pivot_table(values="value", rows=["ticker"], + # cols=["Date", "type"]) + # try: + # import openpyxl + # from openpyxl.cell import get_column_letter + # except ImportError: + # raise nose.SkipTest + # if openpyxl.__version__ < '1.6.1': + # raise nose.SkipTest + # # test xlsx_styling + # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx' + # pdf.to_excel(filename, 'test1') + # wbk = openpyxl.load_workbook(filename) + # self.assertEqual(["test1"], wbk.get_sheet_names()) + # ws = wbk.get_sheet_by_name('test1') + # xlsaddrs = ["%s2" % chr(i) for i in range(ord('A'), ord('H'))] + # xlsaddrs += ["A%s" % i for i in range(1, 6)] + # xlsaddrs += ["B1", "D1", "F1"] + # for xlsaddr in xlsaddrs: + # cell = ws.cell(xlsaddr) + # self.assertTrue(cell.style.font.bold) + # self.assertEqual(openpyxl.style.Border.BORDER_THIN, + # cell.style.borders.top.border_style) + # self.assertEqual(openpyxl.style.Border.BORDER_THIN, + # cell.style.borders.right.border_style) + # self.assertEqual(openpyxl.style.Border.BORDER_THIN, + # cell.style.borders.bottom.border_style) + # self.assertEqual(openpyxl.style.Border.BORDER_THIN, + # cell.style.borders.left.border_style) + # self.assertEqual(openpyxl.style.Alignment.HORIZONTAL_CENTER, + # cell.style.alignment.horizontal) + # mergedcells_addrs = ["C1", "E1", "G1"] + # for maddr in mergedcells_addrs: + # self.assertTrue(ws.cell(maddr).merged) + # os.remove(filename) + + def test_excel_010_hemstring(self): + _skip_if_no_xlrd() + + if self.merge_cells: + raise nose.SkipTest('Skip tests for merged MI format.') + + from pandas.util.testing import makeCustomDataframe as mkdf + # ensure limited functionality in 0.10 + # override of #2370 until sorted out in 0.11 + + def roundtrip(df, header=True, parser_hdr=0): + + with ensure_clean(self.ext) as path: + df.to_excel(path, header=header, merge_cells=self.merge_cells) + xf = pd.ExcelFile(path) + res = xf.parse(xf.sheet_names[0], header=parser_hdr) + return res + + nrows = 5 + ncols = 3 + + for i in range(1, 4): # row multindex upto nlevel=3 + for j in range(1, 4): # col "" + df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) + res = roundtrip(df) + # shape + self.assertEqual(res.shape, (nrows, ncols + i)) + + # no nans + for r in range(len(res.index)): + for c in range(len(res.columns)): + 
self.assertTrue(res.ix[r, c] is not np.nan) + + for i in range(1, 4): # row multindex upto nlevel=3 + for j in range(1, 4): # col "" + df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) + res = roundtrip(df, False) + # shape + self.assertEqual(res.shape, ( + nrows - 1, ncols + i)) # first row taken as columns + + # no nans + for r in range(len(res.index)): + for c in range(len(res.columns)): + self.assertTrue(res.ix[r, c] is not np.nan) + + res = roundtrip(DataFrame([0])) + self.assertEqual(res.shape, (1, 1)) + self.assertTrue(res.ix[0, 0] is not np.nan) + + res = roundtrip(DataFrame([0]), False, None) + self.assertEqual(res.shape, (1, 2)) + self.assertTrue(res.ix[0, 0] is not np.nan) + + def test_duplicated_columns(self): + # Test for issue #5235. + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as path: + write_frame = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]]) + colnames = ['A', 'B', 'B'] + + write_frame.columns = colnames + write_frame.to_excel(path, 'test1') + + read_frame = read_excel(path, 'test1') + read_frame.columns = colnames + + tm.assert_frame_equal(write_frame, read_frame) + + def test_swapped_columns(self): + # Test for issue #5427. + _skip_if_no_xlrd() + + with ensure_clean(self.ext) as path: + write_frame = DataFrame({'A': [1, 1, 1], + 'B': [2, 2, 2]}) + write_frame.to_excel(path, 'test1', columns=['B', 'A']) + + read_frame = read_excel(path, 'test1', header=0) + + tm.assert_series_equal(write_frame['A'], read_frame['A']) + tm.assert_series_equal(write_frame['B'], read_frame['B']) + + +def raise_wrapper(orig_method): + @functools.wraps(orig_method) + def wrapped(self, *args, **kwargs): + _skip_if_no_openpyxl() + if openpyxl_compat.is_compat(): + orig_method(self, *args, **kwargs) + else: + msg = 'Installed openpyxl is not supported at this time\. 
Use.+' + with tm.assertRaisesRegexp(ValueError, msg): + orig_method(self, *args, **kwargs) + return wrapped + + +def raise_on_incompat_version(cls): + methods = filter(operator.methodcaller('startswith', 'test_'), dir(cls)) + for method in methods: + setattr(cls, method, raise_wrapper(getattr(cls, method))) + return cls + + +@raise_on_incompat_version +class OpenpyxlTests(ExcelWriterBase, tm.TestCase): + ext = '.xlsx' + engine_name = 'openpyxl' + check_skip = staticmethod(lambda *args, **kwargs: None) + + def test_to_excel_styleconverter(self): + _skip_if_no_openpyxl() + if not openpyxl_compat.is_compat(): + raise nose.SkipTest('incompatiable openpyxl version') + + import openpyxl + + hstyle = {"font": {"bold": True}, + "borders": {"top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}} + + xlsx_style = _OpenpyxlWriter._convert_to_style(hstyle) + self.assertTrue(xlsx_style.font.bold) + self.assertEqual(openpyxl.style.Border.BORDER_THIN, + xlsx_style.borders.top.border_style) + self.assertEqual(openpyxl.style.Border.BORDER_THIN, + xlsx_style.borders.right.border_style) + self.assertEqual(openpyxl.style.Border.BORDER_THIN, + xlsx_style.borders.bottom.border_style) + self.assertEqual(openpyxl.style.Border.BORDER_THIN, + xlsx_style.borders.left.border_style) + self.assertEqual(openpyxl.style.Alignment.HORIZONTAL_CENTER, + xlsx_style.alignment.horizontal) + self.assertEqual(openpyxl.style.Alignment.VERTICAL_TOP, + xlsx_style.alignment.vertical) + + +class XlwtTests(ExcelWriterBase, tm.TestCase): + ext = '.xls' + engine_name = 'xlwt' + check_skip = staticmethod(_skip_if_no_xlwt) + + def test_to_excel_styleconverter(self): + _skip_if_no_xlwt() + + import xlwt + + hstyle = {"font": {"bold": True}, + "borders": {"top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}} + + xls_style = _XlwtWriter._convert_to_style(hstyle) + self.assertTrue(xls_style.font.bold) + self.assertEqual(xlwt.Borders.THIN, xls_style.borders.top) + self.assertEqual(xlwt.Borders.THIN, xls_style.borders.right) + self.assertEqual(xlwt.Borders.THIN, xls_style.borders.bottom) + self.assertEqual(xlwt.Borders.THIN, xls_style.borders.left) + self.assertEqual(xlwt.Alignment.HORZ_CENTER, xls_style.alignment.horz) + self.assertEqual(xlwt.Alignment.VERT_TOP, xls_style.alignment.vert) + + +class XlsxWriterTests(ExcelWriterBase, tm.TestCase): + ext = '.xlsx' + engine_name = 'xlsxwriter' + check_skip = staticmethod(_skip_if_no_xlsxwriter) + + +@raise_on_incompat_version +class OpenpyxlTests_NoMerge(ExcelWriterBase, tm.TestCase): + ext = '.xlsx' + engine_name = 'openpyxl' + check_skip = staticmethod(_skip_if_no_openpyxl) + + # Test < 0.13 non-merge behaviour for MultiIndex and Hierarchical Rows. + merge_cells = False + + +class XlwtTests_NoMerge(ExcelWriterBase, tm.TestCase): + ext = '.xls' + engine_name = 'xlwt' + check_skip = staticmethod(_skip_if_no_xlwt) + + # Test < 0.13 non-merge behaviour for MultiIndex and Hierarchical Rows. + merge_cells = False + + +class XlsxWriterTests_NoMerge(ExcelWriterBase, tm.TestCase): + ext = '.xlsx' + engine_name = 'xlsxwriter' + check_skip = staticmethod(_skip_if_no_xlsxwriter) + + # Test < 0.13 non-merge behaviour for MultiIndex and Hierarchical Rows. 
+ merge_cells = False + + +class ExcelWriterEngineTests(tm.TestCase): + def test_ExcelWriter_dispatch(self): + with tm.assertRaisesRegexp(ValueError, 'No engine'): + ExcelWriter('nothing') + + try: + import xlsxwriter + writer_klass = _XlsxWriter + except ImportError: + _skip_if_no_openpyxl() + if not openpyxl_compat.is_compat(): + raise nose.SkipTest('incompatible openpyxl version') + writer_klass = _OpenpyxlWriter + + with ensure_clean('.xlsx') as path: + writer = ExcelWriter(path) + tm.assert_isinstance(writer, writer_klass) + + _skip_if_no_xlwt() + with ensure_clean('.xls') as path: + writer = ExcelWriter(path) + tm.assert_isinstance(writer, _XlwtWriter) + + def test_register_writer(self): + # some awkward mocking to test out dispatch and such actually works + called_save = [] + called_write_cells = [] + + class DummyClass(ExcelWriter): + called_save = False + called_write_cells = False + supported_extensions = ['test', 'xlsx', 'xls'] + engine = 'dummy' + + def save(self): + called_save.append(True) + + def write_cells(self, *args, **kwargs): + called_write_cells.append(True) + + def check_called(func): + func() + self.assertTrue(len(called_save) >= 1) + self.assertTrue(len(called_write_cells) >= 1) + del called_save[:] + del called_write_cells[:] + + register_writer(DummyClass) + writer = ExcelWriter('something.test') + tm.assert_isinstance(writer, DummyClass) + df = tm.makeCustomDataframe(1, 1) + panel = tm.makePanel() + func = lambda: df.to_excel('something.test') + check_called(func) + check_called(lambda: panel.to_excel('something.test')) + val = get_option('io.excel.xlsx.writer') + set_option('io.excel.xlsx.writer', 'dummy') + check_called(lambda: df.to_excel('something.xlsx')) + check_called(lambda: df.to_excel('something.xls', engine='dummy')) + set_option('io.excel.xlsx.writer', val) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_ga.py b/pandas/io/tests/test_ga.py new file mode 100644 index 00000000..cc26411e --- /dev/null +++ b/pandas/io/tests/test_ga.py @@ -0,0 +1,185 @@ +import os +from datetime import datetime + +import nose +import pandas as pd +from pandas import DataFrame +from pandas.util.testing import network, assert_frame_equal, with_connectivity_check +from numpy.testing.decorators import slow +import pandas.util.testing as tm + +try: + import httplib2 + import pandas.io.ga as ga + from pandas.io.ga import GAnalytics, read_ga + from pandas.io.auth import AuthenticationConfigError, reset_default_token_store + from pandas.io import auth +except ImportError: + raise nose.SkipTest("need httplib2 and auth libs") + +class TestGoogle(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_remove_token_store(self): + auth.DEFAULT_TOKEN_FILE = 'test.dat' + with open(auth.DEFAULT_TOKEN_FILE, 'w') as fh: + fh.write('test') + + reset_default_token_store() + self.assertFalse(os.path.exists(auth.DEFAULT_TOKEN_FILE)) + + @slow + @network + def test_getdata(self): + try: + end_date = datetime.now() + start_date = end_date - pd.offsets.Day() * 5 + end_date = end_date.strftime('%Y-%m-%d') + start_date = start_date.strftime('%Y-%m-%d') + + reader = GAnalytics() + df = reader.get_data( + metrics=['avgTimeOnSite', 'visitors', 'newVisits', + 'pageviewsPerVisit'], + start_date=start_date, + end_date=end_date, + dimensions=['date', 'hour'], + parse_dates={'ts': ['date', 'hour']}) + + assert isinstance(df, DataFrame) + assert isinstance(df.index, pd.DatetimeIndex) + assert 
len(df) > 1 + assert 'date' not in df + assert 'hour' not in df + assert df.index.name == 'ts' + assert 'avgTimeOnSite' in df + assert 'visitors' in df + assert 'newVisits' in df + assert 'pageviewsPerVisit' in df + + df2 = read_ga( + metrics=['avgTimeOnSite', 'visitors', 'newVisits', + 'pageviewsPerVisit'], + start_date=start_date, + end_date=end_date, + dimensions=['date', 'hour'], + parse_dates={'ts': ['date', 'hour']}) + + assert_frame_equal(df, df2) + + except AuthenticationConfigError: + raise nose.SkipTest("authentication error") + + @slow + @with_connectivity_check("http://www.google.com") + def test_iterator(self): + try: + reader = GAnalytics() + + it = reader.get_data( + metrics='visitors', + start_date='2005-1-1', + dimensions='date', + max_results=10, chunksize=5) + + df1 = next(it) + df2 = next(it) + + for df in [df1, df2]: + assert isinstance(df, DataFrame) + assert isinstance(df.index, pd.DatetimeIndex) + assert len(df) == 5 + assert 'date' not in df + assert df.index.name == 'date' + assert 'visitors' in df + + assert (df2.index > df1.index).all() + + except AuthenticationConfigError: + raise nose.SkipTest("authentication error") + + def test_v2_advanced_segment_format(self): + advanced_segment_id = 1234567 + query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) + assert query['segment'] == 'gaid::' + str(advanced_segment_id), "An integer value should be formatted as an advanced segment." + + def test_v2_dynamic_segment_format(self): + dynamic_segment_id = 'medium==referral' + query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=dynamic_segment_id) + assert query['segment'] == 'dynamic::ga:' + str(dynamic_segment_id), "A string value with more than just letters and numbers should be formatted as a dynamic segment." + + def test_v3_advanced_segment_common_format(self): + advanced_segment_id = 'aZwqR234' + query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) + assert query['segment'] == 'gaid::' + str(advanced_segment_id), "A string value with just letters and numbers should be formatted as an advanced segment." + + def test_v3_advanced_segment_weird_format(self): + advanced_segment_id = '_aZwqR234-s1' + query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) + assert query['segment'] == 'gaid::' + str(advanced_segment_id), "A string value with just letters, numbers, and hyphens should be formatted as an advanced segment." + + def test_v3_advanced_segment_with_underscore_format(self): + advanced_segment_id = 'aZwqR234_s1' + query = ga.format_query('google_profile_id', ['visits'], '2013-09-01', segment=advanced_segment_id) + assert query['segment'] == 'gaid::' + str(advanced_segment_id), "A string value with just letters, numbers, and underscores should be formatted as an advanced segment." 
+ + + @slow + @with_connectivity_check("http://www.google.com") + def test_segment(self): + try: + end_date = datetime.now() + start_date = end_date - pd.offsets.Day() * 5 + end_date = end_date.strftime('%Y-%m-%d') + start_date = start_date.strftime('%Y-%m-%d') + + reader = GAnalytics() + df = reader.get_data( + metrics=['avgTimeOnSite', 'visitors', 'newVisits', + 'pageviewsPerVisit'], + start_date=start_date, + end_date=end_date, + segment=-2, + dimensions=['date', 'hour'], + parse_dates={'ts': ['date', 'hour']}) + + assert isinstance(df, DataFrame) + assert isinstance(df.index, pd.DatetimeIndex) + assert len(df) > 1 + assert 'date' not in df + assert 'hour' not in df + assert df.index.name == 'ts' + assert 'avgTimeOnSite' in df + assert 'visitors' in df + assert 'newVisits' in df + assert 'pageviewsPerVisit' in df + + #dynamic + df = read_ga( + metrics=['avgTimeOnSite', 'visitors', 'newVisits', + 'pageviewsPerVisit'], + start_date=start_date, + end_date=end_date, + segment="source=~twitter", + dimensions=['date', 'hour'], + parse_dates={'ts': ['date', 'hour']}) + + assert isinstance(df, DataFrame) + assert isinstance(df.index, pd.DatetimeIndex) + assert len(df) > 1 + assert 'date' not in df + assert 'hour' not in df + assert df.index.name == 'ts' + assert 'avgTimeOnSite' in df + assert 'visitors' in df + assert 'newVisits' in df + assert 'pageviewsPerVisit' in df + + except AuthenticationConfigError: + raise nose.SkipTest("authentication error") + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py new file mode 100644 index 00000000..0f595f75 --- /dev/null +++ b/pandas/io/tests/test_gbq.py @@ -0,0 +1,290 @@ +import ast +import datetime +import json +import nose +import os +import pytz +import shutil +import subprocess +import sys +import platform +from time import sleep + +import numpy as np + +from pandas import NaT +from pandas.compat import u +from pandas.core.frame import DataFrame +import pandas.io.gbq as gbq +import pandas.util.testing as tm + +PROJECT_ID = None + +VERSION = platform.python_version() + +def missing_bq(): + try: + subprocess.call('bq') + return False + except OSError: + return True + +def test_requirements(): + try: + gbq._test_imports() + except (ImportError, NotImplementedError) as import_exception: + raise nose.SkipTest(import_exception) + +class TestGBQConnectorIntegration(tm.TestCase): + def setUp(self): + test_requirements() + + if not PROJECT_ID: + raise nose.SkipTest("Cannot run integration tests without a project id") + + self.sut = gbq.GbqConnector(PROJECT_ID) + + def test_should_be_able_to_make_a_connector(self): + self.assertTrue(self.sut is not None, 'Could not create a GbqConnector') + + def test_should_be_able_to_get_valid_credentials(self): + credentials = self.sut.get_credentials() + self.assertFalse(credentials.invalid, 'Returned credentials invalid') + + def test_should_be_able_to_get_a_bigquery_service(self): + credentials = self.sut.get_credentials() + bigquery_service = self.sut.get_service(credentials) + self.assertTrue(bigquery_service is not None, 'No service returned') + + def test_should_be_able_to_get_schema_from_query(self): + schema, pages = self.sut.run_query('SELECT 1') + self.assertTrue(schema is not None) + + def test_should_be_able_to_get_results_from_query(self): + schema, pages = self.sut.run_query('SELECT 1') + self.assertTrue(pages is not None) + +class TestReadGBQUnitTests(tm.TestCase): + 
def setUp(self): + test_requirements() + + def test_should_return_bigquery_integers_as_python_floats(self): + result = gbq._parse_entry(1, 'INTEGER') + tm.assert_equal(result, float(1)) + + def test_should_return_bigquery_floats_as_python_floats(self): + result = gbq._parse_entry(1, 'FLOAT') + tm.assert_equal(result, float(1)) + + def test_should_return_bigquery_timestamps_as_numpy_datetime(self): + result = gbq._parse_entry('0e9', 'TIMESTAMP') + tm.assert_equal(result, np.datetime64('1970-01-01T00:00:00Z')) + + def test_should_return_bigquery_booleans_as_python_booleans(self): + result = gbq._parse_entry('false', 'BOOLEAN') + tm.assert_equal(result, False) + + def test_should_return_bigquery_strings_as_python_strings(self): + result = gbq._parse_entry('STRING', 'STRING') + tm.assert_equal(result, 'STRING') + + def test_to_gbq_should_fail_if_invalid_table_name_passed(self): + with tm.assertRaises(gbq.NotFoundException): + gbq.to_gbq(DataFrame(), 'invalid_table_name', project_id="1234") + + def test_to_gbq_with_no_project_id_given_should_fail(self): + with tm.assertRaises(TypeError): + gbq.to_gbq(DataFrame(), 'dataset.tablename') + + def test_read_gbq_with_no_project_id_given_should_fail(self): + with tm.assertRaises(TypeError): + gbq.read_gbq('SELECT "1" as NUMBER_1') + + def test_that_parse_data_works_properly(self): + test_schema = {'fields': [{'mode': 'NULLABLE', + 'name': 'VALID_STRING', + 'type': 'STRING'}]} + test_page = [{'f': [{'v': 'PI'}]}] + + test_output = gbq._parse_data(test_schema, test_page) + correct_output = DataFrame({'VALID_STRING' : ['PI']}) + tm.assert_frame_equal(test_output, correct_output) + +class TestReadGBQIntegration(tm.TestCase): + def setUp(self): + test_requirements() + + if not PROJECT_ID: + raise nose.SkipTest("Cannot run integration tests without a project id") + + def test_should_properly_handle_valid_strings(self): + query = 'SELECT "PI" as VALID_STRING' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'VALID_STRING' : ['PI']})) + + def test_should_properly_handle_empty_strings(self): + query = 'SELECT "" as EMPTY_STRING' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING' : [""]})) + + def test_should_properly_handle_null_strings(self): + query = 'SELECT STRING(NULL) as NULL_STRING' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'NULL_STRING' : [None]})) + + def test_should_properly_handle_valid_integers(self): + query = 'SELECT INTEGER(3) as VALID_INTEGER' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'VALID_INTEGER' : [3]})) + + def test_should_properly_handle_null_integers(self): + query = 'SELECT INTEGER(NULL) as NULL_INTEGER' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER' : [np.nan]})) + + def test_should_properly_handle_valid_floats(self): + query = 'SELECT PI() as VALID_FLOAT' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'VALID_FLOAT' : [3.141592653589793]})) + + def test_should_properly_handle_null_floats(self): + query = 'SELECT FLOAT(NULL) as NULL_FLOAT' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT' : [np.nan]})) + + def test_should_properly_handle_timestamp_unix_epoch(self): + query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + 
tm.assert_frame_equal(df, DataFrame({'UNIX_EPOCH' : [np.datetime64('1970-01-01T00:00:00.000000Z')]})) + + def test_should_properly_handle_arbitrary_timestamp(self): + query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'VALID_TIMESTAMP' : [np.datetime64('2004-09-15T05:00:00.000000Z')]})) + + def test_should_properly_handle_null_timestamp(self): + query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP' :[NaT]})) + + def test_should_properly_handle_true_boolean(self): + query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN' : [True]})) + + def test_should_properly_handle_false_boolean(self): + query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN' : [False]})) + + def test_should_properly_handle_null_boolean(self): + query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN' + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN' : [None]})) + + def test_unicode_string_conversion_and_normalization(self): + correct_test_datatype = DataFrame( + {'UNICODE_STRING' : [u("\xe9\xfc")]} + ) + + query = 'SELECT "\xc3\xa9\xc3\xbc" as UNICODE_STRING' + + df = gbq.read_gbq(query, project_id=PROJECT_ID) + tm.assert_frame_equal(df, correct_test_datatype) + + def test_index_column(self): + query = "SELECT 'a' as STRING_1, 'b' as STRING_2" + result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col="STRING_1") + correct_frame = DataFrame({'STRING_1' : ['a'], 'STRING_2' : ['b']}).set_index("STRING_1") + tm.assert_equal(result_frame.index.name, correct_frame.index.name) + + def test_column_order(self): + query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" + col_order = ['STRING_3', 'STRING_1', 'STRING_2'] + result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, col_order=col_order) + correct_frame = DataFrame({'STRING_1' : ['a'], 'STRING_2' : ['b'], 'STRING_3' : ['c']})[col_order] + tm.assert_frame_equal(result_frame, correct_frame) + + def test_column_order_plus_index(self): + query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" + col_order = ['STRING_3', 'STRING_2'] + result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col='STRING_1', col_order=col_order) + correct_frame = DataFrame({'STRING_1' : ['a'], 'STRING_2' : ['b'], 'STRING_3' : ['c']}) + correct_frame.set_index('STRING_1', inplace=True) + correct_frame = correct_frame[col_order] + tm.assert_frame_equal(result_frame, correct_frame) + + def test_malformed_query(self): + with tm.assertRaises(gbq.InvalidQueryException): + gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", project_id=PROJECT_ID) + + def test_bad_project_id(self): + with tm.assertRaises(gbq.NotFoundException): + gbq.read_gbq("SELECT 1", project_id='001') + + def test_bad_table_name(self): + with tm.assertRaises(gbq.NotFoundException): + gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", project_id=PROJECT_ID) + + def test_download_dataset_larger_than_200k_rows(self): + # Test for known BigQuery bug in datasets larger than 100k rows + # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results + df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] GROUP EACH BY id 
ORDER BY id ASC LIMIT 200005", project_id=PROJECT_ID) + self.assertEqual(len(df.drop_duplicates()), 200005) + +class TestToGBQIntegration(tm.TestCase): + # This class requires bq.py to be installed for setup/teardown. + # It will also need to be preconfigured with a default dataset, + # so, be sure to `bq init` in terminal before running. + + def setUp(self): + test_requirements() + + if not PROJECT_ID: + raise nose.SkipTest("Cannot run integration tests without a project id") + if missing_bq(): + raise nose.SkipTest("Cannot run to_gbq tests without bq command line client") + + @classmethod + def setUpClass(cls): + if PROJECT_ID and not missing_bq(): + subprocess.call(['bq','mk','pydata_pandas_bq_testing']) + subprocess.call(['bq','mk','pydata_pandas_bq_testing.new_test','bools:BOOLEAN,flts:FLOAT,ints:INTEGER,strs:STRING,times:TIMESTAMP']) + + def test_upload_data(self): + test_size = 1000001 + #create df to test for all BQ datatypes except RECORD + bools = np.random.randint(2, size=(1,test_size)).astype(bool) + flts = np.random.randn(1,test_size) + ints = np.random.randint(1,10, size=(1,test_size)) + strs = np.random.randint(1,10, size=(1,test_size)).astype(str) + times = [datetime.datetime.now(pytz.timezone('US/Arizona')) for t in xrange(test_size)] + df = DataFrame({'bools':bools[0], 'flts':flts[0], 'ints':ints[0], 'strs':strs[0], 'times':times[0]}, index=range(test_size)) + gbq.to_gbq(df,"pydata_pandas_bq_testing.new_test", project_id=PROJECT_ID, chunksize=10000) + sleep(60) # <- Curses Google!!! + + result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM pydata_pandas_bq_testing.new_test", project_id=PROJECT_ID) + self.assertEqual(result['NUM_ROWS'][0], test_size) + + def test_google_upload_errors_should_raise_exception(self): + test_timestamp = datetime.datetime.now(pytz.timezone('US/Arizona')) + bad_df = DataFrame( {'bools': [False, False], + 'flts': [0.0,1.0], + 'ints': [0,'1'], + 'strs': ['a', 1], + 'times': [test_timestamp, test_timestamp] + }, index=range(2)) + with tm.assertRaises(gbq.UnknownGBQException): + gbq.to_gbq(bad_df, 'pydata_pandas_bq_testing.new_test', project_id = PROJECT_ID) + + + @classmethod + def tearDownClass(cls): + if PROJECT_ID and not missing_bq(): + subprocess.call(['bq','rm','-f','pydata_pandas_bq_testing.new_test']) + subprocess.call(['bq','rm','-f','pydata_pandas_bq_testing']) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py new file mode 100644 index 00000000..326b7bc0 --- /dev/null +++ b/pandas/io/tests/test_html.py @@ -0,0 +1,723 @@ +from __future__ import print_function + +import glob +import os +import re +import warnings + +try: + from importlib import import_module +except ImportError: + import_module = __import__ + +from distutils.version import LooseVersion + +import nose + +import numpy as np +from numpy.random import rand +from numpy.testing.decorators import slow + +from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, + date_range, Series) +from pandas.compat import map, zip, StringIO, string_types, BytesIO +from pandas.io.common import URLError, urlopen, file_path_to_url +from pandas.io.html import read_html +from pandas.parser import CParserError + +import pandas.util.testing as tm +from pandas.util.testing import makeCustomDataframe as mkdf, network + + +def _have_module(module_name): + try: + import_module(module_name) + return True + except ImportError: + return False + + +def 
_skip_if_no(module_name): + if not _have_module(module_name): + raise nose.SkipTest("{0!r} not found".format(module_name)) + + +def _skip_if_none_of(module_names): + if isinstance(module_names, string_types): + _skip_if_no(module_names) + if module_names == 'bs4': + import bs4 + if bs4.__version__ == LooseVersion('4.2.0'): + raise nose.SkipTest("Bad version of bs4: 4.2.0") + else: + not_found = [module_name for module_name in module_names if not + _have_module(module_name)] + if set(not_found) & set(module_names): + raise nose.SkipTest("{0!r} not found".format(not_found)) + if 'bs4' in module_names: + import bs4 + if bs4.__version__ == LooseVersion('4.2.0'): + raise nose.SkipTest("Bad version of bs4: 4.2.0") + + +DATA_PATH = tm.get_data_path() + + +def assert_framelist_equal(list1, list2, *args, **kwargs): + assert len(list1) == len(list2), ('lists are not of equal size ' + 'len(list1) == {0}, ' + 'len(list2) == {1}'.format(len(list1), + len(list2))) + msg = 'not all list elements are DataFrames' + both_frames = all(map(lambda x, y: isinstance(x, DataFrame) and + isinstance(y, DataFrame), list1, list2)) + assert both_frames, msg + for frame_i, frame_j in zip(list1, list2): + tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) + assert not frame_i.empty, 'frames are both empty' + + +def test_bs4_version_fails(): + _skip_if_none_of(('bs4', 'html5lib')) + import bs4 + if bs4.__version__ == LooseVersion('4.2.0'): + tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, + "spam.html"), + flavor='bs4') + + +class TestReadHtml(tm.TestCase): + @classmethod + def setUpClass(cls): + super(TestReadHtml, cls).setUpClass() + _skip_if_none_of(('bs4', 'html5lib')) + + def read_html(self, *args, **kwargs): + kwargs['flavor'] = kwargs.get('flavor', self.flavor) + return read_html(*args, **kwargs) + + def setup_data(self): + self.spam_data = os.path.join(DATA_PATH, 'spam.html') + self.banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + def setup_flavor(self): + self.flavor = 'bs4' + + def setUp(self): + self.setup_data() + self.setup_flavor() + + def test_to_html_compat(self): + df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, + r_idx_names=False).applymap('{0:.3f}'.format).astype(float) + out = df.to_html() + res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] + tm.assert_frame_equal(res, df) + + @network + def test_banklist_url(self): + url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' + df1 = self.read_html(url, 'First Federal Bank of Florida', + attrs={"id": 'table'}) + df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) + + assert_framelist_equal(df1, df2) + + @network + def test_spam_url(self): + url = ('http://ndb.nal.usda.gov/ndb/foods/show/1732?fg=&man=&' + 'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam') + df1 = self.read_html(url, '.*Water.*') + df2 = self.read_html(url, 'Unit') + + assert_framelist_equal(df1, df2) + + @slow + def test_banklist(self): + df1 = self.read_html(self.banklist_data, '.*Florida.*', + attrs={'id': 'table'}) + df2 = self.read_html(self.banklist_data, 'Metcalf Bank', + attrs={'id': 'table'}) + + assert_framelist_equal(df1, df2) + + def test_spam_no_types(self): + with tm.assert_produces_warning(FutureWarning): + df1 = self.read_html(self.spam_data, '.*Water.*', + infer_types=False) + with tm.assert_produces_warning(FutureWarning): + df2 = self.read_html(self.spam_data, 'Unit', infer_types=False) + + assert_framelist_equal(df1, df2) + + self.assertEqual(df1[0].ix[0, 0], 
'Proximates') + self.assertEqual(df1[0].columns[0], 'Nutrient') + + def test_spam_with_types(self): + df1 = self.read_html(self.spam_data, '.*Water.*') + df2 = self.read_html(self.spam_data, 'Unit') + assert_framelist_equal(df1, df2) + + self.assertEqual(df1[0].ix[0, 0], 'Proximates') + self.assertEqual(df1[0].columns[0], 'Nutrient') + + def test_spam_no_match(self): + dfs = self.read_html(self.spam_data) + for df in dfs: + tm.assert_isinstance(df, DataFrame) + + def test_banklist_no_match(self): + dfs = self.read_html(self.banklist_data, attrs={'id': 'table'}) + for df in dfs: + tm.assert_isinstance(df, DataFrame) + + def test_spam_header(self): + df = self.read_html(self.spam_data, '.*Water.*', header=1)[0] + self.assertEqual(df.columns[0], 'Proximates') + self.assertFalse(df.empty) + + def test_skiprows_int(self): + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + + assert_framelist_equal(df1, df2) + + def test_skiprows_xrange(self): + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0] + df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0] + tm.assert_frame_equal(df1, df2) + + def test_skiprows_list(self): + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1]) + + assert_framelist_equal(df1, df2) + + def test_skiprows_set(self): + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=set([1, 2])) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=set([2, 1])) + + assert_framelist_equal(df1, df2) + + def test_skiprows_slice(self): + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + + assert_framelist_equal(df1, df2) + + def test_skiprows_slice_short(self): + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2)) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2)) + + assert_framelist_equal(df1, df2) + + def test_skiprows_slice_long(self): + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5)) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1)) + + assert_framelist_equal(df1, df2) + + def test_skiprows_ndarray(self): + df1 = self.read_html(self.spam_data, '.*Water.*', + skiprows=np.arange(2)) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2)) + + assert_framelist_equal(df1, df2) + + def test_skiprows_invalid(self): + with tm.assertRaisesRegexp(TypeError, + 'is not a valid type for skipping rows'): + self.read_html(self.spam_data, '.*Water.*', skiprows='asdf') + + def test_index(self): + df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) + df2 = self.read_html(self.spam_data, 'Unit', index_col=0) + assert_framelist_equal(df1, df2) + + def test_header_and_index_no_types(self): + with tm.assert_produces_warning(FutureWarning): + df1 = self.read_html(self.spam_data, '.*Water.*', header=1, + index_col=0, infer_types=False) + with tm.assert_produces_warning(FutureWarning): + df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0, + infer_types=False) + assert_framelist_equal(df1, df2) + + def test_header_and_index_with_types(self): + df1 = self.read_html(self.spam_data, '.*Water.*', header=1, + index_col=0) + df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) + assert_framelist_equal(df1, df2) + + def test_infer_types(self): + with tm.assert_produces_warning(FutureWarning): + df1 = 
self.read_html(self.spam_data, '.*Water.*', index_col=0, + infer_types=False) + with tm.assert_produces_warning(FutureWarning): + df2 = self.read_html(self.spam_data, 'Unit', index_col=0, + infer_types=False) + assert_framelist_equal(df1, df2) + + with tm.assert_produces_warning(FutureWarning): + df2 = self.read_html(self.spam_data, 'Unit', index_col=0, + infer_types=True) + + with tm.assertRaises(AssertionError): + assert_framelist_equal(df1, df2) + + def test_string_io(self): + with open(self.spam_data) as f: + data1 = StringIO(f.read()) + + with open(self.spam_data) as f: + data2 = StringIO(f.read()) + + df1 = self.read_html(data1, '.*Water.*') + df2 = self.read_html(data2, 'Unit') + assert_framelist_equal(df1, df2) + + def test_string(self): + with open(self.spam_data) as f: + data = f.read() + + df1 = self.read_html(data, '.*Water.*') + df2 = self.read_html(data, 'Unit') + + assert_framelist_equal(df1, df2) + + def test_file_like(self): + with open(self.spam_data) as f: + df1 = self.read_html(f, '.*Water.*') + + with open(self.spam_data) as f: + df2 = self.read_html(f, 'Unit') + + assert_framelist_equal(df1, df2) + + @network + def test_bad_url_protocol(self): + with tm.assertRaises(URLError): + self.read_html('git://github.com', match='.*Water.*') + + @network + def test_invalid_url(self): + try: + with tm.assertRaises(URLError): + self.read_html('http://www.a23950sdfa908sd.com', + match='.*Water.*') + except ValueError as e: + tm.assert_equal(str(e), 'No tables found') + + @slow + def test_file_url(self): + url = self.banklist_data + dfs = self.read_html(file_path_to_url(url), 'First', + attrs={'id': 'table'}) + tm.assert_isinstance(dfs, list) + for df in dfs: + tm.assert_isinstance(df, DataFrame) + + @slow + def test_invalid_table_attrs(self): + url = self.banklist_data + with tm.assertRaisesRegexp(ValueError, 'No tables found'): + self.read_html(url, 'First Federal Bank of Florida', + attrs={'id': 'tasdfable'}) + + def _bank_data(self, *args, **kwargs): + return self.read_html(self.banklist_data, 'Metcalf', + attrs={'id': 'table'}, *args, **kwargs) + + @slow + def test_multiindex_header(self): + df = self._bank_data(header=[0, 1])[0] + tm.assert_isinstance(df.columns, MultiIndex) + + @slow + def test_multiindex_index(self): + df = self._bank_data(index_col=[0, 1])[0] + tm.assert_isinstance(df.index, MultiIndex) + + @slow + def test_multiindex_header_index(self): + df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] + tm.assert_isinstance(df.columns, MultiIndex) + tm.assert_isinstance(df.index, MultiIndex) + + @slow + def test_multiindex_header_skiprows_tuples(self): + df = self._bank_data(header=[0, 1], skiprows=1, tupleize_cols=True)[0] + tm.assert_isinstance(df.columns, Index) + + @slow + def test_multiindex_header_skiprows(self): + df = self._bank_data(header=[0, 1], skiprows=1)[0] + tm.assert_isinstance(df.columns, MultiIndex) + + @slow + def test_multiindex_header_index_skiprows(self): + df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] + tm.assert_isinstance(df.index, MultiIndex) + tm.assert_isinstance(df.columns, MultiIndex) + + @slow + def test_regex_idempotency(self): + url = self.banklist_data + dfs = self.read_html(file_path_to_url(url), + match=re.compile(re.compile('Florida')), + attrs={'id': 'table'}) + tm.assert_isinstance(dfs, list) + for df in dfs: + tm.assert_isinstance(df, DataFrame) + + def test_negative_skiprows(self): + with tm.assertRaisesRegexp(ValueError, + '\(you passed a negative value\)'): + self.read_html(self.spam_data, 
'Water', skiprows=-1) + + @network + def test_multiple_matches(self): + url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins' + dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'}) + self.assertTrue(len(dfs) > 1) + + @network + def test_pythonxy_plugins_table(self): + url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins' + dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'}) + zz = [df.iloc[0, 0] for df in dfs] + self.assertEqual(sorted(zz), sorted(['Python', 'SciTE'])) + + @slow + def test_thousands_macau_stats(self): + all_non_nan_table_index = -2 + macau_data = os.path.join(DATA_PATH, 'macau.html') + dfs = self.read_html(macau_data, index_col=0, + attrs={'class': 'style1'}) + df = dfs[all_non_nan_table_index] + + self.assertFalse(any(s.isnull().any() for _, s in df.iteritems())) + + @slow + def test_thousands_macau_index_col(self): + all_non_nan_table_index = -2 + macau_data = os.path.join(DATA_PATH, 'macau.html') + dfs = self.read_html(macau_data, index_col=0, header=0) + df = dfs[all_non_nan_table_index] + + self.assertFalse(any(s.isnull().any() for _, s in df.iteritems())) + + def test_countries_municipalities(self): + # GH5048 + data1 = StringIO(''' + + + + + + + + + + + + + + +
<tr><th>Country</th><th>Municipality</th><th>Year</th></tr>
<tr><td>Ukraine</td><td>Odessa</td><td>1944</td></tr>
''') + data2 = StringIO(''' + + + + + + + + + + + + + +
<tr><td>Country</td><td>Municipality</td><td>Year</td></tr>
<tr><td>Ukraine</td><td>Odessa</td><td>1944</td></tr>
''') + res1 = self.read_html(data1) + res2 = self.read_html(data2, header=0) + assert_framelist_equal(res1, res2) + + def test_nyse_wsj_commas_table(self): + data = os.path.join(DATA_PATH, 'nyse_wsj.html') + df = self.read_html(data, index_col=0, header=0, + attrs={'class': 'mdcTable'})[0] + + columns = Index(['Issue(Roll over for charts and headlines)', + 'Volume', 'Price', 'Chg', '% Chg']) + nrows = 100 + self.assertEqual(df.shape[0], nrows) + self.assertTrue(df.columns.equals(columns)) + + @slow + def test_banklist_header(self): + from pandas.io.html import _remove_whitespace + + def try_remove_ws(x): + try: + return _remove_whitespace(x) + except AttributeError: + return x + + df = self.read_html(self.banklist_data, 'Metcalf', + attrs={'id': 'table'})[0] + ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), + converters={'Updated Date': Timestamp, + 'Closing Date': Timestamp}) + self.assertEqual(df.shape, ground_truth.shape) + old = ['First Vietnamese American BankIn Vietnamese', + 'Westernbank Puerto RicoEn Espanol', + 'R-G Premier Bank of Puerto RicoEn Espanol', + 'EurobankEn Espanol', 'Sanderson State BankEn Espanol', + 'Washington Mutual Bank(Including its subsidiary Washington ' + 'Mutual Bank FSB)', + 'Silver State BankEn Espanol', + 'AmTrade International BankEn Espanol', + 'Hamilton Bank, NAEn Espanol', + 'The Citizens Savings BankPioneer Community Bank, Inc.'] + new = ['First Vietnamese American Bank', 'Westernbank Puerto Rico', + 'R-G Premier Bank of Puerto Rico', 'Eurobank', + 'Sanderson State Bank', 'Washington Mutual Bank', + 'Silver State Bank', 'AmTrade International Bank', + 'Hamilton Bank, NA', 'The Citizens Savings Bank'] + dfnew = df.applymap(try_remove_ws).replace(old, new) + gtnew = ground_truth.applymap(try_remove_ws) + converted = dfnew.convert_objects(convert_numeric=True) + tm.assert_frame_equal(converted.convert_objects(convert_dates='coerce'), + gtnew) + + @slow + def test_gold_canyon(self): + gc = 'Gold Canyon' + with open(self.banklist_data, 'r') as f: + raw_text = f.read() + + self.assertIn(gc, raw_text) + df = self.read_html(self.banklist_data, 'Gold Canyon', + attrs={'id': 'table'})[0] + self.assertIn(gc, df.to_string()) + + def test_different_number_of_rows(self): + expected = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
<tr><th></th><th>C_l0_g0</th><th>C_l0_g1</th><th>C_l0_g2</th><th>C_l0_g3</th><th>C_l0_g4</th></tr>
<tr><th>R_l0_g0</th><td>0.763</td><td>0.233</td><td>nan</td><td>nan</td><td>nan</td></tr>
<tr><th>R_l0_g1</th><td>0.244</td><td>0.285</td><td>0.392</td><td>0.137</td><td>0.222</td></tr>
""" + out = """ + + + + + + + + + + + + + + + + + + + + + + + + + +
<tr><th></th><th>C_l0_g0</th><th>C_l0_g1</th><th>C_l0_g2</th><th>C_l0_g3</th><th>C_l0_g4</th></tr>
<tr><th>R_l0_g0</th><td>0.763</td><td>0.233</td></tr>
<tr><th>R_l0_g1</th><td>0.244</td><td>0.285</td><td>0.392</td><td>0.137</td><td>0.222</td></tr>
""" + expected = self.read_html(expected, index_col=0)[0] + res = self.read_html(out, index_col=0)[0] + tm.assert_frame_equal(expected, res) + + def test_parse_dates_list(self): + df = DataFrame({'date': date_range('1/1/2001', periods=10)}) + expected = df.to_html() + res = self.read_html(expected, parse_dates=[0], index_col=0) + tm.assert_frame_equal(df, res[0]) + + def test_parse_dates_combine(self): + raw_dates = Series(date_range('1/1/2001', periods=10)) + df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), + 'time': raw_dates.map(lambda x: str(x.time()))}) + res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, + index_col=1) + newdf = DataFrame({'datetime': raw_dates}) + tm.assert_frame_equal(newdf, res[0]) + + def test_computer_sales_page(self): + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assertRaisesRegexp(CParserError, r"Passed header=\[0,1\] are " + "too many rows for this multi_index " + "of columns"): + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + + +def _lang_enc(filename): + return os.path.splitext(os.path.basename(filename))[0].split('_') + + +class TestReadHtmlEncoding(tm.TestCase): + files = glob.glob(os.path.join(DATA_PATH, 'html_encoding', '*.html')) + + def read_filename(self, f, encoding): + return read_html(f, encoding=encoding, index_col=0) + + def read_file_like(self, f, encoding): + with open(f, 'rb') as fobj: + return read_html(BytesIO(fobj.read()), encoding=encoding, + index_col=0) + + def read_string(self, f, encoding): + with open(f, 'rb') as fobj: + return read_html(fobj.read(), encoding=encoding, index_col=0) + + def test_encode(self): + for f in self.files: + _, encoding = _lang_enc(f) + from_string = self.read_string(f, encoding).pop() + from_file_like = self.read_file_like(f, encoding).pop() + from_filename = self.read_filename(f, encoding).pop() + tm.assert_frame_equal(from_string, from_file_like) + tm.assert_frame_equal(from_string, from_filename) + + +class TestReadHtmlLxml(tm.TestCase): + @classmethod + def setUpClass(cls): + super(TestReadHtmlLxml, cls).setUpClass() + _skip_if_no('lxml') + + def read_html(self, *args, **kwargs): + self.flavor = ['lxml'] + kwargs['flavor'] = kwargs.get('flavor', self.flavor) + return read_html(*args, **kwargs) + + def test_data_fail(self): + from lxml.etree import XMLSyntaxError + spam_data = os.path.join(DATA_PATH, 'spam.html') + banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + with tm.assertRaises(XMLSyntaxError): + self.read_html(spam_data, flavor=['lxml']) + + with tm.assertRaises(XMLSyntaxError): + self.read_html(banklist_data, flavor=['lxml']) + + def test_works_on_valid_markup(self): + filename = os.path.join(DATA_PATH, 'valid_markup.html') + dfs = self.read_html(filename, index_col=0, flavor=['lxml']) + tm.assert_isinstance(dfs, list) + tm.assert_isinstance(dfs[0], DataFrame) + + @slow + def test_fallback_success(self): + _skip_if_none_of(('bs4', 'html5lib')) + banklist_data = os.path.join(DATA_PATH, 'banklist.html') + self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) + + def test_parse_dates_list(self): + df = DataFrame({'date': date_range('1/1/2001', periods=10)}) + expected = df.to_html() + res = self.read_html(expected, parse_dates=[0], index_col=0) + tm.assert_frame_equal(df, res[0]) + + def test_parse_dates_combine(self): + raw_dates = Series(date_range('1/1/2001', periods=10)) + df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), + 'time': 
raw_dates.map(lambda x: str(x.time()))}) + res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, + index_col=1) + newdf = DataFrame({'datetime': raw_dates}) + tm.assert_frame_equal(newdf, res[0]) + + def test_computer_sales_page(self): + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + + +def test_invalid_flavor(): + url = 'google.com' + nose.tools.assert_raises(ValueError, read_html, url, 'google', + flavor='not a* valid**++ flaver') + + +def get_elements_from_file(url, element='table'): + _skip_if_none_of(('bs4', 'html5lib')) + url = file_path_to_url(url) + from bs4 import BeautifulSoup + with urlopen(url) as f: + soup = BeautifulSoup(f, features='html5lib') + return soup.find_all(element) + + +@slow +def test_bs4_finds_tables(): + filepath = os.path.join(DATA_PATH, "spam.html") + with warnings.catch_warnings(): + warnings.filterwarnings('ignore') + assert get_elements_from_file(filepath, 'table') + + +def get_lxml_elements(url, element): + _skip_if_no('lxml') + from lxml.html import parse + doc = parse(url) + return doc.xpath('.//{0}'.format(element)) + + +@slow +def test_lxml_finds_tables(): + filepath = os.path.join(DATA_PATH, "spam.html") + assert get_lxml_elements(filepath, 'table') + + +@slow +def test_lxml_finds_tbody(): + filepath = os.path.join(DATA_PATH, "spam.html") + assert get_lxml_elements(filepath, 'tbody') + + +def test_same_ordering(): + _skip_if_none_of(['bs4', 'lxml', 'html5lib']) + filename = os.path.join(DATA_PATH, 'valid_markup.html') + dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) + dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) + assert_framelist_equal(dfs_lxml, dfs_bs4) diff --git a/pandas/io/tests/test_json/__init__.py b/pandas/io/tests/test_json/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/io/tests/test_json/data/tsframe_iso_v012.json b/pandas/io/tests/test_json/data/tsframe_iso_v012.json new file mode 100644 index 00000000..bd9ff885 --- /dev/null +++ b/pandas/io/tests/test_json/data/tsframe_iso_v012.json @@ -0,0 +1 @@ +{"A":{"2000-01-03T00:00:00":1.56808523,"2000-01-04T00:00:00":-0.2550111,"2000-01-05T00:00:00":1.51493992,"2000-01-06T00:00:00":-0.02765498,"2000-01-07T00:00:00":0.05951614},"B":{"2000-01-03T00:00:00":0.65727391,"2000-01-04T00:00:00":-0.08072427,"2000-01-05T00:00:00":0.11805825,"2000-01-06T00:00:00":0.44679743,"2000-01-07T00:00:00":-2.69652057},"C":{"2000-01-03T00:00:00":1.81021139,"2000-01-04T00:00:00":-0.03202878,"2000-01-05T00:00:00":1.629455,"2000-01-06T00:00:00":0.33192641,"2000-01-07T00:00:00":1.28163262},"D":{"2000-01-03T00:00:00":-0.17251653,"2000-01-04T00:00:00":-0.17581665,"2000-01-05T00:00:00":-1.31506612,"2000-01-06T00:00:00":-0.27885413,"2000-01-07T00:00:00":0.34703478},"date":{"2000-01-03T00:00:00":"1992-01-06T18:21:32.120000","2000-01-04T00:00:00":"1992-01-06T18:21:32.120000","2000-01-05T00:00:00":"1992-01-06T18:21:32.120000","2000-01-06T00:00:00":"2013-01-01T00:00:00","2000-01-07T00:00:00":"1992-01-06T18:21:32.120000"}} \ No newline at end of file diff --git a/pandas/io/tests/test_json/data/tsframe_v012.json b/pandas/io/tests/test_json/data/tsframe_v012.json new file mode 100644 index 00000000..d4474c76 --- /dev/null +++ b/pandas/io/tests/test_json/data/tsframe_v012.json @@ -0,0 +1 @@ 
+{"A":{"946857600000000000":1.56808523,"946944000000000000":-0.2550111,"947030400000000000":1.51493992,"947116800000000000":-0.02765498,"947203200000000000":0.05951614},"B":{"946857600000000000":0.65727391,"946944000000000000":-0.08072427,"947030400000000000":0.11805825,"947116800000000000":0.44679743,"947203200000000000":-2.69652057},"C":{"946857600000000000":1.81021139,"946944000000000000":-0.03202878,"947030400000000000":1.629455,"947116800000000000":0.33192641,"947203200000000000":1.28163262},"D":{"946857600000000000":-0.17251653,"946944000000000000":-0.17581665,"947030400000000000":-1.31506612,"947116800000000000":-0.27885413,"947203200000000000":0.34703478},"date":{"946857600000000000":694722092120000000,"946944000000000000":694722092120000000,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000},"modified":{"946857600000000000":694722092120000000,"946944000000000000":null,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000}} \ No newline at end of file diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py new file mode 100644 index 00000000..f6f70520 --- /dev/null +++ b/pandas/io/tests/test_json/test_pandas.py @@ -0,0 +1,631 @@ +# pylint: disable-msg=W0612,E1101 +from pandas.compat import range, lrange, StringIO +from pandas import compat +import os + +import numpy as np +import nose +from pandas import Series, DataFrame, DatetimeIndex, Timestamp +import pandas as pd +read_json = pd.read_json + +from pandas.util.testing import (assert_almost_equal, assert_frame_equal, + assert_series_equal, network, + ensure_clean, assert_index_equal) +import pandas.util.testing as tm + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() + +_frame = DataFrame(_seriesd) +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) +_intframe = DataFrame(dict((k, v.astype(np.int64)) + for k, v in compat.iteritems(_seriesd))) + +_tsframe = DataFrame(_tsd) + +_mixed_frame = _frame.copy() + +class TestPandasContainer(tm.TestCase): + + def setUp(self): + self.dirpath = tm.get_data_path() + + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.objSeries = tm.makeObjectSeries() + self.objSeries.name = 'objects' + + self.empty_series = Series([], index=[]) + self.empty_frame = DataFrame({}) + + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.intframe = _intframe.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + def tearDown(self): + del self.dirpath + + del self.ts + + del self.series + + del self.objSeries + + del self.empty_series + del self.empty_frame + + del self.frame + del self.frame2 + del self.intframe + del self.tsframe + del self.mixed_frame + + def test_frame_double_encoded_labels(self): + df = DataFrame([['a', 'b'], ['c', 'd']], + index=['index " 1', 'index / 2'], + columns=['a \\ b', 'y / z']) + + assert_frame_equal(df, read_json(df.to_json(orient='split'), + orient='split')) + assert_frame_equal(df, read_json(df.to_json(orient='columns'), + orient='columns')) + assert_frame_equal(df, read_json(df.to_json(orient='index'), + orient='index')) + df_unser = read_json(df.to_json(orient='records'), orient='records') + assert_index_equal(df.columns, df_unser.columns) + np.testing.assert_equal(df.values, df_unser.values) + + def test_frame_non_unique_index(self): + df = DataFrame([['a', 
'b'], ['c', 'd']], index=[1, 1], + columns=['x', 'y']) + + self.assertRaises(ValueError, df.to_json, orient='index') + self.assertRaises(ValueError, df.to_json, orient='columns') + + assert_frame_equal(df, read_json(df.to_json(orient='split'), + orient='split')) + unser = read_json(df.to_json(orient='records'), orient='records') + self.assertTrue(df.columns.equals(unser.columns)) + np.testing.assert_equal(df.values, unser.values) + unser = read_json(df.to_json(orient='values'), orient='values') + np.testing.assert_equal(df.values, unser.values) + + def test_frame_non_unique_columns(self): + df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2], + columns=['x', 'x']) + + self.assertRaises(ValueError, df.to_json, orient='index') + self.assertRaises(ValueError, df.to_json, orient='columns') + self.assertRaises(ValueError, df.to_json, orient='records') + + assert_frame_equal(df, read_json(df.to_json(orient='split'), + orient='split', dtype=False)) + unser = read_json(df.to_json(orient='values'), orient='values') + np.testing.assert_equal(df.values, unser.values) + + # GH4377; duplicate columns not processing correctly + df = DataFrame([['a','b'],['c','d']], index=[1,2], columns=['x','y']) + result = read_json(df.to_json(orient='split'), orient='split') + assert_frame_equal(result, df) + + def _check(df): + result = read_json(df.to_json(orient='split'), orient='split', + convert_dates=['x']) + assert_frame_equal(result, df) + + for o in [[['a','b'],['c','d']], + [[1.5,2.5],[3.5,4.5]], + [[1,2.5],[3,4.5]], + [[Timestamp('20130101'),3.5],[Timestamp('20130102'),4.5]]]: + _check(DataFrame(o, index=[1,2], columns=['x','x'])) + + def test_frame_from_json_to_json(self): + def _check_orient(df, orient, dtype=None, numpy=False, + convert_axes=True, check_dtype=True, raise_ok=None): + df = df.sort() + dfjson = df.to_json(orient=orient) + + try: + unser = read_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy, convert_axes=convert_axes) + except Exception as detail: + if raise_ok is not None: + if isinstance(detail, raise_ok): + return + raise + + unser = unser.sort() + + if dtype is False: + check_dtype=False + + if not convert_axes and df.index.dtype.type == np.datetime64: + unser.index = DatetimeIndex( + unser.index.values.astype('i8') * 1e6) + if orient == "records": + # index is not captured in this orientation + assert_almost_equal(df.values, unser.values) + self.assertTrue(df.columns.equals(unser.columns)) + elif orient == "values": + # index and cols are not captured in this orientation + assert_almost_equal(df.values, unser.values) + elif orient == "split": + # index and col labels might not be strings + unser.index = [str(i) for i in unser.index] + unser.columns = [str(i) for i in unser.columns] + unser = unser.sort() + assert_almost_equal(df.values, unser.values) + else: + if convert_axes: + assert_frame_equal(df, unser, check_dtype=check_dtype) + else: + assert_frame_equal(df, unser, check_less_precise=False, + check_dtype=check_dtype) + + def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None): + + # numpy=False + if convert_axes: + _check_orient(df, "columns", dtype=dtype) + _check_orient(df, "records", dtype=dtype) + _check_orient(df, "split", dtype=dtype) + _check_orient(df, "index", dtype=dtype) + _check_orient(df, "values", dtype=dtype) + + _check_orient(df, "columns", dtype=dtype, convert_axes=False) + _check_orient(df, "records", dtype=dtype, convert_axes=False) + _check_orient(df, "split", dtype=dtype, convert_axes=False) + _check_orient(df, "index", dtype=dtype, 
convert_axes=False) + _check_orient(df, "values", dtype=dtype ,convert_axes=False) + + # numpy=True and raise_ok might be not None, so ignore the error + if convert_axes: + _check_orient(df, "columns", dtype=dtype, numpy=True, + raise_ok=raise_ok) + _check_orient(df, "records", dtype=dtype, numpy=True, + raise_ok=raise_ok) + _check_orient(df, "split", dtype=dtype, numpy=True, + raise_ok=raise_ok) + _check_orient(df, "index", dtype=dtype, numpy=True, + raise_ok=raise_ok) + _check_orient(df, "values", dtype=dtype, numpy=True, + raise_ok=raise_ok) + + _check_orient(df, "columns", dtype=dtype, numpy=True, + convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "records", dtype=dtype, numpy=True, + convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "split", dtype=dtype, numpy=True, + convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "index", dtype=dtype, numpy=True, + convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "values", dtype=dtype, numpy=True, + convert_axes=False, raise_ok=raise_ok) + + # basic + _check_all_orients(self.frame) + self.assertEqual(self.frame.to_json(), + self.frame.to_json(orient="columns")) + + _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) + _check_all_orients(self.intframe, dtype=False) + + # big one + # index and columns are strings as all unserialised JSON object keys + # are assumed to be strings + biggie = DataFrame(np.zeros((200, 4)), + columns=[str(i) for i in range(4)], + index=[str(i) for i in range(200)]) + _check_all_orients(biggie,dtype=False,convert_axes=False) + + # dtypes + _check_all_orients(DataFrame(biggie, dtype=np.float64), + dtype=np.float64, convert_axes=False) + _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, + convert_axes=False) + _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3', + convert_axes=False, raise_ok=ValueError) + + # empty + _check_all_orients(self.empty_frame) + + # time series data + _check_all_orients(self.tsframe) + + # mixed data + index = pd.Index(['a', 'b', 'c', 'd', 'e']) + data = { + 'A': [0., 1., 2., 3., 4.], + 'B': [0., 1., 0., 1., 0.], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': [True, False, True, False, True] + } + df = DataFrame(data=data, index=index) + _check_orient(df, "split", check_dtype=False) + _check_orient(df, "records", check_dtype=False) + _check_orient(df, "values", check_dtype=False) + _check_orient(df, "columns", check_dtype=False) + # index oriented is problematic as it is read back in in a transposed + # state, so the columns are interpreted as having mixed data and + # given object dtypes. 
+ # force everything to have object dtype beforehand + _check_orient(df.transpose().transpose(), "index", dtype=False) + + def test_frame_from_json_bad_data(self): + self.assertRaises(ValueError, read_json, StringIO('{"key":b:a:d}')) + + # too few indices + json = StringIO('{"columns":["A","B"],' + '"index":["2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}') + self.assertRaises(ValueError, read_json, json, + orient="split") + + # too many columns + json = StringIO('{"columns":["A","B","C"],' + '"index":["1","2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}') + self.assertRaises(AssertionError, read_json, json, + orient="split") + + # bad key + json = StringIO('{"badkey":["A","B"],' + '"index":["2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}') + with tm.assertRaisesRegexp(ValueError, r"unexpected key\(s\): badkey"): + read_json(json, orient="split") + + def test_frame_from_json_nones(self): + df = DataFrame([[1, 2], [4, 5, 6]]) + unser = read_json(df.to_json()) + self.assertTrue(np.isnan(unser[2][0])) + + df = DataFrame([['1', '2'], ['4', '5', '6']]) + unser = read_json(df.to_json()) + self.assertTrue(np.isnan(unser[2][0])) + unser = read_json(df.to_json(),dtype=False) + self.assertTrue(unser[2][0] is None) + unser = read_json(df.to_json(),convert_axes=False,dtype=False) + self.assertTrue(unser['2']['0'] is None) + + unser = read_json(df.to_json(), numpy=False) + self.assertTrue(np.isnan(unser[2][0])) + unser = read_json(df.to_json(), numpy=False, dtype=False) + self.assertTrue(unser[2][0] is None) + unser = read_json(df.to_json(), numpy=False, convert_axes=False, dtype=False) + self.assertTrue(unser['2']['0'] is None) + + # infinities get mapped to nulls which get mapped to NaNs during + # deserialisation + df = DataFrame([[1, 2], [4, 5, 6]]) + df[2][0] = np.inf + unser = read_json(df.to_json()) + self.assertTrue(np.isnan(unser[2][0])) + unser = read_json(df.to_json(), dtype=False) + self.assertTrue(np.isnan(unser[2][0])) + + df[2][0] = np.NINF + unser = read_json(df.to_json()) + self.assertTrue(np.isnan(unser[2][0])) + unser = read_json(df.to_json(),dtype=False) + self.assertTrue(np.isnan(unser[2][0])) + + def test_frame_to_json_except(self): + df = DataFrame([1, 2, 3]) + self.assertRaises(ValueError, df.to_json, orient="garbage") + + def test_v12_compat(self): + df = DataFrame( + [[1.56808523, 0.65727391, 1.81021139, -0.17251653], + [-0.2550111, -0.08072427, -0.03202878, -0.17581665], + [1.51493992, 0.11805825, 1.629455, -1.31506612], + [-0.02765498, 0.44679743, 0.33192641, -0.27885413], + [0.05951614, -2.69652057, 1.28163262, 0.34703478]], + columns=['A', 'B', 'C', 'D'], + index=pd.date_range('2000-01-03', '2000-01-07')) + df['date'] = pd.Timestamp('19920106 18:21:32.12') + df.ix[3, 'date'] = pd.Timestamp('20130101') + df['modified'] = df['date'] + df.ix[1, 'modified'] = pd.NaT + + v12_json = os.path.join(self.dirpath, 'tsframe_v012.json') + df_unser = pd.read_json(v12_json) + df_unser = pd.read_json(v12_json) + assert_frame_equal(df, df_unser) + + df_iso = df.drop(['modified'], axis=1) + v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json') + df_unser_iso = pd.read_json(v12_iso_json) + assert_frame_equal(df_iso, df_unser_iso) + + def test_series_non_unique_index(self): + s = Series(['a', 'b'], index=[1, 1]) + + self.assertRaises(ValueError, s.to_json, orient='index') + + assert_series_equal(s, read_json(s.to_json(orient='split'), + orient='split', typ='series')) + unser = read_json(s.to_json(orient='records'), + orient='records', typ='series') + 
np.testing.assert_equal(s.values, unser.values) + + def test_series_from_json_to_json(self): + + def _check_orient(series, orient, dtype=None, numpy=False): + series = series.sort_index() + unser = read_json(series.to_json(orient=orient), + typ='series', orient=orient, numpy=numpy, + dtype=dtype) + unser = unser.sort_index() + if orient == "records" or orient == "values": + assert_almost_equal(series.values, unser.values) + else: + try: + assert_series_equal(series, unser) + except: + raise + if orient == "split": + self.assertEqual(series.name, unser.name) + + def _check_all_orients(series, dtype=None): + _check_orient(series, "columns", dtype=dtype) + _check_orient(series, "records", dtype=dtype) + _check_orient(series, "split", dtype=dtype) + _check_orient(series, "index", dtype=dtype) + _check_orient(series, "values", dtype=dtype) + + _check_orient(series, "columns", dtype=dtype, numpy=True) + _check_orient(series, "records", dtype=dtype, numpy=True) + _check_orient(series, "split", dtype=dtype, numpy=True) + _check_orient(series, "index", dtype=dtype, numpy=True) + _check_orient(series, "values", dtype=dtype, numpy=True) + + # basic + _check_all_orients(self.series) + self.assertEqual(self.series.to_json(), + self.series.to_json(orient="index")) + + objSeries = Series([str(d) for d in self.objSeries], + index=self.objSeries.index, + name=self.objSeries.name) + _check_all_orients(objSeries, dtype=False) + _check_all_orients(self.empty_series) + _check_all_orients(self.ts) + + # dtype + s = Series(lrange(6), index=['a','b','c','d','e','f']) + _check_all_orients(Series(s, dtype=np.float64), dtype=np.float64) + _check_all_orients(Series(s, dtype=np.int), dtype=np.int) + + def test_series_to_json_except(self): + s = Series([1, 2, 3]) + self.assertRaises(ValueError, s.to_json, orient="garbage") + + def test_series_from_json_precise_float(self): + s = Series([4.56, 4.56, 4.56]) + result = read_json(s.to_json(), typ='series', precise_float=True) + assert_series_equal(result, s) + + def test_frame_from_json_precise_float(self): + df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]]) + result = read_json(df.to_json(), precise_float=True) + assert_frame_equal(result, df) + + def test_typ(self): + + s = Series(lrange(6), index=['a','b','c','d','e','f'], dtype='int64') + result = read_json(s.to_json(),typ=None) + assert_series_equal(result,s) + + def test_reconstruction_index(self): + + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + result = read_json(df.to_json()) + + # the index is serialized as strings....correct? + assert_frame_equal(result, df) + + def test_path(self): + with ensure_clean('test.json') as path: + for df in [self.frame, self.frame2, self.intframe, self.tsframe, + self.mixed_frame]: + df.to_json(path) + read_json(path) + + def test_axis_dates(self): + + # frame + json = self.tsframe.to_json() + result = read_json(json) + assert_frame_equal(result, self.tsframe) + + # series + json = self.ts.to_json() + result = read_json(json, typ='series') + assert_series_equal(result, self.ts) + + def test_convert_dates(self): + + # frame + df = self.tsframe.copy() + df['date'] = Timestamp('20130101') + + json = df.to_json() + result = read_json(json) + assert_frame_equal(result, df) + + df['foo'] = 1. 
+ json = df.to_json(date_unit='ns') + result = read_json(json, convert_dates=False) + expected = df.copy() + expected['date'] = expected['date'].values.view('i8') + expected['foo'] = expected['foo'].astype('int64') + assert_frame_equal(result, expected) + + # series + ts = Series(Timestamp('20130101'), index=self.ts.index) + json = ts.to_json() + result = read_json(json, typ='series') + assert_series_equal(result, ts) + + def test_date_format_frame(self): + df = self.tsframe.copy() + + def test_w_date(date, date_unit=None): + df['date'] = Timestamp(date) + df.ix[1, 'date'] = pd.NaT + df.ix[5, 'date'] = pd.NaT + if date_unit: + json = df.to_json(date_format='iso', date_unit=date_unit) + else: + json = df.to_json(date_format='iso') + result = read_json(json) + assert_frame_equal(result, df) + + test_w_date('20130101 20:43:42.123') + test_w_date('20130101 20:43:42', date_unit='s') + test_w_date('20130101 20:43:42.123', date_unit='ms') + test_w_date('20130101 20:43:42.123456', date_unit='us') + test_w_date('20130101 20:43:42.123456789', date_unit='ns') + + self.assertRaises(ValueError, df.to_json, date_format='iso', + date_unit='foo') + + def test_date_format_series(self): + def test_w_date(date, date_unit=None): + ts = Series(Timestamp(date), index=self.ts.index) + ts.ix[1] = pd.NaT + ts.ix[5] = pd.NaT + if date_unit: + json = ts.to_json(date_format='iso', date_unit=date_unit) + else: + json = ts.to_json(date_format='iso') + result = read_json(json, typ='series') + assert_series_equal(result, ts) + + test_w_date('20130101 20:43:42.123') + test_w_date('20130101 20:43:42', date_unit='s') + test_w_date('20130101 20:43:42.123', date_unit='ms') + test_w_date('20130101 20:43:42.123456', date_unit='us') + test_w_date('20130101 20:43:42.123456789', date_unit='ns') + + ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index) + self.assertRaises(ValueError, ts.to_json, date_format='iso', + date_unit='foo') + + def test_date_unit(self): + df = self.tsframe.copy() + df['date'] = Timestamp('20130101 20:43:42') + df.ix[1, 'date'] = Timestamp('19710101 20:43:42') + df.ix[2, 'date'] = Timestamp('21460101 20:43:42') + df.ix[4, 'date'] = pd.NaT + + for unit in ('s', 'ms', 'us', 'ns'): + json = df.to_json(date_format='epoch', date_unit=unit) + + # force date unit + result = read_json(json, date_unit=unit) + assert_frame_equal(result, df) + + # detect date unit + result = read_json(json, date_unit=None) + assert_frame_equal(result, df) + + def test_weird_nested_json(self): + # this used to core dump the parser + s = r'''{ + "status": "success", + "data": { + "posts": [ + { + "id": 1, + "title": "A blog post", + "body": "Some useful content" + }, + { + "id": 2, + "title": "Another blog post", + "body": "More content" + } + ] + } + }''' + + read_json(s) + + def test_doc_example(self): + dfj2 = DataFrame(np.random.randn(5, 2), columns=list('AB')) + dfj2['date'] = Timestamp('20130101') + dfj2['ints'] = lrange(5) + dfj2['bools'] = True + dfj2.index = pd.date_range('20130101',periods=5) + + json = dfj2.to_json() + result = read_json(json,dtype={'ints' : np.int64, 'bools' : np.bool_}) + assert_frame_equal(result,result) + + def test_misc_example(self): + + # parsing unordered input fails + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=True) + expected = DataFrame([[1,2],[1,2]],columns=['a','b']) + with tm.assertRaisesRegexp(AssertionError, + '\[index\] left \[.+\], right \[.+\]'): + assert_frame_equal(result, expected) + + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]') + expected 
= DataFrame([[1,2],[1,2]],columns=['a','b']) + assert_frame_equal(result,expected) + + @network + def test_round_trip_exception_(self): + # GH 3867 + csv = 'https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv' + df = pd.read_csv(csv) + s = df.to_json() + result = pd.read_json(s) + assert_frame_equal(result.reindex(index=df.index,columns=df.columns),df) + + @network + def test_url(self): + url = 'https://api.github.com/repos/pydata/pandas/issues?per_page=5' + result = read_json(url, convert_dates=True) + for c in ['created_at', 'closed_at', 'updated_at']: + self.assertEqual(result[c].dtype, 'datetime64[ns]') + + def test_timedelta(self): + tm._skip_if_not_numpy17_friendly() + + from datetime import timedelta + converter = lambda x: pd.to_timedelta(x,unit='ms') + + s = Series([timedelta(23), timedelta(seconds=5)]) + self.assertEqual(s.dtype,'timedelta64[ns]') + assert_series_equal(s, pd.read_json(s.to_json(),typ='series').apply(converter)) + + frame = DataFrame([timedelta(23), timedelta(seconds=5)]) + self.assertEqual(frame[0].dtype,'timedelta64[ns]') + assert_frame_equal( + frame, pd.read_json(frame.to_json()).apply(converter)) + + def test_default_handler(self): + from datetime import timedelta + + frame = DataFrame([timedelta(23), timedelta(seconds=5), 42]) + self.assertRaises(OverflowError, frame.to_json) + + expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5)), 42]) + assert_frame_equal( + expected, pd.read_json(frame.to_json(default_handler=str))) + + def my_handler_raises(obj): + raise TypeError("raisin") + self.assertRaises(TypeError, frame.to_json, + default_handler=my_handler_raises) diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py new file mode 100644 index 00000000..fcd55154 --- /dev/null +++ b/pandas/io/tests/test_json/test_ujson.py @@ -0,0 +1,1536 @@ +# -*- coding: utf-8 -*- + +from unittest import TestCase + +try: + import json +except ImportError: + import simplejson as json +import math +import nose +import platform +import sys +import time +import datetime +import calendar +import re +import decimal +from functools import partial +from pandas.compat import range, zip, StringIO, u +import pandas.json as ujson +import pandas.compat as compat + +import numpy as np +from numpy.testing import (assert_array_equal, + assert_array_almost_equal_nulp, + assert_approx_equal) +import pytz +import dateutil +from pandas import DataFrame, Series, Index, NaT, DatetimeIndex +import pandas.util.testing as tm + + +def _skip_if_python_ver(skip_major, skip_minor=None): + major, minor = sys.version_info[:2] + if major == skip_major and (skip_minor is None or minor == skip_minor): + raise nose.SkipTest("skipping Python version %d.%d" % (major, minor)) + + +json_unicode = (json.dumps if sys.version_info[0] >= 3 + else partial(json.dumps, encoding="utf-8")) + +class UltraJSONTests(TestCase): + + def test_encodeDecimal(self): + sut = decimal.Decimal("1337.1337") + encoded = ujson.encode(sut, double_precision=15) + decoded = ujson.decode(encoded) + self.assertEqual(decoded, 1337.1337) + + def test_encodeStringConversion(self): + input = "A string \\ / \b \f \n \r \t &" + not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t <\\/script> &"' + html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t \\u003c\\/script\\u003e \\u0026"' + + def helper(expected_output, **encode_kwargs): + output = ujson.encode(input, **encode_kwargs) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, expected_output) + 
self.assertEqual(input, ujson.decode(output)) + + # Default behavior assumes encode_html_chars=False. + helper(not_html_encoded, ensure_ascii=True) + helper(not_html_encoded, ensure_ascii=False) + + # Make sure explicit encode_html_chars=False works. + helper(not_html_encoded, ensure_ascii=True, encode_html_chars=False) + helper(not_html_encoded, ensure_ascii=False, encode_html_chars=False) + + # Make sure explicit encode_html_chars=True does the encoding. + helper(html_encoded, ensure_ascii=True, encode_html_chars=True) + helper(html_encoded, ensure_ascii=False, encode_html_chars=True) + + def test_doubleLongIssue(self): + sut = {u('a'): -4342969734183514} + encoded = json.dumps(sut) + decoded = json.loads(encoded) + self.assertEqual(sut, decoded) + encoded = ujson.encode(sut, double_precision=15) + decoded = ujson.decode(encoded) + self.assertEqual(sut, decoded) + + def test_doubleLongDecimalIssue(self): + sut = {u('a'): -12345678901234.56789012} + encoded = json.dumps(sut) + decoded = json.loads(encoded) + self.assertEqual(sut, decoded) + encoded = ujson.encode(sut, double_precision=15) + decoded = ujson.decode(encoded) + self.assertEqual(sut, decoded) + + def test_encodeNonCLocale(self): + import locale + savedlocale = locale.getlocale(locale.LC_NUMERIC) + try: + locale.setlocale(locale.LC_NUMERIC, 'it_IT.UTF-8') + except: + try: + locale.setlocale(locale.LC_NUMERIC, 'Italian_Italy') + except: + raise nose.SkipTest('Could not set locale for testing') + self.assertEqual(ujson.loads(ujson.dumps(4.78e60)), 4.78e60) + self.assertEqual(ujson.loads('4.78', precise_float=True), 4.78) + locale.setlocale(locale.LC_NUMERIC, savedlocale) + + def test_encodeDecodeLongDecimal(self): + sut = {u('a'): -528656961.4399388} + encoded = ujson.dumps(sut, double_precision=15) + ujson.decode(encoded) + + def test_decimalDecodeTestPrecise(self): + sut = {u('a'): 4.56} + encoded = ujson.encode(sut) + decoded = ujson.decode(encoded, precise_float=True) + self.assertEqual(sut, decoded) + + def test_encodeDoubleTinyExponential(self): + num = 1e-40 + self.assertEqual(num, ujson.decode(ujson.encode(num))) + num = 1e-100 + self.assertEqual(num, ujson.decode(ujson.encode(num))) + num = -1e-45 + self.assertEqual(num, ujson.decode(ujson.encode(num))) + num = -1e-145 + self.assertTrue(np.allclose(num, ujson.decode(ujson.encode(num)))) + + def test_encodeDictWithUnicodeKeys(self): + input = {u("key1"): u("value1"), u("key1"): + u("value1"), u("key1"): u("value1"), + u("key1"): u("value1"), u("key1"): + u("value1"), u("key1"): u("value1")} + output = ujson.encode(input) + + input = {u("بن"): u("value1"), u("بن"): u("value1"), + u("بن"): u("value1"), u("بن"): u("value1"), + u("بن"): u("value1"), u("بن"): u("value1"), + u("بن"): u("value1")} + output = ujson.encode(input) + + pass + + def test_encodeDoubleConversion(self): + input = math.pi + output = ujson.encode(input) + self.assertEqual(round(input, 5), round(json.loads(output), 5)) + self.assertEqual(round(input, 5), round(ujson.decode(output), 5)) + + def test_encodeWithDecimal(self): + input = 1.0 + output = ujson.encode(input) + self.assertEqual(output, "1.0") + + def test_encodeDoubleNegConversion(self): + input = -math.pi + output = ujson.encode(input) + + self.assertEqual(round(input, 5), round(json.loads(output), 5)) + self.assertEqual(round(input, 5), round(ujson.decode(output), 5)) + + def test_encodeArrayOfNestedArrays(self): + input = [[[[]]]] * 20 + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + #self.assertEqual(output, 
json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + input = np.array(input) + assert_array_equal(input, ujson.decode(output, numpy=True, dtype=input.dtype)) + + def test_encodeArrayOfDoubles(self): + input = [ 31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + #self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + + def test_doublePrecisionTest(self): + input = 30.012345678901234 + output = ujson.encode(input, double_precision = 15) + self.assertEqual(input, json.loads(output)) + self.assertEqual(input, ujson.decode(output)) + + output = ujson.encode(input, double_precision = 9) + self.assertEqual(round(input, 9), json.loads(output)) + self.assertEqual(round(input, 9), ujson.decode(output)) + + output = ujson.encode(input, double_precision = 3) + self.assertEqual(round(input, 3), json.loads(output)) + self.assertEqual(round(input, 3), ujson.decode(output)) + + def test_invalidDoublePrecision(self): + input = 30.12345678901234567890 + + self.assertRaises(ValueError, ujson.encode, input, double_precision = 20) + self.assertRaises(ValueError, ujson.encode, input, double_precision = -1) + + # will throw typeError + self.assertRaises(TypeError, ujson.encode, input, double_precision = '9') + # will throw typeError + self.assertRaises(TypeError, ujson.encode, input, double_precision = None) + + def test_encodeStringConversion(self): + input = "A string \\ / \b \f \n \r \t" + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, '"A string \\\\ \\/ \\b \\f \\n \\r \\t"') + self.assertEqual(input, ujson.decode(output)) + pass + + def test_decodeUnicodeConversion(self): + pass + + def test_encodeUnicodeConversion1(self): + input = "Räksmörgås اسامة بن محمد بن عوض بن لادن" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEqual(enc, json_unicode(input)) + self.assertEqual(dec, json.loads(enc)) + + def test_encodeControlEscaping(self): + input = "\x19" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEqual(input, dec) + self.assertEqual(enc, json_unicode(input)) + + def test_encodeUnicodeConversion2(self): + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEqual(enc, json_unicode(input)) + self.assertEqual(dec, json.loads(enc)) + + def test_encodeUnicodeSurrogatePair(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x90\x8d\x86" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEqual(enc, json_unicode(input)) + self.assertEqual(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x91\x80\xb0TRAILINGNORMAL" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEqual(enc, json_unicode(input)) + self.assertEqual(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8Highest(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL" + enc = ujson.encode(input) + + dec = ujson.decode(enc) + + self.assertEqual(enc, json_unicode(input)) + self.assertEqual(dec, json.loads(enc)) + + def test_encodeArrayInArray(self): + input = [[[[]]]] + output = ujson.encode(input) + + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + 
self.assertEqual(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeIntConversion(self): + input = 31337 + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + pass + + def test_encodeIntNegConversion(self): + input = -31337 + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + pass + + def test_encodeLongNegConversion(self): + input = -9223372036854775808 + output = ujson.encode(input) + + outputjson = json.loads(output) + outputujson = ujson.decode(output) + + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + pass + + def test_encodeListConversion(self): + input = [ 1, 2, 3, 4 ] + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeDictConversion(self): + input = { "k1": 1, "k2": 2, "k3": 3, "k4": 4 } + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(input, ujson.decode(output)) + self.assertEqual(input, ujson.decode(output)) + pass + + def test_encodeNoneConversion(self): + input = None + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + pass + + def test_encodeTrueConversion(self): + input = True + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + pass + + def test_encodeFalseConversion(self): + input = False + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + + def test_encodeDatetimeConversion(self): + ts = time.time() + input = datetime.datetime.fromtimestamp(ts) + output = ujson.encode(input, date_unit='s') + expected = calendar.timegm(input.utctimetuple()) + self.assertEqual(int(expected), json.loads(output)) + self.assertEqual(int(expected), ujson.decode(output)) + + def test_encodeDateConversion(self): + ts = time.time() + input = datetime.date.fromtimestamp(ts) + + output = ujson.encode(input, date_unit='s') + tup = (input.year, input.month, input.day, 0, 0, 0) + + expected = calendar.timegm(tup) + self.assertEqual(int(expected), json.loads(output)) + self.assertEqual(int(expected), ujson.decode(output)) + + def test_encodeTimeConversion(self): + tests = [ + datetime.time(), + datetime.time(1, 2, 3), + datetime.time(10, 12, 15, 343243), + datetime.time(10, 12, 15, 343243, pytz.utc), +# datetime.time(10, 12, 15, 343243, dateutil.tz.gettz('UTC')), # this segfaults! No idea why. 
+ ] + for test in tests: + output = ujson.encode(test) + expected = '"%s"' % test.isoformat() + self.assertEqual(expected, output) + + def test_nat(self): + input = NaT + assert ujson.encode(input) == 'null', "Expected null" + + def test_npy_nat(self): + from distutils.version import LooseVersion + if LooseVersion(np.__version__) < '1.7.0': + raise nose.SkipTest("numpy version < 1.7.0, is " + "{0}".format(np.__version__)) + + input = np.datetime64('NaT') + assert ujson.encode(input) == 'null', "Expected null" + + def test_datetime_units(self): + from pandas.lib import Timestamp + + val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) + stamp = Timestamp(val) + + roundtrip = ujson.decode(ujson.encode(val, date_unit='s')) + self.assertEqual(roundtrip, stamp.value // 10**9) + + roundtrip = ujson.decode(ujson.encode(val, date_unit='ms')) + self.assertEqual(roundtrip, stamp.value // 10**6) + + roundtrip = ujson.decode(ujson.encode(val, date_unit='us')) + self.assertEqual(roundtrip, stamp.value // 10**3) + + roundtrip = ujson.decode(ujson.encode(val, date_unit='ns')) + self.assertEqual(roundtrip, stamp.value) + + self.assertRaises(ValueError, ujson.encode, val, date_unit='foo') + + def test_encodeToUTF8(self): + _skip_if_python_ver(2, 5) + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input, ensure_ascii=False) + dec = ujson.decode(enc) + self.assertEqual(enc, json_unicode(input, ensure_ascii=False)) + self.assertEqual(dec, json.loads(enc)) + + def test_decodeFromUnicode(self): + input = u("{\"obj\": 31337}") + dec1 = ujson.decode(input) + dec2 = ujson.decode(str(input)) + self.assertEqual(dec1, dec2) + + def test_encodeRecursionMax(self): + # 8 is the max recursion depth + + class O2: + member = 0 + pass + + class O1: + member = 0 + pass + + input = O1() + input.member = O2() + input.member.member = input + + try: + output = ujson.encode(input) + assert False, "Expected overflow exception" + except(OverflowError): + pass + + def test_encodeDoubleNan(self): + input = np.nan + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleInf(self): + input = np.inf + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleNegInf(self): + input = -np.inf + assert ujson.encode(input) == 'null', "Expected null" + + def test_decodeJibberish(self): + input = "fdsa sda v9sa fdsa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayStart(self): + input = "[" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectStart(self): + input = "{" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayEnd(self): + input = "]" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeArrayDepthTooBig(self): + input = '[' * (1024 * 1024) + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectEnd(self): + input = "}" + try: + ujson.decode(input) + assert False, "Expected exception!" 
+ except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeObjectDepthTooBig(self): + input = '{' * (1024 * 1024) + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUnterminated(self): + input = "\"TESTING" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUntermEscapeSequence(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringBadEscape(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeTrueBroken(self): + input = "tru" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeFalseBroken(self): + input = "fa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeNullBroken(self): + input = "n" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenDictKeyTypeLeakTest(self): + input = '{{1337:""}}' + for x in range(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except ValueError as e: + continue + + assert False, "Wrong exception" + + def test_decodeBrokenDictLeakTest(self): + input = '{{"key":"}' + for x in range(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeBrokenListLeakTest(self): + input = '[[[true' + for x in range(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeDictWithNoKey(self): + input = "{{{{31337}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoColonOrValue(self): + input = "{{{{\"key\"}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoValue(self): + input = "{{{{\"key\":}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" 
+ except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeNumericIntPos(self): + input = "31337" + self.assertEqual(31337, ujson.decode(input)) + + def test_decodeNumericIntNeg(self): + input = "-31337" + self.assertEqual(-31337, ujson.decode(input)) + + def test_encodeUnicode4BytesUTF8Fail(self): + _skip_if_python_ver(3) + input = "\xfd\xbf\xbf\xbf\xbf\xbf" + try: + enc = ujson.encode(input) + assert False, "Expected exception" + except OverflowError: + pass + + def test_encodeNullCharacter(self): + input = "31337 \x00 1337" + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + + input = "\x00" + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + + self.assertEqual('" \\u0000\\r\\n "', ujson.dumps(u(" \u0000\r\n "))) + pass + + def test_decodeNullCharacter(self): + input = "\"31337 \\u0000 31337\"" + self.assertEqual(ujson.decode(input), json.loads(input)) + + def test_encodeListLongConversion(self): + input = [9223372036854775807, 9223372036854775807, 9223372036854775807, + 9223372036854775807, 9223372036854775807, 9223372036854775807 ] + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True, + dtype=np.int64)) + pass + + def test_encodeLongConversion(self): + input = 9223372036854775807 + output = ujson.encode(input) + self.assertEqual(input, json.loads(output)) + self.assertEqual(output, json.dumps(input)) + self.assertEqual(input, ujson.decode(output)) + pass + + def test_numericIntExp(self): + input = "1337E40" + output = ujson.decode(input) + self.assertEqual(output, json.loads(input)) + + def test_numericIntFrcExp(self): + input = "1.337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEPLUS(self): + input = "1337E+9" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpePLUS(self): + input = "1.337e+40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpE(self): + input = "1337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpe(self): + input = "1337e40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEMinus(self): + input = "1.337E-4" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpeMinus(self): + input = "1.337e-4" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_dumpToFile(self): + f = StringIO() + ujson.dump([1, 2, 3], f) + self.assertEqual("[1,2,3]", f.getvalue()) + + def test_dumpToFileLikeObject(self): + class filelike: + def __init__(self): + self.bytes = '' + def write(self, bytes): + self.bytes += bytes + f = filelike() + ujson.dump([1, 2, 3], f) + self.assertEqual("[1,2,3]", f.bytes) + + def test_dumpFileArgsError(self): + try: + ujson.dump([], '') + except TypeError: + pass + else: + assert False, 'expected TypeError' + + def test_loadFile(self): + f = StringIO("[1,2,3,4]") + self.assertEqual([1, 2, 3, 4], ujson.load(f)) + f = 
StringIO("[1,2,3,4]") + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileLikeObject(self): + class filelike: + def read(self): + try: + self.end + except AttributeError: + self.end = True + return "[1,2,3,4]" + f = filelike() + self.assertEqual([1, 2, 3, 4], ujson.load(f)) + f = filelike() + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileArgsError(self): + try: + ujson.load("[]") + except TypeError: + pass + else: + assert False, "expected TypeError" + + def test_version(self): + assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \ + "ujson.__version__ must be a string like '1.4.0'" + + def test_encodeNumericOverflow(self): + try: + ujson.encode(12839128391289382193812939) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_encodeNumericOverflowNested(self): + for n in range(0, 100): + class Nested: + x = 12839128391289382193812939 + + nested = Nested() + + try: + ujson.encode(nested) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_decodeNumberWith32bitSignBit(self): + #Test that numbers that fit within 32 bits but would have the + # sign bit set (2**31 <= x < 2**32) are decoded properly. + boundary1 = 2**31 + boundary2 = 2**32 + docs = ( + '{"id": 3590016419}', + '{"id": %s}' % 2**31, + '{"id": %s}' % 2**32, + '{"id": %s}' % ((2**32)-1), + ) + results = (3590016419, 2**31, 2**32, 2**32-1) + for doc,result in zip(docs, results): + self.assertEqual(ujson.decode(doc)['id'], result) + + def test_encodeBigEscape(self): + for x in range(10): + if compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + input = base * 1024 * 1024 * 2 + output = ujson.encode(input) + + def test_decodeBigEscape(self): + for x in range(10): + if compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + quote = compat.str_to_bytes("\"") + input = quote + (base * 1024 * 1024 * 2) + quote + output = ujson.decode(input) + + def test_toDict(self): + d = {u("key"): 31337} + + class DictTest: + def toDict(self): + return d + + o = DictTest() + output = ujson.encode(o) + dec = ujson.decode(output) + self.assertEqual(dec, d) + + def test_defaultHandler(self): + + class _TestObject(object): + + def __init__(self, val): + self.val = val + + @property + def recursive_attr(self): + return _TestObject("recursive_attr") + + def __str__(self): + return str(self.val) + + self.assertRaises(OverflowError, ujson.encode, _TestObject("foo")) + self.assertEqual('"foo"', ujson.encode(_TestObject("foo"), + default_handler=str)) + + def my_handler(obj): + return "foobar" + self.assertEqual('"foobar"', ujson.encode(_TestObject("foo"), + default_handler=my_handler)) + + def my_handler_raises(obj): + raise TypeError("I raise for anything") + with tm.assertRaisesRegexp(TypeError, "I raise for anything"): + ujson.encode(_TestObject("foo"), default_handler=my_handler_raises) + + def my_int_handler(obj): + return 42 + self.assertEqual( + 42, ujson.decode(ujson.encode(_TestObject("foo"), + default_handler=my_int_handler))) + + def my_obj_handler(obj): + return datetime.datetime(2013, 2, 3) + self.assertEqual( + ujson.decode(ujson.encode(datetime.datetime(2013, 2, 3))), + ujson.decode(ujson.encode(_TestObject("foo"), + default_handler=my_obj_handler))) + + l = [_TestObject("foo"), _TestObject("bar")] + self.assertEqual(json.loads(json.dumps(l, default=str)), + ujson.decode(ujson.encode(l, default_handler=str))) + + +class 
NumpyJSONTests(TestCase): + + def testBool(self): + b = np.bool(True) + self.assertEqual(ujson.decode(ujson.encode(b)), b) + + def testBoolArray(self): + inpt = np.array([True, False, True, True, False, True, False , False], + dtype=np.bool) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) + assert_array_equal(inpt, outp) + + def testInt(self): + num = np.int(2562010) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(127) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(2562010) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(2562010) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.int64(2562010) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(255) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(2562010) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(2562010) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + num = np.uint64(2562010) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def testIntArray(self): + arr = np.arange(100, dtype=np.int) + dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, + np.uint, np.uint8, np.uint16, np.uint32, np.uint64) + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) + assert_array_equal(inpt, outp) + + def testIntMax(self): + num = np.int(np.iinfo(np.int).max) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(np.iinfo(np.int8).max) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(np.iinfo(np.int16).max) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(np.iinfo(np.int32).max) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(np.iinfo(np.uint8).max) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(np.iinfo(np.uint16).max) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(np.iinfo(np.uint32).max) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + if platform.architecture()[0] != '32bit': + num = np.int64(np.iinfo(np.int64).max) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + # uint64 max will always overflow as it's encoded to signed + num = np.uint64(np.iinfo(np.int64).max) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def testFloat(self): + num = np.float(256.2013) + self.assertEqual(np.float(ujson.decode(ujson.encode(num))), num) + + num = np.float32(256.2013) + self.assertEqual(np.float32(ujson.decode(ujson.encode(num))), num) + + num = np.float64(256.2013) + self.assertEqual(np.float64(ujson.decode(ujson.encode(num))), num) + + def testFloatArray(self): + arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) + dtypes = (np.float, np.float32, np.float64) + + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt, double_precision=15)), dtype=dtype) + assert_array_almost_equal_nulp(inpt, outp) + + def testFloatMax(self): + num = np.float(np.finfo(np.float).max/10) + assert_approx_equal(np.float(ujson.decode(ujson.encode(num, double_precision=15))), num, 15) + + num = np.float32(np.finfo(np.float32).max/10) + 
assert_approx_equal(np.float32(ujson.decode(ujson.encode(num, double_precision=15))), num, 15) + + num = np.float64(np.finfo(np.float64).max/10) + assert_approx_equal(np.float64(ujson.decode(ujson.encode(num, double_precision=15))), num, 15) + + def testArrays(self): + arr = np.arange(100); + + arr = arr.reshape((10, 10)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((5, 5, 4)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((100, 1)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = np.arange(96); + arr = arr.reshape((2, 2, 2, 2, 3, 2)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + l = ['a', list(), dict(), dict(), list(), + 42, 97.8, ['a', 'b'], {'key': 'val'}] + arr = np.array(l) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + + arr = np.arange(100.202, 200.202, 1, dtype=np.float32); + arr = arr.reshape((5, 5, 4)) + outp = np.array(ujson.decode(ujson.encode(arr)), dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + + def testArrayNumpyExcept(self): + + input = ujson.dumps([42, {}, 'a']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps(['a', 'b', [], 'c']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, ['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{}, []]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, None]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 'b'}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps({'a': {'b': {'c': 42}}}) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 42, 'b': 23}, {'c': 17}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" 
+        except(ValueError):
+            pass
+        except:
+            assert False, "Wrong exception"
+
+    def testArrayNumpyLabelled(self):
+        input = {'a': []}
+        output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
+        self.assertTrue((np.empty((1, 0)) == output[0]).all())
+        self.assertTrue((np.array(['a']) == output[1]).all())
+        self.assertTrue(output[2] is None)
+
+        input = [{'a': 42}]
+        output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
+        self.assertTrue((np.array([42]) == output[0]).all())
+        self.assertTrue(output[1] is None)
+        self.assertTrue((np.array([u('a')]) == output[2]).all())
+
+        # py3 is non-deterministic on the ordering......
+        if not compat.PY3:
+            input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}]
+            output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
+            expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2))
+            self.assertTrue((expectedvals == output[0]).all())
+            self.assertTrue(output[1] is None)
+            self.assertTrue((np.array([u('a'), 'b']) == output[2]).all())
+
+
+            input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}}
+            output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True)
+            expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2))
+            self.assertTrue((expectedvals == output[0]).all())
+            self.assertTrue((np.array(['1','2','3']) == output[1]).all())
+            self.assertTrue((np.array(['a', 'b']) == output[2]).all())
+
+
+class PandasJSONTests(TestCase):
+
+    def testDataFrame(self):
+        df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z'])
+
+        # column indexed
+        outp = DataFrame(ujson.decode(ujson.encode(df)))
+        self.assertTrue((df == outp).values.all())
+        assert_array_equal(df.columns, outp.columns)
+        assert_array_equal(df.index, outp.index)
+
+        dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split")))
+        outp = DataFrame(**dec)
+        self.assertTrue((df == outp).values.all())
+        assert_array_equal(df.columns, outp.columns)
+        assert_array_equal(df.index, outp.index)
+
+        outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
+        outp.index = df.index
+        self.assertTrue((df == outp).values.all())
+        assert_array_equal(df.columns, outp.columns)
+
+        outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
+        outp.index = df.index
+        self.assertTrue((df.values == outp.values).all())
+
+        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
+        self.assertTrue((df.transpose() == outp).values.all())
+        assert_array_equal(df.transpose().columns, outp.columns)
+        assert_array_equal(df.transpose().index, outp.index)
+
+    def testDataFrameNumpy(self):
+        df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z'])
+
+        # column indexed
+        outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True))
+        self.assertTrue((df == outp).values.all())
+        assert_array_equal(df.columns, outp.columns)
+        assert_array_equal(df.index, outp.index)
+
+        dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"),
+                                       numpy=True))
+        outp = DataFrame(**dec)
+        self.assertTrue((df == outp).values.all())
+        assert_array_equal(df.columns, outp.columns)
+        assert_array_equal(df.index, outp.index)
+
+        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), numpy=True))
+        self.assertTrue((df.transpose() == outp).values.all())
+        assert_array_equal(df.transpose().columns, outp.columns)
+        assert_array_equal(df.transpose().index, outp.index)
+
+    def testDataFrameNested(self):
+        df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 
'z']) + + nested = {'df1': df, 'df2': df.copy()} + + exp = {'df1': ujson.decode(ujson.encode(df)), + 'df2': ujson.decode(ujson.encode(df))} + self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="index")), + 'df2': ujson.decode(ujson.encode(df, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="records")), + 'df2': ujson.decode(ujson.encode(df, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="values")), + 'df2': ujson.decode(ujson.encode(df, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="split")), + 'df2': ujson.decode(ujson.encode(df, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + def testDataFrameNumpyLabelled(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(*ujson.decode(ujson.encode(df), numpy=True, labelled=True)) + self.assertTrue((df.T == outp).values.all()) + assert_array_equal(df.T.columns, outp.columns) + assert_array_equal(df.T.index, outp.index) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), numpy=True, labelled=True)) + outp.index = df.index + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), numpy=True, labelled=True)) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + def testSeries(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + # column indexed + outp = Series(ujson.decode(ujson.encode(s))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"))) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"), + numpy=True)) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="values"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + def testSeriesNested(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + nested = {'s1': s, 's2': s.copy()} + + exp = {'s1': ujson.decode(ujson.encode(s)), + 's2': ujson.decode(ujson.encode(s))} + 
self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="split")), + 's2': ujson.decode(ujson.encode(s, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="records")), + 's2': ujson.decode(ujson.encode(s, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="values")), + 's2': ujson.decode(ujson.encode(s, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="index")), + 's2': ujson.decode(ujson.encode(s, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + def testIndex(self): + i = Index([23, 45, 18, 98, 43, 11], name="index") + + # column indexed + outp = Index(ujson.decode(ujson.encode(i))) + self.assertTrue(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i), numpy=True)) + self.assertTrue(i.equals(outp)) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"))) + outp = Index(**dec) + self.assertTrue(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), + numpy=True)) + outp = Index(**dec) + self.assertTrue(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"))) + self.assertTrue(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"), numpy=True)) + self.assertTrue(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"))) + self.assertTrue(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"), numpy=True)) + self.assertTrue(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"))) + self.assertTrue(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"), numpy=True)) + self.assertTrue(i.equals(outp)) + + def test_datetimeindex(self): + from pandas.tseries.index import date_range + + rng = date_range('1/1/2000', periods=20) + + encoded = ujson.encode(rng, date_unit='ns') + decoded = DatetimeIndex(np.array(ujson.decode(encoded))) + + self.assertTrue(rng.equals(decoded)) + + ts = Series(np.random.randn(len(rng)), index=rng) + decoded = Series(ujson.decode(ujson.encode(ts, date_unit='ns'))) + idx_values = decoded.index.values.astype(np.int64) + decoded.index = DatetimeIndex(idx_values) + tm.assert_series_equal(ts, decoded) + + def test_decodeArrayTrailingCommaFail(self): + input = "[31337,]" + try: + ujson.decode(input) + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeArrayLeadingCommaFail(self): + input = "[,31337]" + try: + ujson.decode(input) + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeArrayOnlyCommaFail(self): + input = "[,]" + try: + ujson.decode(input) + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeArrayUnmatchedBracketFail(self): + input = "[]]" + try: + ujson.decode(input) + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeArrayEmpty(self): + input = "[]" + ujson.decode(input) + + def test_decodeArrayOneItem(self): + input = "[31337]" + ujson.decode(input) + + def test_decodeBigValue(self): + input = "9223372036854775807" + ujson.decode(input) + + 
def test_decodeSmallValue(self): + input = "-9223372036854775808" + ujson.decode(input) + + def test_decodeTooBigValue(self): + try: + input = "9223372036854775808" + ujson.decode(input) + except ValueError as e: + pass + else: + assert False, "expected ValueError" + + def test_decodeTooSmallValue(self): + try: + input = "-90223372036854775809" + ujson.decode(input) + except ValueError as e: + pass + else: + assert False, "expected ValueError" + + def test_decodeVeryTooBigValue(self): + try: + input = "9223372036854775808" + ujson.decode(input) + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeVeryTooSmallValue(self): + try: + input = "-90223372036854775809" + ujson.decode(input) + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeWithTrailingWhitespaces(self): + input = "{}\n\t " + ujson.decode(input) + + def test_decodeWithTrailingNonWhitespaces(self): + try: + input = "{}\n\t a" + ujson.decode(input) + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeArrayWithBigInt(self): + try: + ujson.loads('[18446098363113800555]') + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeArrayFaultyUnicode(self): + try: + ujson.loads('[18446098363113800555]') + except ValueError: + pass + else: + assert False, "expected ValueError" + + def test_decodeFloatingPointAdditionalTests(self): + places = 15 + + self.assertAlmostEqual(-1.1234567893, ujson.loads("-1.1234567893"), places=places) + self.assertAlmostEqual(-1.234567893, ujson.loads("-1.234567893"), places=places) + self.assertAlmostEqual(-1.34567893, ujson.loads("-1.34567893"), places=places) + self.assertAlmostEqual(-1.4567893, ujson.loads("-1.4567893"), places=places) + self.assertAlmostEqual(-1.567893, ujson.loads("-1.567893"), places=places) + self.assertAlmostEqual(-1.67893, ujson.loads("-1.67893"), places=places) + self.assertAlmostEqual(-1.7893, ujson.loads("-1.7893"), places=places) + self.assertAlmostEqual(-1.893, ujson.loads("-1.893"), places=places) + self.assertAlmostEqual(-1.3, ujson.loads("-1.3"), places=places) + + self.assertAlmostEqual(1.1234567893, ujson.loads("1.1234567893"), places=places) + self.assertAlmostEqual(1.234567893, ujson.loads("1.234567893"), places=places) + self.assertAlmostEqual(1.34567893, ujson.loads("1.34567893"), places=places) + self.assertAlmostEqual(1.4567893, ujson.loads("1.4567893"), places=places) + self.assertAlmostEqual(1.567893, ujson.loads("1.567893"), places=places) + self.assertAlmostEqual(1.67893, ujson.loads("1.67893"), places=places) + self.assertAlmostEqual(1.7893, ujson.loads("1.7893"), places=places) + self.assertAlmostEqual(1.893, ujson.loads("1.893"), places=places) + self.assertAlmostEqual(1.3, ujson.loads("1.3"), places=places) + + def test_encodeBigSet(self): + s = set() + for x in range(0, 100000): + s.add(x) + ujson.encode(s) + + def test_encodeEmptySet(self): + s = set() + self.assertEqual("[]", ujson.encode(s)) + + def test_encodeSet(self): + s = set([1,2,3,4,5,6,7,8,9]) + enc = ujson.encode(s) + dec = ujson.decode(enc) + + for v in dec: + self.assertTrue(v in s) + + +def _clean_dict(d): + return dict((str(k), v) for k, v in compat.iteritems(d)) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_json_norm.py b/pandas/io/tests/test_json_norm.py new file mode 100644 index 00000000..8084446d --- /dev/null +++ 
b/pandas/io/tests/test_json_norm.py @@ -0,0 +1,207 @@ +import nose + +from pandas import DataFrame +import numpy as np + +import pandas.util.testing as tm + +from pandas.io.json import json_normalize, nested_to_record + +def _assert_equal_data(left, right): + if not left.columns.equals(right.columns): + left = left.reindex(columns=right.columns) + + tm.assert_frame_equal(left, right) + + +class TestJSONNormalize(tm.TestCase): + + def setUp(self): + self.state_data = [ + {'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}], + 'info': {'governor': 'Rick Scott'}, + 'shortname': 'FL', + 'state': 'Florida'}, + {'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}], + 'info': {'governor': 'John Kasich'}, + 'shortname': 'OH', + 'state': 'Ohio'}] + + def test_simple_records(self): + recs = [{'a': 1, 'b': 2, 'c': 3}, + {'a': 4, 'b': 5, 'c': 6}, + {'a': 7, 'b': 8, 'c': 9}, + {'a': 10, 'b': 11, 'c': 12}] + + result = json_normalize(recs) + expected = DataFrame(recs) + + tm.assert_frame_equal(result, expected) + + def test_simple_normalize(self): + result = json_normalize(self.state_data[0], 'counties') + expected = DataFrame(self.state_data[0]['counties']) + tm.assert_frame_equal(result, expected) + + result = json_normalize(self.state_data, 'counties') + + expected = [] + for rec in self.state_data: + expected.extend(rec['counties']) + expected = DataFrame(expected) + + tm.assert_frame_equal(result, expected) + + result = json_normalize(self.state_data, 'counties', meta='state') + expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2]) + + tm.assert_frame_equal(result, expected) + + def test_more_deeply_nested(self): + data = [{'country': 'USA', + 'states': [{'name': 'California', + 'cities': [{'name': 'San Francisco', + 'pop': 12345}, + {'name': 'Los Angeles', + 'pop': 12346}] + }, + {'name': 'Ohio', + 'cities': [{'name': 'Columbus', + 'pop': 1234}, + {'name': 'Cleveland', + 'pop': 1236}]} + ] + }, + {'country': 'Germany', + 'states': [{'name': 'Bayern', + 'cities': [{'name': 'Munich', 'pop': 12347}] + }, + {'name': 'Nordrhein-Westfalen', + 'cities': [{'name': 'Duesseldorf', 'pop': 1238}, + {'name': 'Koeln', 'pop': 1239}]} + ] + } + ] + + result = json_normalize(data, ['states', 'cities'], + meta=['country', ['states', 'name']]) + # meta_prefix={'states': 'state_'}) + + ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3, + 'states.name': ['California', 'California', 'Ohio', 'Ohio', + 'Bayern', 'Nordrhein-Westfalen', + 'Nordrhein-Westfalen'], + 'name': ['San Francisco', 'Los Angeles', 'Columbus', + 'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'], + 'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]} + + expected = DataFrame(ex_data, columns=result.columns) + tm.assert_frame_equal(result, expected) + + def test_shallow_nested(self): + data = [{'state': 'Florida', + 'shortname': 'FL', + 'info': { + 'governor': 'Rick Scott' + }, + 'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}]}, + {'state': 'Ohio', + 'shortname': 'OH', + 'info': { + 'governor': 'John Kasich' + }, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] + + result = json_normalize(data, 'counties', + ['state', 'shortname', + ['info', 'governor']]) + ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit', + 'Cuyahoga'], + 'state': ['Florida'] * 3 + 
['Ohio'] * 2, + 'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'], + 'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2, + 'population': [12345, 40000, 60000, 1234, 1337]} + expected = DataFrame(ex_data, columns=result.columns) + tm.assert_frame_equal(result, expected) + + def test_meta_name_conflict(self): + data = [{'foo': 'hello', + 'bar': 'there', + 'data': [{'foo': 'something', 'bar': 'else'}, + {'foo': 'something2', 'bar': 'else2'}]}] + + self.assertRaises(ValueError, json_normalize, data, + 'data', meta=['foo', 'bar']) + + result = json_normalize(data, 'data', meta=['foo', 'bar'], + meta_prefix='meta') + + for val in ['metafoo', 'metabar', 'foo', 'bar']: + self.assertTrue(val in result) + + def test_record_prefix(self): + result = json_normalize(self.state_data[0], 'counties') + expected = DataFrame(self.state_data[0]['counties']) + tm.assert_frame_equal(result, expected) + + result = json_normalize(self.state_data, 'counties', + meta='state', + record_prefix='county_') + + expected = [] + for rec in self.state_data: + expected.extend(rec['counties']) + expected = DataFrame(expected) + expected = expected.rename(columns=lambda x: 'county_' + x) + expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2]) + + tm.assert_frame_equal(result, expected) + + +class TestNestedToRecord(tm.TestCase): + + def test_flat_stays_flat(self): + recs = [dict(flat1=1,flat2=2), + dict(flat1=3,flat2=4), + ] + + result = nested_to_record(recs) + expected = recs + self.assertEqual(result, expected) + + def test_one_level_deep_flattens(self): + data = dict(flat1=1, + dict1=dict(c=1,d=2)) + + result = nested_to_record(data) + expected = {'dict1.c': 1, + 'dict1.d': 2, + 'flat1': 1} + + self.assertEqual(result,expected) + + def test_nested_flattens(self): + data = dict(flat1=1, + dict1=dict(c=1,d=2), + nested=dict(e=dict(c=1,d=2), + d=2)) + + result = nested_to_record(data) + expected = {'dict1.c': 1, + 'dict1.d': 2, + 'flat1': 1, + 'nested.d': 2, + 'nested.e.c': 1, + 'nested.e.d': 2} + + self.assertEqual(result,expected) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', + '--pdb-failure', '-s'], exit=False) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py new file mode 100644 index 00000000..9633f567 --- /dev/null +++ b/pandas/io/tests/test_packers.py @@ -0,0 +1,452 @@ +import nose + +import datetime +import numpy as np +import sys +from distutils.version import LooseVersion + +from pandas import compat +from pandas.compat import u +from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, + date_range, period_range, Index, SparseSeries, SparseDataFrame, + SparsePanel) +import pandas.util.testing as tm +from pandas.util.testing import ensure_clean +from pandas.tests.test_series import assert_series_equal +from pandas.tests.test_frame import assert_frame_equal +from pandas.tests.test_panel import assert_panel_equal + +import pandas +from pandas.sparse.tests.test_sparse import assert_sp_series_equal, assert_sp_frame_equal +from pandas import Timestamp, tslib + +nan = np.nan + +from pandas.io.packers import to_msgpack, read_msgpack + +_multiprocess_can_split_ = False + + +def check_arbitrary(a, b): + + if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)): + assert(len(a) == len(b)) + for a_, b_ in zip(a, b): + check_arbitrary(a_, b_) + elif isinstance(a, Panel): + assert_panel_equal(a, b) + elif isinstance(a, DataFrame): + assert_frame_equal(a, b) + elif isinstance(a, Series): + assert_series_equal(a, b) + else: + 
assert(a == b) + + +class TestPackers(tm.TestCase): + + def setUp(self): + self.path = '__%s__.msg' % tm.rands(10) + + def tearDown(self): + pass + + def encode_decode(self, x, **kwargs): + with ensure_clean(self.path) as p: + to_msgpack(p, x, **kwargs) + return read_msgpack(p, **kwargs) + +class TestAPI(TestPackers): + + def test_string_io(self): + + df = DataFrame(np.random.randn(10,2)) + s = df.to_msgpack(None) + result = read_msgpack(s) + tm.assert_frame_equal(result,df) + + s = df.to_msgpack() + result = read_msgpack(s) + tm.assert_frame_equal(result,df) + + s = df.to_msgpack() + result = read_msgpack(compat.BytesIO(s)) + tm.assert_frame_equal(result,df) + + s = to_msgpack(None,df) + result = read_msgpack(s) + tm.assert_frame_equal(result, df) + + with ensure_clean(self.path) as p: + + s = df.to_msgpack() + fh = open(p,'wb') + fh.write(s) + fh.close() + result = read_msgpack(p) + tm.assert_frame_equal(result, df) + + def test_iterator_with_string_io(self): + + dfs = [ DataFrame(np.random.randn(10,2)) for i in range(5) ] + s = to_msgpack(None,*dfs) + for i, result in enumerate(read_msgpack(s,iterator=True)): + tm.assert_frame_equal(result,dfs[i]) + +class TestNumpy(TestPackers): + + def test_numpy_scalar_float(self): + x = np.float32(np.random.rand()) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + + def test_numpy_scalar_complex(self): + x = np.complex64(np.random.rand() + 1j * np.random.rand()) + x_rec = self.encode_decode(x) + self.assertTrue(np.allclose(x, x_rec)) + + def test_scalar_float(self): + x = np.random.rand() + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + + def test_scalar_complex(self): + x = np.random.rand() + 1j * np.random.rand() + x_rec = self.encode_decode(x) + self.assertTrue(np.allclose(x, x_rec)) + + def test_list_numpy_float(self): + x = [np.float32(np.random.rand()) for i in range(5)] + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + + def test_list_numpy_float_complex(self): + if not hasattr(np, 'complex128'): + raise nose.SkipTest('numpy cant handle complex128') + + x = [np.float32(np.random.rand()) for i in range(5)] + \ + [np.complex128(np.random.rand() + 1j * np.random.rand()) + for i in range(5)] + x_rec = self.encode_decode(x) + self.assertTrue(np.allclose(x, x_rec)) + + def test_list_float(self): + x = [np.random.rand() for i in range(5)] + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + + def test_list_float_complex(self): + x = [np.random.rand() for i in range(5)] + \ + [(np.random.rand() + 1j * np.random.rand()) for i in range(5)] + x_rec = self.encode_decode(x) + self.assertTrue(np.allclose(x, x_rec)) + + def test_dict_float(self): + x = {'foo': 1.0, 'bar': 2.0} + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + + def test_dict_complex(self): + x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j} + x_rec = self.encode_decode(x) + self.assertEqual(x, x_rec) + for key in x: + self.assertEqual(type(x[key]), type(x_rec[key])) + + def test_dict_numpy_float(self): + x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)} + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + + def test_dict_numpy_complex(self): + x = {'foo': np.complex128(1.0 + 1.0j), + 'bar': np.complex128(2.0 + 2.0j)} + x_rec = self.encode_decode(x) + self.assertEqual(x, x_rec) + for key in x: + self.assertEqual(type(x[key]), type(x_rec[key])) + + def test_numpy_array_float(self): + + # run multiple times + for n in range(10): + x = np.random.rand(10) + for dtype in ['float32','float64']: + x = 
x.astype(dtype) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + + def test_numpy_array_complex(self): + x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128) + x_rec = self.encode_decode(x) + self.assertTrue(all(map(lambda x, y: x == y, x, x_rec)) and + x.dtype == x_rec.dtype) + + def test_list_mixed(self): + x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x,x_rec) + +class TestBasic(TestPackers): + + def test_timestamp(self): + + for i in [Timestamp( + '20130101'), Timestamp('20130101', tz='US/Eastern'), + Timestamp('201301010501')]: + i_rec = self.encode_decode(i) + self.assertEqual(i, i_rec) + + def test_datetimes(self): + + # fails under 2.6/win32 (np.datetime64 seems broken) + + if LooseVersion(sys.version) < '2.7': + raise nose.SkipTest('2.6 with np.datetime64 is broken') + + for i in [datetime.datetime( + 2013, 1, 1), datetime.datetime(2013, 1, 1, 5, 1), + datetime.date(2013, 1, 1), np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]: + i_rec = self.encode_decode(i) + self.assertEqual(i, i_rec) + + def test_timedeltas(self): + + for i in [datetime.timedelta(days=1), + datetime.timedelta(days=1, seconds=10), + np.timedelta64(1000000)]: + i_rec = self.encode_decode(i) + self.assertEqual(i, i_rec) + + +class TestIndex(TestPackers): + + def setUp(self): + super(TestIndex, self).setUp() + + self.d = { + 'string': tm.makeStringIndex(100), + 'date': tm.makeDateIndex(100), + 'int': tm.makeIntIndex(100), + 'float': tm.makeFloatIndex(100), + 'empty': Index([]), + 'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])), + 'period': Index(period_range('2012-1-1', freq='M', periods=3)), + 'date2': Index(date_range('2013-01-1', periods=10)), + 'bdate': Index(bdate_range('2013-01-02', periods=10)), + } + + self.mi = { + 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), ('foo', 'two'), + ('qux', 'one'), ('qux', 'two')], names=['first', 'second']), + } + + def test_basic_index(self): + + for s, i in self.d.items(): + i_rec = self.encode_decode(i) + self.assertTrue(i.equals(i_rec)) + + # datetime with no freq (GH5506) + i = Index([Timestamp('20130101'),Timestamp('20130103')]) + i_rec = self.encode_decode(i) + self.assertTrue(i.equals(i_rec)) + + # datetime with timezone + i = Index([Timestamp('20130101 9:00:00'),Timestamp('20130103 11:00:00')]).tz_localize('US/Eastern') + i_rec = self.encode_decode(i) + self.assertTrue(i.equals(i_rec)) + + def test_multi_index(self): + + for s, i in self.mi.items(): + i_rec = self.encode_decode(i) + self.assertTrue(i.equals(i_rec)) + + def test_unicode(self): + i = tm.makeUnicodeIndex(100) + + # this currently fails + self.assertRaises(UnicodeEncodeError, self.encode_decode, i) + + #i_rec = self.encode_decode(i) + #self.assertTrue(i.equals(i_rec)) + + +class TestSeries(TestPackers): + + def setUp(self): + super(TestSeries, self).setUp() + + self.d = {} + + s = tm.makeStringSeries() + s.name = 'string' + self.d['string'] = s + + s = tm.makeObjectSeries() + s.name = 'object' + self.d['object'] = s + + s = Series(tslib.iNaT, dtype='M8[ns]', index=range(5)) + self.d['date'] = s + + data = { + 'A': [0., 1., 2., 3., np.nan], + 'B': [0, 1, 0, 1, 0], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': date_range('1/1/2009', periods=5), + 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], + } + + self.d['float'] = Series(data['A']) + self.d['int'] = Series(data['B']) + self.d['mixed'] = Series(data['E']) + + def test_basic(self): + + # run multiple times here + 
for n in range(10):
+            for s, i in self.d.items():
+                i_rec = self.encode_decode(i)
+                assert_series_equal(i, i_rec)
+
+
+class TestNDFrame(TestPackers):
+
+    def setUp(self):
+        super(TestNDFrame, self).setUp()
+
+        data = {
+            'A': [0., 1., 2., 3., np.nan],
+            'B': [0, 1, 0, 1, 0],
+            'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+            'D': date_range('1/1/2009', periods=5),
+            'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
+        }
+
+        self.frame = {
+            'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
+            'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
+            'mixed': DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']]))}
+
+        self.panel = {
+            'float': Panel(dict(ItemA=self.frame['float'], ItemB=self.frame['float'] + 1))}
+
+    def test_basic_frame(self):
+
+        for s, i in self.frame.items():
+            i_rec = self.encode_decode(i)
+            assert_frame_equal(i, i_rec)
+
+    def test_basic_panel(self):
+
+        for s, i in self.panel.items():
+            i_rec = self.encode_decode(i)
+            assert_panel_equal(i, i_rec)
+
+    def test_multi(self):
+
+        i_rec = self.encode_decode(self.frame)
+        for k in self.frame.keys():
+            assert_frame_equal(self.frame[k], i_rec[k])
+
+        l = tuple(
+            [self.frame['float'], self.frame['float'].A, self.frame['float'].B, None])
+        l_rec = self.encode_decode(l)
+        check_arbitrary(l, l_rec)
+
+        # this is an oddity in that packed lists will be returned as tuples
+        l = [self.frame['float'], self.frame['float']
+             .A, self.frame['float'].B, None]
+        l_rec = self.encode_decode(l)
+        self.assertIsInstance(l_rec, tuple)
+        check_arbitrary(l, l_rec)
+
+    def test_iterator(self):
+
+        l = [self.frame['float'], self.frame['float']
+             .A, self.frame['float'].B, None]
+
+        with ensure_clean(self.path) as path:
+            to_msgpack(path, *l)
+            for i, packed in enumerate(read_msgpack(path, iterator=True)):
+                check_arbitrary(packed, l[i])
+
+    def tests_datetimeindex_freq_issue(self):
+
+        # GH 5947
+        # inferring freq on the datetimeindex
+        df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013'))
+        result = self.encode_decode(df)
+        assert_frame_equal(result, df)
+
+        df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013'))
+        result = self.encode_decode(df)
+        assert_frame_equal(result, df)
+
+class TestSparse(TestPackers):
+
+    def _check_roundtrip(self, obj, comparator, **kwargs):
+
+        # currently these are not implemented
+        #i_rec = self.encode_decode(obj)
+        #comparator(obj, i_rec, **kwargs)
+        self.assertRaises(NotImplementedError, self.encode_decode, obj)
+
+    def test_sparse_series(self):
+
+        s = tm.makeStringSeries()
+        s[3:5] = np.nan
+        ss = s.to_sparse()
+        self._check_roundtrip(ss, tm.assert_series_equal,
+                              check_series_type=True)
+
+        ss2 = s.to_sparse(kind='integer')
+        self._check_roundtrip(ss2, tm.assert_series_equal,
+                              check_series_type=True)
+
+        ss3 = s.to_sparse(fill_value=0)
+        self._check_roundtrip(ss3, tm.assert_series_equal,
+                              check_series_type=True)
+
+    def test_sparse_frame(self):
+
+        s = tm.makeDataFrame()
+        s.ix[3:5, 1:3] = np.nan
+        s.ix[8:10, -2] = np.nan
+        ss = s.to_sparse()
+
+        self._check_roundtrip(ss, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+        ss2 = s.to_sparse(kind='integer')
+        self._check_roundtrip(ss2, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+        ss3 = s.to_sparse(fill_value=0)
+        self._check_roundtrip(ss3, tm.assert_frame_equal,
+                              check_frame_type=True)
+
+    def test_sparse_panel(self):
+
+        items = ['x', 'y', 'z']
+        p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
+        sp = p.to_sparse()
+
+        self._check_roundtrip(sp, tm.assert_panel_equal,
+ 
check_panel_type=True) + + sp2 = p.to_sparse(kind='integer') + self._check_roundtrip(sp2, tm.assert_panel_equal, + check_panel_type=True) + + sp3 = p.to_sparse(fill_value=0) + self._check_roundtrip(sp3, tm.assert_panel_equal, + check_panel_type=True) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py new file mode 100644 index 00000000..fd1febc3 --- /dev/null +++ b/pandas/io/tests/test_parsers.py @@ -0,0 +1,3569 @@ +# -*- coding: utf-8 -*- +# pylint: disable=E1101 + +from datetime import datetime +import csv +import os +import sys +import re +import nose +import platform + +from numpy import nan +import numpy as np +from pandas.io.common import DtypeWarning + +from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex +from pandas.compat import( + StringIO, BytesIO, PY3, range, long, lrange, lmap, u +) +from pandas.io.common import URLError +import pandas.io.parsers as parsers +from pandas.io.parsers import (read_csv, read_table, read_fwf, + TextFileReader, TextParser) + +import pandas.util.testing as tm +import pandas as pd + +from pandas.compat import parse_date +import pandas.lib as lib +from pandas import compat +from pandas.lib import Timestamp +from pandas.tseries.index import date_range +import pandas.tseries.tools as tools + +from numpy.testing.decorators import slow +from numpy.testing import assert_array_equal + +from pandas.parser import OverflowError, CParserError + + +class ParserTests(object): + """ + Want to be able to test either C+Cython or Python+Cython parsers + """ + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + def read_csv(self, *args, **kwargs): + raise NotImplementedError + + def read_table(self, *args, **kwargs): + raise NotImplementedError + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + self.dirpath = tm.get_data_path() + self.csv1 = os.path.join(self.dirpath, 'test1.csv') + self.csv2 = os.path.join(self.dirpath, 'test2.csv') + self.xls1 = os.path.join(self.dirpath, 'test.xls') + + def test_converters_type_must_be_dict(self): + with tm.assertRaisesRegexp(TypeError, 'Type converters.+'): + self.read_csv(StringIO(self.data1), converters=0) + + def test_multi_character_decimal_marker(self): + data = """A|B|C +1|2,334|5 +10|13|10. +""" + self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,') + + def test_empty_decimal_marker(self): + data = """A|B|C +1|2,334|5 +10|13|10. +""" + self.assertRaises(ValueError, read_csv, StringIO(data), decimal='') + + def test_empty_thousands_marker(self): + data = """A|B|C +1|2,334|5 +10|13|10. +""" + self.assertRaises(ValueError, read_csv, StringIO(data), thousands='') + + + def test_multi_character_decimal_marker(self): + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + self.assertRaises(ValueError, read_csv, StringIO(data), thousands=',,') + + def test_empty_string(self): + data = """\ +One,Two,Three +a,1,one +b,2,two +,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = self.read_csv(StringIO(data)) + xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}, + keep_default_na=False) + xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', 'nan', 'five', + '', 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = self.read_csv( + StringIO(data), na_values=['a'], keep_default_na=False) + xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', 'nan', 'five', '', + 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []}) + xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + + # GH4318, passing na_values=None and keep_default_na=False yields 'None' as a na_value + data = """\ +One,Two,Three +a,1,None +b,2,two +,3,None +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = self.read_csv( + StringIO(data), keep_default_na=False) + xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two': [1, 2, 3, 4, 5, 6, 7], + 'Three': ['None', 'two', 'None', 'nan', 'five', '', + 'seven']}) + tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + + def test_read_csv(self): + if not compat.PY3: + if 'win' in sys.platform: + prefix = u("file:///") + else: + prefix = u("file://") + fname = prefix + compat.text_type(self.csv1) + # it works! + df1 = read_csv(fname, index_col=0, parse_dates=True) + + def test_dialect(self): + data = """\ +label1,label2,label3 +index1,"a,c,e +index2,b,d,f +""" + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + df = self.read_csv(StringIO(data), dialect=dia) + + data = '''\ +label1,label2,label3 +index1,a,c,e +index2,b,d,f +''' + exp = self.read_csv(StringIO(data)) + exp.replace('a', '"a', inplace=True) + tm.assert_frame_equal(df, exp) + + def test_1000_sep(self): + data = """A|B|C +1|2,334|5 +10|13|10. +""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334, 13], + 'C': [5, 10.] + }) + + df = self.read_csv(StringIO(data), sep='|', thousands=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', thousands=',') + tm.assert_frame_equal(df, expected) + + def test_1000_sep_with_decimal(self): + data = """A|B|C +1|2,334.01|5 +10|13|10. +""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334.01, 13], + 'C': [5, 10.] 
+ }) + + tm.assert_equal(expected.A.dtype, 'int64') + tm.assert_equal(expected.B.dtype, 'float') + tm.assert_equal(expected.C.dtype, 'float') + + df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + data_with_odd_sep = """A|B|C +1|2.334,01|5 +10|13|10, +""" + df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + def test_separator_date_conflict(self): + # Regression test for issue #4678: make sure thousands separator and + # date parsing do not conflict. + data = '06-02-2013;13:00;1-000.215' + expected = DataFrame( + [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], + columns=['Date', 2] + ) + + df = self.read_csv(StringIO(data), sep=';', thousands='-', parse_dates={'Date': [0, 1]}, header=None) + tm.assert_frame_equal(df, expected) + + def test_squeeze(self): + data = """\ +a,1 +b,2 +c,3 +""" + expected = Series([1, 2, 3], ['a', 'b', 'c']) + result = self.read_table(StringIO(data), sep=',', index_col=0, + header=None, squeeze=True) + tm.assert_isinstance(result, Series) + tm.assert_series_equal(result, expected) + + def test_inf_parsing(self): + data = """\ +,A +a,inf +b,-inf +c,Inf +d,-Inf +e,INF +f,-INF +g,INf +h,-INf +i,inF +j,-inF""" + inf = float('inf') + expected = Series([inf, -inf] * 5) + df = read_csv(StringIO(data), index_col=0) + tm.assert_almost_equal(df['A'].values, expected.values) + df = read_csv(StringIO(data), index_col=0, na_filter=False) + tm.assert_almost_equal(df['A'].values, expected.values) + + def test_multiple_date_col(self): + # Can use multiple date parsers + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + def func(*date_cols): + return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) + + df = self.read_csv(StringIO(data), header=None, + date_parser=func, + prefix='X', + parse_dates={'nominal': [1, 2], + 'actual': [1, 3]}) + self.assertIn('nominal', df) + self.assertIn('actual', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) + + d = datetime(1999, 1, 27, 19, 0) + self.assertEqual(df.ix[0, 'nominal'], d) + + df = self.read_csv(StringIO(data), header=None, + date_parser=func, + parse_dates={'nominal': [1, 2], + 'actual': [1, 3]}, + keep_date_col=True) + self.assertIn('nominal', df) + self.assertIn('actual', df) + + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) + + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, 
-0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + df = read_csv(StringIO(data), header=None, + prefix='X', + parse_dates=[[1, 2], [1, 3]]) + + self.assertIn('X1_X2', df) + self.assertIn('X1_X3', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) + + d = datetime(1999, 1, 27, 19, 0) + self.assertEqual(df.ix[0, 'X1_X2'], d) + + df = read_csv(StringIO(data), header=None, + parse_dates=[[1, 2], [1, 3]], keep_date_col=True) + + self.assertIn('1_2', df) + self.assertIn('1_3', df) + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) + + data = '''\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' + df = self.read_csv(StringIO(data), sep=',', header=None, + parse_dates=[1], index_col=1) + d = datetime(1999, 1, 27, 19, 0) + self.assertEqual(df.index[0], d) + + def test_multiple_date_cols_int_cast(self): + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + date_spec = {'nominal': [1, 2], 'actual': [1, 3]} + import pandas.io.date_converters as conv + + # it works! + df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, + date_parser=conv.parse_date_time) + self.assertIn('nominal', df) + + def test_multiple_date_col_timestamp_parse(self): + data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 +05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" + result = self.read_csv(StringIO(data), sep=',', header=None, + parse_dates=[[0,1]], date_parser=Timestamp) + + ex_val = Timestamp('05/31/2012 15:30:00.029') + self.assertEqual(result['0_1'][0], ex_val) + + def test_single_line(self): + # GH 6607 + # Test currently only valid with python engine because sep=None and + # delim_whitespace=False. Temporarily copied to TestPythonParser. + # Test for ValueError with other engines: + + with tm.assertRaisesRegexp(ValueError, + 'sep=None with delim_whitespace=False'): + # sniff separator + buf = StringIO() + sys.stdout = buf + + # printing warning message when engine == 'c' for now + + try: + # it works! 
+ df = self.read_csv(StringIO('1,2'), names=['a', 'b'], + header=None, sep=None) + tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) + finally: + sys.stdout = sys.__stdout__ + + def test_multiple_date_cols_with_header(self): + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) + self.assertNotIsInstance(df.nominal[0], compat.string_types) + + ts_data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + def test_multiple_date_col_name_collision(self): + self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data), + parse_dates={'ID': [1, 2]}) + + data = """\ +date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + self.assertRaises(ValueError, self.read_csv, StringIO(data), + parse_dates=[[1, 2]]) + + def test_index_col_named(self): + no_header = """\ +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" + data = h + no_header + # import pdb; pdb.set_trace() + rs = self.read_csv(StringIO(data), index_col='ID') + xp = self.read_csv(StringIO(data), header=0).set_index('ID') + tm.assert_frame_equal(rs, xp) + + self.assertRaises(ValueError, self.read_csv, StringIO(no_header), + index_col='ID') + + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + names = ['a', 'b', 'c', 'd', 'message'] + xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11], + 'd': [4, 8, 12]}, + index=Index(['hello', 'world', 'foo'], name='message')) + rs = self.read_csv(StringIO(data), names=names, index_col=['message']) + tm.assert_frame_equal(xp, rs) + 
self.assertEqual(xp.index.name, rs.index.name) + + rs = self.read_csv(StringIO(data), names=names, index_col='message') + tm.assert_frame_equal(xp, rs) + self.assertEqual(xp.index.name, rs.index.name) + + def test_converter_index_col_bug(self): + # 1835 + data = "A;B\n1;2\n3;4" + + rs = self.read_csv(StringIO(data), sep=';', index_col='A', + converters={'A': lambda x: x}) + + xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A')) + tm.assert_frame_equal(rs, xp) + self.assertEqual(rs.index.name, xp.index.name) + + def test_date_parser_int_bug(self): + # #3071 + log_file = StringIO( + 'posix_timestamp,elapsed,sys,user,queries,query_time,rows,' + 'accountid,userid,contactid,level,silo,method\n' + '1343103150,0.062353,0,4,6,0.01690,3,' + '12345,1,-1,3,invoice_InvoiceResource,search\n' + ) + + def f(posix_string): + return datetime.utcfromtimestamp(int(posix_string)) + + # it works! + read_csv(log_file, index_col=0, parse_dates=0, date_parser=f) + + def test_multiple_skts_example(self): + data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." + pass + + def test_malformed(self): + # all + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + + try: + df = self.read_table( + StringIO(data), sep=',', header=1, comment='#') + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) + + # skip_footer + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + + # GH 6607 + # Test currently only valid with python engine because + # skip_footer != 0. Temporarily copied to TestPythonParser. + # Test for ValueError with other engines: + + try: + with tm.assertRaisesRegexp(ValueError, 'skip_footer'): #XXX + df = self.read_table( + StringIO(data), sep=',', header=1, comment='#', + skip_footer=1) + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) + + # first chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(5) + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + # middle chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', header=1, + comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(1) + it.read(2) + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + # last chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(1) + it.read() + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + def test_passing_dtype(self): + # GH 6607 + # Passing dtype is currently only supported by the C engine. + # Temporarily copied to TestCParser*. 
+ # Test for ValueError with other engines: + + with tm.assertRaisesRegexp(ValueError, + "The 'dtype' option is not supported"): + + df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # GH 3795 + # passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + + # we expect all object columns, so need to convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result,df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + index_col=0) + + # valid but we don't support it (date) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0, parse_dates=['B']) + + # valid but we don't support it + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + index_col=0) + + def test_quoting(self): + bad_line_small = """printer\tresult\tvariant_name +Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jacob +Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jakob +Klosterdruckerei\tKlosterdruckerei (1609-1805)\t"Furststiftische Hofdruckerei, (1609-1805)\tGaller, Alois +Klosterdruckerei\tKlosterdruckerei (1609-1805)\tHochfurstliche Buchhandlung """ + self.assertRaises(Exception, self.read_table, StringIO(bad_line_small), + sep='\t') + + good_line_small = bad_line_small + '"' + df = self.read_table(StringIO(good_line_small), sep='\t') + self.assertEqual(len(df), 3) + + def test_non_string_na_values(self): + # GH3611, na_values that are not a string are an issue + with tm.ensure_clean('__non_string_na_values__.csv') as path: + df = DataFrame({'A' : [-999, 2, 3], 'B' : [1.2, -999, 4.5]}) + df.to_csv(path, sep=' ', index=False) + result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999']) + result2 = read_csv(path, sep= ' ', header=0, na_values=[-999,-999.0]) + result3 = read_csv(path, sep= ' ', header=0, na_values=[-999.0,-999]) + tm.assert_frame_equal(result1,result2) + tm.assert_frame_equal(result2,result3) + + result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0']) + result5 = read_csv(path, sep= ' ', header=0, na_values=['-999']) + result6 = read_csv(path, sep= ' ', header=0, na_values=[-999.0]) + result7 = read_csv(path, sep= ' ', header=0, na_values=[-999]) + tm.assert_frame_equal(result4,result3) + tm.assert_frame_equal(result5,result3) + tm.assert_frame_equal(result6,result3) + tm.assert_frame_equal(result7,result3) + + good_compare = result3 + + # with an odd float format, so we can't match the string 999.0 exactly, + # but need float matching + df.to_csv(path, sep=' ', index=False, float_format = '%.3f') + result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999']) + result2 = read_csv(path, sep= ' ', header=0, na_values=[-999,-999.0]) + result3 = read_csv(path, sep= ' ', header=0, na_values=[-999.0,-999]) + tm.assert_frame_equal(result1,good_compare) + tm.assert_frame_equal(result2,good_compare) + tm.assert_frame_equal(result3,good_compare) + + result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0']) + result5 = read_csv(path, sep= ' ', header=0, na_values=['-999']) + result6 = read_csv(path, sep= ' ', header=0, 
na_values=[-999.0]) + result7 = read_csv(path, sep= ' ', header=0, na_values=[-999]) + tm.assert_frame_equal(result4,good_compare) + tm.assert_frame_equal(result5,good_compare) + tm.assert_frame_equal(result6,good_compare) + tm.assert_frame_equal(result7,good_compare) + + def test_default_na_values(self): + _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', + '#N/A','N/A', 'NA', '#NA', 'NULL', 'NaN', + 'nan', '-NaN', '-nan', '#N/A N/A','']) + assert_array_equal (_NA_VALUES, parsers._NA_VALUES) + nv = len(_NA_VALUES) + def f(i, v): + if i == 0: + buf = '' + elif i > 0: + buf = ''.join([','] * i) + + buf = "{0}{1}".format(buf,v) + + if i < nv-1: + buf = "{0}{1}".format(buf,''.join([','] * (nv-i-1))) + + return buf + + data = StringIO('\n'.join([ f(i, v) for i, v in enumerate(_NA_VALUES) ])) + + expected = DataFrame(np.nan,columns=range(nv),index=range(nv)) + df = self.read_csv(data, header=None) + tm.assert_frame_equal(df, expected) + + def test_custom_na_values(self): + data = """A,B,C +ignore,this,row +1,NA,3 +-1.#IND,5,baz +7,8,NaN +""" + expected = [[1., nan, 3], + [nan, 5, nan], + [7, 8, nan]] + + df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) + tm.assert_almost_equal(df.values, expected) + + df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'], + skiprows=[1]) + tm.assert_almost_equal(df2.values, expected) + + df3 = self.read_table(StringIO(data), sep=',', na_values='baz', + skiprows=[1]) + tm.assert_almost_equal(df3.values, expected) + + def test_nat_parse(self): + + # GH 3062 + df = DataFrame(dict({ + 'A' : np.asarray(lrange(10),dtype='float64'), + 'B' : pd.Timestamp('20010101') })) + df.iloc[3:6,:] = np.nan + + with tm.ensure_clean('__nat_parse_.csv') as path: + df.to_csv(path) + result = read_csv(path,index_col=0,parse_dates=['B']) + tm.assert_frame_equal(result,df) + + expected = Series(dict( A = 'float64',B = 'datetime64[ns]')) + tm.assert_series_equal(expected,result.dtypes) + + # test with NaT for the nan_rep + # we don't have a method to specif the Datetime na_rep (it defaults to '') + df.to_csv(path) + result = read_csv(path,index_col=0,parse_dates=['B']) + tm.assert_frame_equal(result,df) + + def test_skiprows_bug(self): + # GH #505 + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = self.read_csv(StringIO(text), skiprows=lrange(6), header=None, + index_col=0, parse_dates=True) + + data2 = self.read_csv(StringIO(text), skiprows=6, header=None, + index_col=0, parse_dates=True) + + expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), + columns=[1, 2, 3], + index=[datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)]) + expected.index.name = 0 + tm.assert_frame_equal(data, expected) + tm.assert_frame_equal(data, data2) + + def test_deep_skiprows(self): + # GH #4382 + text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in range(10)]) + condensed_text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]) + data = self.read_csv(StringIO(text), skiprows=[6, 8]) + condensed_data = self.read_csv(StringIO(condensed_text)) + tm.assert_frame_equal(data, condensed_data) + + def test_detect_string_na(self): + data = """A,B +foo,bar +NA,baz +NaN,nan +""" + expected = [['foo', 'bar'], + [nan, 'baz'], + [nan, nan]] + + df = self.read_csv(StringIO(data)) + tm.assert_almost_equal(df.values, expected) + + def test_unnamed_columns(self): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + expected = [[1, 2, 3, 4, 5.], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]] + df = self.read_table(StringIO(data), sep=',') + tm.assert_almost_equal(df.values, expected) + self.assert_numpy_array_equal(df.columns, + ['A', 'B', 'C', 'Unnamed: 3', + 'Unnamed: 4']) + + def test_string_nas(self): + data = """A,B,C +a,b,c +d,,f +,g,h +""" + result = self.read_csv(StringIO(data)) + expected = DataFrame([['a', 'b', 'c'], + ['d', np.nan, 'f'], + [np.nan, 'g', 'h']], + columns=['A', 'B', 'C']) + + tm.assert_frame_equal(result, expected) + + def test_duplicate_columns(self): + for engine in ['python', 'c']: + data = """A,A,B,B,B + 1,2,3,4,5 + 6,7,8,9,10 + 11,12,13,14,15 + """ + # check default beahviour + df = self.read_table(StringIO(data), sep=',',engine=engine) + self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) + + df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=False) + self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B']) + + df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=True) + self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) + + def test_csv_mixed_type(self): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + df = self.read_csv(StringIO(data)) + # TODO + + def test_csv_custom_parser(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + f = lambda x: datetime.strptime(x, '%Y%m%d') + df = self.read_csv(StringIO(data), date_parser=f) + expected = self.read_csv(StringIO(data), parse_dates=True) + tm.assert_frame_equal(df, expected) + + def test_parse_dates_implicit_first_col(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + df = self.read_csv(StringIO(data), parse_dates=True) + expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True) + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) + tm.assert_frame_equal(df, expected) + + def test_parse_dates_string(self): + data = """date,A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + rs = self.read_csv( + StringIO(data), index_col='date', parse_dates='date') + idx = date_range('1/1/2009', periods=3) + idx.name = 'date' + xp = DataFrame({'A': ['a', 'b', 'c'], + 'B': [1, 3, 4], + 'C': [2, 4, 5]}, idx) + tm.assert_frame_equal(rs, 
xp) + + def test_yy_format(self): + data = """date,time,B,C +090131,0010,1,2 +090228,1020,3,4 +090331,0830,5,6 +""" + rs = self.read_csv(StringIO(data), index_col=0, + parse_dates=[['date', 'time']]) + idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0)]).asobject + idx.name = 'date_time' + xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) + tm.assert_frame_equal(rs, xp) + + rs = self.read_csv(StringIO(data), index_col=0, + parse_dates=[[0, 1]]) + idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0)]).asobject + idx.name = 'date_time' + xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx) + tm.assert_frame_equal(rs, xp) + + def test_parse_dates_column_list(self): + from pandas.core.datetools import to_datetime + + data = '''date;destination;ventilationcode;unitcode;units;aux_date +01/01/2010;P;P;50;1;12/1/2011 +01/01/2010;P;R;50;1;13/1/2011 +15/01/2010;P;P;50;1;14/1/2011 +01/05/2010;P;P;50;1;15/1/2011''' + + expected = self.read_csv(StringIO(data), sep=";", index_col=lrange(4)) + + lev = expected.index.levels[0] + levels = list(expected.index.levels) + levels[0] = lev.to_datetime(dayfirst=True) + # hack to get this to work - remove for final test + levels[0].name = lev.name + expected.index.set_levels(levels, inplace=True) + expected['aux_date'] = to_datetime(expected['aux_date'], + dayfirst=True) + expected['aux_date'] = lmap(Timestamp, expected['aux_date']) + tm.assert_isinstance(expected['aux_date'][0], datetime) + + df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), + parse_dates=[0, 5], dayfirst=True) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data), sep=";", index_col=lrange(4), + parse_dates=['date', 'aux_date'], dayfirst=True) + tm.assert_frame_equal(df, expected) + + def test_no_header(self): + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df = self.read_table(StringIO(data), sep=',', header=None) + df_pref = self.read_table(StringIO(data), sep=',', prefix='X', + header=None) + + names = ['foo', 'bar', 'baz', 'quux', 'panda'] + df2 = self.read_table(StringIO(data), sep=',', names=names) + expected = [[1, 2, 3, 4, 5.], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]] + tm.assert_almost_equal(df.values, expected) + tm.assert_almost_equal(df.values, df2.values) + + self.assert_numpy_array_equal(df_pref.columns, + ['X0', 'X1', 'X2', 'X3', 'X4']) + self.assert_numpy_array_equal(df.columns, lrange(5)) + + self.assert_numpy_array_equal(df2.columns, names) + + def test_no_header_prefix(self): + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', + header=None) + + expected = [[1, 2, 3, 4, 5.], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]] + tm.assert_almost_equal(df_pref.values, expected) + + self.assert_numpy_array_equal(df_pref.columns, + ['Field0', 'Field1', 'Field2', 'Field3', 'Field4']) + + def test_header_with_index_col(self): + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ['A', 'B', 'C'] + df = self.read_csv(StringIO(data), names=names) + + self.assertEqual(names, ['A', 'B', 'C']) + + values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + expected = DataFrame(values, index=['foo', 'bar', 'baz'], + columns=['A', 'B', 'C']) + tm.assert_frame_equal(df, expected) + + def test_read_csv_dataframe(self): + df = self.read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = self.read_table(self.csv1, sep=',', index_col=0, + 
parse_dates=True) + self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) + self.assertEqual(df.index.name, 'index') + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) + self.assertEqual(df.values.dtype, np.float64) + tm.assert_frame_equal(df, df2) + + def test_read_csv_no_index_name(self): + df = self.read_csv(self.csv2, index_col=0, parse_dates=True) + df2 = self.read_table(self.csv2, sep=',', index_col=0, + parse_dates=True) + self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) + self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64) + tm.assert_frame_equal(df, df2) + + def test_read_table_unicode(self): + fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8')) + df1 = read_table(fin, sep=";", encoding="utf-8", header=None) + tm.assert_isinstance(df1[0].values[0], compat.text_type) + + def test_read_table_wrong_num_columns(self): + # too few! + data = """A,B,C,D,E,F +1,2,3,4,5,6 +6,7,8,9,10,11,12 +11,12,13,14,15,16 +""" + self.assertRaises(Exception, self.read_csv, StringIO(data)) + + def test_read_table_duplicate_index(self): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + + result = self.read_csv(StringIO(data), index_col=0) + expected = self.read_csv(StringIO(data)).set_index('index', + verify_integrity=False) + tm.assert_frame_equal(result, expected) + + def test_read_table_duplicate_index_implicit(self): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + + # it works! + result = self.read_csv(StringIO(data)) + + def test_parse_bools(self): + data = """A,B +True,1 +False,2 +True,3 +""" + data = self.read_csv(StringIO(data)) + self.assertEqual(data['A'].dtype, np.bool_) + + data = """A,B +YES,1 +no,2 +yes,3 +No,3 +Yes,3 +""" + data = self.read_csv(StringIO(data), + true_values=['yes', 'Yes', 'YES'], + false_values=['no', 'NO', 'No']) + self.assertEqual(data['A'].dtype, np.bool_) + + data = """A,B +TRUE,1 +FALSE,2 +TRUE,3 +""" + data = self.read_csv(StringIO(data)) + self.assertEqual(data['A'].dtype, np.bool_) + + data = """A,B +foo,bar +bar,foo""" + result = self.read_csv(StringIO(data), true_values=['foo'], + false_values=['bar']) + expected = DataFrame({'A': [True, False], 'B': [False, True]}) + tm.assert_frame_equal(result, expected) + + def test_int_conversion(self): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + data = self.read_csv(StringIO(data)) + self.assertEqual(data['A'].dtype, np.float64) + self.assertEqual(data['B'].dtype, np.int64) + + def test_infer_index_col(self): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + data = self.read_csv(StringIO(data)) + self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) + + def test_read_nrows(self): + df = self.read_csv(StringIO(self.data1), nrows=3) + expected = self.read_csv(StringIO(self.data1))[:3] + tm.assert_frame_equal(df, expected) + + def test_read_chunksize(self): + reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2) + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + def test_read_chunksize_named(self): + reader = self.read_csv( + StringIO(self.data1), index_col='index', chunksize=2) + df = self.read_csv(StringIO(self.data1), 
index_col='index') + + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + def test_get_chunk_passed_chunksize(self): + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + result = self.read_csv(StringIO(data), chunksize=2) + + piece = result.get_chunk() + self.assertEqual(len(piece), 2) + + def test_read_text_list(self): + data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" + as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', + '4', '5', '6']] + df = self.read_csv(StringIO(data), index_col=0) + + parser = TextParser(as_list, index_col=0, chunksize=2) + chunk = parser.read(None) + + tm.assert_frame_equal(chunk, df) + + def test_iterator(self): + # GH 6607 + # Test currently only valid with python engine because + # skip_footer != 0. Temporarily copied to TestPythonParser. + # Test for ValueError with other engines: + + with tm.assertRaisesRegexp(ValueError, 'skip_footer'): + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True) + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunk = reader.read(3) + tm.assert_frame_equal(chunk, df[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, df[3:]) + + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) + + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + # pass skiprows + parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[1:3]) + + # test bad parameter (skip_footer) + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True, skip_footer=True) + self.assertRaises(ValueError, reader.read, 3) + + treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + tm.assert_isinstance(treader, TextFileReader) + + # stopping iteration when on chunksize is specified, GH 3967 + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + reader = self.read_csv(StringIO(data), iterator=True) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + tm.assert_frame_equal(result[0], expected) + + # chunksize = 1 + reader = self.read_csv(StringIO(data), chunksize=1) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + self.assertEqual(len(result), 3) + tm.assert_frame_equal(pd.concat(result), expected) + + def test_header_not_first_line(self): + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + df = self.read_csv(StringIO(data), header=2, index_col=0) + expected = self.read_csv(StringIO(data2), header=0, index_col=0) + tm.assert_frame_equal(df, expected) + + def test_header_multi_index(self): + expected = tm.makeCustomDataframe(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + + df = 
self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + tm.assert_frame_equal(df, expected) + + # skipping lines in the header + df = self.read_csv(StringIO(data), header=[0, 2, 3, 4], index_col=[0, 1], tupleize_cols=False) + tm.assert_frame_equal(df, expected) + + #### invalid options #### + + # no as_recarray + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], as_recarray=True, tupleize_cols=False) + + # names + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], names=['foo','bar'], tupleize_cols=False) + # usecols + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], + index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False) + # non-numeric index_col + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], + index_col=['foo','bar'], tupleize_cols=False) + + def test_header_multiindex_common_format(self): + + df = DataFrame([[1,2,3,4,5,6],[7,8,9,10,11,12]], + index=['one','two'], + columns=MultiIndex.from_tuples([('a','q'),('a','r'),('a','s'), + ('b','t'),('c','u'),('c','v')])) + + # to_csv + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +,,,,,, +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(df,result) + + # common + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(df,result) + + # common, no index_col + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=None) + tm.assert_frame_equal(df.reset_index(drop=True),result) + + # malformed case 1 + expected = DataFrame(np.array([[2, 3, 4, 5, 6], + [8, 9, 10, 11, 12]], dtype='int64'), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[u('a'), u('q')])) + + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(expected,result) + + # malformed case 2 + expected = DataFrame(np.array([[2, 3, 4, 5, 6], + [8, 9, 10, 11, 12]], dtype='int64'), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=0) + tm.assert_frame_equal(expected,result) + + # mi on columns and index (malformed) + expected = DataFrame(np.array([[ 3, 4, 5, 6], + [ 9, 10, 11, 12]], dtype='int64'), + index=MultiIndex(levels=[[1, 7], [2, 8]], + labels=[[0, 1], [0, 1]]), + columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('s'), u('t'), u('u'), u('v')]], + labels=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, u('q')])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = self.read_csv(StringIO(data),header=[0,1],index_col=[0, 1]) + tm.assert_frame_equal(expected,result) + + def test_pass_names_with_index(self): + lines = self.data1.split('\n') + no_header = '\n'.join(lines[1:]) + + # regular index + names = ['index', 'A', 'B', 'C', 'D'] + df = self.read_csv(StringIO(no_header), index_col=0, names=names) + 
expected = self.read_csv(StringIO(self.data1), index_col=0) + tm.assert_frame_equal(df, expected) + + # multi index + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['index1', 'index2', 'A', 'B', 'C', 'D'] + df = self.read_csv(StringIO(no_header), index_col=[0, 1], + names=names) + expected = self.read_csv(StringIO(data), index_col=[0, 1]) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data), index_col=['index1', 'index2']) + tm.assert_frame_equal(df, expected) + + def test_multi_index_no_level_names(self): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + data2 = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['A', 'B', 'C', 'D'] + + df = self.read_csv(StringIO(no_header), index_col=[0, 1], + header=None, names=names) + expected = self.read_csv(StringIO(data), index_col=[0, 1]) + tm.assert_frame_equal(df, expected, check_names=False) + + # 2 implicit first cols + df2 = self.read_csv(StringIO(data2)) + tm.assert_frame_equal(df2, df) + + # reverse order of index + df = self.read_csv(StringIO(no_header), index_col=[1, 0], names=names, + header=None) + expected = self.read_csv(StringIO(data), index_col=[1, 0]) + tm.assert_frame_equal(df, expected, check_names=False) + + def test_multi_index_parse_dates(self): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 +20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) + self.assertIsInstance(df.index.levels[0][0], + (datetime, np.datetime64, Timestamp)) + + # specify columns out of order! + df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True) + self.assertIsInstance(df2.index.levels[1][0], + (datetime, np.datetime64, Timestamp)) + + def test_skip_footer(self): + # GH 6607 + # Test currently only valid with python engine because + # skip_footer != 0. Temporarily copied to TestPythonParser. 
+ # Test for ValueError with other engines: + + with tm.assertRaisesRegexp(ValueError, 'skip_footer'): + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +""" + result = self.read_csv(StringIO(data), skip_footer=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) + + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), nrows=3) + tm.assert_frame_equal(result, expected) + + # skipfooter alias + result = read_csv(StringIO(data), skipfooter=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = read_csv(StringIO(no_footer)) + + tm.assert_frame_equal(result, expected) + + def test_no_unnamed_index(self): + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + df = self.read_table(StringIO(data), sep=' ') + self.assertIsNone(df.index.name) + + def test_converters(self): + data = """A,B,C,D +a,1,2,01/01/2009 +b,3,4,01/02/2009 +c,4,5,01/03/2009 +""" + from pandas.compat import parse_date + + result = self.read_csv(StringIO(data), converters={'D': parse_date}) + result2 = self.read_csv(StringIO(data), converters={3: parse_date}) + + expected = self.read_csv(StringIO(data)) + expected['D'] = expected['D'].map(parse_date) + + tm.assert_isinstance(result['D'][0], (datetime, Timestamp)) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + # produce integer + converter = lambda x: int(x.split('/')[2]) + result = self.read_csv(StringIO(data), converters={'D': converter}) + expected = self.read_csv(StringIO(data)) + expected['D'] = expected['D'].map(converter) + tm.assert_frame_equal(result, expected) + + def test_converters_no_implicit_conv(self): + # GH2184 + data = """000102,1.2,A\n001245,2,B""" + f = lambda x: x.strip() + converter = {0: f} + df = self.read_csv(StringIO(data), header=None, converters=converter) + self.assertEqual(df[0].dtype, object) + + def test_converters_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + f = lambda x: float(x.replace(",", ".")) + converter = {'Number1': f, 'Number2': f, 'Number3': f} + df2 = self.read_csv(StringIO(data), sep=';', converters=converter) + self.assertEqual(df2['Number1'].dtype, float) + self.assertEqual(df2['Number2'].dtype, float) + self.assertEqual(df2['Number3'].dtype, float) + + def test_converter_return_string_bug(self): + # GH #583 + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + f = lambda x: float(x.replace(",", ".")) + converter = {'Number1': f, 'Number2': f, 'Number3': f} + df2 = self.read_csv(StringIO(data), sep=';', converters=converter) + self.assertEqual(df2['Number1'].dtype, float) + + def test_read_table_buglet_4x_multiindex(self): + # GH 6607 + # Parsing multi-level index currently causes an error in the C parser. + # Temporarily copied to TestPythonParser. + # Here test that CParserError is raised: + + with tm.assertRaises(CParserError): + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + # it works! 
+ df = self.read_table(StringIO(text), sep='\s+') + self.assertEqual(df.index.names, ('one', 'two', 'three', 'four')) + + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows(self): + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # this should ignore the first four lines (including comments) + df = self.read_csv(StringIO(data), comment='#', skiprows=4) + tm.assert_almost_equal(df.values, expected) + + def test_comment_header(self): + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # header should begin at the second non-comment line + df = self.read_csv(StringIO(data), comment='#', header=1) + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows_header(self): + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # skiprows should skip the first 4 lines (including comments), while + # header should start from the second non-commented line starting + # with line 5 + df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + tm.assert_almost_equal(df.values, expected) + + def test_read_csv_parse_simple_list(self): + text = """foo +bar baz +qux foo +foo +bar""" + df = read_csv(StringIO(text), header=None) + expected = DataFrame({0: ['foo', 'bar baz', 'qux foo', + 'foo', 'bar']}) + tm.assert_frame_equal(df, expected) + + def test_parse_dates_custom_euroformat(self): + text = """foo,bar,baz +31/01/2010,1,2 +01/02/2010,1,NA +02/02/2010,1,2 +""" + parser = lambda d: parse_date(d, dayfirst=True) + df = self.read_csv(StringIO(text), + names=['time', 'Q', 'NTU'], header=0, + index_col=0, parse_dates=True, + date_parser=parser, na_values=['NA']) + + exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), + datetime(2010, 2, 2)], name='time') + expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]}, + index=exp_index, columns=['Q', 'NTU']) + tm.assert_frame_equal(df, expected) + + parser = lambda d: parse_date(d, day_first=True) + self.assertRaises(Exception, self.read_csv, + StringIO(text), skiprows=[0], + names=['time', 'Q', 'NTU'], index_col=0, + parse_dates=True, date_parser=parser, + na_values=['NA']) + + def test_na_value_dict(self): + data = """A,B,C +foo,bar,NA +bar,foo,foo +foo,bar,NA +bar,foo,foo""" + + df = self.read_csv(StringIO(data), + na_values={'A': ['foo'], 'B': ['bar']}) + expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'], + 'B': [np.nan, 'foo', np.nan, 'foo'], + 'C': [np.nan, 'foo', np.nan, 'foo']}) + tm.assert_frame_equal(df, expected) + + data = """\ +a,b,c,d +0,NA,1,5 +""" + xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0]) + xp.index.name = 'a' + df = self.read_csv(StringIO(data), na_values={}, index_col=0) + tm.assert_frame_equal(df, xp) + + xp = DataFrame({'b': [np.nan], 'd': [5]}, + MultiIndex.from_tuples([(0, 1)])) + xp.index.names = ['a', 'c'] + df = self.read_csv(StringIO(data), na_values={}, index_col=[0, 2]) + tm.assert_frame_equal(df, xp) + + xp = DataFrame({'b': [np.nan], 'd': [5]}, + MultiIndex.from_tuples([(0, 1)])) + xp.index.names = ['a', 'c'] + df = self.read_csv(StringIO(data), 
na_values={}, index_col=['a', 'c']) + tm.assert_frame_equal(df, xp) + + @tm.network + def test_url(self): + # HTTP(S) + url = ('https://raw.github.com/pydata/pandas/master/' + 'pandas/io/tests/data/salary.table') + url_table = self.read_table(url) + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'salary.table') + local_table = self.read_table(localtable) + tm.assert_frame_equal(url_table, local_table) + # TODO: ftp testing + + @slow + def test_file(self): + + # FILE + if sys.version_info[:2] < (2, 6): + raise nose.SkipTest("file:// not supported with Python < 2.6") + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'salary.table') + local_table = self.read_table(localtable) + + try: + url_table = self.read_table('file://localhost/' + localtable) + except URLError: + # fails on some systems + raise nose.SkipTest("failing on %s" % + ' '.join(platform.uname()).strip()) + + tm.assert_frame_equal(url_table, local_table) + + def test_parse_tz_aware(self): + import pytz + # #1693 + data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5") + + # it works + result = read_csv(data, index_col=0, parse_dates=True) + stamp = result.index[0] + self.assertEqual(stamp.minute, 39) + try: + self.assertIs(result.index.tz, pytz.utc) + except AssertionError: # hello Yaroslav + arr = result.index.to_pydatetime() + result = tools.to_datetime(arr, utc=True)[0] + self.assertEqual(stamp.minute, result.minute) + self.assertEqual(stamp.hour, result.hour) + self.assertEqual(stamp.day, result.day) + + def test_multiple_date_cols_index(self): + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) + df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, + index_col='nominal') + tm.assert_frame_equal(xp.set_index('nominal'), df) + df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, + index_col=0) + tm.assert_frame_equal(df2, df) + + df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0) + tm.assert_frame_equal(df3, df, check_names=False) + + def test_multiple_date_cols_chunked(self): + df = self.read_csv(StringIO(self.ts_data), parse_dates={ + 'nominal': [1, 2]}, index_col='nominal') + reader = self.read_csv(StringIO(self.ts_data), parse_dates={'nominal': + [1, 2]}, index_col='nominal', chunksize=2) + + chunks = list(reader) + + self.assertNotIn('nominalTime', df) + + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + def test_multiple_date_col_named_components(self): + xp = self.read_csv(StringIO(self.ts_data), + parse_dates={'nominal': [1, 2]}, + index_col='nominal') + colspec = {'nominal': ['date', 'nominalTime']} + df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec, + index_col='nominal') + tm.assert_frame_equal(df, xp) + + def test_multiple_date_col_multiple_index(self): + df = self.read_csv(StringIO(self.ts_data), + parse_dates={'nominal': [1, 2]}, + index_col=['nominal', 'ID']) + + xp = 
self.read_csv(StringIO(self.ts_data), + parse_dates={'nominal': [1, 2]}) + + tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df) + + def test_comment(self): + data = """A,B,C +1,2.,4.#hello world +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + df = self.read_table(StringIO(data), sep=',', comment='#', + na_values=['NaN']) + tm.assert_almost_equal(df.values, expected) + + def test_bool_na_values(self): + data = """A,B,C +True,False,True +NA,True,False +False,NA,True""" + + result = self.read_csv(StringIO(data)) + expected = DataFrame({'A': np.array([True, nan, False], dtype=object), + 'B': np.array([False, True, nan], dtype=object), + 'C': [True, False, True]}) + + tm.assert_frame_equal(result, expected) + + def test_nonexistent_path(self): + # don't segfault pls #2428 + path = '%s.csv' % tm.rands(10) + self.assertRaises(Exception, self.read_csv, path) + + def test_missing_trailing_delimiters(self): + data = """A,B,C,D +1,2,3,4 +1,3,3, +1,4,5""" + result = self.read_csv(StringIO(data)) + self.assertTrue(result['D'].isnull()[1:].all()) + + def test_skipinitialspace(self): + s = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' + '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' + '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' + '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' + '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') + + sfile = StringIO(s) + # it's 33 columns + result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'], + header=None, skipinitialspace=True) + self.assertTrue(pd.isnull(result.ix[0, 29])) + + def test_utf16_bom_skiprows(self): + # #2298 + data = u("""skip this +skip this too +A\tB\tC +1\t2\t3 +4\t5\t6""") + + data2 = u("""skip this +skip this too +A,B,C +1,2,3 +4,5,6""") + + path = '__%s__.csv' % tm.rands(10) + + with tm.ensure_clean(path) as path: + for sep, dat in [('\t', data), (',', data2)]: + for enc in ['utf-16', 'utf-16le', 'utf-16be']: + bytes = dat.encode(enc) + with open(path, 'wb') as f: + f.write(bytes) + + s = BytesIO(dat.encode('utf-8')) + if compat.PY3: + # somewhat False since the code never sees bytes + from io import TextIOWrapper + s = TextIOWrapper(s, encoding='utf-8') + + result = self.read_csv(path, encoding=enc, skiprows=2, + sep=sep) + expected = self.read_csv(s, encoding='utf-8', skiprows=2, + sep=sep) + + tm.assert_frame_equal(result, expected) + + def test_utf16_example(self): + path = tm.get_data_path('utf16_ex.txt') + + # it works! 
and is the right length + result = self.read_table(path, encoding='utf-16') + self.assertEqual(len(result), 50) + + if not compat.PY3: + buf = BytesIO(open(path, 'rb').read()) + result = self.read_table(buf, encoding='utf-16') + self.assertEqual(len(result), 50) + + def test_converters_corner_with_nas(self): + # skip aberration observed on Win64 Python 3.2.2 + if hash(np.int64(-1)) != -2: + raise nose.SkipTest("skipping because of windows hash on Python" + " 3.2.2") + + csv = """id,score,days +1,2,12 +2,2-5, +3,,14+ +4,6-12,2""" + + def convert_days(x): + x = x.strip() + if not x: + return np.nan + + is_plus = x.endswith('+') + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + return x + + def convert_days_sentinel(x): + x = x.strip() + if not x: + return np.nan + + is_plus = x.endswith('+') + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + return x + + def convert_score(x): + x = x.strip() + if not x: + return np.nan + if x.find('-') > 0: + valmin, valmax = lmap(int, x.split('-')) + val = 0.5 * (valmin + valmax) + else: + val = float(x) + + return val + + fh = StringIO(csv) + result = self.read_csv(fh, converters={'score': convert_score, + 'days': convert_days}, + na_values=['', None]) + self.assertTrue(pd.isnull(result['days'][1])) + + fh = StringIO(csv) + result2 = self.read_csv(fh, converters={'score': convert_score, + 'days': convert_days_sentinel}, + na_values=['', None]) + tm.assert_frame_equal(result, result2) + + def test_unicode_encoding(self): + pth = tm.get_data_path('unicode_series.csv') + + result = self.read_csv(pth, header=None, encoding='latin-1') + result = result.set_index(0) + + got = result[1][1632] + expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)') + + self.assertEqual(got, expected) + + def test_trailing_delimiters(self): + # #2442. 
grumble grumble + data = """A,B,C +1,2,3, +4,5,6, +7,8,9,""" + result = self.read_csv(StringIO(data), index_col=False) + + expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8], + 'C': [3, 6, 9]}) + + tm.assert_frame_equal(result, expected) + + def test_escapechar(self): + # http://stackoverflow.com/questions/13824840/feature-request-for- + # pandas-read-csv + data = '''SEARCH_TERM,ACTUAL_URL +"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' + + result = self.read_csv(StringIO(data), escapechar='\\', + quotechar='"', encoding='utf-8') + self.assertEqual(result['SEARCH_TERM'][2], + 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie') + self.assertTrue(np.array_equal(result.columns, + ['SEARCH_TERM', 'ACTUAL_URL'])) + + def test_header_names_backward_compat(self): + # #2539 + data = '1,2,3\n4,5,6' + + result = self.read_csv(StringIO(data), names=['a', 'b', 'c']) + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + tm.assert_frame_equal(result, expected) + + data2 = 'foo,bar,baz\n' + data + result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'], + header=0) + tm.assert_frame_equal(result, expected) + + def test_int64_min_issues(self): + # #2599 + data = 'A,B\n0,0\n0,' + + result = self.read_csv(StringIO(data)) + expected = DataFrame({'A': [0, 0], 'B': [0, np.nan]}) + + tm.assert_frame_equal(result, expected) + + def test_parse_integers_above_fp_precision(self): + data = """Numbers +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000194""" + + result = self.read_csv(StringIO(data)) + expected = DataFrame({'Numbers': [17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194]}) + + self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers'])) + + def test_usecols_index_col_conflict(self): + # Issue 4201 Test that index_col as integer reflects usecols + data = """SecId,Time,Price,P2,P3 +10000,2013-5-11,100,10,1 +500,2013-5-12,101,11,1 +""" + expected = DataFrame({'Price': [100, 101]}, index=[datetime(2013, 5, 11), datetime(2013, 5, 12)]) + expected.index.name = 'Time' + + df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0) + tm.assert_frame_equal(expected, df) + + df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time') + tm.assert_frame_equal(expected, df) + + df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time') + tm.assert_frame_equal(expected, df) + + df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0) + tm.assert_frame_equal(expected, df) + + expected = DataFrame({'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) + expected = expected.set_index(['Price', 'P2']) + df = self.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + 
tm.assert_frame_equal(expected, df) + + def test_chunks_have_consistent_numerical_type(self): + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) + + with tm.assert_produces_warning(False): + df = self.read_csv(StringIO(data)) + self.assertTrue(type(df.a[0]) is np.float64) # Assert that types were coerced. + self.assertEqual(df.a.dtype, np.float) + + def test_warn_if_chunks_have_mismatched_type(self): + # See test in TestCParserLowMemory. + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) + + with tm.assert_produces_warning(False): + df = self.read_csv(StringIO(data)) + self.assertEqual(df.a.dtype, np.object) + + def test_usecols(self): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + + result = self.read_csv(StringIO(data), usecols=(1, 2)) + result2 = self.read_csv(StringIO(data), usecols=('b', 'c')) + exp = self.read_csv(StringIO(data)) + + self.assertEqual(len(result.columns), 2) + self.assertTrue((result['b'] == exp['b']).all()) + self.assertTrue((result['c'] == exp['c']).all()) + + tm.assert_frame_equal(result, result2) + + result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, + names=['foo', 'bar']) + expected = self.read_csv(StringIO(data), usecols=[1, 2]) + expected.columns = ['foo', 'bar'] + tm.assert_frame_equal(result, expected) + + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + result = self.read_csv(StringIO(data), names=['b', 'c'], + header=None, usecols=[1, 2]) + + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + expected = expected[['b', 'c']] + tm.assert_frame_equal(result, expected) + + result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None, usecols=['b', 'c']) + tm.assert_frame_equal(result2, result) + + + # 5766 + result = self.read_csv(StringIO(data), names=['a', 'b'], + header=None, usecols=[0, 1]) + + expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], + header=None) + expected = expected[['a', 'b']] + tm.assert_frame_equal(result, expected) + + # length conflict, passed names and usecols disagree + self.assertRaises(ValueError, self.read_csv, StringIO(data), + names=['a', 'b'], usecols=[1], header=None) + + def test_integer_overflow_bug(self): + # #2601 + data = "65248E10 11\n55555E55 22\n" + + result = self.read_csv(StringIO(data), header=None, sep=' ') + self.assertTrue(result[0].dtype == np.float64) + + result = self.read_csv(StringIO(data), header=None, sep='\s+') + self.assertTrue(result[0].dtype == np.float64) + + def test_catch_too_many_names(self): + # Issue 5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd']) + + def test_ignore_leading_whitespace(self): + # GH 6607, GH 3374 + data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' + result = self.read_table(StringIO(data), sep='\s+') + expected = DataFrame({'a':[1,4,7], 'b':[2,5,8], 'c': [3,6,9]}) + tm.assert_frame_equal(result, expected) + + def test_nrows_and_chunksize_raises_notimplemented(self): + data = 'a b c' + self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), + nrows=10, chunksize=5) + + +class TestPythonParser(ParserTests, tm.TestCase): + def test_negative_skipfooter_raises(self): + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + + with tm.assertRaisesRegexp(ValueError, + 'skip footer cannot be negative'): + df = self.read_csv(StringIO(text), skipfooter=-1) + + def read_csv(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = 'python' + return read_csv(*args, **kwds) + + def read_table(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = 'python' + return read_table(*args, **kwds) + + def test_sniff_delimiter(self): + text = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data = self.read_csv(StringIO(text), index_col=0, sep=None) + self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) + + data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|') + tm.assert_frame_equal(data, data2) + + text = """ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data3 = self.read_csv(StringIO(text), index_col=0, + sep=None, skiprows=2) + tm.assert_frame_equal(data, data3) + + text = u("""ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""").encode('utf-8') + + s = BytesIO(text) + if compat.PY3: + # somewhat False since the code never sees bytes + from io import TextIOWrapper + s = TextIOWrapper(s, encoding='utf-8') + + data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2, + encoding='utf-8') + tm.assert_frame_equal(data, data4) + + def test_regex_separator(self): + data = """ A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""" + df = self.read_table(StringIO(data), sep='\s+') + expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)), + index_col=0) + self.assertIsNone(expected.index.name) + tm.assert_frame_equal(df, expected) + + def test_1000_fwf(self): + data = """ + 1 2,334.0 5 +10 13 10. +""" + expected = [[1, 2334., 5], + [10, 13, 10]] + df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)], + thousands=',') + tm.assert_almost_equal(df.values, expected) + + def test_1000_sep_with_decimal(self): + data = """A|B|C +1|2,334.01|5 +10|13|10. +""" + + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334.01, 13], + 'C': [5, 10.] + }) + + df = self.read_csv(StringIO(data), sep='|', thousands=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', thousands=',') + tm.assert_frame_equal(df, expected) + + def test_comment_fwf(self): + data = """ + 1 2. 
4 #hello world + 5 NaN 10.0 +""" + expected = [[1, 2., 4], + [5, np.nan, 10.]] + df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)], + comment='#') + tm.assert_almost_equal(df.values, expected) + + def test_fwf(self): + data_expected = """\ +2011,58,360.242940,149.910199,11950.7 +2011,59,444.953632,166.985655,11788.4 +2011,60,364.136849,183.628767,11806.2 +2011,61,413.836124,184.375703,11916.8 +2011,62,502.953953,173.237159,12468.3 +""" + expected = self.read_csv(StringIO(data_expected), header=None) + + data1 = """\ +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + df = read_fwf(StringIO(data1), colspecs=colspecs, header=None) + tm.assert_frame_equal(df, expected) + + data2 = """\ +2011 58 360.242940 149.910199 11950.7 +2011 59 444.953632 166.985655 11788.4 +2011 60 364.136849 183.628767 11806.2 +2011 61 413.836124 184.375703 11916.8 +2011 62 502.953953 173.237159 12468.3 +""" + df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None) + tm.assert_frame_equal(df, expected) + + # From Thomas Kluyver: apparently some non-space filler characters can + # be seen, this is supported by specifying the 'delimiter' character: + # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html + data3 = """\ +201158~~~~360.242940~~~149.910199~~~11950.7 +201159~~~~444.953632~~~166.985655~~~11788.4 +201160~~~~364.136849~~~183.628767~~~11806.2 +201161~~~~413.836124~~~184.375703~~~11916.8 +201162~~~~502.953953~~~173.237159~~~12468.3 +""" + df = read_fwf( + StringIO(data3), colspecs=colspecs, delimiter='~', header=None) + tm.assert_frame_equal(df, expected) + + with tm.assertRaisesRegexp(ValueError, "must specify only one of"): + read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7]) + + with tm.assertRaisesRegexp(ValueError, "Must specify either"): + read_fwf(StringIO(data3), colspecs=None, widths=None) + + def test_fwf_colspecs_is_list_or_tuple(self): + with tm.assertRaisesRegexp(TypeError, + 'column specifications must be a list or ' + 'tuple.+'): + pd.io.parsers.FixedWidthReader(StringIO(self.data1), + {'a': 1}, ',', '#') + + def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self): + with tm.assertRaisesRegexp(TypeError, + 'Each column specification must be.+'): + read_fwf(StringIO(self.data1), [('a', 1)]) + + def test_fwf_colspecs_None(self): + # GH 7079 + data = """\ +123456 +456789 +""" + colspecs = [(0, 3), (3, None)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123, 456], [456, 789]]) + tm.assert_frame_equal(result, expected) + + colspecs = [(None, 3), (3, 6)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123, 456], [456, 789]]) + tm.assert_frame_equal(result, expected) + + colspecs = [(0, None), (3, None)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123456, 456], [456789, 789]]) + tm.assert_frame_equal(result, expected) + + colspecs = [(None, None), (3, 6)] + result = read_fwf(StringIO(data), colspecs=colspecs, header=None) + expected = DataFrame([[123456, 456], [456789, 789]]) + tm.assert_frame_equal(result, expected) + + + def test_fwf_regression(self): + # GH 3594 + #### turns out 'T060' is parsable as a datetime slice! 
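+        # fixed-width layout below: one 16-char timestamp field plus 8-char value
+        # fields named 'SST', 'T010', ..., 'T100'; the lambda date_parser decodes
+        # the %Y%j%H%M%S stamps and index_col=0 makes them the DatetimeIndex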
+ + tzlist = [1,10,20,30,60,80,100] + ntz = len(tzlist) + tcolspecs = [16]+[8]*ntz + tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]] + data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 + 2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 + 2009164204000 9.5873 9.1326 8.4694 7.5889 6.0422 5.8526 5.4657 + 2009164205000 9.5810 9.0896 8.4009 7.4652 6.0322 5.8189 5.4379 + 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 +""" + + df = read_fwf(StringIO(data), + index_col=0, + header=None, + names=tcolnames, + widths=tcolspecs, + parse_dates=True, + date_parser=lambda s: datetime.strptime(s,'%Y%j%H%M%S')) + + for c in df.columns: + res = df.loc[:,c] + self.assertTrue(len(res)) + + def test_fwf_compression(self): + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest("Need gzip and bz2 to run this test") + + data = """1111111111 + 2222222222 + 3333333333""".strip() + widths = [5, 5] + names = ['one', 'two'] + expected = read_fwf(StringIO(data), widths=widths, names=names) + if compat.PY3: + data = bytes(data, encoding='utf-8') + comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)] + for comp_name, compresser in comps: + with tm.ensure_clean() as path: + tmp = compresser(path, mode='wb') + tmp.write(data) + tmp.close() + result = read_fwf(path, widths=widths, names=names, + compression=comp_name) + tm.assert_frame_equal(result, expected) + + def test_BytesIO_input(self): + if not compat.PY3: + raise nose.SkipTest("Bytes-related test - only needs to work on Python 3") + result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[2,2], encoding='utf8') + expected = pd.DataFrame([["של", "ום"]], columns=["של", "ום"]) + tm.assert_frame_equal(result, expected) + data = BytesIO("שלום::1234\n562::123".encode('cp1255')) + result = pd.read_table(data, sep="::", engine='python', + encoding='cp1255') + expected = pd.DataFrame([[562, 123]], columns=["שלום","1234"]) + tm.assert_frame_equal(result, expected) + + def test_verbose_import(self): + text = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + buf = StringIO() + sys.stdout = buf + + try: + # it works! + df = self.read_csv(StringIO(text), verbose=True) + self.assertEqual(buf.getvalue(), 'Filled 3 NA values in column a\n') + finally: + sys.stdout = sys.__stdout__ + + buf = StringIO() + sys.stdout = buf + + text = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + try: + # it works! 
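+            # [editor's note, illustrative only, commented out]: verbose=True makes the
+            # python engine report how many NA values were filled per column on stdout,
+            # which is why sys.stdout is swapped for a StringIO buffer; hypothetical sketch:
+            #   buf = StringIO(); sys.stdout = buf
+            #   read_csv(StringIO("a,b\none,1\n,2\n"), verbose=True, engine='python')
+            #   sys.stdout = sys.__stdout__
+            #   assert 'Filled 1 NA values in column a' in buf.getvalue()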
+ df = self.read_csv(StringIO(text), verbose=True, index_col=0) + self.assertEqual(buf.getvalue(), 'Filled 1 NA values in column a\n') + finally: + sys.stdout = sys.__stdout__ + + def test_iteration_open_handle(self): + if PY3: + raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info)) + + with tm.ensure_clean() as path: + with open(path, 'wb') as f: + f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG') + + with open(path, 'rb') as f: + for line in f: + if 'CCC' in line: + break + + try: + read_table(f, squeeze=True, header=None, engine='c') + except Exception: + pass + else: + raise ValueError('this should not happen') + + result = read_table(f, squeeze=True, header=None, + engine='python') + + expected = Series(['DDD', 'EEE', 'FFF', 'GGG']) + tm.assert_series_equal(result, expected) + + def test_iterator(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True) + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunk = reader.read(3) + tm.assert_frame_equal(chunk, df[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, df[3:]) + + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) + + df = self.read_csv(StringIO(self.data1), index_col=0) + + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[:2]) + tm.assert_frame_equal(chunks[1], df[2:4]) + tm.assert_frame_equal(chunks[2], df[4:]) + + # pass skiprows + parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) + chunks = list(parser) + tm.assert_frame_equal(chunks[0], df[1:3]) + + # test bad parameter (skip_footer) + reader = self.read_csv(StringIO(self.data1), index_col=0, + iterator=True, skip_footer=True) + self.assertRaises(ValueError, reader.read, 3) + + treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + tm.assert_isinstance(treader, TextFileReader) + + # stopping iteration when on chunksize is specified, GH 3967 + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + reader = self.read_csv(StringIO(data), iterator=True) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + tm.assert_frame_equal(result[0], expected) + + # chunksize = 1 + reader = self.read_csv(StringIO(data), chunksize=1) + result = list(reader) + expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + self.assertEqual(len(result), 3) + tm.assert_frame_equal(pd.concat(result), expected) + + def test_single_line(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + # sniff separator + buf = StringIO() + sys.stdout = buf + + # printing warning message when engine == 'c' for now + + try: + # it works! 
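+            # [editor's note, illustrative only, commented out]: with sep=None the parser
+            # sniffs the delimiter from the data itself, so even the single line '1,2' is
+            # split into two columns; hypothetical standalone call:
+            #   read_csv(StringIO('1,2'), names=['a', 'b'], header=None, sep=None, engine='python')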
+ df = self.read_csv(StringIO('1,2'), names=['a', 'b'], + header=None, sep=None) + tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) + finally: + sys.stdout = sys.__stdout__ + + def test_malformed(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + # all + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + + try: + df = self.read_table( + StringIO(data), sep=',', header=1, comment='#') + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) + + # skip_footer + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + + try: + df = self.read_table( + StringIO(data), sep=',', header=1, comment='#', + skip_footer=1) + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) + + # first chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(5) + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + # middle chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', header=1, + comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(1) + it.read(2) + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + # last chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = self.read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.read(1) + it.read() + self.assertTrue(False) + except Exception as inst: + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) + + def test_skip_footer(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with the C parser is fixed + + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +""" + result = self.read_csv(StringIO(data), skip_footer=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) + + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), nrows=3) + tm.assert_frame_equal(result, expected) + + # skipfooter alias + result = self.read_csv(StringIO(data), skipfooter=2) + no_footer = '\n'.join(data.split('\n')[:-3]) + expected = self.read_csv(StringIO(no_footer)) + + tm.assert_frame_equal(result, expected) + + def test_decompression_regex_sep(self): + # GH 6607 + # This is a copy which should eventually be moved to ParserTests + # when the issue with the C parser is fixed + + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest('need gzip and bz2 to run') + + data = open(self.csv1, 'rb').read() + data = data.replace(b',', b'::') + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, sep='::', compression='gzip') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, sep='::', compression='bz2') + 
tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + def test_read_table_buglet_4x_multiindex(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the issue with multi-level index is fixed in the C parser. + + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + # it works! + df = self.read_table(StringIO(text), sep='\s+') + self.assertEqual(df.index.names, ('one', 'two', 'three', 'four')) + + # GH 6893 + data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9' + expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)], + columns=list('abcABC'), index=list('abc')) + actual = self.read_table(StringIO(data), sep='\s+') + tm.assert_frame_equal(actual, expected) + +class TestFwfColspaceSniffing(tm.TestCase): + def test_full_file(self): + # File with all values + test = '''index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar +2000-01-05T00:00:00 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0.487094399463 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz +2000-01-11T00:00:00 0.157160753327 34 foo''' + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_full_file_with_missing(self): + # File with missing values + test = '''index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar + 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz + 34''' + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_full_file_with_spaces(self): + # File with spaces in columns + test = ''' +Account Name Balance CreditLimit AccountCreated +101 Keanu Reeves 9315.45 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 Jennifer Love Hewitt 0 17000.00 5/25/1985 +761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 5000.00 2/5/2007 +'''.strip('\r\n') + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_full_file_with_spaces_and_missing(self): + # File with spaces and missing values in columsn + test = ''' +Account Name Balance CreditLimit AccountCreated +101 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 5/25/1985 +761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 +'''.strip('\r\n') + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_messed_up_data(self): + # Completely messed up file + test = ''' + Account Name Balance Credit Limit Account Created + 101 10000.00 1/17/1998 + 312 Gerard Butler 90.00 1000.00 + + 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 + 317 Bill Murray 789.65 +'''.strip('\r\n') + colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, 
read_fwf(StringIO(test))) + + def test_multiple_delimiters(self): + test = r''' +col1~~~~~col2 col3++++++++++++++++++col4 +~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves + 33+++122.33\\\bar.........Gerard Butler +++44~~~~12.01 baz~~Jennifer Love Hewitt +~~55 11+++foo++++Jada Pinkett-Smith +..66++++++.03~~~bar Bill Murray +'''.strip('\r\n') + colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) + expected = read_fwf(StringIO(test), colspecs=colspecs, + delimiter=' +~.\\') + tm.assert_frame_equal(expected, read_fwf(StringIO(test), + delimiter=' +~.\\')) + + def test_variable_width_unicode(self): + if not compat.PY3: + raise nose.SkipTest('Bytes-related test - only needs to work on Python 3') + test = ''' +שלום שלום +ום שלל +של ום +'''.strip('\r\n') + expected = pd.read_fwf(BytesIO(test.encode('utf8')), + colspecs=[(0, 4), (5, 9)], header=None, encoding='utf8') + tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')), + header=None, encoding='utf8')) + + +class TestCParserHighMemory(ParserTests, tm.TestCase): + + def read_csv(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = 'c' + kwds['low_memory'] = False + return read_csv(*args, **kwds) + + def read_table(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = 'c' + kwds['low_memory'] = False + return read_table(*args, **kwds) + + def test_compact_ints(self): + data = ('0,1,0,0\n' + '1,1,0,0\n' + '0,1,0,1') + + result = read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + result = read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + def test_parse_dates_empty_string(self): + # #2263 + s = StringIO("Date, test\n2012-01-01, 1\n,2") + result = self.read_csv(s, parse_dates=["Date"], na_filter=False) + self.assertTrue(result['Date'].isnull()[1]) + + def test_usecols(self): + raise nose.SkipTest("Usecols is not supported in C High Memory engine.") + + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows(self): + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # this should ignore the first four lines (including comments) + df = self.read_csv(StringIO(data), comment='#', skiprows=4) + tm.assert_almost_equal(df.values, expected) + + def test_comment_header(self): + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # header should begin at the second non-comment line + df = self.read_csv(StringIO(data), comment='#', header=1) + tm.assert_almost_equal(df.values, expected) + + def test_comment_skiprows_header(self): + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. 
+5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + # skiprows should skip the first 4 lines (including comments), while + # header should start from the second non-commented line starting + # with line 5 + df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + tm.assert_almost_equal(df.values, expected) + + def test_passing_dtype(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the dtype argument is supported by all engines. + + df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # GH 3795 + # passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + + # we expect all object columns, so need to convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result,df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + index_col=0) + + # valid but we don't support it (date) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0, parse_dates=['B']) + + # valid but we don't support it + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + index_col=0) + + def test_fallback_to_python(self): + # GH 6607 + data = 'a b c\n1 2 3' + + # specify C engine with unsupported options (raise) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep=None, + delim_whitespace=False) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep='\s') + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', skip_footer=1) + + +class TestCParserLowMemory(ParserTests, tm.TestCase): + + def read_csv(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = 'c' + kwds['low_memory'] = True + kwds['buffer_lines'] = 2 + return read_csv(*args, **kwds) + + def read_table(self, *args, **kwds): + kwds = kwds.copy() + kwds['engine'] = 'c' + kwds['low_memory'] = True + kwds['buffer_lines'] = 2 + return read_table(*args, **kwds) + + def test_compact_ints(self): + data = ('0,1,0,0\n' + '1,1,0,0\n' + '0,1,0,1') + + result = read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + result = read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + def test_pass_dtype(self): + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}, + as_recarray=True) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'S1') + + def test_usecols_dtypes(self): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + result = self.read_csv(StringIO(data), usecols=(0, 1, 2), + names=('a', 'b', 'c'), + header=None, + converters={'a': str}, + dtype={'b': int, 'c': float}, + ) + result2 = 
self.read_csv(StringIO(data), usecols=(0, 2), + names=('a', 'b', 'c'), + header=None, + converters={'a': str}, + dtype={'b': int, 'c': float}, + ) + self.assertTrue((result.dtypes == [object, np.int, np.float]).all()) + self.assertTrue((result2.dtypes == [object, np.float]).all()) + + def test_usecols_implicit_index_col(self): + # #2654 + data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + + result = self.read_csv(StringIO(data), usecols=['a', 'b']) + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_with_whitespace(self): + data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' + + result = self.read_csv(StringIO(data), delim_whitespace=True, + usecols=('a', 'b')) + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_regex_sep(self): + # #2733 + data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' + + df = self.read_csv(StringIO(data), sep='\s+', usecols=('a', 'b')) + + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + tm.assert_frame_equal(df, expected) + + def test_pure_python_failover(self): + data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" + + result = self.read_csv(StringIO(data), comment='#') + expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) + tm.assert_frame_equal(result, expected) + + def test_decompression(self): + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest('need gzip and bz2 to run') + + data = open(self.csv1, 'rb').read() + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, compression='gzip') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='gzip') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, compression='bz2') + tm.assert_frame_equal(result, expected) + + # result = self.read_csv(open(path, 'rb'), compression='bz2') + # tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + def test_decompression_regex_sep(self): + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest('need gzip and bz2 to run') + + data = open(self.csv1, 'rb').read() + data = data.replace(b',', b'::') + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + # GH 6607 + # Test currently only valid with the python engine because of + # regex sep. Temporarily copied to TestPythonParser. + # Here test for ValueError when passing regex sep: + + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='gzip') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + # GH 6607 + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='bz2') + tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + def test_memory_map(self): + # it works! 
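+        # [editor's note, illustrative only, commented out]: memory_map=True memory-maps
+        # the on-disk file instead of streaming it through a Python buffer, which is why
+        # the test below passes a real file path (self.csv1); hypothetical call:
+        #   read_csv('large_file.csv', memory_map=True)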
+ result = self.read_csv(self.csv1, memory_map=True) + + def test_disable_bool_parsing(self): + # #2090 + + data = """A,B,C +Yes,No,Yes +No,Yes,Yes +Yes,,Yes +No,No,No""" + + result = read_csv(StringIO(data), dtype=object) + self.assertTrue((result.dtypes == object).all()) + + result = read_csv(StringIO(data), dtype=object, na_filter=False) + self.assertEqual(result['B'][2], '') + + def test_int64_overflow(self): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + + result = read_csv(StringIO(data)) + self.assertTrue(result['ID'].dtype == object) + + self.assertRaises(OverflowError, read_csv, StringIO(data), + dtype='i8') + + def test_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + df2 = self.read_csv(StringIO(data), sep=';', decimal=',') + self.assertEqual(df2['Number1'].dtype, float) + self.assertEqual(df2['Number2'].dtype, float) + self.assertEqual(df2['Number3'].dtype, float) + + def test_custom_lineterminator(self): + data = 'a,b,c~1,2,3~4,5,6' + + result = self.read_csv(StringIO(data), lineterminator='~') + expected = self.read_csv(StringIO(data.replace('~', '\n'))) + + tm.assert_frame_equal(result, expected) + + data2 = data.replace('~', '~~') + result = self.assertRaises(ValueError, read_csv, StringIO(data2), + lineterminator='~~') + + def test_raise_on_passed_int_dtype_with_nas(self): + # #2631 + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + self.assertRaises(Exception, read_csv, StringIO(data), sep=",", + skipinitialspace=True, + dtype={'DOY': np.int64}) + + def test_na_trailing_columns(self): + data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax +2012-03-14,USD,AAPL,BUY,1000 +2012-05-12,USD,SBUX,SELL,500""" + + result = self.read_csv(StringIO(data)) + self.assertEqual(result['Date'][1], '2012-05-12') + self.assertTrue(result['UnitPrice'].isnull().all()) + + def test_parse_ragged_csv(self): + data = """1,2,3 +1,2,3,4 +1,2,3,4,5 +1,2 +1,2,3,4""" + + nice_data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + result = self.read_csv(StringIO(data), header=None, + names=['a', 'b', 'c', 'd', 'e']) + + expected = self.read_csv(StringIO(nice_data), header=None, + names=['a', 'b', 'c', 'd', 'e']) + + tm.assert_frame_equal(result, expected) + + # too many columns, cause segfault if not careful + data = "1,2\n3,4,5" + + result = self.read_csv(StringIO(data), header=None, + names=lrange(50)) + expected = self.read_csv(StringIO(data), header=None, + names=lrange(3)).reindex(columns=lrange(50)) + + tm.assert_frame_equal(result, expected) + + def test_tokenize_CR_with_quoting(self): + # #3453, this doesn't work with Python parser for some reason + + data = ' a,b,c\r"a,b","e,d","f,f"' + + result = self.read_csv(StringIO(data), header=None) + expected = self.read_csv(StringIO(data.replace('\r', '\n')), + header=None) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data)) + expected = self.read_csv(StringIO(data.replace('\r', '\n'))) + tm.assert_frame_equal(result, expected) + + def test_raise_on_no_columns(self): + # single newline + data = "\n" + self.assertRaises(ValueError, self.read_csv, StringIO(data)) + + # test with more than a single newline + data = "\n\n\n" + self.assertRaises(ValueError, 
self.read_csv, StringIO(data)) + + def test_warn_if_chunks_have_mismatched_type(self): + # Issue #3866 If chunks are different types and can't + # be coerced using numerical types, then issue warning. + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) + + with tm.assert_produces_warning(DtypeWarning): + df = self.read_csv(StringIO(data)) + self.assertEqual(df.a.dtype, np.object) + + def test_invalid_c_parser_opts_with_not_c_parser(self): + from pandas.io.parsers import _c_parser_defaults as c_defaults + + data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + + engines = 'python', 'python-fwf' + for default in c_defaults: + for engine in engines: + kwargs = {default: object()} + with tm.assertRaisesRegexp(ValueError, + 'The %r option is not supported ' + 'with the %r engine' % (default, + engine)): + read_csv(StringIO(data), engine=engine, **kwargs) + + def test_passing_dtype(self): + # GH 6607 + # This is a copy which should eventually be merged into ParserTests + # when the dtype argument is supported by all engines. + + df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # GH 3795 + # passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + + # we expect all object columns, so need to convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result,df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + index_col=0) + + # valid but we don't support it (date) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0) + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + index_col=0, parse_dates=['B']) + + # valid but we don't support it + self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + index_col=0) + + def test_fallback_to_python(self): + # GH 6607 + data = 'a b c\n1 2 3' + + # specify C engine with C-unsupported options (raise) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep=None, + delim_whitespace=False) + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', sep='\s') + with tm.assertRaisesRegexp(ValueError, 'does not support'): + self.read_table(StringIO(data), engine='c', skip_footer=1) + + def test_raise_on_sep_with_delim_whitespace(self): + # GH 6607 + data = 'a b c\n1 2 3' + with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): + self.read_table(StringIO(data), sep='\s', delim_whitespace=True) + + +class TestMiscellaneous(tm.TestCase): + + # for tests that don't fit into any of the other classes, e.g. 
those that + # compare results for different engines or test the behavior when 'engine' + # is not passed + + def test_compare_whitespace_regex(self): + # GH 6607 + data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' + result_c = pd.read_table(StringIO(data), sep='\s+', engine='c') + result_py = pd.read_table(StringIO(data), sep='\s+', engine='python') + tm.assert_frame_equal(result_c, result_py) + + def test_fallback_to_python(self): + # GH 6607 + data = 'a b c\n1 2 3' + + # specify C-unsupported options with python-unsupported option + # (options will be ignored on fallback, raise) + with tm.assertRaisesRegexp(ValueError, 'Falling back'): + pd.read_table(StringIO(data), sep=None, + delim_whitespace=False, dtype={'a': float}) + with tm.assertRaisesRegexp(ValueError, 'Falling back'): + pd.read_table(StringIO(data), sep='\s', dtype={'a': float}) + with tm.assertRaisesRegexp(ValueError, 'Falling back'): + pd.read_table(StringIO(data), skip_footer=1, dtype={'a': float}) + + # specify C-unsupported options without python-unsupported options + with tm.assert_produces_warning(parsers.ParserWarning): + pd.read_table(StringIO(data), sep=None, delim_whitespace=False) + with tm.assert_produces_warning(parsers.ParserWarning): + pd.read_table(StringIO(data), sep='\s') + with tm.assert_produces_warning(parsers.ParserWarning): + pd.read_table(StringIO(data), skip_footer=1) + + +class TestParseSQL(tm.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u('1.5'), None, u('3'), u('4.2')], + dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], + dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + assert_same_values_and_dtype(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_longs(self): + arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + + arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + assert_same_values_and_dtype(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = 
lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + + +class TestS3(tm.TestCase): + def setUp(self): + try: + import boto + except ImportError: + raise nose.SkipTest("boto not installed") + + if compat.PY3: + raise nose.SkipTest("boto incompatible with Python 3") + + @tm.network + def test_parse_public_s3_bucket(self): + import nose.tools as nt + df = pd.read_csv('s3://nyqpug/tips.csv') + nt.assert_true(isinstance(df, pd.DataFrame)) + nt.assert_false(df.empty) + tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df) + + @tm.network + def test_s3_fails(self): + import boto + with tm.assertRaisesRegexp(boto.exception.S3ResponseError, + 'S3ResponseError: 404 Not Found'): + pd.read_csv('s3://nyqpug/asdf.csv') + + with tm.assertRaisesRegexp(boto.exception.S3ResponseError, + 'S3ResponseError: 403 Forbidden'): + pd.read_csv('s3://cant_get_it/tips.csv') + + +def assert_same_values_and_dtype(res, exp): + tm.assert_equal(res.dtype, exp.dtype) + tm.assert_almost_equal(res, exp) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py new file mode 100644 index 00000000..c52a405f --- /dev/null +++ b/pandas/io/tests/test_pickle.py @@ -0,0 +1,101 @@ +# pylint: disable=E1101,E1103,W0232 + +""" manage legacy pickle tests """ + +from datetime import datetime, timedelta +import operator +import pickle as pkl +import nose +import os + +import numpy as np +import pandas.util.testing as tm +import pandas as pd +from pandas import Index +from pandas.sparse.tests import test_sparse +from pandas import compat +from pandas.compat import u +from pandas.util.misc import is_little_endian +import pandas + +def _read_pickle(vf, encoding=None, compat=False): + from pandas.compat import pickle_compat as pc + with open(vf,'rb') as fh: + pc.load(fh, encoding=encoding, compat=compat) + +class TestPickle(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + from pandas.io.tests.generate_legacy_pickles import create_data + self.data = create_data() + self.path = u('__%s__.pickle' % tm.rands(10)) + + def compare_element(self, typ, result, expected): + if isinstance(expected,Index): + self.assertTrue(expected.equals(result)) + return + + if typ.startswith('sp_'): + comparator = getattr(test_sparse,"assert_%s_equal" % typ) + comparator(result,expected,exact_indices=False) + else: + comparator = getattr(tm,"assert_%s_equal" % typ) + comparator(result,expected) + + def compare(self, vf): + + # py3 compat when reading py2 pickle + try: + data = pandas.read_pickle(vf) + except (ValueError) as detail: + # trying to read a py3 pickle in py2 + return + + for typ, dv in data.items(): + for dt, result in dv.items(): + try: + expected = self.data[typ][dt] + except (KeyError): + continue + + self.compare_element(typ, result, expected) + + def read_pickles(self, version): + if not is_little_endian(): + raise nose.SkipTest("known failure on non-little endian") + + pth = tm.get_data_path('legacy_pickle/{0}'.format(str(version))) + for f in os.listdir(pth): + vf = os.path.join(pth,f) + 
self.compare(vf) + + def test_read_pickles_0_10_1(self): + self.read_pickles('0.10.1') + + def test_read_pickles_0_11_0(self): + self.read_pickles('0.11.0') + + def test_read_pickles_0_12_0(self): + self.read_pickles('0.12.0') + + def test_read_pickles_0_13_0(self): + self.read_pickles('0.13.0') + + def test_round_trip_current(self): + for typ, dv in self.data.items(): + + for dt, expected in dv.items(): + + with tm.ensure_clean(self.path) as path: + + pd.to_pickle(expected,path) + + result = pd.read_pickle(path) + self.compare_element(typ, result, expected) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py new file mode 100644 index 00000000..d0d1b025 --- /dev/null +++ b/pandas/io/tests/test_pytables.py @@ -0,0 +1,4364 @@ +import nose +import sys +import os +import warnings +import tempfile +from contextlib import contextmanager + +import datetime +import numpy as np + +import pandas +from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, + date_range, Index, DatetimeIndex, isnull) +from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf, + IncompatibilityWarning, PerformanceWarning, + AttributeConflictWarning, DuplicateWarning, + PossibleDataLossError, ClosedFileError) +from pandas.io import pytables as pytables +import pandas.util.testing as tm +from pandas.util.testing import (assert_panel4d_equal, + assert_panel_equal, + assert_frame_equal, + assert_series_equal) +from pandas import concat, Timestamp +from pandas import compat, _np_version_under1p7 +from pandas.compat import range, lrange, u +from pandas.util.testing import assert_produces_warning + +try: + import tables +except ImportError: + raise nose.SkipTest('no pytables') + +from distutils.version import LooseVersion + +_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ + and 'blosc' or 'zlib' + +_multiprocess_can_split_ = False + +# contextmanager to ensure the file cleanup +def safe_remove(path): + if path is not None: + try: + os.remove(path) + except: + pass + + +def safe_close(store): + try: + if store is not None: + store.close() + except: + pass + + +def create_tempfile(path): + """ create an unopened named temporary file """ + return os.path.join(tempfile.gettempdir(),path) + + +@contextmanager +def ensure_clean_store(path, mode='a', complevel=None, complib=None, + fletcher32=False): + + try: + + # put in the temporary path if we don't have one already + if not len(os.path.dirname(path)): + path = create_tempfile(path) + + store = HDFStore(path, mode=mode, complevel=complevel, + complib=complib, fletcher32=False) + yield store + finally: + safe_close(store) + if mode == 'w' or mode == 'a': + safe_remove(path) + + +@contextmanager +def ensure_clean_path(path): + """ + return essentially a named temporary file that is not opened + and deleted on existing; if path is a list, then create and + return list of filenames + """ + try: + if isinstance(path, list): + filenames = [ create_tempfile(p) for p in path ] + yield filenames + else: + filenames = [ create_tempfile(path) ] + yield filenames[0] + finally: + for f in filenames: + safe_remove(f) + + +# set these parameters so we don't have file sharing +tables.parameters.MAX_NUMEXPR_THREADS = 1 +tables.parameters.MAX_BLOSC_THREADS = 1 +tables.parameters.MAX_THREADS = 1 + +def _maybe_remove(store, key): + """For tests 
using tables, try removing the table to be sure there is + no content from previous tests using the same table name.""" + try: + store.remove(key) + except: + pass + + +def compat_assert_produces_warning(w,f): + """ don't produce a warning under PY3 """ + if compat.PY3: + f() + else: + with tm.assert_produces_warning(expected_warning=w): + f() + + +class TestHDFStore(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestHDFStore, cls).setUpClass() + + # Pytables 3.0.0 deprecates lots of things + tm.reset_testing_mode() + + @classmethod + def tearDownClass(cls): + super(TestHDFStore, cls).tearDownClass() + + # Pytables 3.0.0 deprecates lots of things + tm.set_testing_mode() + + def setUp(self): + warnings.filterwarnings(action='ignore', category=FutureWarning) + + self.path = 'tmp.__%s__.h5' % tm.rands(10) + + def tearDown(self): + pass + + def test_factory_fun(self): + try: + with get_store(self.path) as tbl: + raise ValueError('blah') + except ValueError: + pass + finally: + safe_remove(self.path) + + try: + with get_store(self.path) as tbl: + tbl['a'] = tm.makeDataFrame() + + with get_store(self.path) as tbl: + self.assertEqual(len(tbl), 1) + self.assertEqual(type(tbl['a']), DataFrame) + finally: + safe_remove(self.path) + + def test_conv_read_write(self): + + try: + + def roundtrip(key, obj,**kwargs): + obj.to_hdf(self.path, key,**kwargs) + return read_hdf(self.path, key) + + o = tm.makeTimeSeries() + assert_series_equal(o, roundtrip('series',o)) + + o = tm.makeStringSeries() + assert_series_equal(o, roundtrip('string_series',o)) + + o = tm.makeDataFrame() + assert_frame_equal(o, roundtrip('frame',o)) + + o = tm.makePanel() + assert_panel_equal(o, roundtrip('panel',o)) + + # table + df = DataFrame(dict(A=lrange(5), B=lrange(5))) + df.to_hdf(self.path,'table',append=True) + result = read_hdf(self.path, 'table', where = ['index>2']) + assert_frame_equal(df[df.index>2],result) + + finally: + safe_remove(self.path) + + def test_long_strings(self): + + # GH6166 + # unconversion of long strings was being chopped in earlier + # versions of numpy < 1.7.2 + df = DataFrame({'a': [tm.rands(100) for _ in range(10)]}, + index=[tm.rands(100) for _ in range(10)]) + + with ensure_clean_store(self.path) as store: + store.append('df', df, data_columns=['a']) + + result = store.select('df') + assert_frame_equal(df, result) + + + def test_api(self): + + # GH4584 + # API issue when to_hdf doesn't acdept append AND format args + with ensure_clean_path(self.path) as path: + + df = tm.makeDataFrame() + df.iloc[:10].to_hdf(path,'df',append=True,format='table') + df.iloc[10:].to_hdf(path,'df',append=True,format='table') + assert_frame_equal(read_hdf(path,'df'),df) + + # append to False + df.iloc[:10].to_hdf(path,'df',append=False,format='table') + df.iloc[10:].to_hdf(path,'df',append=True,format='table') + assert_frame_equal(read_hdf(path,'df'),df) + + with ensure_clean_path(self.path) as path: + + df = tm.makeDataFrame() + df.iloc[:10].to_hdf(path,'df',append=True) + df.iloc[10:].to_hdf(path,'df',append=True,format='table') + assert_frame_equal(read_hdf(path,'df'),df) + + # append to False + df.iloc[:10].to_hdf(path,'df',append=False,format='table') + df.iloc[10:].to_hdf(path,'df',append=True) + assert_frame_equal(read_hdf(path,'df'),df) + + with ensure_clean_path(self.path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path,'df',append=False,format='fixed') + assert_frame_equal(read_hdf(path,'df'),df) + + df.to_hdf(path,'df',append=False,format='f') + assert_frame_equal(read_hdf(path,'df'),df) + 
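+            # [editor's note, illustrative only, commented out]: format='fixed' (alias 'f')
+            # writes a non-appendable, non-queryable node, while format='table' supports
+            # append= and where= selection; minimal hypothetical sketch:
+            #   df.to_hdf('store.h5', 'k1', format='fixed')               # overwrite-only
+            #   df.to_hdf('store.h5', 'k2', format='table', append=True)  # appendable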
+ df.to_hdf(path,'df',append=False) + assert_frame_equal(read_hdf(path,'df'),df) + + df.to_hdf(path,'df') + assert_frame_equal(read_hdf(path,'df'),df) + + with ensure_clean_store(self.path) as store: + + path = store._path + df = tm.makeDataFrame() + + _maybe_remove(store,'df') + store.append('df',df.iloc[:10],append=True,format='table') + store.append('df',df.iloc[10:],append=True,format='table') + assert_frame_equal(store.select('df'),df) + + # append to False + _maybe_remove(store,'df') + store.append('df',df.iloc[:10],append=False,format='table') + store.append('df',df.iloc[10:],append=True,format='table') + assert_frame_equal(store.select('df'),df) + + # formats + _maybe_remove(store,'df') + store.append('df',df.iloc[:10],append=False,format='table') + store.append('df',df.iloc[10:],append=True,format='table') + assert_frame_equal(store.select('df'),df) + + _maybe_remove(store,'df') + store.append('df',df.iloc[:10],append=False,format='table') + store.append('df',df.iloc[10:],append=True,format=None) + assert_frame_equal(store.select('df'),df) + + with ensure_clean_path(self.path) as path: + + # invalid + df = tm.makeDataFrame() + self.assertRaises(ValueError, df.to_hdf, path,'df',append=True,format='f') + self.assertRaises(ValueError, df.to_hdf, path,'df',append=True,format='fixed') + + self.assertRaises(TypeError, df.to_hdf, path,'df',append=True,format='foo') + self.assertRaises(TypeError, df.to_hdf, path,'df',append=False,format='bar') + + def test_api_default_format(self): + + # default_format option + with ensure_clean_store(self.path) as store: + df = tm.makeDataFrame() + + pandas.set_option('io.hdf.default_format','fixed') + _maybe_remove(store,'df') + store.put('df',df) + self.assertFalse(store.get_storer('df').is_table) + self.assertRaises(ValueError, store.append, 'df2',df) + + pandas.set_option('io.hdf.default_format','table') + _maybe_remove(store,'df') + store.put('df',df) + self.assertTrue(store.get_storer('df').is_table) + _maybe_remove(store,'df2') + store.append('df2',df) + self.assertTrue(store.get_storer('df').is_table) + + pandas.set_option('io.hdf.default_format',None) + + with ensure_clean_path(self.path) as path: + + df = tm.makeDataFrame() + + pandas.set_option('io.hdf.default_format','fixed') + df.to_hdf(path,'df') + with get_store(path) as store: + self.assertFalse(store.get_storer('df').is_table) + self.assertRaises(ValueError, df.to_hdf, path,'df2', append=True) + + pandas.set_option('io.hdf.default_format','table') + df.to_hdf(path,'df3') + with get_store(path) as store: + self.assertTrue(store.get_storer('df3').is_table) + df.to_hdf(path,'df4',append=True) + with get_store(path) as store: + self.assertTrue(store.get_storer('df4').is_table) + + pandas.set_option('io.hdf.default_format',None) + + def test_keys(self): + + with ensure_clean_store(self.path) as store: + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeStringSeries() + store['c'] = tm.makeDataFrame() + store['d'] = tm.makePanel() + store['foo/bar'] = tm.makePanel() + self.assertEqual(len(store), 5) + self.assertTrue(set( + store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar'])) + + def test_repr(self): + + with ensure_clean_store(self.path) as store: + repr(store) + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeStringSeries() + store['c'] = tm.makeDataFrame() + store['d'] = tm.makePanel() + store['foo/bar'] = tm.makePanel() + store.append('e', tm.makePanel()) + + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = 
df['B'] > 0 + df['bool3'] = True + df['int1'] = 1 + df['int2'] = 2 + df['timestamp1'] = Timestamp('20010102') + df['timestamp2'] = Timestamp('20010103') + df['datetime1'] = datetime.datetime(2001,1,2,0,0) + df['datetime2'] = datetime.datetime(2001,1,3,0,0) + df.ix[3:6,['obj1']] = np.nan + df = df.consolidate().convert_objects() + + warnings.filterwarnings('ignore', category=PerformanceWarning) + store['df'] = df + warnings.filterwarnings('always', category=PerformanceWarning) + + # make a random group in hdf space + store._handle.createGroup(store._handle.root,'bah') + + repr(store) + str(store) + + # storers + with ensure_clean_store(self.path) as store: + + df = tm.makeDataFrame() + store.append('df',df) + + s = store.get_storer('df') + repr(s) + str(s) + + def test_contains(self): + + with ensure_clean_store(self.path) as store: + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeDataFrame() + store['foo/bar'] = tm.makeDataFrame() + self.assertIn('a', store) + self.assertIn('b', store) + self.assertNotIn('c', store) + self.assertIn('foo/bar', store) + self.assertIn('/foo/bar', store) + self.assertNotIn('/foo/b', store) + self.assertNotIn('bar', store) + + # GH 2694 + warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + store['node())'] = tm.makeDataFrame() + self.assertIn('node())', store) + + def test_versioning(self): + + with ensure_clean_store(self.path) as store: + store['a'] = tm.makeTimeSeries() + store['b'] = tm.makeDataFrame() + df = tm.makeTimeDataFrame() + _maybe_remove(store, 'df1') + store.append('df1', df[:10]) + store.append('df1', df[10:]) + self.assertEqual(store.root.a._v_attrs.pandas_version, '0.10.1') + self.assertEqual(store.root.b._v_attrs.pandas_version, '0.10.1') + self.assertEqual(store.root.df1._v_attrs.pandas_version, '0.10.1') + + # write a file and wipe its versioning + _maybe_remove(store, 'df2') + store.append('df2', df) + + # this is an error because its table_type is appendable, but no version + # info + store.get_node('df2')._v_attrs.pandas_version = None + self.assertRaises(Exception, store.select, 'df2') + + def test_mode(self): + + df = tm.makeTimeDataFrame() + + def check(mode): + + with ensure_clean_path(self.path) as path: + + # constructor + if mode in ['r','r+']: + self.assertRaises(IOError, HDFStore, path, mode=mode) + + else: + store = HDFStore(path,mode=mode) + self.assertEqual(store._handle.mode, mode) + store.close() + + with ensure_clean_path(self.path) as path: + + # context + if mode in ['r','r+']: + def f(): + with get_store(path,mode=mode) as store: + pass + self.assertRaises(IOError, f) + else: + with get_store(path,mode=mode) as store: + self.assertEqual(store._handle.mode, mode) + + with ensure_clean_path(self.path) as path: + + # conv write + if mode in ['r','r+']: + self.assertRaises(IOError, df.to_hdf, path, 'df', mode=mode) + df.to_hdf(path,'df',mode='w') + else: + df.to_hdf(path,'df',mode=mode) + + # conv read + if mode in ['w']: + self.assertRaises(KeyError, read_hdf, path, 'df', mode=mode) + else: + result = read_hdf(path,'df',mode=mode) + assert_frame_equal(result,df) + + check('r') + check('r+') + check('a') + check('w') + + def test_reopen_handle(self): + + with ensure_clean_path(self.path) as path: + + store = HDFStore(path,mode='a') + store['a'] = tm.makeTimeSeries() + + # invalid mode change + self.assertRaises(PossibleDataLossError, store.open, 'w') + store.close() + self.assertFalse(store.is_open) + + # truncation ok here + store.open('w') + self.assertTrue(store.is_open) + 
self.assertEqual(len(store), 0) + store.close() + self.assertFalse(store.is_open) + + store = HDFStore(path,mode='a') + store['a'] = tm.makeTimeSeries() + + # reopen as read + store.open('r') + self.assertTrue(store.is_open) + self.assertEqual(len(store), 1) + self.assertEqual(store._mode, 'r') + store.close() + self.assertFalse(store.is_open) + + # reopen as append + store.open('a') + self.assertTrue(store.is_open) + self.assertEqual(len(store), 1) + self.assertEqual(store._mode, 'a') + store.close() + self.assertFalse(store.is_open) + + # reopen as append (again) + store.open('a') + self.assertTrue(store.is_open) + self.assertEqual(len(store), 1) + self.assertEqual(store._mode, 'a') + store.close() + self.assertFalse(store.is_open) + + def test_open_args(self): + + with ensure_clean_path(self.path) as path: + + df = tm.makeDataFrame() + + # create an in memory store + store = HDFStore(path,mode='a',driver='H5FD_CORE',driver_core_backing_store=0) + store['df'] = df + store.append('df2',df) + + tm.assert_frame_equal(store['df'],df) + tm.assert_frame_equal(store['df2'],df) + + store.close() + + # only supported on pytable >= 3.0.0 + if LooseVersion(tables.__version__) >= '3.0.0': + + # the file should not have actually been written + self.assertFalse(os.path.exists(path)) + + def test_flush(self): + + with ensure_clean_store(self.path) as store: + store['a'] = tm.makeTimeSeries() + store.flush() + store.flush(fsync=True) + + def test_get(self): + + with ensure_clean_store(self.path) as store: + store['a'] = tm.makeTimeSeries() + left = store.get('a') + right = store['a'] + tm.assert_series_equal(left, right) + + left = store.get('/a') + right = store['/a'] + tm.assert_series_equal(left, right) + + self.assertRaises(KeyError, store.get, 'b') + + def test_getattr(self): + + with ensure_clean_store(self.path) as store: + + s = tm.makeTimeSeries() + store['a'] = s + + # test attribute access + result = store.a + tm.assert_series_equal(result, s) + result = getattr(store,'a') + tm.assert_series_equal(result, s) + + df = tm.makeTimeDataFrame() + store['df'] = df + result = store.df + tm.assert_frame_equal(result, df) + + # errors + self.assertRaises(AttributeError, getattr, store, 'd') + + for x in ['mode','path','handle','complib']: + self.assertRaises(AttributeError, getattr, store, x) + + # not stores + for x in ['mode','path','handle','complib']: + getattr(store,"_%s" % x) + + def test_put(self): + + with ensure_clean_store(self.path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeTimeDataFrame() + store['a'] = ts + store['b'] = df[:10] + store['foo/bar/bah'] = df[:10] + store['foo'] = df[:10] + store['/foo'] = df[:10] + store.put('c', df[:10], format='table') + + # not OK, not a table + self.assertRaises( + ValueError, store.put, 'b', df[10:], append=True) + + # node does not currently exist, test _is_table_type returns False in + # this case + # _maybe_remove(store, 'f') + # self.assertRaises(ValueError, store.put, 'f', df[10:], append=True) + + # can't put to a table (use append instead) + self.assertRaises(ValueError, store.put, 'c', df[10:], append=True) + + # overwrite table + store.put('c', df[:10], format='table', append=False) + tm.assert_frame_equal(df[:10], store['c']) + + def test_put_string_index(self): + + with ensure_clean_store(self.path) as store: + + index = Index( + ["I am a very long string index: %s" % i for i in range(20)]) + s = Series(np.arange(20), index=index) + df = DataFrame({'A': s, 'B': s}) + + store['a'] = s + tm.assert_series_equal(store['a'], s) + + 
store['b'] = df + tm.assert_frame_equal(store['b'], df) + + # mixed length + index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + ["I am a very long string index: %s" % i for i in range(20)]) + s = Series(np.arange(21), index=index) + df = DataFrame({'A': s, 'B': s}) + store['a'] = s + tm.assert_series_equal(store['a'], s) + + store['b'] = df + tm.assert_frame_equal(store['b'], df) + + def test_put_compression(self): + + with ensure_clean_store(self.path) as store: + df = tm.makeTimeDataFrame() + + store.put('c', df, format='table', complib='zlib') + tm.assert_frame_equal(store['c'], df) + + # can't compress if format='fixed' + self.assertRaises(ValueError, store.put, 'b', df, + format='fixed', complib='zlib') + + def test_put_compression_blosc(self): + tm.skip_if_no_package('tables', '2.2', app='blosc support') + df = tm.makeTimeDataFrame() + + with ensure_clean_store(self.path) as store: + + # can't compress if format='fixed' + self.assertRaises(ValueError, store.put, 'b', df, + format='fixed', complib='blosc') + + store.put('c', df, format='table', complib='blosc') + tm.assert_frame_equal(store['c'], df) + + def test_put_integer(self): + # non-date, non-string index + df = DataFrame(np.random.randn(50, 100)) + self._check_roundtrip(df, tm.assert_frame_equal) + + def test_put_mixed_type(self): + df = tm.makeTimeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['bool3'] = True + df['int1'] = 1 + df['int2'] = 2 + df['timestamp1'] = Timestamp('20010102') + df['timestamp2'] = Timestamp('20010103') + df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) + df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) + df.ix[3:6, ['obj1']] = np.nan + df = df.consolidate().convert_objects() + + with ensure_clean_store(self.path) as store: + _maybe_remove(store, 'df') + + # cannot use assert_produces_warning here for some reason + # a PendingDeprecationWarning is also raised? 
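+            # [editor's note, illustrative only, commented out]: object/mixed-dtype columns
+            # stored in a fixed-format node are pickled and emit a PerformanceWarning; a
+            # local alternative to the module-level filter below would be (sketch):
+            #   with warnings.catch_warnings():
+            #       warnings.simplefilter('ignore', PerformanceWarning)
+            #       store.put('df', df)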
+ warnings.filterwarnings('ignore', category=PerformanceWarning) + store.put('df',df) + warnings.filterwarnings('always', category=PerformanceWarning) + + expected = store.get('df') + tm.assert_frame_equal(expected,df) + + def test_append(self): + + with ensure_clean_store(self.path) as store: + df = tm.makeTimeDataFrame() + _maybe_remove(store, 'df1') + store.append('df1', df[:10]) + store.append('df1', df[10:]) + tm.assert_frame_equal(store['df1'], df) + + _maybe_remove(store, 'df2') + store.put('df2', df[:10], format='table') + store.append('df2', df[10:]) + tm.assert_frame_equal(store['df2'], df) + + _maybe_remove(store, 'df3') + store.append('/df3', df[:10]) + store.append('/df3', df[10:]) + tm.assert_frame_equal(store['df3'], df) + + # this is allowed by almost always don't want to do it + with tm.assert_produces_warning(expected_warning=tables.NaturalNameWarning): + _maybe_remove(store, '/df3 foo') + store.append('/df3 foo', df[:10]) + store.append('/df3 foo', df[10:]) + tm.assert_frame_equal(store['df3 foo'], df) + + # panel + wp = tm.makePanel() + _maybe_remove(store, 'wp1') + store.append('wp1', wp.ix[:, :10, :]) + store.append('wp1', wp.ix[:, 10:, :]) + assert_panel_equal(store['wp1'], wp) + + # ndim + p4d = tm.makePanel4D() + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :]) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + + # test using axis labels + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + assert_panel4d_equal(store['p4d'], p4d) + + # test using differnt number of items on each axis + p4d2 = p4d.copy() + p4d2['l4'] = p4d['l1'] + p4d2['l5'] = p4d['l1'] + _maybe_remove(store, 'p4d2') + store.append( + 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) + assert_panel4d_equal(store['p4d2'], p4d2) + + # test using differt order of items on the non-index axes + _maybe_remove(store, 'wp1') + wp_append1 = wp.ix[:, :10, :] + store.append('wp1', wp_append1) + wp_append2 = wp.ix[:, 10:, :].reindex(items=wp.items[::-1]) + store.append('wp1', wp_append2) + assert_panel_equal(store['wp1'], wp) + + # dtype issues - mizxed type in a single object column + df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) + df['mixed_column'] = 'testing' + df.ix[2, 'mixed_column'] = np.nan + _maybe_remove(store, 'df') + store.append('df', df) + tm.assert_frame_equal(store['df'], df) + + # uints - test storage of uints + uint_data = DataFrame({'u08' : Series(np.random.random_integers(0, high=255, size=5), dtype=np.uint8), + 'u16' : Series(np.random.random_integers(0, high=65535, size=5), dtype=np.uint16), + 'u32' : Series(np.random.random_integers(0, high=2**30, size=5), dtype=np.uint32), + 'u64' : Series([2**58, 2**59, 2**60, 2**61, 2**62], dtype=np.uint64)}, + index=np.arange(5)) + _maybe_remove(store, 'uints') + store.append('uints', uint_data) + tm.assert_frame_equal(store['uints'], uint_data) + + # uints - test storage of uints in indexable columns + _maybe_remove(store, 'uints') + store.append('uints', uint_data, data_columns=['u08','u16','u32']) # 64-bit indices not yet supported + tm.assert_frame_equal(store['uints'], uint_data) + + def test_append_series(self): + + with ensure_clean_store(self.path) as store: + + # basic + ss = tm.makeStringSeries() + ts = tm.makeTimeSeries() + ns = Series(np.arange(100)) + + store.append('ss', ss) + result = store['ss'] + 
tm.assert_series_equal(result, ss) + self.assertIsNone(result.name) + + store.append('ts', ts) + result = store['ts'] + tm.assert_series_equal(result, ts) + self.assertIsNone(result.name) + + ns.name = 'foo' + store.append('ns', ns) + result = store['ns'] + tm.assert_series_equal(result, ns) + self.assertEqual(result.name, ns.name) + + # select on the values + expected = ns[ns>60] + result = store.select('ns',Term('foo>60')) + tm.assert_series_equal(result,expected) + + # select on the index and values + expected = ns[(ns>70) & (ns.index<90)] + result = store.select('ns',[Term('foo>70'), Term('index<90')]) + tm.assert_series_equal(result,expected) + + # multi-index + mi = DataFrame(np.random.randn(5,1),columns=['A']) + mi['B'] = np.arange(len(mi)) + mi['C'] = 'foo' + mi.loc[3:5,'C'] = 'bar' + mi.set_index(['C','B'],inplace=True) + s = mi.stack() + s.index = s.index.droplevel(2) + store.append('mi', s) + tm.assert_series_equal(store['mi'], s) + + def test_store_index_types(self): + # GH5386 + # test storing various index types + + with ensure_clean_store(self.path) as store: + + def check(format,index): + df = DataFrame(np.random.randn(10,2),columns=list('AB')) + df.index = index(len(df)) + + _maybe_remove(store, 'df') + store.put('df',df,format=format) + assert_frame_equal(df,store['df']) + + for index in [ tm.makeFloatIndex, tm.makeStringIndex, tm.makeIntIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + + check('table',index) + check('fixed',index) + + # unicode + index = tm.makeUnicodeIndex + if compat.PY3: + check('table',index) + check('fixed',index) + else: + + # only support for fixed types (and they have a perf warning) + self.assertRaises(TypeError, check, 'table', index) + with tm.assert_produces_warning(expected_warning=PerformanceWarning): + check('fixed',index) + + def test_encoding(self): + + if LooseVersion(tables.__version__) < '3.0.0': + raise nose.SkipTest('tables version does not support proper encoding') + if sys.byteorder != 'little': + raise nose.SkipTest('system byteorder is not little') + + with ensure_clean_store(self.path) as store: + df = DataFrame(dict(A='foo',B='bar'),index=range(5)) + df.loc[2,'A'] = np.nan + df.loc[3,'B'] = np.nan + _maybe_remove(store, 'df') + store.append('df', df, encoding='ascii') + tm.assert_frame_equal(store['df'], df) + + expected = df.reindex(columns=['A']) + result = store.select('df',Term('columns=A',encoding='ascii')) + tm.assert_frame_equal(result,expected) + + def test_append_some_nans(self): + + with ensure_clean_store(self.path) as store: + df = DataFrame({'A' : Series(np.random.randn(20)).astype('int32'), + 'A1' : np.random.randn(20), + 'A2' : np.random.randn(20), + 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) }, + index=np.arange(20)) + # some nans + _maybe_remove(store, 'df1') + df.ix[0:15,['A1','B','D','E']] = np.nan + store.append('df1', df[:10]) + store.append('df1', df[10:]) + tm.assert_frame_equal(store['df1'], df) + + # first column + df1 = df.copy() + df1.ix[:,'A1'] = np.nan + _maybe_remove(store, 'df1') + store.append('df1', df1[:10]) + store.append('df1', df1[10:]) + tm.assert_frame_equal(store['df1'], df1) + + # 2nd column + df2 = df.copy() + df2.ix[:,'A2'] = np.nan + _maybe_remove(store, 'df2') + store.append('df2', df2[:10]) + store.append('df2', df2[10:]) + tm.assert_frame_equal(store['df2'], df2) + + # datetimes + df3 = df.copy() + df3.ix[:,'E'] = np.nan + _maybe_remove(store, 'df3') + store.append('df3', df3[:10]) + store.append('df3', df3[10:]) + 
tm.assert_frame_equal(store['df3'], df3) + + def test_append_all_nans(self): + + with ensure_clean_store(self.path) as store: + + df = DataFrame({'A1' : np.random.randn(20), + 'A2' : np.random.randn(20)}, + index=np.arange(20)) + df.ix[0:15,:] = np.nan + + + # nan some entire rows (dropna=True) + _maybe_remove(store, 'df') + store.append('df', df[:10], dropna=True) + store.append('df', df[10:], dropna=True) + tm.assert_frame_equal(store['df'], df[-4:]) + + # nan some entire rows (dropna=False) + _maybe_remove(store, 'df2') + store.append('df2', df[:10], dropna=False) + store.append('df2', df[10:], dropna=False) + tm.assert_frame_equal(store['df2'], df) + + # tests the option io.hdf.dropna_table + pandas.set_option('io.hdf.dropna_table',False) + _maybe_remove(store, 'df3') + store.append('df3', df[:10]) + store.append('df3', df[10:]) + tm.assert_frame_equal(store['df3'], df) + + pandas.set_option('io.hdf.dropna_table',True) + _maybe_remove(store, 'df4') + store.append('df4', df[:10]) + store.append('df4', df[10:]) + tm.assert_frame_equal(store['df4'], df[-4:]) + + # nan some entire rows (string are still written!) + df = DataFrame({'A1' : np.random.randn(20), + 'A2' : np.random.randn(20), + 'B' : 'foo', 'C' : 'bar'}, + index=np.arange(20)) + + df.ix[0:15,:] = np.nan + + _maybe_remove(store, 'df') + store.append('df', df[:10], dropna=True) + store.append('df', df[10:], dropna=True) + tm.assert_frame_equal(store['df'], df) + + _maybe_remove(store, 'df2') + store.append('df2', df[:10], dropna=False) + store.append('df2', df[10:], dropna=False) + tm.assert_frame_equal(store['df2'], df) + + # nan some entire rows (but since we have dates they are still written!) + df = DataFrame({'A1' : np.random.randn(20), + 'A2' : np.random.randn(20), + 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) }, + index=np.arange(20)) + + df.ix[0:15,:] = np.nan + + _maybe_remove(store, 'df') + store.append('df', df[:10], dropna=True) + store.append('df', df[10:], dropna=True) + tm.assert_frame_equal(store['df'], df) + + _maybe_remove(store, 'df2') + store.append('df2', df[:10], dropna=False) + store.append('df2', df[10:], dropna=False) + tm.assert_frame_equal(store['df2'], df) + + def test_append_frame_column_oriented(self): + + with ensure_clean_store(self.path) as store: + + # column oriented + df = tm.makeTimeDataFrame() + _maybe_remove(store, 'df1') + store.append('df1', df.ix[:, :2], axes=['columns']) + store.append('df1', df.ix[:, 2:]) + tm.assert_frame_equal(store['df1'], df) + + result = store.select('df1', 'columns=A') + expected = df.reindex(columns=['A']) + tm.assert_frame_equal(expected, result) + + # selection on the non-indexable + result = store.select( + 'df1', ('columns=A', Term('index=df.index[0:4]'))) + expected = df.reindex(columns=['A'], index=df.index[0:4]) + tm.assert_frame_equal(expected, result) + + # this isn't supported + self.assertRaises(TypeError, store.select, 'df1', ( + 'columns=A', Term('index>df.index[4]'))) + + def test_append_with_different_block_ordering(self): + + #GH 4096; using same frames, but different block orderings + with ensure_clean_store(self.path) as store: + + for i in range(10): + + df = DataFrame(np.random.randn(10,2),columns=list('AB')) + df['index'] = range(10) + df['index'] += i*10 + df['int64'] = Series([1]*len(df),dtype='int64') + df['int16'] = Series([1]*len(df),dtype='int16') + + if i % 2 == 0: + del df['int64'] + df['int64'] = Series([1]*len(df),dtype='int64') + if i % 3 == 0: + a = df.pop('A') + df['A'] = a + + 
df.set_index('index',inplace=True) + + store.append('df',df) + + # test a different ordering but with more fields (like an invalid combination) + with ensure_clean_store(self.path) as store: + + df = DataFrame(np.random.randn(10,2),columns=list('AB'), dtype='float64') + df['int64'] = Series([1]*len(df),dtype='int64') + df['int16'] = Series([1]*len(df),dtype='int16') + store.append('df',df) + + # store additional fields in different blocks + df['int16_2'] = Series([1]*len(df),dtype='int16') + self.assertRaises(ValueError, store.append, 'df', df) + + # store multiple additional fields in different blocks + df['float_3'] = Series([1.]*len(df),dtype='float64') + self.assertRaises(ValueError, store.append, 'df', df) + + def test_ndim_indexables(self): + """ test using ndim tables in new ways""" + + with ensure_clean_store(self.path) as store: + + p4d = tm.makePanel4D() + + def check_indexers(key, indexers): + for i, idx in enumerate(indexers): + self.assertTrue(getattr(getattr( + store.root, key).table.description, idx)._v_pos == i) + + # append then change (will take existing schema) + indexers = ['items', 'major_axis', 'minor_axis'] + + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # same as above, but try to append with different axes + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'labels', 'items', 'major_axis']) + assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # pass incorrect number of axes + _maybe_remove(store, 'p4d') + self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[ + :, :, :10, :], axes=['major_axis', 'minor_axis']) + + # different than default indexables #1 + indexers = ['labels', 'major_axis', 'minor_axis'] + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # different than default indexables #2 + indexers = ['major_axis', 'labels', 'minor_axis'] + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # partial selection + result = store.select('p4d', ['labels=l1']) + expected = p4d.reindex(labels=['l1']) + assert_panel4d_equal(result, expected) + + # partial selection2 + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) + expected = p4d.reindex( + labels=['l1'], items=['ItemA'], minor_axis=['B']) + assert_panel4d_equal(result, expected) + + # non-existent partial selection + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) + expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) + assert_panel4d_equal(result, expected) + + def test_append_with_strings(self): + + with ensure_clean_store(self.path) as store: + wp = tm.makePanel() + wp2 = wp.rename_axis( + dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2) + + def check_col(key,name,size): + self.assertEqual(getattr(store.get_storer(key).table.description,name).itemsize, size) + + store.append('s1', wp, min_itemsize=20) + store.append('s1', wp2) + expected = concat([wp, wp2], axis=2) + expected =
expected.reindex(minor_axis=sorted(expected.minor_axis)) + assert_panel_equal(store['s1'], expected) + check_col('s1', 'minor_axis', 20) + + # test dict format + store.append('s2', wp, min_itemsize={'minor_axis': 20}) + store.append('s2', wp2) + expected = concat([wp, wp2], axis=2) + expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) + assert_panel_equal(store['s2'], expected) + check_col('s2', 'minor_axis', 20) + + # apply the wrong field (similar to #1) + store.append('s3', wp, min_itemsize={'major_axis': 20}) + self.assertRaises(ValueError, store.append, 's3', wp2) + + # test truncation of bigger strings + store.append('s4', wp) + self.assertRaises(ValueError, store.append, 's4', wp2) + + # avoid truncation on elements + df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) + store.append('df_big', df) + tm.assert_frame_equal(store.select('df_big'), df) + check_col('df_big', 'values_block_1', 15) + + # appending smaller string ok + df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']]) + store.append('df_big', df2) + expected = concat([df, df2]) + tm.assert_frame_equal(store.select('df_big'), expected) + check_col('df_big', 'values_block_1', 15) + + # avoid truncation on elements + df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) + store.append('df_big2', df, min_itemsize={'values': 50}) + tm.assert_frame_equal(store.select('df_big2'), df) + check_col('df_big2', 'values_block_1', 50) + + # bigger string on next append + store.append('df_new', df) + df_new = DataFrame( + [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) + self.assertRaises(ValueError, store.append, 'df_new', df_new) + + # with nans + _maybe_remove(store, 'df') + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[1:4, 'string'] = np.nan + df['string2'] = 'bar' + df.ix[4:8, 'string2'] = np.nan + df['string3'] = 'bah' + df.ix[1:, 'string3'] = np.nan + store.append('df', df) + result = store.select('df') + tm.assert_frame_equal(result, df) + + with ensure_clean_store(self.path) as store: + + def check_col(key,name,size): + self.assertEqual(getattr(store.get_storer(key).table.description,name).itemsize, size) + + df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10)) + + # a min_itemsize that creates a data_column + _maybe_remove(store, 'df') + store.append('df', df, min_itemsize={'A' : 200 }) + check_col('df', 'A', 200) + self.assertEqual(store.get_storer('df').data_columns, ['A']) + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, 'df') + store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + check_col('df', 'A', 200) + self.assertEqual(store.get_storer('df').data_columns, ['B','A']) + + # a min_itemsize that creates a data_column2 + _maybe_remove(store, 'df') + store.append('df', df, data_columns = ['B'], min_itemsize={'values' : 200 }) + check_col('df', 'B', 200) + check_col('df', 'values_block_0', 200) + self.assertEqual(store.get_storer('df').data_columns, ['B']) + + # infer the .typ on subsequent appends + _maybe_remove(store, 'df') + store.append('df', df[:5], min_itemsize=200) + store.append('df', df[5:], min_itemsize=200) + tm.assert_frame_equal(store['df'], df) + + # invalid min_itemsize keys + df = DataFrame(['foo','foo','foo','barh','barh','barh'],columns=['A']) + _maybe_remove(store, 'df') + self.assertRaises(ValueError, store.append, 'df', df, min_itemsize={'foo' : 20, 'foobar' : 20}) + + def test_append_with_data_columns(self): + + with ensure_clean_store(self.path) as store: + df = tm.makeTimeDataFrame() + 
df.loc[:,'B'].iloc[0] = 1. + _maybe_remove(store, 'df') + store.append('df', df[:2], data_columns=['B']) + store.append('df', df[2:]) + tm.assert_frame_equal(store['df'], df) + + # check that we have indicies created + assert(store._handle.root.df.table.cols.index.is_indexed is True) + assert(store._handle.root.df.table.cols.B.is_indexed is True) + + # data column searching + result = store.select('df', [Term('B>0')]) + expected = df[df.B > 0] + tm.assert_frame_equal(result, expected) + + # data column searching (with an indexable and a data_columns) + result = store.select( + 'df', [Term('B>0'), Term('index>df.index[3]')]) + df_new = df.reindex(index=df.index[4:]) + expected = df_new[df_new.B > 0] + tm.assert_frame_equal(result, expected) + + # data column selection with a string data_column + df_new = df.copy() + df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + _maybe_remove(store, 'df') + store.append('df', df_new, data_columns=['string']) + result = store.select('df', [Term('string=foo')]) + expected = df_new[df_new.string == 'foo'] + tm.assert_frame_equal(result, expected) + + # using min_itemsize and a data column + def check_col(key,name,size): + self.assertEqual(getattr(store.get_storer(key).table.description,name).itemsize, size) + + with ensure_clean_store(self.path) as store: + _maybe_remove(store, 'df') + store.append('df', df_new, data_columns=['string'], + min_itemsize={'string': 30}) + check_col('df', 'string', 30) + _maybe_remove(store, 'df') + store.append( + 'df', df_new, data_columns=['string'], min_itemsize=30) + check_col('df', 'string', 30) + _maybe_remove(store, 'df') + store.append('df', df_new, data_columns=['string'], + min_itemsize={'values': 30}) + check_col('df', 'string', 30) + + with ensure_clean_store(self.path) as store: + df_new['string2'] = 'foobarbah' + df_new['string_block1'] = 'foobarbah1' + df_new['string_block2'] = 'foobarbah2' + _maybe_remove(store, 'df') + store.append('df', df_new, data_columns=['string', 'string2'], min_itemsize={'string': 30, 'string2': 40, 'values': 50}) + check_col('df', 'string', 30) + check_col('df', 'string2', 40) + check_col('df', 'values_block_1', 50) + + with ensure_clean_store(self.path) as store: + # multiple data columns + df_new = df.copy() + df_new.loc[:,'A'].iloc[0] = 1. + df_new.loc[:,'B'].iloc[0] = -1. 
+ df_new['string'] = 'foo' + df_new['string'][1:4] = np.nan + df_new['string'][5:6] = 'bar' + df_new['string2'] = 'foo' + df_new['string2'][2:5] = np.nan + df_new['string2'][7:8] = 'bar' + _maybe_remove(store, 'df') + store.append( + 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) + result = store.select('df', [Term('string=foo'), Term( + 'string2=foo'), Term('A>0'), Term('B<0')]) + expected = df_new[(df_new.string == 'foo') & ( + df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] + tm.assert_frame_equal(result, expected, check_index_type=False) + + # yield an empty frame + result = store.select('df', [Term('string=foo'), Term( + 'string2=cool')]) + expected = df_new[(df_new.string == 'foo') & ( + df_new.string2 == 'cool')] + tm.assert_frame_equal(result, expected, check_index_type=False) + + with ensure_clean_store(self.path) as store: + # doc example + df_dc = df.copy() + df_dc['string'] = 'foo' + df_dc.ix[4:6, 'string'] = np.nan + df_dc.ix[7:9, 'string'] = 'bar' + df_dc['string2'] = 'cool' + df_dc['datetime'] = Timestamp('20010102') + df_dc = df_dc.convert_objects() + df_dc.ix[3:5, ['A', 'B', 'datetime']] = np.nan + + _maybe_remove(store, 'df_dc') + store.append('df_dc', df_dc, data_columns=['B', 'C', + 'string', 'string2', 'datetime']) + result = store.select('df_dc', [Term('B>0')]) + + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected, check_index_type=False) + + result = store.select( + 'df_dc', ['B > 0', 'C > 0', 'string == foo']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & ( + df_dc.string == 'foo')] + tm.assert_frame_equal(result, expected, check_index_type=False) + + with ensure_clean_store(self.path) as store: + # doc example part 2 + np.random.seed(1234) + index = date_range('1/1/2000', periods=8) + df_dc = DataFrame(np.random.randn(8, 3), index=index, + columns=['A', 'B', 'C']) + df_dc['string'] = 'foo' + df_dc.ix[4:6,'string'] = np.nan + df_dc.ix[7:9,'string'] = 'bar' + df_dc.ix[:,['B','C']] = df_dc.ix[:,['B','C']].abs() + df_dc['string2'] = 'cool' + + # on-disk operations + store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) + + result = store.select('df_dc', [ Term('B>0') ]) + expected = df_dc[df_dc.B>0] + tm.assert_frame_equal(result,expected) + + result = store.select('df_dc', ['B > 0', 'C > 0', 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + tm.assert_frame_equal(result,expected) + + with ensure_clean_store(self.path) as store: + # panel + # GH5717 not handling data_columns + np.random.seed(1234) + p = tm.makePanel() + + store.append('p1',p) + tm.assert_panel_equal(store.select('p1'),p) + + store.append('p2',p,data_columns=True) + tm.assert_panel_equal(store.select('p2'),p) + + result = store.select('p2',where='ItemA>0') + expected = p.to_frame() + expected = expected[expected['ItemA']>0] + tm.assert_frame_equal(result.to_frame(),expected) + + result = store.select('p2',where='ItemA>0 & minor_axis=["A","B"]') + expected = p.to_frame() + expected = expected[expected['ItemA']>0] + expected = expected[expected.reset_index(level=['major']).index.isin(['A','B'])] + tm.assert_frame_equal(result.to_frame(),expected) + + def test_create_table_index(self): + + with ensure_clean_store(self.path) as store: + + def col(t,column): + return getattr(store.get_storer(t).table.cols,column) + + # index=False + wp = tm.makePanel() + store.append('p5', wp, index=False) + store.create_table_index('p5', columns=['major_axis']) + assert(col('p5', 'major_axis').is_indexed 
is True) + assert(col('p5', 'minor_axis').is_indexed is False) + + # index=True + store.append('p5i', wp, index=True) + assert(col('p5i', 'major_axis').is_indexed is True) + assert(col('p5i', 'minor_axis').is_indexed is True) + + # default optlevels + store.get_storer('p5').create_index() + assert(col('p5', 'major_axis').index.optlevel == 6) + assert(col('p5', 'minor_axis').index.kind == 'medium') + + # let's change the indexing scheme + store.create_table_index('p5') + assert(col('p5', 'major_axis').index.optlevel == 6) + assert(col('p5', 'minor_axis').index.kind == 'medium') + store.create_table_index('p5', optlevel=9) + assert(col('p5', 'major_axis').index.optlevel == 9) + assert(col('p5', 'minor_axis').index.kind == 'medium') + store.create_table_index('p5', kind='full') + assert(col('p5', 'major_axis').index.optlevel == 9) + assert(col('p5', 'minor_axis').index.kind == 'full') + store.create_table_index('p5', optlevel=1, kind='light') + assert(col('p5', 'major_axis').index.optlevel == 1) + assert(col('p5', 'minor_axis').index.kind == 'light') + + # data columns + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df['string2'] = 'bar' + store.append('f', df, data_columns=['string', 'string2']) + assert(col('f', 'index').is_indexed is True) + assert(col('f', 'string').is_indexed is True) + assert(col('f', 'string2').is_indexed is True) + + # specify index=columns + store.append( + 'f2', df, index=['string'], data_columns=['string', 'string2']) + assert(col('f2', 'index').is_indexed is False) + assert(col('f2', 'string').is_indexed is True) + assert(col('f2', 'string2').is_indexed is False) + + # try to index a non-table + _maybe_remove(store, 'f2') + store.put('f2', df) + self.assertRaises(TypeError, store.create_table_index, 'f2') + + # try to change the version supports flag + from pandas.io import pytables + pytables._table_supports_index = False + self.assertRaises(Exception, store.create_table_index, 'f') + + # test out some versions + original = tables.__version__ + + for v in ['2.2', '2.2b']: + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = v + self.assertRaises(Exception, store.create_table_index, 'f') + + for v in ['2.3.1', '2.3.1b', '2.4dev', '2.4', '3.0.0', '3.1.0', original]: + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = v + store.create_table_index('f') + pytables._table_mod = None + pytables._table_supports_index = False + tables.__version__ = original + + def test_big_table_frame(self): + raise nose.SkipTest('no big table frame') + + # create and write a big table + df = DataFrame(np.random.randn(2000 * 100, 100), index=range( + 2000 * 100), columns=['E%03d' % i for i in range(100)]) + for x in range(20): + df['String%03d' % x] = 'string%03d' % x + + import time + x = time.time() + with ensure_clean_store(self.path,mode='w') as store: + store.append('df', df) + rows = store.root.df.table.nrows + recons = store.select('df') + assert isinstance(recons, DataFrame) + + com.pprint_thing("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) + + def test_big_table2_frame(self): + # this is a really big table: 1m rows x 60 float columns, 20 string, 20 datetime + # columns + raise nose.SkipTest('no big table2 frame') + + # create and write a big table + com.pprint_thing("\nbig_table2 start") + import time + start_time = time.time() + df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int( + 1000 * 1000)), columns=['E%03d' % i for i in range(60)]) + for x in range(20): + 
df['String%03d' % x] = 'string%03d' % x + for x in range(20): + df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) + + com.pprint_thing("\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" + % (len(df.index), time.time() - start_time)) + + def f(chunksize): + with ensure_clean_store(self.path,mode='w') as store: + store.append('df', df, chunksize=chunksize) + r = store.root.df.table.nrows + return r + + for c in [10000, 50000, 250000]: + start_time = time.time() + com.pprint_thing("big_table2 frame [chunk->%s]" % c) + rows = f(c) + com.pprint_thing("big_table2 frame [rows->%s,chunk->%s] -> %5.2f" + % (rows, c, time.time() - start_time)) + + def test_big_put_frame(self): + raise nose.SkipTest('no big put frame') + + com.pprint_thing("\nbig_put start") + import time + start_time = time.time() + df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int( + 1000 * 1000)), columns=['E%03d' % i for i in range(60)]) + for x in range(20): + df['String%03d' % x] = 'string%03d' % x + for x in range(20): + df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) + + com.pprint_thing("\nbig_put frame (creation of df) [rows->%s] -> %5.2f" + % (len(df.index), time.time() - start_time)) + + with ensure_clean_store(self.path, mode='w') as store: + start_time = time.time() + store = HDFStore(self.path, mode='w') + store.put('df', df) + + com.pprint_thing(df.get_dtype_counts()) + com.pprint_thing("big_put frame [shape->%s] -> %5.2f" + % (df.shape, time.time() - start_time)) + + def test_big_table_panel(self): + raise nose.SkipTest('no big table panel') + + # create and write a big table + wp = Panel( + np.random.randn(20, 1000, 1000), items=['Item%03d' % i for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), minor_axis=['E%03d' % i for i in range(1000)]) + + wp.ix[:, 100:200, 300:400] = np.nan + + for x in range(100): + wp['String%03d'] = 'string%03d' % x + + import time + x = time.time() + + + with ensure_clean_store(self.path, mode='w') as store: + store.append('wp', wp) + rows = store.root.wp.table.nrows + recons = store.select('wp') + assert isinstance(recons, Panel) + + com.pprint_thing("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) + + def test_append_diff_item_order(self): + + wp = tm.makePanel() + wp1 = wp.ix[:, :10, :] + wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :] + + with ensure_clean_store(self.path) as store: + store.put('panel', wp1, format='table') + self.assertRaises(ValueError, store.put, 'panel', wp2, + append=True) + + def test_append_hierarchical(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + with ensure_clean_store(self.path) as store: + store.append('mi', df) + result = store.select('mi') + tm.assert_frame_equal(result, df) + + # GH 3748 + result = store.select('mi',columns=['A','B']) + expected = df.reindex(columns=['A','B']) + tm.assert_frame_equal(result,expected) + + with ensure_clean_path('test.hdf') as path: + df.to_hdf(path,'df',format='table') + result = read_hdf(path,'df',columns=['A','B']) + expected = df.reindex(columns=['A','B']) + tm.assert_frame_equal(result,expected) + + def test_column_multiindex(self): + # GH 4710 + # recreate multi-indexes properly + + index = MultiIndex.from_tuples([('A','a'), ('A','b'), ('B','a'), ('B','b')], names=['first','second']) + df = 
DataFrame(np.arange(12).reshape(3,4), columns=index) + + with ensure_clean_store(self.path) as store: + + store.put('df',df) + tm.assert_frame_equal(store['df'],df,check_index_type=True,check_column_type=True) + + store.put('df1',df,format='table') + tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + + self.assertRaises(ValueError, store.put, 'df2',df,format='table',data_columns=['A']) + self.assertRaises(ValueError, store.put, 'df3',df,format='table',data_columns=True) + + # appending multi-column on existing table (see GH 6167) + with ensure_clean_store(self.path) as store: + store.append('df2', df) + store.append('df2', df) + + tm.assert_frame_equal(store['df2'], concat((df,df))) + + # non_index_axes name + df = DataFrame(np.arange(12).reshape(3,4), columns=Index(list('ABCD'),name='foo')) + + with ensure_clean_store(self.path) as store: + + store.put('df1',df,format='table') + tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + + def test_store_multiindex(self): + + # validate multi-index names + # GH 5527 + with ensure_clean_store(self.path) as store: + + def make_index(names=None): + return MultiIndex.from_tuples([( datetime.datetime(2013,12,d), s, t) for d in range(1,3) for s in range(2) for t in range(3)], + names=names) + + + # no names + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index()) + store.append('df',df) + tm.assert_frame_equal(store.select('df'),df) + + # partial names + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date',None,None])) + store.append('df',df) + tm.assert_frame_equal(store.select('df'),df) + + # series + _maybe_remove(store, 's') + s = Series(np.zeros(12), index=make_index(['date',None,None])) + store.append('s',s) + tm.assert_series_equal(store.select('s'),s) + + # dup with column + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','a','t'])) + self.assertRaises(ValueError, store.append, 'df',df) + + # dup within level + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','date','date'])) + self.assertRaises(ValueError, store.append, 'df',df) + + # fully names + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','s','t'])) + store.append('df',df) + tm.assert_frame_equal(store.select('df'),df) + + def test_select_columns_in_where(self): + + # GH 6169 + # recreate multi-indexes when columns is passed + # in the `where` argument + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo_name', 'bar_name']) + + # With a DataFrame + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + with ensure_clean_store(self.path) as store: + store.put('df', df, format='table') + expected = df[['A']] + + tm.assert_frame_equal(store.select('df', columns=['A']), expected) + + tm.assert_frame_equal(store.select('df', where="columns=['A']"), expected) + + # With a Series + s = Series(np.random.randn(10), index=index, + name='A') + with ensure_clean_store(self.path) as store: + store.put('s', s, format='table') + tm.assert_series_equal(store.select('s', where="columns=['A']"),s) + + def test_pass_spec_to_storer(self): + + df = tm.makeDataFrame() + + with ensure_clean_store(self.path) as store: + 
store.put('df',df) + self.assertRaises(TypeError, store.select, 'df', columns=['A']) + self.assertRaises(TypeError, store.select, 'df',where=[('columns=A')]) + + def test_append_misc(self): + + with ensure_clean_store(self.path) as store: + + # unsupported data types for non-tables + p4d = tm.makePanel4D() + self.assertRaises(TypeError, store.put,'p4d',p4d) + + # unsupported data types + self.assertRaises(TypeError, store.put,'abc',None) + self.assertRaises(TypeError, store.put,'abc','123') + self.assertRaises(TypeError, store.put,'abc',123) + self.assertRaises(TypeError, store.put,'abc',np.arange(5)) + + df = tm.makeDataFrame() + store.append('df', df, chunksize=1) + result = store.select('df') + tm.assert_frame_equal(result, df) + + store.append('df1', df, expectedrows=10) + result = store.select('df1') + tm.assert_frame_equal(result, df) + + # more chunksize in append tests + def check(obj, comparator): + for c in [10, 200, 1000]: + with ensure_clean_store(self.path,mode='w') as store: + store.append('obj', obj, chunksize=c) + result = store.select('obj') + comparator(result,obj) + + df = tm.makeDataFrame() + df['string'] = 'foo' + df['float322'] = 1. + df['float322'] = df['float322'].astype('float32') + df['bool'] = df['float322'] > 0 + df['time1'] = Timestamp('20130101') + df['time2'] = Timestamp('20130102') + check(df, tm.assert_frame_equal) + + p = tm.makePanel() + check(p, assert_panel_equal) + + p4d = tm.makePanel4D() + check(p4d, assert_panel4d_equal) + + # empty frame, GH4273 + with ensure_clean_store(self.path) as store: + + # 0 len + df_empty = DataFrame(columns=list('ABC')) + store.append('df',df_empty) + self.assertRaises(KeyError,store.select, 'df') + + # repeated append of 0/non-zero frames + df = DataFrame(np.random.rand(10,3),columns=list('ABC')) + store.append('df',df) + assert_frame_equal(store.select('df'),df) + store.append('df',df_empty) + assert_frame_equal(store.select('df'),df) + + # store + df = DataFrame(columns=list('ABC')) + store.put('df2',df) + assert_frame_equal(store.select('df2'),df) + + # 0 len + p_empty = Panel(items=list('ABC')) + store.append('p',p_empty) + self.assertRaises(KeyError,store.select, 'p') + + # repeated append of 0/non-zero frames + p = Panel(np.random.randn(3,4,5),items=list('ABC')) + store.append('p',p) + assert_panel_equal(store.select('p'),p) + store.append('p',p_empty) + assert_panel_equal(store.select('p'),p) + + # store + store.put('p2',p_empty) + assert_panel_equal(store.select('p2'),p_empty) + + def test_append_raise(self): + + with ensure_clean_store(self.path) as store: + + # test append with invalid input to get good error messages + + # list in column + df = tm.makeDataFrame() + df['invalid'] = [['a']] * len(df) + self.assertEqual(df.dtypes['invalid'], np.object_) + self.assertRaises(TypeError, store.append,'df',df) + + # multiple invalid columns + df['invalid2'] = [['a']] * len(df) + df['invalid3'] = [['a']] * len(df) + self.assertRaises(TypeError, store.append,'df',df) + + # datetime with embedded nans as object + df = tm.makeDataFrame() + s = Series(datetime.datetime(2001,1,2),index=df.index) + s = s.astype(object) + s[0:5] = np.nan + df['invalid'] = s + self.assertEqual(df.dtypes['invalid'], np.object_) + self.assertRaises(TypeError, store.append,'df', df) + + # directly ndarray + self.assertRaises(TypeError, store.append,'df',np.arange(10)) + + # series directly + self.assertRaises(TypeError, store.append,'df',Series(np.arange(10))) + + # appending an incompatible table + df = tm.makeDataFrame() + store.append('df',df) +
+ df['foo'] = 'foo' + self.assertRaises(ValueError, store.append,'df',df) + + def test_table_index_incompatible_dtypes(self): + df1 = DataFrame({'a': [1, 2, 3]}) + df2 = DataFrame({'a': [4, 5, 6]}, + index=date_range('1/1/2000', periods=3)) + + with ensure_clean_store(self.path) as store: + store.put('frame', df1, format='table') + self.assertRaises(TypeError, store.put, 'frame', df2, + format='table', append=True) + + def test_table_values_dtypes_roundtrip(self): + + with ensure_clean_store(self.path) as store: + df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') + store.append('df_f8', df1) + assert_series_equal(df1.dtypes,store['df_f8'].dtypes) + + df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') + store.append('df_i8', df2) + assert_series_equal(df2.dtypes,store['df_i8'].dtypes) + + # incompatible dtype + self.assertRaises(ValueError, store.append, 'df_i8', df1) + + # check creation/storage/retrieval of float32 (a bit hacky to actually create them thought) + df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['A']) + store.append('df_f4', df1) + assert_series_equal(df1.dtypes,store['df_f4'].dtypes) + assert df1.dtypes[0] == 'float32' + + # check with mixed dtypes + df1 = DataFrame(dict([ (c,Series(np.random.randn(5),dtype=c)) for c in + ['float32','float64','int32','int64','int16','int8'] ])) + df1['string'] = 'foo' + df1['float322'] = 1. + df1['float322'] = df1['float322'].astype('float32') + df1['bool'] = df1['float32'] > 0 + df1['time1'] = Timestamp('20130101') + df1['time2'] = Timestamp('20130102') + + store.append('df_mixed_dtypes1', df1) + result = store.select('df_mixed_dtypes1').get_dtype_counts() + expected = Series({ 'float32' : 2, 'float64' : 1,'int32' : 1, 'bool' : 1, + 'int16' : 1, 'int8' : 1, 'int64' : 1, 'object' : 1, + 'datetime64[ns]' : 2}) + result.sort() + expected.sort() + tm.assert_series_equal(result,expected) + + def test_table_mixed_dtypes(self): + + # frame + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['bool3'] = True + df['int1'] = 1 + df['int2'] = 2 + df['timestamp1'] = Timestamp('20010102') + df['timestamp2'] = Timestamp('20010103') + df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) + df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) + df.ix[3:6, ['obj1']] = np.nan + df = df.consolidate().convert_objects() + + with ensure_clean_store(self.path) as store: + store.append('df1_mixed', df) + tm.assert_frame_equal(store.select('df1_mixed'), df) + + # panel + wp = tm.makePanel() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['ItemA'] > 0 + wp['bool2'] = wp['ItemB'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + wp = wp.consolidate() + + with ensure_clean_store(self.path) as store: + store.append('p1_mixed', wp) + assert_panel_equal(store.select('p1_mixed'), wp) + + # ndim + wp = tm.makePanel4D() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['l1'] > 0 + wp['bool2'] = wp['l2'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + wp = wp.consolidate() + + with ensure_clean_store(self.path) as store: + store.append('p4d_mixed', wp) + assert_panel4d_equal(store.select('p4d_mixed'), wp) + + def test_unimplemented_dtypes_table_columns(self): + + with ensure_clean_store(self.path) as store: + + l = [('date', datetime.date(2001, 1, 2))] + + # py3 ok for unicode + if not compat.PY3: + l.append(('unicode', u('\\u03c3'))) + + ### currently not supported dtypes #### + for n, f in l: + df = tm.makeDataFrame() + df[n] = f + self.assertRaises( + TypeError, store.append, 'df1_%s' 
% n, df) + + # frame + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['datetime1'] = datetime.date(2001, 1, 2) + df = df.consolidate().convert_objects() + + with ensure_clean_store(self.path) as store: + # this fails because we have a date in the object block...... + self.assertRaises(TypeError, store.append, 'df_unimplemented', df) + + def test_append_with_timezones_pytz(self): + + from datetime import timedelta + + def compare(a,b): + tm.assert_frame_equal(a,b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a[c][i] + b_e = b[c][i] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError("invalid tz comparsion [%s] [%s]" % (a_e,b_e)) + + # as columns + with ensure_clean_store(self.path) as store: + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A = [ Timestamp('20130102 2:00:00',tz='US/Eastern') + timedelta(hours=1)*i for i in range(5) ])) + store.append('df_tz',df,data_columns=['A']) + result = store['df_tz'] + compare(result,df) + assert_frame_equal(result,df) + + # select with tz aware + compare(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) + + _maybe_remove(store, 'df_tz') + # ensure we include dates in DST and STD time here. + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130603',tz='US/Eastern')),index=range(5)) + store.append('df_tz',df) + result = store['df_tz'] + compare(result,df) + assert_frame_equal(result,df) + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5)) + self.assertRaises(TypeError, store.append, 'df_tz', df) + + # this is ok + _maybe_remove(store, 'df_tz') + store.append('df_tz',df,data_columns=['A','B']) + result = store['df_tz'] + compare(result,df) + assert_frame_equal(result,df) + + # can't append with diff timezone + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5)) + self.assertRaises(ValueError, store.append, 'df_tz', df) + + # as index + with ensure_clean_store(self.path) as store: + + # GH 4098 example + df = DataFrame(dict(A = Series(lrange(3), index=date_range('2000-1-1',periods=3,freq='H', tz='US/Eastern')))) + + _maybe_remove(store, 'df') + store.put('df',df) + result = store.select('df') + assert_frame_equal(result,df) + + _maybe_remove(store, 'df') + store.append('df',df) + result = store.select('df') + assert_frame_equal(result,df) + + def test_append_with_timezones_dateutil(self): + + from datetime import timedelta + tm._skip_if_no_dateutil() + + # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows filename issues. 
+ from pandas.tslib import maybe_get_tz + gettz = lambda x: maybe_get_tz('dateutil/' + x) + + def compare(a, b): + tm.assert_frame_equal(a, b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a[c][i] + b_e = b[c][i] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError("invalid tz comparsion [%s] [%s]" % (a_e, b_e)) + + # as columns + with ensure_clean_store(self.path) as store: + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A=[ Timestamp('20130102 2:00:00', tz=gettz('US/Eastern')) + timedelta(hours=1) * i for i in range(5) ])) + store.append('df_tz', df, data_columns=['A']) + result = store['df_tz'] + compare(result, df) + assert_frame_equal(result, df) + + # select with tz aware + compare(store.select('df_tz', where=Term('A>=df.A[3]')), df[df.A >= df.A[3]]) + + _maybe_remove(store, 'df_tz') + # ensure we include dates in DST and STD time here. + df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130603', tz=gettz('US/Eastern'))), index=range(5)) + store.append('df_tz', df) + result = store['df_tz'] + compare(result, df) + assert_frame_equal(result, df) + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('EET'))), index=range(5)) + self.assertRaises(TypeError, store.append, 'df_tz', df) + + # this is ok + _maybe_remove(store, 'df_tz') + store.append('df_tz', df, data_columns=['A', 'B']) + result = store['df_tz'] + compare(result, df) + assert_frame_equal(result, df) + + # can't append with diff timezone + df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('CET'))), index=range(5)) + self.assertRaises(ValueError, store.append, 'df_tz', df) + + # as index + with ensure_clean_store(self.path) as store: + + # GH 4098 example + df = DataFrame(dict(A=Series(lrange(3), index=date_range('2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern'))))) + + _maybe_remove(store, 'df') + store.put('df', df) + result = store.select('df') + assert_frame_equal(result, df) + + _maybe_remove(store, 'df') + store.append('df', df) + result = store.select('df') + assert_frame_equal(result, df) + + def test_store_timezone(self): + # GH2852 + # issue storing datetime.date with a timezone as it resets when read back in a new timezone + + import platform + if platform.system() == "Windows": + raise nose.SkipTest("timezone setting not supported on windows") + + import datetime + import time + import os + + # original method + with ensure_clean_store(self.path) as store: + + today = datetime.date(2013,9,10) + df = DataFrame([1,2,3], index = [today, today, today]) + store['obj1'] = df + result = store['obj1'] + assert_frame_equal(result, df) + + # with tz setting + orig_tz = os.environ.get('TZ') + + def setTZ(tz): + if tz is None: + try: + del os.environ['TZ'] + except: + pass + else: + os.environ['TZ']=tz + time.tzset() + + try: + + with ensure_clean_store(self.path) as store: + + setTZ('EST5EDT') + today = datetime.date(2013,9,10) + df = DataFrame([1,2,3], index = [today, today, today]) + store['obj1'] = df + + setTZ('CST6CDT') + result = store['obj1'] + + assert_frame_equal(result, df) + + finally: + setTZ(orig_tz) + + def test_append_with_timedelta(self): + tm._skip_if_not_numpy17_friendly() + + # GH 3577 + # append timedelta + + from datetime import timedelta + df = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) 
+ df['C'] = df['A']-df['B'] + df.ix[3:5,'C'] = np.nan + + with ensure_clean_store(self.path) as store: + + # table + _maybe_remove(store, 'df') + store.append('df',df,data_columns=True) + result = store.select('df') + assert_frame_equal(result,df) + + result = store.select('df',Term("C<100000")) + assert_frame_equal(result,df) + + result = store.select('df',Term("C","<",-3*86400)) + assert_frame_equal(result,df.iloc[3:]) + + result = store.select('df',"C<'-3D'") + assert_frame_equal(result,df.iloc[3:]) + + # a bit hacky here as we don't really deal with the NaT properly + + result = store.select('df',"C<'-500000s'") + result = result.dropna(subset=['C']) + assert_frame_equal(result,df.iloc[6:]) + + result = store.select('df',"C<'-3.5D'") + result = result.iloc[1:] + assert_frame_equal(result,df.iloc[4:]) + + # fixed + _maybe_remove(store, 'df2') + store.put('df2',df) + result = store.select('df2') + assert_frame_equal(result,df) + + def test_remove(self): + + with ensure_clean_store(self.path) as store: + + ts = tm.makeTimeSeries() + df = tm.makeDataFrame() + store['a'] = ts + store['b'] = df + _maybe_remove(store, 'a') + self.assertEqual(len(store), 1) + tm.assert_frame_equal(df, store['b']) + + _maybe_remove(store, 'b') + self.assertEqual(len(store), 0) + + # nonexistence + self.assertRaises(KeyError, store.remove, 'a_nonexistent_store') + + # pathing + store['a'] = ts + store['b/foo'] = df + _maybe_remove(store, 'foo') + _maybe_remove(store, 'b/foo') + self.assertEqual(len(store), 1) + + store['a'] = ts + store['b/foo'] = df + _maybe_remove(store, 'b') + self.assertEqual(len(store), 1) + + # __delitem__ + store['a'] = ts + store['b'] = df + del store['a'] + del store['b'] + self.assertEqual(len(store), 0) + + def test_remove_where(self): + + with ensure_clean_store(self.path) as store: + + # non-existance + crit1 = Term('index>foo') + self.assertRaises(KeyError, store.remove, 'a', [crit1]) + + # try to remove non-table (with crit) + # non-table ok (where = None) + wp = tm.makePanel() + store.put('wp', wp, format='table') + store.remove('wp', ["minor_axis=['A', 'D']"]) + rs = store.select('wp') + expected = wp.reindex(minor_axis=['B', 'C']) + assert_panel_equal(rs, expected) + + # empty where + _maybe_remove(store, 'wp') + store.put('wp', wp, format='table') + + # deleted number (entire table) + n = store.remove('wp', []) + assert(n == 120) + + # non - empty where + _maybe_remove(store, 'wp') + store.put('wp', wp, format='table') + self.assertRaises(ValueError, store.remove, + 'wp', ['foo']) + + # selectin non-table with a where + # store.put('wp2', wp, format='f') + # self.assertRaises(ValueError, store.remove, + # 'wp2', [('column', ['A', 'D'])]) + + def test_remove_startstop(self): + # GH #4835 and #6177 + + with ensure_clean_store(self.path) as store: + + wp = tm.makePanel() + + # start + store.put('wp1', wp, format='t') + n = store.remove('wp1', start=32) + #assert(n == 120-32) + result = store.select('wp1') + expected = wp.reindex(major_axis=wp.major_axis[:32//4]) + assert_panel_equal(result, expected) + + store.put('wp2', wp, format='t') + n = store.remove('wp2', start=-32) + #assert(n == 32) + result = store.select('wp2') + expected = wp.reindex(major_axis=wp.major_axis[:-32//4]) + assert_panel_equal(result, expected) + + # stop + store.put('wp3', wp, format='t') + n = store.remove('wp3', stop=32) + #assert(n == 32) + result = store.select('wp3') + expected = wp.reindex(major_axis=wp.major_axis[32//4:]) + assert_panel_equal(result, expected) + + store.put('wp4', wp, format='t') + 
n = store.remove('wp4', stop=-32) + #assert(n == 120-32) + result = store.select('wp4') + expected = wp.reindex(major_axis=wp.major_axis[-32//4:]) + assert_panel_equal(result, expected) + + # start n stop + store.put('wp5', wp, format='t') + n = store.remove('wp5', start=16, stop=-16) + #assert(n == 120-32) + result = store.select('wp5') + expected = wp.reindex(major_axis=wp.major_axis[:16//4]+wp.major_axis[-16//4:]) + assert_panel_equal(result, expected) + + store.put('wp6', wp, format='t') + n = store.remove('wp6', start=16, stop=16) + #assert(n == 0) + result = store.select('wp6') + expected = wp.reindex(major_axis=wp.major_axis) + assert_panel_equal(result, expected) + + # with where + date = wp.major_axis.take(np.arange(0,30,3)) + crit = Term('major_axis=date') + store.put('wp7', wp, format='t') + n = store.remove('wp7', where=[crit], stop=80) + #assert(n == 28) + result = store.select('wp7') + expected = wp.reindex(major_axis=wp.major_axis-wp.major_axis[np.arange(0,20,3)]) + assert_panel_equal(result, expected) + + def test_remove_crit(self): + + with ensure_clean_store(self.path) as store: + + wp = tm.makePanel() + + # group row removal + date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) + crit4 = Term('major_axis=date4') + store.put('wp3', wp, format='t') + n = store.remove('wp3', where=[crit4]) + assert(n == 36) + result = store.select('wp3') + expected = wp.reindex(major_axis=wp.major_axis - date4) + assert_panel_equal(result, expected) + + # upper half + store.put('wp', wp, format='table') + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = Term('major_axis>date') + crit2 = Term("minor_axis=['A', 'D']") + n = store.remove('wp', where=[crit1]) + + assert(n == 56) + + n = store.remove('wp', where=[crit2]) + assert(n == 32) + + result = store['wp'] + expected = wp.truncate(after=date).reindex(minor=['B', 'C']) + assert_panel_equal(result, expected) + + # individual row elements + store.put('wp2', wp, format='table') + + date1 = wp.major_axis[1:3] + crit1 = Term('major_axis=date1') + store.remove('wp2', where=[crit1]) + result = store.select('wp2') + expected = wp.reindex(major_axis=wp.major_axis - date1) + assert_panel_equal(result, expected) + + date2 = wp.major_axis[5] + crit2 = Term('major_axis=date2') + store.remove('wp2', where=[crit2]) + result = store['wp2'] + expected = wp.reindex( + major_axis=wp.major_axis - date1 - Index([date2])) + assert_panel_equal(result, expected) + + date3 = [wp.major_axis[7], wp.major_axis[9]] + crit3 = Term('major_axis=date3') + store.remove('wp2', where=[crit3]) + result = store['wp2'] + expected = wp.reindex( + major_axis=wp.major_axis - date1 - Index([date2]) - Index(date3)) + assert_panel_equal(result, expected) + + # corners + store.put('wp4', wp, format='table') + n = store.remove( + 'wp4', where=[Term('major_axis>wp.major_axis[-1]')]) + result = store.select('wp4') + assert_panel_equal(result, wp) + + def test_invalid_terms(self): + + with ensure_clean_store(self.path) as store: + + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[0:4,'string'] = 'bar' + wp = tm.makePanel() + p4d = tm.makePanel4D() + store.put('df', df, format='table') + store.put('wp', wp, format='table') + store.put('p4d', p4d, format='table') + + # some invalid terms + self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']") + self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"]) + self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"]) + self.assertRaises(TypeError, Term) + + # more 
invalid + self.assertRaises(ValueError, store.select, 'df','df.index[3]') + self.assertRaises(SyntaxError, store.select, 'df','index>') + self.assertRaises(ValueError, store.select, 'wp', "major_axis<'20000108' & minor_axis['A', 'B']") + + # from the docs + with ensure_clean_path(self.path) as path: + dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table',data_columns=True) + + # check ok + read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']") + read_hdf(path,'dfq',where="A>0 or C>0") + + # catch the invalid reference + with ensure_clean_path(self.path) as path: + dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table') + + self.assertRaises(ValueError, read_hdf, path,'dfq',where="A>0 or C>0") + + def test_terms(self): + + with ensure_clean_store(self.path) as store: + + wp = tm.makePanel() + p4d = tm.makePanel4D() + wpneg = Panel.fromDict({-1: tm.makeDataFrame(), 0: tm.makeDataFrame(), + 1: tm.makeDataFrame()}) + store.put('wp', wp, table=True) + store.put('p4d', p4d, table=True) + store.put('wpneg', wpneg, table=True) + + # panel + result = store.select('wp', [Term( + 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) + expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) + assert_panel_equal(result, expected) + + # with deprecation + result = store.select('wp', [Term( + 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")]) + expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) + + # p4d + result = store.select('p4d', [Term('major_axis<"20000108"'), + Term("minor_axis=['A', 'B']"), + Term("items=['ItemA', 'ItemB']")]) + expected = p4d.truncate(after='20000108').reindex( + minor=['A', 'B'], items=['ItemA', 'ItemB']) + assert_panel4d_equal(result, expected) + + # back compat invalid terms + terms = [ + dict(field='major_axis', op='>', value='20121114'), + [ dict(field='major_axis', op='>', value='20121114') ], + [ "minor_axis=['A','B']", dict(field='major_axis', op='>', value='20121114') ] + ] + for t in terms: + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + Term(t) + + # valid terms + terms = [ + ('major_axis=20121114'), + ('major_axis>20121114'), + (("major_axis=['20121114', '20121114']"),), + ('major_axis=datetime.datetime(2012, 11, 14)'), + 'major_axis> 20121114', + 'major_axis >20121114', + 'major_axis > 20121114', + (("minor_axis=['A', 'B']"),), + (("minor_axis=['A', 'B']"),), + ((("minor_axis==['A', 'B']"),),), + (("items=['ItemA', 'ItemB']"),), + ('items=ItemA'), + ] + + for t in terms: + store.select('wp', t) + store.select('p4d', t) + + # valid for p4d only + terms = [ + (("labels=['l1', 'l2']"),), + Term("labels=['l1', 'l2']"), + ] + + for t in terms: + store.select('p4d', t) + + with tm.assertRaisesRegexp(TypeError, 'Only named functions are supported'): + store.select('wp', Term('major_axis == (lambda x: x)("20130101")')) + + # check USub node parsing + res = store.select('wpneg', Term('items == -1')) + expected = Panel({-1: wpneg[-1]}) + tm.assert_panel_equal(res, expected) + + with tm.assertRaisesRegexp(NotImplementedError, + 'Unary addition not supported'): + store.select('wpneg', Term('items == +1')) + + def test_term_compat(self): + with ensure_clean_store(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + 
major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + + result = store.select('wp', [Term('major_axis>20000102'), + Term('minor_axis', '=', ['A','B']) ]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + assert_panel_equal(result, expected) + + store.remove('wp', Term('major_axis>20000103')) + result = store.select('wp') + expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:] + assert_panel_equal(result, expected) + + with ensure_clean_store(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + + # stringified datetimes + result = store.select('wp', [Term('major_axis','>',datetime.datetime(2000,1,2))]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102')] + assert_panel_equal(result, expected) + + result = store.select('wp', [Term('major_axis','>',datetime.datetime(2000,1,2,0,0))]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102')] + assert_panel_equal(result, expected) + + result = store.select('wp', [Term('major_axis','=',[datetime.datetime(2000,1,2,0,0),datetime.datetime(2000,1,3,0,0)])]) + expected = wp.loc[:,[Timestamp('20000102'),Timestamp('20000103')]] + assert_panel_equal(result, expected) + + result = store.select('wp', [Term('minor_axis','=',['A','B'])]) + expected = wp.loc[:,:,['A','B']] + assert_panel_equal(result, expected) + + def test_backwards_compat_without_term_object(self): + with ensure_clean_store(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis>20000102'), + ('minor_axis', '=', ['A','B']) ]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + assert_panel_equal(result, expected) + + store.remove('wp', ('major_axis>20000103')) + result = store.select('wp') + expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:] + assert_panel_equal(result, expected) + + with ensure_clean_store(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + + # stringified datetimes + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2))]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102')] + assert_panel_equal(result, expected) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2,0,0))]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102')] + assert_panel_equal(result, expected) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis','=',[datetime.datetime(2000,1,2,0,0), + datetime.datetime(2000,1,3,0,0)])]) + expected = wp.loc[:,[Timestamp('20000102'),Timestamp('20000103')]] + assert_panel_equal(result, expected) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('minor_axis','=',['A','B'])]) + expected = wp.loc[:,:,['A','B']] + assert_panel_equal(result, expected) + + def test_same_name_scoping(self): + + with 
ensure_clean_store(self.path) as store: + + import pandas as pd + df = DataFrame(np.random.randn(20, 2),index=pd.date_range('20130101',periods=20)) + store.put('df', df, table=True) + expected = df[df.index>pd.Timestamp('20130105')] + + import datetime + result = store.select('df','index>datetime.datetime(2013,1,5)') + assert_frame_equal(result,expected) + + from datetime import datetime + + # technically an error, but allow it + result = store.select('df','index>datetime.datetime(2013,1,5)') + assert_frame_equal(result,expected) + + result = store.select('df','index>datetime(2013,1,5)') + assert_frame_equal(result,expected) + + def test_series(self): + + s = tm.makeStringSeries() + self._check_roundtrip(s, tm.assert_series_equal) + + ts = tm.makeTimeSeries() + self._check_roundtrip(ts, tm.assert_series_equal) + + ts2 = Series(ts.index, Index(ts.index, dtype=object)) + self._check_roundtrip(ts2, tm.assert_series_equal) + + ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), + dtype=object)) + self._check_roundtrip(ts3, tm.assert_series_equal) + + def test_sparse_series(self): + + s = tm.makeStringSeries() + s[3:5] = np.nan + ss = s.to_sparse() + self._check_roundtrip(ss, tm.assert_series_equal, + check_series_type=True) + + ss2 = s.to_sparse(kind='integer') + self._check_roundtrip(ss2, tm.assert_series_equal, + check_series_type=True) + + ss3 = s.to_sparse(fill_value=0) + self._check_roundtrip(ss3, tm.assert_series_equal, + check_series_type=True) + + def test_sparse_frame(self): + + s = tm.makeDataFrame() + s.ix[3:5, 1:3] = np.nan + s.ix[8:10, -2] = np.nan + ss = s.to_sparse() + + self._check_double_roundtrip(ss, tm.assert_frame_equal, + check_frame_type=True) + + ss2 = s.to_sparse(kind='integer') + self._check_double_roundtrip(ss2, tm.assert_frame_equal, + check_frame_type=True) + + ss3 = s.to_sparse(fill_value=0) + self._check_double_roundtrip(ss3, tm.assert_frame_equal, + check_frame_type=True) + + def test_sparse_panel(self): + + items = ['x', 'y', 'z'] + p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) + sp = p.to_sparse() + + self._check_double_roundtrip(sp, assert_panel_equal, + check_panel_type=True) + + sp2 = p.to_sparse(kind='integer') + self._check_double_roundtrip(sp2, assert_panel_equal, + check_panel_type=True) + + sp3 = p.to_sparse(fill_value=0) + self._check_double_roundtrip(sp3, assert_panel_equal, + check_panel_type=True) + + def test_float_index(self): + + # GH #454 + index = np.random.randn(10) + s = Series(np.random.randn(10), index=index) + self._check_roundtrip(s, tm.assert_series_equal) + + def test_tuple_index(self): + + # GH #492 + col = np.arange(10) + idx = [(0., 1.), (2., 3.), (4., 5.)] + data = np.random.randn(30).reshape((3, 10)) + DF = DataFrame(data, index=idx, columns=col) + with tm.assert_produces_warning(expected_warning=PerformanceWarning): + self._check_roundtrip(DF, tm.assert_frame_equal) + + def test_index_types(self): + + values = np.random.randn(2) + + func = lambda l, r: tm.assert_series_equal(l, r, + check_dtype=True, + check_index_type=True, + check_series_type=True) + + with tm.assert_produces_warning(expected_warning=PerformanceWarning): + ser = Series(values, [0, 'y']) + self._check_roundtrip(ser, func) + + with tm.assert_produces_warning(expected_warning=PerformanceWarning): + ser = Series(values, [datetime.datetime.today(), 0]) + self._check_roundtrip(ser, func) + + with tm.assert_produces_warning(expected_warning=PerformanceWarning): + ser = Series(values, ['y', 0]) + self._check_roundtrip(ser, func) + + 
with tm.assert_produces_warning(expected_warning=PerformanceWarning): + ser = Series(values, [datetime.date.today(), 'a']) + self._check_roundtrip(ser, func) + + with tm.assert_produces_warning(expected_warning=PerformanceWarning): + ser = Series(values, [1.23, 'b']) + self._check_roundtrip(ser, func) + + ser = Series(values, [1, 1.53]) + self._check_roundtrip(ser, func) + + ser = Series(values, [1, 5]) + self._check_roundtrip(ser, func) + + ser = Series(values, [datetime.datetime( + 2012, 1, 1), datetime.datetime(2012, 1, 2)]) + self._check_roundtrip(ser, func) + + def test_timeseries_preepoch(self): + + if sys.version_info[0] == 2 and sys.version_info[1] < 7: + raise nose.SkipTest("won't work on Python < 2.7") + + dr = bdate_range('1/1/1940', '1/1/1960') + ts = Series(np.random.randn(len(dr)), index=dr) + try: + self._check_roundtrip(ts, tm.assert_series_equal) + except OverflowError: + raise nose.SkipTest('known failer on some windows platforms') + + def test_frame(self): + + df = tm.makeDataFrame() + + # put in some random NAs + df.values[0, 0] = np.nan + df.values[5, 3] = np.nan + + self._check_roundtrip_table(df, tm.assert_frame_equal) + self._check_roundtrip(df, tm.assert_frame_equal) + + self._check_roundtrip_table(df, tm.assert_frame_equal, + compression=True) + self._check_roundtrip(df, tm.assert_frame_equal, + compression=True) + + tdf = tm.makeTimeDataFrame() + self._check_roundtrip(tdf, tm.assert_frame_equal) + self._check_roundtrip(tdf, tm.assert_frame_equal, + compression=True) + + with ensure_clean_store(self.path) as store: + # not consolidated + df['foo'] = np.random.randn(len(df)) + store['df'] = df + recons = store['df'] + self.assertTrue(recons._data.is_consolidated()) + + # empty + self._check_roundtrip(df[:0], tm.assert_frame_equal) + + def test_empty_series_frame(self): + s0 = Series() + s1 = Series(name='myseries') + df0 = DataFrame() + df1 = DataFrame(index=['a', 'b', 'c']) + df2 = DataFrame(columns=['d', 'e', 'f']) + + self._check_roundtrip(s0, tm.assert_series_equal) + self._check_roundtrip(s1, tm.assert_series_equal) + self._check_roundtrip(df0, tm.assert_frame_equal) + self._check_roundtrip(df1, tm.assert_frame_equal) + self._check_roundtrip(df2, tm.assert_frame_equal) + + def test_empty_series(self): + for dtype in [np.int64, np.float64, np.object, 'm8[ns]', 'M8[ns]']: + s = Series(dtype=dtype) + self._check_roundtrip(s, tm.assert_series_equal) + + def test_can_serialize_dates(self): + + rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')] + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + self._check_roundtrip(frame, tm.assert_frame_equal) + + def test_timezones(self): + rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(self.path) as store: + store['frame'] = frame + recons = store['frame'] + self.assertTrue(recons.index.equals(rng)) + self.assertEqual(rng.tz, recons.index.tz) + + def test_fixed_offset_tz(self): + rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(self.path) as store: + store['frame'] = frame + recons = store['frame'] + self.assertTrue(recons.index.equals(rng)) + self.assertEqual(rng.tz, recons.index.tz) + + def test_store_hierarchical(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + 
names=['foo', 'bar']) + frame = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self._check_roundtrip(frame, tm.assert_frame_equal) + self._check_roundtrip(frame.T, tm.assert_frame_equal) + self._check_roundtrip(frame['A'], tm.assert_series_equal) + + # check that the names are stored + with ensure_clean_store(self.path) as store: + store['frame'] = frame + recons = store['frame'] + assert(recons.index.names == ('foo', 'bar')) + + def test_store_index_name(self): + df = tm.makeDataFrame() + df.index.name = 'foo' + + with ensure_clean_store(self.path) as store: + store['frame'] = df + recons = store['frame'] + assert(recons.index.name == 'foo') + + def test_store_series_name(self): + df = tm.makeDataFrame() + series = df['A'] + + with ensure_clean_store(self.path) as store: + store['series'] = series + recons = store['series'] + assert(recons.name == 'A') + + def test_store_mixed(self): + + def _make_one(): + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['int1'] = 1 + df['int2'] = 2 + return df.consolidate() + + df1 = _make_one() + df2 = _make_one() + + self._check_roundtrip(df1, tm.assert_frame_equal) + self._check_roundtrip(df2, tm.assert_frame_equal) + + with ensure_clean_store(self.path) as store: + store['obj'] = df1 + tm.assert_frame_equal(store['obj'], df1) + store['obj'] = df2 + tm.assert_frame_equal(store['obj'], df2) + + # check that can store Series of all of these types + self._check_roundtrip(df1['obj1'], tm.assert_series_equal) + self._check_roundtrip(df1['bool1'], tm.assert_series_equal) + self._check_roundtrip(df1['int1'], tm.assert_series_equal) + + # try with compression + self._check_roundtrip(df1['obj1'], tm.assert_series_equal, + compression=True) + self._check_roundtrip(df1['bool1'], tm.assert_series_equal, + compression=True) + self._check_roundtrip(df1['int1'], tm.assert_series_equal, + compression=True) + self._check_roundtrip(df1, tm.assert_frame_equal, + compression=True) + + def test_wide(self): + + wp = tm.makePanel() + self._check_roundtrip(wp, assert_panel_equal) + + def test_wide_table(self): + + wp = tm.makePanel() + self._check_roundtrip_table(wp, assert_panel_equal) + + def test_select_with_dups(self): + + # single dtypes + df = DataFrame(np.random.randn(10,4),columns=['A','A','B','B']) + df.index = date_range('20130101 9:30',periods=10,freq='T') + + with ensure_clean_store(self.path) as store: + store.append('df',df) + + result = store.select('df') + expected = df + assert_frame_equal(result,expected,by_blocks=True) + + result = store.select('df',columns=df.columns) + expected = df + assert_frame_equal(result,expected,by_blocks=True) + + result = store.select('df',columns=['A']) + expected = df.loc[:,['A']] + assert_frame_equal(result,expected) + + # dups accross dtypes + df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']), + DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])], + axis=1) + df.index = date_range('20130101 9:30',periods=10,freq='T') + + with ensure_clean_store(self.path) as store: + store.append('df',df) + + result = store.select('df') + expected = df + assert_frame_equal(result,expected,by_blocks=True) + + result = store.select('df',columns=df.columns) + expected = df + assert_frame_equal(result,expected,by_blocks=True) + + expected = df.loc[:,['A']] + result = store.select('df',columns=['A']) + assert_frame_equal(result,expected,by_blocks=True) + + expected = df.loc[:,['B','A']] + 
result = store.select('df',columns=['B','A']) + assert_frame_equal(result,expected,by_blocks=True) + + # duplicates on both index and columns + with ensure_clean_store(self.path) as store: + store.append('df',df) + store.append('df',df) + + expected = df.loc[:,['B','A']] + expected = concat([expected, expected]) + result = store.select('df',columns=['B','A']) + assert_frame_equal(result,expected,by_blocks=True) + + def test_wide_table_dups(self): + wp = tm.makePanel() + with ensure_clean_store(self.path) as store: + store.put('panel', wp, format='table') + store.put('panel', wp, format='table', append=True) + + with tm.assert_produces_warning(expected_warning=DuplicateWarning): + recons = store['panel'] + + assert_panel_equal(recons, wp) + + def test_long(self): + def _check(left, right): + assert_panel_equal(left.to_panel(), right.to_panel()) + + wp = tm.makePanel() + self._check_roundtrip(wp.to_frame(), _check) + + # empty + # self._check_roundtrip(wp.to_frame()[:0], _check) + + def test_longpanel(self): + pass + + def test_overwrite_node(self): + + with ensure_clean_store(self.path) as store: + store['a'] = tm.makeTimeDataFrame() + ts = tm.makeTimeSeries() + store['a'] = ts + + tm.assert_series_equal(store['a'], ts) + + def test_sparse_with_compression(self): + + # GH 2931 + + # make sparse dataframe + df = DataFrame(np.random.binomial(n=1, p=.01, size=(1e3, 10))).to_sparse(fill_value=0) + + # case 1: store uncompressed + self._check_double_roundtrip(df, tm.assert_frame_equal, + compression = False, + check_frame_type=True) + + # case 2: store compressed (works) + self._check_double_roundtrip(df, tm.assert_frame_equal, + compression = 'zlib', + check_frame_type=True) + + # set one series to be completely sparse + df[0] = np.zeros(1e3) + + # case 3: store df with completely sparse series uncompressed + self._check_double_roundtrip(df, tm.assert_frame_equal, + compression = False, + check_frame_type=True) + + # case 4: try storing df with completely sparse series compressed (fails) + self._check_double_roundtrip(df, tm.assert_frame_equal, + compression = 'zlib', + check_frame_type=True) + + def test_select(self): + wp = tm.makePanel() + + with ensure_clean_store(self.path) as store: + + # put/select ok + _maybe_remove(store, 'wp') + store.put('wp', wp, format='table') + store.select('wp') + + # non-table ok (where = None) + _maybe_remove(store, 'wp') + store.put('wp2', wp) + store.select('wp2') + + # selection on the non-indexable with a large number of columns + wp = Panel( + np.random.randn(100, 100, 100), items=['Item%03d' % i for i in range(100)], + major_axis=date_range('1/1/2000', periods=100), minor_axis=['E%03d' % i for i in range(100)]) + + _maybe_remove(store, 'wp') + store.append('wp', wp) + items = ['Item%03d' % i for i in range(80)] + result = store.select('wp', Term('items=items')) + expected = wp.reindex(items=items) + assert_panel_equal(expected, result) + + # selectin non-table with a where + # self.assertRaises(ValueError, store.select, + # 'wp2', ('column', ['A', 'D'])) + + # select with columns= + df = tm.makeTimeDataFrame() + _maybe_remove(store, 'df') + store.append('df', df) + result = store.select('df', columns=['A', 'B']) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # equivalentsly + result = store.select('df', [("columns=['A', 'B']")]) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # with a data column + _maybe_remove(store, 'df') + store.append('df', df, data_columns=['A']) + 
result = store.select('df', ['A > 0'], columns=['A', 'B']) + expected = df[df.A > 0].reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # all a data columns + _maybe_remove(store, 'df') + store.append('df', df, data_columns=True) + result = store.select('df', ['A > 0'], columns=['A', 'B']) + expected = df[df.A > 0].reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) + + # with a data column, but different columns + _maybe_remove(store, 'df') + store.append('df', df, data_columns=['A']) + result = store.select('df', ['A > 0'], columns=['C', 'D']) + expected = df[df.A > 0].reindex(columns=['C', 'D']) + tm.assert_frame_equal(expected, result) + + def test_select_dtypes(self): + + with ensure_clean_store(self.path) as store: + + # with a Timestamp data column (GH #2637) + df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) + _maybe_remove(store, 'df') + store.append('df', df, data_columns=['ts', 'A']) + + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) + expected = df[df.ts >= Timestamp('2012-02-01')] + tm.assert_frame_equal(expected, result) + + # bool columns (GH #2849) + df = DataFrame(np.random.randn(5,2), columns =['A','B']) + df['object'] = 'foo' + df.ix[4:5,'object'] = 'bar' + df['boolv'] = df['A'] > 0 + _maybe_remove(store, 'df') + store.append('df', df, data_columns = True) + + expected = df[df.boolv == True].reindex(columns=['A','boolv']) + for v in [True,'true',1]: + result = store.select('df', Term('boolv == %s' % str(v)), columns = ['A','boolv']) + tm.assert_frame_equal(expected, result) + + expected = df[df.boolv == False ].reindex(columns=['A','boolv']) + for v in [False,'false',0]: + result = store.select('df', Term('boolv == %s' % str(v)), columns = ['A','boolv']) + tm.assert_frame_equal(expected, result) + + # integer index + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + _maybe_remove(store, 'df_int') + store.append('df_int', df) + result = store.select( + 'df_int', [Term("index<10"), Term("columns=['A']")]) + expected = df.reindex(index=list(df.index)[0:10],columns=['A']) + tm.assert_frame_equal(expected, result) + + # float index + df = DataFrame(dict(A=np.random.rand( + 20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) + _maybe_remove(store, 'df_float') + store.append('df_float', df) + result = store.select( + 'df_float', [Term("index<10.0"), Term("columns=['A']")]) + expected = df.reindex(index=list(df.index)[0:10],columns=['A']) + tm.assert_frame_equal(expected, result) + + with ensure_clean_store(self.path) as store: + + # floats w/o NaN + df = DataFrame(dict(cols = range(11), values = range(11)),dtype='float64') + df['cols'] = (df['cols']+10).apply(str) + + store.append('df1',df,data_columns=True) + result = store.select( + 'df1', where='values>2.0') + expected = df[df['values']>2.0] + tm.assert_frame_equal(expected, result) + + # floats with NaN + df.iloc[0] = np.nan + expected = df[df['values']>2.0] + + store.append('df2',df,data_columns=True,index=False) + result = store.select( + 'df2', where='values>2.0') + tm.assert_frame_equal(expected, result) + + # https://github.com/PyTables/PyTables/issues/282 + # bug in selection when 0th row has a np.nan and an index + #store.append('df3',df,data_columns=True) + #result = store.select( + # 'df3', where='values>2.0') + #tm.assert_frame_equal(expected, result) + + # not in first position float with NaN ok too + df = DataFrame(dict(cols = range(11), values = range(11)),dtype='float64') + df['cols'] = 
(df['cols']+10).apply(str) + + df.iloc[1] = np.nan + expected = df[df['values']>2.0] + + store.append('df4',df,data_columns=True) + result = store.select( + 'df4', where='values>2.0') + tm.assert_frame_equal(expected, result) + + def test_select_with_many_inputs(self): + + with ensure_clean_store(self.path) as store: + + df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), + A=np.random.randn(300), + B=range(300), + users = ['a']*50 + ['b']*50 + ['c']*100 + ['a%03d' % i for i in range(100)])) + _maybe_remove(store, 'df') + store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) + + # regular select + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) + expected = df[df.ts >= Timestamp('2012-02-01')] + tm.assert_frame_equal(expected, result) + + # small selector + result = store.select('df', [Term("ts>=Timestamp('2012-02-01') & users=['a','b','c']")]) + expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(['a','b','c']) ] + tm.assert_frame_equal(expected, result) + + # big selector along the columns + selector = [ 'a','b','c' ] + [ 'a%03d' % i for i in range(60) ] + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')"),Term('users=selector')]) + expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector) ] + tm.assert_frame_equal(expected, result) + + selector = range(100,200) + result = store.select('df', [Term('B=selector')]) + expected = df[ df.B.isin(selector) ] + tm.assert_frame_equal(expected, result) + self.assertEqual(len(result), 100) + + # big selector along the index + selector = Index(df.ts[0:100].values) + result = store.select('df', [Term('ts=selector')]) + expected = df[ df.ts.isin(selector.values) ] + tm.assert_frame_equal(expected, result) + self.assertEqual(len(result), 100) + + def test_select_iterator(self): + + # single table + with ensure_clean_store(self.path) as store: + + df = tm.makeTimeDataFrame(500) + _maybe_remove(store, 'df') + store.append('df', df) + + expected = store.select('df') + + results = [] + for s in store.select('df',iterator=True): + results.append(s) + result = concat(results) + tm.assert_frame_equal(expected, result) + results = [] + for s in store.select('df',chunksize=100): + results.append(s) + self.assertEqual(len(results), 5) + result = concat(results) + tm.assert_frame_equal(expected, result) + + results = [] + for s in store.select('df',chunksize=150): + results.append(s) + result = concat(results) + tm.assert_frame_equal(result, expected) + + with ensure_clean_path(self.path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path,'df_non_table') + self.assertRaises(TypeError, read_hdf, path,'df_non_table',chunksize=100) + self.assertRaises(TypeError, read_hdf, path,'df_non_table',iterator=True) + + with ensure_clean_path(self.path) as path: + + df = tm.makeTimeDataFrame(500) + df.to_hdf(path,'df',format='table') + + results = [] + for x in read_hdf(path,'df',chunksize=100): + results.append(x) + + self.assertEqual(len(results), 5) + result = concat(results) + tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, read_hdf(path,'df')) + + # multiple + + with ensure_clean_store(self.path) as store: + + df1 = tm.makeTimeDataFrame(500) + store.append('df1',df1,data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x) + df2['foo'] = 'bar' + store.append('df2',df2) + + df = concat([df1, df2], axis=1) + + # full selection + expected = store.select_as_multiple( + ['df1', 'df2'], selector='df1') + results = [] + for s in 
store.select_as_multiple( + ['df1', 'df2'], selector='df1', chunksize=150): + results.append(s) + result = concat(results) + tm.assert_frame_equal(expected, result) + + # where selection + #expected = store.select_as_multiple( + # ['df1', 'df2'], where= Term('A>0'), selector='df1') + #results = [] + #for s in store.select_as_multiple( + # ['df1', 'df2'], where= Term('A>0'), selector='df1', chunksize=25): + # results.append(s) + #result = concat(results) + #tm.assert_frame_equal(expected, result) + + def test_retain_index_attributes(self): + + # GH 3499, losing frequency info on index recreation + df = DataFrame(dict(A = Series(lrange(3), + index=date_range('2000-1-1',periods=3,freq='H')))) + + with ensure_clean_store(self.path) as store: + _maybe_remove(store,'data') + store.put('data', df, format='table') + + result = store.get('data') + tm.assert_frame_equal(df,result) + + for attr in ['freq','tz','name']: + for idx in ['index','columns']: + self.assertEqual(getattr(getattr(df,idx),attr,None), + getattr(getattr(result,idx),attr,None)) + + + # try to append a table with a different frequency + with tm.assert_produces_warning(expected_warning=AttributeConflictWarning): + df2 = DataFrame(dict(A = Series(lrange(3), + index=date_range('2002-1-1',periods=3,freq='D')))) + store.append('data',df2) + + self.assertIsNone(store.get_storer('data').info['index']['freq']) + + # this is ok + _maybe_remove(store,'df2') + df2 = DataFrame(dict(A = Series(lrange(3), + index=[Timestamp('20010101'),Timestamp('20010102'),Timestamp('20020101')]))) + store.append('df2',df2) + df3 = DataFrame(dict(A = Series(lrange(3),index=date_range('2002-1-1',periods=3,freq='D')))) + store.append('df2',df3) + + def test_retain_index_attributes2(self): + + with ensure_clean_path(self.path) as path: + + with tm.assert_produces_warning(expected_warning=AttributeConflictWarning): + + df = DataFrame(dict(A = Series(lrange(3), index=date_range('2000-1-1',periods=3,freq='H')))) + df.to_hdf(path,'data',mode='w',append=True) + df2 = DataFrame(dict(A = Series(lrange(3), index=date_range('2002-1-1',periods=3,freq='D')))) + df2.to_hdf(path,'data',append=True) + + idx = date_range('2000-1-1',periods=3,freq='H') + idx.name = 'foo' + df = DataFrame(dict(A = Series(lrange(3), index=idx))) + df.to_hdf(path,'data',mode='w',append=True) + + self.assertEqual(read_hdf(path,'data').index.name, 'foo') + + with tm.assert_produces_warning(expected_warning=AttributeConflictWarning): + + idx2 = date_range('2001-1-1',periods=3,freq='H') + idx2.name = 'bar' + df2 = DataFrame(dict(A = Series(lrange(3), index=idx2))) + df2.to_hdf(path,'data',append=True) + + self.assertIsNone(read_hdf(path,'data').index.name) + + def test_panel_select(self): + + wp = tm.makePanel() + + with ensure_clean_store(self.path) as store: + store.put('wp', wp, format='table') + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = ('major_axis>=date') + crit2 = ("minor_axis=['A', 'D']") + + result = store.select('wp', [crit1, crit2]) + expected = wp.truncate(before=date).reindex(minor=['A', 'D']) + assert_panel_equal(result, expected) + + result = store.select( + 'wp', ['major_axis>="20000124"', ("minor_axis=['A', 'B']")]) + expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) + assert_panel_equal(result, expected) + + def test_frame_select(self): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(self.path) as store: + store.put('frame', df,format='table') + date = df.index[len(df) // 2] + + crit1 = Term('index>=date') + 
self.assertEqual(crit1.env.scope['date'], date) + + crit2 = ("columns=['A', 'D']") + crit3 = ('columns=A') + + result = store.select('frame', [crit1, crit2]) + expected = df.ix[date:, ['A', 'D']] + tm.assert_frame_equal(result, expected) + + result = store.select('frame', [crit3]) + expected = df.ix[:, ['A']] + tm.assert_frame_equal(result, expected) + + # invalid terms + df = tm.makeTimeDataFrame() + store.append('df_time', df) + self.assertRaises( + ValueError, store.select, 'df_time', [Term("index>0")]) + + # can't select if not written as table + # store['frame'] = df + # self.assertRaises(ValueError, store.select, + # 'frame', [crit1, crit2]) + + def test_frame_select_complex(self): + # select via complex criteria + + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.loc[df.index[0:4],'string'] = 'bar' + + with ensure_clean_store(self.path) as store: + store.put('df', df, table=True, data_columns=['string']) + + # empty + result = store.select('df', 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index>df.index[3]) & (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + result = store.select('df', 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index>df.index[3]) & (df.string=='foo')] + tm.assert_frame_equal(result, expected) + + # or + result = store.select('df', 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index>df.index[3]) | (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + result = store.select('df', '(index>df.index[3] & index<=df.index[6]) | string="bar"') + expected = df.loc[((df.index>df.index[3]) & (df.index<=df.index[6])) | (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + # invert + result = store.select('df', 'string!="bar"') + expected = df.loc[df.string!='bar'] + tm.assert_frame_equal(result, expected) + + # invert not implemented in numexpr :( + self.assertRaises(NotImplementedError, store.select, 'df', '~(string="bar")') + + # invert ok for filters + result = store.select('df', "~(columns=['A','B'])") + expected = df.loc[:,df.columns-['A','B']] + tm.assert_frame_equal(result, expected) + + # in + result = store.select('df', "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index>df.index[3]].reindex(columns=['A','B']) + tm.assert_frame_equal(result, expected) + + def test_frame_select_complex2(self): + + with ensure_clean_path(['parms.hdf','hist.hdf']) as paths: + + pp, hh = paths + + # use non-trivial selection criteria + parms = DataFrame({ 'A' : [1,1,2,2,3] }) + parms.to_hdf(pp,'df',mode='w',format='table',data_columns=['A']) + + selection = read_hdf(pp,'df',where='A=[2,3]') + hist = DataFrame(np.random.randn(25,1),columns=['data'], + index=MultiIndex.from_tuples([ (i,j) for i in range(5) for j in range(5) ], + names=['l1','l2'])) + + hist.to_hdf(hh,'df',mode='w',format='table') + + expected = read_hdf(hh,'df',where=Term('l1','=',[2,3,4])) + + # list like + result = read_hdf(hh,'df',where=Term('l1','=',selection.index.tolist())) + assert_frame_equal(result, expected) + l = selection.index.tolist() + + # sccope with list like + store = HDFStore(hh) + result = store.select('df',where='l1=l') + assert_frame_equal(result, expected) + store.close() + + result = read_hdf(hh,'df',where='l1=l') + assert_frame_equal(result, expected) + + # index + index = selection.index + result = read_hdf(hh,'df',where='l1=index') + assert_frame_equal(result, expected) + + result = read_hdf(hh,'df',where='l1=selection.index') + assert_frame_equal(result, expected) + + result = 
read_hdf(hh,'df',where='l1=selection.index.tolist()') + assert_frame_equal(result, expected) + + result = read_hdf(hh,'df',where='l1=list(selection.index)') + assert_frame_equal(result, expected) + + # sccope with index + store = HDFStore(hh) + + result = store.select('df',where='l1=index') + assert_frame_equal(result, expected) + + result = store.select('df',where='l1=selection.index') + assert_frame_equal(result, expected) + + result = store.select('df',where='l1=selection.index.tolist()') + assert_frame_equal(result, expected) + + result = store.select('df',where='l1=list(selection.index)') + assert_frame_equal(result, expected) + + store.close() + + def test_invalid_filtering(self): + + # can't use more than one filter (atm) + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(self.path) as store: + store.put('df', df, table=True) + + # not implemented + self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A'] | columns=['B']") + + # in theory we could deal with this + self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A','B'] & columns=['C']") + + def test_string_select(self): + # GH 2973 + with ensure_clean_store(self.path) as store: + + df = tm.makeTimeDataFrame() + + # test string ==/!= + df['x'] = 'none' + df.ix[2:7,'x'] = '' + + store.append('df',df,data_columns=['x']) + + result = store.select('df',Term('x=none')) + expected = df[df.x == 'none'] + assert_frame_equal(result,expected) + + try: + result = store.select('df',Term('x!=none')) + expected = df[df.x != 'none'] + assert_frame_equal(result,expected) + except Exception as detail: + com.pprint_thing("[{0}]".format(detail)) + com.pprint_thing(store) + com.pprint_thing(expected) + + df2 = df.copy() + df2.loc[df2.x=='','x'] = np.nan + + store.append('df2',df2,data_columns=['x']) + result = store.select('df2',Term('x!=none')) + expected = df2[isnull(df2.x)] + assert_frame_equal(result,expected) + + # int ==/!= + df['int'] = 1 + df.ix[2:7,'int'] = 2 + + store.append('df3',df,data_columns=['int']) + + result = store.select('df3',Term('int=2')) + expected = df[df.int==2] + assert_frame_equal(result,expected) + + result = store.select('df3',Term('int!=2')) + expected = df[df.int!=2] + assert_frame_equal(result,expected) + + def test_read_column(self): + + df = tm.makeTimeDataFrame() + + with ensure_clean_store(self.path) as store: + _maybe_remove(store, 'df') + store.append('df', df) + + # error + self.assertRaises(KeyError, store.select_column, 'df', 'foo') + + def f(): + store.select_column('df', 'index', where = ['index>5']) + self.assertRaises(Exception, f) + + # valid + result = store.select_column('df', 'index') + tm.assert_almost_equal(result.values, Series(df.index).values) + self.assertIsInstance(result,Series) + + # not a data indexable column + self.assertRaises( + ValueError, store.select_column, 'df', 'values_block_0') + + # a data column + df2 = df.copy() + df2['string'] = 'foo' + store.append('df2', df2, data_columns=['string']) + result = store.select_column('df2', 'string') + tm.assert_almost_equal(result.values, df2['string'].values) + + # a data column with NaNs, result excludes the NaNs + df3 = df.copy() + df3['string'] = 'foo' + df3.ix[4:6, 'string'] = np.nan + store.append('df3', df3, data_columns=['string']) + result = store.select_column('df3', 'string') + tm.assert_almost_equal(result.values, df3['string'].values) + + # start/stop + result = store.select_column('df3', 'string', start=2) + tm.assert_almost_equal(result.values, df3['string'].values[2:]) + + result = 
store.select_column('df3', 'string', start=-2) + tm.assert_almost_equal(result.values, df3['string'].values[-2:]) + + result = store.select_column('df3', 'string', stop=2) + tm.assert_almost_equal(result.values, df3['string'].values[:2]) + + result = store.select_column('df3', 'string', stop=-2) + tm.assert_almost_equal(result.values, df3['string'].values[:-2]) + + result = store.select_column('df3', 'string', start=2, stop=-2) + tm.assert_almost_equal(result.values, df3['string'].values[2:-2]) + + result = store.select_column('df3', 'string', start=-2, stop=2) + tm.assert_almost_equal(result.values, df3['string'].values[-2:2]) + + def test_coordinates(self): + df = tm.makeTimeDataFrame() + + with ensure_clean_store(self.path) as store: + + _maybe_remove(store, 'df') + store.append('df', df) + + # all + c = store.select_as_coordinates('df') + assert((c.values == np.arange(len(df.index))).all() == True) + + # get coordinates back & test vs frame + _maybe_remove(store, 'df') + + df = DataFrame(dict(A=lrange(5), B=lrange(5))) + store.append('df', df) + c = store.select_as_coordinates('df', ['index<3']) + assert((c.values == np.arange(3)).all() == True) + result = store.select('df', where=c) + expected = df.ix[0:2, :] + tm.assert_frame_equal(result, expected) + + c = store.select_as_coordinates('df', ['index>=3', 'index<=4']) + assert((c.values == np.arange(2) + 3).all() == True) + result = store.select('df', where=c) + expected = df.ix[3:4, :] + tm.assert_frame_equal(result, expected) + self.assertIsInstance(c, Index) + + # multiple tables + _maybe_remove(store, 'df1') + _maybe_remove(store, 'df2') + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + store.append('df1', df1, data_columns=['A', 'B']) + store.append('df2', df2) + + c = store.select_as_coordinates('df1', ['A>0', 'B>0']) + df1_result = store.select('df1', c) + df2_result = store.select('df2', c) + result = concat([df1_result, df2_result], axis=1) + + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # pass array/mask as the coordinates + with ensure_clean_store(self.path) as store: + + df = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) + store.append('df',df) + c = store.select_column('df','index') + where = c[DatetimeIndex(c).month==5].index + expected = df.iloc[where] + + # locations + result = store.select('df',where=where) + tm.assert_frame_equal(result,expected) + + # boolean + result = store.select('df',where=where) + tm.assert_frame_equal(result,expected) + + # invalid + self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df),dtype='float64')) + self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)+1)) + self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5) + self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5,stop=10) + + # selection with filter + selection = date_range('20000101',periods=500) + result = store.select('df', where='index in selection') + expected = df[df.index.isin(selection)] + tm.assert_frame_equal(result,expected) + + # list + df = DataFrame(np.random.randn(10,2)) + store.append('df2',df) + result = store.select('df2',where=[0,3,5]) + expected = df.iloc[[0,3,5]] + tm.assert_frame_equal(result,expected) + + # boolean + where = [True] * 10 + where[-2] = False + result = store.select('df2',where=where) + expected = df.loc[where] + 
tm.assert_frame_equal(result,expected) + + # start/stop + result = store.select('df2', start=5, stop=10) + expected = df[5:10] + tm.assert_frame_equal(result,expected) + + def test_append_to_multiple(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df2['foo'] = 'bar' + df = concat([df1, df2], axis=1) + + with ensure_clean_store(self.path) as store: + + # exceptions + self.assertRaises(ValueError, store.append_to_multiple, + {'df1': ['A', 'B'], 'df2': None}, df, selector='df3') + self.assertRaises(ValueError, store.append_to_multiple, + {'df1': None, 'df2': None}, df, selector='df3') + self.assertRaises( + ValueError, store.append_to_multiple, 'df1', df, 'df1') + + # regular operation + store.append_to_multiple( + {'df1': ['A', 'B'], 'df2': None}, df, selector='df1') + result = store.select_as_multiple( + ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + expected = df[(df.A > 0) & (df.B > 0)] + tm.assert_frame_equal(result, expected) + + def test_append_to_multiple_dropna(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df1.ix[1, ['A', 'B']] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(self.path) as store: + # dropna=True should guarantee rows are synchronized + store.append_to_multiple( + {'df1': ['A', 'B'], 'df2': None}, df, selector='df1', + dropna=True) + result = store.select_as_multiple(['df1', 'df2']) + expected = df.dropna() + tm.assert_frame_equal(result, expected) + tm.assert_index_equal(store.select('df1').index, + store.select('df2').index) + + # dropna=False shouldn't synchronize row indexes + store.append_to_multiple( + {'df1': ['A', 'B'], 'df2': None}, df, selector='df1', + dropna=False) + self.assertRaises( + ValueError, store.select_as_multiple, ['df1', 'df2']) + assert not store.select('df1').index.equals( + store.select('df2').index) + + def test_select_as_multiple(self): + + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df2['foo'] = 'bar' + + with ensure_clean_store(self.path) as store: + + # no tables stored + self.assertRaises(Exception, store.select_as_multiple, + None, where=['A>0', 'B>0'], selector='df1') + + store.append('df1', df1, data_columns=['A', 'B']) + store.append('df2', df2) + + # exceptions + self.assertRaises(Exception, store.select_as_multiple, + None, where=['A>0', 'B>0'], selector='df1') + self.assertRaises(Exception, store.select_as_multiple, + [None], where=['A>0', 'B>0'], selector='df1') + self.assertRaises(KeyError, store.select_as_multiple, + ['df1','df3'], where=['A>0', 'B>0'], selector='df1') + self.assertRaises(KeyError, store.select_as_multiple, + ['df3'], where=['A>0', 'B>0'], selector='df1') + self.assertRaises(KeyError, store.select_as_multiple, + ['df1','df2'], where=['A>0', 'B>0'], selector='df4') + + # default select + result = store.select('df1', ['A>0', 'B>0']) + expected = store.select_as_multiple( + ['df1'], where=['A>0', 'B>0'], selector='df1') + tm.assert_frame_equal(result, expected) + expected = store.select_as_multiple( + 'df1', where=['A>0', 'B>0'], selector='df1') + tm.assert_frame_equal(result, expected) + + # multiple + result = store.select_as_multiple( + ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + expected = concat([df1, df2], axis=1) + expected = expected[(expected.A > 0) & (expected.B > 0)] + tm.assert_frame_equal(result, expected) + + # multiple (diff selector) + result = store.select_as_multiple(['df1', 
'df2'], where=[Term( + 'index>df2.index[4]')], selector='df2') + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) + + # test excpection for diff rows + store.append('df3', tm.makeTimeDataFrame(nper=50)) + self.assertRaises(ValueError, store.select_as_multiple, + ['df1','df3'], where=['A>0', 'B>0'], selector='df1') + + def test_nan_selection_bug_4858(self): + + # GH 4858; nan selection bug, only works for pytables >= 3.1 + if LooseVersion(tables.__version__) < '3.1.0': + raise nose.SkipTest('tables version does not support fix for nan selection bug: GH 4858') + + with ensure_clean_store(self.path) as store: + + df = DataFrame(dict(cols = range(6), values = range(6)), dtype='float64') + df['cols'] = (df['cols']+10).apply(str) + df.iloc[0] = np.nan + + expected = DataFrame(dict(cols = ['13.0','14.0','15.0'], values = [3.,4.,5.]), index=[3,4,5]) + + # write w/o the index on that particular column + store.append('df',df, data_columns=True,index=['cols']) + result = store.select('df',where='values>2.0') + assert_frame_equal(result,expected) + + def test_start_stop(self): + + with ensure_clean_store(self.path) as store: + + df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) + store.append('df', df) + + result = store.select( + 'df', [Term("columns=['A']")], start=0, stop=5) + expected = df.ix[0:4, ['A']] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select( + 'df', [Term("columns=['A']")], start=30, stop=40) + assert(len(result) == 0) + assert(type(result) == DataFrame) + + def test_select_filter_corner(self): + + df = DataFrame(np.random.randn(50, 100)) + df.index = ['%.3d' % c for c in df.index] + df.columns = ['%.3d' % c for c in df.columns] + + with ensure_clean_store(self.path) as store: + store.put('frame', df, format='table') + + crit = Term('columns=df.columns[:75]') + result = store.select('frame', [crit]) + tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) + + crit = Term('columns=df.columns[:75:2]') + result = store.select('frame', [crit]) + tm.assert_frame_equal(result, df.ix[:, df.columns[:75:2]]) + + def _check_roundtrip(self, obj, comparator, compression=False, **kwargs): + + options = {} + if compression: + options['complib'] = _default_compressor + + with ensure_clean_store(self.path, 'w', **options) as store: + store['obj'] = obj + retrieved = store['obj'] + comparator(retrieved, obj, **kwargs) + + def _check_double_roundtrip(self, obj, comparator, compression=False, + **kwargs): + options = {} + if compression: + options['complib'] = compression or _default_compressor + + with ensure_clean_store(self.path, 'w', **options) as store: + store['obj'] = obj + retrieved = store['obj'] + comparator(retrieved, obj, **kwargs) + store['obj'] = retrieved + again = store['obj'] + comparator(again, obj, **kwargs) + + def _check_roundtrip_table(self, obj, comparator, compression=False): + options = {} + if compression: + options['complib'] = _default_compressor + + with ensure_clean_store(self.path, 'w', **options) as store: + store.put('obj', obj, format='table') + retrieved = store['obj'] + # sorted_obj = _test_sort(obj) + comparator(retrieved, obj) + + def test_multiple_open_close(self): + # GH 4409, open & close multiple times + + with ensure_clean_path(self.path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path,'df',mode='w',format='table') + + # single + store = HDFStore(path) + self.assertNotIn('CLOSED', str(store)) + self.assertTrue(store.is_open) + store.close() + 
self.assertIn('CLOSED', str(store)) + self.assertFalse(store.is_open) + + with ensure_clean_path(self.path) as path: + + if pytables._table_file_open_policy_is_strict: + + # multiples + store1 = HDFStore(path) + def f(): + HDFStore(path) + self.assertRaises(ValueError, f) + store1.close() + + else: + + # multiples + store1 = HDFStore(path) + store2 = HDFStore(path) + + self.assertNotIn('CLOSED', str(store1)) + self.assertNotIn('CLOSED', str(store2)) + self.assertTrue(store1.is_open) + self.assertTrue(store2.is_open) + + store1.close() + self.assertIn('CLOSED', str(store1)) + self.assertFalse(store1.is_open) + self.assertNotIn('CLOSED', str(store2)) + self.assertTrue(store2.is_open) + + store2.close() + self.assertIn('CLOSED', str(store1)) + self.assertIn('CLOSED', str(store2)) + self.assertFalse(store1.is_open) + self.assertFalse(store2.is_open) + + # nested close + store = HDFStore(path,mode='w') + store.append('df',df) + + store2 = HDFStore(path) + store2.append('df2',df) + store2.close() + self.assertIn('CLOSED', str(store2)) + self.assertFalse(store2.is_open) + + store.close() + self.assertIn('CLOSED', str(store)) + self.assertFalse(store.is_open) + + # double closing + store = HDFStore(path,mode='w') + store.append('df', df) + + store2 = HDFStore(path) + store.close() + self.assertIn('CLOSED', str(store)) + self.assertFalse(store.is_open) + + store2.close() + self.assertIn('CLOSED', str(store2)) + self.assertFalse(store2.is_open) + + # ops on a closed store + with ensure_clean_path(self.path) as path: + + df = tm.makeDataFrame() + df.to_hdf(path,'df',mode='w',format='table') + + store = HDFStore(path) + store.close() + + self.assertRaises(ClosedFileError, store.keys) + self.assertRaises(ClosedFileError, lambda : 'df' in store) + self.assertRaises(ClosedFileError, lambda : len(store)) + self.assertRaises(ClosedFileError, lambda : store['df']) + self.assertRaises(ClosedFileError, lambda : store.df) + self.assertRaises(ClosedFileError, store.select, 'df') + self.assertRaises(ClosedFileError, store.get, 'df') + self.assertRaises(ClosedFileError, store.append, 'df2', df) + self.assertRaises(ClosedFileError, store.put, 'df3', df) + self.assertRaises(ClosedFileError, store.get_storer, 'df2') + self.assertRaises(ClosedFileError, store.remove, 'df2') + + def f(): + store.select('df') + tm.assertRaisesRegexp(ClosedFileError, 'file is not open', f) + + def test_pytables_native_read(self): + + try: + store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r') + d2 = store['detector/readout'] + assert isinstance(d2, DataFrame) + finally: + safe_close(store) + + try: + store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r') + str(store) + d1 = store['detector'] + assert isinstance(d1, DataFrame) + finally: + safe_close(store) + + def test_legacy_read(self): + try: + store = HDFStore(tm.get_data_path('legacy_hdf/legacy.h5'), 'r') + store['a'] + store['b'] + store['c'] + store['d'] + finally: + safe_close(store) + + def test_legacy_table_read(self): + # legacy table types + try: + store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table.h5'), 'r') + store.select('df1') + store.select('df2') + store.select('wp1') + + # force the frame + store.select('df2', typ='legacy_frame') + + # old version warning + with tm.assert_produces_warning(expected_warning=IncompatibilityWarning): + self.assertRaises( + Exception, store.select, 'wp1', Term('minor_axis=B')) + + df2 = store.select('df2') + result = store.select('df2', Term('index>df2.index[2]')) + expected = df2[df2.index > 
df2.index[2]] + assert_frame_equal(expected, result) + + finally: + safe_close(store) + + def test_legacy_0_10_read(self): + # legacy from 0.10 + try: + store = HDFStore(tm.get_data_path('legacy_hdf/legacy_0.10.h5'), 'r') + str(store) + for k in store.keys(): + store.select(k) + finally: + safe_close(store) + + def test_legacy_0_11_read(self): + # legacy from 0.11 + try: + path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') + store = HDFStore(tm.get_data_path(path), 'r') + str(store) + assert 'df' in store + assert 'df1' in store + assert 'mi' in store + df = store.select('df') + df1 = store.select('df1') + mi = store.select('mi') + assert isinstance(df, DataFrame) + assert isinstance(df1, DataFrame) + assert isinstance(mi, DataFrame) + finally: + safe_close(store) + + def test_copy(self): + + def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): + try: + if f is None: + f = tm.get_data_path(os.path.join('legacy_hdf', + 'legacy_0.10.h5')) + + + store = HDFStore(f, 'r') + + if new_f is None: + import tempfile + fd, new_f = tempfile.mkstemp() + + tstore = store.copy(new_f, keys = keys, propindexes = propindexes, **kwargs) + + # check keys + if keys is None: + keys = store.keys() + self.assertEqual(set(keys), set(tstore.keys())) + + # check indicies & nrows + for k in tstore.keys(): + if tstore.get_storer(k).is_table: + new_t = tstore.get_storer(k) + orig_t = store.get_storer(k) + + self.assertEqual(orig_t.nrows, new_t.nrows) + + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + self.assertTrue(new_t[a.name].is_indexed) + + finally: + safe_close(store) + safe_close(tstore) + try: + os.close(fd) + except: + pass + safe_remove(new_f) + + do_copy() + do_copy(keys = ['/a','/b','/df1_mixed']) + do_copy(propindexes = False) + + # new table + df = tm.makeDataFrame() + + try: + st = HDFStore(self.path) + st.append('df', df, data_columns = ['A']) + st.close() + do_copy(f = self.path) + do_copy(f = self.path, propindexes = False) + finally: + safe_remove(self.path) + + def test_legacy_table_write(self): + raise nose.SkipTest("skipping for now") + + store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a') + + df = tm.makeDataFrame() + wp = tm.makePanel() + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + store.append('mi', df) + + df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10)) + store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + store.append('wp', wp) + + store.close() + + def test_store_datetime_fractional_secs(self): + + with ensure_clean_store(self.path) as store: + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + series = Series([0], [dt]) + store['a'] = series + self.assertEqual(store['a'].index[0], dt) + + def test_tseries_indices_series(self): + + with ensure_clean_store(self.path) as store: + idx = tm.makeDateIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store['a'] = ser + result = store['a'] + + assert_series_equal(result, ser) + self.assertEqual(type(result.index), type(ser.index)) + self.assertEqual(result.index.freq, ser.index.freq) + + idx = tm.makePeriodIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + store['a'] = ser + result = store['a'] + + assert_series_equal(result, ser) + 
self.assertEqual(type(result.index), type(ser.index)) + self.assertEqual(result.index.freq, ser.index.freq) + + def test_tseries_indices_frame(self): + + with ensure_clean_store(self.path) as store: + idx = tm.makeDateIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + store['a'] = df + result = store['a'] + + assert_frame_equal(result, df) + self.assertEqual(type(result.index), type(df.index)) + self.assertEqual(result.index.freq, df.index.freq) + + idx = tm.makePeriodIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), idx) + store['a'] = df + result = store['a'] + + assert_frame_equal(result, df) + self.assertEqual(type(result.index), type(df.index)) + self.assertEqual(result.index.freq, df.index.freq) + + def test_unicode_index(self): + + unicode_values = [u('\u03c3'), u('\u03c3\u03c3')] + def f(): + s = Series(np.random.randn(len(unicode_values)), unicode_values) + self._check_roundtrip(s, tm.assert_series_equal) + + compat_assert_produces_warning(PerformanceWarning,f) + + def test_store_datetime_mixed(self): + + df = DataFrame( + {'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['a', 'b', 'c']}) + ts = tm.makeTimeSeries() + df['d'] = ts.index[:3] + self._check_roundtrip(df, tm.assert_frame_equal) + + # def test_cant_write_multiindex_table(self): + # # for now, #1848 + # df = DataFrame(np.random.randn(10, 4), + # index=[np.arange(5).repeat(2), + # np.tile(np.arange(2), 5)]) + + # self.assertRaises(Exception, store.put, 'foo', df, format='table') + + def test_append_with_diff_col_name_types_raises_value_error(self): + df = DataFrame(np.random.randn(10, 1)) + df2 = DataFrame({'a': np.random.randn(10)}) + df3 = DataFrame({(1, 2): np.random.randn(10)}) + df4 = DataFrame({('1', 2): np.random.randn(10)}) + df5 = DataFrame({('1', 2, object): np.random.randn(10)}) + + with ensure_clean_store(self.path) as store: + name = 'df_%s' % tm.rands(10) + store.append(name, df) + + for d in (df2, df3, df4, df5): + with tm.assertRaises(ValueError): + store.append(name, d) + + def test_query_with_nested_special_character(self): + df = DataFrame({'a': ['a', 'a', 'c', 'b', 'test & test', 'c' , 'b', 'e'], + 'b': [1, 2, 3, 4, 5, 6, 7, 8]}) + expected = df[df.a == 'test & test'] + with ensure_clean_store(self.path) as store: + store.append('test', df, format='table', data_columns=True) + result = store.select('test', 'a = "test & test"') + tm.assert_frame_equal(expected, result) + + +def _test_sort(obj): + if isinstance(obj, DataFrame): + return obj.reindex(sorted(obj.index)) + elif isinstance(obj, Panel): + return obj.reindex(major=sorted(obj.major_axis)) + else: + raise ValueError('type not supported here') + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py new file mode 100644 index 00000000..eadcb2c9 --- /dev/null +++ b/pandas/io/tests/test_sql.py @@ -0,0 +1,1911 @@ +"""SQL io tests + +The SQL tests are broken down in different classes: + +- `PandasSQLTest`: base class with common methods for all test classes +- Tests for the public API (only tests with sqlite3) + - `_TestSQLApi` base class + - `TestSQLApi`: test the public API with sqlalchemy engine + - `TesySQLLegacyApi`: test the public API with DBAPI connection +- Tests for the different SQL flavors (flavor specific type conversions) + - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with + common methods, the different tested flavors (sqlite3, MySQL, PostgreSQL) + 
derive from the base class + - Tests for the legacy mode (`TestSQLiteLegacy` and `TestMySQLLegacy`) + +""" + +from __future__ import print_function +import unittest +import sqlite3 +import csv +import os +import sys + +import nose +import warnings +import numpy as np + +from datetime import datetime + +from pandas import DataFrame, Series, Index, MultiIndex, isnull +from pandas import date_range, to_datetime, to_timedelta +import pandas.compat as compat +from pandas.compat import StringIO, range, lrange, string_types +from pandas.core.datetools import format as date_format + +import pandas.io.sql as sql +import pandas.util.testing as tm +from pandas import _np_version_under1p7 + + +try: + import sqlalchemy + SQLALCHEMY_INSTALLED = True +except ImportError: + SQLALCHEMY_INSTALLED = False + +SQL_STRINGS = { + 'create_iris': { + 'sqlite': """CREATE TABLE iris ( + "SepalLength" REAL, + "SepalWidth" REAL, + "PetalLength" REAL, + "PetalWidth" REAL, + "Name" TEXT + )""", + 'mysql': """CREATE TABLE iris ( + `SepalLength` DOUBLE, + `SepalWidth` DOUBLE, + `PetalLength` DOUBLE, + `PetalWidth` DOUBLE, + `Name` VARCHAR(200) + )""", + 'postgresql': """CREATE TABLE iris ( + "SepalLength" DOUBLE PRECISION, + "SepalWidth" DOUBLE PRECISION, + "PetalLength" DOUBLE PRECISION, + "PetalWidth" DOUBLE PRECISION, + "Name" VARCHAR(200) + )""" + }, + 'insert_iris': { + 'sqlite': """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""", + 'mysql': """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""", + 'postgresql': """INSERT INTO iris VALUES(%s, %s, %s, %s, %s);""" + }, + 'create_test_types': { + 'sqlite': """CREATE TABLE types_test_data ( + "TextCol" TEXT, + "DateCol" TEXT, + "IntDateCol" INTEGER, + "FloatCol" REAL, + "IntCol" INTEGER, + "BoolCol" INTEGER, + "IntColWithNull" INTEGER, + "BoolColWithNull" INTEGER + )""", + 'mysql': """CREATE TABLE types_test_data ( + `TextCol` TEXT, + `DateCol` DATETIME, + `IntDateCol` INTEGER, + `FloatCol` DOUBLE, + `IntCol` INTEGER, + `BoolCol` BOOLEAN, + `IntColWithNull` INTEGER, + `BoolColWithNull` BOOLEAN + )""", + 'postgresql': """CREATE TABLE types_test_data ( + "TextCol" TEXT, + "DateCol" TIMESTAMP, + "IntDateCol" INTEGER, + "FloatCol" DOUBLE PRECISION, + "IntCol" INTEGER, + "BoolCol" BOOLEAN, + "IntColWithNull" INTEGER, + "BoolColWithNull" BOOLEAN + )""" + }, + 'insert_test_types': { + 'sqlite': """ + INSERT INTO types_test_data + VALUES(?, ?, ?, ?, ?, ?, ?, ?) + """, + 'mysql': """ + INSERT INTO types_test_data + VALUES("%s", %s, %s, %s, %s, %s, %s, %s) + """, + 'postgresql': """ + INSERT INTO types_test_data + VALUES(%s, %s, %s, %s, %s, %s, %s, %s) + """ + }, + 'read_parameters': { + 'sqlite': "SELECT * FROM iris WHERE Name=? AND SepalLength=?", + 'mysql': 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s', + 'postgresql': 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s' + }, + 'read_named_parameters': { + 'sqlite': """ + SELECT * FROM iris WHERE Name=:name AND SepalLength=:length + """, + 'mysql': """ + SELECT * FROM iris WHERE + `Name`="%(name)s" AND `SepalLength`=%(length)s + """, + 'postgresql': """ + SELECT * FROM iris WHERE + "Name"=%(name)s AND "SepalLength"=%(length)s + """ + } +} + + +class PandasSQLTest(unittest.TestCase): + """ + Base class with common private methods for SQLAlchemy and fallback cases. 
+ + """ + + def drop_table(self, table_name): + self._get_exec().execute("DROP TABLE IF EXISTS %s" % table_name) + + def _get_exec(self): + if hasattr(self.conn, 'execute'): + return self.conn + else: + return self.conn.cursor() + + def _load_iris_data(self): + import io + iris_csv_file = os.path.join(tm.get_data_path(), 'iris.csv') + + self.drop_table('iris') + self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) + + with io.open(iris_csv_file, mode='r', newline=None) as iris_csv: + r = csv.reader(iris_csv) + next(r) # skip header row + ins = SQL_STRINGS['insert_iris'][self.flavor] + + for row in r: + self._get_exec().execute(ins, row) + + def _check_iris_loaded_frame(self, iris_frame): + pytype = iris_frame.dtypes[0].type + row = iris_frame.iloc[0] + + self.assertTrue( + issubclass(pytype, np.floating), 'Loaded frame has incorrect type') + tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + + def _load_test1_data(self): + columns = ['index', 'A', 'B', 'C', 'D'] + data = [( + '2000-01-03 00:00:00', 0.980268513777, 3.68573087906, -0.364216805298, -1.15973806169), + ('2000-01-04 00:00:00', 1.04791624281, - + 0.0412318367011, -0.16181208307, 0.212549316967), + ('2000-01-05 00:00:00', 0.498580885705, + 0.731167677815, -0.537677223318, 1.34627041952), + ('2000-01-06 00:00:00', 1.12020151869, 1.56762092543, 0.00364077397681, 0.67525259227)] + + self.test_frame1 = DataFrame(data, columns=columns) + + def _load_test2_data(self): + df = DataFrame(dict(A=[4, 1, 3, 6], + B=['asd', 'gsq', 'ylt', 'jkl'], + C=[1.1, 3.1, 6.9, 5.3], + D=[False, True, True, False], + E=['1990-11-22', '1991-10-26', '1993-11-26', '1995-12-12'])) + df['E'] = to_datetime(df['E']) + + self.test_frame3 = df + + def _load_test3_data(self): + columns = ['index', 'A', 'B'] + data = [( + '2000-01-03 00:00:00', 2 ** 31 - 1, -1.987670), + ('2000-01-04 00:00:00', -29, -0.0412318367011), + ('2000-01-05 00:00:00', 20000, 0.731167677815), + ('2000-01-06 00:00:00', -290867, 1.56762092543)] + + self.test_frame3 = DataFrame(data, columns=columns) + + def _load_raw_sql(self): + self.drop_table('types_test_data') + self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor]) + ins = SQL_STRINGS['insert_test_types'][self.flavor] + + data = [( + 'first', '2000-01-03 00:00:00', 535852800, 10.10, 1, False, 1, False), + ('first', '2000-01-04 00:00:00', 1356998400, 10.10, 1, False, None, None)] + for d in data: + self._get_exec().execute(ins, d) + + def _count_rows(self, table_name): + result = self._get_exec().execute( + "SELECT count(*) AS count_1 FROM %s" % table_name).fetchone() + return result[0] + + def _read_sql_iris(self): + iris_frame = self.pandasSQL.read_sql("SELECT * FROM iris") + self._check_iris_loaded_frame(iris_frame) + + def _read_sql_iris_parameter(self): + query = SQL_STRINGS['read_parameters'][self.flavor] + params = ['Iris-setosa', 5.1] + iris_frame = self.pandasSQL.read_sql(query, params=params) + self._check_iris_loaded_frame(iris_frame) + + def _read_sql_iris_named_parameter(self): + query = SQL_STRINGS['read_named_parameters'][self.flavor] + params = {'name': 'Iris-setosa', 'length': 5.1} + iris_frame = self.pandasSQL.read_sql(query, params=params) + self._check_iris_loaded_frame(iris_frame) + + def _to_sql(self): + self.drop_table('test_frame1') + + self.pandasSQL.to_sql(self.test_frame1, 'test_frame1') + self.assertTrue(self.pandasSQL.has_table( + 'test_frame1'), 'Table not written to DB') + + # Nuke table + self.drop_table('test_frame1') + + def _to_sql_fail(self): + 
self.drop_table('test_frame1') + + self.pandasSQL.to_sql( + self.test_frame1, 'test_frame1', if_exists='fail') + self.assertTrue(self.pandasSQL.has_table( + 'test_frame1'), 'Table not written to DB') + + self.assertRaises(ValueError, self.pandasSQL.to_sql, + self.test_frame1, 'test_frame1', if_exists='fail') + + self.drop_table('test_frame1') + + def _to_sql_replace(self): + self.drop_table('test_frame1') + + self.pandasSQL.to_sql( + self.test_frame1, 'test_frame1', if_exists='fail') + # Add to table again + self.pandasSQL.to_sql( + self.test_frame1, 'test_frame1', if_exists='replace') + self.assertTrue(self.pandasSQL.has_table( + 'test_frame1'), 'Table not written to DB') + + num_entries = len(self.test_frame1) + num_rows = self._count_rows('test_frame1') + + self.assertEqual( + num_rows, num_entries, "not the same number of rows as entries") + + self.drop_table('test_frame1') + + def _to_sql_append(self): + # Nuke table just in case + self.drop_table('test_frame1') + + self.pandasSQL.to_sql( + self.test_frame1, 'test_frame1', if_exists='fail') + + # Add to table again + self.pandasSQL.to_sql( + self.test_frame1, 'test_frame1', if_exists='append') + self.assertTrue(self.pandasSQL.has_table( + 'test_frame1'), 'Table not written to DB') + + num_entries = 2 * len(self.test_frame1) + num_rows = self._count_rows('test_frame1') + + self.assertEqual( + num_rows, num_entries, "not the same number of rows as entries") + + self.drop_table('test_frame1') + + def _roundtrip(self): + self.drop_table('test_frame_roundtrip') + self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip') + result = self.pandasSQL.read_sql('SELECT * FROM test_frame_roundtrip') + + result.set_index('level_0', inplace=True) + # result.index.astype(int) + + result.index.name = None + + tm.assert_frame_equal(result, self.test_frame1) + + def _execute_sql(self): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + iris_results = self.pandasSQL.execute("SELECT * FROM iris") + row = iris_results.fetchone() + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + + +#------------------------------------------------------------------------------ +#--- Testing the public API + +class _TestSQLApi(PandasSQLTest): + + """ + Base class to test the public API. + + From this two classes are derived to run these tests for both the + sqlalchemy mode (`TestSQLApi`) and the legacy mode (`TestSQLLegacyApi`). + These tests are run with sqlite3. Specific tests for the different + sql flavours are included in `_TestSQLAlchemy`. + + Notes: + flavor can always be passed even in SQLAlchemy mode, + should be correctly ignored. 
+ + we don't use drop_table because that isn't part of the public api + + """ + flavor = 'sqlite' + + def setUp(self): + self.conn = self.connect() + self._load_iris_data() + self._load_test1_data() + self._load_test2_data() + self._load_test3_data() + self._load_raw_sql() + + def test_read_sql_iris(self): + iris_frame = sql.read_sql_query( + "SELECT * FROM iris", self.conn) + self._check_iris_loaded_frame(iris_frame) + + def test_legacy_read_frame(self): + with tm.assert_produces_warning(FutureWarning): + iris_frame = sql.read_frame( + "SELECT * FROM iris", self.conn) + self._check_iris_loaded_frame(iris_frame) + + def test_to_sql(self): + sql.to_sql(self.test_frame1, 'test_frame1', self.conn, flavor='sqlite') + self.assertTrue( + sql.has_table('test_frame1', self.conn, flavor='sqlite'), 'Table not written to DB') + + def test_to_sql_fail(self): + sql.to_sql(self.test_frame1, 'test_frame2', + self.conn, flavor='sqlite', if_exists='fail') + self.assertTrue( + sql.has_table('test_frame2', self.conn, flavor='sqlite'), 'Table not written to DB') + + self.assertRaises(ValueError, sql.to_sql, self.test_frame1, + 'test_frame2', self.conn, flavor='sqlite', if_exists='fail') + + def test_to_sql_replace(self): + sql.to_sql(self.test_frame1, 'test_frame3', + self.conn, flavor='sqlite', if_exists='fail') + # Add to table again + sql.to_sql(self.test_frame1, 'test_frame3', + self.conn, flavor='sqlite', if_exists='replace') + self.assertTrue( + sql.has_table('test_frame3', self.conn, flavor='sqlite'), + 'Table not written to DB') + + num_entries = len(self.test_frame1) + num_rows = self._count_rows('test_frame3') + + self.assertEqual( + num_rows, num_entries, "not the same number of rows as entries") + + def test_to_sql_append(self): + sql.to_sql(self.test_frame1, 'test_frame4', + self.conn, flavor='sqlite', if_exists='fail') + + # Add to table again + sql.to_sql(self.test_frame1, 'test_frame4', + self.conn, flavor='sqlite', if_exists='append') + self.assertTrue( + sql.has_table('test_frame4', self.conn, flavor='sqlite'), + 'Table not written to DB') + + num_entries = 2 * len(self.test_frame1) + num_rows = self._count_rows('test_frame4') + + self.assertEqual( + num_rows, num_entries, "not the same number of rows as entries") + + def test_to_sql_type_mapping(self): + sql.to_sql(self.test_frame3, 'test_frame5', + self.conn, flavor='sqlite', index=False) + result = sql.read_sql("SELECT * FROM test_frame5", self.conn) + + tm.assert_frame_equal(self.test_frame3, result) + + def test_to_sql_series(self): + s = Series(np.arange(5, dtype='int64'), name='series') + sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False) + s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) + tm.assert_frame_equal(s.to_frame(), s2) + + def test_to_sql_panel(self): + panel = tm.makePanel() + self.assertRaises(NotImplementedError, sql.to_sql, panel, + 'test_panel', self.conn, flavor='sqlite') + + def test_legacy_write_frame(self): + # Assume that functionality is already tested above so just do + # quick check that it basically works + with tm.assert_produces_warning(FutureWarning): + sql.write_frame(self.test_frame1, 'test_frame_legacy', self.conn, + flavor='sqlite') + + self.assertTrue( + sql.has_table('test_frame_legacy', self.conn, flavor='sqlite'), + 'Table not written to DB') + + def test_roundtrip(self): + sql.to_sql(self.test_frame1, 'test_frame_roundtrip', + con=self.conn, flavor='sqlite') + result = sql.read_sql_query( + 'SELECT * FROM test_frame_roundtrip', + con=self.conn) + + # HACK! 
+ result.index = self.test_frame1.index + result.set_index('level_0', inplace=True) + result.index.astype(int) + result.index.name = None + tm.assert_frame_equal(result, self.test_frame1) + + def test_execute_sql(self): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + iris_results = sql.execute("SELECT * FROM iris", con=self.conn) + row = iris_results.fetchone() + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + + def test_date_parsing(self): + # Test date parsing in read_sq + # No Parsing + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn) + self.assertFalse( + issubclass(df.DateCol.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, + parse_dates=['DateCol']) + self.assertTrue( + issubclass(df.DateCol.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, + parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) + self.assertTrue( + issubclass(df.DateCol.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, + parse_dates=['IntDateCol']) + + self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), + "IntDateCol loaded with incorrect type") + + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, + parse_dates={'IntDateCol': 's'}) + + self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), + "IntDateCol loaded with incorrect type") + + def test_date_and_index(self): + # Test case where same column appears in parse_date and index_col + + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, + index_col='DateCol', + parse_dates=['DateCol', 'IntDateCol']) + + self.assertTrue(issubclass(df.index.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), + "IntDateCol loaded with incorrect type") + + def test_timedelta(self): + # see #6921 + tm._skip_if_not_numpy17_friendly() + + df = to_timedelta(Series(['00:00:01', '00:00:03'], name='foo')).to_frame() + with tm.assert_produces_warning(UserWarning): + df.to_sql('test_timedelta', self.conn) + result = sql.read_sql_query('SELECT * FROM test_timedelta', self.conn) + tm.assert_series_equal(result['foo'], df['foo'].astype('int64')) + + def test_to_sql_index_label(self): + temp_frame = DataFrame({'col1': range(4)}) + + # no index name, defaults to 'index' + sql.to_sql(temp_frame, 'test_index_label', self.conn) + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[0], 'index') + + # specifying index_label + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace', index_label='other_label') + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[0], 'other_label', + "Specified index_label not written to database") + + # using the index name + temp_frame.index.name = 'index_name' + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace') + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[0], 'index_name', + "Index name not written to database") + + # has index name, but specifying index_label + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace', index_label='other_label') + frame = 
sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[0], 'other_label', + "Specified index_label not written to database") + + def test_to_sql_index_label_multiindex(self): + temp_frame = DataFrame({'col1': range(4)}, + index=MultiIndex.from_product([('A0', 'A1'), ('B0', 'B1')])) + + # no index name, defaults to 'level_0' and 'level_1' + sql.to_sql(temp_frame, 'test_index_label', self.conn) + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[0], 'level_0') + self.assertEqual(frame.columns[1], 'level_1') + + # specifying index_label + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace', index_label=['A', 'B']) + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[:2].tolist(), ['A', 'B'], + "Specified index_labels not written to database") + + # using the index name + temp_frame.index.names = ['A', 'B'] + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace') + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[:2].tolist(), ['A', 'B'], + "Index names not written to database") + + # has index name, but specifying index_label + sql.to_sql(temp_frame, 'test_index_label', self.conn, + if_exists='replace', index_label=['C', 'D']) + frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) + self.assertEqual(frame.columns[:2].tolist(), ['C', 'D'], + "Specified index_labels not written to database") + + # wrong length of index_label + self.assertRaises(ValueError, sql.to_sql, temp_frame, + 'test_index_label', self.conn, if_exists='replace', + index_label='C') + + def test_integer_col_names(self): + df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) + sql.to_sql(df, "test_frame_integer_col_names", self.conn, + if_exists='replace') + + def test_get_schema(self): + create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite', + con=self.conn) + self.assertTrue('CREATE' in create_sql) + + +class TestSQLApi(_TestSQLApi): + """ + Test the public API as it would be used directly + + Tests for `read_sql_table` are included here, as this is specific for the + sqlalchemy mode. 
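+
+    A rough sketch of the delegation exercised in ``test_read_sql_delegate``
+    below (assuming an SQLAlchemy engine ``engine`` and an existing ``iris``
+    table)::
+
+        frame1 = sql.read_sql("SELECT * FROM iris", engine)  # delegates to read_sql_query
+        frame2 = sql.read_sql("iris", engine)                # delegates to read_sql_table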
+ + """ + flavor = 'sqlite' + + def connect(self): + if SQLALCHEMY_INSTALLED: + return sqlalchemy.create_engine('sqlite:///:memory:') + else: + raise nose.SkipTest('SQLAlchemy not installed') + + def test_read_table_columns(self): + # test columns argument in read_table + sql.to_sql(self.test_frame1, 'test_frame', self.conn) + + cols = ['A', 'B'] + result = sql.read_sql_table('test_frame', self.conn, columns=cols) + self.assertEqual(result.columns.tolist(), cols, + "Columns not correctly selected") + + def test_read_table_index_col(self): + # test columns argument in read_table + sql.to_sql(self.test_frame1, 'test_frame', self.conn) + + result = sql.read_sql_table('test_frame', self.conn, index_col="index") + self.assertEqual(result.index.names, ["index"], + "index_col not correctly set") + + result = sql.read_sql_table('test_frame', self.conn, index_col=["A", "B"]) + self.assertEqual(result.index.names, ["A", "B"], + "index_col not correctly set") + + result = sql.read_sql_table('test_frame', self.conn, index_col=["A", "B"], + columns=["C", "D"]) + self.assertEqual(result.index.names, ["A", "B"], + "index_col not correctly set") + self.assertEqual(result.columns.tolist(), ["C", "D"], + "columns not set correctly whith index_col") + + def test_read_sql_delegate(self): + iris_frame1 = sql.read_sql_query( + "SELECT * FROM iris", self.conn) + iris_frame2 = sql.read_sql( + "SELECT * FROM iris", self.conn) + tm.assert_frame_equal(iris_frame1, iris_frame2, + "read_sql and read_sql_query have not the same" + " result with a query") + + iris_frame1 = sql.read_sql_table('iris', self.conn) + iris_frame2 = sql.read_sql('iris', self.conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) + + def test_not_reflect_all_tables(self): + # create invalid table + qry = """CREATE TABLE invalid (x INTEGER, y UNKNOWN);""" + self.conn.execute(qry) + qry = """CREATE TABLE other_table (x INTEGER, y INTEGER);""" + self.conn.execute(qry) + + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + # Trigger a warning. + sql.read_sql_table('other_table', self.conn) + sql.read_sql_query('SELECT * FROM other_table', self.conn) + # Verify some things + self.assertEqual(len(w), 0, "Warning triggered for other table") + + +class TestSQLLegacyApi(_TestSQLApi): + """ + Test the public legacy API + + """ + flavor = 'sqlite' + + def connect(self, database=":memory:"): + return sqlite3.connect(database) + + def test_sql_open_close(self): + # Test if the IO in the database still work if the connection closed + # between the writing and reading (as in many real situations). 
+ + with tm.ensure_clean() as name: + + conn = self.connect(name) + sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, + flavor="sqlite", index=False) + conn.close() + + conn = self.connect(name) + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", + conn) + conn.close() + + tm.assert_frame_equal(self.test_frame3, result) + + def test_read_sql_delegate(self): + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) + tm.assert_frame_equal(iris_frame1, iris_frame2, + "read_sql and read_sql_query have not the same" + " result with a query") + + self.assertRaises(sql.DatabaseError, sql.read_sql, 'iris', self.conn) + + def test_safe_names_warning(self): + # GH 6798 + df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b ']) # has a space + # warns on create table with spaces in names + with tm.assert_produces_warning(): + sql.to_sql(df, "test_frame3_legacy", self.conn, + flavor="sqlite", index=False) + + def test_get_schema2(self): + # without providing a connection object (available for backwards comp) + create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite') + self.assertTrue('CREATE' in create_sql) + + def test_tquery(self): + with tm.assert_produces_warning(FutureWarning): + iris_results = sql.tquery("SELECT * FROM iris", con=self.conn) + row = iris_results[0] + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + + def test_uquery(self): + with tm.assert_produces_warning(FutureWarning): + rows = sql.uquery("SELECT * FROM iris LIMIT 1", con=self.conn) + self.assertEqual(rows, -1) + + +#------------------------------------------------------------------------------ +#--- Database flavor specific tests + + +class _TestSQLAlchemy(PandasSQLTest): + """ + Base class for testing the sqlalchemy backend. + + Subclasses for specific database types are created below. Tests that + deviate for each flavor are overwritten there. 
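+
+    Subclasses must implement ``setup_driver``, ``connect`` and ``tearDown``
+    (the base versions raise ``NotImplementedError``); ``setup_connect`` then
+    skips the tests when no connection to the server can be made.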
+ + """ + flavor = None + + def setUp(self): + self.setup_import() + self.setup_driver() + self.setup_connect() + + self._load_iris_data() + self._load_raw_sql() + self._load_test1_data() + + def setup_import(self): + # Skip this test if SQLAlchemy not available + if not SQLALCHEMY_INSTALLED: + raise nose.SkipTest('SQLAlchemy not installed') + + def setup_driver(self): + raise NotImplementedError() + + def connect(self): + raise NotImplementedError() + + def setup_connect(self): + try: + self.conn = self.connect() + self.pandasSQL = sql.PandasSQLAlchemy(self.conn) + # to test if connection can be made: + self.conn.connect() + except sqlalchemy.exc.OperationalError: + raise nose.SkipTest("Can't connect to {0} server".format(self.flavor)) + + def tearDown(self): + raise NotImplementedError() + + def test_aread_sql(self): + self._read_sql_iris() + + def test_read_sql_parameter(self): + self._read_sql_iris_parameter() + + def test_read_sql_named_parameter(self): + self._read_sql_iris_named_parameter() + + def test_to_sql(self): + self._to_sql() + + def test_to_sql_fail(self): + self._to_sql_fail() + + def test_to_sql_replace(self): + self._to_sql_replace() + + def test_to_sql_append(self): + self._to_sql_append() + + def test_create_table(self): + temp_conn = self.connect() + temp_frame = DataFrame( + {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + + pandasSQL = sql.PandasSQLAlchemy(temp_conn) + pandasSQL.to_sql(temp_frame, 'temp_frame') + + self.assertTrue( + temp_conn.has_table('temp_frame'), 'Table not written to DB') + + def test_drop_table(self): + temp_conn = self.connect() + + temp_frame = DataFrame( + {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + + pandasSQL = sql.PandasSQLAlchemy(temp_conn) + pandasSQL.to_sql(temp_frame, 'temp_frame') + + self.assertTrue( + temp_conn.has_table('temp_frame'), 'Table not written to DB') + + pandasSQL.drop_table('temp_frame') + + self.assertFalse( + temp_conn.has_table('temp_frame'), 'Table not deleted from DB') + + def test_roundtrip(self): + self._roundtrip() + + def test_execute_sql(self): + self._execute_sql() + + def test_read_table(self): + iris_frame = sql.read_sql_table("iris", con=self.conn) + self._check_iris_loaded_frame(iris_frame) + + def test_read_table_columns(self): + iris_frame = sql.read_sql_table( + "iris", con=self.conn, columns=['SepalLength', 'SepalLength']) + tm.equalContents( + iris_frame.columns.values, ['SepalLength', 'SepalLength']) + + def test_read_table_absent(self): + self.assertRaises( + ValueError, sql.read_sql_table, "this_doesnt_exist", con=self.conn) + + def test_default_type_conversion(self): + df = sql.read_sql_table("types_test_data", self.conn) + + self.assertTrue(issubclass(df.FloatCol.dtype.type, np.floating), + "FloatCol loaded with incorrect type") + self.assertTrue(issubclass(df.IntCol.dtype.type, np.integer), + "IntCol loaded with incorrect type") + self.assertTrue(issubclass(df.BoolCol.dtype.type, np.bool_), + "BoolCol loaded with incorrect type") + + # Int column with NA values stays as float + self.assertTrue(issubclass(df.IntColWithNull.dtype.type, np.floating), + "IntColWithNull loaded with incorrect type") + # Bool column with NA values becomes object + self.assertTrue(issubclass(df.BoolColWithNull.dtype.type, np.object), + "BoolColWithNull loaded with incorrect type") + + def test_bigint(self): + # int64 should be converted to BigInteger, GH7433 + df = DataFrame(data={'i64':[2**62]}) + df.to_sql('test_bigint', self.conn, index=False) + result = sql.read_sql_table('test_bigint', self.conn) 
+ + tm.assert_frame_equal(df, result) + + def test_default_date_load(self): + df = sql.read_sql_table("types_test_data", self.conn) + + # IMPORTANT - sqlite has no native date type, so shouldn't parse, but + # MySQL SHOULD be converted. + self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + def test_date_parsing(self): + # No Parsing + df = sql.read_sql_table("types_test_data", self.conn) + + df = sql.read_sql_table("types_test_data", self.conn, + parse_dates=['DateCol']) + self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + df = sql.read_sql_table("types_test_data", self.conn, + parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) + self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + df = sql.read_sql_table("types_test_data", self.conn, parse_dates={ + 'DateCol': {'format': '%Y-%m-%d %H:%M:%S'}}) + self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64), + "IntDateCol loaded with incorrect type") + + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates=['IntDateCol']) + self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), + "IntDateCol loaded with incorrect type") + + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={'IntDateCol': 's'}) + self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), + "IntDateCol loaded with incorrect type") + + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={'IntDateCol': {'unit': 's'}}) + self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), + "IntDateCol loaded with incorrect type") + + def test_datetime(self): + if self.driver == 'pymysql': + raise nose.SkipTest('writing datetime not working with pymysql') + + df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3), + 'B': np.arange(3.0)}) + df.to_sql('test_datetime', self.conn) + + # with read_table -> type information from schema used + result = sql.read_sql_table('test_datetime', self.conn) + result = result.drop('index', axis=1) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn) + result = result.drop('index', axis=1) + if self.flavor == 'sqlite': + self.assertTrue(isinstance(result.loc[0, 'A'], string_types)) + result['A'] = to_datetime(result['A']) + tm.assert_frame_equal(result, df) + else: + tm.assert_frame_equal(result, df) + + def test_datetime_NaT(self): + # status: + # - postgresql: gives error on inserting "0001-255-255T00:00:00" + # - sqlite3: works, but reading it with query returns '-001--1--1 -1:-1:-1.-00001' + + if self.driver == 'pymysql': + raise nose.SkipTest('writing datetime not working with pymysql') + if self.driver == 'psycopg2': + raise nose.SkipTest('writing datetime NaT not working with psycopg2') + if self.flavor == 'sqlite': + raise nose.SkipTest('reading datetime NaT not working with sqlite') + + df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3), + 'B': np.arange(3.0)}) + df.loc[1, 'A'] = np.nan + df.to_sql('test_datetime', self.conn, index=False) + + # with read_table -> type information from schema used + result = sql.read_sql_table('test_datetime', self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn) + if self.flavor == 
'sqlite': + self.assertTrue(isinstance(result.loc[0, 'A'], string_types)) + result['A'] = to_datetime(result['A'], coerce=True) + tm.assert_frame_equal(result, df) + else: + tm.assert_frame_equal(result, df) + + def test_mixed_dtype_insert(self): + # see GH6509 + s1 = Series(2**25 + 1,dtype=np.int32) + s2 = Series(0.0,dtype=np.float32) + df = DataFrame({'s1': s1, 's2': s2}) + + # write and read again + df.to_sql("test_read_write", self.conn, index=False) + df2 = sql.read_sql_table("test_read_write", self.conn) + + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) + + def test_nan_numeric(self): + if self.driver == 'pymysql': + raise nose.SkipTest('writing NaNs not working with pymysql') + + # NaNs in numeric float column + df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]}) + df.to_sql('test_nan', self.conn, index=False) + + # with read_table + result = sql.read_sql_table('test_nan', self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + tm.assert_frame_equal(result, df) + + def test_nan_fullcolumn(self): + if self.driver == 'pymysql': + raise nose.SkipTest('writing NaNs not working with pymysql') + + # full NaN column (numeric float column) + df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]}) + df.to_sql('test_nan', self.conn, index=False) + + if self.flavor == 'sqlite': + df['B'] = df['B'].astype('object') + df['B'] = None + + # with read_table + result = sql.read_sql_table('test_nan', self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + tm.assert_frame_equal(result, df) + + def test_nan_string(self): + if self.driver == 'pymysql': + raise nose.SkipTest('writing NaNs not working with pymysql') + + # NaNs in string column + df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]}) + df.to_sql('test_nan', self.conn, index=False) + + if self.flavor == 'sqlite': + df.loc[2, 'B'] = None + elif self.flavor == 'postgresql': + df = df.fillna('NaN') + + # with read_table + result = sql.read_sql_table('test_nan', self.conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + tm.assert_frame_equal(result, df) + + +class TestSQLiteAlchemy(_TestSQLAlchemy): + """ + Test the sqlalchemy backend against an in-memory sqlite database. 
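+
+    sqlite has no native BOOLEAN or date types, so the base class's type
+    conversion and date loading tests are overridden below: boolean columns
+    come back as integers and ``DateCol`` is not parsed to datetime64.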
+ + """ + flavor = 'sqlite' + + def connect(self): + return sqlalchemy.create_engine('sqlite:///:memory:') + + def setup_driver(self): + # sqlite3 is built-in + self.driver = None + + def tearDown(self): + # in memory so tables should not be removed explicitly + pass + + def test_default_type_conversion(self): + df = sql.read_sql_table("types_test_data", self.conn) + + self.assertTrue(issubclass(df.FloatCol.dtype.type, np.floating), + "FloatCol loaded with incorrect type") + self.assertTrue(issubclass(df.IntCol.dtype.type, np.integer), + "IntCol loaded with incorrect type") + # sqlite has no boolean type, so integer type is returned + self.assertTrue(issubclass(df.BoolCol.dtype.type, np.integer), + "BoolCol loaded with incorrect type") + + # Int column with NA values stays as float + self.assertTrue(issubclass(df.IntColWithNull.dtype.type, np.floating), + "IntColWithNull loaded with incorrect type") + # Non-native Bool column with NA values stays as float + self.assertTrue(issubclass(df.BoolColWithNull.dtype.type, np.floating), + "BoolColWithNull loaded with incorrect type") + + def test_default_date_load(self): + df = sql.read_sql_table("types_test_data", self.conn) + + # IMPORTANT - sqlite has no native date type, so shouldn't parse, but + self.assertFalse(issubclass(df.DateCol.dtype.type, np.datetime64), + "DateCol loaded with incorrect type") + + def test_bigint_warning(self): + # test no warning for BIGINT (to support int64) is raised (GH7433) + df = DataFrame({'a':[1,2]}, dtype='int64') + df.to_sql('test_bigintwarning', self.conn, index=False) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + sql.read_sql_table('test_bigintwarning', self.conn) + self.assertEqual(len(w), 0, "Warning triggered for other table") + + +class TestMySQLAlchemy(_TestSQLAlchemy): + """ + Test the sqlalchemy backend against an MySQL database. + + """ + flavor = 'mysql' + + def connect(self): + return sqlalchemy.create_engine( + 'mysql+{driver}://root@localhost/pandas_nosetest'.format(driver=self.driver)) + + def setup_driver(self): + try: + import pymysql + self.driver = 'pymysql' + except ImportError: + raise nose.SkipTest('pymysql not installed') + + def tearDown(self): + c = self.conn.execute('SHOW TABLES') + for table in c.fetchall(): + self.conn.execute('DROP TABLE %s' % table[0]) + + def test_default_type_conversion(self): + df = sql.read_sql_table("types_test_data", self.conn) + + self.assertTrue(issubclass(df.FloatCol.dtype.type, np.floating), + "FloatCol loaded with incorrect type") + self.assertTrue(issubclass(df.IntCol.dtype.type, np.integer), + "IntCol loaded with incorrect type") + # MySQL has no real BOOL type (it's an alias for TINYINT) + self.assertTrue(issubclass(df.BoolCol.dtype.type, np.integer), + "BoolCol loaded with incorrect type") + + # Int column with NA values stays as float + self.assertTrue(issubclass(df.IntColWithNull.dtype.type, np.floating), + "IntColWithNull loaded with incorrect type") + # Bool column with NA = int column with NA values => becomes float + self.assertTrue(issubclass(df.BoolColWithNull.dtype.type, np.floating), + "BoolColWithNull loaded with incorrect type") + + def test_read_procedure(self): + # see GH7324. 
Although it is more an api test, it is added to the + # mysql tests as sqlite does not have stored procedures + df = DataFrame({'a': [1, 2, 3], 'b':[0.1, 0.2, 0.3]}) + df.to_sql('test_procedure', self.conn, index=False) + + proc = """DROP PROCEDURE IF EXISTS get_testdb; + + CREATE PROCEDURE get_testdb () + + BEGIN + SELECT * FROM test_procedure; + END""" + + connection = self.conn.connect() + trans = connection.begin() + try: + r1 = connection.execute(proc) + trans.commit() + except: + trans.rollback() + raise + + res1 = sql.read_sql_query("CALL get_testdb();", self.conn) + tm.assert_frame_equal(df, res1) + + # test delegation to read_sql_query + res2 = sql.read_sql("CALL get_testdb();", self.conn) + tm.assert_frame_equal(df, res2) + + +class TestPostgreSQLAlchemy(_TestSQLAlchemy): + """ + Test the sqlalchemy backend against an PostgreSQL database. + + """ + flavor = 'postgresql' + + def connect(self): + return sqlalchemy.create_engine( + 'postgresql+{driver}://postgres@localhost/pandas_nosetest'.format(driver=self.driver)) + + def setup_driver(self): + try: + import psycopg2 + self.driver = 'psycopg2' + except ImportError: + raise nose.SkipTest('psycopg2 not installed') + + def tearDown(self): + c = self.conn.execute( + "SELECT table_name FROM information_schema.tables" + " WHERE table_schema = 'public'") + for table in c.fetchall(): + self.conn.execute("DROP TABLE %s" % table[0]) + + +#------------------------------------------------------------------------------ +#--- Test Sqlite / MySQL fallback + +class TestSQLiteLegacy(PandasSQLTest): + """ + Test the legacy mode against an in-memory sqlite database. + + """ + flavor = 'sqlite' + + def connect(self): + return sqlite3.connect(':memory:') + + def drop_table(self, table_name): + cur = self.conn.cursor() + cur.execute("DROP TABLE IF EXISTS %s" % table_name) + self.conn.commit() + + def setUp(self): + self.conn = self.connect() + self.pandasSQL = sql.PandasSQLLegacy(self.conn, 'sqlite') + + self._load_iris_data() + + self._load_test1_data() + + def test_invalid_flavor(self): + self.assertRaises( + NotImplementedError, sql.PandasSQLLegacy, self.conn, 'oracle') + + def test_read_sql(self): + self._read_sql_iris() + + def test_read_sql_parameter(self): + self._read_sql_iris_parameter() + + def test_read_sql_named_parameter(self): + self._read_sql_iris_named_parameter() + + def test_to_sql(self): + self._to_sql() + + def test_to_sql_fail(self): + self._to_sql_fail() + + def test_to_sql_replace(self): + self._to_sql_replace() + + def test_to_sql_append(self): + self._to_sql_append() + + def test_create_and_drop_table(self): + temp_frame = DataFrame( + {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + + self.pandasSQL.to_sql(temp_frame, 'drop_test_frame') + + self.assertTrue(self.pandasSQL.has_table('drop_test_frame'), + 'Table not written to DB') + + self.pandasSQL.drop_table('drop_test_frame') + + self.assertFalse(self.pandasSQL.has_table('drop_test_frame'), + 'Table not deleted from DB') + + def test_roundtrip(self): + self._roundtrip() + + def test_execute_sql(self): + self._execute_sql() + + +class TestMySQLLegacy(TestSQLiteLegacy): + """ + Test the legacy mode against a MySQL database. 
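+
+    The connection is made with pymysql against a local ``pandas_nosetest``
+    database; the tests are skipped if pymysql is not installed or the
+    server cannot be reached.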
+ + """ + flavor = 'mysql' + + def drop_table(self, table_name): + cur = self.conn.cursor() + cur.execute("DROP TABLE IF EXISTS %s" % table_name) + self.conn.commit() + + def _count_rows(self, table_name): + cur = self._get_exec() + cur.execute( + "SELECT count(*) AS count_1 FROM %s" % table_name) + rows = cur.fetchall() + return rows[0][0] + + def connect(self): + return self.driver.connect(host='127.0.0.1', user='root', passwd='', db='pandas_nosetest') + + def setUp(self): + try: + import pymysql + self.driver = pymysql + except ImportError: + raise nose.SkipTest('pymysql not installed') + + try: + self.conn = self.connect() + except self.driver.err.OperationalError: + raise nose.SkipTest("Can't connect to MySQL server") + + self.pandasSQL = sql.PandasSQLLegacy(self.conn, 'mysql') + + self._load_iris_data() + self._load_test1_data() + + def tearDown(self): + c = self.conn.cursor() + c.execute('SHOW TABLES') + for table in c.fetchall(): + c.execute('DROP TABLE %s' % table[0]) + self.conn.commit() + self.conn.close() + + def test_a_deprecation(self): + with tm.assert_produces_warning(FutureWarning): + sql.to_sql(self.test_frame1, 'test_frame1', self.conn, + flavor='mysql') + self.assertTrue( + sql.has_table('test_frame1', self.conn, flavor='mysql'), + 'Table not written to DB') + + +#------------------------------------------------------------------------------ +#--- Old tests from 0.13.1 (before refactor using sqlalchemy) + + +_formatters = { + datetime: lambda dt: "'%s'" % date_format(dt), + str: lambda x: "'%s'" % x, + np.str_: lambda x: "'%s'" % x, + compat.text_type: lambda x: "'%s'" % x, + compat.binary_type: lambda x: "'%s'" % x, + float: lambda x: "%.8f" % x, + int: lambda x: "%s" % x, + type(None): lambda x: "NULL", + np.float64: lambda x: "%.10f" % x, + bool: lambda x: "'%s'" % x, +} + +def format_query(sql, *args): + """ + + """ + processed_args = [] + for arg in args: + if isinstance(arg, float) and isnull(arg): + arg = None + + formatter = _formatters[type(arg)] + processed_args.append(formatter(arg)) + + return sql % tuple(processed_args) + +def _skip_if_no_pymysql(): + try: + import pymysql + except ImportError: + raise nose.SkipTest('pymysql not installed, skipping') + + +class TestXSQLite(tm.TestCase): + + def setUp(self): + self.db = sqlite3.connect(':memory:') + + def test_basic(self): + frame = tm.makeTimeDataFrame() + self._check_roundtrip(frame) + + def test_write_row_by_row(self): + + frame = tm.makeTimeDataFrame() + frame.ix[0, 0] = np.nan + create_sql = sql.get_schema(frame, 'test', 'sqlite') + cur = self.db.cursor() + cur.execute(create_sql) + + cur = self.db.cursor() + + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for idx, row in frame.iterrows(): + fmt_sql = format_query(ins, *row) + sql.tquery(fmt_sql, cur=cur) + + self.db.commit() + + result = sql.read_frame("select * from test", con=self.db) + result.index = frame.index + tm.assert_frame_equal(result, frame) + + def test_execute(self): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, 'test', 'sqlite') + cur = self.db.cursor() + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (?, ?, ?, ?)" + + row = frame.ix[0] + sql.execute(ins, self.db, params=tuple(row)) + self.db.commit() + + result = sql.read_frame("select * from test", self.db) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) + + def test_schema(self): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, 'test', 'sqlite') + lines = create_sql.splitlines() + for l in lines: + 
tokens = l.split(' ') + if len(tokens) == 2 and tokens[0] == 'A': + self.assertTrue(tokens[1] == 'DATETIME') + + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, 'test', 'sqlite', keys=['A', 'B'],) + lines = create_sql.splitlines() + self.assertTrue('PRIMARY KEY (A,B)' in create_sql) + cur = self.db.cursor() + cur.execute(create_sql) + + def test_execute_fail(self): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + cur = self.db.cursor() + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.db) + sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)', self.db) + + try: + sys.stdout = StringIO() + self.assertRaises(Exception, sql.execute, + 'INSERT INTO test VALUES("foo", "bar", 7)', + self.db) + finally: + sys.stdout = sys.__stdout__ + + def test_execute_closed_connection(self): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + cur = self.db.cursor() + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.db) + self.db.close() + try: + sys.stdout = StringIO() + self.assertRaises(Exception, sql.tquery, "select * from test", + con=self.db) + finally: + sys.stdout = sys.__stdout__ + + def test_na_roundtrip(self): + pass + + def _check_roundtrip(self, frame): + sql.write_frame(frame, name='test_table', con=self.db) + result = sql.read_frame("select * from test_table", self.db) + + # HACK! Change this once indexes are handled properly. + result.index = frame.index + + expected = frame + tm.assert_frame_equal(result, expected) + + frame['txt'] = ['a'] * len(frame) + frame2 = frame.copy() + frame2['Idx'] = Index(lrange(len(frame2))) + 10 + sql.write_frame(frame2, name='test_table2', con=self.db) + result = sql.read_frame("select * from test_table2", self.db, + index_col='Idx') + expected = frame.copy() + expected.index = Index(lrange(len(frame2))) + 10 + expected.index.name = 'Idx' + tm.assert_frame_equal(expected, result) + + def test_tquery(self): + frame = tm.makeTimeDataFrame() + sql.write_frame(frame, name='test_table', con=self.db) + result = sql.tquery("select A from test_table", self.db) + expected = frame.A + result = Series(result, frame.index) + tm.assert_series_equal(result, expected) + + try: + sys.stdout = StringIO() + self.assertRaises(sql.DatabaseError, sql.tquery, + 'select * from blah', con=self.db) + + self.assertRaises(sql.DatabaseError, sql.tquery, + 'select * from blah', con=self.db, retry=True) + finally: + sys.stdout = sys.__stdout__ + + def test_uquery(self): + frame = tm.makeTimeDataFrame() + sql.write_frame(frame, name='test_table', con=self.db) + stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' + self.assertEqual(sql.uquery(stmt, con=self.db), 1) + + try: + sys.stdout = StringIO() + + self.assertRaises(sql.DatabaseError, sql.tquery, + 'insert into blah values (1)', con=self.db) + + self.assertRaises(sql.DatabaseError, sql.tquery, + 'insert into blah values (1)', con=self.db, + retry=True) + finally: + sys.stdout = sys.__stdout__ + + def test_keyword_as_column_names(self): + ''' + ''' + df = DataFrame({'From':np.ones(5)}) + sql.write_frame(df, con = self.db, name = 'testkeywords') + + def test_onecolumn_of_integer(self): + # GH 3628 + # a column_of_integers dataframe should transfer well to sql + + mono_df=DataFrame([1 , 2], columns=['c0']) + sql.write_frame(mono_df, con = self.db, name = 'mono_df') + # computing the sum via sql + 
con_x=self.db + the_sum=sum([my_c0[0] for my_c0 in con_x.execute("select * from mono_df")]) + # it should not fail, and gives 3 ( Issue #3628 ) + self.assertEqual(the_sum , 3) + + result = sql.read_frame("select * from mono_df",con_x) + tm.assert_frame_equal(result,mono_df) + + def test_if_exists(self): + df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) + df_if_exists_2 = DataFrame({'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) + table_name = 'table_if_exists' + sql_select = "SELECT * FROM %s" % table_name + + def clean_up(test_table_to_drop): + """ + Drops tables created from individual tests + so no dependencies arise from sequential tests + """ + if sql.table_exists(test_table_to_drop, self.db, flavor='sqlite'): + cur = self.db.cursor() + cur.execute("DROP TABLE %s" % test_table_to_drop) + cur.close() + + # test if invalid value for if_exists raises appropriate error + self.assertRaises(ValueError, + sql.write_frame, + frame=df_if_exists_1, + con=self.db, + name=table_name, + flavor='sqlite', + if_exists='notvalidvalue') + clean_up(table_name) + + # test if_exists='fail' + sql.write_frame(frame=df_if_exists_1, con=self.db, name=table_name, + flavor='sqlite', if_exists='fail') + self.assertRaises(ValueError, + sql.write_frame, + frame=df_if_exists_1, + con=self.db, + name=table_name, + flavor='sqlite', + if_exists='fail') + + # test if_exists='replace' + sql.write_frame(frame=df_if_exists_1, con=self.db, name=table_name, + flavor='sqlite', if_exists='replace') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(1, 'A'), (2, 'B')]) + sql.write_frame(frame=df_if_exists_2, con=self.db, name=table_name, + flavor='sqlite', if_exists='replace') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(3, 'C'), (4, 'D'), (5, 'E')]) + clean_up(table_name) + + # test if_exists='append' + sql.write_frame(frame=df_if_exists_1, con=self.db, name=table_name, + flavor='sqlite', if_exists='fail') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(1, 'A'), (2, 'B')]) + sql.write_frame(frame=df_if_exists_2, con=self.db, name=table_name, + flavor='sqlite', if_exists='append') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) + clean_up(table_name) + + +class TestXMySQL(tm.TestCase): + + def setUp(self): + _skip_if_no_pymysql() + import pymysql + try: + # Try Travis defaults. + # No real user should allow root access with a blank password. + self.db = pymysql.connect(host='localhost', user='root', passwd='', + db='pandas_nosetest') + except: + pass + else: + return + try: + self.db = pymysql.connect(read_default_group='pandas') + except pymysql.ProgrammingError as e: + raise nose.SkipTest( + "Create a group of connection parameters under the heading " + "[pandas] in your system's mysql default file, " + "typically located at ~/.my.cnf or /etc/.my.cnf. ") + except pymysql.Error as e: + raise nose.SkipTest( + "Cannot connect to database. " + "Create a group of connection parameters under the heading " + "[pandas] in your system's mysql default file, " + "typically located at ~/.my.cnf or /etc/.my.cnf. 
") + + def test_basic(self): + _skip_if_no_pymysql() + frame = tm.makeTimeDataFrame() + self._check_roundtrip(frame) + + def test_write_row_by_row(self): + + _skip_if_no_pymysql() + frame = tm.makeTimeDataFrame() + frame.ix[0, 0] = np.nan + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = sql.get_schema(frame, 'test', 'mysql') + cur = self.db.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for idx, row in frame.iterrows(): + fmt_sql = format_query(ins, *row) + sql.tquery(fmt_sql, cur=cur) + + self.db.commit() + + result = sql.read_frame("select * from test", con=self.db) + result.index = frame.index + tm.assert_frame_equal(result, frame) + + def test_execute(self): + _skip_if_no_pymysql() + frame = tm.makeTimeDataFrame() + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = sql.get_schema(frame, 'test', 'mysql') + cur = self.db.cursor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unknown table.*") + cur.execute(drop_sql) + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + + row = frame.ix[0].values.tolist() + sql.execute(ins, self.db, params=tuple(row)) + self.db.commit() + + result = sql.read_frame("select * from test", self.db) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) + + def test_schema(self): + _skip_if_no_pymysql() + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, 'test', 'mysql') + lines = create_sql.splitlines() + for l in lines: + tokens = l.split(' ') + if len(tokens) == 2 and tokens[0] == 'A': + self.assertTrue(tokens[1] == 'DATETIME') + + frame = tm.makeTimeDataFrame() + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = sql.get_schema(frame, 'test', 'mysql', keys=['A', 'B'],) + lines = create_sql.splitlines() + self.assertTrue('PRIMARY KEY (A,B)' in create_sql) + cur = self.db.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + + def test_execute_fail(self): + _skip_if_no_pymysql() + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a(5), b(5)) + ); + """ + cur = self.db.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.db) + sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)', self.db) + + try: + sys.stdout = StringIO() + self.assertRaises(Exception, sql.execute, + 'INSERT INTO test VALUES("foo", "bar", 7)', + self.db) + finally: + sys.stdout = sys.__stdout__ + + def test_execute_closed_connection(self): + _skip_if_no_pymysql() + drop_sql = "DROP TABLE IF EXISTS test" + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a(5), b(5)) + ); + """ + cur = self.db.cursor() + cur.execute(drop_sql) + cur.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.db) + self.db.close() + try: + sys.stdout = StringIO() + self.assertRaises(Exception, sql.tquery, "select * from test", + con=self.db) + finally: + sys.stdout = sys.__stdout__ + + def test_na_roundtrip(self): + _skip_if_no_pymysql() + pass + + def _check_roundtrip(self, frame): + _skip_if_no_pymysql() + drop_sql = "DROP TABLE IF EXISTS test_table" + cur = self.db.cursor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unknown table.*") + cur.execute(drop_sql) + sql.write_frame(frame, name='test_table', con=self.db, flavor='mysql') + result = sql.read_frame("select * from test_table", self.db) 
+ + # HACK! Change this once indexes are handled properly. + result.index = frame.index + result.index.name = frame.index.name + + expected = frame + tm.assert_frame_equal(result, expected) + + frame['txt'] = ['a'] * len(frame) + frame2 = frame.copy() + index = Index(lrange(len(frame2))) + 10 + frame2['Idx'] = index + drop_sql = "DROP TABLE IF EXISTS test_table2" + cur = self.db.cursor() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unknown table.*") + cur.execute(drop_sql) + sql.write_frame(frame2, name='test_table2', con=self.db, flavor='mysql') + result = sql.read_frame("select * from test_table2", self.db, + index_col='Idx') + expected = frame.copy() + + # HACK! Change this once indexes are handled properly. + expected.index = index + expected.index.names = result.index.names + tm.assert_frame_equal(expected, result) + + def test_tquery(self): + try: + import pymysql + except ImportError: + raise nose.SkipTest("no pymysql") + frame = tm.makeTimeDataFrame() + drop_sql = "DROP TABLE IF EXISTS test_table" + cur = self.db.cursor() + cur.execute(drop_sql) + sql.write_frame(frame, name='test_table', con=self.db, flavor='mysql') + result = sql.tquery("select A from test_table", self.db) + expected = frame.A + result = Series(result, frame.index) + tm.assert_series_equal(result, expected) + + try: + sys.stdout = StringIO() + self.assertRaises(sql.DatabaseError, sql.tquery, + 'select * from blah', con=self.db) + + self.assertRaises(sql.DatabaseError, sql.tquery, + 'select * from blah', con=self.db, retry=True) + finally: + sys.stdout = sys.__stdout__ + + def test_uquery(self): + try: + import pymysql + except ImportError: + raise nose.SkipTest("no pymysql") + frame = tm.makeTimeDataFrame() + drop_sql = "DROP TABLE IF EXISTS test_table" + cur = self.db.cursor() + cur.execute(drop_sql) + sql.write_frame(frame, name='test_table', con=self.db, flavor='mysql') + stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' + self.assertEqual(sql.uquery(stmt, con=self.db), 1) + + try: + sys.stdout = StringIO() + + self.assertRaises(sql.DatabaseError, sql.tquery, + 'insert into blah values (1)', con=self.db) + + self.assertRaises(sql.DatabaseError, sql.tquery, + 'insert into blah values (1)', con=self.db, + retry=True) + finally: + sys.stdout = sys.__stdout__ + + def test_keyword_as_column_names(self): + ''' + ''' + _skip_if_no_pymysql() + df = DataFrame({'From':np.ones(5)}) + sql.write_frame(df, con = self.db, name = 'testkeywords', + if_exists='replace', flavor='mysql') + + def test_if_exists(self): + _skip_if_no_pymysql() + df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) + df_if_exists_2 = DataFrame({'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) + table_name = 'table_if_exists' + sql_select = "SELECT * FROM %s" % table_name + + def clean_up(test_table_to_drop): + """ + Drops tables created from individual tests + so no dependencies arise from sequential tests + """ + if sql.table_exists(test_table_to_drop, self.db, flavor='mysql'): + cur = self.db.cursor() + cur.execute("DROP TABLE %s" % test_table_to_drop) + cur.close() + + # test if invalid value for if_exists raises appropriate error + self.assertRaises(ValueError, + sql.write_frame, + frame=df_if_exists_1, + con=self.db, + name=table_name, + flavor='mysql', + if_exists='notvalidvalue') + clean_up(table_name) + + # test if_exists='fail' + sql.write_frame(frame=df_if_exists_1, con=self.db, name=table_name, + flavor='mysql', if_exists='fail') + self.assertRaises(ValueError, + sql.write_frame, + 
frame=df_if_exists_1, + con=self.db, + name=table_name, + flavor='mysql', + if_exists='fail') + + # test if_exists='replace' + sql.write_frame(frame=df_if_exists_1, con=self.db, name=table_name, + flavor='mysql', if_exists='replace') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(1, 'A'), (2, 'B')]) + sql.write_frame(frame=df_if_exists_2, con=self.db, name=table_name, + flavor='mysql', if_exists='replace') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(3, 'C'), (4, 'D'), (5, 'E')]) + clean_up(table_name) + + # test if_exists='append' + sql.write_frame(frame=df_if_exists_1, con=self.db, name=table_name, + flavor='mysql', if_exists='fail') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(1, 'A'), (2, 'B')]) + sql.write_frame(frame=df_if_exists_2, con=self.db, name=table_name, + flavor='mysql', if_exists='append') + self.assertEqual(sql.tquery(sql_select, con=self.db), + [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) + clean_up(table_name) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py new file mode 100644 index 00000000..b045867b --- /dev/null +++ b/pandas/io/tests/test_stata.py @@ -0,0 +1,531 @@ +# pylint: disable=E1101 + +from datetime import datetime +import datetime as dt +import os +import warnings +import nose +import sys +from distutils.version import LooseVersion + +import numpy as np + +import pandas as pd +from pandas.core.frame import DataFrame, Series +from pandas.io.parsers import read_csv +from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, + PossiblePrecisionLoss) +import pandas.util.testing as tm +from pandas.util.misc import is_little_endian +from pandas import compat + +class TestStata(tm.TestCase): + + def setUp(self): + # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from: + # http://stata-press.com/data/glmext.html + self.dirpath = tm.get_data_path() + self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') + self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') + + self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') + self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') + self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') + self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta') + + self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') + self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') + self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') + self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta') + self.csv3 = os.path.join(self.dirpath, 'stata3.csv') + + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') + self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') + self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') + self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') + + self.dta7 = os.path.join(self.dirpath, 'cancer.dta') + self.csv7 = os.path.join(self.dirpath, 'cancer.csv') + + self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta') + + self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') + + self.dta9 = os.path.join(self.dirpath, 'lbw.dta') + self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + + self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') + + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') + self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') + self.dta14_114 = 
os.path.join(self.dirpath, 'stata5_114.dta') + self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') + self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta') + + self.csv15 = os.path.join(self.dirpath, 'stata6.csv') + self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') + self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') + self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') + self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta') + + def read_dta(self, file): + return read_stata(file, convert_dates=True) + + def read_csv(self, file): + return read_csv(file, parse_dates=True) + + def test_read_empty_dta(self): + empty_ds = DataFrame(columns=['unit']) + # GH 7369, make sure can read a 0-obs dta file + with tm.ensure_clean() as path: + empty_ds.to_stata(path,write_index=False) + empty_ds2 = read_stata(path) + tm.assert_frame_equal(empty_ds, empty_ds2) + + def test_read_dta1(self): + reader_114 = StataReader(self.dta1_114) + parsed_114 = reader_114.data() + reader_117 = StataReader(self.dta1_117) + parsed_117 = reader_117.data() + # Pandas uses np.nan as missing value. + # Thus, all columns will be of type float, regardless of their name. + expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=['float_miss', 'double_miss', 'byte_miss', + 'int_miss', 'long_miss']) + + # this is an oddity as really the nan should be float64, but + # the casting doesn't fail so need to match stata here + expected['float_miss'] = expected['float_miss'].astype(np.float32) + + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_117, expected) + + def test_read_dta2(self): + if LooseVersion(sys.version) < '2.7': + raise nose.SkipTest('datetime interp under 2.6 is faulty') + + expected = DataFrame.from_records( + [ + ( + datetime(2006, 11, 19, 23, 13, 20), + 1479596223000, + datetime(2010, 1, 20), + datetime(2010, 1, 8), + datetime(2010, 1, 1), + datetime(1974, 7, 1), + datetime(2010, 1, 1), + datetime(2010, 1, 1) + ), + ( + datetime(1959, 12, 31, 20, 3, 20), + -1479590, + datetime(1953, 10, 2), + datetime(1948, 6, 10), + datetime(1955, 1, 1), + datetime(1955, 7, 1), + datetime(1955, 1, 1), + datetime(2, 1, 1) + ), + ( + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + pd.NaT, + ) + ], + columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date', + 'monthly_date', 'quarterly_date', 'half_yearly_date', + 'yearly_date'] + ) + expected['yearly_date'] = expected['yearly_date'].astype('O') + + with warnings.catch_warnings(record=True) as w: + parsed_114 = self.read_dta(self.dta2_114) + parsed_115 = self.read_dta(self.dta2_115) + parsed_117 = self.read_dta(self.dta2_117) + # 113 is buggy due ot limits date format support in Stata + # parsed_113 = self.read_dta(self.dta2_113) + tm.assert_equal( + len(w), 1) # should get a warning for that format. 
+ + # buggy test because of the NaT comparison on certain platforms + # Format 113 test fails since it does not support tc and tC formats + # tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) + + def test_read_dta3(self): + parsed_113 = self.read_dta(self.dta3_113) + parsed_114 = self.read_dta(self.dta3_114) + parsed_115 = self.read_dta(self.dta3_115) + parsed_117 = self.read_dta(self.dta3_117) + + # match stata here + expected = self.read_csv(self.csv3) + expected = expected.astype(np.float32) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) + + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) + + def test_read_dta4(self): + parsed_113 = self.read_dta(self.dta4_113) + parsed_114 = self.read_dta(self.dta4_114) + parsed_115 = self.read_dta(self.dta4_115) + parsed_117 = self.read_dta(self.dta4_117) + + expected = DataFrame.from_records( + [ + ["one", "ten", "one", "one", "one"], + ["two", "nine", "two", "two", "two"], + ["three", "eight", "three", "three", "three"], + ["four", "seven", 4, "four", "four"], + ["five", "six", 5, np.nan, "five"], + ["six", "five", 6, np.nan, "six"], + ["seven", "four", 7, np.nan, "seven"], + ["eight", "three", 8, np.nan, "eight"], + ["nine", "two", 9, np.nan, "nine"], + ["ten", "one", "ten", np.nan, "ten"] + ], + columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', + 'labeled_with_missings', 'float_labelled']) + + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) + + def test_read_write_dta5(self): + original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=['float_miss', 'double_miss', 'byte_miss', + 'int_miss', 'long_miss']) + original.index.name = 'index' + + with tm.ensure_clean() as path: + original.to_stata(path, None) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + + def test_write_dta6(self): + original = self.read_csv(self.csv3) + original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) + + with tm.ensure_clean() as path: + original.to_stata(path, None) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + + @nose.tools.nottest + def test_read_dta7(self): + expected = read_csv(self.csv7, parse_dates=True, sep='\t') + parsed = self.read_dta(self.dta7) + tm.assert_frame_equal(parsed, expected) + + @nose.tools.nottest + def test_read_dta8(self): + expected = read_csv(self.csv8, parse_dates=True, sep='\t') + parsed = self.read_dta(self.dta8) + tm.assert_frame_equal(parsed, expected) + + @nose.tools.nottest + def test_read_dta9(self): + expected = read_csv(self.csv9, parse_dates=True, sep='\t') + parsed = self.read_dta(self.dta9) + tm.assert_frame_equal(parsed, expected) + + def test_read_write_dta10(self): + original = DataFrame(data=[["string", "object", 1, 1.1, + np.datetime64('2003-12-25')]], + columns=['string', 'object', 'integer', 'floating', + 'datetime']) + original["object"] 
= Series(original["object"], dtype=object) + original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = original['integer'].astype(np.int32) + + with tm.ensure_clean() as path: + original.to_stata(path, {'datetime': 'tc'}) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + + def test_stata_doc_examples(self): + with tm.ensure_clean() as path: + df = DataFrame(np.random.randn(10, 2), columns=list('AB')) + df.to_stata(path) + + def test_encoding(self): + + # GH 4626, proper encoding handling + raw = read_stata(self.dta_encoding) + encoded = read_stata(self.dta_encoding, encoding="latin-1") + result = encoded.kreis1849[0] + + if compat.PY3: + expected = raw.kreis1849[0] + self.assertEqual(result, expected) + self.assertIsInstance(result, compat.string_types) + else: + expected = raw.kreis1849.str.decode("latin-1")[0] + self.assertEqual(result, expected) + self.assertIsInstance(result, unicode) + + with tm.ensure_clean() as path: + encoded.to_stata(path,encoding='latin-1', write_index=False) + reread_encoded = read_stata(path, encoding='latin-1') + tm.assert_frame_equal(encoded, reread_encoded) + + def test_read_write_dta11(self): + original = DataFrame([(1, 2, 3, 4)], + columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______']) + formatted = DataFrame([(1, 2, 3, 4)], + columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) + formatted.index.name = 'index' + formatted = formatted.astype(np.int32) + + with tm.ensure_clean() as path: + with warnings.catch_warnings(record=True) as w: + original.to_stata(path, None) + # should get a warning for that format. + tm.assert_equal(len(w), 1) + + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta12(self): + original = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_1', + 'astringwithmorethan32characters_2', + '+', + '-', + 'short', + 'delete']) + formatted = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_', + '_0astringwithmorethan32character', + '_', + '_1_', + '_short', + '_delete']) + formatted.index.name = 'index' + formatted = formatted.astype(np.int32) + + with tm.ensure_clean() as path: + with warnings.catch_warnings(record=True) as w: + original.to_stata(path, None) + tm.assert_equal(len(w), 1) # should get a warning for that format. 
+ + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9, dtype=np.int16) + s2 = Series(2**17, dtype=np.int32) + s3 = Series(2**33, dtype=np.int64) + original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3}) + original.index.name = 'index' + + formatted = original + formatted['int64'] = formatted['int64'].astype(np.float64) + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + formatted) + + def test_read_write_reread_dta14(self): + expected = self.read_csv(self.csv14) + cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] + for col in cols: + expected[col] = expected[col].convert_objects(convert_numeric=True) + expected['float_'] = expected['float_'].astype(np.float32) + expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True) + + parsed_113 = self.read_dta(self.dta14_113) + parsed_113.index.name = 'index' + parsed_114 = self.read_dta(self.dta14_114) + parsed_114.index.name = 'index' + parsed_115 = self.read_dta(self.dta14_115) + parsed_115.index.name = 'index' + parsed_117 = self.read_dta(self.dta14_117) + parsed_117.index.name = 'index' + + tm.assert_frame_equal(parsed_114, parsed_113) + tm.assert_frame_equal(parsed_114, parsed_115) + tm.assert_frame_equal(parsed_114, parsed_117) + + with tm.ensure_clean() as path: + parsed_114.to_stata(path, {'date_td': 'td'}) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114) + + def test_read_write_reread_dta15(self): + expected = self.read_csv(self.csv15) + expected['byte_'] = expected['byte_'].astype(np.int8) + expected['int_'] = expected['int_'].astype(np.int16) + expected['long_'] = expected['long_'].astype(np.int32) + expected['float_'] = expected['float_'].astype(np.float32) + expected['double_'] = expected['double_'].astype(np.float64) + expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',)) + + parsed_113 = self.read_dta(self.dta15_113) + parsed_114 = self.read_dta(self.dta15_114) + parsed_115 = self.read_dta(self.dta15_115) + parsed_117 = self.read_dta(self.dta15_117) + + tm.assert_frame_equal(expected, parsed_114) + tm.assert_frame_equal(parsed_113, parsed_114) + tm.assert_frame_equal(parsed_114, parsed_115) + tm.assert_frame_equal(parsed_114, parsed_117) + + def test_timestamp_and_label(self): + original = DataFrame([(1,)], columns=['var']) + time_stamp = datetime(2000, 2, 29, 14, 21) + data_label = 'This is a data file.' + with tm.ensure_clean() as path: + original.to_stata(path, time_stamp=time_stamp, data_label=data_label) + reader = StataReader(path) + parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M')) + assert parsed_time_stamp == time_stamp + assert reader.data_label == data_label + + def test_numeric_column_names(self): + original = DataFrame(np.reshape(np.arange(25.0), (5, 5))) + original.index.name = 'index' + with tm.ensure_clean() as path: + # should get a warning for that format. 
+ with warnings.catch_warnings(record=True) as w: + tm.assert_produces_warning(original.to_stata(path), InvalidColumnName) + # should produce a single warning + tm.assert_equal(len(w), 1) + + written_and_read_again = self.read_dta(path) + written_and_read_again = written_and_read_again.set_index('index') + columns = list(written_and_read_again.columns) + convert_col_name = lambda x: int(x[1]) + written_and_read_again.columns = map(convert_col_name, columns) + tm.assert_frame_equal(original, written_and_read_again) + + def test_nan_to_missing_value(self): + s1 = Series(np.arange(4.0), dtype=np.float32) + s2 = Series(np.arange(4.0), dtype=np.float64) + s1[::2] = np.nan + s2[1::2] = np.nan + original = DataFrame({'s1': s1, 's2': s2}) + original.index.name = 'index' + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + written_and_read_again = written_and_read_again.set_index('index') + tm.assert_frame_equal(written_and_read_again, original) + + def test_no_index(self): + columns = ['x', 'y'] + original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), + columns=columns) + original.index.name = 'index_not_written' + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + written_and_read_again = self.read_dta(path) + tm.assertRaises(KeyError, + lambda: written_and_read_again['index_not_written']) + + def test_string_no_dates(self): + s1 = Series(['a', 'A longer string']) + s2 = Series([1.0, 2.0], dtype=np.float64) + original = DataFrame({'s1': s1, 's2': s2}) + original.index.name = 'index' + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + original) + + def test_large_value_conversion(self): + s0 = Series([1, 99], dtype=np.int8) + s1 = Series([1, 127], dtype=np.int8) + s2 = Series([1, 2 ** 15 - 1], dtype=np.int16) + s3 = Series([1, 2 ** 63 - 1], dtype=np.int64) + original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3}) + original.index.name = 'index' + with tm.ensure_clean() as path: + with warnings.catch_warnings(record=True) as w: + tm.assert_produces_warning(original.to_stata(path), + PossiblePrecisionLoss) + # should produce a single warning + tm.assert_equal(len(w), 1) + + written_and_read_again = self.read_dta(path) + modified = original.copy() + modified['s1'] = Series(modified['s1'], dtype=np.int16) + modified['s2'] = Series(modified['s2'], dtype=np.int32) + modified['s3'] = Series(modified['s3'], dtype=np.float64) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + modified) + + def test_dates_invalid_column(self): + original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)]) + original.index.name = 'index' + with tm.ensure_clean() as path: + with warnings.catch_warnings(record=True) as w: + tm.assert_produces_warning(original.to_stata(path, {0: 'tc'}), + InvalidColumnName) + tm.assert_equal(len(w), 1) + + written_and_read_again = self.read_dta(path) + modified = original.copy() + modified.columns = ['_0'] + tm.assert_frame_equal(written_and_read_again.set_index('index'), + modified) + + def test_date_export_formats(self): + columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'] + conversions = dict(((c, c) for c in columns)) + data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns) + original = DataFrame([data], columns=columns) + original.index.name = 'index' + expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time + datetime(2006, 11, 20), # Day + 
datetime(2006, 11, 19), # Week + datetime(2006, 11, 1), # Month + datetime(2006, 10, 1), # Quarter year + datetime(2006, 7, 1), # Half year + datetime(2006, 1, 1)] # Year + + expected = DataFrame([expected_values], columns=columns) + expected.index.name = 'index' + with tm.ensure_clean() as path: + original.to_stata(path, conversions) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + expected) + + def test_write_missing_strings(self): + original = DataFrame([["1"], [None]], columns=["foo"]) + expected = DataFrame([["1"], [""]], columns=["foo"]) + expected.index.name = 'index' + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/io/tests/test_wb.py b/pandas/io/tests/test_wb.py new file mode 100644 index 00000000..7c86f582 --- /dev/null +++ b/pandas/io/tests/test_wb.py @@ -0,0 +1,55 @@ +import nose + +import pandas +from pandas.compat import u +from pandas.util.testing import network +from pandas.util.testing import assert_frame_equal +from numpy.testing.decorators import slow +from pandas.io.wb import search, download, get_countries +import pandas.util.testing as tm + + +class TestWB(tm.TestCase): + + @slow + @network + def test_wdi_search(self): + raise nose.SkipTest + + expected = {u('id'): {2634: u('GDPPCKD'), + 4649: u('NY.GDP.PCAP.KD'), + 4651: u('NY.GDP.PCAP.KN'), + 4653: u('NY.GDP.PCAP.PP.KD')}, + u('name'): {2634: u('GDP per Capita, constant US$, ' + 'millions'), + 4649: u('GDP per capita (constant 2000 US$)'), + 4651: u('GDP per capita (constant LCU)'), + 4653: u('GDP per capita, PPP (constant 2005 ' + 'international $)')}} + result = search('gdp.*capita.*constant').ix[:, :2] + expected = pandas.DataFrame(expected) + expected.index = result.index + assert_frame_equal(result, expected) + + @slow + @network + def test_wdi_download(self): + raise nose.SkipTest + + expected = {'GDPPCKN': {(u('United States'), u('2003')): u('40800.0735367688'), (u('Canada'), u('2004')): u('37857.1261134552'), (u('United States'), u('2005')): u('42714.8594790102'), (u('Canada'), u('2003')): u('37081.4575704003'), (u('United States'), u('2004')): u('41826.1728310667'), (u('Mexico'), u('2003')): u('72720.0691255285'), (u('Mexico'), u('2004')): u('74751.6003347038'), (u('Mexico'), u('2005')): u('76200.2154469437'), (u('Canada'), u('2005')): u('38617.4563629611')}, 'GDPPCKD': {(u('United States'), u('2003')): u('40800.0735367688'), (u('Canada'), u('2004')): u('34397.055116118'), (u('United States'), u('2005')): u('42714.8594790102'), (u('Canada'), u('2003')): u('33692.2812368928'), (u('United States'), u('2004')): u('41826.1728310667'), (u('Mexico'), u('2003')): u('7608.43848670658'), (u('Mexico'), u('2004')): u('7820.99026814334'), (u('Mexico'), u('2005')): u('7972.55364129367'), (u('Canada'), u('2005')): u('35087.8925933298')}} + expected = pandas.DataFrame(expected) + result = download(country=['CA', 'MX', 'US', 'junk'], indicator=['GDPPCKD', + 'GDPPCKN', 'junk'], start=2003, end=2005) + expected.index = result.index + assert_frame_equal(result, pandas.DataFrame(expected)) + + @slow + @network + def test_wdi_get_countries(self): + result = get_countries() + self.assertTrue('Zimbabwe' in list(result['name'])) + + +if __name__ == '__main__': + 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/wb.py b/pandas/io/wb.py new file mode 100644 index 00000000..d815bb19 --- /dev/null +++ b/pandas/io/wb.py @@ -0,0 +1,192 @@ +from __future__ import print_function + +from pandas.compat import map, reduce, range, lrange +from pandas.io.common import urlopen +from pandas.io import json +import pandas +import numpy as np + + +def download(country=['MX', 'CA', 'US'], indicator=['GDPPCKD', 'GDPPCKN'], + start=2003, end=2005): + """ + Download data series from the World Bank's World Development Indicators + + Parameters + ---------- + + indicator: string or list of strings + taken from the ``id`` field in ``WDIsearch()`` + country: string or list of strings. + ``all`` downloads data for all countries + ISO-2 character codes select individual countries (e.g.``US``,``CA``) + start: int + First year of the data series + end: int + Last year of the data series (inclusive) + + Returns + ------- + + ``pandas`` DataFrame with columns: country, iso2c, year, indicator value. + """ + + # Are ISO-2 country codes valid? + valid_countries = [ + "AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BB", "BD", "BE", "BF", + "BG", "BH", "BI", "BJ", "BO", "BR", "BS", "BW", "BY", "BZ", "CA", "CD", + "CF", "CG", "CH", "CI", "CL", "CM", "CN", "CO", "CR", "CV", "CY", "CZ", + "DE", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ER", "ES", "ET", "FI", + "FJ", "FR", "GA", "GB", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW", + "GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN", "IR", "IS", + "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KM", "KR", "KW", "KZ", "LA", + "LB", "LC", "LK", "LS", "LT", "LU", "LV", "MA", "MD", "MG", "MK", "ML", + "MN", "MR", "MU", "MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", + "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PT", "PY", + "RO", "RU", "RW", "SA", "SB", "SC", "SD", "SE", "SG", "SI", "SK", "SL", + "SN", "SR", "SV", "SY", "SZ", "TD", "TG", "TH", "TN", "TR", "TT", "TW", + "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE", "VN", "VU", "YE", "ZA", + "ZM", "ZW", "all" + ] + if type(country) == str: + country = [country] + bad_countries = np.setdiff1d(country, valid_countries) + country = np.intersect1d(country, valid_countries) + country = ';'.join(country) + # Work with a list of indicators + if type(indicator) == str: + indicator = [indicator] + # Download + data = [] + bad_indicators = [] + for ind in indicator: + try: + tmp = _get_data(ind, country, start, end) + tmp.columns = ['country', 'iso2c', 'year', ind] + data.append(tmp) + except: + bad_indicators.append(ind) + # Warn + if len(bad_indicators) > 0: + print('Failed to obtain indicator(s): %s' % '; '.join(bad_indicators)) + print('The data may still be available for download at ' + 'http://data.worldbank.org') + if len(bad_countries) > 0: + print('Invalid ISO-2 codes: %s' % ' '.join(bad_countries)) + # Merge WDI series + if len(data) > 0: + out = reduce(lambda x, y: x.merge(y, how='outer'), data) + # Clean + out = out.drop('iso2c', axis=1) + out = out.set_index(['country', 'year']) + out = out.convert_objects(convert_numeric=True) + return out + + +def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', + start=2002, end=2005): + # Build URL for api call + url = ("http://api.worldbank.org/countries/" + country + "/indicators/" + + indicator + "?date=" + str(start) + ":" + str(end) + + "&per_page=25000&format=json") + # Download + with urlopen(url) as response: + data = response.read() + # Parse JSON 
file + data = json.loads(data)[1] + country = [x['country']['value'] for x in data] + iso2c = [x['country']['id'] for x in data] + year = [x['date'] for x in data] + value = [x['value'] for x in data] + # Prepare output + out = pandas.DataFrame([country, iso2c, year, value]).T + return out + + +def get_countries(): + '''Query information about countries + ''' + url = 'http://api.worldbank.org/countries/?per_page=1000&format=json' + with urlopen(url) as response: + data = response.read() + data = json.loads(data)[1] + data = pandas.DataFrame(data) + data.adminregion = [x['value'] for x in data.adminregion] + data.incomeLevel = [x['value'] for x in data.incomeLevel] + data.lendingType = [x['value'] for x in data.lendingType] + data.region = [x['value'] for x in data.region] + data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'}) + return data + + +def get_indicators(): + '''Download information about all World Bank data series + ''' + url = 'http://api.worldbank.org/indicators?per_page=50000&format=json' + with urlopen(url) as response: + data = response.read() + data = json.loads(data)[1] + data = pandas.DataFrame(data) + # Clean fields + data.source = [x['value'] for x in data.source] + fun = lambda x: x.encode('ascii', 'ignore') + data.sourceOrganization = data.sourceOrganization.apply(fun) + # Clean topic field + + def get_value(x): + try: + return x['value'] + except: + return '' + fun = lambda x: [get_value(y) for y in x] + data.topics = data.topics.apply(fun) + data.topics = data.topics.apply(lambda x: ' ; '.join(x)) + # Clean outpu + data = data.sort(columns='id') + data.index = pandas.Index(lrange(data.shape[0])) + return data + + +_cached_series = None + + +def search(string='gdp.*capi', field='name', case=False): + """ + Search available data series from the world bank + + Parameters + ---------- + + string: string + regular expression + field: string + id, name, source, sourceNote, sourceOrganization, topics + See notes below + case: bool + case sensitive search? + + Notes + ----- + + The first time this function is run it will download and cache the full + list of available series. Depending on the speed of your network + connection, this can take time. Subsequent searches will use the cached + copy, so they should be much faster. + + id : Data series indicator (for use with the ``indicator`` argument of + ``WDI()``) e.g. 
NY.GNS.ICTR.GN.ZS" + name: Short description of the data series + source: Data collection project + sourceOrganization: Data collection organization + note: + sourceNote: + topics: + """ + # Create cached list of series if it does not exist + global _cached_series + if type(_cached_series) is not pandas.core.frame.DataFrame: + _cached_series = get_indicators() + data = _cached_series[field] + idx = data.str.contains(string, case=case) + out = _cached_series.ix[idx].dropna() + return out diff --git a/pandas/lib.pyx b/pandas/lib.pyx new file mode 100644 index 00000000..7690cc48 --- /dev/null +++ b/pandas/lib.pyx @@ -0,0 +1,1690 @@ +cimport numpy as np +cimport cython +import numpy as np + +from numpy cimport * + + +cdef extern from "numpy/arrayobject.h": + cdef enum NPY_TYPES: + NPY_intp "NPY_INTP" + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyList_Check, PyFloat_Check, + PyString_Check, + PyBytes_Check, + PyTuple_SetItem, + PyTuple_New, + PyObject_SetAttrString) + +cdef extern from "Python.h": + Py_ssize_t PY_SSIZE_T_MAX + + ctypedef struct PySliceObject: + pass + + cdef int PySlice_GetIndicesEx( + PySliceObject* s, Py_ssize_t length, + Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step, + Py_ssize_t *slicelength) except -1 + + + +cimport cpython + +isnan = np.isnan +cdef double NaN = np.NaN +cdef double nan = NaN +cdef double NAN = nan + +from datetime import datetime as pydatetime + +# this is our tseries.pxd +from datetime cimport * + +from tslib cimport convert_to_tsobject, convert_to_timedelta64 +import tslib +from tslib import NaT, Timestamp, repr_timedelta64 + +cdef int64_t NPY_NAT = util.get_nat() + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan + +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: INT64_MAX + enum: INT64_MIN + + +cdef extern from "math.h": + double sqrt(double x) + double fabs(double) + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +def values_from_object(object o): + """ return my values or the object if we are say an ndarray """ + cdef f + + f = getattr(o, 'get_values', None) + if f is not None: + o = f() + + return o + +cpdef map_indices_list(list index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i from 0 <= i < length: + result[index[i]] = i + + return result + + +from libc.stdlib cimport malloc, free + + +def ismember_nans(float64_t[:] arr, set values, bint hasnans): + cdef: + Py_ssize_t i, n + ndarray[uint8_t] result + float64_t val + + n = len(arr) + result = np.empty(n, dtype=np.uint8) + for i in range(n): + val = arr[i] + result[i] = val in values or hasnans and isnan(val) + + return result.view(np.bool_) + + +def ismember(ndarray arr, set values): + ''' + Checks whether + + Parameters + ---------- + arr : ndarray + values : set + + Returns + ------- + ismember : ndarray (boolean dtype) + ''' + cdef: + Py_ssize_t i, n + ndarray[uint8_t] result + object val + + n = len(arr) + result = np.empty(n, dtype=np.uint8) + for i in range(n): + val = util.get_value_at(arr, i) + result[i] = val in values + + return result.view(np.bool_) + +#---------------------------------------------------------------------- +# datetime / io related + +cdef int _EPOCH_ORD = 719163 + +from datetime import date as pydate + +cdef inline int64_t gmtime(object date): + cdef int y, m, d, h, mn, s, days + + y = PyDateTime_GET_YEAR(date) + m = PyDateTime_GET_MONTH(date) + d = PyDateTime_GET_DAY(date) + h = PyDateTime_DATE_GET_HOUR(date) + mn = PyDateTime_DATE_GET_MINUTE(date) + s = PyDateTime_DATE_GET_SECOND(date) + + days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 + return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 + +cpdef object to_datetime(int64_t timestamp): + return pydatetime.utcfromtimestamp(timestamp / 1000.0) + +cpdef object to_timestamp(object dt): + return gmtime(dt) + +def array_to_timestamp(ndarray[object, ndim=1] arr): + cdef int i, n + cdef ndarray[int64_t, ndim=1] result + + n = len(arr) + result = np.empty(n, dtype=np.int64) + + for i from 0 <= i < n: + result[i] = gmtime(arr[i]) + + return result + +def time64_to_datetime(ndarray[int64_t, ndim=1] arr): + cdef int i, n + cdef ndarray[object, ndim=1] result + + n = len(arr) + result = np.empty(n, dtype=object) + + for i from 0 <= i < n: + result[i] = to_datetime(arr[i]) + + return result + +cdef inline int64_t get_timedelta64_value(val): + return val.view('i8') + +#---------------------------------------------------------------------- +# isnull / notnull related + +cdef double INF = np.inf +cdef double NEGINF = -INF + +cpdef checknull(object val): + if util.is_float_object(val) or util.is_complex_object(val): + return val != val # and val != INF and val != NEGINF + elif util.is_datetime64_object(val): + return get_datetime64_value(val) == NPY_NAT + elif val is NaT: + return True + elif util.is_timedelta64_object(val): + return get_timedelta64_value(val) == NPY_NAT + elif is_array(val): + return False + else: + return _checknull(val) + +cpdef checknull_old(object val): + if util.is_float_object(val) or util.is_complex_object(val): + return val != val or val == INF or val == NEGINF + elif util.is_datetime64_object(val): + return get_datetime64_value(val) == NPY_NAT + elif val is NaT: + return True + elif util.is_timedelta64_object(val): + return get_timedelta64_value(val) == NPY_NAT + elif is_array(val): + return False + else: + return util._checknull(val) + +def isscalar(object val): + return np.isscalar(val) or val is None or PyDateTime_Check(val) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj(ndarray[object] arr): + cdef Py_ssize_t i, n + cdef object val + cdef ndarray[uint8_t] result + + n = len(arr) + result = 
np.zeros(n, dtype=np.uint8) + for i from 0 <= i < n: + arobj = arr[i] + result[i] = arobj is NaT or _checknull(arobj) + return result.view(np.bool_) + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj_old(ndarray[object] arr): + cdef Py_ssize_t i, n + cdef object val + cdef ndarray[uint8_t] result + + n = len(arr) + result = np.zeros(n, dtype=np.uint8) + for i from 0 <= i < n: + result[i] = util._checknull_old(arr[i]) + return result.view(np.bool_) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj2d(ndarray[object, ndim=2] arr): + cdef Py_ssize_t i, j, n, m + cdef object val + cdef ndarray[uint8_t, ndim=2] result + + n, m = ( arr).shape + result = np.zeros((n, m), dtype=np.uint8) + for i from 0 <= i < n: + for j from 0 <= j < m: + val = arr[i, j] + if checknull(val): + result[i, j] = 1 + return result.view(np.bool_) + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj_old(ndarray[object] arr): + cdef Py_ssize_t i, n + cdef object val + cdef ndarray[uint8_t] result + + n = len(arr) + result = np.zeros(n, dtype=np.uint8) + for i from 0 <= i < n: + result[i] = util._checknull_old(arr[i]) + return result.view(np.bool_) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj2d_old(ndarray[object, ndim=2] arr): + cdef Py_ssize_t i, j, n, m + cdef object val + cdef ndarray[uint8_t, ndim=2] result + + n, m = ( arr).shape + result = np.zeros((n, m), dtype=np.uint8) + for i from 0 <= i < n: + for j from 0 <= j < m: + val = arr[i, j] + if checknull_old(val): + result[i, j] = 1 + return result.view(np.bool_) + +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr + + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < n: + val = values[i] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple(list arrays): + cdef: + ndarray[object] buf + Py_ssize_t k = len(arrays) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = arrays[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple_list(list lists): + cdef: + list buf + Py_ssize_t k = len(lists) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = lists[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple_list_gen(object gen): + cdef: + list buf + Py_ssize_t j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for buf in gen: + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + 
uniques.append(val) + + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def dicts_to_array(list dicts, list columns): + cdef: + Py_ssize_t i, j, k, n + ndarray[object, ndim=2] result + dict row + object col, onan = np.nan + + k = len(columns) + n = len(dicts) + + result = np.empty((n, k), dtype='O') + + for i in range(n): + row = dicts[i] + for j in range(k): + col = columns[j] + if col in row: + result[i, j] = row[col] + else: + result[i, j] = onan + + return result + +def fast_zip(list ndarrays): + ''' + For zipping multiple ndarrays into an ndarray of tuples + ''' + cdef: + Py_ssize_t i, j, k, n + ndarray[object] result + flatiter it + object val, tup + + k = len(ndarrays) + n = len(ndarrays[0]) + + result = np.empty(n, dtype=object) + + # initialize tuples on first pass + arr = ndarrays[0] + it = PyArray_IterNew(arr) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + tup = PyTuple_New(k) + + PyTuple_SET_ITEM(tup, 0, val) + Py_INCREF(val) + result[i] = tup + PyArray_ITER_NEXT(it) + + for j in range(1, k): + arr = ndarrays[j] + it = PyArray_IterNew(arr) + if len(arr) != n: + raise ValueError('all arrays must be same length') + + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + PyTuple_SET_ITEM(result[i], j, val) + Py_INCREF(val) + PyArray_ITER_NEXT(it) + + return result + +def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): + """ + Reverse indexing operation. + + Given `indexer`, make `indexer_inv` of it, such that:: + + indexer_inv[indexer[x]] = x + + .. note:: If indexer is not unique, only first occurrence is accounted. + + """ + + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] rev_indexer + int64_t idx + + rev_indexer = np.empty(length, dtype=np.int64) + rev_indexer.fill(-1) + for i in range(n): + idx = indexer[i] + if idx != -1: + rev_indexer[idx] = i + + return rev_indexer + + +def has_infs_f4(ndarray[float32_t] arr): + cdef: + Py_ssize_t i, n = len(arr) + float32_t inf, neginf, val + + inf = np.inf + neginf = -inf + + for i in range(n): + val = arr[i] + if val == inf or val == neginf: + return True + return False + +def has_infs_f8(ndarray[float64_t] arr): + cdef: + Py_ssize_t i, n = len(arr) + float64_t inf, neginf, val + + inf = np.inf + neginf = -inf + + for i in range(n): + val = arr[i] + if val == inf or val == neginf: + return True + return False + +def convert_timestamps(ndarray values): + cdef: + object val, f, result + dict cache = {} + Py_ssize_t i, n = len(values) + ndarray[object] out + + # for HDFStore, a bit temporary but... 
+ + from datetime import datetime + f = datetime.fromtimestamp + + out = np.empty(n, dtype='O') + + for i in range(n): + val = util.get_value_1d(values, i) + if val in cache: + out[i] = cache[val] + else: + cache[val] = out[i] = f(val) + + return out + +def maybe_indices_to_slice(ndarray[int64_t] indices): + cdef: + Py_ssize_t i, n = len(indices) + + if not n or indices[0] < 0: + return indices + + for i in range(1, n): + if indices[i] - indices[i - 1] != 1: + return indices + return slice(indices[0], indices[n - 1] + 1) + + +def maybe_booleans_to_slice(ndarray[uint8_t] mask): + cdef: + Py_ssize_t i, n = len(mask) + Py_ssize_t start, end + bint started = 0, finished = 0 + + for i in range(n): + if mask[i]: + if finished: + return mask.view(np.bool_) + if not started: + started = 1 + start = i + else: + if finished: + continue + + if started: + end = i + finished = 1 + + if not started: + return slice(0, 0) + if not finished: + return slice(start, None) + else: + return slice(start, end) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def scalar_compare(ndarray[object] values, object val, object op): + import operator + cdef: + Py_ssize_t i, n = len(values) + ndarray[uint8_t, cast=True] result + int flag + object x + + if op is operator.lt: + flag = cpython.Py_LT + elif op is operator.le: + flag = cpython.Py_LE + elif op is operator.gt: + flag = cpython.Py_GT + elif op is operator.ge: + flag = cpython.Py_GE + elif op is operator.eq: + flag = cpython.Py_EQ + elif op is operator.ne: + flag = cpython.Py_NE + else: + raise ValueError('Unrecognized operator') + + result = np.empty(n, dtype=bool).view(np.uint8) + + if flag == cpython.Py_NE: + for i in range(n): + x = values[i] + if _checknull(x): + result[i] = True + else: + result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + else: + for i in range(n): + x = values[i] + if _checknull(x): + result[i] = False + else: + result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + + return result.view(bool) + +@cython.wraparound(False) +@cython.boundscheck(False) +def vec_compare(ndarray[object] left, ndarray[object] right, object op): + import operator + cdef: + Py_ssize_t i, n = len(left) + ndarray[uint8_t, cast=True] result + int flag + + if n != len(right): + raise ValueError('Arrays were different lengths: %d vs %d' + % (n, len(right))) + + if op is operator.lt: + flag = cpython.Py_LT + elif op is operator.le: + flag = cpython.Py_LE + elif op is operator.gt: + flag = cpython.Py_GT + elif op is operator.ge: + flag = cpython.Py_GE + elif op is operator.eq: + flag = cpython.Py_EQ + elif op is operator.ne: + flag = cpython.Py_NE + else: + raise ValueError('Unrecognized operator') + + result = np.empty(n, dtype=bool).view(np.uint8) + + if flag == cpython.Py_NE: + for i in range(n): + x = left[i] + y = right[i] + + if _checknull(x) or _checknull(y): + result[i] = True + else: + result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + else: + for i in range(n): + x = left[i] + y = right[i] + + if _checknull(x) or _checknull(y): + result[i] = False + else: + result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + + return result.view(bool) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def scalar_binop(ndarray[object] values, object val, object op): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] result + object x + + result = np.empty(n, dtype=object) + if util._checknull(val): + result.fill(val) + return result + + for i in range(n): + x = values[i] + if util._checknull(x): + result[i] = x + else: + 
result[i] = op(x, val) + + return maybe_convert_bool(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def vec_binop(ndarray[object] left, ndarray[object] right, object op): + cdef: + Py_ssize_t i, n = len(left) + ndarray[object] result + + if n != len(right): + raise ValueError('Arrays were different lengths: %d vs %d' + % (n, len(right))) + + result = np.empty(n, dtype=object) + + for i in range(n): + x = left[i] + y = right[i] + try: + result[i] = op(x, y) + except TypeError: + if util._checknull(x): + result[i] = x + elif util._checknull(y): + result[i] = y + else: + raise + + return maybe_convert_bool(result) + + +def astype_intsafe(ndarray[object] arr, new_dtype): + cdef: + Py_ssize_t i, n = len(arr) + object v + bint is_datelike + ndarray result + + # on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird + is_datelike = new_dtype in ['M8[ns]','m8[ns]'] + + result = np.empty(n, dtype=new_dtype) + for i in range(n): + v = arr[i] + if is_datelike and checknull(v): + result[i] = NPY_NAT + else: + util.set_value_at(result, i, v) + + return result + +cpdef ndarray[object] astype_str(ndarray arr): + cdef: + Py_ssize_t i, n = arr.size + ndarray[object] result = np.empty(n, dtype=object) + + for i in range(n): + util.set_value_at(result, i, str(arr[i])) + + return result + +def clean_index_list(list obj): + ''' + Utility used in pandas.core.index._ensure_index + ''' + cdef: + ndarray[object] converted + Py_ssize_t i, n = len(obj) + object v + bint all_arrays = 1 + + for i in range(n): + v = obj[i] + if not (PyList_Check(v) or np.PyArray_Check(v)): + all_arrays = 0 + break + + if all_arrays: + return obj, all_arrays + + converted = np.empty(n, dtype=object) + for i in range(n): + v = obj[i] + if PyList_Check(v) or np.PyArray_Check(v): + converted[i] = tuple(v) + else: + converted[i] = v + + return maybe_convert_objects(converted), 0 + +@cython.boundscheck(False) +@cython.wraparound(False) +def max_len_string_array(ndarray[object, ndim=1] arr): + """ return the maximum size of elements in a 1-dim string array """ + cdef: + int i, m, l + length = arr.shape[0] + object v + + m = 0 + for i from 0 <= i < length: + v = arr[i] + if PyString_Check(v) or PyBytes_Check(v): + l = len(v) + + if l > m: + m = l + + return m + +@cython.boundscheck(False) +@cython.wraparound(False) +def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): + """ replace the values in the array with replacement if they are nan_rep; return the same array """ + + cdef int length = arr.shape[0], i = 0 + if replace is None: + replace = np.nan + + for i from 0 <= i < length: + if arr[i] == nan_rep: + arr[i] = replace + + return arr + +@cython.boundscheck(False) +@cython.wraparound(False) +def write_csv_rows(list data, list data_index, int nlevels, list cols, object writer): + + cdef int N, j, i, ncols + cdef list rows + cdef object val + + # In crude testing, N>100 yields little marginal improvement + N=100 + + # pre-allocate rows + ncols = len(cols) + rows = [[None]*(nlevels+ncols) for x in range(N)] + + j = -1 + if nlevels == 1: + for j in range(len(data_index)): + row = rows[j % N] + row[0] = data_index[j] + for i in range(ncols): + row[1+i] = data[i][j] + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + elif nlevels > 1: + for j in range(len(data_index)): + row = rows[j % N] + row[:nlevels] = list(data_index[j]) + for i in range(ncols): + row[nlevels+i] = data[i][j] + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + else: + 
for j in range(len(data_index)): + row = rows[j % N] + for i in range(ncols): + row[i] = data[i][j] + + if j >= N-1 and j % N == N-1: + writer.writerows(rows) + + if j >= 0 and (j < N-1 or (j % N) != N-1 ): + writer.writerows(rows[:((j+1) % N)]) + +#------------------------------------------------------------------------------- +# Groupby-related functions + +@cython.boundscheck(False) +def arrmap(ndarray[object] index, object func): + cdef int length = index.shape[0] + cdef int i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + for i from 0 <= i < length: + result[i] = func(index[i]) + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def is_lexsorted(list list_of_arrays): + cdef: + int i + Py_ssize_t n, nlevels + int64_t k, cur, pre + ndarray arr + + nlevels = len(list_of_arrays) + n = len(list_of_arrays[0]) + + cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) + for i from 0 <= i < nlevels: + # vecs[i] = ( list_of_arrays[i]).data + + arr = list_of_arrays[i] + vecs[i] = arr.data + # assume uniqueness?? + + for i from 1 <= i < n: + for k from 0 <= k < nlevels: + cur = vecs[k][i] + pre = vecs[k][i-1] + if cur == pre: + continue + elif cur > pre: + break + else: + return False + free(vecs) + return True + + + +# TODO: could do even better if we know something about the data. eg, index has +# 1-min data, binner has 5-min data, then bins are just strides in index. This +# is a general, O(max(len(values), len(binner))) method. + +@cython.boundscheck(False) +@cython.wraparound(False) +def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, + object closed='left', bint hasnans=0): + """ + Int64 (datetime64) version of generic python version in groupby.py + """ + cdef: + Py_ssize_t lenidx, lenbin, i, j, bc, vc + ndarray[int64_t] bins + int64_t l_bin, r_bin, nat_count + bint right_closed = closed == 'right' + + nat_count = 0 + if hasnans: + mask = values == iNaT + nat_count = np.sum(mask) + values = values[~mask] + + lenidx = len(values) + lenbin = len(binner) + + if lenidx <= 0 or lenbin <= 0: + raise ValueError("Invalid length for values or for binner") + + # check binner fits data + if values[0] < binner[0]: + raise ValueError("Values falls before first bin") + + if values[lenidx-1] > binner[lenbin-1]: + raise ValueError("Values falls after last bin") + + bins = np.empty(lenbin - 1, dtype=np.int64) + + j = 0 # index into values + bc = 0 # bin count + + # linear scan + if right_closed: + for i in range(0, lenbin - 1): + r_bin = binner[i+1] + # count values in current bin, advance to next bin + while j < lenidx and values[j] <= r_bin: + j += 1 + bins[bc] = j + bc += 1 + else: + for i in range(0, lenbin - 1): + r_bin = binner[i+1] + # count values in current bin, advance to next bin + while j < lenidx and values[j] < r_bin: + j += 1 + bins[bc] = j + bc += 1 + + if nat_count > 0: + # shift bins by the number of NaT + bins = bins + nat_count + bins = np.insert(bins, 0, nat_count) + + return bins + + + + +@cython.boundscheck(False) +@cython.wraparound(False) +def row_bool_subset(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, cast=True] mask): + cdef: + Py_ssize_t i, j, n, k, pos = 0 + ndarray[float64_t, ndim=2] out + + n, k = ( values).shape + assert(n == len(mask)) + + out = np.empty((mask.sum(), k), dtype=np.float64) + + for i in range(n): + if mask[i]: + for j in range(k): + out[pos, j] = values[i, j] + pos += 1 + + return out + +@cython.boundscheck(False) +@cython.wraparound(False) +def 
row_bool_subset_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, cast=True] mask): + cdef: + Py_ssize_t i, j, n, k, pos = 0 + ndarray[object, ndim=2] out + + n, k = ( values).shape + assert(n == len(mask)) + + out = np.empty((mask.sum(), k), dtype=object) + + for i in range(n): + if mask[i]: + for j in range(k): + out[pos, j] = values[i, j] + pos += 1 + + return out + + +def group_count(ndarray[int64_t] values, Py_ssize_t size): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] counts + + counts = np.zeros(size, dtype=np.int64) + for i in range(n): + counts[values[i]] += 1 + return counts + +def lookup_values(ndarray[object] values, dict mapping): + cdef: + Py_ssize_t i, n = len(values) + + result = np.empty(n, dtype='O') + for i in range(n): + result[i] = mapping[values[i]] + return maybe_convert_objects(result) + + +def count_level_1d(ndarray[uint8_t, cast=True] mask, + ndarray[int64_t] labels, Py_ssize_t max_bin): + cdef: + Py_ssize_t i, n + ndarray[int64_t] counts + + counts = np.zeros(max_bin, dtype='i8') + + n = len(mask) + + for i from 0 <= i < n: + if mask[i]: + counts[labels[i]] += 1 + + return counts + + +def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, + ndarray[int64_t] labels, Py_ssize_t max_bin): + cdef: + Py_ssize_t i, j, k, n + ndarray[int64_t, ndim=2] counts + + n, k = ( mask).shape + counts = np.zeros((max_bin, k), dtype='i8') + + for i from 0 <= i < n: + for j from 0 <= j < k: + if mask[i, j]: + counts[labels[i], j] += 1 + + return counts + +cdef class _PandasNull: + + def __richcmp__(_PandasNull self, object other, int op): + if op == 2: # == + return isinstance(other, _PandasNull) + elif op == 3: # != + return not isinstance(other, _PandasNull) + else: + return False + + def __hash__(self): + return 0 + +pandas_null = _PandasNull() + +def fast_zip_fillna(list ndarrays, fill_value=pandas_null): + ''' + For zipping multiple ndarrays into an ndarray of tuples + ''' + cdef: + Py_ssize_t i, j, k, n + ndarray[object] result + flatiter it + object val, tup + + k = len(ndarrays) + n = len(ndarrays[0]) + + result = np.empty(n, dtype=object) + + # initialize tuples on first pass + arr = ndarrays[0] + it = PyArray_IterNew(arr) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + tup = PyTuple_New(k) + + if val != val: + val = fill_value + + PyTuple_SET_ITEM(tup, 0, val) + Py_INCREF(val) + result[i] = tup + PyArray_ITER_NEXT(it) + + for j in range(1, k): + arr = ndarrays[j] + it = PyArray_IterNew(arr) + if len(arr) != n: + raise ValueError('all arrays must be same length') + + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + if val != val: + val = fill_value + + PyTuple_SET_ITEM(result[i], j, val) + Py_INCREF(val) + PyArray_ITER_NEXT(it) + + return result + +def duplicated(ndarray[object] values, take_last=False): + cdef: + Py_ssize_t i, n + dict seen = {} + object row + + n = len(values) + cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) + + if take_last: + for i from n > i >= 0: + row = values[i] + + if row in seen: + result[i] = 1 + else: + seen[row] = None + result[i] = 0 + else: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = None + result[i] = 0 + + return result.view(np.bool_) + +def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): + cdef: + Py_ssize_t i, group_size, n, lab, start + object slobj + ndarray[int64_t] starts + + n = len(labels) + + starts = np.zeros(ngroups, dtype=np.int64) + ends = np.zeros(ngroups, 
dtype=np.int64) + + start = 0 + group_size = 0 + for i in range(n): + group_size += 1 + lab = labels[i] + if i == n - 1 or lab != labels[i + 1]: + starts[lab] = start + ends[lab] = start + group_size + start += group_size + group_size = 0 + + return starts, ends + + +def indices_fast(object index, ndarray[int64_t] labels, list keys, + list sorted_labels): + cdef: + Py_ssize_t i, j, k, lab, cur, start, n = len(labels) + dict result = {} + object tup + + k = len(keys) + + if n == 0: + return result + + start = 0 + cur = labels[0] + for i in range(1, n): + lab = labels[i] + + if lab != cur: + if lab != -1: + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][i-1]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + + result[tup] = index[start:i] + start = i + cur = lab + + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][n - 1]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + result[tup] = index[start:] + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def get_blkno_indexers(int64_t[:] blknos, bint group=True): + """ + Enumerate contiguous runs of integers in ndarray. + + Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))`` + pairs for each contiguous run found. + + If `group` is True and there is more than one run for a certain blkno, + ``(blkno, array)`` with an array containing positions of all elements equal + to blkno. + + Returns + ------- + iter : iterator of (int, slice or array) + + """ + # There's blkno in this function's name because it's used in block & + # blockno handling. + cdef: + int64_t cur_blkno + Py_ssize_t i, start, stop, n, diff + + list group_order + dict group_slices + int64_t[:] res_view + + n = blknos.shape[0] + + if n > 0: + start = 0 + cur_blkno = blknos[start] + + if group == False: + for i in range(1, n): + if blknos[i] != cur_blkno: + yield cur_blkno, slice(start, i) + + start = i + cur_blkno = blknos[i] + + yield cur_blkno, slice(start, n) + else: + group_order = [] + group_dict = {} + + for i in range(1, n): + if blknos[i] != cur_blkno: + if cur_blkno not in group_dict: + group_order.append(cur_blkno) + group_dict[cur_blkno] = [(start, i)] + else: + group_dict[cur_blkno].append((start, i)) + + start = i + cur_blkno = blknos[i] + + if cur_blkno not in group_dict: + group_order.append(cur_blkno) + group_dict[cur_blkno] = [(start, n)] + else: + group_dict[cur_blkno].append((start, n)) + + for blkno in group_order: + slices = group_dict[blkno] + if len(slices) == 1: + yield blkno, slice(slices[0][0], slices[0][1]) + else: + tot_len = sum(stop - start for start, stop in slices) + result = np.empty(tot_len, dtype=np.int64) + res_view = result + + i = 0 + for start, stop in slices: + for diff in range(start, stop): + res_view[i] = diff + i += 1 + + yield blkno, result + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef slice indexer_as_slice(int64_t[:] vals): + cdef: + Py_ssize_t i, n, start, stop + int64_t d + + if vals is None: + raise TypeError("vals must be ndarray") + + n = vals.shape[0] + + if n == 0 or vals[0] < 0: + return None + + if n == 1: + return slice(vals[0], vals[0] + 1, 1) + + if vals[1] < 0: + return None + + # n > 2 + d = vals[1] - vals[0] + + if d == 0: + return None + + for i in range(2, n): + if vals[i] < 0 or vals[i] - vals[i-1] != d: + return None + + start = vals[0] + stop = start + n * d + if stop < 0 and d < 0: + return slice(start, None, d) + else: + return slice(start, stop, d) + + 
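# The Cython routine above (``indexer_as_slice``) collapses a monotonic,
# equally spaced, non-negative int64 indexer into an equivalent ``slice``;
# anything not representable that way (negative positions, uneven or zero
# step) comes back as ``None``.  The snippet below is only an illustrative
# pure-Python sketch of that contract -- the helper name is made up for the
# example and is not one of the functions defined in this file.
def indexer_as_slice_sketch(vals):
    # mirrors the checks performed by the Cython version above
    n = len(vals)
    if n == 0 or vals[0] < 0:
        return None
    if n == 1:
        return slice(vals[0], vals[0] + 1, 1)
    d = vals[1] - vals[0]
    if vals[1] < 0 or d == 0:
        return None
    for i in range(2, n):
        if vals[i] < 0 or vals[i] - vals[i - 1] != d:
            return None
    start, stop = vals[0], vals[0] + n * d
    # a negative stop with a negative step means "run to the beginning"
    return slice(start, None, d) if (stop < 0 and d < 0) else slice(start, stop, d)

# e.g. [2, 4, 6, 8] -> slice(2, 10, 2); [5, 3, 1] -> slice(5, None, -2);
# [1, 3, 4] -> None (uneven step, so no slice representation exists)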
+cpdef slice_canonize(slice s): + """ + Convert slice to canonical bounded form. + """ + cdef: + Py_ssize_t start, stop, step, length + + if s.step is None: + step = 1 + else: + step = s.step + if step == 0: + raise ValueError("slice step cannot be zero") + + if step > 0: + if s.stop is None: + raise ValueError("unbounded slice") + + stop = s.stop + if s.start is None: + start = 0 + else: + start = s.start + if start > stop: + start = stop + elif step < 0: + if s.start is None: + raise ValueError("unbounded slice") + + start = s.start + if s.stop is None: + stop = -1 + else: + stop = s.stop + if stop > start: + stop = start + + if start < 0 or (stop < 0 and s.stop is not None): + raise ValueError("unbounded slice") + + if stop < 0: + return slice(start, None, step) + else: + return slice(start, stop, step) + + +cpdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): + """ + Get (start, stop, step, length) tuple for a slice. + + If `objlen` is not specified, slice must be bounded, otherwise the result + will be wrong. + + """ + cdef: + Py_ssize_t start, stop, step, length + + if slc is None: + raise TypeError("slc should be a slice") + + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) + return start, stop, step, length + + +cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: + """ + Get length of a bounded slice. + + The slice must not have any "open" bounds that would create dependency on + container size, i.e.: + - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None`` + - if ``s.step < 0``, ``s.start`` is not ``None`` + + Otherwise, the result is unreliable. + + """ + cdef: + Py_ssize_t start, stop, step, length + + if slc is None: + raise TypeError("slc must be slice") + + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) + + return length + + +def slice_getitem(slice slc not None, ind): + cdef: + Py_ssize_t s_start, s_stop, s_step, s_len + Py_ssize_t ind_start, ind_stop, ind_step, ind_len + + s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc) + + if isinstance(ind, slice): + ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, + s_len) + + if ind_step > 0 and ind_len == s_len: + # short-cut for no-op slice + if ind_len == s_len: + return slc + + if ind_step < 0: + s_start = s_stop - s_step + ind_step = -ind_step + + s_step *= ind_step + s_stop = s_start + ind_stop * s_step + s_start = s_start + ind_start * s_step + + if s_step < 0 and s_stop < 0: + return slice(s_start, None, s_step) + else: + return slice(s_start, s_stop, s_step) + + else: + return np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind] + + +cdef class BlockPlacement: + # __slots__ = '_as_slice', '_as_array', '_len' + cdef slice _as_slice + cdef object _as_array + + cdef bint _has_slice, _has_array, _is_known_slice_like + + def __init__(self, val): + cdef slice slc + + self._has_slice = False + self._has_array = False + + if isinstance(val, slice): + slc = slice_canonize(val) + + if slc.start != slc.stop: + self._as_slice = slc + self._has_slice = True + else: + arr = np.empty(0, dtype=np.int64) + self._as_array = arr + self._has_array = True + else: + # Cython memoryview interface requires ndarray to be writeable. 
+ arr = np.require(val, dtype=np.int64, requirements='W') + assert arr.ndim == 1 + self._as_array = arr + self._has_array = True + + def __unicode__(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + v = self._as_slice + else: + v = self._as_array + + return '%s(%r)' % (self.__class__.__name__, v) + + def __len__(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + return slice_len(s) + else: + return len(self._as_array) + + def __iter__(self): + cdef slice s = self._ensure_has_slice() + cdef Py_ssize_t start, stop, step, _ + if s is not None: + start, stop, step, _ = slice_get_indices_ex(s) + return iter(range(start, stop, step)) + else: + return iter(self._as_array) + + @property + def as_slice(self): + cdef slice s = self._ensure_has_slice() + if s is None: + raise TypeError('Not slice-like') + else: + return s + + @property + def indexer(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + return s + else: + return self._as_array + + def isin(self, arr): + from pandas.core.index import Int64Index + return Int64Index(self.as_array, copy=False).isin(arr) + + @property + def as_array(self): + cdef Py_ssize_t start, stop, end, _ + if not self._has_array: + start, stop, step, _ = slice_get_indices_ex(self._as_slice) + self._as_array = np.arange(start, stop, step, + dtype=np.int64) + self._has_array = True + return self._as_array + + @property + def is_slice_like(self): + cdef slice s = self._ensure_has_slice() + return s is not None + + def __getitem__(self, loc): + cdef slice s = self._ensure_has_slice() + if s is not None: + val = slice_getitem(s, loc) + else: + val = self._as_array[loc] + + if not isinstance(val, slice) and val.ndim == 0: + return val + + return BlockPlacement(val) + + def delete(self, loc): + return BlockPlacement(np.delete(self.as_array, loc, axis=0)) + + def append(self, others): + if len(others) == 0: + return self + + return BlockPlacement(np.concatenate([self.as_array] + + [o.as_array for o in others])) + + cdef iadd(self, other): + cdef slice s = self._ensure_has_slice() + cdef Py_ssize_t other_int, start, stop, step, l + + if isinstance(other, int) and s is not None: + other_int = other + + if other_int == 0: + return self + + start, stop, step, l = slice_get_indices_ex(s) + start += other_int + stop += other_int + + if ((step > 0 and start < 0) or + (step < 0 and stop < step)): + raise ValueError("iadd causes length change") + + if stop < 0: + self._as_slice = slice(start, None, step) + else: + self._as_slice = slice(start, stop, step) + + self._has_array = False + self._as_array = None + else: + newarr = self.as_array + other + if (newarr < 0).any(): + raise ValueError("iadd causes length change") + + self._as_array = newarr + self._has_array = True + self._has_slice = False + self._as_slice = None + + return self + + cdef BlockPlacement copy(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + return BlockPlacement(s) + else: + return BlockPlacement(self._as_array) + + def add(self, other): + return self.copy().iadd(other) + + def sub(self, other): + return self.add(-other) + + cdef slice _ensure_has_slice(self): + if not self._has_slice: + self._as_slice = indexer_as_slice(self._as_array) + self._has_slice = True + return self._as_slice + + +include "reduce.pyx" +include "properties.pyx" +include "inference.pyx" diff --git a/pandas/msgpack.pyx b/pandas/msgpack.pyx new file mode 100644 index 00000000..4413e2c0 --- /dev/null +++ b/pandas/msgpack.pyx @@ -0,0 +1,669 @@ +# coding: utf-8 
+#cython: embedsignature=True +#cython: profile=False + +from cpython cimport * +cdef extern from "Python.h": + ctypedef char* const_char_ptr "const char*" + ctypedef char* const_void_ptr "const void*" + ctypedef struct PyObject + cdef int PyObject_AsReadBuffer(object o, const_void_ptr* buff, Py_ssize_t* buf_len) except -1 + +from libc.stdlib cimport * +from libc.string cimport * +from libc.limits cimport * + +import cython +import numpy as np +from numpy cimport * + +class UnpackException(IOError): + pass + + +class BufferFull(UnpackException): + pass + + +class OutOfData(UnpackException): + pass + + +class UnpackValueError(UnpackException, ValueError): + pass + + +class ExtraData(ValueError): + def __init__(self, unpacked, extra): + self.unpacked = unpacked + self.extra = extra + + def __str__(self): + return "unpack(b) recieved extra data." + +class PackException(IOError): + pass + +class PackValueError(PackException, ValueError): + pass + +cdef extern from "msgpack/unpack.h": + ctypedef struct msgpack_user: + bint use_list + PyObject* object_hook + bint has_pairs_hook # call object_hook with k-v pairs + PyObject* list_hook + char *encoding + char *unicode_errors + + ctypedef struct template_context: + msgpack_user user + PyObject* obj + size_t count + unsigned int ct + PyObject* key + + ctypedef int (*execute_fn)(template_context* ctx, const_char_ptr data, + size_t len, size_t* off) except? -1 + execute_fn template_construct + execute_fn template_skip + execute_fn read_array_header + execute_fn read_map_header + void template_init(template_context* ctx) + object template_data(template_context* ctx) + +cdef extern from "msgpack/pack.h": + struct msgpack_packer: + char* buf + size_t length + size_t buf_size + + int msgpack_pack_int(msgpack_packer* pk, int d) + int msgpack_pack_nil(msgpack_packer* pk) + int msgpack_pack_true(msgpack_packer* pk) + int msgpack_pack_false(msgpack_packer* pk) + int msgpack_pack_long(msgpack_packer* pk, long d) + int msgpack_pack_long_long(msgpack_packer* pk, long long d) + int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d) + int msgpack_pack_float(msgpack_packer* pk, float d) + int msgpack_pack_double(msgpack_packer* pk, double d) + int msgpack_pack_array(msgpack_packer* pk, size_t l) + int msgpack_pack_map(msgpack_packer* pk, size_t l) + int msgpack_pack_raw(msgpack_packer* pk, size_t l) + int msgpack_pack_raw_body(msgpack_packer* pk, char* body, size_t l) + +cdef int DEFAULT_RECURSE_LIMIT=511 + + + +cdef class Packer(object): + """MessagePack Packer + + usage: + + packer = Packer() + astream.write(packer.pack(a)) + astream.write(packer.pack(b)) + + Packer's constructor has some keyword arguments: + + * *defaut* - Convert user type to builtin type that Packer supports. + See also simplejson's document. + * *encoding* - Convert unicode to bytes with this encoding. (default: 'utf-8') + * *unicode_errors* - Error handler for encoding unicode. (default: 'strict') + * *use_single_float* - Use single precision float type for float. (default: False) + * *autoreset* - Reset buffer after each pack and return it's content as `bytes`. (default: True). + If set this to false, use `bytes()` to get content and `.reset()` to clear buffer. 
+ """ + cdef msgpack_packer pk + cdef object _default + cdef object _bencoding + cdef object _berrors + cdef char *encoding + cdef char *unicode_errors + cdef bool use_float + cdef bint autoreset + + def __cinit__(self): + cdef int buf_size = 1024*1024 + self.pk.buf = malloc(buf_size); + if self.pk.buf == NULL: + raise MemoryError("Unable to allocate internal buffer.") + self.pk.buf_size = buf_size + self.pk.length = 0 + + def __init__(self, default=None, encoding='utf-8', unicode_errors='strict', + use_single_float=False, bint autoreset=1): + self.use_float = use_single_float + self.autoreset = autoreset + if default is not None: + if not PyCallable_Check(default): + raise TypeError("default must be a callable.") + self._default = default + if encoding is None: + self.encoding = NULL + self.unicode_errors = NULL + else: + if isinstance(encoding, unicode): + self._bencoding = encoding.encode('ascii') + else: + self._bencoding = encoding + self.encoding = PyBytes_AsString(self._bencoding) + if isinstance(unicode_errors, unicode): + self._berrors = unicode_errors.encode('ascii') + else: + self._berrors = unicode_errors + self.unicode_errors = PyBytes_AsString(self._berrors) + + def __dealloc__(self): + free(self.pk.buf); + + @cython.boundscheck(False) + @cython.wraparound(False) + cdef int _pack(self, object o, int nest_limit=DEFAULT_RECURSE_LIMIT) except -1: + cdef long long llval + cdef unsigned long long ullval + cdef long longval + cdef float fval + cdef double dval + cdef char* rawval + cdef int ret + cdef dict d + cdef object dtype + + cdef int n,i + + if nest_limit < 0: + raise PackValueError("recursion limit exceeded.") + + if o is None: + ret = msgpack_pack_nil(&self.pk) + elif isinstance(o, bool): + if o: + ret = msgpack_pack_true(&self.pk) + else: + ret = msgpack_pack_false(&self.pk) + elif PyLong_Check(o): + if o > 0: + ullval = o + ret = msgpack_pack_unsigned_long_long(&self.pk, ullval) + else: + llval = o + ret = msgpack_pack_long_long(&self.pk, llval) + elif PyInt_Check(o): + longval = o + ret = msgpack_pack_long(&self.pk, longval) + elif PyFloat_Check(o): + if self.use_float: + fval = o + ret = msgpack_pack_float(&self.pk, fval) + else: + dval = o + ret = msgpack_pack_double(&self.pk, dval) + elif PyBytes_Check(o): + rawval = o + ret = msgpack_pack_raw(&self.pk, len(o)) + if ret == 0: + ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) + elif PyUnicode_Check(o): + if not self.encoding: + raise TypeError("Can't encode unicode string: no encoding is specified") + o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) + rawval = o + ret = msgpack_pack_raw(&self.pk, len(o)) + if ret == 0: + ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) + elif PyDict_CheckExact(o): + d = o + ret = msgpack_pack_map(&self.pk, len(d)) + if ret == 0: + for k, v in d.iteritems(): + ret = self._pack(k, nest_limit-1) + if ret != 0: break + ret = self._pack(v, nest_limit-1) + if ret != 0: break + elif PyDict_Check(o): + ret = msgpack_pack_map(&self.pk, len(o)) + if ret == 0: + for k, v in o.items(): + ret = self._pack(k, nest_limit-1) + if ret != 0: break + ret = self._pack(v, nest_limit-1) + if ret != 0: break + elif PyTuple_Check(o) or PyList_Check(o): + ret = msgpack_pack_array(&self.pk, len(o)) + if ret == 0: + for v in o: + ret = self._pack(v, nest_limit-1) + if ret != 0: break + + elif self._default: + o = self._default(o) + ret = self._pack(o, nest_limit-1) + else: + raise TypeError("can't serialize %r" % (o,)) + return ret + + cpdef pack(self, object obj): + cdef int ret 
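+        # With autoreset (the default) the packed bytes are returned and the
+        # internal buffer is cleared; when autoreset is False the caller
+        # retrieves the buffer with bytes() and clears it with reset().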
+ ret = self._pack(obj, DEFAULT_RECURSE_LIMIT) + if ret == -1: + raise MemoryError + elif ret: # should not happen. + raise TypeError + if self.autoreset: + buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) + self.pk.length = 0 + return buf + + def pack_array_header(self, size_t size): + cdef int ret = msgpack_pack_array(&self.pk, size) + if ret == -1: + raise MemoryError + elif ret: # should not happen + raise TypeError + if self.autoreset: + buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) + self.pk.length = 0 + return buf + + def pack_map_header(self, size_t size): + cdef int ret = msgpack_pack_map(&self.pk, size) + if ret == -1: + raise MemoryError + elif ret: # should not happen + raise TypeError + if self.autoreset: + buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) + self.pk.length = 0 + return buf + + def pack_map_pairs(self, object pairs): + """ + Pack *pairs* as msgpack map type. + + *pairs* should sequence of pair. + (`len(pairs)` and `for k, v in *pairs*:` should be supported.) + """ + cdef int ret = msgpack_pack_map(&self.pk, len(pairs)) + if ret == 0: + for k, v in pairs: + ret = self._pack(k) + if ret != 0: break + ret = self._pack(v) + if ret != 0: break + if ret == -1: + raise MemoryError + elif ret: # should not happen + raise TypeError + if self.autoreset: + buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) + self.pk.length = 0 + return buf + + def reset(self): + """Clear internal buffer.""" + self.pk.length = 0 + + def bytes(self): + """Return buffer content.""" + return PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) + + + cdef inline pack_pair(self, object k, object v, int nest_limit): + ret = self._pack(k, nest_limit-1) + if ret != 0: raise PackException("cannot pack : %s" % k) + ret = self._pack(v, nest_limit-1) + if ret != 0: raise PackException("cannot pack : %s" % v) + return ret + +def pack(object o, object stream, default=None, encoding='utf-8', unicode_errors='strict'): + """ + pack an object `o` and write it to stream).""" + packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors) + stream.write(packer.pack(o)) + +def packb(object o, default=None, encoding='utf-8', unicode_errors='strict', use_single_float=False): + """ + pack o and return packed bytes.""" + packer = Packer(default=default, encoding=encoding, unicode_errors=unicode_errors, + use_single_float=use_single_float) + return packer.pack(o) + + +cdef inline init_ctx(template_context *ctx, + object object_hook, object object_pairs_hook, object list_hook, + bint use_list, char* encoding, char* unicode_errors): + template_init(ctx) + ctx.user.use_list = use_list + ctx.user.object_hook = ctx.user.list_hook = NULL + + if object_hook is not None and object_pairs_hook is not None: + raise ValueError("object_pairs_hook and object_hook are mutually exclusive.") + + if object_hook is not None: + if not PyCallable_Check(object_hook): + raise TypeError("object_hook must be a callable.") + ctx.user.object_hook = object_hook + + if object_pairs_hook is None: + ctx.user.has_pairs_hook = False + else: + if not PyCallable_Check(object_pairs_hook): + raise TypeError("object_pairs_hook must be a callable.") + ctx.user.object_hook = object_pairs_hook + ctx.user.has_pairs_hook = True + + if list_hook is not None: + if not PyCallable_Check(list_hook): + raise TypeError("list_hook must be a callable.") + ctx.user.list_hook = list_hook + + ctx.user.encoding = encoding + ctx.user.unicode_errors = unicode_errors + +def unpackb(object packed, object 
object_hook=None, object list_hook=None, + bint use_list=1, encoding=None, unicode_errors="strict", + object_pairs_hook=None, + ): + """Unpack packed_bytes to object. Returns an unpacked object. + + Raises `ValueError` when `packed` contains extra bytes. + """ + cdef template_context ctx + cdef size_t off = 0 + cdef int ret + + cdef char* buf + cdef Py_ssize_t buf_len + cdef char* cenc = NULL + cdef char* cerr = NULL + + PyObject_AsReadBuffer(packed, &buf, &buf_len) + + if encoding is not None: + if isinstance(encoding, unicode): + encoding = encoding.encode('ascii') + cenc = PyBytes_AsString(encoding) + + if unicode_errors is not None: + if isinstance(unicode_errors, unicode): + unicode_errors = unicode_errors.encode('ascii') + cerr = PyBytes_AsString(unicode_errors) + + init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, use_list, cenc, cerr) + ret = template_construct(&ctx, buf, buf_len, &off) + if ret == 1: + obj = template_data(&ctx) + if off < buf_len: + raise ExtraData(obj, PyBytes_FromStringAndSize(buf+off, buf_len-off)) + return obj + elif ret < 0: + raise ValueError("Unpack failed: error = %d" % (ret,)) + else: + raise UnpackValueError + + +def unpack(object stream, object object_hook=None, object list_hook=None, + bint use_list=1, encoding=None, unicode_errors="strict", + object_pairs_hook=None, + ): + """Unpack an object from `stream`. + + Raises `ValueError` when `stream` has extra bytes. + """ + return unpackb(stream.read(), use_list=use_list, + object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook, + encoding=encoding, unicode_errors=unicode_errors, + ) + + +cdef class Unpacker(object): + """ + Streaming unpacker. + + `file_like` is a file-like object having `.read(n)` method. + When `Unpacker` initialized with `file_like`, unpacker reads serialized data + from it and `.feed()` method is not usable. + + `read_size` is used as `file_like.read(read_size)`. + (default: min(1024**2, max_buffer_size)) + + If `use_list` is true (default), msgpack list is deserialized to Python list. + Otherwise, it is deserialized to Python tuple. + + `object_hook` is same to simplejson. If it is not None, it should be callable + and Unpacker calls it with a dict argument after deserializing a map. + + `object_pairs_hook` is same to simplejson. If it is not None, it should be callable + and Unpacker calls it with a list of key-value pairs after deserializing a map. + + `encoding` is encoding used for decoding msgpack bytes. If it is None (default), + msgpack bytes is deserialized to Python bytes. + + `unicode_errors` is used for decoding bytes. + + `max_buffer_size` limits size of data waiting unpacked. + 0 means system's INT_MAX (default). + Raises `BufferFull` exception when it is insufficient. + You shoud set this parameter when unpacking data from untrasted source. 
+ + example of streaming deserialize from file-like object:: + + unpacker = Unpacker(file_like) + for o in unpacker: + do_something(o) + + example of streaming deserialize from socket:: + + unpacker = Unpacker() + while 1: + buf = sock.recv(1024**2) + if not buf: + break + unpacker.feed(buf) + for o in unpacker: + do_something(o) + """ + cdef template_context ctx + cdef char* buf + cdef size_t buf_size, buf_head, buf_tail + cdef object file_like + cdef object file_like_read + cdef Py_ssize_t read_size + cdef object object_hook + cdef object encoding, unicode_errors + cdef size_t max_buffer_size + + def __cinit__(self): + self.buf = NULL + + def __dealloc__(self): + free(self.buf) + self.buf = NULL + + def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, + object object_hook=None, object object_pairs_hook=None, object list_hook=None, + encoding=None, unicode_errors='strict', int max_buffer_size=0, + ): + cdef char *cenc=NULL, *cerr=NULL + + self.file_like = file_like + if file_like: + self.file_like_read = file_like.read + if not PyCallable_Check(self.file_like_read): + raise ValueError("`file_like.read` must be a callable.") + if not max_buffer_size: + max_buffer_size = INT_MAX + if read_size > max_buffer_size: + raise ValueError("read_size should be less or equal to max_buffer_size") + if not read_size: + read_size = min(max_buffer_size, 1024**2) + self.max_buffer_size = max_buffer_size + self.read_size = read_size + self.buf = malloc(read_size) + if self.buf == NULL: + raise MemoryError("Unable to allocate internal buffer.") + self.buf_size = read_size + self.buf_head = 0 + self.buf_tail = 0 + + if encoding is not None: + if isinstance(encoding, unicode): + encoding = encoding.encode('ascii') + self.encoding = encoding + cenc = PyBytes_AsString(encoding) + + if unicode_errors is not None: + if isinstance(unicode_errors, unicode): + unicode_errors = unicode_errors.encode('ascii') + self.unicode_errors = unicode_errors + cerr = PyBytes_AsString(unicode_errors) + + init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook, use_list, cenc, cerr) + + def feed(self, object next_bytes): + """Append `next_bytes` to internal buffer.""" + cdef char* buf + cdef Py_ssize_t buf_len + if self.file_like is not None: + raise TypeError( + "unpacker.feed() is not be able to use with `file_like`.") + PyObject_AsReadBuffer(next_bytes, &buf, &buf_len) + self.append_buffer(buf, buf_len) + + cdef append_buffer(self, void* _buf, Py_ssize_t _buf_len): + cdef: + char* buf = self.buf + char* new_buf + size_t head = self.buf_head + size_t tail = self.buf_tail + size_t buf_size = self.buf_size + size_t new_size + + if tail + _buf_len > buf_size: + if ((tail - head) + _buf_len) <= buf_size: + # move to front. + memmove(buf, buf + head, tail - head) + tail -= head + head = 0 + else: + # expand buffer. 
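+                # Grow to at least the live region plus the incoming bytes;
+                # BufferFull is raised if even that minimum exceeds
+                # max_buffer_size, otherwise the size is doubled (capped at
+                # max_buffer_size) and the live region [head, tail) is copied
+                # to the front of the new allocation.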
+ new_size = (tail-head) + _buf_len + if new_size > self.max_buffer_size: + raise BufferFull + new_size = min(new_size*2, self.max_buffer_size) + new_buf = malloc(new_size) + if new_buf == NULL: + # self.buf still holds old buffer and will be freed during + # obj destruction + raise MemoryError("Unable to enlarge internal buffer.") + memcpy(new_buf, buf + head, tail - head) + free(buf) + + buf = new_buf + buf_size = new_size + tail -= head + head = 0 + + memcpy(buf + tail, (_buf), _buf_len) + self.buf = buf + self.buf_head = head + self.buf_size = buf_size + self.buf_tail = tail + _buf_len + + cdef read_from_file(self): + next_bytes = self.file_like_read( + min(self.read_size, + self.max_buffer_size - (self.buf_tail - self.buf_head) + )) + if next_bytes: + self.append_buffer(PyBytes_AsString(next_bytes), PyBytes_Size(next_bytes)) + else: + self.file_like = None + + cdef object _unpack(self, execute_fn execute, object write_bytes, bint iter=0): + cdef int ret + cdef object obj + cdef size_t prev_head + while 1: + prev_head = self.buf_head + ret = execute(&self.ctx, self.buf, self.buf_tail, &self.buf_head) + if write_bytes is not None: + write_bytes(PyBytes_FromStringAndSize(self.buf + prev_head, self.buf_head - prev_head)) + + if ret == 1: + obj = template_data(&self.ctx) + template_init(&self.ctx) + return obj + elif ret == 0: + if self.file_like is not None: + self.read_from_file() + continue + if iter: + raise StopIteration("No more data to unpack.") + else: + raise OutOfData("No more data to unpack.") + else: + raise ValueError("Unpack failed: error = %d" % (ret,)) + + def read_bytes(self, Py_ssize_t nbytes): + """read a specified number of raw bytes from the stream""" + cdef size_t nread + nread = min(self.buf_tail - self.buf_head, nbytes) + ret = PyBytes_FromStringAndSize(self.buf + self.buf_head, nread) + self.buf_head += nread + if len(ret) < nbytes and self.file_like is not None: + ret += self.file_like.read(nbytes - len(ret)) + return ret + + def unpack(self, object write_bytes=None): + """ + unpack one object + + If write_bytes is not None, it will be called with parts of the raw + message as it is unpacked. + + Raises `OutOfData` when there are no more bytes to unpack. + """ + return self._unpack(template_construct, write_bytes) + + def skip(self, object write_bytes=None): + """ + read and ignore one object, returning None + + If write_bytes is not None, it will be called with parts of the raw + message as it is unpacked. + + Raises `OutOfData` when there are no more bytes to unpack. + """ + return self._unpack(template_skip, write_bytes) + + def read_array_header(self, object write_bytes=None): + """assuming the next object is an array, return its size n, such that + the next n unpack() calls will iterate over its contents. + + Raises `OutOfData` when there are no more bytes to unpack. + """ + return self._unpack(read_array_header, write_bytes) + + def read_map_header(self, object write_bytes=None): + """assuming the next object is a map, return its size n, such that the + next n * 2 unpack() calls will iterate over its key-value pairs. + + Raises `OutOfData` when there are no more bytes to unpack. + """ + return self._unpack(read_map_header, write_bytes) + + def __iter__(self): + return self + + def __next__(self): + return self._unpack(template_construct, None, 1) + + # for debug. 
+ #def _buf(self): + # return PyString_FromStringAndSize(self.buf, self.buf_tail) + + #def _off(self): + # return self.buf_head diff --git a/pandas/parser.pyx b/pandas/parser.pyx new file mode 100644 index 00000000..199d4ab4 --- /dev/null +++ b/pandas/parser.pyx @@ -0,0 +1,1953 @@ +# Copyright (c) 2012, Lambda Foundry, Inc. +# See LICENSE for the license + +from libc.stdio cimport fopen, fclose +from libc.stdlib cimport malloc, free +from libc.string cimport strncpy, strlen, strcmp, strcasecmp +cimport libc.stdio as stdio +import warnings + +from cpython cimport (PyObject, PyBytes_FromString, + PyBytes_AsString, PyBytes_Check, + PyUnicode_Check, PyUnicode_AsUTF8String) +from io.common import DtypeWarning + + +cdef extern from "Python.h": + object PyUnicode_FromString(char *v) + + object PyUnicode_Decode(char *v, Py_ssize_t size, char *encoding, + char *errors) + +cdef extern from "stdlib.h": + void memcpy(void *dst, void *src, size_t n) + +cimport numpy as cnp + +from numpy cimport ndarray, uint8_t, uint64_t + +import numpy as np +cimport util + +import pandas.lib as lib + +import time +import os + +cnp.import_array() + +from khash cimport * + +import sys + +cdef bint PY3 = (sys.version_info[0] >= 3) + +cdef double INF = np.inf +cdef double NEGINF = -INF + +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: UINT16_MAX + enum: UINT32_MAX + enum: UINT64_MAX + enum: INT8_MIN + enum: INT8_MAX + enum: INT16_MIN + enum: INT16_MAX + enum: INT32_MAX + enum: INT32_MIN + enum: INT64_MAX + enum: INT64_MIN + +cdef extern from "headers/portable.h": + pass + +try: + basestring +except NameError: + basestring = str + +cdef extern from "parser/tokenizer.h": + + ctypedef enum ParserState: + START_RECORD + START_FIELD + ESCAPED_CHAR + IN_FIELD + IN_QUOTED_FIELD + ESCAPE_IN_QUOTED_FIELD + QUOTE_IN_QUOTED_FIELD + EAT_CRNL + EAT_CRNL_NOP + EAT_WHITESPACE + EAT_COMMENT + EAT_LINE_COMMENT + FINISHED + + enum: ERROR_OVERFLOW + + ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, + int *status) + ctypedef int (*io_cleanup)(void *src) + + ctypedef struct parser_t: + void *source + io_callback cb_io + io_cleanup cb_cleanup + + int chunksize # Number of bytes to prepare for each chunk + char *data # pointer to data to be processed + int datalen # amount of data available + int datapos + + # where to write out tokenized data + char *stream + int stream_len + int stream_cap + + # Store words in (potentially ragged) matrix for now, hmm + char **words + int *word_starts # where we are in the stream + int words_len + int words_cap + + char *pword_start # pointer to stream start of current field + int word_start # position start of current field + + int *line_start # position in words for start of line + int *line_fields # Number of fields in each line + int lines # Number of lines observed + int file_lines # Number of file lines observed (with bad/skipped) + int lines_cap # Vector capacity + + # Tokenizing stuff + ParserState state + int doublequote # is " represented by ""? */ + char delimiter # field separator */ + int delim_whitespace # consume tabs / spaces instead + char quotechar # quote character */ + char escapechar # escape character */ + char lineterminator + int skipinitialspace # ignore spaces following delimiter? 
*/ + int quoting # style of quoting to write */ + + # hmm =/ + int numeric_field + + char commentchar + int allow_embedded_newline + int strict # raise exception on bad CSV */ + + int expected_fields + int error_bad_lines + int warn_bad_lines + + # floating point options + char decimal + char sci + + # thousands separator (comma, period) + char thousands + + int header # Boolean: 1: has header, 0: no header + int header_start # header row start + int header_end # header row end + + void *skipset + int skip_footer + + # error handling + char *warn_msg + char *error_msg + + ctypedef struct coliter_t: + char **words + int *line_start + int col + + void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) + char* COLITER_NEXT(coliter_t it) + + parser_t* parser_new() + + int parser_init(parser_t *self) nogil + void parser_free(parser_t *self) nogil + int parser_add_skiprow(parser_t *self, int64_t row) + + void parser_set_default_options(parser_t *self) + + int parser_consume_rows(parser_t *self, size_t nrows) + + int parser_trim_buffers(parser_t *self) + + void debug_print_parser(parser_t *self) + + int tokenize_all_rows(parser_t *self) + int tokenize_nrows(parser_t *self, size_t nrows) + + int64_t str_to_int64(char *p_item, int64_t int_min, + int64_t int_max, int *error, char tsep) + uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error) + + inline int to_double(char *item, double *p_value, + char sci, char decimal, char thousands) + inline int to_complex(char *item, double *p_real, + double *p_imag, char sci, char decimal) + inline int to_longlong(char *item, long long *p_value) + inline int to_longlong_thousands(char *item, long long *p_value, + char tsep) + inline int to_boolean(char *item, uint8_t *val) + + +cdef extern from "parser/io.h": + void *new_mmap(char *fname) + int del_mmap(void *src) + void* buffer_mmap_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status) + + void *new_file_source(char *fname, size_t buffer_size) + + void *new_rd_source(object obj) + + int del_file_source(void *src) + int del_rd_source(void *src) + + void* buffer_file_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status) + + void* buffer_rd_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status) + + +DEFAULT_CHUNKSIZE = 256 * 1024 + +# common NA values +# no longer excluding inf representations +# '1.#INF','-1.#INF', '1.#INF000000', +_NA_VALUES = [b'-1.#IND', b'1.#QNAN', b'1.#IND', b'-1.#QNAN', + b'#N/A N/A', b'NA', b'#NA', b'NULL', b'NaN', + b'nan', b''] + + +cdef class TextReader: + ''' + + # source: StringIO or file object + + ''' + + cdef: + parser_t *parser + object file_handle, na_fvalues + bint na_filter, verbose, has_usecols, has_mi_columns + int parser_start + list clocks + char *c_encoding + + cdef public: + int leading_cols, table_width, skip_footer, buffer_lines + object allow_leading_cols + object delimiter, converters, delim_whitespace + object na_values, true_values, false_values + object memory_map + object as_recarray + object header, orig_header, names, header_start, header_end + object index_col + object low_memory + object skiprows + object compact_ints, use_unsigned + object dtype + object encoding + object compression + object mangle_dupe_cols + object tupleize_cols + set noconvert, usecols + + def __cinit__(self, source, + delimiter=b',', + + header=0, + header_start=0, + header_end=0, + index_col=None, + names=None, + + memory_map=False, + tokenize_chunksize=DEFAULT_CHUNKSIZE, + delim_whitespace=False, + + 
compression=None, + + converters=None, + + as_recarray=False, + + skipinitialspace=False, + escapechar=None, + doublequote=True, + quotechar=b'"', + quoting=0, + lineterminator=None, + + encoding=None, + + comment=None, + decimal=b'.', + thousands=None, + + dtype=None, + usecols=None, + error_bad_lines=True, + warn_bad_lines=True, + + na_filter=True, + na_values=None, + na_fvalues=None, + true_values=None, + false_values=None, + + compact_ints=False, + allow_leading_cols=True, + use_unsigned=False, + low_memory=False, + buffer_lines=None, + skiprows=None, + skip_footer=0, + verbose=False, + mangle_dupe_cols=True, + tupleize_cols=False): + + self.parser = parser_new() + self.parser.chunksize = tokenize_chunksize + + self.mangle_dupe_cols=mangle_dupe_cols + self.tupleize_cols=tupleize_cols + + # For timekeeping + self.clocks = [] + + self.compression = compression + self.memory_map = memory_map + + self._setup_parser_source(source) + parser_set_default_options(self.parser) + + parser_init(self.parser) + + if delim_whitespace: + self.parser.delim_whitespace = delim_whitespace + else: + if len(delimiter) > 1: + raise ValueError('only length-1 separators excluded right now') + self.parser.delimiter = ord(delimiter) + + #---------------------------------------- + # parser options + + self.parser.doublequote = doublequote + self.parser.skipinitialspace = skipinitialspace + + if lineterminator is not None: + if len(lineterminator) != 1: + raise ValueError('Only length-1 line terminators supported') + self.parser.lineterminator = ord(lineterminator) + + if len(decimal) != 1: + raise ValueError('Only length-1 decimal markers supported') + self.parser.decimal = ord(decimal) + + if thousands is not None: + if len(thousands) != 1: + raise ValueError('Only length-1 thousands markers supported') + self.parser.thousands = ord(thousands) + + if escapechar is not None: + if len(escapechar) != 1: + raise ValueError('Only length-1 escapes supported') + self.parser.escapechar = ord(escapechar) + + self.parser.quotechar = ord(quotechar) + self.parser.quoting = quoting + + if comment is not None: + if len(comment) > 1: + raise ValueError('Only length-1 comment characters supported') + self.parser.commentchar = ord(comment) + + # error handling of bad lines + self.parser.error_bad_lines = int(error_bad_lines) + self.parser.warn_bad_lines = int(warn_bad_lines) + + self.skiprows = skiprows + if skiprows is not None: + self._make_skiprow_set() + + self.skip_footer = skip_footer + + # suboptimal + if usecols is not None: + self.has_usecols = 1 + self.usecols = set(usecols) + + # XXX + if skip_footer > 0: + self.parser.error_bad_lines = 0 + self.parser.warn_bad_lines = 0 + + self.delimiter = delimiter + self.delim_whitespace = delim_whitespace + + self.na_values = na_values + if na_fvalues is None: + na_fvalues = set() + self.na_fvalues = na_fvalues + + self.true_values = _maybe_encode(true_values) + self.false_values = _maybe_encode(false_values) + + self.converters = converters + + self.na_filter = na_filter + self.as_recarray = as_recarray + + self.compact_ints = compact_ints + self.use_unsigned = use_unsigned + + self.verbose = verbose + self.low_memory = low_memory + + # encoding + if encoding is not None: + if not isinstance(encoding, bytes): + encoding = encoding.encode('utf-8') + encoding = encoding.lower() + self.c_encoding = encoding + else: + self.c_encoding = NULL + + self.encoding = encoding + + if isinstance(dtype, dict): + conv = {} + for k in dtype: + v = dtype[k] + if isinstance(v, basestring): + v = 
np.dtype(v) + conv[k] = v + dtype = conv + elif dtype is not None: + dtype = np.dtype(dtype) + + self.dtype = dtype + + # XXX + self.noconvert = set() + + self.index_col = index_col + + #---------------------------------------- + # header stuff + + self.allow_leading_cols = allow_leading_cols + self.leading_cols = 0 + + # TODO: no header vs. header is not the first row + self.has_mi_columns = 0 + self.orig_header = header + if header is None: + # sentinel value + self.parser.header_start = -1 + self.parser.header_end = -1 + self.parser.header = -1 + self.parser_start = 0 + self.header = [] + else: + if isinstance(header, list) and len(header): + # need to artifically skip the final line + # which is still a header line + header = list(header) + header.append(header[-1]+1) + + self.parser.header_start = header[0] + self.parser.header_end = header[-1] + self.parser.header = header[0] + self.parser_start = header[-1] + 1 + self.has_mi_columns = 1 + self.header = header + else: + self.parser.header_start = header + self.parser.header_end = header + self.parser.header = header + self.parser_start = header + 1 + self.header = [ header ] + + self.names = names + self.header, self.table_width = self._get_header() + + if not self.table_width: + raise ValueError("No columns to parse from file") + + # compute buffer_lines as function of table width + heuristic = 2**20 // self.table_width + self.buffer_lines = 1 + while self.buffer_lines * 2< heuristic: + self.buffer_lines *= 2 + + def __init__(self, *args, **kwards): + pass + + def __dealloc__(self): + parser_free(self.parser) + + def set_error_bad_lines(self, int status): + self.parser.error_bad_lines = status + + cdef _make_skiprow_set(self): + if isinstance(self.skiprows, (int, np.integer)): + self.skiprows = range(self.skiprows) + + for i in self.skiprows: + parser_add_skiprow(self.parser, i) + + cdef _setup_parser_source(self, source): + cdef: + int status + void *ptr + + self.parser.cb_io = NULL + self.parser.cb_cleanup = NULL + + if self.compression: + if self.compression == 'gzip': + import gzip + if isinstance(source, basestring): + source = gzip.GzipFile(source, 'rb') + else: + source = gzip.GzipFile(fileobj=source) + elif self.compression == 'bz2': + import bz2 + if isinstance(source, basestring): + source = bz2.BZ2File(source, 'rb') + else: + raise ValueError('Python cannot read bz2 from open file ' + 'handle') + else: + raise ValueError('Unrecognized compression type: %s' % + self.compression) + + if isinstance(source, basestring): + if not isinstance(source, bytes): + source = source.encode(sys.getfilesystemencoding() or 'utf-8') + + if self.memory_map: + ptr = new_mmap(source) + if ptr == NULL: + # fall back + ptr = new_file_source(source, self.parser.chunksize) + self.parser.cb_io = &buffer_file_bytes + self.parser.cb_cleanup = &del_file_source + else: + self.parser.cb_io = &buffer_mmap_bytes + self.parser.cb_cleanup = &del_mmap + else: + ptr = new_file_source(source, self.parser.chunksize) + self.parser.cb_io = &buffer_file_bytes + self.parser.cb_cleanup = &del_file_source + + if ptr == NULL: + if not os.path.exists(source): + raise IOError('File %s does not exist' % source) + raise IOError('Initializing from file failed') + + self.parser.source = ptr + + elif hasattr(source, 'read'): + # e.g., StringIO + + ptr = new_rd_source(source) + if ptr == NULL: + raise IOError('Initializing parser from file-like ' + 'object failed') + + self.parser.source = ptr + self.parser.cb_io = &buffer_rd_bytes + self.parser.cb_cleanup = &del_rd_source + 
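+        # Dispatch summary: compressed inputs are re-opened through gzip/bz2
+        # and, like any object exposing .read(), handled by
+        # new_rd_source/buffer_rd_bytes; plain string paths go through
+        # new_mmap when memory_map is set (falling back to new_file_source)
+        # or new_file_source otherwise.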
else: + raise IOError('Expected file path name or file-like object,' + ' got %s type' % type(source)) + + cdef _get_header(self): + # header is now a list of lists, so field_count should use header[0] + + cdef: + size_t i, start, data_line, field_count, passed_count, hr, unnamed_count + char *word + object name + int status + Py_ssize_t size + char *errors = "strict" + + header = [] + + if self.parser.header_start >= 0: + + # Header is in the file + for level, hr in enumerate(self.header): + + this_header = [] + + if self.parser.lines < hr + 1: + self._tokenize_rows(hr + 2) + + # e.g., if header=3 and file only has 2 lines + if self.parser.lines < hr + 1: + msg = self.orig_header + if isinstance(msg,list): + msg = "[%s], len of %d," % (','.join([ str(m) for m in msg ]),len(msg)) + raise CParserError('Passed header=%s but only %d lines in file' + % (msg, self.parser.lines)) + + field_count = self.parser.line_fields[hr] + start = self.parser.line_start[hr] + + # TODO: Py3 vs. Py2 + counts = {} + unnamed_count = 0 + for i in range(field_count): + word = self.parser.words[start + i] + + if self.c_encoding == NULL and not PY3: + name = PyBytes_FromString(word) + else: + if self.c_encoding == NULL or self.c_encoding == b'utf-8': + name = PyUnicode_FromString(word) + else: + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, errors) + + if name == '': + if self.has_mi_columns: + name = 'Unnamed: %d_level_%d' % (i,level) + else: + name = 'Unnamed: %d' % i + unnamed_count += 1 + + count = counts.get(name, 0) + if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: + this_header.append('%s.%d' % (name, count)) + else: + this_header.append(name) + counts[name] = count + 1 + + if self.has_mi_columns: + + # if we have grabbed an extra line, but its not in our format + # so save in the buffer, and create an blank extra line for the rest of the + # parsing code + if hr == self.header[-1]: + lc = len(this_header) + ic = len(self.index_col) if self.index_col is not None else 0 + if lc != unnamed_count and lc-ic > unnamed_count: + hr -= 1 + self.parser_start -= 1 + this_header = [ None ] * lc + + data_line = hr + 1 + header.append(this_header) + + if self.names is not None: + header = [ self.names ] + + elif self.names is not None: + # Enforce this unless usecols + if not self.has_usecols: + self.parser.expected_fields = len(self.names) + + # Names passed + if self.parser.lines < 1: + self._tokenize_rows(1) + + header = [ self.names ] + data_line = 0 + + if self.parser.lines < 1: + field_count = len(header[0]) + else: + field_count = self.parser.line_fields[data_line] + else: + # No header passed nor to be found in the file + if self.parser.lines < 1: + self._tokenize_rows(1) + + return None, self.parser.line_fields[0] + + # Corner case, not enough lines in the file + if self.parser.lines < data_line + 1: + field_count = len(header[0]) + else: # not self.has_usecols: + + field_count = self.parser.line_fields[data_line] + + # #2981 + if self.names is not None: + field_count = max(field_count, len(self.names)) + + passed_count = len(header[0]) + + # if passed_count > field_count: + # raise CParserError('Column names have %d fields, ' + # 'data has %d fields' + # % (passed_count, field_count)) + + if self.has_usecols: + nuse = len(self.usecols) + if nuse == passed_count: + self.leading_cols = 0 + elif self.names is None and nuse < passed_count: + self.leading_cols = field_count - passed_count + elif passed_count != field_count: + raise ValueError('Passed header names ' + 'mismatches 
usecols') + # oh boy, #2442, #2981 + elif self.allow_leading_cols and passed_count < field_count: + self.leading_cols = field_count - passed_count + + return header, field_count + + cdef _implicit_index_count(self): + pass + + def read(self, rows=None): + """ + rows=None --> read all rows + """ + cdef: + int status + + if self.low_memory: + # Conserve intermediate space + columns = self._read_low_memory(rows) + else: + # Don't care about memory usage + columns = self._read_rows(rows, 1) + + if self.as_recarray: + self._start_clock() + result = _to_structured_array(columns, self.header) + self._end_clock('Conversion to structured array') + + return result + else: + return columns + + cdef _read_low_memory(self, rows): + cdef: + size_t rows_read = 0 + chunks = [] + + if rows is None: + while True: + try: + chunk = self._read_rows(self.buffer_lines, 0) + if len(chunk) == 0: + break + except StopIteration: + break + else: + chunks.append(chunk) + else: + while rows_read < rows: + try: + crows = min(self.buffer_lines, rows - rows_read) + + chunk = self._read_rows(crows, 0) + if len(chunk) == 0: + break + + rows_read += len(list(chunk.values())[0]) + except StopIteration: + break + else: + chunks.append(chunk) + + parser_trim_buffers(self.parser) + + if len(chunks) == 0: + raise StopIteration + + # destructive to chunks + return _concatenate_chunks(chunks) + + cdef _tokenize_rows(self, size_t nrows): + cdef int status + status = tokenize_nrows(self.parser, nrows) + + if self.parser.warn_msg != NULL: + print >> sys.stderr, self.parser.warn_msg + free(self.parser.warn_msg) + self.parser.warn_msg = NULL + + if status < 0: + raise_parser_error('Error tokenizing data', self.parser) + + cdef _read_rows(self, rows, bint trim): + cdef: + int buffered_lines + int irows, footer = 0 + + self._start_clock() + + if rows is not None: + irows = rows + buffered_lines = self.parser.lines - self.parser_start + if buffered_lines < irows: + self._tokenize_rows(irows - buffered_lines) + + if self.skip_footer > 0: + raise ValueError('skip_footer can only be used to read ' + 'the whole file') + else: + status = tokenize_all_rows(self.parser) + + if self.parser.warn_msg != NULL: + print >> sys.stderr, self.parser.warn_msg + free(self.parser.warn_msg) + self.parser.warn_msg = NULL + + if status < 0: + raise_parser_error('Error tokenizing data', self.parser) + footer = self.skip_footer + + if self.parser_start == self.parser.lines: + raise StopIteration + self._end_clock('Tokenization') + + self._start_clock() + columns = self._convert_column_data(rows=rows, + footer=footer, + upcast_na=not self.as_recarray) + self._end_clock('Type conversion') + + self._start_clock() + if len(columns) > 0: + rows_read = len(list(columns.values())[0]) + # trim + parser_consume_rows(self.parser, rows_read) + if trim: + parser_trim_buffers(self.parser) + self.parser_start -= rows_read + + self._end_clock('Parser memory cleanup') + + return columns + + def debug_print(self): + debug_print_parser(self.parser) + + cdef _start_clock(self): + self.clocks.append(time.time()) + + cdef _end_clock(self, what): + if self.verbose: + elapsed = time.time() - self.clocks.pop(-1) + print '%s took: %.2f ms' % (what, elapsed * 1000) + + def set_noconvert(self, i): + self.noconvert.add(i) + + def remove_noconvert(self, i): + self.noconvert.remove(i) + + def _convert_column_data(self, rows=None, upcast_na=False, footer=0): + cdef: + Py_ssize_t i, nused + kh_str_t *na_hashset = NULL + int start, end + object name, na_flist + bint na_filter = 0 + Py_ssize_t 
num_cols + + start = self.parser_start + + if rows is None: + end = self.parser.lines + else: + end = min(start + rows, self.parser.lines) + + # # skip footer + # if footer > 0: + # end -= footer + + #print >> sys.stderr, self.table_width + #print >> sys.stderr, self.leading_cols + #print >> sys.stderr, self.parser.lines + #print >> sys.stderr, start + #print >> sys.stderr, end + #print >> sys.stderr, self.header + #print >> sys.stderr, "index" + num_cols = -1 + for i in range(self.parser.lines): + num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\ + (num_cols >= self.parser.line_fields[i]) * num_cols + + if self.table_width - self.leading_cols > num_cols: + raise CParserError("Too many columns specified: expected %s and found %s" % + (self.table_width - self.leading_cols, num_cols)) + + results = {} + nused = 0 + for i in range(self.table_width): + if i < self.leading_cols: + # Pass through leading columns always + name = i + elif self.usecols and nused == len(self.usecols): + # Once we've gathered all requested columns, stop. GH5766 + break + else: + name = self._get_column_name(i, nused) + if self.has_usecols and not (i in self.usecols or + name in self.usecols): + continue + nused += 1 + + conv = self._get_converter(i, name) + + # XXX + na_flist = set() + if self.na_filter: + na_list, na_flist = self._get_na_list(i, name) + if na_list is None: + na_filter = 0 + else: + na_filter = 1 + na_hashset = kset_from_list(na_list) + else: + na_filter = 0 + + if conv: + results[i] = _apply_converter(conv, self.parser, i, start, end, + self.c_encoding) + continue + + # Should return as the desired dtype (inferred or specified) + col_res, na_count = self._convert_tokens(i, start, end, name, + na_filter, na_hashset, na_flist) + + if na_filter: + self._free_na_set(na_hashset) + + if upcast_na and na_count > 0: + col_res = _maybe_upcast(col_res) + + if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: + col_res = downcast_int64(col_res, self.use_unsigned) + + if col_res is None: + raise Exception('Unable to parse column %d' % i) + + results[i] = col_res + + self.parser_start += end - start + + return results + + cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, + object name, bint na_filter, + kh_str_t *na_hashset, + object na_flist): + cdef: + object col_dtype = None + + if self.dtype is not None: + if isinstance(self.dtype, dict): + if name in self.dtype: + col_dtype = self.dtype[name] + elif i in self.dtype: + col_dtype = self.dtype[i] + else: + if self.dtype.names: + col_dtype = self.dtype.descr[i][1] + else: + col_dtype = self.dtype + + if col_dtype is not None: + if not isinstance(col_dtype, basestring): + if isinstance(col_dtype, np.dtype): + col_dtype = col_dtype.str + else: + col_dtype = np.dtype(col_dtype).str + + return self._convert_with_dtype(col_dtype, i, start, end, + na_filter, 1, na_hashset, na_flist) + + if i in self.noconvert: + return self._string_convert(i, start, end, na_filter, na_hashset) + else: + col_res = None + for dt in dtype_cast_order: + try: + col_res, na_count = self._convert_with_dtype( + dt, i, start, end, na_filter, 0, na_hashset, na_flist) + except OverflowError: + col_res, na_count = self._convert_with_dtype( + '|O8', i, start, end, na_filter, 0, na_hashset, na_flist) + + if col_res is not None: + break + + return col_res, na_count + + cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, + int start, int end, + bint na_filter, + bint user_dtype, + kh_str_t *na_hashset, + object na_flist): + 
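+        # `dtype` arrives here as a numpy dtype string (e.g. '<i8'), so
+        # dtype[1] is the kind code: 'i'/'u' parse as int64 then cast,
+        # 'f' as float64, 'b' as boolean (honouring user true/false values),
+        # 'S' as fixed-width bytes, 'O' (and zero-width 'U') via string
+        # boxing, while 'c', sized 'U' and 'M' are rejected.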
cdef kh_str_t *true_set, *false_set + + if dtype[1] == 'i' or dtype[1] == 'u': + result, na_count = _try_int64(self.parser, i, start, end, + na_filter, na_hashset) + if user_dtype and na_count > 0: + raise Exception('Integer column has NA values') + + if dtype[1:] != 'i8': + result = result.astype(dtype) + + return result, na_count + + elif dtype[1] == 'f': + result, na_count = _try_double(self.parser, i, start, end, + na_filter, na_hashset, na_flist) + + if dtype[1:] != 'f8': + result = result.astype(dtype) + return result, na_count + + elif dtype[1] == 'b': + if self.true_values is not None or self.false_values is not None: + + true_set = kset_from_list(self.true_values + _true_values) + false_set = kset_from_list(self.false_values + _false_values) + result, na_count = _try_bool_flex(self.parser, i, start, end, + na_filter, na_hashset, + true_set, false_set) + kh_destroy_str(true_set) + kh_destroy_str(false_set) + else: + result, na_count = _try_bool(self.parser, i, start, end, + na_filter, na_hashset) + return result, na_count + elif dtype[1] == 'c': + raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) + + elif dtype[1] == 'S': + # TODO: na handling + width = int(dtype[2:]) + if width > 0: + result = _to_fw_string(self.parser, i, start, end, width) + return result, 0 + + # treat as a regular string parsing + return self._string_convert(i, start, end, na_filter, + na_hashset) + elif dtype[1] == 'U': + width = int(dtype[2:]) + if width > 0: + raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) + + # unicode variable width + return self._string_convert(i, start, end, na_filter, + na_hashset) + + + elif dtype[1] == 'O': + return self._string_convert(i, start, end, na_filter, + na_hashset) + else: + if dtype[1] == 'M': + raise TypeError("the dtype %s is not supported for parsing, " + "pass this column using parse_dates instead" % dtype) + raise TypeError("the dtype %s is not supported for parsing" % dtype) + + cdef _string_convert(self, Py_ssize_t i, int start, int end, + bint na_filter, kh_str_t *na_hashset): + if PY3: + if self.c_encoding != NULL: + if self.c_encoding == b"utf-8": + return _string_box_utf8(self.parser, i, start, end, + na_filter, na_hashset) + else: + return _string_box_decode(self.parser, i, start, end, + na_filter, na_hashset, + self.c_encoding) + else: + return _string_box_utf8(self.parser, i, start, end, + na_filter, na_hashset) + else: + if self.c_encoding != NULL: + if self.c_encoding == b"utf-8": + return _string_box_utf8(self.parser, i, start, end, + na_filter, na_hashset) + else: + return _string_box_decode(self.parser, i, start, end, + na_filter, na_hashset, + self.c_encoding) + else: + return _string_box_factorize(self.parser, i, start, end, + na_filter, na_hashset) + + def _get_converter(self, i, name): + if self.converters is None: + return None + + if name is not None and name in self.converters: + return self.converters[name] + + # Converter for position, if any + return self.converters.get(i) + + cdef _get_na_list(self, i, name): + if self.na_values is None: + return None, set() + + if isinstance(self.na_values, dict): + values = None + if name is not None and name in self.na_values: + values = self.na_values[name] + if values is not None and not isinstance(values, list): + values = list(values) + fvalues = self.na_fvalues[name] + if fvalues is not None and not isinstance(fvalues, set): + fvalues = set(fvalues) + else: + if i in self.na_values: + return self.na_values[i], self.na_fvalues[i] + else: + 
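+                # Columns without an explicit entry in the na_values dict
+                # fall back to the default NA strings with no float NAs.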
return _NA_VALUES, set() + + return _ensure_encoded(values), fvalues + else: + if not isinstance(self.na_values, list): + self.na_values = list(self.na_values) + if not isinstance(self.na_fvalues, set): + self.na_fvalues = set(self.na_fvalues) + + return _ensure_encoded(self.na_values), self.na_fvalues + + cdef _free_na_set(self, kh_str_t *table): + kh_destroy_str(table) + + cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): + if self.has_usecols and self.names is not None: + if len(self.names) == len(self.usecols): + return self.names[nused] + else: + return self.names[i - self.leading_cols] + else: + if self.header is not None: + j = i - self.leading_cols + # hack for #2442 + if j == len(self.header[0]): + return j + else: + return self.header[0][j] + else: + return None + +class CParserError(Exception): + pass + + +class OverflowError(ValueError): + pass + +cdef object _true_values = [b'True', b'TRUE', b'true'] +cdef object _false_values = [b'False', b'FALSE', b'false'] + + +def _ensure_encoded(list lst): + cdef list result = [] + for x in lst: + if PyUnicode_Check(x): + x = PyUnicode_AsUTF8String(x) + elif not PyBytes_Check(x): + x = asbytes(x) + + result.append(x) + return result + +cdef asbytes(object o): + if PY3: + return str(o).encode('utf-8') + else: + return str(o) + + +def _is_file_like(obj): + if PY3: + import io + if isinstance(obj, io.TextIOWrapper): + raise CParserError('Cannot handle open unicode files (yet)') + + # BufferedReader is a byte reader for Python 3 + file = io.BufferedReader + else: + import __builtin__ + file = __builtin__.file + + return isinstance(obj, (basestring, file)) + + +def _maybe_upcast(arr): + """ + + """ + if issubclass(arr.dtype.type, np.integer): + na_value = na_values[arr.dtype] + arr = arr.astype(float) + np.putmask(arr, arr == na_value, np.nan) + elif arr.dtype == np.bool_: + mask = arr.view(np.uint8) == na_values[np.uint8] + arr = arr.astype(object) + np.putmask(arr, mask, np.nan) + + return arr + +# ---------------------------------------------------------------------- +# Type conversions / inference support code + +cdef _string_box_factorize(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset): + cdef: + int error, na_count = 0 + Py_ssize_t i + size_t lines + coliter_t it + char *word + ndarray[object] result + + int ret = 0 + kh_strbox_t *table + + object pyval + + object NA = na_values[np.object_] + khiter_t k + + table = kh_init_strbox() + lines = line_end - line_start + result = np.empty(lines, dtype=np.object_) + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + word = COLITER_NEXT(it) + + if na_filter: + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + na_count += 1 + result[i] = NA + continue + + k = kh_get_strbox(table, word) + + # in the hash table + if k != table.n_buckets: + # this increments the refcount, but need to test + pyval = table.vals[k] + else: + # box it. new ref? 
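+            # Each distinct word is boxed once and cached in the khash
+            # table, so repeated values in the column share a single
+            # Python string object.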
+ pyval = PyBytes_FromString(word) + + k = kh_put_strbox(table, word, &ret) + table.vals[k] = pyval + + result[i] = pyval + + kh_destroy_strbox(table) + + return result, na_count + +cdef _string_box_utf8(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset): + cdef: + int error, na_count = 0 + Py_ssize_t i + size_t lines + coliter_t it + char *word + ndarray[object] result + + int ret = 0 + kh_strbox_t *table + + object pyval + + object NA = na_values[np.object_] + khiter_t k + + table = kh_init_strbox() + lines = line_end - line_start + result = np.empty(lines, dtype=np.object_) + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + word = COLITER_NEXT(it) + + if na_filter: + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + na_count += 1 + result[i] = NA + continue + + k = kh_get_strbox(table, word) + + # in the hash table + if k != table.n_buckets: + # this increments the refcount, but need to test + pyval = table.vals[k] + else: + # box it. new ref? + pyval = PyUnicode_FromString(word) + + k = kh_put_strbox(table, word, &ret) + table.vals[k] = pyval + + result[i] = pyval + + kh_destroy_strbox(table) + + return result, na_count + +cdef _string_box_decode(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + char *encoding): + cdef: + int error, na_count = 0 + Py_ssize_t i, size + size_t lines + coliter_t it + char *word + ndarray[object] result + + int ret = 0 + kh_strbox_t *table + + char *errors = "strict" + + object pyval + + object NA = na_values[np.object_] + khiter_t k + + table = kh_init_strbox() + lines = line_end - line_start + result = np.empty(lines, dtype=np.object_) + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + word = COLITER_NEXT(it) + + if na_filter: + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + na_count += 1 + result[i] = NA + continue + + k = kh_get_strbox(table, word) + + # in the hash table + if k != table.n_buckets: + # this increments the refcount, but need to test + pyval = table.vals[k] + else: + # box it. new ref? 
+ size = strlen(word) + pyval = PyUnicode_Decode(word, size, encoding, errors) + + k = kh_put_strbox(table, word, &ret) + table.vals[k] = pyval + + result[i] = pyval + + kh_destroy_strbox(table) + + return result, na_count + + +cdef _to_fw_string(parser_t *parser, int col, int line_start, + int line_end, size_t width): + cdef: + int error + Py_ssize_t i, j + coliter_t it + char *word, *data + ndarray result + + result = np.empty(line_end - line_start, dtype='|S%d' % width) + data = result.data + + coliter_setup(&it, parser, col, line_start) + + for i in range(line_end - line_start): + word = COLITER_NEXT(it) + strncpy(data, word, width) + data += width + + return result + +cdef char* cinf = b'inf' +cdef char* cneginf = b'-inf' + +cdef _try_double(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, object na_flist): + cdef: + int error, na_count = 0 + size_t i, lines + coliter_t it + char *word + double *data + double NA = na_values[np.float64] + ndarray result + khiter_t k + bint use_na_flist = len(na_flist) > 0 + + lines = line_end - line_start + result = np.empty(lines, dtype=np.float64) + data = result.data + coliter_setup(&it, parser, col, line_start) + + if na_filter: + for i in range(lines): + word = COLITER_NEXT(it) + + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + na_count += 1 + data[0] = NA + else: + error = to_double(word, data, parser.sci, parser.decimal, parser.thousands) + if error != 1: + if strcasecmp(word, cinf) == 0: + data[0] = INF + elif strcasecmp(word, cneginf) == 0: + data[0] = NEGINF + else: + return None, None + if use_na_flist: + if data[0] in na_flist: + na_count += 1 + data[0] = NA + data += 1 + else: + for i in range(lines): + word = COLITER_NEXT(it) + error = to_double(word, data, parser.sci, parser.decimal, parser.thousands) + if error != 1: + if strcasecmp(word, cinf) == 0: + data[0] = INF + elif strcasecmp(word, cneginf) == 0: + data[0] = NEGINF + else: + return None, None + data += 1 + + return result, na_count + + +cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset): + cdef: + int error, na_count = 0 + size_t i, lines + coliter_t it + char *word + int64_t *data + ndarray result + + int64_t NA = na_values[np.int64] + khiter_t k + + lines = line_end - line_start + result = np.empty(lines, dtype=np.int64) + data = result.data + coliter_setup(&it, parser, col, line_start) + + if na_filter: + for i in range(lines): + word = COLITER_NEXT(it) + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + na_count += 1 + data[i] = NA + continue + + data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, + &error, parser.thousands) + if error != 0: + if error == ERROR_OVERFLOW: + raise OverflowError(word) + + return None, None + else: + for i in range(lines): + word = COLITER_NEXT(it) + data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, + &error, parser.thousands) + if error != 0: + if error == ERROR_OVERFLOW: + raise OverflowError(word) + return None, None + + return result, na_count + + +cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset): + cdef: + int error, na_count = 0 + size_t i, lines + coliter_t it + char *word + uint8_t *data + ndarray result + + uint8_t NA = na_values[np.bool_] + khiter_t k + + lines = line_end - line_start + result = np.empty(lines, dtype=np.uint8) + data = result.data + coliter_setup(&it, parser, col, 
line_start) + + if na_filter: + for i in range(lines): + word = COLITER_NEXT(it) + + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + na_count += 1 + data[0] = NA + data += 1 + continue + + error = to_boolean(word, data) + if error != 0: + return None, None + data += 1 + else: + for i in range(lines): + word = COLITER_NEXT(it) + + error = to_boolean(word, data) + if error != 0: + return None, None + data += 1 + + return result.view(np.bool_), na_count + + +cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + kh_str_t *true_hashset, kh_str_t *false_hashset): + cdef: + int error, na_count = 0 + size_t i, lines + coliter_t it + char *word + uint8_t *data + ndarray result + + uint8_t NA = na_values[np.bool_] + khiter_t k + + lines = line_end - line_start + result = np.empty(lines, dtype=np.uint8) + data = result.data + coliter_setup(&it, parser, col, line_start) + + if na_filter: + for i in range(lines): + word = COLITER_NEXT(it) + + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + na_count += 1 + data[0] = NA + data += 1 + continue + + k = kh_get_str(true_hashset, word) + if k != true_hashset.n_buckets: + data[0] = 1 + data += 1 + continue + + k = kh_get_str(false_hashset, word) + if k != false_hashset.n_buckets: + data[0] = 0 + data += 1 + continue + + error = to_boolean(word, data) + if error != 0: + return None, None + data += 1 + else: + for i in range(lines): + word = COLITER_NEXT(it) + + k = kh_get_str(true_hashset, word) + if k != true_hashset.n_buckets: + data[0] = 1 + data += 1 + continue + + k = kh_get_str(false_hashset, word) + if k != false_hashset.n_buckets: + data[0] = 0 + data += 1 + continue + + error = to_boolean(word, data) + if error != 0: + return None, None + data += 1 + + return result.view(np.bool_), na_count + +cdef _get_na_mask(parser_t *parser, int col, int line_start, int line_end, + kh_str_t *na_hashset): + cdef: + int error + Py_ssize_t i + size_t lines + coliter_t it + char *word + ndarray[uint8_t, cast=True] result + khiter_t k + + lines = line_end - line_start + result = np.empty(lines, dtype=np.bool_) + + coliter_setup(&it, parser, col, line_start) + for i in range(lines): + word = COLITER_NEXT(it) + + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + result[i] = 1 + else: + result[i] = 0 + + return result + +cdef kh_str_t* kset_from_list(list values) except NULL: + # caller takes responsibility for freeing the hash table + cdef: + Py_ssize_t i + khiter_t k + kh_str_t *table + int ret = 0 + + object val + + table = kh_init_str() + + for i in range(len(values)): + val = values[i] + + # None creeps in sometimes, which isn't possible here + if not PyBytes_Check(val): + raise Exception('Must be all encoded bytes') + + k = kh_put_str(table, PyBytes_AsString(val), &ret) + + return table + + +# if at first you don't succeed... + +# TODO: endianness just a placeholder? 
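+# The per-column inference loop in TextReader._convert_tokens walks this
+# list in order and keeps the first dtype that parses every value,
+# falling back to object ('|O8') on overflow.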
+cdef list dtype_cast_order = ['<i8', '<f8', '|b1', '|O8'] + + +def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0): + cdef: + Py_ssize_t i, n = len(arr) + int64_t mx = INT32_MIN + 1, mn = INT32_MAX - 1 + int64_t NA = na_values[np.int64] + int64_t val + ndarray[uint8_t] mask + int na_count = 0 + + _mask = np.empty(n, dtype=bool) + mask = _mask.view(np.uint8) + + for i in range(n): + val = arr[i] + + if val == NA: + mask[i] = 1 + na_count += 1 + continue + + # not NA + mask[i] = 0 + + if val > mx: + mx = val + + if val < mn: + mn = val + + if mn >= 0 and use_unsigned: + if mx <= UINT8_MAX - 1: + result = arr.astype(np.uint8) + if na_count: + np.putmask(result, _mask, na_values[np.uint8]) + return result + + if mx <= UINT16_MAX - 1: + result = arr.astype(np.uint16) + if na_count: + np.putmask(result, _mask, na_values[np.uint16]) + return result + + if mx <= UINT32_MAX - 1: + result = arr.astype(np.uint32) + if na_count: + np.putmask(result, _mask, na_values[np.uint32]) + return result + + else: + if mn >= INT8_MIN + 1 and mx <= INT8_MAX: + result = arr.astype(np.int8) + if na_count: + np.putmask(result, _mask, na_values[np.int8]) + return result + + if mn >= INT16_MIN + 1 and mx <= INT16_MAX: + result = arr.astype(np.int16) + if na_count: + np.putmask(result, _mask, na_values[np.int16]) + return result + + if mn >= INT32_MIN + 1 and mx <= INT32_MAX: + result = arr.astype(np.int32) + if na_count: + np.putmask(result, _mask, na_values[np.int32]) + return result + + return arr + + +def _concatenate_chunks(list chunks): + cdef: + list names = list(chunks[0].keys()) + object name + list warning_columns + object warning_names + object common_type + + result = {} + warning_columns = list() + for name in names: + arrs = [chunk.pop(name) for chunk in chunks] + # Check each arr for consistent types. + dtypes = set([a.dtype for a in arrs]) + if len(dtypes) > 1: + common_type = np.find_common_type(dtypes, []) + if common_type == np.object: + warning_columns.append(str(name)) + result[name] = np.concatenate(arrs) + + if warning_columns: + warning_names = ','.join(warning_columns) + warning_message = " ".join(["Columns (%s) have mixed types." % warning_names, + "Specify dtype option on import or set low_memory=False." + ]) + warnings.warn(warning_message, DtypeWarning) + return result + +#---------------------------------------------------------------------- + +# NA values +def _compute_na_values(): + int64info = np.iinfo(np.int64) + int32info = np.iinfo(np.int32) + int16info = np.iinfo(np.int16) + int8info = np.iinfo(np.int8) + uint64info = np.iinfo(np.uint64) + uint32info = np.iinfo(np.uint32) + uint16info = np.iinfo(np.uint16) + uint8info = np.iinfo(np.uint8) + na_values = { + np.float64 : np.nan, + np.int64 : int64info.min, + np.int32 : int32info.min, + np.int16 : int16info.min, + np.int8 : int8info.min, + np.uint64 : uint64info.max, + np.uint32 : uint32info.max, + np.uint16 : uint16info.max, + np.uint8 : uint8info.max, + np.bool_ : uint8info.max, + np.object_ : np.nan # oof + } + return na_values + +na_values = _compute_na_values() + +for k in list(na_values): + na_values[np.dtype(k)] = na_values[k] + + +cdef _apply_converter(object f, parser_t *parser, int col, + int line_start, int line_end, + char* c_encoding): + cdef: + int error + Py_ssize_t i + size_t lines + coliter_t it + char *word + char *errors = "strict" + ndarray[object] result + object val + + lines = line_end - line_start + result = np.empty(lines, dtype=np.object_) + + coliter_setup(&it, parser, col, line_start) + + if not PY3 and c_encoding == NULL: + for i in range(lines): + word = COLITER_NEXT(it) + val = PyBytes_FromString(word) + result[i] = f(val) + elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'): + for i in range(lines): + word = COLITER_NEXT(it) + val = PyUnicode_FromString(word) + result[i] = f(val) + else: + for i in range(lines): + word = COLITER_NEXT(it) + val = PyUnicode_Decode(word, strlen(word), + c_encoding, errors) + result[i] = f(val) + + return
lib.maybe_convert_objects(result) + + +def _to_structured_array(dict columns, object names): + cdef: + ndarray recs, column + cnp.dtype dt + dict fields + + object name, fnames, field_type + Py_ssize_t i, offset, nfields, length + int stride, elsize + char *buf + + if names is None: + names = ['%d' % i for i in range(len(columns))] + else: + # single line header + names = names[0] + + dt = np.dtype([(str(name), columns[i].dtype) + for i, name in enumerate(names)]) + fnames = dt.names + fields = dt.fields + + nfields = len(fields) + + if PY3: + length = len(list(columns.values())[0]) + else: + length = len(columns.values()[0]) + + stride = dt.itemsize + + # start = time.time() + + # we own the data + buf = malloc(length * stride) + + recs = util.sarr_from_data(dt, length, buf) + assert(recs.flags.owndata) + + # buf = recs.data + # end = time.time() + # print 'took %.4f' % (end - start) + + for i in range(nfields): + # start = time.clock() + # name = names[i] + + # XXX + field_type = fields[fnames[i]] + + # (dtype, stride) tuple + offset = field_type[1] + elsize = field_type[0].itemsize + column = columns[i] + + _fill_structured_column(buf + offset, column.data, + elsize, stride, length, + field_type[0] == np.object_) + + # print 'Transfer of %s took %.4f' % (str(field_type), + # time.clock() - start) + + return recs + +cdef _fill_structured_column(char *dst, char* src, int elsize, + int stride, int length, bint incref): + cdef: + size_t i + + if incref: + util.transfer_object_column(dst, src, stride, length) + else: + for i in range(length): + memcpy(dst, src, elsize) + dst += stride + src += elsize + + + +def _maybe_encode(values): + if values is None: + return [] + return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values] diff --git a/pandas/rpy/__init__.py b/pandas/rpy/__init__.py new file mode 100644 index 00000000..d5cf8a42 --- /dev/null +++ b/pandas/rpy/__init__.py @@ -0,0 +1,4 @@ +try: + from .common import importr, r, load_data +except ImportError: + pass diff --git a/pandas/rpy/base.py b/pandas/rpy/base.py new file mode 100644 index 00000000..4cd86d3c --- /dev/null +++ b/pandas/rpy/base.py @@ -0,0 +1,12 @@ +import pandas.rpy.util as util + + +class lm(object): + """ + Examples + -------- + >>> model = lm('x ~ y + z', data) + >>> model.coef + """ + def __init__(self, formula, data): + pass diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py new file mode 100644 index 00000000..5747285d --- /dev/null +++ b/pandas/rpy/common.py @@ -0,0 +1,357 @@ +""" +Utilities for making working with rpy2 more user- and +developer-friendly. 
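+
+A minimal usage sketch (assumes a working R installation visible to rpy2;
+``faithful`` is one of R's built-in datasets, as exercised in this module's
+tests):
+
+    >>> import pandas.rpy.common as com
+    >>> df = com.load_data('faithful')          # R dataset -> pandas DataFrame
+    >>> rdf = com.convert_to_r_dataframe(df)    # and back to an R data.frame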
+""" +from __future__ import print_function + +from pandas.compat import zip, range +import numpy as np + +import pandas as pd +import pandas.core.common as com +import pandas.util.testing as _test + +from rpy2.robjects.packages import importr +from rpy2.robjects import r +import rpy2.robjects as robj + +import itertools as IT + + +__all__ = ['convert_robj', 'load_data', 'convert_to_r_dataframe', + 'convert_to_r_matrix'] + + +def load_data(name, package=None, convert=True): + if package: + importr(package) + + r.data(name) + + robj = r[name] + + if convert: + return convert_robj(robj) + else: + return robj + + +def _rclass(obj): + """ + Return R class name for input object + """ + return r['class'](obj)[0] + + +def _is_null(obj): + return _rclass(obj) == 'NULL' + + +def _convert_list(obj): + """ + Convert named Vector to dict, factors to list + """ + try: + values = [convert_robj(x) for x in obj] + keys = r['names'](obj) + return dict(zip(keys, values)) + except TypeError: + # For state.division and state.region + factors = list(r['factor'](obj)) + level = list(r['levels'](obj)) + result = [level[index-1] for index in factors] + return result + + +def _convert_array(obj): + """ + Convert Array to DataFrame + """ + def _list(item): + try: + return list(item) + except TypeError: + return [] + + # For iris3, HairEyeColor, UCBAdmissions, Titanic + dim = list(obj.dim) + values = np.array(list(obj)) + names = r['dimnames'](obj) + try: + columns = list(r['names'](names))[::-1] + except TypeError: + columns = ['X{:d}'.format(i) for i in range(len(names))][::-1] + columns.append('value') + name_list = [(_list(x) or range(d)) for x, d in zip(names, dim)][::-1] + arr = np.array(list(IT.product(*name_list))) + arr = np.column_stack([arr,values]) + df = pd.DataFrame(arr, columns=columns) + return df + + +def _convert_vector(obj): + if isinstance(obj, robj.IntVector): + return _convert_int_vector(obj) + elif isinstance(obj, robj.StrVector): + return _convert_str_vector(obj) + # Check if the vector has extra information attached to it that can be used + # as an index + try: + attributes = set(r['attributes'](obj).names) + except AttributeError: + return list(obj) + if 'names' in attributes: + return pd.Series(list(obj), index=r['names'](obj)) + elif 'tsp' in attributes: + return pd.Series(list(obj), index=r['time'](obj)) + elif 'labels' in attributes: + return pd.Series(list(obj), index=r['labels'](obj)) + if _rclass(obj) == 'dist': + # For 'eurodist'. WARNING: This results in a DataFrame, not a Series or list. 
+ matrix = r['as.matrix'](obj) + return convert_robj(matrix) + else: + return list(obj) + +NA_INTEGER = -2147483648 + + +def _convert_int_vector(obj): + arr = np.asarray(obj) + mask = arr == NA_INTEGER + if mask.any(): + arr = arr.astype(float) + arr[mask] = np.nan + return arr + + +def _convert_str_vector(obj): + arr = np.asarray(obj, dtype=object) + mask = arr == robj.NA_Character + if mask.any(): + arr[mask] = np.nan + return arr + + +def _convert_DataFrame(rdf): + columns = list(rdf.colnames) + rows = np.array(rdf.rownames) + + data = {} + for i, col in enumerate(columns): + vec = rdf.rx2(i + 1) + values = _convert_vector(vec) + + if isinstance(vec, robj.FactorVector): + levels = np.asarray(vec.levels) + if com.is_float_dtype(values): + mask = np.isnan(values) + notmask = -mask + result = np.empty(len(values), dtype=object) + result[mask] = np.nan + + locs = (values[notmask] - 1).astype(np.int_) + result[notmask] = levels.take(locs) + values = result + else: + values = np.asarray(vec.levels).take(values - 1) + + data[col] = values + + return pd.DataFrame(data, index=_check_int(rows), columns=columns) + + +def _convert_Matrix(mat): + columns = mat.colnames + rows = mat.rownames + + columns = None if _is_null(columns) else list(columns) + index = r['time'](mat) if _is_null(rows) else list(rows) + return pd.DataFrame(np.array(mat), index=_check_int(index), + columns=columns) + + +def _check_int(vec): + try: + # R observation numbers come through as strings + vec = vec.astype(int) + except Exception: + pass + + return vec + +_pandas_converters = [ + (robj.DataFrame, _convert_DataFrame), + (robj.Matrix, _convert_Matrix), + (robj.StrVector, _convert_vector), + (robj.FloatVector, _convert_vector), + (robj.Array, _convert_array), + (robj.Vector, _convert_list), +] + +_converters = [ + (robj.DataFrame, lambda x: _convert_DataFrame(x).toRecords(index=False)), + (robj.Matrix, lambda x: _convert_Matrix(x).toRecords(index=False)), + (robj.IntVector, _convert_vector), + (robj.StrVector, _convert_vector), + (robj.FloatVector, _convert_vector), + (robj.Array, _convert_array), + (robj.Vector, _convert_list), +] + + +def convert_robj(obj, use_pandas=True): + """ + Convert rpy2 object to a pandas-friendly form + + Parameters + ---------- + obj : rpy2 object + + Returns + ------- + Non-rpy data structure, mix of NumPy and pandas objects + """ + if not isinstance(obj, robj.RObjectMixin): + return obj + + converters = _pandas_converters if use_pandas else _converters + + for rpy_type, converter in converters: + if isinstance(obj, rpy_type): + return converter(obj) + + raise TypeError('Do not know what to do with %s object' % type(obj)) + + +def convert_to_r_posixct(obj): + """ + Convert DatetimeIndex or np.datetime array to R POSIXct using + m8[s] format. 
+ + Parameters + ---------- + obj : source pandas object (one of [DatetimeIndex, np.datetime]) + + Returns + ------- + An R POSIXct vector (rpy2.robjects.vectors.POSIXct) + + """ + import time + from rpy2.rinterface import StrSexpVector + + # convert m8[ns] to m8[s] + vals = robj.vectors.FloatSexpVector(obj.values.view('i8') / 1E9) + as_posixct = robj.baseenv.get('as.POSIXct') + origin = StrSexpVector([time.strftime("%Y-%m-%d", + time.gmtime(0)), ]) + + # We will be sending ints as UTC + tz = obj.tz.zone if hasattr( + obj, 'tz') and hasattr(obj.tz, 'zone') else 'UTC' + tz = StrSexpVector([tz]) + utc_tz = StrSexpVector(['UTC']) + + posixct = as_posixct(vals, origin=origin, tz=utc_tz) + posixct.do_slot_assign('tzone', tz) + return posixct + + +VECTOR_TYPES = {np.float64: robj.FloatVector, + np.float32: robj.FloatVector, + np.float: robj.FloatVector, + np.int: robj.IntVector, + np.int32: robj.IntVector, + np.int64: robj.IntVector, + np.object_: robj.StrVector, + np.str: robj.StrVector, + np.bool: robj.BoolVector} + +NA_TYPES = {np.float64: robj.NA_Real, + np.float32: robj.NA_Real, + np.float: robj.NA_Real, + np.int: robj.NA_Integer, + np.int32: robj.NA_Integer, + np.int64: robj.NA_Integer, + np.object_: robj.NA_Character, + np.str: robj.NA_Character, + np.bool: robj.NA_Logical} + + +def convert_to_r_dataframe(df, strings_as_factors=False): + """ + Convert a pandas DataFrame to a R data.frame. + + Parameters + ---------- + df: The DataFrame being converted + strings_as_factors: Whether to turn strings into R factors (default: False) + + Returns + ------- + A R data.frame + + """ + + import rpy2.rlike.container as rlc + + columns = rlc.OrdDict() + + # FIXME: This doesn't handle MultiIndex + + for column in df: + value = df[column] + value_type = value.dtype.type + + if value_type == np.datetime64: + value = convert_to_r_posixct(value) + else: + value = [item if pd.notnull(item) else NA_TYPES[value_type] + for item in value] + + value = VECTOR_TYPES[value_type](value) + + if not strings_as_factors: + I = robj.baseenv.get("I") + value = I(value) + + columns[column] = value + + r_dataframe = robj.DataFrame(columns) + + del columns + + r_dataframe.rownames = robj.StrVector(df.index) + + return r_dataframe + + +def convert_to_r_matrix(df, strings_as_factors=False): + + """ + Convert a pandas DataFrame to a R matrix. 
+ + Parameters + ---------- + df: The DataFrame being converted + strings_as_factors: Whether to turn strings into R factors (default: False) + + Returns + ------- + A R matrix + + """ + + if df._is_mixed_type: + raise TypeError("Conversion to matrix only possible with non-mixed " + "type DataFrames") + + r_dataframe = convert_to_r_dataframe(df, strings_as_factors) + as_matrix = robj.baseenv.get("as.matrix") + r_matrix = as_matrix(r_dataframe) + + return r_matrix + +if __name__ == '__main__': + pass diff --git a/pandas/rpy/mass.py b/pandas/rpy/mass.py new file mode 100644 index 00000000..12fbbdfa --- /dev/null +++ b/pandas/rpy/mass.py @@ -0,0 +1,2 @@ +class rlm(object): + pass diff --git a/pandas/rpy/tests/__init__.py b/pandas/rpy/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/rpy/tests/test_common.py b/pandas/rpy/tests/test_common.py new file mode 100644 index 00000000..a2e6d08d --- /dev/null +++ b/pandas/rpy/tests/test_common.py @@ -0,0 +1,213 @@ +""" +Testing that functions from rpy work as expected +""" + +import pandas as pd +import numpy as np +import unittest +import nose +import pandas.util.testing as tm + +try: + import pandas.rpy.common as com + from rpy2.robjects import r + import rpy2.robjects as robj +except ImportError: + raise nose.SkipTest('R not installed') + + +class TestCommon(unittest.TestCase): + def test_convert_list(self): + obj = r('list(a=1, b=2, c=3)') + + converted = com.convert_robj(obj) + expected = {'a': [1], 'b': [2], 'c': [3]} + + tm.assert_dict_equal(converted, expected) + + def test_convert_nested_list(self): + obj = r('list(a=list(foo=1, bar=2))') + + converted = com.convert_robj(obj) + expected = {'a': {'foo': [1], 'bar': [2]}} + + tm.assert_dict_equal(converted, expected) + + def test_convert_frame(self): + # built-in dataset + df = r['faithful'] + + converted = com.convert_robj(df) + + assert np.array_equal(converted.columns, ['eruptions', 'waiting']) + assert np.array_equal(converted.index, np.arange(1, 273)) + + def _test_matrix(self): + r('mat <- matrix(rnorm(9), ncol=3)') + r('colnames(mat) <- c("one", "two", "three")') + r('rownames(mat) <- c("a", "b", "c")') + + return r['mat'] + + def test_convert_matrix(self): + mat = self._test_matrix() + + converted = com.convert_robj(mat) + + assert np.array_equal(converted.index, ['a', 'b', 'c']) + assert np.array_equal(converted.columns, ['one', 'two', 'three']) + + def test_convert_r_dataframe(self): + + is_na = robj.baseenv.get("is.na") + + seriesd = tm.getSeriesData() + frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) + + # Null data + frame["E"] = [np.nan for item in frame["A"]] + # Some mixed type data + frame["F"] = ["text" if item % + 2 == 0 else np.nan for item in range(30)] + + r_dataframe = com.convert_to_r_dataframe(frame) + + assert np.array_equal( + com.convert_robj(r_dataframe.rownames), frame.index) + assert np.array_equal( + com.convert_robj(r_dataframe.colnames), frame.columns) + assert all(is_na(item) for item in r_dataframe.rx2("E")) + + for column in frame[["A", "B", "C", "D"]]: + coldata = r_dataframe.rx2(column) + original_data = frame[column] + assert np.array_equal(com.convert_robj(coldata), original_data) + + for column in frame[["D", "E"]]: + for original, converted in zip(frame[column], + r_dataframe.rx2(column)): + + if pd.isnull(original): + assert is_na(converted) + else: + assert original == converted + + def test_convert_r_matrix(self): + + is_na = robj.baseenv.get("is.na") + + seriesd = tm.getSeriesData() + frame = 
pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) + # Null data + frame["E"] = [np.nan for item in frame["A"]] + + r_dataframe = com.convert_to_r_matrix(frame) + + assert np.array_equal( + com.convert_robj(r_dataframe.rownames), frame.index) + assert np.array_equal( + com.convert_robj(r_dataframe.colnames), frame.columns) + assert all(is_na(item) for item in r_dataframe.rx(True, "E")) + + for column in frame[["A", "B", "C", "D"]]: + coldata = r_dataframe.rx(True, column) + original_data = frame[column] + assert np.array_equal(com.convert_robj(coldata), + original_data) + + # Pandas bug 1282 + frame["F"] = ["text" if item % + 2 == 0 else np.nan for item in range(30)] + + try: + wrong_matrix = com.convert_to_r_matrix(frame) + except TypeError: + pass + except Exception: + raise + + def test_dist(self): + for name in ('eurodist',): + df = com.load_data(name) + dist = r[name] + labels = r['labels'](dist) + assert np.array_equal(df.index, labels) + assert np.array_equal(df.columns, labels) + + def test_timeseries(self): + """ + Test that the series has an informative index. + Unfortunately the code currently does not build a DateTimeIndex + """ + for name in ( + 'austres', 'co2', 'fdeaths', 'freeny.y', 'JohnsonJohnson', + 'ldeaths', 'mdeaths', 'nottem', 'presidents', 'sunspot.month', 'sunspots', + 'UKDriverDeaths', 'UKgas', 'USAccDeaths', + 'airmiles', 'discoveries', 'EuStockMarkets', + 'LakeHuron', 'lh', 'lynx', 'nhtemp', 'Nile', + 'Seatbelts', 'sunspot.year', 'treering', 'uspop'): + series = com.load_data(name) + ts = r[name] + assert np.array_equal(series.index, r['time'](ts)) + + def test_numeric(self): + for name in ('euro', 'islands', 'precip'): + series = com.load_data(name) + numeric = r[name] + names = numeric.names + assert np.array_equal(series.index, names) + + def test_table(self): + iris3 = pd.DataFrame({'X0': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'}, + 'X1': {0: 'Sepal L.', + 1: 'Sepal L.', + 2: 'Sepal L.', + 3: 'Sepal L.', + 4: 'Sepal L.'}, + 'X2': {0: 'Setosa', + 1: 'Setosa', + 2: 'Setosa', + 3: 'Setosa', + 4: 'Setosa'}, + 'value': {0: '5.1', 1: '4.9', 2: '4.7', 3: '4.6', 4: '5.0'}}) + hec = pd.DataFrame( + { + 'Eye': {0: 'Brown', 1: 'Brown', 2: 'Brown', 3: 'Brown', 4: 'Blue'}, + 'Hair': {0: 'Black', 1: 'Brown', 2: 'Red', 3: 'Blond', 4: 'Black'}, + 'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Male'}, + 'value': {0: '32.0', 1: '53.0', 2: '10.0', 3: '3.0', 4: '11.0'}}) + titanic = pd.DataFrame( + { + 'Age': {0: 'Child', 1: 'Child', 2: 'Child', 3: 'Child', 4: 'Child'}, + 'Class': {0: '1st', 1: '2nd', 2: '3rd', 3: 'Crew', 4: '1st'}, + 'Sex': {0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4: 'Female'}, + 'Survived': {0: 'No', 1: 'No', 2: 'No', 3: 'No', 4: 'No'}, + 'value': {0: '0.0', 1: '0.0', 2: '35.0', 3: '0.0', 4: '0.0'}}) + for name, expected in zip(('HairEyeColor', 'Titanic', 'iris3'), + (hec, titanic, iris3)): + df = com.load_data(name) + table = r[name] + names = r['dimnames'](table) + try: + columns = list(r['names'](names))[::-1] + except TypeError: + columns = ['X{:d}'.format(i) for i in range(len(names))][::-1] + columns.append('value') + assert np.array_equal(df.columns, columns) + result = df.head() + cond = ((result.sort(axis=1) == expected.sort(axis=1))).values + assert np.all(cond) + + def test_factor(self): + for name in ('state.division', 'state.region'): + vector = r[name] + factors = list(r['factor'](vector)) + level = list(r['levels'](vector)) + factors = [level[index - 1] for index in factors] + result = com.load_data(name) + assert np.equal(result, 
factors) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) diff --git a/pandas/rpy/vars.py b/pandas/rpy/vars.py new file mode 100644 index 00000000..4756b277 --- /dev/null +++ b/pandas/rpy/vars.py @@ -0,0 +1,20 @@ +import pandas.rpy.util as util + + +class VAR(object): + """ + + Parameters + ---------- + y : + p : + type : {"const", "trend", "both", "none"} + season : + exogen : + lag_max : + ic : {"AIC", "HQ", "SC", "FPE"} + Information criterion to use, if lag_max is not None + """ + def __init__(y, p=1, type="none", season=None, exogen=None, + lag_max=None, ic=None): + pass diff --git a/pandas/sandbox/__init__.py b/pandas/sandbox/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/sandbox/qtpandas.py b/pandas/sandbox/qtpandas.py new file mode 100644 index 00000000..3f284990 --- /dev/null +++ b/pandas/sandbox/qtpandas.py @@ -0,0 +1,135 @@ +''' +Easy integration of DataFrame into pyqt framework + +@author: Jev Kuznetsov +''' +try: + from PyQt4.QtCore import QAbstractTableModel, Qt, QVariant, QModelIndex + from PyQt4.QtGui import ( + QApplication, QDialog, QVBoxLayout, QTableView, QWidget) +except ImportError: + from PySide.QtCore import QAbstractTableModel, Qt, QModelIndex + from PySide.QtGui import ( + QApplication, QDialog, QVBoxLayout, QTableView, QWidget) + QVariant = lambda value=None: value + +from pandas import DataFrame, Index + + +class DataFrameModel(QAbstractTableModel): + ''' data model for a DataFrame class ''' + def __init__(self): + super(DataFrameModel, self).__init__() + self.df = DataFrame() + + def setDataFrame(self, dataFrame): + self.df = dataFrame + + def signalUpdate(self): + ''' tell viewers to update their data (this is full update, not + efficient)''' + self.layoutChanged.emit() + + #------------- table display functions ----------------- + def headerData(self, section, orientation, role=Qt.DisplayRole): + if role != Qt.DisplayRole: + return QVariant() + + if orientation == Qt.Horizontal: + try: + return self.df.columns.tolist()[section] + except (IndexError, ): + return QVariant() + elif orientation == Qt.Vertical: + try: + # return self.df.index.tolist() + return self.df.index.tolist()[section] + except (IndexError, ): + return QVariant() + + def data(self, index, role=Qt.DisplayRole): + if role != Qt.DisplayRole: + return QVariant() + + if not index.isValid(): + return QVariant() + + return QVariant(str(self.df.ix[index.row(), index.column()])) + + def flags(self, index): + flags = super(DataFrameModel, self).flags(index) + flags |= Qt.ItemIsEditable + return flags + + def setData(self, index, value, role): + row = self.df.index[index.row()] + col = self.df.columns[index.column()] + if hasattr(value, 'toPyObject'): + # PyQt4 gets a QVariant + value = value.toPyObject() + else: + # PySide gets an unicode + dtype = self.df[col].dtype + if dtype != object: + value = None if value == '' else dtype.type(value) + self.df.set_value(row, col, value) + return True + + def rowCount(self, index=QModelIndex()): + return self.df.shape[0] + + def columnCount(self, index=QModelIndex()): + return self.df.shape[1] + + +class DataFrameWidget(QWidget): + ''' a simple widget for using DataFrames in a gui ''' + def __init__(self, dataFrame, parent=None): + super(DataFrameWidget, self).__init__(parent) + + self.dataModel = DataFrameModel() + self.dataTable = QTableView() + self.dataTable.setModel(self.dataModel) + + layout = 
QVBoxLayout() + layout.addWidget(self.dataTable) + self.setLayout(layout) + # Set DataFrame + self.setDataFrame(dataFrame) + + def setDataFrame(self, dataFrame): + self.dataModel.setDataFrame(dataFrame) + self.dataModel.signalUpdate() + self.dataTable.resizeColumnsToContents() + +#-----------------stand alone test code + + +def testDf(): + ''' creates test dataframe ''' + data = {'int': [1, 2, 3], 'float': [1.5, 2.5, 3.5], + 'string': ['a', 'b', 'c'], 'nan': [np.nan, np.nan, np.nan]} + return DataFrame(data, index=Index(['AAA', 'BBB', 'CCC']), + columns=['int', 'float', 'string', 'nan']) + + +class Form(QDialog): + def __init__(self, parent=None): + super(Form, self).__init__(parent) + + df = testDf() # make up some data + widget = DataFrameWidget(df) + widget.resizeColumnsToContents() + + layout = QVBoxLayout() + layout.addWidget(widget) + self.setLayout(layout) + +if __name__ == '__main__': + import sys + import numpy as np + + app = QApplication(sys.argv) + form = Form() + form.show() + app.exec_() diff --git a/pandas/sparse/__init__.py b/pandas/sparse/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/sparse/api.py b/pandas/sparse/api.py new file mode 100644 index 00000000..230ad159 --- /dev/null +++ b/pandas/sparse/api.py @@ -0,0 +1,7 @@ +# pylint: disable=W0611 + +from pandas.sparse.array import SparseArray +from pandas.sparse.list import SparseList +from pandas.sparse.series import SparseSeries, SparseTimeSeries +from pandas.sparse.frame import SparseDataFrame +from pandas.sparse.panel import SparsePanel diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py new file mode 100644 index 00000000..38a5688e --- /dev/null +++ b/pandas/sparse/array.py @@ -0,0 +1,531 @@ +""" +SparseArray data structure +""" +from __future__ import division +# pylint: disable=E1101,E1103,W0231 + +from numpy import nan, ndarray +import numpy as np + +from pandas.core.base import PandasObject +import pandas.core.common as com + +from pandas import compat, lib +from pandas.compat import range + +from pandas._sparse import BlockIndex, IntIndex +import pandas._sparse as splib +import pandas.index as _index +import pandas.core.ops as ops + + +def _arith_method(op, name, str_rep=None, default_axis=None, + fill_zeros=None, **eval_kwargs): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + def wrapper(self, other): + if isinstance(other, np.ndarray): + if len(self) != len(other): + raise AssertionError("length mismatch: %d vs. 
%d" % + (len(self), len(other))) + if not isinstance(other, com.ABCSparseArray): + other = SparseArray(other, fill_value=self.fill_value) + if name[0] == 'r': + return _sparse_array_op(other, self, op, name[1:]) + else: + return _sparse_array_op(self, other, op, name) + elif np.isscalar(other): + new_fill_value = op(np.float64(self.fill_value), + np.float64(other)) + + return SparseArray(op(self.sp_values, other), + sparse_index=self.sp_index, + fill_value=new_fill_value) + else: # pragma: no cover + raise TypeError('operation with %s not supported' % type(other)) + if name.startswith("__"): + name = name[2:-2] + wrapper.__name__ = name + return wrapper + + +def _sparse_array_op(left, right, op, name): + if np.isnan(left.fill_value): + sparse_op = lambda a, b: _sparse_nanop(a, b, name) + else: + sparse_op = lambda a, b: _sparse_fillop(a, b, name) + + if left.sp_index.equals(right.sp_index): + result = op(left.sp_values, right.sp_values) + result_index = left.sp_index + else: + result, result_index = sparse_op(left, right) + + try: + fill_value = op(left.fill_value, right.fill_value) + except: + fill_value = nan + + return SparseArray(result, sparse_index=result_index, + fill_value=fill_value) + + +def _sparse_nanop(this, other, name): + sparse_op = getattr(splib, 'sparse_nan%s' % name) + result, result_index = sparse_op(this.sp_values, + this.sp_index, + other.sp_values, + other.sp_index) + + return result, result_index + + +def _sparse_fillop(this, other, name): + sparse_op = getattr(splib, 'sparse_%s' % name) + result, result_index = sparse_op(this.sp_values, + this.sp_index, + this.fill_value, + other.sp_values, + other.sp_index, + other.fill_value) + + return result, result_index + + +class SparseArray(PandasObject, np.ndarray): + + """Data structure for labeled, sparse floating point data + +Parameters +---------- +data : {array-like, Series, SparseSeries, dict} +kind : {'block', 'integer'} +fill_value : float + Defaults to NaN (code for missing) +sparse_index : {BlockIndex, IntIndex}, optional + Only if you have one. Mainly used internally + +Notes +----- +SparseArray objects are immutable via the typical Python means. 
If you +must change values, convert to dense, make your changes, then convert back +to sparse + """ + __array_priority__ = 15 + _typ = 'array' + _subtyp = 'sparse_array' + + sp_index = None + fill_value = None + + def __new__( + cls, data, sparse_index=None, index=None, kind='integer', fill_value=None, + dtype=np.float64, copy=False): + + if index is not None: + if data is None: + data = np.nan + if not np.isscalar(data): + raise Exception("must only pass scalars with an index ") + values = np.empty(len(index), dtype='float64') + values.fill(data) + data = values + + if dtype is not None: + dtype = np.dtype(dtype) + is_sparse_array = isinstance(data, SparseArray) + if fill_value is None: + if is_sparse_array: + fill_value = data.fill_value + else: + fill_value = nan + + if is_sparse_array: + sparse_index = data.sp_index + values = np.asarray(data) + else: + # array-like + if sparse_index is None: + values, sparse_index = make_sparse(data, kind=kind, + fill_value=fill_value) + else: + values = data + if len(values) != sparse_index.npoints: + raise AssertionError("Non array-like type {0} must have" + " the same length as the" + " index".format(type(values))) + + # Create array, do *not* copy data by default + if copy: + subarr = np.array(values, dtype=dtype, copy=True) + else: + subarr = np.asarray(values, dtype=dtype) + + # if we have a bool type, make sure that we have a bool fill_value + if (dtype is not None and issubclass(dtype.type, np.bool_)) or (data is not None and lib.is_bool_array(subarr)): + if np.isnan(fill_value) or not fill_value: + fill_value = False + else: + fill_value = bool(fill_value) + + # Change the class of the array to be the subclass type. + output = subarr.view(cls) + output.sp_index = sparse_index + output.fill_value = fill_value + return output + + @property + def _constructor(self): + return lambda x: SparseArray(x, fill_value=self.fill_value, + kind=self.kind) + + @property + def kind(self): + if isinstance(self.sp_index, BlockIndex): + return 'block' + elif isinstance(self.sp_index, IntIndex): + return 'integer' + + def __array_finalize__(self, obj): + """ + Gets called after any ufunc or other array operations, necessary + to pass on the index. 
+ """ + self.sp_index = getattr(obj, 'sp_index', None) + self.fill_value = getattr(obj, 'fill_value', None) + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(ndarray.__reduce__(self)) + subclass_state = self.fill_value, self.sp_index + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + nd_state, own_state = state + ndarray.__setstate__(self, nd_state) + + fill_value, sp_index = own_state[:2] + self.sp_index = sp_index + self.fill_value = fill_value + + def __len__(self): + try: + return self.sp_index.length + except: + return 0 + + def __unicode__(self): + return '%s\nFill: %s\n%s' % (com.pprint_thing(self), + com.pprint_thing(self.fill_value), + com.pprint_thing(self.sp_index)) + + def disable(self, other): + raise NotImplementedError('inplace binary ops not supported') + # Inplace operators + __iadd__ = disable + __isub__ = disable + __imul__ = disable + __itruediv__ = disable + __ifloordiv__ = disable + __ipow__ = disable + + # Python 2 division operators + if not compat.PY3: + __idiv__ = disable + + @property + def values(self): + """ + Dense values + """ + output = np.empty(len(self), dtype=np.float64) + int_index = self.sp_index.to_int_index() + output.fill(self.fill_value) + output.put(int_index.indices, self) + return output + + @property + def sp_values(self): + # caching not an option, leaks memory + return self.view(np.ndarray) + + def get_values(self, fill=None): + """ return a dense representation """ + return self.to_dense(fill=fill) + + def to_dense(self, fill=None): + """ + Convert SparseSeries to (dense) Series + """ + values = self.values + + # fill the nans + if fill is None: + fill = self.fill_value + if not np.isnan(fill): + values[np.isnan(values)] = fill + + return values + + def __iter__(self): + for i in range(len(self)): + yield self._get_val_at(i) + raise StopIteration + + def __getitem__(self, key): + """ + + """ + if com.is_integer(key): + return self._get_val_at(key) + else: + data_slice = self.values[key] + return self._constructor(data_slice) + + def __getslice__(self, i, j): + if i < 0: + i = 0 + if j < 0: + j = 0 + slobj = slice(i, j) + return self.__getitem__(slobj) + + def _get_val_at(self, loc): + n = len(self) + if loc < 0: + loc += n + + if loc >= n or loc < 0: + raise IndexError('Out of bounds access') + + sp_loc = self.sp_index.lookup(loc) + if sp_loc == -1: + return self.fill_value + else: + return _index.get_value_at(self, sp_loc) + + def take(self, indices, axis=0): + """ + Sparse-compatible version of ndarray.take + + Returns + ------- + taken : ndarray + """ + if axis: + raise ValueError("axis must be 0, input was {0}".format(axis)) + indices = np.atleast_1d(np.asarray(indices, dtype=int)) + + # allow -1 to indicate missing values + n = len(self) + if ((indices >= n) | (indices < -1)).any(): + raise IndexError('out of bounds access') + + if self.sp_index.npoints > 0: + locs = np.array([self.sp_index.lookup(loc) if loc > -1 else -1 + for loc in indices]) + result = self.sp_values.take(locs) + mask = locs == -1 + if mask.any(): + try: + result[mask] = self.fill_value + except ValueError: + # wrong dtype + result = result.astype('float64') + result[mask] = self.fill_value + + else: + result = np.empty(len(indices)) + result.fill(self.fill_value) + + return result + + def __setitem__(self, key, value): + # if com.is_integer(key): + # self.values[key] = value + # else: + # raise 
Exception("SparseArray does not support seting non-scalars via setitem") + raise TypeError( + "SparseArray does not support item assignment via setitem") + + def __setslice__(self, i, j, value): + if i < 0: + i = 0 + if j < 0: + j = 0 + slobj = slice(i, j) + + # if not np.isscalar(value): + # raise Exception("SparseArray does not support seting non-scalars via slices") + + #x = self.values + #x[slobj] = value + #self.values = x + raise TypeError( + "SparseArray does not support item assignment via slices") + + def astype(self, dtype=None): + """ + + """ + dtype = np.dtype(dtype) + if dtype is not None and dtype not in (np.float_, float): + raise TypeError('Can only support floating point data for now') + return self.copy() + + def copy(self, deep=True): + """ + Make a copy of the SparseSeries. Only the actual sparse values need to + be copied + """ + if deep: + values = self.sp_values.copy() + else: + values = self.sp_values + return SparseArray(values, sparse_index=self.sp_index, + dtype=self.dtype, + fill_value=self.fill_value) + + def count(self): + """ + Compute sum of non-NA/null observations in SparseSeries. If the + fill_value is not NaN, the "sparse" locations will be included in the + observation count + + Returns + ------- + nobs : int + """ + sp_values = self.sp_values + valid_spvals = np.isfinite(sp_values).sum() + if self._null_fill_value: + return valid_spvals + else: + return valid_spvals + self.sp_index.ngaps + + @property + def _null_fill_value(self): + return np.isnan(self.fill_value) + + @property + def _valid_sp_values(self): + sp_vals = self.sp_values + mask = np.isfinite(sp_vals) + return sp_vals[mask] + + def sum(self, axis=None, dtype=None, out=None): + """ + Sum of non-NA/null values + + Returns + ------- + sum : float + """ + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + if self._null_fill_value: + return sp_sum + else: + nsparse = self.sp_index.ngaps + return sp_sum + self.fill_value * nsparse + + def cumsum(self, axis=0, dtype=None, out=None): + """ + Cumulative sum of values. Preserves locations of NaN values + + Extra parameters are to preserve ndarray interface. + + Returns + ------- + cumsum : Series + """ + if com.notnull(self.fill_value): + return self.to_dense().cumsum() + # TODO: what if sp_values contains NaN?? 
+ return SparseArray(self.sp_values.cumsum(), + sparse_index=self.sp_index, + fill_value=self.fill_value) + + def mean(self, axis=None, dtype=None, out=None): + """ + Mean of non-NA/null values + + Returns + ------- + mean : float + """ + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + ct = len(valid_vals) + + if self._null_fill_value: + return sp_sum / ct + else: + nsparse = self.sp_index.ngaps + return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) + + +def _maybe_to_dense(obj): + """ try to convert to dense """ + if hasattr(obj, 'to_dense'): + return obj.to_dense() + return obj + + +def _maybe_to_sparse(array): + if isinstance(array, com.ABCSparseSeries): + array = SparseArray( + array.values, sparse_index=array.sp_index, fill_value=array.fill_value, copy=True) + if not isinstance(array, SparseArray): + array = com._values_from_object(array) + return array + + +def make_sparse(arr, kind='block', fill_value=nan): + """ + Convert ndarray to sparse format + + Parameters + ---------- + arr : ndarray + kind : {'block', 'integer'} + fill_value : NaN or another value + + Returns + ------- + (sparse_values, index) : (ndarray, SparseIndex) + """ + if hasattr(arr, 'values'): + arr = arr.values + else: + if np.isscalar(arr): + arr = [arr] + arr = np.asarray(arr) + + length = len(arr) + + if np.isnan(fill_value): + mask = ~np.isnan(arr) + else: + mask = arr != fill_value + + indices = np.arange(length, dtype=np.int32)[mask] + + if kind == 'block': + locs, lens = splib.get_blocks(indices) + index = BlockIndex(length, locs, lens) + elif kind == 'integer': + index = IntIndex(length, indices) + else: # pragma: no cover + raise ValueError('must be block or integer type') + + sparsified_values = arr[mask] + return sparsified_values, index + +ops.add_special_arithmetic_methods(SparseArray, + arith_method=_arith_method, + use_numexpr=False) diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py new file mode 100644 index 00000000..bd34c7e5 --- /dev/null +++ b/pandas/sparse/frame.py @@ -0,0 +1,832 @@ +""" +Data structures for sparse float data. Life is made simpler by dealing only +with float64 data +""" +from __future__ import division +# pylint: disable=E1101,E1103,W0231,E0202 + +from numpy import nan +from pandas.compat import range, lmap, map +from pandas import compat +import numpy as np + +from pandas.core.common import (isnull, notnull, _pickle_array, + _unpickle_array, _try_sort) +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.indexing import _maybe_convert_indices +from pandas.core.series import Series +from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, + _default_index) +from pandas.util.decorators import cache_readonly +import pandas.core.common as com +import pandas.core.datetools as datetools +from pandas.core.internals import BlockManager, create_block_manager_from_arrays + +from pandas.core.generic import NDFrame +from pandas.sparse.series import SparseSeries, SparseArray +from pandas.util.decorators import Appender +import pandas.core.ops as ops + + +class SparseDataFrame(DataFrame): + + """ + DataFrame containing sparse floating point data in the form of SparseSeries + objects + + Parameters + ---------- + data : same types as can be passed to DataFrame + index : array-like, optional + column : array-like, optional + default_kind : {'block', 'integer'}, default 'block' + Default sparse kind for converting Series to SparseSeries. 
Will not + override SparseSeries passed into constructor + default_fill_value : float + Default fill_value for converting Series to SparseSeries. Will not + override SparseSeries passed in + """ + _constructor_sliced = SparseSeries + _subtyp = 'sparse_frame' + + def __init__(self, data=None, index=None, columns=None, + default_kind=None, default_fill_value=None, + dtype=None, copy=False): + + # pick up the defaults from the Sparse structures + if isinstance(data, SparseDataFrame): + if index is None: + index = data.index + if columns is None: + columns = data.columns + if default_fill_value is None: + default_fill_value = data.default_fill_value + if default_kind is None: + default_kind = data.default_kind + elif isinstance(data, (SparseSeries, SparseArray)): + if index is None: + index = data.index + if default_fill_value is None: + default_fill_value = data.fill_value + if columns is None and hasattr(data, 'name'): + columns = [data.name] + if columns is None: + raise Exception("cannot pass a series w/o a name or columns") + data = {columns[0]: data} + + if default_fill_value is None: + default_fill_value = np.nan + if default_kind is None: + default_kind = 'block' + + self._default_kind = default_kind + self._default_fill_value = default_fill_value + + if isinstance(data, dict): + mgr = self._init_dict(data, index, columns) + if dtype is not None: + mgr = mgr.astype(dtype) + elif isinstance(data, (np.ndarray, list)): + mgr = self._init_matrix(data, index, columns) + if dtype is not None: + mgr = mgr.astype(dtype) + elif isinstance(data, SparseDataFrame): + mgr = self._init_mgr( + data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy) + elif isinstance(data, DataFrame): + mgr = self._init_dict(data, data.index, data.columns) + if dtype is not None: + mgr = mgr.astype(dtype) + elif isinstance(data, BlockManager): + mgr = self._init_mgr( + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) + elif data is None: + data = {} + + if index is None: + index = Index([]) + else: + index = _ensure_index(index) + + if columns is None: + columns = Index([]) + else: + for c in columns: + data[c] = SparseArray(np.nan, + index=index, + kind=self._default_kind, + fill_value=self._default_fill_value) + mgr = dict_to_manager(data, columns, index) + if dtype is not None: + mgr = mgr.astype(dtype) + + NDFrame.__init__(self, mgr) + + @property + def _constructor(self): + def wrapper(data=None, index=None, columns=None, default_fill_value=None, kind=None, fill_value=None, copy=False): + result = SparseDataFrame(data, index=index, columns=columns, + default_fill_value=fill_value, + default_kind=kind, + copy=copy) + + # fill if requested + if fill_value is not None and not isnull(fill_value): + result.fillna(fill_value, inplace=True) + + # set the default_fill_value + # if default_fill_value is not None: + # result._default_fill_value = default_fill_value + return result + + return wrapper + + def _init_dict(self, data, index, columns, dtype=None): + # pre-filter out columns if we passed it + if columns is not None: + columns = _ensure_index(columns) + data = dict((k, v) for k, v in compat.iteritems(data) if k in columns) + else: + columns = Index(_try_sort(list(data.keys()))) + + if index is None: + index = extract_index(list(data.values())) + + sp_maker = lambda x: SparseArray(x, + kind=self._default_kind, + fill_value=self._default_fill_value, + copy=True) + sdict = {} + for k, v in compat.iteritems(data): + if isinstance(v, Series): + # Force alignment, no copy necessary 
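+                # (reindexing may introduce NaNs for missing labels; when the
+                # frame's default fill_value is NaN those become sparse gaps
+                # in the SparseArray built below)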
+ if not v.index.equals(index): + v = v.reindex(index) + + if not isinstance(v, SparseSeries): + v = sp_maker(v.values) + elif isinstance(v, SparseArray): + v = sp_maker(v.values) + else: + if isinstance(v, dict): + v = [v.get(i, nan) for i in index] + + v = sp_maker(v) + sdict[k] = v + + # TODO: figure out how to handle this case, all nan's? + # add in any other columns we want to have (completeness) + nan_vec = np.empty(len(index)) + nan_vec.fill(nan) + for c in columns: + if c not in sdict: + sdict[c] = sp_maker(nan_vec) + + return dict_to_manager(sdict, columns, index) + + def _init_matrix(self, data, index, columns, dtype=None): + data = _prep_ndarray(data, copy=False) + N, K = data.shape + if index is None: + index = _default_index(N) + if columns is None: + columns = _default_index(K) + + if len(columns) != K: + raise ValueError('Column length mismatch: %d vs. %d' % + (len(columns), K)) + if len(index) != N: + raise ValueError('Index length mismatch: %d vs. %d' % + (len(index), N)) + + data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)]) + return self._init_dict(data, index, columns, dtype) + + def __array_wrap__(self, result): + return SparseDataFrame(result, index=self.index, columns=self.columns, + default_kind=self._default_kind, + default_fill_value=self._default_fill_value).__finalize__(self) + + def __getstate__(self): + # pickling + return dict(_typ=self._typ, + _subtyp=self._subtyp, + _data=self._data, + _default_fill_value=self._default_fill_value, + _default_kind=self._default_kind) + + def _unpickle_sparse_frame_compat(self, state): + """ original pickle format """ + series, cols, idx, fv, kind = state + + if not isinstance(cols, Index): # pragma: no cover + columns = _unpickle_array(cols) + else: + columns = cols + + if not isinstance(idx, Index): # pragma: no cover + index = _unpickle_array(idx) + else: + index = idx + + series_dict = {} + for col, (sp_index, sp_values) in compat.iteritems(series): + series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, + fill_value=fv) + + self._data = dict_to_manager(series_dict, columns, index) + self._default_fill_value = fv + self._default_kind = kind + + def to_dense(self): + """ + Convert to dense DataFrame + + Returns + ------- + df : DataFrame + """ + data = dict((k, v.to_dense()) for k, v in compat.iteritems(self)) + return DataFrame(data, index=self.index) + + def astype(self, dtype): + raise NotImplementedError + + def copy(self, deep=True): + """ + Make a copy of this SparseDataFrame + """ + result = super(SparseDataFrame, self).copy(deep=deep) + result._default_fill_value = self._default_fill_value + result._default_kind = self._default_kind + return result + + @property + def default_fill_value(self): + return self._default_fill_value + + @property + def default_kind(self): + return self._default_kind + + @property + def density(self): + """ + Ratio of non-sparse points to total (dense) data points + represented in the frame + """ + tot_nonsparse = sum([ser.sp_index.npoints + for _, ser in compat.iteritems(self)]) + tot = len(self.index) * len(self.columns) + return tot_nonsparse / float(tot) + + def fillna(self, value=None, method=None, axis=0, inplace=False, + limit=None, downcast=None): + new_self = super( + SparseDataFrame, self).fillna(value=value, method=method, axis=axis, + inplace=inplace, limit=limit, downcast=downcast) + if not inplace: + self = new_self + + # set the fill value if we are filling as a scalar with nothing special + # going on + if value is not None and value == value and 
method is None and limit is None: + self._default_fill_value = value + + if not inplace: + return self + + #---------------------------------------------------------------------- + # Support different internal representation of SparseDataFrame + + def _sanitize_column(self, key, value): + sp_maker = lambda x, index=None: SparseArray(x, + index=index, + fill_value=self._default_fill_value, + kind=self._default_kind) + if isinstance(value, SparseSeries): + clean = value.reindex( + self.index).as_sparse_array(fill_value=self._default_fill_value, + kind=self._default_kind) + + elif isinstance(value, SparseArray): + if len(value) != len(self.index): + raise AssertionError('Length of values does not match ' + 'length of index') + clean = value + + elif hasattr(value, '__iter__'): + if isinstance(value, Series): + clean = value.reindex(self.index) + if not isinstance(value, SparseSeries): + clean = sp_maker(clean) + else: + if len(value) != len(self.index): + raise AssertionError('Length of values does not match ' + 'length of index') + clean = sp_maker(value) + + # Scalar + else: + clean = sp_maker(value, self.index) + + # always return a SparseArray! + return clean + + def __getitem__(self, key): + """ + Retrieve column or slice from DataFrame + """ + if isinstance(key, slice): + date_rng = self.index[key] + return self.reindex(date_rng) + elif isinstance(key, (np.ndarray, list, Series)): + return self._getitem_array(key) + else: + return self._get_item_cache(key) + + @Appender(DataFrame.get_value.__doc__, indents=0) + def get_value(self, index, col, takeable=False): + if takeable is True: + series = self._iget_item_cache(col) + else: + series = self._get_item_cache(col) + + return series.get_value(index, takeable=takeable) + + def set_value(self, index, col, value, takeable=False): + """ + Put single value at passed column and index + + Parameters + ---------- + index : row label + col : column label + value : scalar value + takeable : interpret the index/col as indexers, default False + + Notes + ----- + This method *always* returns a new object. It is currently not + particularly efficient (and potentially very expensive) but is provided + for API compatibility with DataFrame + + Returns + ------- + frame : DataFrame + """ + dense = self.to_dense().set_value(index, col, value, takeable=takeable) + return dense.to_sparse(kind=self._default_kind, + fill_value=self._default_fill_value) + + def _slice(self, slobj, axis=0, typ=None): + if axis == 0: + new_index = self.index[slobj] + new_columns = self.columns + else: + new_index = self.index + new_columns = self.columns[slobj] + + return self.reindex(index=new_index, columns=new_columns) + + def xs(self, key, axis=0, copy=False): + """ + Returns a row (cross-section) from the SparseDataFrame as a Series + object. 
+ + Parameters + ---------- + key : some index contained in the index + + Returns + ------- + xs : Series + """ + if axis == 1: + data = self[key] + return data + + i = self.index.get_loc(key) + data = self.take([i]).get_values()[0] + return Series(data, index=self.columns) + + #---------------------------------------------------------------------- + # Arithmetic-related methods + + def _combine_frame(self, other, func, fill_value=None, level=None): + this, other = self.align(other, join='outer', level=level, + copy=False) + new_index, new_columns = this.index, this.columns + + if level is not None: + raise NotImplementedError + + if self.empty and other.empty: + return SparseDataFrame(index=new_index).__finalize__(self) + + new_data = {} + new_fill_value = None + if fill_value is not None: + # TODO: be a bit more intelligent here + for col in new_columns: + if col in this and col in other: + dleft = this[col].to_dense() + dright = other[col].to_dense() + result = dleft._binop(dright, func, fill_value=fill_value) + result = result.to_sparse(fill_value=this[col].fill_value) + new_data[col] = result + else: + + for col in new_columns: + if col in this and col in other: + new_data[col] = func(this[col], other[col]) + + # if the fill values are the same use them? or use a valid one + other_fill_value = getattr(other, 'default_fill_value', np.nan) + if self.default_fill_value == other_fill_value: + new_fill_value = self.default_fill_value + elif np.isnan(self.default_fill_value) and not np.isnan(other_fill_value): + new_fill_value = other_fill_value + elif not np.isnan(self.default_fill_value) and np.isnan(other_fill_value): + new_fill_value = self.default_fill_value + + return self._constructor(data=new_data, + index=new_index, + columns=new_columns, + default_fill_value=new_fill_value, + fill_value=new_fill_value).__finalize__(self) + + def _combine_match_index(self, other, func, level=None, fill_value=None): + new_data = {} + + if fill_value is not None: + raise NotImplementedError + if level is not None: + raise NotImplementedError + + new_index = self.index.union(other.index) + this = self + if self.index is not new_index: + this = self.reindex(new_index) + + if other.index is not new_index: + other = other.reindex(new_index) + + for col, series in compat.iteritems(this): + new_data[col] = func(series.values, other.values) + + # fill_value is a function of our operator + if isnull(other.fill_value) or isnull(self.default_fill_value): + fill_value = np.nan + else: + fill_value = func(np.float64(self.default_fill_value), + np.float64(other.fill_value)) + + return self._constructor(new_data, + index=new_index, + columns=self.columns, + default_fill_value=fill_value, + fill_value=self.default_fill_value).__finalize__(self) + + def _combine_match_columns(self, other, func, level=None, fill_value=None): + # patched version of DataFrame._combine_match_columns to account for + # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series, + # where 3.0 is numpy.float64 and series is a SparseSeries. 
Still + # possible for this to happen, which is bothersome + + if fill_value is not None: + raise NotImplementedError + if level is not None: + raise NotImplementedError + + new_data = {} + + union = intersection = self.columns + + if not union.equals(other.index): + union = other.index.union(self.columns) + intersection = other.index.intersection(self.columns) + + for col in intersection: + new_data[col] = func(self[col], float(other[col])) + + return self._constructor(new_data, + index=self.index, + columns=union, + default_fill_value=self.default_fill_value, + fill_value=self.default_fill_value).__finalize__(self) + + def _combine_const(self, other, func): + new_data = {} + for col, series in compat.iteritems(self): + new_data[col] = func(series, other) + + return self._constructor(data=new_data, + index=self.index, + columns=self.columns, + default_fill_value=self.default_fill_value, + fill_value=self.default_fill_value).__finalize__(self) + + def _reindex_index(self, index, method, copy, level, fill_value=np.nan, + limit=None, takeable=False): + if level is not None: + raise TypeError('Reindex by level not supported for sparse') + + if self.index.equals(index): + if copy: + return self.copy() + else: + return self + + if len(self.index) == 0: + return SparseDataFrame(index=index, columns=self.columns) + + indexer = self.index.get_indexer(index, method, limit=limit) + indexer = com._ensure_platform_int(indexer) + mask = indexer == -1 + need_mask = mask.any() + + new_series = {} + for col, series in self.iteritems(): + if mask.all(): + continue + + values = series.values + new = values.take(indexer) + + if need_mask: + np.putmask(new, mask, fill_value) + + new_series[col] = new + + return SparseDataFrame(new_series, index=index, columns=self.columns, + default_fill_value=self._default_fill_value) + + def _reindex_columns(self, columns, copy, level, fill_value, limit=None, + takeable=False): + if level is not None: + raise TypeError('Reindex by level not supported for sparse') + + if com.notnull(fill_value): + raise NotImplementedError + + if limit: + raise NotImplementedError + + # TODO: fill value handling + sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns) + return SparseDataFrame(sdict, index=self.index, columns=columns, + default_fill_value=self._default_fill_value) + + def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, limit=None, + copy=False, allow_dups=False): + + if method is not None or limit is not None: + raise NotImplementedError("cannot reindex with a method or limit with sparse") + + if fill_value is None: + fill_value = np.nan + + index, row_indexer = reindexers.get(0, (None, None)) + columns, col_indexer = reindexers.get(1, (None, None)) + + if columns is None: + columns = self.columns + + new_arrays = {} + for col in columns: + if col not in self: + continue + if row_indexer is not None: + new_arrays[col] = com.take_1d( + self[col].get_values(), row_indexer, + fill_value=fill_value) + else: + new_arrays[col] = self[col] + + return SparseDataFrame(new_arrays, index=index, columns=columns).__finalize__(self) + + def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', + sort=False): + if on is not None: + raise NotImplementedError("'on' keyword parameter is not yet " + "implemented") + return self._join_index(other, how, lsuffix, rsuffix) + + def _join_index(self, other, how, lsuffix, rsuffix): + if isinstance(other, Series): + if other.name is None: + raise ValueError('Other Series must have a name') + + 
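+            # A named Series is wrapped as a one-column SparseDataFrame so the
+            # index join below can align it like any other frame.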
other = SparseDataFrame({other.name: other}, + default_fill_value=self._default_fill_value) + + join_index = self.index.join(other.index, how=how) + + this = self.reindex(join_index) + other = other.reindex(join_index) + + this, other = this._maybe_rename_join(other, lsuffix, rsuffix) + + from pandas import concat + return concat([this, other], axis=1, verify_integrity=True) + + def _maybe_rename_join(self, other, lsuffix, rsuffix): + to_rename = self.columns.intersection(other.columns) + if len(to_rename) > 0: + if not lsuffix and not rsuffix: + raise ValueError('columns overlap but no suffix specified: %s' + % to_rename) + + def lrenamer(x): + if x in to_rename: + return '%s%s' % (x, lsuffix) + return x + + def rrenamer(x): + if x in to_rename: + return '%s%s' % (x, rsuffix) + return x + + this = self.rename(columns=lrenamer) + other = other.rename(columns=rrenamer) + else: + this = self + + return this, other + + def transpose(self): + """ + Returns a DataFrame with the rows/columns switched. + """ + return SparseDataFrame(self.values.T, index=self.columns, + columns=self.index, + default_fill_value=self._default_fill_value, + default_kind=self._default_kind).__finalize__(self) + T = property(transpose) + + @Appender(DataFrame.count.__doc__) + def count(self, axis=0, **kwds): + return self.apply(lambda x: x.count(), axis=axis) + + def cumsum(self, axis=0): + """ + Return SparseDataFrame of cumulative sums over requested axis. + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + + Returns + ------- + y : SparseDataFrame + """ + return self.apply(lambda x: x.cumsum(), axis=axis) + + def apply(self, func, axis=0, broadcast=False, reduce=False): + """ + Analogous to DataFrame.apply, for SparseDataFrame + + Parameters + ---------- + func : function + Function to apply to each column + axis : {0, 1, 'index', 'columns'} + broadcast : bool, default False + For aggregation functions, return object of same size with values + propagated + + Returns + ------- + applied : Series or SparseDataFrame + """ + if not len(self.columns): + return self + axis = self._get_axis_number(axis) + + if isinstance(func, np.ufunc): + new_series = {} + for k, v in compat.iteritems(self): + applied = func(v) + applied.fill_value = func(applied.fill_value) + new_series[k] = applied + return self._constructor(new_series, index=self.index, + columns=self.columns, + default_fill_value=self._default_fill_value, + kind=self._default_kind).__finalize__(self) + else: + if not broadcast: + return self._apply_standard(func, axis, reduce=reduce) + else: + return self._apply_broadcast(func, axis) + + def applymap(self, func): + """ + Apply a function to a DataFrame that is intended to operate + elementwise, i.e. 
like doing map(func, series) for each series in the + DataFrame + + Parameters + ---------- + func : function + Python function, returns a single value from a single value + + Returns + ------- + applied : DataFrame + """ + return self.apply(lambda x: lmap(func, x)) + +def dict_to_manager(sdict, columns, index): + """ create and return the block manager from a dict of series, columns, index """ + + # from BlockManager perspective + axes = [_ensure_index(columns), _ensure_index(index)] + + return create_block_manager_from_arrays([sdict[c] for c in columns], columns, axes) + + +def stack_sparse_frame(frame): + """ + Only makes sense when fill_value is NaN + """ + lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)] + nobs = sum(lengths) + + # this is pretty fast + minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) + + inds_to_concat = [] + vals_to_concat = [] + # TODO: Figure out whether this can be reached. + # I think this currently can't be reached because you can't build a SparseDataFrame + # with a non-np.NaN fill value (fails earlier). + for _, series in compat.iteritems(frame): + if not np.isnan(series.fill_value): + raise TypeError('This routine assumes NaN fill value') + + int_index = series.sp_index.to_int_index() + inds_to_concat.append(int_index.indices) + vals_to_concat.append(series.sp_values) + + major_labels = np.concatenate(inds_to_concat) + stacked_values = np.concatenate(vals_to_concat) + index = MultiIndex(levels=[frame.index, frame.columns], + labels=[major_labels, minor_labels], + verify_integrity=False) + + lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, + columns=['foo']) + return lp.sortlevel(level=0) + + +def homogenize(series_dict): + """ + Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex + corresponding to the locations where they all have data + + Parameters + ---------- + series_dict : dict or DataFrame + + Notes + ----- + Using the dumbest algorithm I could think of. Should put some more thought + into this + + Returns + ------- + homogenized : dict of SparseSeries + """ + index = None + + need_reindex = False + + for _, series in compat.iteritems(series_dict): + if not np.isnan(series.fill_value): + raise TypeError('this method is only valid with NaN fill values') + + if index is None: + index = series.sp_index + elif not series.sp_index.equals(index): + need_reindex = True + index = index.intersect(series.sp_index) + + if need_reindex: + output = {} + for name, series in compat.iteritems(series_dict): + if not series.sp_index.equals(index): + series = series.sparse_reindex(index) + + output[name] = series + else: + output = series_dict + + return output + +# use unaccelerated ops for sparse objects +ops.add_flex_arithmetic_methods(SparseDataFrame, use_numexpr=False, + **ops.frame_flex_funcs) +ops.add_special_arithmetic_methods(SparseDataFrame, use_numexpr=False, + **ops.frame_special_funcs) diff --git a/pandas/sparse/list.py b/pandas/sparse/list.py new file mode 100644 index 00000000..bfc4ab9d --- /dev/null +++ b/pandas/sparse/list.py @@ -0,0 +1,142 @@ +import numpy as np +from pandas.core.base import PandasObject +from pandas.core.common import pprint_thing + +from pandas.sparse.array import SparseArray +import pandas._sparse as splib + + +class SparseList(PandasObject): + + """ + Data structure for accumulating data to be converted into a + SparseArray. 
Has similar API to the standard Python list + + Parameters + ---------- + data : scalar or array-like + fill_value : scalar, default NaN + """ + + def __init__(self, data=None, fill_value=np.nan): + self.fill_value = fill_value + self._chunks = [] + + if data is not None: + self.append(data) + + def __unicode__(self): + contents = '\n'.join(repr(c) for c in self._chunks) + return '%s\n%s' % (object.__repr__(self), pprint_thing(contents)) + + def __len__(self): + return sum(len(c) for c in self._chunks) + + def __getitem__(self, i): + if i < 0: + if i + len(self) < 0: # pragma: no cover + raise ValueError('%d out of range' % i) + i += len(self) + + passed = 0 + j = 0 + while i >= passed + len(self._chunks[j]): + passed += len(self._chunks[j]) + j += 1 + return self._chunks[j][i - passed] + + def __setitem__(self, i, value): + raise NotImplementedError + + @property + def nchunks(self): + return len(self._chunks) + + @property + def is_consolidated(self): + return self.nchunks == 1 + + def consolidate(self, inplace=True): + """ + Internally consolidate chunks of data + + Parameters + ---------- + inplace : boolean, default True + Modify the calling object instead of constructing a new one + + Returns + ------- + splist : SparseList + If inplace=False, new object, otherwise reference to existing + object + """ + if not inplace: + result = self.copy() + else: + result = self + + if result.is_consolidated: + return result + + result._consolidate_inplace() + return result + + def _consolidate_inplace(self): + new_values = np.concatenate([c.sp_values for c in self._chunks]) + new_index = _concat_sparse_indexes([c.sp_index for c in self._chunks]) + new_arr = SparseArray(new_values, sparse_index=new_index, + fill_value=self.fill_value) + self._chunks = [new_arr] + + def copy(self): + """ + Return copy of the list + + Returns + ------- + new_list : SparseList + """ + new_splist = SparseList(fill_value=self.fill_value) + new_splist._chunks = list(self._chunks) + return new_splist + + def to_array(self): + """ + Return SparseArray from data stored in the SparseList + + Returns + ------- + sparr : SparseArray + """ + self.consolidate(inplace=True) + return self._chunks[0] + + def append(self, value): + """ + Append element or array-like chunk of data to the SparseList + + Parameters + ---------- + value: scalar or array-like + """ + if np.isscalar(value): + value = [value] + + sparr = SparseArray(value, fill_value=self.fill_value) + self._chunks.append(sparr) + self._consolidated = False + + +def _concat_sparse_indexes(indexes): + all_indices = [] + total_length = 0 + + for index in indexes: + # increment by offset + inds = index.to_int_index().indices + total_length + + all_indices.append(inds) + total_length += index.length + + return splib.IntIndex(total_length, np.concatenate(all_indices)) diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py new file mode 100644 index 00000000..20bbc58c --- /dev/null +++ b/pandas/sparse/panel.py @@ -0,0 +1,555 @@ +""" +Data structures for sparse float data. 
Life is made simpler by dealing only +with float64 data +""" + +# pylint: disable=E1101,E1103,W0231 + +from pandas.compat import range, lrange, zip +from pandas import compat +import numpy as np + +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.frame import DataFrame +from pandas.core.panel import Panel +from pandas.sparse.frame import SparseDataFrame +from pandas.util.decorators import deprecate + +import pandas.core.common as com +import pandas.core.ops as ops + + +class SparsePanelAxis(object): + + def __init__(self, cache_field, frame_attr): + self.cache_field = cache_field + self.frame_attr = frame_attr + + def __get__(self, obj, type=None): + return getattr(obj, self.cache_field, None) + + def __set__(self, obj, value): + value = _ensure_index(value) + + if isinstance(value, MultiIndex): + raise NotImplementedError + + for v in compat.itervalues(obj._frames): + setattr(v, self.frame_attr, value) + + setattr(obj, self.cache_field, value) + + +class SparsePanel(Panel): + + """ + Sparse version of Panel + + Parameters + ---------- + frames : dict of DataFrame objects + items : array-like + major_axis : array-like + minor_axis : array-like + default_kind : {'block', 'integer'}, default 'block' + Default sparse kind for converting Series to SparseSeries. Will not + override SparseSeries passed into constructor + default_fill_value : float + Default fill_value for converting Series to SparseSeries. Will not + override SparseSeries passed in + + Notes + ----- + """ + ndim = 3 + _typ = 'panel' + _subtyp = 'sparse_panel' + + def __init__(self, frames, items=None, major_axis=None, minor_axis=None, + default_fill_value=np.nan, default_kind='block', + copy=False): + if isinstance(frames, np.ndarray): + new_frames = {} + for item, vals in zip(items, frames): + new_frames[item] = \ + SparseDataFrame(vals, index=major_axis, + columns=minor_axis, + default_fill_value=default_fill_value, + default_kind=default_kind) + frames = new_frames + + if not isinstance(frames, dict): + raise TypeError('input must be a dict, a %r was passed' % + type(frames).__name__) + + self.default_fill_value = fill_value = default_fill_value + self.default_kind = kind = default_kind + + # pre-filter, if necessary + if items is None: + items = Index(sorted(frames.keys())) + items = _ensure_index(items) + + (clean_frames, + major_axis, + minor_axis) = _convert_frames(frames, major_axis, + minor_axis, kind=kind, + fill_value=fill_value) + + self._frames = clean_frames + + # do we want to fill missing ones? 
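+        # (items named in ``items`` but absent from ``clean_frames`` are not
+        # filled in; the loop below raises ValueError for any such item)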
+ for item in items: + if item not in clean_frames: + raise ValueError('column %r not found in data' % item) + + self._items = items + self.major_axis = major_axis + self.minor_axis = minor_axis + + def _consolidate_inplace(self): # pragma: no cover + # do nothing when DataFrame calls this method + pass + + def __array_wrap__(self, result): + return SparsePanel(result, items=self.items, + major_axis=self.major_axis, + minor_axis=self.minor_axis, + default_kind=self.default_kind, + default_fill_value=self.default_fill_value) + + @classmethod + def from_dict(cls, data): + """ + Analogous to Panel.from_dict + """ + return SparsePanel(data) + + def to_dense(self): + """ + Convert SparsePanel to (dense) Panel + + Returns + ------- + dense : Panel + """ + return Panel(self.values, self.items, self.major_axis, + self.minor_axis) + + def as_matrix(self): + return self.values + + @property + def values(self): + # return dense values + return np.array([self._frames[item].values + for item in self.items]) + + # need a special property for items to make the field assignable + + _items = None + + def _get_items(self): + return self._items + + def _set_items(self, new_items): + new_items = _ensure_index(new_items) + if isinstance(new_items, MultiIndex): + raise NotImplementedError + + # need to create new frames dict + + old_frame_dict = self._frames + old_items = self._items + self._frames = dict((new_k, old_frame_dict[old_k]) + for new_k, old_k in zip(new_items, old_items)) + self._items = new_items + items = property(fget=_get_items, fset=_set_items) + + # DataFrame's index + major_axis = SparsePanelAxis('_major_axis', 'index') + + # DataFrame's columns / "items" + minor_axis = SparsePanelAxis('_minor_axis', 'columns') + + def _ixs(self, i, axis=0): + """ + for compat as we don't support Block Manager here + i : int, slice, or sequence of integers + axis : int + """ + + key = self._get_axis(axis)[i] + + # xs cannot handle a non-scalar key, so just reindex here + if com.is_list_like(key): + return self.reindex(**{self._get_axis_name(axis): key}) + + return self.xs(key, axis=axis) + + def _slice(self, slobj, axis=0, typ=None): + """ + for compat as we don't support Block Manager here + """ + axis = self._get_axis_name(axis) + index = self._get_axis(axis) + + return self.reindex(**{axis: index[slobj]}) + + def _get_item_cache(self, key): + return self._frames[key] + + def __setitem__(self, key, value): + if isinstance(value, DataFrame): + value = value.reindex(index=self.major_axis, + columns=self.minor_axis) + if not isinstance(value, SparseDataFrame): + value = value.to_sparse(fill_value=self.default_fill_value, + kind=self.default_kind) + else: + raise ValueError('only DataFrame objects can be set currently') + + self._frames[key] = value + + if key not in self.items: + self._items = Index(list(self.items) + [key]) + + def set_value(self, item, major, minor, value): + """ + Quickly set single value at (item, major, minor) location + + Parameters + ---------- + item : item label (panel item) + major : major axis label (panel item row) + minor : minor axis label (panel item column) + value : scalar + + Notes + ----- + This method *always* returns a new object. 
It is not particularly + efficient but is provided for API compatibility with Panel + + Returns + ------- + panel : SparsePanel + """ + dense = self.to_dense().set_value(item, major, minor, value) + return dense.to_sparse(kind=self.default_kind, + fill_value=self.default_fill_value) + + def __delitem__(self, key): + loc = self.items.get_loc(key) + indices = lrange(loc) + lrange(loc + 1, len(self.items)) + del self._frames[key] + self._items = self._items.take(indices) + + def __getstate__(self): + # pickling + return (self._frames, com._pickle_array(self.items), + com._pickle_array(self.major_axis), + com._pickle_array(self.minor_axis), + self.default_fill_value, self.default_kind) + + def __setstate__(self, state): + frames, items, major, minor, fv, kind = state + + self.default_fill_value = fv + self.default_kind = kind + self._items = _ensure_index(com._unpickle_array(items)) + self._major_axis = _ensure_index(com._unpickle_array(major)) + self._minor_axis = _ensure_index(com._unpickle_array(minor)) + self._frames = frames + + def copy(self, deep=True): + """ + Make a copy of the sparse panel + + Returns + ------- + copy : SparsePanel + """ + + d = self._construct_axes_dict() + if deep: + new_data = dict((k, v.copy(deep=True)) for k, v in compat.iteritems(self._frames)) + d = dict((k, v.copy(deep=True)) for k, v in compat.iteritems(d)) + else: + new_data = self._frames.copy() + d['default_fill_value']=self.default_fill_value + d['default_kind']=self.default_kind + + return SparsePanel(new_data, **d) + + def to_frame(self, filter_observations=True): + """ + Convert SparsePanel to (dense) DataFrame + + Returns + ------- + frame : DataFrame + """ + if not filter_observations: + raise TypeError('filter_observations=False not supported for ' + 'SparsePanel.to_long') + + I, N, K = self.shape + counts = np.zeros(N * K, dtype=int) + + d_values = {} + d_indexer = {} + + for item in self.items: + frame = self[item] + + values, major, minor = _stack_sparse_info(frame) + + # values are stacked column-major + indexer = minor * N + major + counts.put(indexer, counts.take(indexer) + 1) # cuteness + + d_values[item] = values + d_indexer[item] = indexer + + # have full set of observations for each item + mask = counts == I + + # for each item, take mask values at index locations for those sparse + # values, and use that to select values + values = np.column_stack([d_values[item][mask.take(d_indexer[item])] + for item in self.items]) + + inds, = mask.nonzero() + + # still column major + major_labels = inds % N + minor_labels = inds // N + + index = MultiIndex(levels=[self.major_axis, self.minor_axis], + labels=[major_labels, minor_labels], + verify_integrity=False) + + df = DataFrame(values, index=index, columns=self.items) + return df.sortlevel(level=0) + + to_long = deprecate('to_long', to_frame) + toLong = deprecate('toLong', to_frame) + + def reindex(self, major=None, items=None, minor=None, major_axis=None, + minor_axis=None, copy=False): + """ + Conform / reshape panel axis labels to new input labels + + Parameters + ---------- + major : array-like, default None + items : array-like, default None + minor : array-like, default None + copy : boolean, default False + Copy underlying SparseDataFrame objects + + Returns + ------- + reindexed : SparsePanel + """ + major = com._mut_exclusive(major=major, major_axis=major_axis) + minor = com._mut_exclusive(minor=minor, minor_axis=minor_axis) + + if com._all_none(items, major, minor): + raise ValueError('Must specify at least one axis') + + major = 
self.major_axis if major is None else major + minor = self.minor_axis if minor is None else minor + + if items is not None: + new_frames = {} + for item in items: + if item in self._frames: + new_frames[item] = self._frames[item] + else: + raise NotImplementedError('Reindexing with new items not yet ' + 'supported') + else: + new_frames = self._frames + + if copy: + new_frames = dict((k, v.copy()) for k, v in compat.iteritems(new_frames)) + + return SparsePanel(new_frames, items=items, + major_axis=major, + minor_axis=minor, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + def _combine(self, other, func, axis=0): + if isinstance(other, DataFrame): + return self._combineFrame(other, func, axis=axis) + elif isinstance(other, Panel): + return self._combinePanel(other, func) + elif np.isscalar(other): + new_frames = dict((k, func(v, other)) + for k, v in compat.iteritems(self)) + return self._new_like(new_frames) + + def _combineFrame(self, other, func, axis=0): + index, columns = self._get_plane_axes(axis) + axis = self._get_axis_number(axis) + + other = other.reindex(index=index, columns=columns) + + if axis == 0: + new_values = func(self.values, other.values) + elif axis == 1: + new_values = func(self.values.swapaxes(0, 1), other.values.T) + new_values = new_values.swapaxes(0, 1) + elif axis == 2: + new_values = func(self.values.swapaxes(0, 2), other.values) + new_values = new_values.swapaxes(0, 2) + + # TODO: make faster! + new_frames = {} + for item, item_slice in zip(self.items, new_values): + old_frame = self[item] + ofv = old_frame.default_fill_value + ok = old_frame.default_kind + new_frames[item] = SparseDataFrame(item_slice, + index=self.major_axis, + columns=self.minor_axis, + default_fill_value=ofv, + default_kind=ok) + + return self._new_like(new_frames) + + def _new_like(self, new_frames): + return SparsePanel(new_frames, self.items, self.major_axis, + self.minor_axis, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + def _combinePanel(self, other, func): + items = self.items + other.items + major = self.major_axis + other.major_axis + minor = self.minor_axis + other.minor_axis + + # could check that everything's the same size, but forget it + + this = self.reindex(items=items, major=major, minor=minor) + other = other.reindex(items=items, major=major, minor=minor) + + new_frames = {} + for item in items: + new_frames[item] = func(this[item], other[item]) + + if not isinstance(other, SparsePanel): + new_default_fill = self.default_fill_value + else: + # maybe unnecessary + new_default_fill = func(self.default_fill_value, + other.default_fill_value) + + return SparsePanel(new_frames, items, major, minor, + default_fill_value=new_default_fill, + default_kind=self.default_kind) + + def major_xs(self, key): + """ + Return slice of panel along major axis + + Parameters + ---------- + key : object + Major axis label + + Returns + ------- + y : DataFrame + index -> minor axis, columns -> items + """ + slices = dict((k, v.xs(key)) for k, v in compat.iteritems(self)) + return DataFrame(slices, index=self.minor_axis, columns=self.items) + + def minor_xs(self, key): + """ + Return slice of panel along minor axis + + Parameters + ---------- + key : object + Minor axis label + + Returns + ------- + y : SparseDataFrame + index -> major axis, columns -> items + """ + slices = dict((k, v[key]) for k, v in compat.iteritems(self)) + return SparseDataFrame(slices, index=self.major_axis, + columns=self.items, + 
default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + # TODO: allow SparsePanel to work with flex arithmetic. + # pow and mod only work for scalars for now + def pow(self, val, *args, **kwargs): + """wrapper around `__pow__` (only works for scalar values)""" + return self.__pow__(val) + + def mod(self, val, *args, **kwargs): + """wrapper around `__mod__` (only works for scalar values""" + return self.__mod__(val) + +# Sparse objects opt out of numexpr +SparsePanel._add_aggregate_operations(use_numexpr=False) +ops.add_special_arithmetic_methods(SparsePanel, use_numexpr=False, **ops.panel_special_funcs) +SparseWidePanel = SparsePanel + + +def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'): + from pandas.core.panel import _get_combined_index + output = {} + for item, df in compat.iteritems(frames): + if not isinstance(df, SparseDataFrame): + df = SparseDataFrame(df, default_kind=kind, + default_fill_value=fill_value) + + output[item] = df + + if index is None: + all_indexes = [df.index for df in output.values()] + index = _get_combined_index(all_indexes) + if columns is None: + all_columns = [df.columns for df in output.values()] + columns = _get_combined_index(all_columns) + + index = _ensure_index(index) + columns = _ensure_index(columns) + + for item, df in compat.iteritems(output): + if not (df.index.equals(index) and df.columns.equals(columns)): + output[item] = df.reindex(index=index, columns=columns) + + return output, index, columns + + +def _stack_sparse_info(frame): + lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)] + + # this is pretty fast + minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) + + inds_to_concat = [] + vals_to_concat = [] + for col in frame.columns: + series = frame[col] + + if not np.isnan(series.fill_value): + raise TypeError('This routine assumes NaN fill value') + + int_index = series.sp_index.to_int_index() + inds_to_concat.append(int_index.indices) + vals_to_concat.append(series.sp_values) + + major_labels = np.concatenate(inds_to_concat) + sparse_values = np.concatenate(vals_to_concat) + + return sparse_values, major_labels, minor_labels diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py new file mode 100644 index 00000000..48576266 --- /dev/null +++ b/pandas/sparse/series.py @@ -0,0 +1,660 @@ +""" +Data structures for sparse float data. 
Life is made simpler by dealing only +with float64 data +""" + +# pylint: disable=E1101,E1103,W0231 + +from numpy import nan, ndarray +import numpy as np + +import operator + +from pandas.core.common import isnull, _values_from_object, _maybe_match_name +from pandas.core.index import Index, _ensure_index +from pandas.core.series import Series +from pandas.core.frame import DataFrame +from pandas.core.internals import SingleBlockManager +from pandas.core import generic +import pandas.core.common as com +import pandas.core.ops as ops +import pandas.core.datetools as datetools +import pandas.index as _index + +from pandas import compat + +from pandas.sparse.array import (make_sparse, _sparse_array_op, SparseArray) +from pandas._sparse import BlockIndex, IntIndex +import pandas._sparse as splib + +from pandas.util.decorators import Appender + +#------------------------------------------------------------------------------ +# Wrapper function for Series arithmetic methods + + +def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None, + **eval_kwargs): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + + str_rep, default_axis, fill_zeros and eval_kwargs are not used, but are present + for compatibility. + """ + + def wrapper(self, other): + if isinstance(other, Series): + if not isinstance(other, SparseSeries): + other = other.to_sparse(fill_value=self.fill_value) + return _sparse_series_op(self, other, op, name) + elif isinstance(other, DataFrame): + return NotImplemented + elif np.isscalar(other): + if isnull(other) or isnull(self.fill_value): + new_fill_value = np.nan + else: + new_fill_value = op(np.float64(self.fill_value), + np.float64(other)) + + return SparseSeries(op(self.sp_values, other), + index=self.index, + sparse_index=self.sp_index, + fill_value=new_fill_value, + name=self.name) + else: # pragma: no cover + raise TypeError('operation with %s not supported' % type(other)) + + wrapper.__name__ = name + if name.startswith("__"): + # strip special method names, e.g. `__add__` needs to be `add` when passed + # to _sparse_series_op + name = name[2:-2] + return wrapper + + +def _sparse_series_op(left, right, op, name): + left, right = left.align(right, join='outer', copy=False) + new_index = left.index + new_name = _maybe_match_name(left, right) + + result = _sparse_array_op(left, right, op, name) + return SparseSeries(result, index=new_index, name=new_name) + + +class SparseSeries(Series): + + """Data structure for labeled, sparse floating point data + + Parameters + ---------- + data : {array-like, Series, SparseSeries, dict} + kind : {'block', 'integer'} + fill_value : float + Defaults to NaN (code for missing) + sparse_index : {BlockIndex, IntIndex}, optional + Only if you have one. Mainly used internally + + Notes + ----- + SparseSeries objects are immutable via the typical Python means. 
If you + must change values, convert to dense, make your changes, then convert back + to sparse + """ + _subtyp = 'sparse_series' + + def __init__(self, data, index=None, sparse_index=None, kind='block', + fill_value=None, name=None, dtype=None, copy=False, + fastpath=False): + + # we are called internally, so short-circuit + if fastpath: + + # data is an ndarray, index is defined + data = SingleBlockManager(data, index, fastpath=True) + if copy: + data = data.copy() + else: + + is_sparse_array = isinstance(data, SparseArray) + if fill_value is None: + if is_sparse_array: + fill_value = data.fill_value + else: + fill_value = nan + + if is_sparse_array: + if isinstance(data, SparseSeries) and index is None: + index = data.index.view() + elif index is not None: + assert(len(index) == len(data)) + + sparse_index = data.sp_index + data = np.asarray(data) + + elif isinstance(data, SparseSeries): + if index is None: + index = data.index.view() + + # extract the SingleBlockManager + data = data._data + + elif isinstance(data, (Series, dict)): + if index is None: + index = data.index.view() + + data = Series(data) + data, sparse_index = make_sparse(data, kind=kind, + fill_value=fill_value) + + elif isinstance(data, (tuple, list, np.ndarray)): + # array-like + if sparse_index is None: + data, sparse_index = make_sparse(data, kind=kind, + fill_value=fill_value) + else: + assert(len(data) == sparse_index.npoints) + + elif isinstance(data, SingleBlockManager): + if dtype is not None: + data = data.astype(dtype) + if index is None: + index = data.index.view() + else: + data = data.reindex(index, copy=False) + + else: + + length = len(index) + + if data == fill_value or (isnull(data) + and isnull(fill_value)): + if kind == 'block': + sparse_index = BlockIndex(length, [], []) + else: + sparse_index = IntIndex(length, []) + data = np.array([]) + + else: + if kind == 'block': + locs, lens = ([0], [length]) if length else ([], []) + sparse_index = BlockIndex(length, locs, lens) + else: + sparse_index = IntIndex(length, index) + v = data + data = np.empty(length) + data.fill(v) + + if index is None: + index = com._default_index(sparse_index.length) + index = _ensure_index(index) + + # create/copy the manager + if isinstance(data, SingleBlockManager): + + if copy: + data = data.copy() + else: + + # create a sparse array + if not isinstance(data, SparseArray): + data = SparseArray( + data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype, copy=copy) + + data = SingleBlockManager(data, index) + + generic.NDFrame.__init__(self, data) + + self.index = index + self.name = name + + @property + def values(self): + """ return the array """ + return self._data._values + + def get_values(self): + """ same as values """ + return self._data._values.to_dense().view() + + @property + def block(self): + return self._data._block + + @property + def fill_value(self): + return self.block.fill_value + + @fill_value.setter + def fill_value(self, v): + self.block.fill_value = v + + @property + def sp_index(self): + return self.block.sp_index + + @property + def sp_values(self): + return self.values.sp_values + + @property + def npoints(self): + return self.sp_index.npoints + + @classmethod + def from_array(cls, arr, index=None, name=None, copy=False, fill_value=None, fastpath=False): + """ + Simplified alternate constructor + """ + return cls(arr, index=index, name=name, copy=copy, fill_value=fill_value, fastpath=fastpath) + + @property + def _constructor(self): + return SparseSeries + + @property + def kind(self): + 
if isinstance(self.sp_index, BlockIndex): + return 'block' + elif isinstance(self.sp_index, IntIndex): + return 'integer' + + def as_sparse_array(self, kind=None, fill_value=None, copy=False): + """ return my self as a sparse array, do not copy by default """ + + if fill_value is None: + fill_value = self.fill_value + if kind is None: + kind = self.kind + return SparseArray(self.values, + sparse_index=self.sp_index, + fill_value=fill_value, + kind=kind, + copy=copy) + + def __len__(self): + return len(self.block) + + def __unicode__(self): + # currently, unicode is same as repr...fixes infinite loop + series_rep = Series.__unicode__(self) + rep = '%s\n%s' % (series_rep, repr(self.sp_index)) + return rep + + def __array_wrap__(self, result): + """ + Gets called prior to a ufunc (and after) + """ + return self._constructor(result, + index=self.index, + sparse_index=self.sp_index, + fill_value=self.fill_value, + copy=False).__finalize__(self) + + def __array_finalize__(self, obj): + """ + Gets called after any ufunc or other array operations, necessary + to pass on the index. + """ + self.name = getattr(obj, 'name', None) + self.fill_value = getattr(obj, 'fill_value', None) + + def __getstate__(self): + # pickling + return dict(_typ=self._typ, + _subtyp=self._subtyp, + _data=self._data, + fill_value=self.fill_value, + name=self.name) + + def _unpickle_series_compat(self, state): + + nd_state, own_state = state + + # recreate the ndarray + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + + index, fill_value, sp_index = own_state[:3] + name = None + if len(own_state) > 3: + name = own_state[3] + + # create a sparse array + if not isinstance(data, SparseArray): + data = SparseArray( + data, sparse_index=sp_index, fill_value=fill_value, copy=False) + + # recreate + data = SingleBlockManager(data, index, fastpath=True) + generic.NDFrame.__init__(self, data) + + self._set_axis(0, index) + self.name = name + + def __iter__(self): + """ forward to the array """ + return iter(self.values) + + def _set_subtyp(self, is_all_dates): + if is_all_dates: + object.__setattr__(self, '_subtyp', 'sparse_time_series') + else: + object.__setattr__(self, '_subtyp', 'sparse_series') + + def _get_val_at(self, loc): + """ forward to the array """ + return self.block.values._get_val_at(loc) + + def __getitem__(self, key): + """ + + """ + try: + return self._get_val_at(self.index.get_loc(key)) + + except KeyError: + if isinstance(key, (int, np.integer)): + return self._get_val_at(key) + raise Exception('Requested index not in this series!') + + except TypeError: + # Could not hash item, must be array-like? + pass + + # is there a case where this would NOT be an ndarray? + # need to find an example, I took out the case for now + + key = _values_from_object(key) + dataSlice = self.values[key] + new_index = Index(self.index.view(ndarray)[key]) + return self._constructor(dataSlice, index=new_index).__finalize__(self) + + def _set_with_engine(self, key, value): + return self.set_value(key, value) + + def abs(self): + """ + Return an object with absolute value taken. Only applicable to objects + that are all numeric + + Returns + ------- + abs: type of caller + """ + res_sp_values = np.abs(self.sp_values) + return self._constructor(res_sp_values, index=self.index, + sparse_index=self.sp_index, + fill_value=self.fill_value) + + def get(self, label, default=None): + """ + Returns value occupying requested label, default to specified + missing value if not present. 
Analogous to dict.get + + Parameters + ---------- + label : object + Label value looking for + default : object, optional + Value to return if label not in index + + Returns + ------- + y : scalar + """ + if label in self.index: + loc = self.index.get_loc(label) + return self._get_val_at(loc) + else: + return default + + def get_value(self, label, takeable=False): + """ + Retrieve single value at passed index label + + Parameters + ---------- + index : label + takeable : interpret the index as indexers, default False + + Returns + ------- + value : scalar value + """ + loc = label if takeable is True else self.index.get_loc(label) + return self._get_val_at(loc) + + def set_value(self, label, value, takeable=False): + """ + Quickly set single value at passed label. If label is not contained, a + new object is created with the label placed at the end of the result + index + + Parameters + ---------- + label : object + Partial indexing with MultiIndex not allowed + value : object + Scalar value + takeable : interpret the index as indexers, default False + + Notes + ----- + This method *always* returns a new object. It is not particularly + efficient but is provided for API compatibility with Series + + Returns + ------- + series : SparseSeries + """ + values = self.to_dense() + + # if the label doesn't exist, we will create a new object here + # and possibily change the index + new_values = values.set_value(label, value, takeable=takeable) + if new_values is not None: + values = new_values + new_index = values.index + values = SparseArray( + values, fill_value=self.fill_value, kind=self.kind) + self._data = SingleBlockManager(values, new_index) + self._index = new_index + + def _set_values(self, key, value): + + # this might be inefficient as we have to recreate the sparse array + # rather than setting individual elements, but have to convert + # the passed slice/boolean that's in dense space into a sparse indexer + # not sure how to do that! + if isinstance(key, Series): + key = key.values + + values = self.values.to_dense() + values[key] = _index.convert_scalar(values, value) + values = SparseArray( + values, fill_value=self.fill_value, kind=self.kind) + self._data = SingleBlockManager(values, self.index) + + def to_dense(self, sparse_only=False): + """ + Convert SparseSeries to (dense) Series + """ + if sparse_only: + int_index = self.sp_index.to_int_index() + index = self.index.take(int_index.indices) + return Series(self.sp_values, index=index, name=self.name) + else: + return Series(self.values.to_dense(), index=self.index, name=self.name) + + @property + def density(self): + r = float(self.sp_index.npoints) / float(self.sp_index.length) + return r + + def copy(self, deep=True): + """ + Make a copy of the SparseSeries. 
Only the actual sparse values need to + be copied + """ + new_data = self._data + if deep: + new_data = self._data.copy() + + return self._constructor(new_data, + sparse_index=self.sp_index, + fill_value=self.fill_value).__finalize__(self) + + def reindex(self, index=None, method=None, copy=True, limit=None): + """ + Conform SparseSeries to new Index + + See Series.reindex docstring for general behavior + + Returns + ------- + reindexed : SparseSeries + """ + new_index = _ensure_index(index) + + if self.index.equals(new_index): + if copy: + return self.copy() + else: + return self + return self._constructor(self._data.reindex(new_index, method=method, limit=limit, copy=copy), + index=new_index).__finalize__(self) + + def sparse_reindex(self, new_index): + """ + Conform sparse values to new SparseIndex + + Parameters + ---------- + new_index : {BlockIndex, IntIndex} + + Returns + ------- + reindexed : SparseSeries + """ + if not isinstance(new_index, splib.SparseIndex): + raise TypeError('new index must be a SparseIndex') + + block = self.block.sparse_reindex(new_index) + new_data = SingleBlockManager(block, self.index) + return self._constructor(new_data, index=self.index, + sparse_index=new_index, + fill_value=self.fill_value).__finalize__(self) + + def take(self, indices, axis=0, convert=True): + """ + Sparse-compatible version of ndarray.take + + Returns + ------- + taken : ndarray + """ + new_values = SparseArray.take(self.values, indices) + new_index = self.index.take(indices) + return self._constructor(new_values, index=new_index).__finalize__(self) + + def cumsum(self, axis=0, dtype=None, out=None): + """ + Cumulative sum of values. Preserves locations of NaN values + + Returns + ------- + cumsum : Series or SparseSeries + """ + new_array = SparseArray.cumsum(self.values) + if isinstance(new_array, SparseArray): + return self._constructor(new_array, index=self.index, sparse_index=new_array.sp_index).__finalize__(self) + return Series(new_array, index=self.index).__finalize__(self) + + def dropna(self, axis=0, inplace=False, **kwargs): + """ + Analogous to Series.dropna. 
If fill_value=NaN, returns a dense Series + """ + # TODO: make more efficient + axis = self._get_axis_number(axis or 0) + dense_valid = self.to_dense().valid() + if inplace: + raise NotImplementedError("Cannot perform inplace dropna" + " operations on a SparseSeries") + if isnull(self.fill_value): + return dense_valid + else: + dense_valid = dense_valid[dense_valid != self.fill_value] + return dense_valid.to_sparse(fill_value=self.fill_value) + + def shift(self, periods, freq=None, **kwds): + """ + Analogous to Series.shift + """ + from pandas.core.datetools import _resolve_offset + + offset = _resolve_offset(freq, kwds) + + # no special handling of fill values yet + if not isnull(self.fill_value): + dense_shifted = self.to_dense().shift(periods, freq=freq, + **kwds) + return dense_shifted.to_sparse(fill_value=self.fill_value, + kind=self.kind) + + if periods == 0: + return self.copy() + + if offset is not None: + return self._constructor(self.sp_values, + sparse_index=self.sp_index, + index=self.index.shift(periods, offset), + fill_value=self.fill_value).__finalize__(self) + + int_index = self.sp_index.to_int_index() + new_indices = int_index.indices + periods + start, end = new_indices.searchsorted([0, int_index.length]) + + new_indices = new_indices[start:end] + + new_sp_index = IntIndex(len(self), new_indices) + if isinstance(self.sp_index, BlockIndex): + new_sp_index = new_sp_index.to_block_index() + + return self._constructor(self.sp_values[start:end].copy(), + index=self.index, + sparse_index=new_sp_index, + fill_value=self.fill_value).__finalize__(self) + + def combine_first(self, other): + """ + Combine Series values, choosing the calling Series's values + first. Result index will be the union of the two indexes + + Parameters + ---------- + other : Series + + Returns + ------- + y : Series + """ + if isinstance(other, SparseSeries): + other = other.to_dense() + + dense_combined = self.to_dense().combine_first(other) + return dense_combined.to_sparse(fill_value=self.fill_value) + +# overwrite series methods with unaccelerated versions +ops.add_special_arithmetic_methods(SparseSeries, use_numexpr=False, + **ops.series_special_funcs) +ops.add_flex_arithmetic_methods(SparseSeries, use_numexpr=False, + **ops.series_flex_funcs) +# overwrite basic arithmetic to use SparseSeries version +# force methods to overwrite previous definitions. 
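+# force=True is needed because the special methods were already bound just
+# above from ``ops.series_special_funcs``; this pass re-binds them to the
+# SparseSeries-aware ``_arith_method`` wrapper defined at the top of this module.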
+ops.add_special_arithmetic_methods(SparseSeries, _arith_method, + radd_func=operator.add, comp_method=None, + bool_method=None, use_numexpr=False, force=True) + +# backwards compatiblity +SparseTimeSeries = SparseSeries diff --git a/pandas/sparse/tests/__init__.py b/pandas/sparse/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py new file mode 100644 index 00000000..a12d1dfe --- /dev/null +++ b/pandas/sparse/tests/test_array.py @@ -0,0 +1,182 @@ +from pandas.compat import range +import re +from numpy import nan, ndarray +import numpy as np + +import operator +import pickle + +from pandas.core.series import Series +from pandas.core.common import notnull +from pandas.sparse.api import SparseArray +from pandas.util.testing import assert_almost_equal, assertRaisesRegexp +import pandas.util.testing as tm + + +def assert_sp_array_equal(left, right): + assert_almost_equal(left.sp_values, right.sp_values) + assert(left.sp_index.equals(right.sp_index)) + if np.isnan(left.fill_value): + assert(np.isnan(right.fill_value)) + else: + assert(left.fill_value == right.fill_value) + + +class TestSparseArray(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.arr_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) + self.arr = SparseArray(self.arr_data) + self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + + def test_get_item(self): + errmsg = re.compile("bounds") + assertRaisesRegexp(IndexError, errmsg, lambda: self.arr[11]) + assertRaisesRegexp(IndexError, errmsg, lambda: self.arr[-11]) + self.assertEqual(self.arr[-1], self.arr[len(self.arr) - 1]) + + def test_bad_take(self): + assertRaisesRegexp(IndexError, "bounds", lambda: self.arr.take(11)) + self.assertRaises(IndexError, lambda: self.arr.take(-11)) + + def test_set_item(self): + def setitem(): + self.arr[5] = 3 + + def setslice(): + self.arr[1:5] = 2 + assertRaisesRegexp(TypeError, "item assignment", setitem) + assertRaisesRegexp(TypeError, "item assignment", setslice) + + def test_constructor_from_sparse(self): + res = SparseArray(self.zarr) + self.assertEqual(res.fill_value, 0) + assert_almost_equal(res.sp_values, self.zarr.sp_values) + + def test_constructor_copy(self): + cp = SparseArray(self.arr, copy=True) + cp.sp_values[:3] = 0 + self.assertFalse((self.arr.sp_values[:3] == 0).any()) + + not_copy = SparseArray(self.arr) + not_copy.sp_values[:3] = 0 + self.assertTrue((self.arr.sp_values[:3] == 0).all()) + + def test_astype(self): + res = self.arr.astype('f8') + res.sp_values[:3] = 27 + self.assertFalse((self.arr.sp_values[:3] == 27).any()) + + assertRaisesRegexp(TypeError, "floating point", self.arr.astype, 'i8') + + def test_copy_shallow(self): + arr2 = self.arr.copy(deep=False) + + def _get_base(values): + base = values.base + while base.base is not None: + base = base.base + return base + + assert(_get_base(arr2) is _get_base(self.arr)) + + def test_values_asarray(self): + assert_almost_equal(self.arr.values, self.arr_data) + assert_almost_equal(self.arr.to_dense(), self.arr_data) + assert_almost_equal(self.arr.sp_values, np.asarray(self.arr)) + + def test_getitem(self): + def _checkit(i): + assert_almost_equal(self.arr[i], self.arr.values[i]) + + for i in range(len(self.arr)): + _checkit(i) + _checkit(-i) + + def test_getslice(self): + result = self.arr[:-3] + exp = SparseArray(self.arr.values[:-3]) + assert_sp_array_equal(result, exp) + + result = self.arr[-4:] + exp = 
SparseArray(self.arr.values[-4:]) + assert_sp_array_equal(result, exp) + + # two corner cases from Series + result = self.arr[-12:] + exp = SparseArray(self.arr) + assert_sp_array_equal(result, exp) + + result = self.arr[:-12] + exp = SparseArray(self.arr.values[:0]) + assert_sp_array_equal(result, exp) + + def test_binary_operators(self): + data1 = np.random.randn(20) + data2 = np.random.randn(20) + data1[::2] = np.nan + data2[::3] = np.nan + + arr1 = SparseArray(data1) + arr2 = SparseArray(data2) + + data1[::2] = 3 + data2[::3] = 3 + farr1 = SparseArray(data1, fill_value=3) + farr2 = SparseArray(data2, fill_value=3) + + def _check_op(op, first, second): + res = op(first, second) + exp = SparseArray(op(first.values, second.values), + fill_value=first.fill_value) + tm.assert_isinstance(res, SparseArray) + assert_almost_equal(res.values, exp.values) + + res2 = op(first, second.values) + tm.assert_isinstance(res2, SparseArray) + assert_sp_array_equal(res, res2) + + res3 = op(first.values, second) + tm.assert_isinstance(res3, SparseArray) + assert_sp_array_equal(res, res3) + + res4 = op(first, 4) + tm.assert_isinstance(res4, SparseArray) + + # ignore this if the actual op raises (e.g. pow) + try: + exp = op(first.values, 4) + exp_fv = op(first.fill_value, 4) + assert_almost_equal(res4.fill_value, exp_fv) + assert_almost_equal(res4.values, exp) + except (ValueError) : + pass + + def _check_inplace_op(op): + tmp = arr1.copy() + self.assertRaises(NotImplementedError, op, tmp, arr2) + + bin_ops = [operator.add, operator.sub, operator.mul, operator.truediv, + operator.floordiv, operator.pow] + for op in bin_ops: + _check_op(op, arr1, arr2) + _check_op(op, farr1, farr2) + + inplace_ops = ['iadd', 'isub', 'imul', 'itruediv', 'ifloordiv', 'ipow'] + for op in inplace_ops: + _check_inplace_op(getattr(operator, op)) + + def test_pickle(self): + def _check_roundtrip(obj): + pickled = pickle.dumps(obj) + unpickled = pickle.loads(pickled) + assert_sp_array_equal(unpickled, obj) + + _check_roundtrip(self.arr) + _check_roundtrip(self.zarr) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py new file mode 100644 index 00000000..cd68d264 --- /dev/null +++ b/pandas/sparse/tests/test_libsparse.py @@ -0,0 +1,398 @@ +from pandas import Series + +import nose +from numpy import nan +import numpy as np +import operator +from numpy.testing import assert_almost_equal, assert_equal +import pandas.util.testing as tm + +from pandas.core.sparse import SparseSeries +from pandas import DataFrame + +from pandas._sparse import IntIndex, BlockIndex +import pandas._sparse as splib + +TEST_LENGTH = 20 + +plain_case = dict(xloc=[0, 7, 15], + xlen=[3, 5, 5], + yloc=[2, 9, 14], + ylen=[2, 3, 5], + intersect_loc=[2, 9, 15], + intersect_len=[1, 3, 4]) +delete_blocks = dict(xloc=[0, 5], + xlen=[4, 4], + yloc=[1], + ylen=[4], + intersect_loc=[1], + intersect_len=[3]) +split_blocks = dict(xloc=[0], + xlen=[10], + yloc=[0, 5], + ylen=[3, 7], + intersect_loc=[0, 5], + intersect_len=[3, 5]) +skip_block = dict(xloc=[10], + xlen=[5], + yloc=[0, 12], + ylen=[5, 3], + intersect_loc=[12], + intersect_len=[3]) + +no_intersect = dict(xloc=[0, 10], + xlen=[4, 6], + yloc=[5, 17], + ylen=[4, 2], + intersect_loc=[], + intersect_len=[]) + + +def check_cases(_check_case): + def _check_case_dict(case): + _check_case(case['xloc'], case['xlen'], case['yloc'], case['ylen'], + 
case['intersect_loc'], case['intersect_len']) + + _check_case_dict(plain_case) + _check_case_dict(delete_blocks) + _check_case_dict(split_blocks) + _check_case_dict(skip_block) + _check_case_dict(no_intersect) + + # one or both is empty + _check_case([0], [5], [], [], [], []) + _check_case([], [], [], [], [], []) + + +def test_index_make_union(): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + bresult = xindex.make_union(yindex) + assert(isinstance(bresult, BlockIndex)) + assert_equal(bresult.blocs, eloc) + assert_equal(bresult.blengths, elen) + + ixindex = xindex.to_int_index() + iyindex = yindex.to_int_index() + iresult = ixindex.make_union(iyindex) + assert(isinstance(iresult, IntIndex)) + assert_equal(iresult.indices, bresult.to_int_index().indices) + + """ + x: ---- + y: ---- + r: -------- + """ + xloc = [0] + xlen = [5] + yloc = [5] + ylen = [4] + eloc = [0] + elen = [9] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ----- ----- + y: ----- -- + """ + xloc = [0, 10] + xlen = [5, 5] + yloc = [2, 17] + ylen = [5, 2] + eloc = [0, 10, 17] + elen = [7, 5, 2] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ------ + y: ------- + r: ---------- + """ + xloc = [1] + xlen = [5] + yloc = [3] + ylen = [5] + eloc = [1] + elen = [7] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ------ ----- + y: ------- + r: ------------- + """ + xloc = [2, 10] + xlen = [4, 4] + yloc = [4] + ylen = [8] + eloc = [2] + elen = [12] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: --- ----- + y: ------- + r: ------------- + """ + xloc = [0, 5] + xlen = [3, 5] + yloc = [0] + ylen = [7] + eloc = [0] + elen = [10] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ------ ----- + y: ------- --- + r: ------------- + """ + xloc = [2, 10] + xlen = [4, 4] + yloc = [4, 13] + ylen = [8, 4] + eloc = [2] + elen = [15] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ---------------------- + y: ---- ---- --- + r: ---------------------- + """ + xloc = [2] + xlen = [15] + yloc = [4, 9, 14] + ylen = [3, 2, 2] + eloc = [2] + elen = [15] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ---- --- + y: --- --- + """ + xloc = [0, 10] + xlen = [3, 3] + yloc = [5, 15] + ylen = [2, 2] + eloc = [0, 5, 10, 15] + elen = [3, 2, 3, 2] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + # TODO: different-length index objects + + +def test_lookup(): + + def _check(index): + assert(index.lookup(0) == -1) + assert(index.lookup(5) == 0) + assert(index.lookup(7) == 2) + assert(index.lookup(8) == -1) + assert(index.lookup(9) == -1) + assert(index.lookup(10) == -1) + assert(index.lookup(11) == -1) + assert(index.lookup(12) == 3) + assert(index.lookup(17) == 8) + assert(index.lookup(18) == -1) + + bindex = BlockIndex(20, [5, 12], [3, 6]) + iindex = bindex.to_int_index() + + _check(bindex) + _check(iindex) + + # corner cases + + +def test_intersect(): + def _check_correct(a, b, expected): + result = a.intersect(b) + assert(result.equals(expected)) + + def _check_length_exc(a, longer): + nose.tools.assert_raises(Exception, a.intersect, longer) + + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + expected = BlockIndex(TEST_LENGTH, eloc, elen) + longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen) + + _check_correct(xindex, yindex, expected) + 
_check_correct(xindex.to_int_index(), + yindex.to_int_index(), + expected.to_int_index()) + + _check_length_exc(xindex, longer_index) + _check_length_exc(xindex.to_int_index(), + longer_index.to_int_index()) + + check_cases(_check_case) + + +class TestBlockIndex(tm.TestCase): + + def test_equals(self): + index = BlockIndex(10, [0, 4], [2, 5]) + + self.assertTrue(index.equals(index)) + self.assertFalse(index.equals(BlockIndex(10, [0, 4], [2, 6]))) + + def test_check_integrity(self): + locs = [] + lengths = [] + + # 0-length OK + index = BlockIndex(0, locs, lengths) + + # also OK even though empty + index = BlockIndex(1, locs, lengths) + + # block extend beyond end + self.assertRaises(Exception, BlockIndex, 10, [5], [10]) + + # block overlap + self.assertRaises(Exception, BlockIndex, 10, [2, 5], [5, 3]) + + def test_to_int_index(self): + locs = [0, 10] + lengths = [4, 6] + exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15] + + block = BlockIndex(20, locs, lengths) + dense = block.to_int_index() + + assert_equal(dense.indices, exp_inds) + + def test_to_block_index(self): + index = BlockIndex(10, [0, 5], [4, 5]) + self.assertIs(index.to_block_index(), index) + + +class TestIntIndex(tm.TestCase): + + def test_equals(self): + index = IntIndex(10, [0, 1, 2, 3, 4]) + self.assertTrue(index.equals(index)) + self.assertFalse(index.equals(IntIndex(10, [0, 1, 2, 3]))) + + def test_to_block_index(self): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + + # see if survive the round trip + xbindex = xindex.to_int_index().to_block_index() + ybindex = yindex.to_int_index().to_block_index() + tm.assert_isinstance(xbindex, BlockIndex) + self.assertTrue(xbindex.equals(xindex)) + self.assertTrue(ybindex.equals(yindex)) + check_cases(_check_case) + + def test_to_int_index(self): + index = IntIndex(10, [2, 3, 4, 5, 6]) + self.assertIs(index.to_int_index(), index) + + +class TestSparseOperators(tm.TestCase): + + def _nan_op_tests(self, sparse_op, python_op): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + + xdindex = xindex.to_int_index() + ydindex = yindex.to_int_index() + + x = np.arange(xindex.npoints) * 10. + 1 + y = np.arange(yindex.npoints) * 100. + 1 + + result_block_vals, rb_index = sparse_op(x, xindex, y, yindex) + result_int_vals, ri_index = sparse_op(x, xdindex, y, ydindex) + + self.assertTrue(rb_index.to_int_index().equals(ri_index)) + assert_equal(result_block_vals, result_int_vals) + + # check versus Series... + xseries = Series(x, xdindex.indices) + yseries = Series(y, ydindex.indices) + series_result = python_op(xseries, yseries).valid() + assert_equal(result_block_vals, series_result.values) + assert_equal(result_int_vals, series_result.values) + + check_cases(_check_case) + + def _op_tests(self, sparse_op, python_op): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + + xdindex = xindex.to_int_index() + ydindex = yindex.to_int_index() + + x = np.arange(xindex.npoints) * 10. + 1 + y = np.arange(yindex.npoints) * 100. 
+ 1 + + xfill = 0 + yfill = 2 + + result_block_vals, rb_index = sparse_op( + x, xindex, xfill, y, yindex, yfill) + result_int_vals, ri_index = sparse_op(x, xdindex, xfill, + y, ydindex, yfill) + + self.assertTrue(rb_index.to_int_index().equals(ri_index)) + assert_equal(result_block_vals, result_int_vals) + + # check versus Series... + xseries = Series(x, xdindex.indices) + xseries = xseries.reindex(np.arange(TEST_LENGTH)).fillna(xfill) + + yseries = Series(y, ydindex.indices) + yseries = yseries.reindex(np.arange(TEST_LENGTH)).fillna(yfill) + + series_result = python_op(xseries, yseries) + series_result = series_result.reindex(ri_index.indices) + + assert_equal(result_block_vals, series_result.values) + assert_equal(result_int_vals, series_result.values) + + check_cases(_check_case) + +# too cute? oh but how I abhor code duplication + +check_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + + +def make_nanoptestf(op): + def f(self): + sparse_op = getattr(splib, 'sparse_nan%s' % op) + python_op = getattr(operator, op) + self._nan_op_tests(sparse_op, python_op) + f.__name__ = 'test_nan%s' % op + return f + + +def make_optestf(op): + def f(self): + sparse_op = getattr(splib, 'sparse_%s' % op) + python_op = getattr(operator, op) + self._op_tests(sparse_op, python_op) + f.__name__ = 'test_%s' % op + return f + +for op in check_ops: + f = make_nanoptestf(op) + g = make_optestf(op) + setattr(TestSparseOperators, f.__name__, f) + setattr(TestSparseOperators, g.__name__, g) + del f + del g + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/sparse/tests/test_list.py b/pandas/sparse/tests/test_list.py new file mode 100644 index 00000000..7b81e483 --- /dev/null +++ b/pandas/sparse/tests/test_list.py @@ -0,0 +1,107 @@ +from pandas.compat import range +import unittest + +from numpy import nan +import numpy as np + +from pandas.sparse.api import SparseList, SparseArray +from pandas.util.testing import assert_almost_equal + +from .test_sparse import assert_sp_array_equal + + +def assert_sp_list_equal(left, right): + assert_sp_array_equal(left.to_array(), right.to_array()) + + +class TestSparseList(unittest.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) + self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6]) + + def test_constructor(self): + lst1 = SparseList(self.na_data[:5]) + exp = SparseList() + exp.append(self.na_data[:5]) + assert_sp_list_equal(lst1, exp) + + def test_len(self): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + self.assertEqual(len(splist), 5) + splist.append(arr[5]) + self.assertEqual(len(splist), 6) + splist.append(arr[6:]) + self.assertEqual(len(splist), 10) + + def test_append_na(self): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + sparr = splist.to_array() + assert_sp_array_equal(sparr, SparseArray(arr)) + + def test_append_zero(self): + arr = self.zero_data + splist = SparseList(fill_value=0) + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + sparr = splist.to_array() + assert_sp_array_equal(sparr, SparseArray(arr, fill_value=0)) + + def test_consolidate(self): + arr = self.na_data + exp_sparr = SparseArray(arr) + + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + consol = splist.consolidate(inplace=False) + 
self.assertEqual(consol.nchunks, 1) + self.assertEqual(splist.nchunks, 3) + assert_sp_array_equal(consol.to_array(), exp_sparr) + + splist.consolidate() + self.assertEqual(splist.nchunks, 1) + assert_sp_array_equal(splist.to_array(), exp_sparr) + + def test_copy(self): + arr = self.na_data + exp_sparr = SparseArray(arr) + + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + + cp = splist.copy() + cp.append(arr[6:]) + self.assertEqual(splist.nchunks, 2) + assert_sp_array_equal(cp.to_array(), exp_sparr) + + def test_getitem(self): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + for i in range(len(arr)): + assert_almost_equal(splist[i], arr[i]) + assert_almost_equal(splist[-i], arr[-i]) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py new file mode 100644 index 00000000..475b8f93 --- /dev/null +++ b/pandas/sparse/tests/test_sparse.py @@ -0,0 +1,1778 @@ +# pylint: disable-msg=E1101,W0612 + +import operator +from datetime import datetime + +import nose + +from numpy import nan +import numpy as np +import pandas as pd +dec = np.testing.dec + +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assert_panel_equal, assertRaisesRegexp) +from numpy.testing import assert_equal + +from pandas import Series, DataFrame, bdate_range, Panel +from pandas.core.datetools import BDay +from pandas.core.index import Index +from pandas.tseries.index import DatetimeIndex +import pandas.core.datetools as datetools +from pandas.core.common import isnull +import pandas.util.testing as tm +from pandas.compat import range, lrange, cPickle as pickle, StringIO, lrange +from pandas import compat + +import pandas.sparse.frame as spf + +from pandas._sparse import BlockIndex, IntIndex +from pandas.sparse.api import (SparseSeries, SparseTimeSeries, + SparseDataFrame, SparsePanel, + SparseArray) + +import pandas.tests.test_frame as test_frame +import pandas.tests.test_panel as test_panel +import pandas.tests.test_series as test_series + +from .test_array import assert_sp_array_equal + +import warnings +warnings.filterwarnings(action='ignore', category=FutureWarning) + + +def _test_data1(): + # nan-based + arr = np.arange(20, dtype=float) + index = np.arange(20) + arr[:2] = nan + arr[5:10] = nan + arr[-3:] = nan + + return arr, index + + +def _test_data2(): + # nan-based + arr = np.arange(15, dtype=float) + index = np.arange(15) + arr[7:12] = nan + arr[-1:] = nan + return arr, index + + +def _test_data1_zero(): + # zero-based + arr, index = _test_data1() + arr[np.isnan(arr)] = 0 + return arr, index + + +def _test_data2_zero(): + # zero-based + arr, index = _test_data2() + arr[np.isnan(arr)] = 0 + return arr, index + + +def assert_sp_series_equal(a, b, exact_indices=True): + assert(a.index.equals(b.index)) + assert_sp_array_equal(a, b) + + +def assert_sp_frame_equal(left, right, exact_indices=True): + """ + exact: Series SparseIndex objects must be exactly the same, otherwise just + compare dense representations + """ + for col, series in compat.iteritems(left): + assert(col in right) + # trade-off? 
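+        # exact_indices=True compares each column's SparseIndex directly;
+        # otherwise only the dense representations are compared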
+ + if exact_indices: + assert_sp_series_equal(series, right[col]) + else: + assert_series_equal(series.to_dense(), right[col].to_dense()) + + assert_almost_equal(left.default_fill_value, + right.default_fill_value) + + # do I care? + # assert(left.default_kind == right.default_kind) + + for col in right: + assert(col in left) + + +def assert_sp_panel_equal(left, right, exact_indices=True): + for item, frame in compat.iteritems(left): + assert(item in right) + # trade-off? + assert_sp_frame_equal(frame, right[item], exact_indices=exact_indices) + + assert_almost_equal(left.default_fill_value, + right.default_fill_value) + assert(left.default_kind == right.default_kind) + + for item in right: + assert(item in left) + + +class TestSparseSeries(tm.TestCase, + test_series.CheckNameIntegration): + _multiprocess_can_split_ = True + + def setUp(self): + arr, index = _test_data1() + + date_index = bdate_range('1/1/2011', periods=len(index)) + + self.bseries = SparseSeries(arr, index=index, kind='block') + self.bseries.name = 'bseries' + + self.ts = self.bseries + + self.btseries = SparseSeries(arr, index=date_index, kind='block') + + self.iseries = SparseSeries(arr, index=index, kind='integer') + + arr, index = _test_data2() + self.bseries2 = SparseSeries(arr, index=index, kind='block') + self.iseries2 = SparseSeries(arr, index=index, kind='integer') + + arr, index = _test_data1_zero() + self.zbseries = SparseSeries(arr, index=index, kind='block', + fill_value=0) + self.ziseries = SparseSeries(arr, index=index, kind='integer', + fill_value=0) + + arr, index = _test_data2_zero() + self.zbseries2 = SparseSeries(arr, index=index, kind='block', + fill_value=0) + self.ziseries2 = SparseSeries(arr, index=index, kind='integer', + fill_value=0) + + def test_iteration_and_str(self): + [x for x in self.bseries] + str(self.bseries) + + def test_construct_DataFrame_with_sp_series(self): + # it works! 
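+        # constructing a dense DataFrame from a dict containing a SparseSeries
+        # should not raise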
+ df = DataFrame({'col': self.bseries}) + + # printing & access + df.iloc[:1] + df['col'] + df.dtypes + str(df) + + assert_sp_series_equal(df['col'], self.bseries) + + # blocking + expected = Series({'col': 'float64:sparse'}) + result = df.ftypes + assert_series_equal(expected, result) + + def test_series_density(self): + # GH2803 + ts = Series(np.random.randn(10)) + ts[2:-2] = nan + sts = ts.to_sparse() + density = sts.density # don't die + self.assertEqual(density, 4 / 10.0) + + def test_sparse_to_dense(self): + arr, index = _test_data1() + series = self.bseries.to_dense() + assert_equal(series, arr) + + series = self.bseries.to_dense(sparse_only=True) + assert_equal(series, arr[np.isfinite(arr)]) + + series = self.iseries.to_dense() + assert_equal(series, arr) + + arr, index = _test_data1_zero() + series = self.zbseries.to_dense() + assert_equal(series, arr) + + series = self.ziseries.to_dense() + assert_equal(series, arr) + + def test_dense_to_sparse(self): + series = self.bseries.to_dense() + bseries = series.to_sparse(kind='block') + iseries = series.to_sparse(kind='integer') + assert_sp_series_equal(bseries, self.bseries) + assert_sp_series_equal(iseries, self.iseries) + + # non-NaN fill value + series = self.zbseries.to_dense() + zbseries = series.to_sparse(kind='block', fill_value=0) + ziseries = series.to_sparse(kind='integer', fill_value=0) + assert_sp_series_equal(zbseries, self.zbseries) + assert_sp_series_equal(ziseries, self.ziseries) + + def test_to_dense_preserve_name(self): + assert(self.bseries.name is not None) + result = self.bseries.to_dense() + self.assertEqual(result.name, self.bseries.name) + + def test_constructor(self): + # test setup guys + self.assertTrue(np.isnan(self.bseries.fill_value)) + tm.assert_isinstance(self.bseries.sp_index, BlockIndex) + self.assertTrue(np.isnan(self.iseries.fill_value)) + tm.assert_isinstance(self.iseries.sp_index, IntIndex) + + self.assertEqual(self.zbseries.fill_value, 0) + assert_equal(self.zbseries.values.values, + self.bseries.to_dense().fillna(0).values) + + # pass SparseSeries + s2 = SparseSeries(self.bseries) + s3 = SparseSeries(self.iseries) + s4 = SparseSeries(self.zbseries) + assert_sp_series_equal(s2, self.bseries) + assert_sp_series_equal(s3, self.iseries) + assert_sp_series_equal(s4, self.zbseries) + + # Sparse time series works + date_index = bdate_range('1/1/2000', periods=len(self.bseries)) + s5 = SparseSeries(self.bseries, index=date_index) + tm.assert_isinstance(s5, SparseTimeSeries) + + # pass Series + bseries2 = SparseSeries(self.bseries.to_dense()) + assert_equal(self.bseries.sp_values, bseries2.sp_values) + + # pass dict? + + # don't copy the data by default + values = np.ones(self.bseries.npoints) + sp = SparseSeries(values, sparse_index=self.bseries.sp_index) + sp.sp_values[:5] = 97 + self.assertEqual(values[0], 97) + + # but can make it copy! 
+ sp = SparseSeries(values, sparse_index=self.bseries.sp_index, + copy=True) + sp.sp_values[:5] = 100 + self.assertEqual(values[0], 97) + + def test_constructor_scalar(self): + data = 5 + sp = SparseSeries(data, np.arange(100)) + sp = sp.reindex(np.arange(200)) + self.assertTrue((sp.ix[:99] == data).all()) + self.assertTrue(isnull(sp.ix[100:]).all()) + + data = np.nan + sp = SparseSeries(data, np.arange(100)) + + def test_constructor_ndarray(self): + pass + + def test_constructor_nonnan(self): + arr = [0, 0, 0, nan, nan] + sp_series = SparseSeries(arr, fill_value=0) + assert_equal(sp_series.values.values, arr) + + def test_copy_astype(self): + cop = self.bseries.astype(np.float64) + self.assertIsNot(cop, self.bseries) + self.assertIs(cop.sp_index, self.bseries.sp_index) + self.assertEqual(cop.dtype, np.float64) + + cop2 = self.iseries.copy() + + assert_sp_series_equal(cop, self.bseries) + assert_sp_series_equal(cop2, self.iseries) + + # test that data is copied + cop[:5] = 97 + self.assertEqual(cop.sp_values[0], 97) + self.assertNotEqual(self.bseries.sp_values[0], 97) + + # correct fill value + zbcop = self.zbseries.copy() + zicop = self.ziseries.copy() + + assert_sp_series_equal(zbcop, self.zbseries) + assert_sp_series_equal(zicop, self.ziseries) + + # no deep copy + view = self.bseries.copy(deep=False) + view.sp_values[:5] = 5 + self.assertTrue((self.bseries.sp_values[:5] == 5).all()) + + def test_astype(self): + self.assertRaises(Exception, self.bseries.astype, np.int64) + + def test_kind(self): + self.assertEqual(self.bseries.kind, 'block') + self.assertEqual(self.iseries.kind, 'integer') + + def test_pickle(self): + def _test_roundtrip(series): + pickled = pickle.dumps(series, protocol=pickle.HIGHEST_PROTOCOL) + unpickled = pickle.loads(pickled) + assert_sp_series_equal(series, unpickled) + assert_series_equal(series.to_dense(), unpickled.to_dense()) + + self._check_all(_test_roundtrip) + + def _check_all(self, check_func): + check_func(self.bseries) + check_func(self.iseries) + check_func(self.zbseries) + check_func(self.ziseries) + + def test_getitem(self): + def _check_getitem(sp, dense): + for idx, val in compat.iteritems(dense): + assert_almost_equal(val, sp[idx]) + + for i in range(len(dense)): + assert_almost_equal(sp[i], dense[i]) + # j = np.float64(i) + # assert_almost_equal(sp[j], dense[j]) + + # API change 1/6/2012 + # negative getitem works + # for i in xrange(len(dense)): + # assert_almost_equal(sp[-i], dense[-i]) + + _check_getitem(self.bseries, self.bseries.to_dense()) + _check_getitem(self.btseries, self.btseries.to_dense()) + + _check_getitem(self.zbseries, self.zbseries.to_dense()) + _check_getitem(self.iseries, self.iseries.to_dense()) + _check_getitem(self.ziseries, self.ziseries.to_dense()) + + # exception handling + self.assertRaises(Exception, self.bseries.__getitem__, + len(self.bseries) + 1) + + # index not contained + self.assertRaises(Exception, self.btseries.__getitem__, + self.btseries.index[-1] + BDay()) + + def test_get_get_value(self): + assert_almost_equal(self.bseries.get(10), self.bseries[10]) + self.assertIsNone(self.bseries.get(len(self.bseries) + 1)) + + dt = self.btseries.index[10] + result = self.btseries.get(dt) + expected = self.btseries.to_dense()[dt] + assert_almost_equal(result, expected) + + assert_almost_equal(self.bseries.get_value(10), self.bseries[10]) + + def test_set_value(self): + + idx = self.btseries.index[7] + self.btseries.set_value(idx, 0) + self.assertEqual(self.btseries[idx], 0) + + self.iseries.set_value('foobar', 0) + 
self.assertEqual(self.iseries.index[-1], 'foobar') + self.assertEqual(self.iseries['foobar'], 0) + + def test_getitem_slice(self): + idx = self.bseries.index + res = self.bseries[::2] + tm.assert_isinstance(res, SparseSeries) + + expected = self.bseries.reindex(idx[::2]) + assert_sp_series_equal(res, expected) + + res = self.bseries[:5] + tm.assert_isinstance(res, SparseSeries) + assert_sp_series_equal(res, self.bseries.reindex(idx[:5])) + + res = self.bseries[5:] + assert_sp_series_equal(res, self.bseries.reindex(idx[5:])) + + # negative indices + res = self.bseries[:-3] + assert_sp_series_equal(res, self.bseries.reindex(idx[:-3])) + + def test_take(self): + def _compare_with_dense(sp): + dense = sp.to_dense() + + def _compare(idx): + dense_result = dense.take(idx).values + sparse_result = sp.take(idx) + self.assertIsInstance(sparse_result, SparseSeries) + assert_almost_equal(dense_result, sparse_result.values.values) + + _compare([1., 2., 3., 4., 5., 0.]) + _compare([7, 2, 9, 0, 4]) + _compare([3, 6, 3, 4, 7]) + + self._check_all(_compare_with_dense) + + self.assertRaises(Exception, self.bseries.take, + [0, len(self.bseries) + 1]) + + # Corner case + sp = SparseSeries(np.ones(10.) * nan) + assert_almost_equal(sp.take([0, 1, 2, 3, 4]), np.repeat(nan, 5)) + + def test_setitem(self): + self.bseries[5] = 7. + self.assertEqual(self.bseries[5], 7.) + + def test_setslice(self): + self.bseries[5:10] = 7. + assert_series_equal(self.bseries[5:10].to_dense(), Series( + 7., index=range(5, 10), name=self.bseries.name)) + + def test_operators(self): + def _check_op(a, b, op): + sp_result = op(a, b) + adense = a.to_dense() if isinstance(a, SparseSeries) else a + bdense = b.to_dense() if isinstance(b, SparseSeries) else b + dense_result = op(adense, bdense) + assert_almost_equal(sp_result.to_dense(), dense_result) + + def check(a, b): + _check_op(a, b, operator.add) + _check_op(a, b, operator.sub) + _check_op(a, b, operator.truediv) + _check_op(a, b, operator.floordiv) + _check_op(a, b, operator.mul) + + _check_op(a, b, lambda x, y: operator.add(y, x)) + _check_op(a, b, lambda x, y: operator.sub(y, x)) + _check_op(a, b, lambda x, y: operator.truediv(y, x)) + _check_op(a, b, lambda x, y: operator.floordiv(y, x)) + _check_op(a, b, lambda x, y: operator.mul(y, x)) + + # NaN ** 0 = 1 in C? 
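+ # (For reference: both CPython and NumPy follow IEEE 754 pow semantics
+ # here, so nan ** 0 evaluates to 1.0; the pow checks below are left
+ # commented out.)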
+ # _check_op(a, b, operator.pow) + # _check_op(a, b, lambda x, y: operator.pow(y, x)) + + check(self.bseries, self.bseries) + check(self.iseries, self.iseries) + check(self.bseries, self.iseries) + + check(self.bseries, self.bseries2) + check(self.bseries, self.iseries2) + check(self.iseries, self.iseries2) + + # scalar value + check(self.bseries, 5) + + # zero-based + check(self.zbseries, self.zbseries * 2) + check(self.zbseries, self.zbseries2) + check(self.ziseries, self.ziseries2) + + # with dense + result = self.bseries + self.bseries.to_dense() + assert_sp_series_equal(result, self.bseries + self.bseries) + + # @dec.knownfailureif(True, 'Known NumPy failer as of 1.5.1') + def test_operators_corner2(self): + raise nose.SkipTest('known failer on numpy 1.5.1') + + # NumPy circumvents __r*__ operations + val = np.float64(3.0) + result = val - self.zbseries + assert_sp_series_equal(result, 3 - self.zbseries) + + def test_binary_operators(self): + + # skipping for now ##### + raise nose.SkipTest("skipping sparse binary operators test") + + def _check_inplace_op(iop, op): + tmp = self.bseries.copy() + + expected = op(tmp, self.bseries) + iop(tmp, self.bseries) + assert_sp_series_equal(tmp, expected) + + inplace_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow'] + for op in inplace_ops: + _check_inplace_op( + getattr(operator, "i%s" % op), getattr(operator, op)) + + def test_reindex(self): + def _compare_with_series(sps, new_index): + spsre = sps.reindex(new_index) + + series = sps.to_dense() + seriesre = series.reindex(new_index) + seriesre = seriesre.to_sparse(fill_value=sps.fill_value) + + assert_sp_series_equal(spsre, seriesre) + assert_series_equal(spsre.to_dense(), seriesre.to_dense()) + + _compare_with_series(self.bseries, self.bseries.index[::2]) + _compare_with_series(self.bseries, list(self.bseries.index[::2])) + _compare_with_series(self.bseries, self.bseries.index[:10]) + _compare_with_series(self.bseries, self.bseries.index[5:]) + + _compare_with_series(self.zbseries, self.zbseries.index[::2]) + _compare_with_series(self.zbseries, self.zbseries.index[:10]) + _compare_with_series(self.zbseries, self.zbseries.index[5:]) + + # special cases + same_index = self.bseries.reindex(self.bseries.index) + assert_sp_series_equal(self.bseries, same_index) + self.assertIsNot(same_index, self.bseries) + + # corner cases + sp = SparseSeries([], index=[]) + sp_zero = SparseSeries([], index=[], fill_value=0) + _compare_with_series(sp, np.arange(10)) + + # with copy=False + reindexed = self.bseries.reindex(self.bseries.index, copy=True) + reindexed.sp_values[:] = 1. + self.assertTrue((self.bseries.sp_values != 1.).all()) + + reindexed = self.bseries.reindex(self.bseries.index, copy=False) + reindexed.sp_values[:] = 1. + np.testing.assert_array_equal(self.bseries.sp_values, 1.) 
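+ # (Recap of the copy semantics exercised above: reindexing onto the same
+ # index with copy=True returns a SparseSeries backed by fresh sp_values,
+ # so mutating the result leaves the original untouched, whereas
+ # copy=False shares the underlying buffer and the mutation is visible in
+ # the original series.)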
+ + def test_sparse_reindex(self): + length = 10 + + def _check(values, index1, index2, fill_value): + first_series = SparseSeries(values, sparse_index=index1, + fill_value=fill_value) + reindexed = first_series.sparse_reindex(index2) + self.assertIs(reindexed.sp_index, index2) + + int_indices1 = index1.to_int_index().indices + int_indices2 = index2.to_int_index().indices + + expected = Series(values, index=int_indices1) + expected = expected.reindex(int_indices2).fillna(fill_value) + assert_almost_equal(expected.values, reindexed.sp_values) + + # make sure level argument asserts + expected = expected.reindex(int_indices2).fillna(fill_value) + + def _check_with_fill_value(values, first, second, fill_value=nan): + i_index1 = IntIndex(length, first) + i_index2 = IntIndex(length, second) + + b_index1 = i_index1.to_block_index() + b_index2 = i_index2.to_block_index() + + _check(values, i_index1, i_index2, fill_value) + _check(values, b_index1, b_index2, fill_value) + + def _check_all(values, first, second): + _check_with_fill_value(values, first, second, fill_value=nan) + _check_with_fill_value(values, first, second, fill_value=0) + + index1 = [2, 4, 5, 6, 8, 9] + values1 = np.arange(6.) + + _check_all(values1, index1, [2, 4, 5]) + _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9]) + _check_all(values1, index1, [0, 1]) + _check_all(values1, index1, [0, 1, 7, 8, 9]) + _check_all(values1, index1, []) + + first_series = SparseSeries(values1, sparse_index=IntIndex(length, + index1), + fill_value=nan) + with tm.assertRaisesRegexp(TypeError, + 'new index must be a SparseIndex'): + reindexed = first_series.sparse_reindex(0) + + def test_repr(self): + bsrepr = repr(self.bseries) + isrepr = repr(self.iseries) + + def test_iter(self): + pass + + def test_truncate(self): + pass + + def test_fillna(self): + pass + + def test_groupby(self): + pass + + def test_reductions(self): + def _compare_with_dense(obj, op): + sparse_result = getattr(obj, op)() + series = obj.to_dense() + dense_result = getattr(series, op)() + self.assertEqual(sparse_result, dense_result) + + to_compare = ['count', 'sum', 'mean', 'std', 'var', 'skew'] + + def _compare_all(obj): + for op in to_compare: + _compare_with_dense(obj, op) + + _compare_all(self.bseries) + + self.bseries.sp_values[5:10] = np.NaN + _compare_all(self.bseries) + + _compare_all(self.zbseries) + self.zbseries.sp_values[5:10] = np.NaN + _compare_all(self.zbseries) + + series = self.zbseries.copy() + series.fill_value = 2 + _compare_all(series) + + nonna = Series(np.random.randn(20)).to_sparse() + _compare_all(nonna) + + nonna2 = Series(np.random.randn(20)).to_sparse(fill_value=0) + _compare_all(nonna2) + + def test_dropna(self): + sp = SparseSeries([0, 0, 0, nan, nan, 5, 6], + fill_value=0) + + sp_valid = sp.valid() + + expected = sp.to_dense().valid() + expected = expected[expected != 0] + + assert_almost_equal(sp_valid.values, expected.values) + self.assertTrue(sp_valid.index.equals(expected.index)) + self.assertEqual(len(sp_valid.sp_values), 2) + + result = self.bseries.dropna() + expected = self.bseries.to_dense().dropna() + self.assertNotIsInstance(result, SparseSeries) + tm.assert_series_equal(result, expected) + + def test_homogenize(self): + def _check_matches(indices, expected): + data = {} + for i, idx in enumerate(indices): + data[i] = SparseSeries(idx.to_int_index().indices, + sparse_index=idx) + homogenized = spf.homogenize(data) + + for k, v in compat.iteritems(homogenized): + assert(v.sp_index.equals(expected)) + + indices1 = [BlockIndex(10, [2], 
[7]), + BlockIndex(10, [1, 6], [3, 4]), + BlockIndex(10, [0], [10])] + expected1 = BlockIndex(10, [2, 6], [2, 3]) + _check_matches(indices1, expected1) + + indices2 = [BlockIndex(10, [2], [7]), + BlockIndex(10, [2], [7])] + expected2 = indices2[0] + _check_matches(indices2, expected2) + + # must have NaN fill value + data = {'a': SparseSeries(np.arange(7), sparse_index=expected2, + fill_value=0)} + assertRaisesRegexp(TypeError, "NaN fill value", spf.homogenize, data) + + def test_fill_value_corner(self): + cop = self.zbseries.copy() + cop.fill_value = 0 + result = self.bseries / cop + + self.assertTrue(np.isnan(result.fill_value)) + + cop2 = self.zbseries.copy() + cop2.fill_value = 1 + result = cop2 / cop + self.assertTrue(np.isnan(result.fill_value)) + + def test_shift(self): + series = SparseSeries([nan, 1., 2., 3., nan, nan], + index=np.arange(6)) + + shifted = series.shift(0) + self.assertIsNot(shifted, series) + assert_sp_series_equal(shifted, series) + + f = lambda s: s.shift(1) + _dense_series_compare(series, f) + + f = lambda s: s.shift(-2) + _dense_series_compare(series, f) + + series = SparseSeries([nan, 1., 2., 3., nan, nan], + index=bdate_range('1/1/2000', periods=6)) + f = lambda s: s.shift(2, freq='B') + _dense_series_compare(series, f) + + f = lambda s: s.shift(2, freq=datetools.bday) + _dense_series_compare(series, f) + + def test_cumsum(self): + result = self.bseries.cumsum() + expected = self.bseries.to_dense().cumsum() + tm.assert_isinstance(result, SparseSeries) + self.assertEqual(result.name, self.bseries.name) + assert_series_equal(result.to_dense(), expected) + + result = self.zbseries.cumsum() + expected = self.zbseries.to_dense().cumsum() + tm.assert_isinstance(result, Series) + assert_series_equal(result, expected) + + def test_combine_first(self): + s = self.bseries + + result = s[::2].combine_first(s) + result2 = s[::2].combine_first(s.to_dense()) + + expected = s[::2].to_dense().combine_first(s.to_dense()) + expected = expected.to_sparse(fill_value=s.fill_value) + + assert_sp_series_equal(result, result2) + assert_sp_series_equal(result, expected) + + +class TestSparseTimeSeries(tm.TestCase): + pass + + +class TestSparseDataFrame(tm.TestCase, test_frame.SafeForSparse): + klass = SparseDataFrame + _multiprocess_can_split_ = True + + def setUp(self): + + self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C': np.arange(10), + 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + + self.dates = bdate_range('1/1/2011', periods=10) + + self.frame = SparseDataFrame(self.data, index=self.dates) + self.iframe = SparseDataFrame(self.data, index=self.dates, + default_kind='integer') + + values = self.frame.values.copy() + values[np.isnan(values)] = 0 + + self.zframe = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=0, + index=self.dates) + + values = self.frame.values.copy() + values[np.isnan(values)] = 2 + self.fill_frame = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=2, + index=self.dates) + + self.empty = SparseDataFrame() + + def test_as_matrix(self): + empty = self.empty.as_matrix() + self.assertEqual(empty.shape, (0, 0)) + + no_cols = SparseDataFrame(index=np.arange(10)) + mat = no_cols.as_matrix() + self.assertEqual(mat.shape, (10, 0)) + + no_index = SparseDataFrame(columns=np.arange(10)) + mat = no_index.as_matrix() + self.assertEqual(mat.shape, (0, 10)) + + def test_copy(self): + cp = self.frame.copy() + tm.assert_isinstance(cp, SparseDataFrame) + 
assert_sp_frame_equal(cp, self.frame) + self.assertTrue(cp.index.is_(self.frame.index)) + + def test_constructor(self): + for col, series in compat.iteritems(self.frame): + tm.assert_isinstance(series, SparseSeries) + + tm.assert_isinstance(self.iframe['A'].sp_index, IntIndex) + + # constructed zframe from matrix above + self.assertEqual(self.zframe['A'].fill_value, 0) + assert_almost_equal([0, 0, 0, 0, 1, 2, 3, 4, 5, 6], + self.zframe['A'].values) + + # construct no data + sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) + for col, series in compat.iteritems(sdf): + tm.assert_isinstance(series, SparseSeries) + + # construct from nested dict + data = {} + for c, s in compat.iteritems(self.frame): + data[c] = s.to_dict() + + sdf = SparseDataFrame(data) + assert_sp_frame_equal(sdf, self.frame) + + # TODO: test data is copied from inputs + + # init dict with different index + idx = self.frame.index[:5] + cons = SparseDataFrame(self.frame, index=idx, + columns=self.frame.columns, + default_fill_value=self.frame.default_fill_value, + default_kind=self.frame.default_kind, + copy=True) + reindexed = self.frame.reindex(idx) + assert_sp_frame_equal(cons, reindexed, exact_indices=False) + + # assert level parameter breaks reindex + self.assertRaises(TypeError, self.frame.reindex, idx, level=0) + + repr(self.frame) + + def test_constructor_ndarray(self): + # no index or columns + sp = SparseDataFrame(self.frame.values) + + # 1d + sp = SparseDataFrame(self.data['A'], index=self.dates, + columns=['A']) + assert_sp_frame_equal(sp, self.frame.reindex(columns=['A'])) + + # raise on level argument + self.assertRaises(TypeError, self.frame.reindex, columns=['A'], + level=1) + + # wrong length index / columns + assertRaisesRegexp( + ValueError, "^Index length", SparseDataFrame, self.frame.values, + index=self.frame.index[:-1]) + assertRaisesRegexp( + ValueError, "^Column length", SparseDataFrame, self.frame.values, + columns=self.frame.columns[:-1]) + + def test_constructor_empty(self): + sp = SparseDataFrame() + self.assertEqual(len(sp.index), 0) + self.assertEqual(len(sp.columns), 0) + + def test_constructor_dataframe(self): + dense = self.frame.to_dense() + sp = SparseDataFrame(dense) + assert_sp_frame_equal(sp, self.frame) + + def test_constructor_convert_index_once(self): + arr = np.array([1.5, 2.5, 3.5]) + sdf = SparseDataFrame(columns=lrange(4), index=arr) + self.assertTrue(sdf[0].index is sdf[1].index) + + def test_constructor_from_series(self): + + # GH 2873 + x = Series(np.random.randn(10000), name='a') + x = x.to_sparse(fill_value=0) + tm.assert_isinstance(x,SparseSeries) + df = SparseDataFrame(x) + tm.assert_isinstance(df,SparseDataFrame) + + x = Series(np.random.randn(10000), name='a') + y = Series(np.random.randn(10000), name='b') + x2 = x.astype(float) + x2.ix[:9998] = np.NaN + x_sparse = x2.to_sparse(fill_value=np.NaN) + + # Currently fails too with weird ufunc error + # df1 = SparseDataFrame([x_sparse, y]) + + y.ix[:9998] = 0 + y_sparse = y.to_sparse(fill_value=0) + # without sparse value raises error + # df2 = SparseDataFrame([x2_sparse, y]) + + def test_dtypes(self): + df = DataFrame(np.random.randn(10000, 4)) + df.ix[:9998] = np.nan + sdf = df.to_sparse() + + result = sdf.get_dtype_counts() + expected = Series({'float64': 4}) + assert_series_equal(result, expected) + + def test_str(self): + df = DataFrame(np.random.randn(10000, 4)) + df.ix[:9998] = np.nan + sdf = df.to_sparse() + + str(sdf) + + def test_array_interface(self): + res = np.sqrt(self.frame) + dres = 
np.sqrt(self.frame.to_dense()) + assert_frame_equal(res.to_dense(), dres) + + def test_pickle(self): + def _test_roundtrip(frame): + pickled = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL) + unpickled = pickle.loads(pickled) + assert_sp_frame_equal(frame, unpickled) + + _test_roundtrip(SparseDataFrame()) + self._check_all(_test_roundtrip) + + def test_dense_to_sparse(self): + df = DataFrame({'A': [nan, nan, nan, 1, 2], + 'B': [1, 2, nan, nan, nan]}) + sdf = df.to_sparse() + tm.assert_isinstance(sdf, SparseDataFrame) + self.assertTrue(np.isnan(sdf.default_fill_value)) + tm.assert_isinstance(sdf['A'].sp_index, BlockIndex) + tm.assert_frame_equal(sdf.to_dense(), df) + + sdf = df.to_sparse(kind='integer') + tm.assert_isinstance(sdf['A'].sp_index, IntIndex) + + df = DataFrame({'A': [0, 0, 0, 1, 2], + 'B': [1, 2, 0, 0, 0]}, dtype=float) + sdf = df.to_sparse(fill_value=0) + self.assertEqual(sdf.default_fill_value, 0) + tm.assert_frame_equal(sdf.to_dense(), df) + + def test_density(self): + df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) + self.assertEqual(df.density, 0.7) + + def test_sparse_to_dense(self): + pass + + def test_sparse_series_ops(self): + import sys + buf = StringIO() + tmp = sys.stderr + sys.stderr = buf + try: + self._check_frame_ops(self.frame) + finally: + sys.stderr = tmp + + def test_sparse_series_ops_i(self): + import sys + buf = StringIO() + tmp = sys.stderr + sys.stderr = buf + try: + self._check_frame_ops(self.iframe) + finally: + sys.stderr = tmp + + def test_sparse_series_ops_z(self): + import sys + buf = StringIO() + tmp = sys.stderr + sys.stderr = buf + try: + self._check_frame_ops(self.zframe) + finally: + sys.stderr = tmp + + def test_sparse_series_ops_fill(self): + import sys + buf = StringIO() + tmp = sys.stderr + sys.stderr = buf + try: + self._check_frame_ops(self.fill_frame) + finally: + sys.stderr = tmp + + def _check_frame_ops(self, frame): + fill = frame.default_fill_value + + def _compare_to_dense(a, b, da, db, op): + sparse_result = op(a, b) + dense_result = op(da, db) + + dense_result = dense_result.to_sparse(fill_value=fill) + assert_sp_frame_equal(sparse_result, dense_result, + exact_indices=False) + + if isinstance(a, DataFrame) and isinstance(db, DataFrame): + mixed_result = op(a, db) + tm.assert_isinstance(mixed_result, SparseDataFrame) + assert_sp_frame_equal(mixed_result, sparse_result, + exact_indices=False) + + opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + ops = [getattr(operator, name) for name in opnames] + + fidx = frame.index + + # time series operations + + series = [frame['A'], frame['B'], + frame['C'], frame['D'], + frame['A'].reindex(fidx[:7]), + frame['A'].reindex(fidx[::2]), + SparseSeries([], index=[])] + + for op in ops: + _compare_to_dense(frame, frame[::2], frame.to_dense(), + frame[::2].to_dense(), op) + for i, s in enumerate(series): + _compare_to_dense(frame, s, frame.to_dense(), + s.to_dense(), op) + _compare_to_dense(s, frame, s.to_dense(), + frame.to_dense(), op) + + # cross-sectional operations + series = [frame.xs(fidx[0]), + frame.xs(fidx[3]), + frame.xs(fidx[5]), + frame.xs(fidx[7]), + frame.xs(fidx[5])[:2]] + + for op in ops: + for s in series: + _compare_to_dense(frame, s, frame.to_dense(), + s, op) + _compare_to_dense(s, frame, s, + frame.to_dense(), op) + + # it works! 
+ result = self.frame + self.frame.ix[:, ['A', 'B']] + + def test_op_corners(self): + empty = self.empty + self.empty + self.assertTrue(empty.empty) + + foo = self.frame + self.empty + tm.assert_isinstance(foo.index, DatetimeIndex) + assert_frame_equal(foo, self.frame * np.nan) + + foo = self.empty + self.frame + assert_frame_equal(foo, self.frame * np.nan) + + def test_scalar_ops(self): + pass + + def test_getitem(self): + # 1585 select multiple columns + sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c']) + + result = sdf[['a', 'b']] + exp = sdf.reindex(columns=['a', 'b']) + assert_sp_frame_equal(result, exp) + + self.assertRaises(Exception, sdf.__getitem__, ['a', 'd']) + + def test_icol(self): + # 2227 + result = self.frame.icol(0) + self.assertTrue(isinstance(result, SparseSeries)) + assert_sp_series_equal(result, self.frame['A']) + + # preserve sparse index type. #2251 + data = {'A': [0, 1]} + iframe = SparseDataFrame(data, default_kind='integer') + self.assertEqual(type(iframe['A'].sp_index), + type(iframe.icol(0).sp_index)) + + def test_set_value(self): + + # ok as the index gets conver to object + frame = self.frame.copy() + res = frame.set_value('foobar', 'B', 1.5) + self.assertEqual(res.index.dtype, 'object') + + res = self.frame + res.index = res.index.astype(object) + + res = self.frame.set_value('foobar', 'B', 1.5) + self.assertIsNot(res, self.frame) + self.assertEqual(res.index[-1], 'foobar') + self.assertEqual(res.get_value('foobar', 'B'), 1.5) + + res2 = res.set_value('foobar', 'qux', 1.5) + self.assertIsNot(res2, res) + self.assert_numpy_array_equal(res2.columns, + list(self.frame.columns) + ['qux']) + self.assertEqual(res2.get_value('foobar', 'qux'), 1.5) + + def test_fancy_index_misc(self): + # axis = 0 + sliced = self.frame.ix[-2:, :] + expected = self.frame.reindex(index=self.frame.index[-2:]) + assert_sp_frame_equal(sliced, expected) + + # axis = 1 + sliced = self.frame.ix[:, -2:] + expected = self.frame.reindex(columns=self.frame.columns[-2:]) + assert_sp_frame_equal(sliced, expected) + + def test_getitem_overload(self): + # slicing + sl = self.frame[:20] + assert_sp_frame_equal(sl, self.frame.reindex(self.frame.index[:20])) + + # boolean indexing + d = self.frame.index[5] + indexer = self.frame.index > d + + subindex = self.frame.index[indexer] + subframe = self.frame[indexer] + + self.assert_numpy_array_equal(subindex, subframe.index) + self.assertRaises(Exception, self.frame.__getitem__, indexer[:-1]) + + def test_setitem(self): + def _check_frame(frame): + N = len(frame) + + # insert SparseSeries + frame['E'] = frame['A'] + tm.assert_isinstance(frame['E'], SparseSeries) + assert_sp_series_equal(frame['E'], frame['A']) + + # insert SparseSeries differently-indexed + to_insert = frame['A'][::2] + frame['E'] = to_insert + expected = to_insert.to_dense().reindex( + frame.index).fillna(to_insert.fill_value) + assert_series_equal(frame['E'].to_dense(), expected) + + # insert Series + frame['F'] = frame['A'].to_dense() + tm.assert_isinstance(frame['F'], SparseSeries) + assert_sp_series_equal(frame['F'], frame['A']) + + # insert Series differently-indexed + to_insert = frame['A'].to_dense()[::2] + frame['G'] = to_insert + expected = to_insert.reindex( + frame.index).fillna(frame.default_fill_value) + assert_series_equal(frame['G'].to_dense(), expected) + + # insert ndarray + frame['H'] = np.random.randn(N) + tm.assert_isinstance(frame['H'], SparseSeries) + + to_sparsify = np.random.randn(N) + to_sparsify[N // 2:] = frame.default_fill_value + frame['I'] = 
to_sparsify + self.assertEqual(len(frame['I'].sp_values), N // 2) + + # insert ndarray wrong size + self.assertRaises(Exception, frame.__setitem__, 'foo', + np.random.randn(N - 1)) + + # scalar value + frame['J'] = 5 + self.assertEqual(len(frame['J'].sp_values), N) + self.assertTrue((frame['J'].sp_values == 5).all()) + + frame['K'] = frame.default_fill_value + self.assertEqual(len(frame['K'].sp_values), 0) + + self._check_all(_check_frame) + + def test_setitem_corner(self): + self.frame['a'] = self.frame['B'] + assert_sp_series_equal(self.frame['a'], self.frame['B']) + + def test_setitem_array(self): + arr = self.frame['B'] + + self.frame['E'] = arr + assert_sp_series_equal(self.frame['E'], self.frame['B']) + + self.frame['F'] = arr[:-1] + index = self.frame.index[:-1] + assert_sp_series_equal( + self.frame['E'].reindex(index), self.frame['F'].reindex(index)) + + def test_delitem(self): + A = self.frame['A'] + C = self.frame['C'] + + del self.frame['B'] + self.assertNotIn('B', self.frame) + assert_sp_series_equal(self.frame['A'], A) + assert_sp_series_equal(self.frame['C'], C) + + del self.frame['D'] + self.assertNotIn('D', self.frame) + + del self.frame['A'] + self.assertNotIn('A', self.frame) + + def test_set_columns(self): + self.frame.columns = self.frame.columns + self.assertRaises(Exception, setattr, self.frame, 'columns', + self.frame.columns[:-1]) + + def test_set_index(self): + self.frame.index = self.frame.index + self.assertRaises(Exception, setattr, self.frame, 'index', + self.frame.index[:-1]) + + def test_append(self): + a = self.frame[:5] + b = self.frame[5:] + + appended = a.append(b) + assert_sp_frame_equal(appended, self.frame, exact_indices=False) + + a = self.frame.ix[:5, :3] + b = self.frame.ix[5:] + appended = a.append(b) + assert_sp_frame_equal( + appended.ix[:, :3], self.frame.ix[:, :3], exact_indices=False) + + def test_apply(self): + applied = self.frame.apply(np.sqrt) + tm.assert_isinstance(applied, SparseDataFrame) + assert_almost_equal(applied.values, np.sqrt(self.frame.values)) + + applied = self.fill_frame.apply(np.sqrt) + self.assertEqual(applied['A'].fill_value, np.sqrt(2)) + + # agg / broadcast + broadcasted = self.frame.apply(np.sum, broadcast=True) + tm.assert_isinstance(broadcasted, SparseDataFrame) + assert_frame_equal(broadcasted.to_dense(), + self.frame.to_dense().apply(np.sum, broadcast=True)) + + self.assertIs(self.empty.apply(np.sqrt), self.empty) + + from pandas.core import nanops + applied = self.frame.apply(np.sum) + assert_series_equal(applied, + self.frame.to_dense().apply(nanops.nansum)) + + def test_apply_nonuq(self): + df_orig = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) + df = df_orig.to_sparse() + rs = df.apply(lambda s: s[0], axis=1) + xp = Series([1., 4., 7.], ['a', 'a', 'c']) + assert_series_equal(rs, xp) + + # df.T breaks + df = df_orig.T.to_sparse() + rs = df.apply(lambda s: s[0], axis=0) + # no non-unique columns supported in sparse yet + # assert_series_equal(rs, xp) + + def test_applymap(self): + # just test that it works + result = self.frame.applymap(lambda x: x * 2) + tm.assert_isinstance(result, SparseDataFrame) + + def test_astype(self): + self.assertRaises(Exception, self.frame.astype, np.int64) + + def test_fillna(self): + df = self.zframe.reindex(lrange(5)) + result = df.fillna(0) + expected = df.to_dense().fillna(0).to_sparse(fill_value=0) + assert_sp_frame_equal(result, expected, exact_indices=False) + + result = df.copy() + result.fillna(0, inplace=True) + expected = 
df.to_dense().fillna(0).to_sparse(fill_value=0) + assert_sp_frame_equal(result, expected, exact_indices=False) + + result = df.copy() + result = df['A'] + result.fillna(0, inplace=True) + assert_series_equal(result, df['A'].fillna(0)) + + def test_rename(self): + # just check this works + renamed = self.frame.rename(index=str) + renamed = self.frame.rename(columns=lambda x: '%s%d' % (x, len(x))) + + def test_corr(self): + res = self.frame.corr() + assert_frame_equal(res, self.frame.to_dense().corr()) + + def test_describe(self): + self.frame['foo'] = np.nan + self.frame.get_dtype_counts() + str(self.frame) + desc = self.frame.describe() + + def test_join(self): + left = self.frame.ix[:, ['A', 'B']] + right = self.frame.ix[:, ['C', 'D']] + joined = left.join(right) + assert_sp_frame_equal(joined, self.frame, exact_indices=False) + + right = self.frame.ix[:, ['B', 'D']] + self.assertRaises(Exception, left.join, right) + + with tm.assertRaisesRegexp(ValueError, 'Other Series must have a name'): + self.frame.join(Series(np.random.randn(len(self.frame)), + index=self.frame.index)) + + def test_reindex(self): + + def _check_frame(frame): + index = frame.index + sidx = index[::2] + sidx2 = index[:5] + + sparse_result = frame.reindex(sidx) + dense_result = frame.to_dense().reindex(sidx) + assert_frame_equal(sparse_result.to_dense(), dense_result) + + assert_frame_equal(frame.reindex(list(sidx)).to_dense(), + dense_result) + + sparse_result2 = sparse_result.reindex(index) + dense_result2 = dense_result.reindex( + index).fillna(frame.default_fill_value) + assert_frame_equal(sparse_result2.to_dense(), dense_result2) + + # propagate CORRECT fill value + assert_almost_equal(sparse_result.default_fill_value, + frame.default_fill_value) + assert_almost_equal(sparse_result['A'].fill_value, + frame['A'].fill_value) + + # length zero + length_zero = frame.reindex([]) + self.assertEqual(len(length_zero), 0) + self.assertEqual(len(length_zero.columns), len(frame.columns)) + self.assertEqual(len(length_zero['A']), 0) + + # frame being reindexed has length zero + length_n = length_zero.reindex(index) + self.assertEqual(len(length_n), len(frame)) + self.assertEqual(len(length_n.columns), len(frame.columns)) + self.assertEqual(len(length_n['A']), len(frame)) + + # reindex columns + reindexed = frame.reindex(columns=['A', 'B', 'Z']) + self.assertEqual(len(reindexed.columns), 3) + assert_almost_equal(reindexed['Z'].fill_value, + frame.default_fill_value) + self.assertTrue(np.isnan(reindexed['Z'].sp_values).all()) + + _check_frame(self.frame) + _check_frame(self.iframe) + _check_frame(self.zframe) + _check_frame(self.fill_frame) + + # with copy=False + reindexed = self.frame.reindex(self.frame.index, copy=False) + reindexed['F'] = reindexed['A'] + self.assertIn('F', self.frame) + + reindexed = self.frame.reindex(self.frame.index) + reindexed['G'] = reindexed['A'] + self.assertNotIn('G', self.frame) + + def test_reindex_fill_value(self): + rng = bdate_range('20110110', periods=20) + result = self.zframe.reindex(rng, fill_value=0) + expected = self.zframe.reindex(rng).fillna(0) + assert_sp_frame_equal(result, expected) + + def test_take(self): + result = self.frame.take([1, 0, 2], axis=1) + expected = self.frame.reindex(columns=['B', 'A', 'C']) + assert_sp_frame_equal(result, expected) + + def test_density(self): + df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C': np.arange(10), + 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}) + + 
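+ # (10 of the 40 entries above are NaN, so the expected density is
+ # 30 / 40 = 0.75.)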
self.assertEqual(df.density, 0.75) + + def test_to_dense(self): + def _check(frame): + dense_dm = frame.to_dense() + assert_frame_equal(frame, dense_dm) + + self._check_all(_check) + + def test_stack_sparse_frame(self): + def _check(frame): + dense_frame = frame.to_dense() + + wp = Panel.from_dict({'foo': frame}) + from_dense_lp = wp.to_frame() + + from_sparse_lp = spf.stack_sparse_frame(frame) + + self.assert_numpy_array_equal(from_dense_lp.values, + from_sparse_lp.values) + + _check(self.frame) + _check(self.iframe) + + # for now + self.assertRaises(Exception, _check, self.zframe) + self.assertRaises(Exception, _check, self.fill_frame) + + def test_transpose(self): + def _check(frame): + transposed = frame.T + untransposed = transposed.T + assert_sp_frame_equal(frame, untransposed) + self._check_all(_check) + + def test_shift(self): + def _check(frame): + shifted = frame.shift(0) + assert_sp_frame_equal(shifted, frame) + + f = lambda s: s.shift(1) + _dense_frame_compare(frame, f) + + f = lambda s: s.shift(-2) + _dense_frame_compare(frame, f) + + f = lambda s: s.shift(2, freq='B') + _dense_frame_compare(frame, f) + + f = lambda s: s.shift(2, freq=datetools.bday) + _dense_frame_compare(frame, f) + + self._check_all(_check) + + def test_count(self): + result = self.frame.count() + dense_result = self.frame.to_dense().count() + assert_series_equal(result, dense_result) + + result = self.frame.count(1) + dense_result = self.frame.to_dense().count(1) + + # win32 don't check dtype + assert_series_equal(result, dense_result, check_dtype=False) + + def test_cumsum(self): + result = self.frame.cumsum() + expected = self.frame.to_dense().cumsum() + tm.assert_isinstance(result, SparseDataFrame) + assert_frame_equal(result.to_dense(), expected) + + def _check_all(self, check_func): + check_func(self.frame) + check_func(self.iframe) + check_func(self.zframe) + check_func(self.fill_frame) + + def test_combine_first(self): + df = self.frame + + result = df[::2].combine_first(df) + result2 = df[::2].combine_first(df.to_dense()) + + expected = df[::2].to_dense().combine_first(df.to_dense()) + expected = expected.to_sparse(fill_value=df.default_fill_value) + + assert_sp_frame_equal(result, result2) + assert_sp_frame_equal(result, expected) + + def test_combine_add(self): + df = self.frame.to_dense() + df2 = df.copy() + df2['C'][:3] = np.nan + df['A'][:3] = 5.7 + + result = df.to_sparse().add(df2.to_sparse(), fill_value=0) + expected = df.add(df2, fill_value=0).to_sparse() + assert_sp_frame_equal(result, expected) + + def test_isin(self): + sparse_df = DataFrame({'flag': [1., 0., 1.]}).to_sparse(fill_value=0.) + xp = sparse_df[sparse_df.flag == 1.] 
+ rs = sparse_df[sparse_df.flag.isin([1.])] + assert_frame_equal(xp, rs) + + def test_sparse_pow_issue(self): + # 2220 + df = SparseDataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) + + # note : no error without nan + df = SparseDataFrame({'A': [nan, 0, 1]}) + + # note that 2 ** df works fine, also df ** 1 + result = 1 ** df + + r1 = result.take([0], 1)['A'] + r2 = result['A'] + + self.assertEqual(len(r2.sp_values), len(r1.sp_values)) + + def test_as_blocks(self): + df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]}, + dtype='float64') + + df_blocks = df.blocks + self.assertEqual(list(df_blocks.keys()), ['float64']) + assert_frame_equal(df_blocks['float64'], df) + + +def _dense_series_compare(s, f): + result = f(s) + assert(isinstance(result, SparseSeries)) + dense_result = f(s.to_dense()) + assert_series_equal(result.to_dense(), dense_result) + + +def _dense_frame_compare(frame, f): + result = f(frame) + assert(isinstance(frame, SparseDataFrame)) + dense_result = f(frame.to_dense()).fillna(frame.default_fill_value) + assert_frame_equal(result.to_dense(), dense_result) + + +def panel_data1(): + index = bdate_range('1/1/2011', periods=8) + + return DataFrame({ + 'A': [nan, nan, nan, 0, 1, 2, 3, 4], + 'B': [0, 1, 2, 3, 4, nan, nan, nan], + 'C': [0, 1, 2, nan, nan, nan, 3, 4], + 'D': [nan, 0, 1, nan, 2, 3, 4, nan] + }, index=index) + + +def panel_data2(): + index = bdate_range('1/1/2011', periods=9) + + return DataFrame({ + 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5], + 'B': [0, 1, 2, 3, 4, 5, nan, nan, nan], + 'C': [0, 1, 2, nan, nan, nan, 3, 4, 5], + 'D': [nan, 0, 1, nan, 2, 3, 4, 5, nan] + }, index=index) + + +def panel_data3(): + index = bdate_range('1/1/2011', periods=10).shift(-2) + + return DataFrame({ + 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, 3, 4, 5, 6, nan, nan, nan], + 'C': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'D': [nan, 0, 1, nan, 2, 3, 4, 5, 6, nan] + }, index=index) + + +class TestSparsePanel(tm.TestCase, + test_panel.SafeForLongAndSparse, + test_panel.SafeForSparse): + _multiprocess_can_split_ = True + + @classmethod + def assert_panel_equal(cls, x, y): + assert_sp_panel_equal(x, y) + + def setUp(self): + self.data_dict = { + 'ItemA': panel_data1(), + 'ItemB': panel_data2(), + 'ItemC': panel_data3(), + 'ItemD': panel_data1(), + } + self.panel = SparsePanel(self.data_dict) + + @staticmethod + def _test_op(panel, op): + # arithmetic tests + result = op(panel, 1) + assert_sp_frame_equal(result['ItemA'], op(panel['ItemA'], 1)) + + def test_constructor(self): + self.assertRaises(ValueError, SparsePanel, self.data_dict, + items=['Item0', 'ItemA', 'ItemB']) + with tm.assertRaisesRegexp(TypeError, + "input must be a dict, a 'list' was passed"): + SparsePanel(['a', 'b', 'c']) + + def test_from_dict(self): + fd = SparsePanel.from_dict(self.data_dict) + assert_sp_panel_equal(fd, self.panel) + + def test_pickle(self): + def _test_roundtrip(panel): + pickled = pickle.dumps(panel, protocol=pickle.HIGHEST_PROTOCOL) + unpickled = pickle.loads(pickled) + tm.assert_isinstance(unpickled.items, Index) + tm.assert_isinstance(unpickled.major_axis, Index) + tm.assert_isinstance(unpickled.minor_axis, Index) + assert_sp_panel_equal(panel, unpickled) + + _test_roundtrip(self.panel) + + def test_dense_to_sparse(self): + wp = Panel.from_dict(self.data_dict) + dwp = wp.to_sparse() + tm.assert_isinstance(dwp['ItemA']['A'], SparseSeries) + + def test_to_dense(self): + dwp = self.panel.to_dense() + dwp2 = Panel.from_dict(self.data_dict) + assert_panel_equal(dwp, dwp2) + + def 
test_to_frame(self): + def _compare_with_dense(panel): + slp = panel.to_frame() + dlp = panel.to_dense().to_frame() + + self.assert_numpy_array_equal(slp.values, dlp.values) + self.assertTrue(slp.index.equals(dlp.index)) + + _compare_with_dense(self.panel) + _compare_with_dense(self.panel.reindex(items=['ItemA'])) + + zero_panel = SparsePanel(self.data_dict, default_fill_value=0) + self.assertRaises(Exception, zero_panel.to_frame) + + self.assertRaises(Exception, self.panel.to_frame, + filter_observations=False) + + def test_long_to_wide_sparse(self): + pass + + def test_values(self): + pass + + def test_setitem(self): + self.panel['ItemE'] = self.panel['ItemC'] + self.panel['ItemF'] = self.panel['ItemC'].to_dense() + + assert_sp_frame_equal(self.panel['ItemE'], self.panel['ItemC']) + assert_sp_frame_equal(self.panel['ItemF'], self.panel['ItemC']) + assert_almost_equal(self.panel.items, ['ItemA', 'ItemB', 'ItemC', + 'ItemD', 'ItemE', 'ItemF']) + + self.assertRaises(Exception, self.panel.__setitem__, 'item6', 1) + + def test_set_value(self): + def _check_loc(item, major, minor, val=1.5): + res = self.panel.set_value(item, major, minor, val) + self.assertIsNot(res, self.panel) + self.assertEqual(res.get_value(item, major, minor), val) + + _check_loc('ItemA', self.panel.major_axis[4], self.panel.minor_axis[3]) + _check_loc('ItemF', self.panel.major_axis[4], self.panel.minor_axis[3]) + _check_loc('ItemF', 'foo', self.panel.minor_axis[3]) + _check_loc('ItemE', 'foo', 'bar') + + def test_delitem_pop(self): + del self.panel['ItemB'] + assert_almost_equal(self.panel.items, ['ItemA', 'ItemC', 'ItemD']) + crackle = self.panel['ItemC'] + pop = self.panel.pop('ItemC') + self.assertIs(pop, crackle) + assert_almost_equal(self.panel.items, ['ItemA', 'ItemD']) + + self.assertRaises(KeyError, self.panel.__delitem__, 'ItemC') + + def test_copy(self): + cop = self.panel.copy() + assert_sp_panel_equal(cop, self.panel) + + def test_reindex(self): + def _compare_with_dense(swp, items, major, minor): + swp_re = swp.reindex(items=items, major=major, + minor=minor) + dwp_re = swp.to_dense().reindex(items=items, major=major, + minor=minor) + assert_panel_equal(swp_re.to_dense(), dwp_re) + + _compare_with_dense(self.panel, self.panel.items[:2], + self.panel.major_axis[::2], + self.panel.minor_axis[::2]) + _compare_with_dense(self.panel, None, + self.panel.major_axis[::2], + self.panel.minor_axis[::2]) + + self.assertRaises(ValueError, self.panel.reindex) + + # TODO: do something about this later... 
+ self.assertRaises(Exception, self.panel.reindex, + items=['item0', 'ItemA', 'ItemB']) + + # test copying + cp = self.panel.reindex(self.panel.major_axis, copy=True) + cp['ItemA']['E'] = cp['ItemA']['A'] + self.assertNotIn('E', self.panel['ItemA']) + + def test_operators(self): + def _check_ops(panel): + def _dense_comp(op): + dense = panel.to_dense() + sparse_result = op(panel) + dense_result = op(dense) + assert_panel_equal(sparse_result.to_dense(), dense_result) + + def _mixed_comp(op): + result = op(panel, panel.to_dense()) + expected = op(panel.to_dense(), panel.to_dense()) + assert_panel_equal(result, expected) + + op1 = lambda x: x + 2 + + _dense_comp(op1) + op2 = lambda x: x.add(x.reindex(major=x.major_axis[::2])) + _dense_comp(op2) + op3 = lambda x: x.subtract(x.mean(0), axis=0) + _dense_comp(op3) + op4 = lambda x: x.subtract(x.mean(1), axis=1) + _dense_comp(op4) + op5 = lambda x: x.subtract(x.mean(2), axis=2) + _dense_comp(op5) + + _mixed_comp(Panel.multiply) + _mixed_comp(Panel.subtract) + + # TODO: this case not yet supported! + # op6 = lambda x: x.add(x.to_frame()) + # _dense_comp(op6) + + _check_ops(self.panel) + + def test_major_xs(self): + def _dense_comp(sparse): + dense = sparse.to_dense() + + for idx in sparse.major_axis: + dslice = dense.major_xs(idx) + sslice = sparse.major_xs(idx) + assert_frame_equal(dslice, sslice) + + _dense_comp(self.panel) + + def test_minor_xs(self): + def _dense_comp(sparse): + dense = sparse.to_dense() + + for idx in sparse.minor_axis: + dslice = dense.minor_xs(idx) + sslice = sparse.minor_xs(idx).to_dense() + assert_frame_equal(dslice, sslice) + + _dense_comp(self.panel) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) + + # nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure', + # '--with-profile'], + # exit=False) diff --git a/pandas/src/datetime.pxd b/pandas/src/datetime.pxd new file mode 100644 index 00000000..abd3bc33 --- /dev/null +++ b/pandas/src/datetime.pxd @@ -0,0 +1,193 @@ +from numpy cimport int64_t, int32_t, npy_int64, npy_int32, ndarray +from cpython cimport PyObject + +from cpython cimport PyUnicode_Check, PyUnicode_AsASCIIString + + +cdef extern from "headers/stdint.h": + enum: INT64_MIN + enum: INT32_MIN + + + +cdef extern from "datetime.h": + + ctypedef class datetime.date [object PyDateTime_Date]: + pass + + ctypedef class datetime.datetime [object PyDateTime_DateTime]: + pass + + ctypedef class datetime.timedelta [object PyDateTime_Delta]: + pass + + void PyDateTime_IMPORT() + + int PyDateTime_GET_YEAR(date) + int PyDateTime_GET_MONTH(date) + int PyDateTime_GET_DAY(date) + int PyDateTime_DATE_GET_HOUR(object o) + int PyDateTime_DATE_GET_MINUTE(object o) + int PyDateTime_DATE_GET_SECOND(object o) + int PyDateTime_DATE_GET_MICROSECOND(object o) + int PyDateTime_TIME_GET_HOUR(object o) + int PyDateTime_TIME_GET_MINUTE(object o) + int PyDateTime_TIME_GET_SECOND(object o) + int PyDateTime_TIME_GET_MICROSECOND(object o) + bint PyDateTime_Check(object o) + bint PyDate_Check(object o) + bint PyTime_Check(object o) + bint PyDelta_Check(object o) + object PyDateTime_FromDateAndTime(int year, int month, int day, int hour, + int minute, int second, int us) + +cdef extern from "datetime_helper.h": + void mangle_nat(object o) + +cdef extern from "numpy/ndarrayobject.h": + + ctypedef int64_t npy_timedelta + ctypedef int64_t npy_datetime + + ctypedef enum NPY_CASTING: + NPY_NO_CASTING + NPY_EQUIV_CASTING + NPY_SAFE_CASTING + 
NPY_SAME_KIND_CASTING + NPY_UNSAFE_CASTING + + +cdef extern from "numpy_helper.h": + npy_datetime get_datetime64_value(object o) + +cdef extern from "numpy/npy_common.h": + + ctypedef unsigned char npy_bool + +cdef extern from "datetime/np_datetime.h": + + ctypedef enum PANDAS_DATETIMEUNIT: + PANDAS_FR_Y + PANDAS_FR_M + PANDAS_FR_W + PANDAS_FR_D + PANDAS_FR_B + PANDAS_FR_h + PANDAS_FR_m + PANDAS_FR_s + PANDAS_FR_ms + PANDAS_FR_us + PANDAS_FR_ns + PANDAS_FR_ps + PANDAS_FR_fs + PANDAS_FR_as + + ctypedef struct pandas_datetimestruct: + npy_int64 year + npy_int32 month, day, hour, min, sec, us, ps, as + + int cmp_pandas_datetimestruct(pandas_datetimestruct *a, + pandas_datetimestruct *b) + + int convert_pydatetime_to_datetimestruct(PyObject *obj, + pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, + int apply_tzinfo) + + npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d) + void pandas_datetime_to_datetimestruct(npy_datetime val, + PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result) + int days_per_month_table[2][12] + + int dayofweek(int y, int m, int d) + int is_leapyear(int64_t year) + PANDAS_DATETIMEUNIT get_datetime64_unit(object o) + +cdef extern from "datetime/np_datetime_strings.h": + + int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, pandas_datetimestruct *out, + npy_bool *out_local, PANDAS_DATETIMEUNIT *out_bestunit, + npy_bool *out_special) + + int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting) + + int get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) + + # int parse_python_string(object obj, pandas_datetimestruct *out) except -1 + + + + +cdef inline _string_to_dts(object val, pandas_datetimestruct* dts): + cdef int result + cdef char *tmp + + if PyUnicode_Check(val): + val = PyUnicode_AsASCIIString(val); + + tmp = val + result = _cstring_to_dts(tmp, len(val), dts) + + if result == -1: + raise ValueError('Unable to parse %s' % str(val)) + +cdef inline int _cstring_to_dts(char *val, int length, + pandas_datetimestruct* dts): + cdef: + npy_bool islocal, special + PANDAS_DATETIMEUNIT out_bestunit + int result + + result = parse_iso_8601_datetime(val, length, PANDAS_FR_ns, + NPY_UNSAFE_CASTING, + dts, &islocal, &out_bestunit, &special) + return result + + +cdef inline object _datetime64_to_datetime(int64_t val): + cdef pandas_datetimestruct dts + pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) + return _dts_to_pydatetime(&dts) + +cdef inline object _dts_to_pydatetime(pandas_datetimestruct *dts): + return PyDateTime_FromDateAndTime(dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec, dts.us) + +cdef inline int64_t _pydatetime_to_dts(object val, pandas_datetimestruct *dts): + dts.year = PyDateTime_GET_YEAR(val) + dts.month = PyDateTime_GET_MONTH(val) + dts.day = PyDateTime_GET_DAY(val) + dts.hour = PyDateTime_DATE_GET_HOUR(val) + dts.min = PyDateTime_DATE_GET_MINUTE(val) + dts.sec = PyDateTime_DATE_GET_SECOND(val) + dts.us = PyDateTime_DATE_GET_MICROSECOND(val) + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + +cdef inline int64_t _dtlike_to_datetime64(object val, + pandas_datetimestruct *dts): + dts.year = val.year + dts.month = val.month + dts.day = val.day + dts.hour = val.hour + dts.min = val.minute + dts.sec = val.second + dts.us = val.microsecond + dts.ps = dts.as = 0 + return 
pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + +cdef inline int64_t _date_to_datetime64(object val, + pandas_datetimestruct *dts): + dts.year = PyDateTime_GET_YEAR(val) + dts.month = PyDateTime_GET_MONTH(val) + dts.day = PyDateTime_GET_DAY(val) + dts.hour = dts.min = dts.sec = dts.us = 0 + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c new file mode 100644 index 00000000..c30b404d --- /dev/null +++ b/pandas/src/datetime/np_datetime.c @@ -0,0 +1,1018 @@ +/* + * This is derived from Numpy 1.7 + * + * See NP_LICENSE.txt + */ + +#define NO_IMPORT + +#include +#include + +/* #define __MSVCRT_VERSION__ 0x0700 /\* whatever above 0x0601 *\/ */ +/* #include */ +/* #define time_t __time64_t */ +/* #define localtime _localtime64 */ +/* #define time _time64 */ + +#include +#include +#include "np_datetime.h" + +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask +#endif + +const int days_per_month_table[2][12] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } +}; + +/* + * Returns 1 if the given year is a leap year, 0 otherwise. + */ +int is_leapyear(npy_int64 year) +{ + return (year & 0x3) == 0 && /* year % 4 == 0 */ + ((year % 100) != 0 || + (year % 400) == 0); +} + +/* + * Sakamoto's method, from wikipedia + */ +int dayofweek(int y, int m, int d) +{ + int day; + static const int t[] = {0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4}; + y -= m < 3; + day = (y + y/4 - y/100 + y/400 + t[m-1] + d) % 7; + // convert to python day + return (day + 6) % 7; +} + +/* + * Adjusts a datetimestruct based on a minutes offset. Assumes + * the current values are valid.g + */ +void +add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes) +{ + int isleap; + + /* MINUTES */ + dts->min += minutes; + while (dts->min < 0) { + dts->min += 60; + dts->hour--; + } + while (dts->min >= 60) { + dts->min -= 60; + dts->hour++; + } + + /* HOURS */ + while (dts->hour < 0) { + dts->hour += 24; + dts->day--; + } + while (dts->hour >= 24) { + dts->hour -= 24; + dts->day++; + } + + /* DAYS */ + if (dts->day < 1) { + dts->month--; + if (dts->month < 1) { + dts->year--; + dts->month = 12; + } + isleap = is_leapyear(dts->year); + dts->day += days_per_month_table[isleap][dts->month-1]; + } + else if (dts->day > 28) { + isleap = is_leapyear(dts->year); + if (dts->day > days_per_month_table[isleap][dts->month-1]) { + dts->day -= days_per_month_table[isleap][dts->month-1]; + dts->month++; + if (dts->month > 12) { + dts->year++; + dts->month = 1; + } + } + } +} + +/* + * Calculates the days offset from the 1970 epoch. 
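+ * For example, 1970-01-01 maps to day 0 and 2000-01-01 maps to day 10957
+ * (the same constant, 365*30 + 7, that days_to_yearsdays below subtracts
+ * to re-base the offset onto the year 2000).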
+ */ +npy_int64 +get_datetimestruct_days(const pandas_datetimestruct *dts) +{ + int i, month; + npy_int64 year, days = 0; + const int *month_lengths; + + year = dts->year - 1970; + days = year * 365; + + /* Adjust for leap years */ + if (days >= 0) { + /* + * 1968 is the closest leap year before 1970. + * Exclude the current year, so add 1. + */ + year += 1; + /* Add one day for each 4 years */ + days += year / 4; + /* 1900 is the closest previous year divisible by 100 */ + year += 68; + /* Subtract one day for each 100 years */ + days -= year / 100; + /* 1600 is the closest previous year divisible by 400 */ + year += 300; + /* Add one day for each 400 years */ + days += year / 400; + } + else { + /* + * 1972 is the closest later year after 1970. + * Include the current year, so subtract 2. + */ + year -= 2; + /* Subtract one day for each 4 years */ + days += year / 4; + /* 2000 is the closest later year divisible by 100 */ + year -= 28; + /* Add one day for each 100 years */ + days -= year / 100; + /* 2000 is also the closest later year divisible by 400 */ + /* Subtract one day for each 400 years */ + days += year / 400; + } + + month_lengths = days_per_month_table[is_leapyear(dts->year)]; + month = dts->month - 1; + + /* Add the months */ + for (i = 0; i < month; ++i) { + days += month_lengths[i]; + } + + /* Add the days */ + days += dts->day - 1; + + return days; +} + +/* + * Modifies '*days_' to be the day offset within the year, + * and returns the year. + */ +static npy_int64 +days_to_yearsdays(npy_int64 *days_) +{ + const npy_int64 days_per_400years = (400*365 + 100 - 4 + 1); + /* Adjust so it's relative to the year 2000 (divisible by 400) */ + npy_int64 days = (*days_) - (365*30 + 7); + npy_int64 year; + + /* Break down the 400 year cycle to get the year and day within the year */ + if (days >= 0) { + year = 400 * (days / days_per_400years); + days = days % days_per_400years; + } + else { + year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); + days = days % days_per_400years; + if (days < 0) { + days += days_per_400years; + } + } + + /* Work out the year/day within the 400 year cycle */ + if (days >= 366) { + year += 100 * ((days-1) / (100*365 + 25 - 1)); + days = (days-1) % (100*365 + 25 - 1); + if (days >= 365) { + year += 4 * ((days+1) / (4*365 + 1)); + days = (days+1) % (4*365 + 1); + if (days >= 366) { + year += (days-1) / 365; + days = (days-1) % 365; + } + } + } + + *days_ = days; + return year + 2000; +} + +/* + * Adjusts a datetimestruct based on a seconds offset. Assumes + * the current values are valid. + */ +NPY_NO_EXPORT void +add_seconds_to_datetimestruct(pandas_datetimestruct *dts, int seconds) +{ + int minutes; + + dts->sec += seconds; + if (dts->sec < 0) { + minutes = dts->sec / 60; + dts->sec = dts->sec % 60; + if (dts->sec < 0) { + --minutes; + dts->sec += 60; + } + add_minutes_to_datetimestruct(dts, minutes); + } + else if (dts->sec >= 60) { + minutes = dts->sec / 60; + dts->sec = dts->sec % 60; + add_minutes_to_datetimestruct(dts, minutes); + } +} + +/* + * Fills in the year, month, day in 'dts' based on the days + * offset from 1970. 
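+ * This is the inverse of get_datetimestruct_days above for the date
+ * fields; e.g. an offset of 10957 days yields 2000-01-01.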
+ */ +static void +set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts) +{ + const int *month_lengths; + int i; + + dts->year = days_to_yearsdays(&days); + month_lengths = days_per_month_table[is_leapyear(dts->year)]; + + for (i = 0; i < 12; ++i) { + if (days < month_lengths[i]) { + dts->month = i + 1; + dts->day = days + 1; + return; + } + else { + days -= month_lengths[i]; + } + } +} + +/* + * Compares two pandas_datetimestruct objects chronologically + */ +int +cmp_pandas_datetimestruct(pandas_datetimestruct *a, pandas_datetimestruct *b) +{ + if (a->year > b->year) { + return 1; + } else if (a->year < b->year) { + return -1; + } + + if (a->month > b->month) { + return 1; + } else if (a->month < b->month) { + return -1; + } + + if (a->day > b->day) { + return 1; + } else if (a->day < b->day) { + return -1; + } + + if (a->hour > b->hour) { + return 1; + } else if (a->hour < b->hour) { + return -1; + } + + if (a->min > b->min) { + return 1; + } else if (a->min < b->min) { + return -1; + } + + if (a->sec > b->sec) { + return 1; + } else if (a->sec < b->sec) { + return -1; + } + + if (a->us > b->us) { + return 1; + } else if (a->us < b->us) { + return -1; + } + + if (a->ps > b->ps) { + return 1; + } else if (a->ps < b->ps) { + return -1; + } + + if (a->as > b->as) { + return 1; + } else if (a->as < b->as) { + return -1; + } + + return 0; +} + +/* + * + * Tests for and converts a Python datetime.datetime or datetime.date + * object into a NumPy pandas_datetimestruct. + * + * While the C API has PyDate_* and PyDateTime_* functions, the following + * implementation just asks for attributes, and thus supports + * datetime duck typing. The tzinfo time zone conversion would require + * this style of access anyway. + * + * 'out_bestunit' gives a suggested unit based on whether the object + * was a datetime.date or datetime.datetime object. + * + * If 'apply_tzinfo' is 1, this function uses the tzinfo to convert + * to UTC time, otherwise it returns the struct with the local time. + * + * Returns -1 on error, 0 on success, and 1 (with no error set) + * if obj doesn't have the neeeded date or datetime attributes. 
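+ *
+ * Illustrative behaviour: a plain datetime.date(2014, 7, 11) fills in
+ * year/month/day, suggests unit 'D' and returns 0; an object with no
+ * 'year' attribute returns 1; an aware datetime.datetime is shifted to
+ * UTC first when apply_tzinfo is 1.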
+ */ +int +convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, + int apply_tzinfo) +{ + PyObject *tmp; + int isleap; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_datetimestruct)); + out->month = 1; + out->day = 1; + + /* Need at least year/month/day attributes */ + if (!PyObject_HasAttrString(obj, "year") || + !PyObject_HasAttrString(obj, "month") || + !PyObject_HasAttrString(obj, "day")) { + return 1; + } + + /* Get the year */ + tmp = PyObject_GetAttrString(obj, "year"); + if (tmp == NULL) { + return -1; + } + out->year = PyInt_AsLong(tmp); + if (out->year == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the month */ + tmp = PyObject_GetAttrString(obj, "month"); + if (tmp == NULL) { + return -1; + } + out->month = PyInt_AsLong(tmp); + if (out->month == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the day */ + tmp = PyObject_GetAttrString(obj, "day"); + if (tmp == NULL) { + return -1; + } + out->day = PyInt_AsLong(tmp); + if (out->day == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Validate that the month and day are valid for the year */ + if (out->month < 1 || out->month > 12) { + goto invalid_date; + } + isleap = is_leapyear(out->year); + if (out->day < 1 || + out->day > days_per_month_table[isleap][out->month-1]) { + goto invalid_date; + } + + /* Check for time attributes (if not there, return success as a date) */ + if (!PyObject_HasAttrString(obj, "hour") || + !PyObject_HasAttrString(obj, "minute") || + !PyObject_HasAttrString(obj, "second") || + !PyObject_HasAttrString(obj, "microsecond")) { + /* The best unit for date is 'D' */ + if (out_bestunit != NULL) { + *out_bestunit = PANDAS_FR_D; + } + return 0; + } + + /* Get the hour */ + tmp = PyObject_GetAttrString(obj, "hour"); + if (tmp == NULL) { + return -1; + } + out->hour = PyInt_AsLong(tmp); + if (out->hour == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the minute */ + tmp = PyObject_GetAttrString(obj, "minute"); + if (tmp == NULL) { + return -1; + } + out->min = PyInt_AsLong(tmp); + if (out->min == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the second */ + tmp = PyObject_GetAttrString(obj, "second"); + if (tmp == NULL) { + return -1; + } + out->sec = PyInt_AsLong(tmp); + if (out->sec == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the microsecond */ + tmp = PyObject_GetAttrString(obj, "microsecond"); + if (tmp == NULL) { + return -1; + } + out->us = PyInt_AsLong(tmp); + if (out->us == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + if (out->hour < 0 || out->hour >= 24 || + out->min < 0 || out->min >= 60 || + out->sec < 0 || out->sec >= 60 || + out->us < 0 || out->us >= 1000000) { + goto invalid_time; + } + + /* Apply the time zone offset if it exists */ + if (apply_tzinfo && PyObject_HasAttrString(obj, "tzinfo")) { + tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (tmp == NULL) { + return -1; + } + if (tmp == Py_None) { + Py_DECREF(tmp); + } + else { + PyObject *offset; + int seconds_offset, minutes_offset; + + /* The utcoffset function should return a timedelta */ + offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); + if (offset == NULL) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* + * The timedelta should 
have a function "total_seconds" + * which contains the value we want. + */ + tmp = PyObject_CallMethod(offset, "total_seconds", ""); + if (tmp == NULL) { + return -1; + } + seconds_offset = PyInt_AsLong(tmp); + if (seconds_offset == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Convert to a minutes offset and apply it */ + minutes_offset = seconds_offset / 60; + + add_minutes_to_datetimestruct(out, -minutes_offset); + } + } + + /* The resolution of Python's datetime is 'us' */ + if (out_bestunit != NULL) { + *out_bestunit = PANDAS_FR_us; + } + + return 0; + +invalid_date: + PyErr_Format(PyExc_ValueError, + "Invalid date (%d,%d,%d) when converting to NumPy datetime", + (int)out->year, (int)out->month, (int)out->day); + return -1; + +invalid_time: + PyErr_Format(PyExc_ValueError, + "Invalid time (%d,%d,%d,%d) when converting " + "to NumPy datetime", + (int)out->hour, (int)out->min, (int)out->sec, (int)out->us); + return -1; +} + +npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d) +{ + pandas_datetime_metadata meta; + npy_datetime result = PANDAS_DATETIME_NAT; + + meta.base = fr; + meta.num = 1; + + convert_datetimestruct_to_datetime(&meta, d, &result); + return result; +} + +void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result) +{ + pandas_datetime_metadata meta; + + meta.base = fr; + meta.num = 1; + + convert_datetime_to_datetimestruct(&meta, val, result); +} + +PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { + return ((PyDatetimeScalarObject *) obj)->obmeta.base; +} + + +/* + * Converts a datetime from a datetimestruct to a datetime based + * on some metadata. The date is assumed to be valid. + * + * TODO: If meta->num is really big, there could be overflow + * + * Returns 0 on success, -1 on failure. 
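+ *
+ * Worked example: 1970-01-02T03:04:05 with base PANDAS_FR_s and num 1
+ * gives days = 1, so the result is ((1*24 + 3)*60 + 4)*60 + 5 = 97445.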
+ */ +int +convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, + const pandas_datetimestruct *dts, + npy_datetime *out) +{ + npy_datetime ret; + PANDAS_DATETIMEUNIT base = meta->base; + + if (base == PANDAS_FR_Y) { + /* Truncate to the year */ + ret = dts->year - 1970; + } + else if (base == PANDAS_FR_M) { + /* Truncate to the month */ + ret = 12 * (dts->year - 1970) + (dts->month - 1); + } + else { + /* Otherwise calculate the number of days to start */ + npy_int64 days = get_datetimestruct_days(dts); + + switch (base) { + case PANDAS_FR_W: + /* Truncate to weeks */ + if (days >= 0) { + ret = days / 7; + } + else { + ret = (days - 6) / 7; + } + break; + case PANDAS_FR_D: + ret = days; + break; + case PANDAS_FR_h: + ret = days * 24 + + dts->hour; + break; + case PANDAS_FR_m: + ret = (days * 24 + + dts->hour) * 60 + + dts->min; + break; + case PANDAS_FR_s: + ret = ((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec; + break; + case PANDAS_FR_ms: + ret = (((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000 + + dts->us / 1000; + break; + case PANDAS_FR_us: + ret = (((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us; + break; + case PANDAS_FR_ns: + ret = ((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000 + + dts->ps / 1000; + break; + case PANDAS_FR_ps: + ret = ((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000000 + + dts->ps; + break; + case PANDAS_FR_fs: + /* only 2.6 hours */ + ret = (((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000000 + + dts->ps) * 1000 + + dts->as / 1000; + break; + case PANDAS_FR_as: + /* only 9.2 secs */ + ret = (((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000000 + + dts->ps) * 1000000 + + dts->as; + break; + default: + /* Something got corrupted */ + PyErr_SetString(PyExc_ValueError, + "NumPy datetime metadata with corrupt unit value"); + return -1; + } + } + + /* Divide by the multiplier */ + if (meta->num > 1) { + if (ret >= 0) { + ret /= meta->num; + } + else { + ret = (ret - meta->num + 1) / meta->num; + } + } + + *out = ret; + + return 0; +} + + +/* + * This provides the casting rules for the TIMEDELTA data type units. + * + * Notably, there is a barrier between the nonlinear years and + * months units, and all the other units. + */ +npy_bool +can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting) +{ + switch (casting) { + /* Allow anything with unsafe casting */ + case NPY_UNSAFE_CASTING: + return 1; + + /* + * Only enforce the 'date units' vs 'time units' barrier with + * 'same_kind' casting. + */ + case NPY_SAME_KIND_CASTING: + return (src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || + (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M); + + /* + * Enforce the 'date units' vs 'time units' barrier and that + * casting is only allowed towards more precise units with + * 'safe' casting. + */ + case NPY_SAFE_CASTING: + return (src_unit <= dst_unit) && + ((src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || + (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M)); + + /* Enforce equality with 'no' or 'equiv' casting */ + default: + return src_unit == dst_unit; + } +} + +/* + * This provides the casting rules for the DATETIME data type units. 
+ * + * Notably, there is a barrier between 'date units' and 'time units' + * for all but 'unsafe' casting. + */ +npy_bool +can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting) +{ + switch (casting) { + /* Allow anything with unsafe casting */ + case NPY_UNSAFE_CASTING: + return 1; + + /* + * Only enforce the 'date units' vs 'time units' barrier with + * 'same_kind' casting. + */ + case NPY_SAME_KIND_CASTING: + return (src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || + (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D); + + /* + * Enforce the 'date units' vs 'time units' barrier and that + * casting is only allowed towards more precise units with + * 'safe' casting. + */ + case NPY_SAFE_CASTING: + return (src_unit <= dst_unit) && + ((src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || + (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D)); + + /* Enforce equality with 'no' or 'equiv' casting */ + default: + return src_unit == dst_unit; + } +} + +/* + * Converts a datetime based on the given metadata into a datetimestruct + */ +int +convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, + npy_datetime dt, + pandas_datetimestruct *out) +{ + npy_int64 perday; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_datetimestruct)); + out->year = 1970; + out->month = 1; + out->day = 1; + + /* TODO: Change to a mechanism that avoids the potential overflow */ + dt *= meta->num; + + /* + * Note that care must be taken with the / and % operators + * for negative values. + */ + switch (meta->base) { + case PANDAS_FR_Y: + out->year = 1970 + dt; + break; + + case PANDAS_FR_M: + if (dt >= 0) { + out->year = 1970 + dt / 12; + out->month = dt % 12 + 1; + } + else { + out->year = 1969 + (dt + 1) / 12; + out->month = 12 + (dt + 1)% 12; + } + break; + + case PANDAS_FR_W: + /* A week is 7 days */ + set_datetimestruct_days(dt * 7, out); + break; + + case PANDAS_FR_D: + set_datetimestruct_days(dt, out); + break; + + case PANDAS_FR_h: + perday = 24LL; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt; + break; + + case PANDAS_FR_m: + perday = 24LL * 60; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / 60; + out->min = dt % 60; + break; + + case PANDAS_FR_s: + perday = 24LL * 60 * 60; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60); + out->min = (dt / 60) % 60; + out->sec = dt % 60; + break; + + case PANDAS_FR_ms: + perday = 24LL * 60 * 60 * 1000; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000LL); + out->min = (dt / (60*1000LL)) % 60; + out->sec = (dt / 1000LL) % 60; + out->us = (dt % 1000LL) * 1000; + break; + + case PANDAS_FR_us: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, 
out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000000LL); + out->min = (dt / (60*1000000LL)) % 60; + out->sec = (dt / 1000000LL) % 60; + out->us = dt % 1000000LL; + break; + + case PANDAS_FR_ns: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000000000LL); + out->min = (dt / (60*1000000000LL)) % 60; + out->sec = (dt / 1000000000LL) % 60; + out->us = (dt / 1000LL) % 1000000LL; + out->ps = (dt % 1000LL) * 1000; + break; + + case PANDAS_FR_ps: + perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000000000000LL); + out->min = (dt / (60*1000000000000LL)) % 60; + out->sec = (dt / 1000000000000LL) % 60; + out->us = (dt / 1000000LL) % 1000000LL; + out->ps = dt % 1000000LL; + break; + + case PANDAS_FR_fs: + /* entire range is only +- 2.6 hours */ + if (dt >= 0) { + out->hour = dt / (60*60*1000000000000000LL); + out->min = (dt / (60*1000000000000000LL)) % 60; + out->sec = (dt / 1000000000000000LL) % 60; + out->us = (dt / 1000000000LL) % 1000000LL; + out->ps = (dt / 1000LL) % 1000000LL; + out->as = (dt % 1000LL) * 1000; + } + else { + npy_datetime minutes; + + minutes = dt / (60*1000000000000000LL); + dt = dt % (60*1000000000000000LL); + if (dt < 0) { + dt += (60*1000000000000000LL); + --minutes; + } + /* Offset the negative minutes */ + add_minutes_to_datetimestruct(out, minutes); + out->sec = (dt / 1000000000000000LL) % 60; + out->us = (dt / 1000000000LL) % 1000000LL; + out->ps = (dt / 1000LL) % 1000000LL; + out->as = (dt % 1000LL) * 1000; + } + break; + + case PANDAS_FR_as: + /* entire range is only +- 9.2 seconds */ + if (dt >= 0) { + out->sec = (dt / 1000000000000000000LL) % 60; + out->us = (dt / 1000000000000LL) % 1000000LL; + out->ps = (dt / 1000000LL) % 1000000LL; + out->as = dt % 1000000LL; + } + else { + npy_datetime seconds; + + seconds = dt / 1000000000000000000LL; + dt = dt % 1000000000000000000LL; + if (dt < 0) { + dt += 1000000000000000000LL; + --seconds; + } + /* Offset the negative seconds */ + add_seconds_to_datetimestruct(out, seconds); + out->us = (dt / 1000000000000LL) % 1000000LL; + out->ps = (dt / 1000000LL) % 1000000LL; + out->as = dt % 1000000LL; + } + break; + + default: + PyErr_SetString(PyExc_RuntimeError, + "NumPy datetime metadata is corrupted with invalid " + "base unit"); + return -1; + } + + return 0; +} + diff --git a/pandas/src/datetime/np_datetime.h b/pandas/src/datetime/np_datetime.h new file mode 100644 index 00000000..f200d3a2 --- /dev/null +++ b/pandas/src/datetime/np_datetime.h @@ -0,0 +1,119 @@ +/* + * This is derived from numpy 1.7 + * See NP_LICENSE.TXT + */ + +#ifndef _PANDAS_DATETIME_H_ +#define _PANDAS_DATETIME_H_ + +#include + +typedef enum { + PANDAS_FR_Y = 0, /* Years */ + PANDAS_FR_M = 1, /* Months */ + PANDAS_FR_W = 2, /* Weeks */ + /* Gap where NPY_FR_B was */ + PANDAS_FR_D = 4, /* Days */ + PANDAS_FR_h = 5, /* hours */ + PANDAS_FR_m = 6, /* minutes */ + PANDAS_FR_s = 7, /* seconds */ + PANDAS_FR_ms = 8,/* milliseconds */ + PANDAS_FR_us = 9,/* microseconds */ + PANDAS_FR_ns = 10,/* nanoseconds */ + PANDAS_FR_ps = 11,/* picoseconds */ + PANDAS_FR_fs = 12,/* 
femtoseconds */ + PANDAS_FR_as = 13,/* attoseconds */ + PANDAS_FR_GENERIC = 14 /* Generic, unbound units, can convert to anything */ +} PANDAS_DATETIMEUNIT; + +#define PANDAS_DATETIME_NUMUNITS 13 + +#define PANDAS_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) + +#define PANDAS_DATETIME_NAT NPY_MIN_INT64 + +typedef struct { + npy_int64 year; + npy_int32 month, day, hour, min, sec, us, ps, as; +} pandas_datetimestruct; + +typedef struct { + PANDAS_DATETIMEUNIT base; + int num; +} pandas_datetime_metadata; + +// stuff pandas needs +// ---------------------------------------------------------------------------- + +int convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, + int apply_tzinfo); + +npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d); + +void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result); + +int dayofweek(int y, int m, int d); + +extern const int days_per_month_table[2][12]; + +// stuff numpy-derived code needs in header +// ---------------------------------------------------------------------------- + +int is_leapyear(npy_int64 year); + +/* + * Converts a datetime from a datetimestruct to a datetime based + * on some metadata. The date is assumed to be valid. + * + * TODO: If meta->num is really big, there could be overflow + * + * Returns 0 on success, -1 on failure. + */ +int +convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, + const pandas_datetimestruct *dts, + npy_datetime *out); + +/* + * Calculates the days offset from the 1970 epoch. + */ +npy_int64 +get_datetimestruct_days(const pandas_datetimestruct *dts); + +/* + * Adjusts a datetimestruct based on a minutes offset. Assumes + * the current values are valid. + */ +void +add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes); + +/* + * This provides the casting rules for the TIMEDELTA data type units. + * + * Notably, there is a barrier between the nonlinear years and + * months units, and all the other units. + */ +//npy_bool +//can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, +// PANDAS_DATETIMEUNIT dst_unit, +// NPY_CASTING casting); + +npy_bool +can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting); + + +int +convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, + npy_datetime dt, + pandas_datetimestruct *out); + + +PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj); + + +#endif diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c new file mode 100644 index 00000000..9c78e995 --- /dev/null +++ b/pandas/src/datetime/np_datetime_strings.c @@ -0,0 +1,1463 @@ +/* + * This file implements string parsing and creation for NumPy datetime. + * + * Written by Mark Wiebe (mwwiebe@gmail.com) + * Copyright (c) 2011 by Enthought, Inc. + * + * See NP_LICENSE.txt for the license. 
+ */ + +#define PY_SSIZE_T_CLEAN +#define NO_IMPORT + +#include + +#include + +#include +#include "numpy/arrayscalars.h" + +#include "np_datetime.h" +#include "np_datetime_strings.h" + +NPY_NO_EXPORT const char * +npy_casting_to_string(NPY_CASTING casting) +{ + switch (casting) { + case NPY_NO_CASTING: + return "'no'"; + case NPY_EQUIV_CASTING: + return "'equiv'"; + case NPY_SAFE_CASTING: + return "'safe'"; + case NPY_SAME_KIND_CASTING: + return "'same_kind'"; + case NPY_UNSAFE_CASTING: + return "'unsafe'"; + default: + return ""; + } +} + +/* Platform-specific time_t typedef */ +typedef time_t NPY_TIME_T; + +/*// We *do* want these symbols, but for cython, not for C. fine in mac osx,*/ +/*// linux complains.*/ +/*static void _suppress_unused_variable_warning(void)*/ +/*{*/ +/* int x = days_per_month_table[0][0];*/ +/* x = x;*/ + +/* int y = _month_offset[0][0];*/ +/* y = y;*/ + +/* char *z = _datetime_strings[0];*/ +/* z = z;*/ +/*}*/ + +/* Exported as DATETIMEUNITS in multiarraymodule.c */ +static char *_datetime_strings[PANDAS_DATETIME_NUMUNITS] = { + "Y", + "M", + "W", + "D", + "h", + "m", + "s", + "ms", + "us", + "ns", + "ps", + "fs", + "as", +}; +/* + * Wraps `localtime` functionality for multiple platforms. This + * converts a time value to a time structure in the local timezone. + * + * Returns 0 on success, -1 on failure. + */ +static int +get_localtime(NPY_TIME_T *ts, struct tm *tms) +{ + char *func_name = ""; +#if defined(_WIN32) + #if defined(_MSC_VER) && (_MSC_VER >= 1400) + if (localtime_s(tms, ts) != 0) { + func_name = "localtime_s"; + goto fail; + } + #elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) + if (_localtime64_s(tms, ts) != 0) { + func_name = "_localtime64_s"; + goto fail; + } + #else + struct tm *tms_tmp; + tms_tmp = localtime(ts); + if (tms_tmp == NULL) { + func_name = "localtime"; + goto fail; + } + memcpy(tms, tms_tmp, sizeof(struct tm)); + #endif +#else + if (localtime_r(ts, tms) == NULL) { + func_name = "localtime_r"; + goto fail; + } +#endif + + return 0; + +fail: + PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " + "to a local time", func_name); + return -1; +} + +/* + * Wraps `gmtime` functionality for multiple platforms. This + * converts a time value to a time structure in UTC. + * + * Returns 0 on success, -1 on failure. + */ +static int +get_gmtime(NPY_TIME_T *ts, struct tm *tms) +{ + char *func_name = ""; +#if defined(_WIN32) + #if defined(_MSC_VER) && (_MSC_VER >= 1400) + if (gmtime_s(tms, ts) != 0) { + func_name = "gmtime_s"; + goto fail; + } + #elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) + if (_gmtime64_s(tms, ts) != 0) { + func_name = "_gmtime64_s"; + goto fail; + } + #else + struct tm *tms_tmp; + tms_tmp = gmtime(ts); + if (tms_tmp == NULL) { + func_name = "gmtime"; + goto fail; + } + memcpy(tms, tms_tmp, sizeof(struct tm)); + #endif +#else + if (gmtime_r(ts, tms) == NULL) { + func_name = "gmtime_r"; + goto fail; + } +#endif + + return 0; + +fail: + PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " + "to a UTC time", func_name); + return -1; +} + +/* + * Converts a datetimestruct in UTC to a datetimestruct in local time, + * also returning the timezone offset applied. + * + * Returns 0 on success, -1 on failure. 
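+ *
+ * Illustrative example: 2014-07-11T12:00 UTC on a machine whose local
+ * zone is effectively UTC-05:00 comes back as 2014-07-11T07:00 with
+ * *out_timezone_offset set to -300 (minutes).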
+ */ +static int +convert_datetimestruct_utc_to_local(pandas_datetimestruct *out_dts_local, + const pandas_datetimestruct *dts_utc, int *out_timezone_offset) +{ + NPY_TIME_T rawtime = 0, localrawtime; + struct tm tm_; + npy_int64 year_correction = 0; + + /* Make a copy of the input 'dts' to modify */ + *out_dts_local = *dts_utc; + + /* HACK: Use a year < 2038 for later years for small time_t */ + if (sizeof(NPY_TIME_T) == 4 && out_dts_local->year >= 2038) { + if (is_leapyear(out_dts_local->year)) { + /* 2036 is a leap year */ + year_correction = out_dts_local->year - 2036; + out_dts_local->year -= year_correction; + } + else { + /* 2037 is not a leap year */ + year_correction = out_dts_local->year - 2037; + out_dts_local->year -= year_correction; + } + } + + /* + * Convert everything in 'dts' to a time_t, to minutes precision. + * This is POSIX time, which skips leap-seconds, but because + * we drop the seconds value from the pandas_datetimestruct, everything + * is ok for this operation. + */ + rawtime = (time_t)get_datetimestruct_days(out_dts_local) * 24 * 60 * 60; + rawtime += dts_utc->hour * 60 * 60; + rawtime += dts_utc->min * 60; + + /* localtime converts a 'time_t' into a local 'struct tm' */ + if (get_localtime(&rawtime, &tm_) < 0) { + return -1; + } + + /* Copy back all the values except seconds */ + out_dts_local->min = tm_.tm_min; + out_dts_local->hour = tm_.tm_hour; + out_dts_local->day = tm_.tm_mday; + out_dts_local->month = tm_.tm_mon + 1; + out_dts_local->year = tm_.tm_year + 1900; + + /* Extract the timezone offset that was applied */ + rawtime /= 60; + localrawtime = (time_t)get_datetimestruct_days(out_dts_local) * 24 * 60; + localrawtime += out_dts_local->hour * 60; + localrawtime += out_dts_local->min; + + *out_timezone_offset = localrawtime - rawtime; + + /* Reapply the year 2038 year correction HACK */ + out_dts_local->year += year_correction; + + return 0; +} + +#if 0 +/* + * Converts a datetimestruct in local time to a datetimestruct in UTC. + * + * Returns 0 on success, -1 on failure. + */ +static int +convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, + const pandas_datetimestruct *dts_local) +{ + npy_int64 year_correction = 0; + + /* Make a copy of the input 'dts' to modify */ + *out_dts_utc = *dts_local; + + /* HACK: Use a year < 2038 for later years for small time_t */ + if (sizeof(NPY_TIME_T) == 4 && out_dts_utc->year >= 2038) { + if (is_leapyear(out_dts_utc->year)) { + /* 2036 is a leap year */ + year_correction = out_dts_utc->year - 2036; + out_dts_utc->year -= year_correction; + } + else { + /* 2037 is not a leap year */ + year_correction = out_dts_utc->year - 2037; + out_dts_utc->year -= year_correction; + } + } + + /* + * ISO 8601 states to treat date-times without a timezone offset + * or 'Z' for UTC as local time. The C standard libary functions + * mktime and gmtime allow us to do this conversion. + * + * Only do this timezone adjustment for recent and future years. + * In this case, "recent" is defined to be 1970 and later, because + * on MS Windows, mktime raises an error when given an earlier date. 
+ */ + if (out_dts_utc->year >= 1970) { + NPY_TIME_T rawtime = 0; + struct tm tm_; + + tm_.tm_sec = out_dts_utc->sec; + tm_.tm_min = out_dts_utc->min; + tm_.tm_hour = out_dts_utc->hour; + tm_.tm_mday = out_dts_utc->day; + tm_.tm_mon = out_dts_utc->month - 1; + tm_.tm_year = out_dts_utc->year - 1900; + tm_.tm_isdst = -1; + + /* mktime converts a local 'struct tm' into a time_t */ + rawtime = mktime(&tm_); + if (rawtime == -1) { + PyErr_SetString(PyExc_OSError, "Failed to use mktime to " + "convert local time to UTC"); + return -1; + } + + /* gmtime converts a 'time_t' into a UTC 'struct tm' */ + if (get_gmtime(&rawtime, &tm_) < 0) { + return -1; + } + out_dts_utc->sec = tm_.tm_sec; + out_dts_utc->min = tm_.tm_min; + out_dts_utc->hour = tm_.tm_hour; + out_dts_utc->day = tm_.tm_mday; + out_dts_utc->month = tm_.tm_mon + 1; + out_dts_utc->year = tm_.tm_year + 1900; + } + + /* Reapply the year 2038 year correction HACK */ + out_dts_utc->year += year_correction; + + return 0; +} +#endif + +/* int */ +/* parse_python_string(PyObject* obj, pandas_datetimestruct *dts) { */ +/* PyObject *bytes = NULL; */ +/* char *str = NULL; */ +/* Py_ssize_t len = 0; */ +/* PANDAS_DATETIMEUNIT bestunit = -1; */ + +/* /\* Convert to an ASCII string for the date parser *\/ */ +/* if (PyUnicode_Check(obj)) { */ +/* bytes = PyUnicode_AsASCIIString(obj); */ +/* if (bytes == NULL) { */ +/* return -1; */ +/* } */ +/* } */ +/* else { */ +/* bytes = obj; */ +/* Py_INCREF(bytes); */ +/* } */ +/* if (PyBytes_AsStringAndSize(bytes, &str, &len) == -1) { */ +/* Py_DECREF(bytes); */ +/* return -1; */ +/* } */ + +/* /\* Parse the ISO date *\/ */ +/* if (parse_iso_8601_datetime(str, len, PANDAS_FR_us, NPY_UNSAFE_CASTING, */ +/* dts, NULL, &bestunit, NULL) < 0) { */ +/* Py_DECREF(bytes); */ +/* return -1; */ +/* } */ +/* Py_DECREF(bytes); */ + +/* return 0; */ +/* } */ + + +/* + * Parses (almost) standard ISO 8601 date strings. The differences are: + * + * + The date "20100312" is parsed as the year 20100312, not as + * equivalent to "2010-03-12". The '-' in the dates are not optional. + * + Only seconds may have a decimal point, with up to 18 digits after it + * (maximum attoseconds precision). + * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate + * the date and the time. Both are treated equivalently. + * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats. + * + Doesn't handle leap seconds (seconds value has 60 in these cases). + * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow + * + Accepts special values "NaT" (not a time), "Today", (current + * day according to local time) and "Now" (current time in UTC). + * + * 'str' must be a NULL-terminated string, and 'len' must be its length. + * 'unit' should contain -1 if the unit is unknown, or the unit + * which will be used if it is. + * 'casting' controls how the detected unit from the string is allowed + * to be cast to the 'unit' parameter. + * + * 'out' gets filled with the parsed date-time. + * 'out_local' gets set to 1 if the parsed time was in local time, + * to 0 otherwise. The values 'now' and 'today' don't get counted + * as local, and neither do UTC +/-#### timezone offsets, because + * they aren't using the computer's local timezone offset. + * 'out_bestunit' gives a suggested unit based on the amount of + * resolution provided in the string, or -1 for NaT. + * 'out_special' gets set to 1 if the parsed time was 'today', + * 'now', or ''/'NaT'. 
For 'today', the unit recommended is + * 'D', for 'now', the unit recommended is 's', and for 'NaT' + * the unit recommended is 'Y'. + * + * Returns 0 on success, -1 on failure. + */ +int +parse_iso_8601_datetime(char *str, int len, + PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, + pandas_datetimestruct *out, + npy_bool *out_local, + PANDAS_DATETIMEUNIT *out_bestunit, + npy_bool *out_special) +{ + int year_leap = 0; + int i, numdigits; + char *substr, sublen; + PANDAS_DATETIMEUNIT bestunit; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_datetimestruct)); + out->month = 1; + out->day = 1; + + + /* + * The string "today" means take today's date in local time, and + * convert it to a date representation. This date representation, if + * forced into a time unit, will be at midnight UTC. + * This is perhaps a little weird, but done so that the + * 'datetime64[D]' type produces the date you expect, rather than + * switching to an adjacent day depending on the current time and your + * timezone. + */ + if (len == 5 && tolower(str[0]) == 't' && + tolower(str[1]) == 'o' && + tolower(str[2]) == 'd' && + tolower(str[3]) == 'a' && + tolower(str[4]) == 'y') { + NPY_TIME_T rawtime = 0; + struct tm tm_; + + time(&rawtime); + if (get_localtime(&rawtime, &tm_) < 0) { + return -1; + } + out->year = tm_.tm_year + 1900; + out->month = tm_.tm_mon + 1; + out->day = tm_.tm_mday; + + bestunit = PANDAS_FR_D; + + /* + * Indicate that this was a special value, and + * is a date (unit 'D'). + */ + if (out_local != NULL) { + *out_local = 0; + } + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + if (out_special != NULL) { + *out_special = 1; + } + + /* Check the casting rule */ + if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + casting)) { + PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + "'%s' using casting rule %s", + str, _datetime_strings[unit], + npy_casting_to_string(casting)); + return -1; + } + + return 0; + } + + /* The string "now" resolves to the current UTC time */ + if (len == 3 && tolower(str[0]) == 'n' && + tolower(str[1]) == 'o' && + tolower(str[2]) == 'w') { + NPY_TIME_T rawtime = 0; + pandas_datetime_metadata meta; + + time(&rawtime); + + /* Set up a dummy metadata for the conversion */ + meta.base = PANDAS_FR_s; + meta.num = 1; + + bestunit = PANDAS_FR_s; + + /* + * Indicate that this was a special value, and + * use 's' because the time() function has resolution + * seconds. 
+ */ + if (out_local != NULL) { + *out_local = 0; + } + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + if (out_special != NULL) { + *out_special = 1; + } + + /* Check the casting rule */ + if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + casting)) { + PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + "'%s' using casting rule %s", + str, _datetime_strings[unit], + npy_casting_to_string(casting)); + return -1; + } + + return convert_datetime_to_datetimestruct(&meta, rawtime, out); + } + + /* Anything else isn't a special value */ + if (out_special != NULL) { + *out_special = 0; + } + + substr = str; + sublen = len; + + /* Skip leading whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + } + + /* Leading '-' sign for negative year */ + if (*substr == '-') { + ++substr; + --sublen; + } + + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE YEAR (digits until the '-' character) */ + out->year = 0; + while (sublen > 0 && isdigit(*substr)) { + out->year = 10 * out->year + (*substr - '0'); + ++substr; + --sublen; + } + + /* Negate the year if necessary */ + if (str[0] == '-') { + out->year = -out->year; + } + /* Check whether it's a leap-year */ + year_leap = is_leapyear(out->year); + + /* Next character must be a '-' or the end of the string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; + } + bestunit = PANDAS_FR_Y; + goto finish; + } + else if (*substr == '-') { + ++substr; + --sublen; + } + else { + goto parse_error; + } + + /* Can't have a trailing '-' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE MONTH (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->month = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->month < 1 || out->month > 12) { + PyErr_Format(PyExc_ValueError, + "Month out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a '-' or the end of the string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; + } + bestunit = PANDAS_FR_M; + goto finish; + } + else if (*substr == '-') { + ++substr; + --sublen; + } + else { + goto parse_error; + } + + /* Can't have a trailing '-' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE DAY (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->day = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->day < 1 || + out->day > days_per_month_table[year_leap][out->month-1]) { + PyErr_Format(PyExc_ValueError, + "Day out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a 'T', ' ', or end of string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; + } + bestunit = PANDAS_FR_D; + goto finish; + } + else if (*substr != 'T' && *substr != ' ') { + goto parse_error; + } + else { + ++substr; + --sublen; + } + + /* PARSE THE HOURS (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->hour = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->hour < 0 || out->hour >= 24) { + PyErr_Format(PyExc_ValueError, + "Hours out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a ':' or the end of the string */ + if (sublen > 0 && *substr == ':') { + ++substr; + 
--sublen; + } + else { + bestunit = PANDAS_FR_h; + goto parse_timezone; + } + + /* Can't have a trailing ':' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE MINUTES (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->min = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->hour < 0 || out->min >= 60) { + PyErr_Format(PyExc_ValueError, + "Minutes out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a ':' or the end of the string */ + if (sublen > 0 && *substr == ':') { + ++substr; + --sublen; + } + else { + bestunit = PANDAS_FR_m; + goto parse_timezone; + } + + /* Can't have a trailing ':' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE SECONDS (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->sec = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->sec < 0 || out->sec >= 60) { + PyErr_Format(PyExc_ValueError, + "Seconds out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character may be a '.' indicating fractional seconds */ + if (sublen > 0 && *substr == '.') { + ++substr; + --sublen; + } + else { + bestunit = PANDAS_FR_s; + goto parse_timezone; + } + + /* PARSE THE MICROSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->us *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->us += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; + } + } + + if (sublen == 0 || !isdigit(*substr)) { + if (numdigits > 3) { + bestunit = PANDAS_FR_us; + } + else { + bestunit = PANDAS_FR_ms; + } + goto parse_timezone; + } + + /* PARSE THE PICOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->ps *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->ps += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; + } + } + + if (sublen == 0 || !isdigit(*substr)) { + if (numdigits > 3) { + bestunit = PANDAS_FR_ps; + } + else { + bestunit = PANDAS_FR_ns; + } + goto parse_timezone; + } + + /* PARSE THE ATTOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->as *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->as += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; + } + } + + if (numdigits > 3) { + bestunit = PANDAS_FR_as; + } + else { + bestunit = PANDAS_FR_fs; + } + +parse_timezone: + if (sublen == 0) { + // Unlike NumPy, treating no time zone as naive + goto finish; + +/* + if (convert_datetimestruct_local_to_utc(out, out) < 0) { + goto error; + } + + // Since neither "Z" nor a time-zone was specified, it's local + if (out_local != NULL) { + *out_local = 1; + } + + goto finish; +*/ + } + + /* UTC specifier */ + if (*substr == 'Z') { + /* "Z" means not local */ + if (out_local != NULL) { + *out_local = 0; + } + + if (sublen == 1) { + goto finish; + } + else { + ++substr; + --sublen; + } + } + /* Time zone offset */ + else if (*substr == '-' || *substr == '+') { + int offset_neg = 0, offset_hour = 0, offset_minute = 0; + + /* + * Since "local" means local with respect to the current + * machine, we say this is non-local. 
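+ *
+ * e.g. "2014-07-11T10:30+05:30" (illustrative) ends up as 05:00 UTC,
+ * because -330 minutes are applied via add_minutes_to_datetimestruct.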
+ */ + if (out_local != NULL) { + *out_local = 0; + } + + if (*substr == '-') { + offset_neg = 1; + } + ++substr; + --sublen; + + /* The hours offset */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_hour >= 24) { + PyErr_Format(PyExc_ValueError, + "Timezone hours offset out of range " + "in datetime string \"%s\"", str); + goto error; + } + } + else { + goto parse_error; + } + + /* The minutes offset is optional */ + if (sublen > 0) { + /* Optional ':' */ + if (*substr == ':') { + ++substr; + --sublen; + } + + /* The minutes offset (at the end of the string) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_minute >= 60) { + PyErr_Format(PyExc_ValueError, + "Timezone minutes offset out of range " + "in datetime string \"%s\"", str); + goto error; + } + } + else { + goto parse_error; + } + } + + /* Apply the time zone offset */ + if (offset_neg) { + offset_hour = -offset_hour; + offset_minute = -offset_minute; + } + add_minutes_to_datetimestruct(out, -60 * offset_hour - offset_minute); + } + + /* Skip trailing whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + } + + if (sublen != 0) { + goto parse_error; + } + +finish: + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + + /* Check the casting rule */ + if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + casting)) { + PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + "'%s' using casting rule %s", + str, _datetime_strings[unit], + npy_casting_to_string(casting)); + return -1; + } + + return 0; + +parse_error: + PyErr_Format(PyExc_ValueError, + "Error parsing datetime string \"%s\" at position %d", + str, (int)(substr-str)); + return -1; + +error: + return -1; +} + +/* + * Provides a string length to use for converting datetime + * objects with the given local and unit settings. + */ +int +get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) +{ + int len = 0; + + /* If no unit is provided, return the maximum length */ + if (base == -1) { + return PANDAS_DATETIME_MAX_ISO8601_STRLEN; + } + + switch (base) { + /* Generic units can only be used to represent NaT */ + /*case PANDAS_FR_GENERIC:*/ + /* return 4;*/ + case PANDAS_FR_as: + len += 3; /* "###" */ + case PANDAS_FR_fs: + len += 3; /* "###" */ + case PANDAS_FR_ps: + len += 3; /* "###" */ + case PANDAS_FR_ns: + len += 3; /* "###" */ + case PANDAS_FR_us: + len += 3; /* "###" */ + case PANDAS_FR_ms: + len += 4; /* ".###" */ + case PANDAS_FR_s: + len += 3; /* ":##" */ + case PANDAS_FR_m: + len += 3; /* ":##" */ + case PANDAS_FR_h: + len += 3; /* "T##" */ + case PANDAS_FR_D: + case PANDAS_FR_W: + len += 3; /* "-##" */ + case PANDAS_FR_M: + len += 3; /* "-##" */ + case PANDAS_FR_Y: + len += 21; /* 64-bit year */ + break; + default: + len += 3; /* handle the now defunct NPY_FR_B */ + break; + } + + if (base >= PANDAS_FR_h) { + if (local) { + len += 5; /* "+####" or "-####" */ + } + else { + len += 1; /* "Z" */ + } + } + + len += 1; /* NULL terminator */ + + return len; +} + +/* + * Finds the largest unit whose value is nonzero, and for which + * the remainder for the rest of the units is zero. 
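+ *
+ * For illustration: 2014-07-11T10:30:00 with every sub-second field
+ * zero yields PANDAS_FR_m; the same struct with us == 500000 yields
+ * PANDAS_FR_ms.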
+ */ +static PANDAS_DATETIMEUNIT +lossless_unit_from_datetimestruct(pandas_datetimestruct *dts) +{ + if (dts->as % 1000 != 0) { + return PANDAS_FR_as; + } + else if (dts->as != 0) { + return PANDAS_FR_fs; + } + else if (dts->ps % 1000 != 0) { + return PANDAS_FR_ps; + } + else if (dts->ps != 0) { + return PANDAS_FR_ns; + } + else if (dts->us % 1000 != 0) { + return PANDAS_FR_us; + } + else if (dts->us != 0) { + return PANDAS_FR_ms; + } + else if (dts->sec != 0) { + return PANDAS_FR_s; + } + else if (dts->min != 0) { + return PANDAS_FR_m; + } + else if (dts->hour != 0) { + return PANDAS_FR_h; + } + else if (dts->day != 1) { + return PANDAS_FR_D; + } + else if (dts->month != 1) { + return PANDAS_FR_M; + } + else { + return PANDAS_FR_Y; + } +} + +/* + * Converts an pandas_datetimestruct to an (almost) ISO 8601 + * NULL-terminated string. If the string fits in the space exactly, + * it leaves out the NULL terminator and returns success. + * + * The differences from ISO 8601 are the 'NaT' string, and + * the number of year digits is >= 4 instead of strictly 4. + * + * If 'local' is non-zero, it produces a string in local time with + * a +-#### timezone offset, otherwise it uses timezone Z (UTC). + * + * 'base' restricts the output to that unit. Set 'base' to + * -1 to auto-detect a base after which all the values are zero. + * + * 'tzoffset' is used if 'local' is enabled, and 'tzoffset' is + * set to a value other than -1. This is a manual override for + * the local time zone to use, as an offset in minutes. + * + * 'casting' controls whether data loss is allowed by truncating + * the data to a coarser unit. This interacts with 'local', slightly, + * in order to form a date unit string as a local time, the casting + * must be unsafe. + * + * Returns 0 on success, -1 on failure (for example if the output + * string was too short). + */ +int +make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting) +{ + pandas_datetimestruct dts_local; + int timezone_offset = 0; + + char *substr = outstr, sublen = outlen; + int tmplen; + + /* Only do local time within a reasonable year range */ + if ((dts->year <= 1800 || dts->year >= 10000) && tzoffset == -1) { + local = 0; + } + + /* Automatically detect a good unit */ + if (base == -1) { + base = lossless_unit_from_datetimestruct(dts); + /* + * If there's a timezone, use at least minutes precision, + * and never split up hours and minutes by default + */ + if ((base < PANDAS_FR_m && local) || base == PANDAS_FR_h) { + base = PANDAS_FR_m; + } + /* Don't split up dates by default */ + else if (base < PANDAS_FR_D) { + base = PANDAS_FR_D; + } + } + /* + * Print weeks with the same precision as days. + * + * TODO: Could print weeks with YYYY-Www format if the week + * epoch is a Monday. 
+ */ + else if (base == PANDAS_FR_W) { + base = PANDAS_FR_D; + } + + /* Use the C API to convert from UTC to local time */ + if (local && tzoffset == -1) { + if (convert_datetimestruct_utc_to_local(&dts_local, dts, + &timezone_offset) < 0) { + return -1; + } + + /* Set dts to point to our local time instead of the UTC time */ + dts = &dts_local; + } + /* Use the manually provided tzoffset */ + else if (local) { + /* Make a copy of the pandas_datetimestruct we can modify */ + dts_local = *dts; + dts = &dts_local; + + /* Set and apply the required timezone offset */ + timezone_offset = tzoffset; + add_minutes_to_datetimestruct(dts, timezone_offset); + } + + /* + * Now the datetimestruct data is in the final form for + * the string representation, so ensure that the data + * is being cast according to the casting rule. + */ + if (casting != NPY_UNSAFE_CASTING) { + /* Producing a date as a local time is always 'unsafe' */ + if (base <= PANDAS_FR_D && local) { + PyErr_SetString(PyExc_TypeError, "Cannot create a local " + "timezone-based date string from a NumPy " + "datetime without forcing 'unsafe' casting"); + return -1; + } + /* Only 'unsafe' and 'same_kind' allow data loss */ + else { + PANDAS_DATETIMEUNIT unitprec; + + unitprec = lossless_unit_from_datetimestruct(dts); + if (casting != NPY_SAME_KIND_CASTING && unitprec > base) { + PyErr_Format(PyExc_TypeError, "Cannot create a " + "string with unit precision '%s' " + "from the NumPy datetime, which has data at " + "unit precision '%s', " + "requires 'unsafe' or 'same_kind' casting", + _datetime_strings[base], + _datetime_strings[unitprec]); + return -1; + } + } + } + + /* YEAR */ + /* + * Can't use PyOS_snprintf, because it always produces a '\0' + * character at the end, and NumPy string types are permitted + * to have data all the way to the end of the buffer. 
+ */ +#ifdef _WIN32 + tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); +#else + tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, (long long)dts->year); +#endif + /* If it ran out of space or there isn't space for the NULL terminator */ + if (tmplen < 0 || tmplen > sublen) { + goto string_too_short; + } + substr += tmplen; + sublen -= tmplen; + + /* Stop if the unit is years */ + if (base == PANDAS_FR_Y) { + if (sublen > 0) { + *substr = '\0'; + } + return 0; + } + + /* MONTH */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->month / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->month % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is months */ + if (base == PANDAS_FR_M) { + if (sublen > 0) { + *substr = '\0'; + } + return 0; + } + + /* DAY */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->day / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->day % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is days */ + if (base == PANDAS_FR_D) { + if (sublen > 0) { + *substr = '\0'; + } + return 0; + } + + /* HOUR */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = 'T'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->hour / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->hour % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is hours */ + if (base == PANDAS_FR_h) { + goto add_time_zone; + } + + /* MINUTE */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->min / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->min % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is minutes */ + if (base == PANDAS_FR_m) { + goto add_time_zone; + } + + /* SECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->sec / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->sec % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is seconds */ + if (base == PANDAS_FR_s) { + goto add_time_zone; + } + + /* MILLISECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = '.'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 100000) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->us / 10000) % 10 + '0'); + if (sublen < 4 ) { + goto string_too_short; + } + substr[3] = (char)((dts->us / 1000) % 10 + '0'); + substr += 4; + sublen -= 4; + + /* Stop if the unit is milliseconds */ + if (base == PANDAS_FR_ms) { + goto add_time_zone; + } + + /* MICROSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->us / 100) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 10) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(dts->us % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is microseconds */ + if (base == PANDAS_FR_us) { + goto add_time_zone; + } + + /* NANOSECOND */ + if (sublen < 1 ) { 
+ goto string_too_short; + } + substr[0] = (char)((dts->ps / 100000) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10000) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->ps / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is nanoseconds */ + if (base == PANDAS_FR_ns) { + goto add_time_zone; + } + + /* PICOSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->ps / 100) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(dts->ps % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is picoseconds */ + if (base == PANDAS_FR_ps) { + goto add_time_zone; + } + + /* FEMTOSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100000) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10000) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->as / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is femtoseconds */ + if (base == PANDAS_FR_fs) { + goto add_time_zone; + } + + /* ATTOSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(dts->as % 10 + '0'); + substr += 3; + sublen -= 3; + +add_time_zone: + if (local) { + /* Add the +/- sign */ + if (sublen < 1) { + goto string_too_short; + } + if (timezone_offset < 0) { + substr[0] = '-'; + timezone_offset = -timezone_offset; + } + else { + substr[0] = '+'; + } + substr += 1; + sublen -= 1; + + /* Add the timezone offset */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((timezone_offset / (10*60)) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((timezone_offset / 60) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(((timezone_offset % 60) / 10) % 10 + '0'); + if (sublen < 4 ) { + goto string_too_short; + } + substr[3] = (char)((timezone_offset % 60) % 10 + '0'); + substr += 4; + sublen -= 4; + } + /* UTC "Zulu" time */ + else { + if (sublen < 1) { + goto string_too_short; + } + substr[0] = 'Z'; + substr += 1; + sublen -= 1; + } + + /* Add a NULL terminator, and return */ + if (sublen > 0) { + substr[0] = '\0'; + } + + return 0; + +string_too_short: + PyErr_Format(PyExc_RuntimeError, + "The string provided for NumPy ISO datetime formatting " + "was too short, with length %d", + outlen); + return -1; +} diff --git a/pandas/src/datetime/np_datetime_strings.h b/pandas/src/datetime/np_datetime_strings.h new file mode 100644 index 00000000..9a2488fe --- /dev/null +++ b/pandas/src/datetime/np_datetime_strings.h @@ -0,0 +1,86 @@ +/* + * This is derived from numpy 1.7. See NP_LICENSE.txt + */ + +#ifndef _NPY_PRIVATE__DATETIME_STRINGS_H_ +#define _NPY_PRIVATE__DATETIME_STRINGS_H_ + +/* + * Parses (almost) standard ISO 8601 date strings. The differences are: + * + * + The date "20100312" is parsed as the year 20100312, not as + * equivalent to "2010-03-12". The '-' in the dates are not optional. + * + Only seconds may have a decimal point, with up to 18 digits after it + * (maximum attoseconds precision). 
+ * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate + * the date and the time. Both are treated equivalently. + * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats. + * + Doesn't handle leap seconds (seconds value has 60 in these cases). + * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow + * + Accepts special values "NaT" (not a time), "Today", (current + * day according to local time) and "Now" (current time in UTC). + * + * 'str' must be a NULL-terminated string, and 'len' must be its length. + * 'unit' should contain -1 if the unit is unknown, or the unit + * which will be used if it is. + * 'casting' controls how the detected unit from the string is allowed + * to be cast to the 'unit' parameter. + * + * 'out' gets filled with the parsed date-time. + * 'out_local' gets set to 1 if the parsed time was in local time, + * to 0 otherwise. The values 'now' and 'today' don't get counted + * as local, and neither do UTC +/-#### timezone offsets, because + * they aren't using the computer's local timezone offset. + * 'out_bestunit' gives a suggested unit based on the amount of + * resolution provided in the string, or -1 for NaT. + * 'out_special' gets set to 1 if the parsed time was 'today', + * 'now', or ''/'NaT'. For 'today', the unit recommended is + * 'D', for 'now', the unit recommended is 's', and for 'NaT' + * the unit recommended is 'Y'. + * + * Returns 0 on success, -1 on failure. + */ +int +parse_iso_8601_datetime(char *str, int len, + PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, + pandas_datetimestruct *out, + npy_bool *out_local, + PANDAS_DATETIMEUNIT *out_bestunit, + npy_bool *out_special); + +/* + * Provides a string length to use for converting datetime + * objects with the given local and unit settings. + */ +int +get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base); + +/* + * Converts an pandas_datetimestruct to an (almost) ISO 8601 + * NULL-terminated string. + * + * If 'local' is non-zero, it produces a string in local time with + * a +-#### timezone offset, otherwise it uses timezone Z (UTC). + * + * 'base' restricts the output to that unit. Set 'base' to + * -1 to auto-detect a base after which all the values are zero. + * + * 'tzoffset' is used if 'local' is enabled, and 'tzoffset' is + * set to a value other than -1. This is a manual override for + * the local time zone to use, as an offset in minutes. + * + * 'casting' controls whether data loss is allowed by truncating + * the data to a coarser unit. This interacts with 'local', slightly, + * in order to form a date unit string as a local time, the casting + * must be unsafe. + * + * Returns 0 on success, -1 on failure (for example if the output + * string was too short). 
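+ *
+ * Callers typically size 'outstr' with get_datetime_iso_8601_strlen(local,
+ * base), which leaves room for the timezone/'Z' suffix and the NULL
+ * terminator.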
+ */ +int +make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting); + +#endif diff --git a/pandas/src/datetime_helper.h b/pandas/src/datetime_helper.h new file mode 100644 index 00000000..8be5f597 --- /dev/null +++ b/pandas/src/datetime_helper.h @@ -0,0 +1,6 @@ +#include "datetime.h" + +void mangle_nat(PyObject *val) { + PyDateTime_GET_MONTH(val) = -1; + PyDateTime_GET_DAY(val) = -1; +} diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py new file mode 100644 index 00000000..842be5a1 --- /dev/null +++ b/pandas/src/generate_code.py @@ -0,0 +1,2395 @@ +from __future__ import print_function +# we only need to be able to run this file on 2.7 +# don't introduce a pandas/pandas.compat import +# or we get a bootstrapping problem +from StringIO import StringIO + +header = """ +cimport numpy as np +cimport cython + +from libc.string cimport memmove + +from numpy cimport * + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyTuple_SetItem, + PyTuple_New) +from cpython cimport PyFloat_Check +cimport cpython + +import numpy as np +isnan = np.isnan + +from datetime import datetime as pydatetime + +# this is our datetime.pxd +from datetime cimport * + +from khash cimport * + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan, get_nat + +cdef int64_t iNaT = get_nat() + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num + +cpdef ensure_platform_int(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.int_) + else: + return np.array(arr, dtype=np.int_) + +""" + + +take_1d_template = """@cython.wraparound(False) +def take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s] values, + ndarray[int64_t] indexer, + ndarray[%(c_type_out)s] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + %(c_type_out)s fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = %(preval)svalues[idx]%(postval)s + +""" + +take_2d_axis0_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_%(name)s_%(dest)s(%(c_type_in)s[:, :] values, + ndarray[int64_t] indexer, + %(c_type_out)s[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + %(c_type_out)s fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF %(can_copy)s: + cdef: + %(c_type_out)s *v + %(c_type_out)s *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(%(c_type_out)s) and + sizeof(%(c_type_out)s) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(%(c_type_out)s) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = %(preval)svalues[idx, j]%(postval)s + +""" + +take_2d_axis1_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_%(name)s_%(dest)s(%(c_type_in)s[:, :] values, + ndarray[int64_t] indexer, + %(c_type_out)s[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + 
%(c_type_out)s fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = %(preval)svalues[i, idx]%(postval)s + +""" + +take_2d_multi_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, + indexer, + ndarray[%(c_type_out)s, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + %(c_type_out)s fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = %(preval)svalues[idx, idx1[j]]%(postval)s + +""" + + + +''' +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +''' + +backfill_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef %(c_type)s cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +""" + + +pad_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef %(c_type)s cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +""" + +pad_1d_template = """@cython.boundscheck(False) 
+@cython.wraparound(False) +def pad_inplace_%(name)s(ndarray[%(c_type)s] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +""" + +pad_2d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +""" + +backfill_2d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +""" + +backfill_1d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_%(name)s(ndarray[%(c_type)s] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +""" + + +diff_2d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_%(name)s(ndarray[%(c_type)s, ndim=2] arr, + ndarray[%(dest_type2)s, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, 
stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +""" + +is_monotonic_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_%(name)s(ndarray[%(c_type)s] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + %(c_type)s prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +""" + +map_indices_template = """@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_%(name)s(ndarray[%(c_type)s] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +""" + +groupby_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +""" + +group_last_template = """@cython.wraparound(False) +@cython.wraparound(False) +def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +""" + +group_last_bin_template = """@cython.wraparound(False) +@cython.wraparound(False) +def group_last_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] 
= nan + else: + out[i, j] = resx[i, j] +""" + +group_nth_bin_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +""" + +group_nth_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +""" + +group_add_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +""" + +group_add_bin_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b, nbins + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + if 
bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +""" + +group_prod_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] +""" + +group_prod_bin_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] +""" + +group_var_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, ct + ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + 
for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + + + for i in range(len(counts)): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +""" + +group_var_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, ct + ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +""" + +group_count_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, lab + Py_ssize_t N = values.shape[0], K = values.shape[1] + %(c_type)s val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + if len(values) != len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[lab, j] += val == val and val != iNaT + + for i in range(len(counts)): + for j in range(K): + out[i, j] = nobs[i, j] + + +""" + +group_count_bin_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, ngroups + Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 + %(c_type)s val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + ngroups = len(bins) + (bins[len(bins) - 1] != N) + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[b, j] += val == val and val != iNaT + + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] + + +""" +# add passing bin edges, instead of labels + + 
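+# Illustration (an added note, not part of the upstream generator): every
+# *_template above is a plain %-format string, and the generator expands it
+# once per dtype before writing the result into generated.pyx.  A hypothetical
+# expansion looks like:
+#
+#     src = group_add_template % {'name': 'float64',
+#                                 'c_type': 'float64_t',
+#                                 'dest_type2': 'float64_t'}
+#     # src now holds the Cython source of group_add_float64
+#
+# generate_put_template() further down performs this substitution for each
+# entry in its function_list and concatenates the results.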
+#---------------------------------------------------------------------- +# group_min, group_max + +group_min_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +""" + +group_max_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] +""" + +group_max_bin_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + 
for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] +""" + + +group_min_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +""" + + +group_mean_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count +""" + +group_mean_bin_template = """ +def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + ndarray[%(dest_type2)s, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if count == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count +""" + +group_ohlc_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] 
out, + ndarray[int64_t] counts, + ndarray[%(dest_type2)s, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + %(dest_type2)s val, count + %(dest_type2)s vopen, vhigh, vlow, vclose, NA + bint got_first = 0 + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose +""" + +arrmap_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_%(name)s(ndarray[%(c_type)s] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +""" + +#---------------------------------------------------------------------- +# Joins on ordered, unique indices + +# right might contain non-unique values + +left_join_unique_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + %(c_type)s lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +""" + +# @cython.wraparound(False) +# @cython.boundscheck(False) + +left_join_template = """ +def left_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +""" + + +inner_join_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +""" + + +outer_join_template2 = """@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = 
lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +""" + +outer_join_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + i += 1 + count += 1 + break + else: + if left[i] == right[j]: + i += 1 + j += 1 + elif left[i] < right[j]: + i += 1 + else: + j += 1 + + count += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + j += 1 + + count += 1 + + return result, lindexer, rindexer + +""" + +# ensure_dtype functions + +ensure_dtype_template = """ +cpdef ensure_%(name)s(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_%(ctype)s: + return arr + else: + return arr.astype(np.%(dtype)s) + else: + return np.array(arr, dtype=np.%(dtype)s) + +""" + +ensure_functions = [ + ('float64', 'FLOAT64', 'float64'), + ('float32', 'FLOAT32', 'float32'), + ('int8', 'INT8', 'int8'), + ('int16', 'INT16', 'int16'), + ('int32', 'INT32', 'int32'), + ('int64', 'INT64', 'int64'), + # ('platform_int', 'INT', 'int_'), + ('object', 'OBJECT', 'object_'), +] + +def generate_ensure_dtypes(): + output = StringIO() + for name, ctype, dtype in ensure_functions: + filled = ensure_dtype_template % locals() + output.write(filled) + return output.getvalue() + +#---------------------------------------------------------------------- +# Fast "put" logic for speeding up interleaving logic + +put2d_template = """ +def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[%(dest_type2)s] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] +""" + + +#------------------------------------------------------------------------- +# Generators + +def generate_put_template(template, use_ints=True, use_floats=True, + use_objects=False, use_datelikes=False): + floats_list = [ + ('float64', 'float64_t', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'float32_t', 'np.float32'), + ] + ints_list = [ + ('int8', 'int8_t', 'float32_t', 'np.float32'), + ('int16', 'int16_t', 'float32_t', 'np.float32'), + ('int32', 'int32_t', 'float64_t', 'np.float64'), + ('int64', 
'int64_t', 'float64_t', 'np.float64'), + ] + date_like_list = [ + ('int64', 'int64_t', 'float64_t', 'np.float64'), + ] + object_list = [('object', 'object', 'object', 'np.object_')] + function_list = [] + if use_floats: + function_list.extend(floats_list) + if use_ints: + function_list.extend(ints_list) + if use_objects: + function_list.extend(object_list) + if use_datelikes: + function_list.extend(date_like_list) + + output = StringIO() + for name, c_type, dest_type, dest_dtype in function_list: + func = template % {'name': name, + 'c_type': c_type, + 'dest_type': dest_type.replace('_t', ''), + 'dest_type2': dest_type, + 'dest_dtype': dest_dtype} + output.write(func) + return output.getvalue() + +def generate_take_template(template, exclude=None): + # name, dest, ctypein, ctypeout, preval, postval, cancopy + function_list = [ + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), + ('bool', 'object', 'uint8_t', 'object', + 'True if ', ' > 0 else False', False), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), + ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), + ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), + ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), + ('object', 'object', 'object', 'object', '', '', False) + ] + + output = StringIO() + for (name, dest, c_type_in, c_type_out, + preval, postval, can_copy) in function_list: + if exclude is not None and name in exclude: + continue + + func = template % {'name': name, 'dest': dest, + 'c_type_in': c_type_in, 'c_type_out': c_type_out, + 'preval': preval, 'postval': postval, + 'can_copy': 'True' if can_copy else 'False'} + output.write(func) + return output.getvalue() + +def generate_from_template(template, exclude=None): + # name, ctype, capable of holding NA + function_list = [ + ('float64', 'float64_t', 'np.float64', True), + ('float32', 'float32_t', 'np.float32', True), + ('object', 'object', 'object', True), + ('int32', 'int32_t', 'np.int32', False), + ('int64', 'int64_t', 'np.int64', False), + ('bool', 'uint8_t', 'np.bool', False) + ] + + output = StringIO() + for name, c_type, dtype, can_hold_na in function_list: + if exclude is not None and name in exclude: + continue + + func = template % {'name': name, 'c_type': c_type, + 'dtype': dtype, + 'raise_on_na': 'False' if can_hold_na else 'True'} + output.write(func) + return output.getvalue() + +put_2d = [diff_2d_template] +groupbys = [group_last_template, + group_last_bin_template, + group_nth_template, + group_nth_bin_template, + group_add_template, + group_add_bin_template, + group_prod_template, + group_prod_bin_template, + group_var_template, + group_var_bin_template, + group_mean_template, + group_mean_bin_template, + group_min_template, + group_min_bin_template, + group_max_template, + group_max_bin_template, + 
group_ohlc_template] + +groupby_count = [group_count_template, group_count_bin_template] + +templates_1d = [map_indices_template, + pad_template, + backfill_template, + pad_1d_template, + backfill_1d_template, + pad_2d_template, + backfill_2d_template, + is_monotonic_template, + groupby_template, + arrmap_template] + +nobool_1d_templates = [left_join_unique_template, + left_join_template, + outer_join_template2, + inner_join_template] + +take_templates = [take_1d_template, + take_2d_axis0_template, + take_2d_axis1_template, + take_2d_multi_template] + + +def generate_take_cython_file(path='generated.pyx'): + with open(path, 'w') as f: + print(header, file=f) + + print(generate_ensure_dtypes(), file=f) + + for template in templates_1d: + print(generate_from_template(template), file=f) + + for template in take_templates: + print(generate_take_template(template), file=f) + + for template in put_2d: + print(generate_put_template(template), file=f) + + for template in groupbys: + print(generate_put_template(template, use_ints=False), file=f) + + for template in groupby_count: + print(generate_put_template(template, use_ints=False, + use_datelikes=True, use_objects=True), + file=f) + + # for template in templates_1d_datetime: + # print >> f, generate_from_template_datetime(template) + + # for template in templates_2d_datetime: + # print >> f, generate_from_template_datetime(template, ndim=2) + + for template in nobool_1d_templates: + print(generate_from_template(template, exclude=['bool']), file=f) + + +if __name__ == '__main__': + generate_take_cython_file() diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx new file mode 100644 index 00000000..97a34582 --- /dev/null +++ b/pandas/src/generated.pyx @@ -0,0 +1,8756 @@ + +cimport numpy as np +cimport cython + +from libc.string cimport memmove + +from numpy cimport * + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyTuple_SetItem, + PyTuple_New) +from cpython cimport PyFloat_Check +cimport cpython + +import numpy as np +isnan = np.isnan + +from datetime import datetime as pydatetime + +# this is our datetime.pxd +from datetime cimport * + +from khash cimport * + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan, get_nat + +cdef int64_t iNaT = get_nat() + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num + +cpdef ensure_platform_int(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.int_) + else: + return np.array(arr, dtype=np.int_) + + + +cpdef ensure_float64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT64: + return arr + else: + return arr.astype(np.float64) + else: + return np.array(arr, dtype=np.float64) + + +cpdef ensure_float32(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT32: + return arr + else: + return arr.astype(np.float32) + else: + return np.array(arr, dtype=np.float32) + + +cpdef ensure_int8(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT8: + return arr + else: + return arr.astype(np.int8) + else: + return np.array(arr, dtype=np.int8) + + +cpdef ensure_int16(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT16: + return arr + else: + return arr.astype(np.int16) + else: + return 
np.array(arr, dtype=np.int16) + + +cpdef ensure_int32(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT32: + return arr + else: + return arr.astype(np.int32) + else: + return np.array(arr, dtype=np.int32) + + +cpdef ensure_int64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT64: + return arr + else: + return arr.astype(np.int64) + else: + return np.array(arr, dtype=np.int64) + + +cpdef ensure_object(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + else: + return np.array(arr, dtype=np.object_) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float64(ndarray[float64_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float32(ndarray[float32_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_object(ndarray[object] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int32(ndarray[int32_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int64(ndarray[int64_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_bool(ndarray[uint8_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def 
pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = 
len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float32_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise 
ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) 
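+# The pad_* / backfill_* builders above align one monotonically increasing
+# index with another: pad ("ffill") points each new label at the closest old
+# label at or before it, backfill ("bfill") at the closest old label at or
+# after it, -1 where nothing qualifies, and `limit` caps consecutive non-exact
+# fills.  Illustrative example (values assumed, not taken from the sources):
+#     pad_int64(np.array([1, 3, 5]), np.array([1, 2, 4, 6]))       # [0, 0, 1, 2]
+#     backfill_int64(np.array([1, 3, 5]), np.array([0, 2, 4, 6]))  # [0, 1, 2, -1]
+# The *_inplace variants reuse the same limit logic but overwrite masked slots
+# of a single values array in place instead of returning an indexer.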
+@cython.wraparound(False) +def pad_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_float32(ndarray[float32_t] values, 
+ ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) 
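+# pad_2d_inplace_* / backfill_2d_inplace_* apply the in-place fill row by row:
+# for each row j the mask marks missing cells, `val` carries the last (pad) or
+# next (backfill) observed value, and `fill_count`/`lim` bound each run of
+# consecutive fills.  Rough usage sketch for the 1-D counterpart (array
+# contents assumed purely for illustration):
+#     vals = np.array([1., np.nan, np.nan, 4.])
+#     pad_inplace_float64(vals, np.isnan(vals), limit=1)
+#     # vals is now [1., 1., nan, 4.]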
+@cython.wraparound(False) +def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit 
< 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + 
fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float64(ndarray[float64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + float64_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float32(ndarray[float32_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + float32_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_object(ndarray[object] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + object prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int32(ndarray[int32_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + int32_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int64(ndarray[int64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + int64_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_bool(ndarray[uint8_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + uint8_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_float64(ndarray[float64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_float32(ndarray[float32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + 
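+    # groupby_* builds a plain dict mapping each non-null label to the list of
+    # index values carrying it, e.g. (labels and values assumed for
+    # illustration only):
+    #     groupby_int64(np.array([10, 20, 30]),
+    #                   np.array(['a', 'b', 'a'], dtype=object))
+    #     # -> {'a': [10, 30], 'b': [20]}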
+ length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_object(ndarray[object] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int32(ndarray[int32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int64(ndarray[int64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_bool(ndarray[uint8_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float64(ndarray[float64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float32(ndarray[float32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_object(ndarray[object] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef 
ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int32(ndarray[int32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int64(ndarray[int64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_bool(ndarray[uint8_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +def take_1d_bool_bool(ndarray[uint8_t] values, + ndarray[int64_t] indexer, + ndarray[uint8_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + uint8_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_bool_object(ndarray[uint8_t] values, + ndarray[int64_t] indexer, + ndarray[object] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + object fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = True if values[idx] > 0 else False + +@cython.wraparound(False) +def take_1d_int8_int8(ndarray[int8_t] values, + ndarray[int64_t] indexer, + ndarray[int8_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int8_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int8_int32(ndarray[int8_t] values, + ndarray[int64_t] indexer, + ndarray[int32_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int8_int64(ndarray[int8_t] values, + ndarray[int64_t] indexer, + ndarray[int64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int8_float64(ndarray[int8_t] values, + ndarray[int64_t] indexer, + ndarray[float64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int16_int16(ndarray[int16_t] values, + 
ndarray[int64_t] indexer, + ndarray[int16_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int16_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int16_int32(ndarray[int16_t] values, + ndarray[int64_t] indexer, + ndarray[int32_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int16_int64(ndarray[int16_t] values, + ndarray[int64_t] indexer, + ndarray[int64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int16_float64(ndarray[int16_t] values, + ndarray[int64_t] indexer, + ndarray[float64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int32_int32(ndarray[int32_t] values, + ndarray[int64_t] indexer, + ndarray[int32_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int32_int64(ndarray[int32_t] values, + ndarray[int64_t] indexer, + ndarray[int64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int32_float64(ndarray[int32_t] values, + ndarray[int64_t] indexer, + ndarray[float64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int64_int64(ndarray[int64_t] values, + ndarray[int64_t] indexer, + ndarray[int64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int64_float64(ndarray[int64_t] values, + ndarray[int64_t] indexer, + ndarray[float64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_float32_float32(ndarray[float32_t] values, + ndarray[int64_t] indexer, + ndarray[float32_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + float32_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_float32_float64(ndarray[float32_t] values, + ndarray[int64_t] indexer, + ndarray[float64_t] out, + fill_value=np.nan): + cdef: + 
Py_ssize_t i, n, idx + float64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_float64_float64(ndarray[float64_t] values, + ndarray[int64_t] indexer, + ndarray[float64_t] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + +@cython.wraparound(False) +def take_1d_object_object(ndarray[object] values, + ndarray[int64_t] indexer, + ndarray[object] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + object fv + + n = len(indexer) + + fv = fill_value + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool_bool(uint8_t[:, :] values, + ndarray[int64_t] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + uint8_t *v + uint8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(uint8_t) and + sizeof(uint8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(uint8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool_object(uint8_t[:, :] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = True if values[idx, j] > 0 else False + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int8(int8_t[:, :] values, + ndarray[int64_t] indexer, + int8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int8_t *v + int8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int8_t) and + sizeof(int8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int32(int8_t[:, :] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, 
+ fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int64(int8_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_float64(int8_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int16(int16_t[:, :] values, + ndarray[int64_t] indexer, + int16_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int16_t *v + int16_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int16_t) and + sizeof(int16_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int16_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int32(int16_t[:, :] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + 
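+        # This IF block is a compile-time switch left by the template
+        # generator: the memmove row-copy fast path (GH3130) appears to be
+        # compiled in (IF True) only for same-dtype, non-object takes, while
+        # converting variants such as this int16 -> int32 case fall through to
+        # the element-wise loop below.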
#GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int64(int16_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_float64(int16_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int32(int32_t[:, :] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int64(int32_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx 
== -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_float64(int32_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_int64(int64_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_float64(int64_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float32(float32_t[:, :] values, + ndarray[int64_t] indexer, + float32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float32_t *v + float32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float32_t) and + sizeof(float32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float32_t) * k)) + return + + 
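+    # Fallback path: when the contiguous same-stride memmove shortcut above is
+    # not taken, rows are copied cell by cell, writing fill_value wherever the
+    # indexer holds -1.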
for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float64(float32_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64_float64(float64_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object_object(object[:, :] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_bool(uint8_t[:, :] values, + ndarray[int64_t] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_object(uint8_t[:, :] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + 
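+            # axis=1 take: the indexer selects columns; in this bool -> object
+            # specialisation each chosen cell is materialised as a Python bool.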
idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = True if values[i, idx] > 0 else False + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int8(int8_t[:, :] values, + ndarray[int64_t] indexer, + int8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int32(int8_t[:, :] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int64(int8_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_float64(int8_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int16(int16_t[:, :] values, + ndarray[int64_t] indexer, + int16_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int32(int16_t[:, :] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int64(int16_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_float64(int16_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + 
Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_int32(int32_t[:, :] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_int64(int32_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_float64(int32_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64_int64(int64_t[:, :] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64_float64(int64_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float32_float32(float32_t[:, :] values, + ndarray[int64_t] indexer, + float32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float32_float64(float32_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = 
values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64_float64(float64_t[:, :] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_object_object(object[:, :] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, + indexer, + ndarray[uint8_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + uint8_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, + indexer, + ndarray[object, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + object fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = True if values[idx, idx1[j]] > 0 else False + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int8_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int8_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 
= indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int16_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int16_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i 
from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, + indexer, + ndarray[float32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= 
j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_object_object(ndarray[object, ndim=2] values, + indexer, + ndarray[object, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + object fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float64(ndarray[float64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float32(ndarray[float32_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int8(ndarray[int8_t, ndim=2] arr, + ndarray[float32_t, 
ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int16(ndarray[int16_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int32(ndarray[int32_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int64(ndarray[int64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, 
stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.wraparound(False) +@cython.wraparound(False) +def group_last_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, 
ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only 
aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b, nbins + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i 
in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b, nbins + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def 
group_prod_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + + + for i in range(len(counts)): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, 
sumx, sumxx + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + + + for i in range(len(counts)): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) +@cython.wraparound(False) +@cython.boundscheck(False) +def group_var_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, ct + ndarray[float32_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + 
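+    # The generated group_mean kernels share one pattern: a single pass over
+    # `values` accumulates per-label observation counts (nobs) and sums (sumx),
+    # skipping NaN entries, then a final pass writes sumx / nobs for each cell
+    # and NaN for groups with no observations.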
+ if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count + + +def group_mean_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count + +def group_mean_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + 
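+            # val != val only when val is NaN, so the self-equality test below
+            # keeps missing values out of nobs and sumx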
+ # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = 
nan + else: + out[i, j] = minx[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + ndarray[float32_t, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + 
ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + ndarray[float32_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + float64_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose +@cython.wraparound(False) +@cython.boundscheck(False) +def 
group_ohlc_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float32_t val, count + float32_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, lab + Py_ssize_t N = values.shape[0], K = values.shape[1] + float64_t val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + if len(values) != len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[lab, j] += val == val and val != iNaT + + for i in range(len(counts)): + for j in range(K): + out[i, j] = nobs[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, lab + Py_ssize_t N = values.shape[0], K = values.shape[1] + float32_t val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + if len(values) != len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[lab, j] += val == val and val != iNaT + + for i in range(len(counts)): + for j in range(K): + out[i, j] = nobs[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, lab + Py_ssize_t N = values.shape[0], K = values.shape[1] + object val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + if len(values) != len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[lab, j] += val == val and val != 
iNaT + + for i in range(len(counts)): + for j in range(K): + out[i, j] = nobs[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_int64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, lab + Py_ssize_t N = values.shape[0], K = values.shape[1] + int64_t val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + if len(values) != len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[lab, j] += val == val and val != iNaT + + for i in range(len(counts)): + for j in range(K): + out[i, j] = nobs[i, j] + + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_bin_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, ngroups + Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 + float64_t val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + ngroups = len(bins) + (bins[len(bins) - 1] != N) + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[b, j] += val == val and val != iNaT + + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_bin_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, ngroups + Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 + float32_t val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + ngroups = len(bins) + (bins[len(bins) - 1] != N) + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[b, j] += val == val and val != iNaT + + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_bin_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, ngroups + Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 + object val + ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + ngroups = len(bins) + (bins[len(bins) - 1] != N) + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[b, j] += val == val and val != iNaT + + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_count_bin_int64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, ngroups + Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0 + int64_t val + 
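Note on the count kernels above and below: the "val == val" comparison is the usual IEEE-754 NaN test (it is false only when val is NaN), and iNaT is the int64 sentinel pandas uses for missing datetimes, so summing these boolean results counts the non-missing entries per group. A minimal standalone C sketch of the same checks, with made-up data, could look like this:

    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>

    /* iNaT stand-in for illustration: the real sentinel comes from util.get_nat() */
    #define INAT INT64_MIN

    int main(void) {
        double  vals[3]   = {1.0, NAN, 2.5};      /* made-up float column */
        int64_t stamps[3] = {10, INAT, 30};       /* made-up datetime64[ns] column */
        int64_t nobs_vals = 0, nobs_stamps = 0;
        int i;

        for (i = 0; i < 3; i++) {
            nobs_vals   += (vals[i] == vals[i]);  /* false only for NaN */
            nobs_stamps += (stamps[i] != INAT);   /* skip the NaT sentinel */
        }
        printf("non-NaN: %lld, non-NaT: %lld\n",
               (long long)nobs_vals, (long long)nobs_stamps);
        return 0;
    }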
ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]), + dtype=np.int64) + + ngroups = len(bins) + (bins[len(bins) - 1] != N) + + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + nobs[b, j] += val == val and val != iNaT + + for i in range(ngroups): + for j in range(K): + out[i, j] = nobs[i, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + object lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def 
left_join_indexer_unique_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + + +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] 
lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] 
= i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and 
right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the 
road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_object(ndarray[object] left, + ndarray[object] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + diff --git a/pandas/src/headers/math.h b/pandas/src/headers/math.h new file mode 100644 index 00000000..8ccf11d0 --- /dev/null +++ b/pandas/src/headers/math.h @@ -0,0 +1,11 @@ +#ifndef _PANDAS_MATH_H_ +#define _PANDAS_MATH_H_ + +#if defined(_MSC_VER) +#include +__inline int signbit(double num) { return _copysign(1.0, num) < 0; } +#else +#include +#endif + +#endif diff --git a/pandas/src/headers/ms_inttypes.h b/pandas/src/headers/ms_inttypes.h new file mode 100644 index 00000000..1be38033 --- 
/dev/null +++ b/pandas/src/headers/ms_inttypes.h @@ -0,0 +1,305 @@ +// ISO C9x compliant inttypes.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" 
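The #error guard here makes the header refuse to compile anywhere except MSVC; on other compilers the standard <inttypes.h> already supplies these definitions. A hedged sketch of the conventional selection idiom (the same pattern the small pandas/src/headers/stdint.h wrapper later in this patch uses for stdint.h) would be:

    /* illustrative wrapper only -- not a file added by this patch */
    #if defined(_MSC_VER)
    #include "ms_inttypes.h"   /* MSVC: use the bundled C99-compatible replacement */
    #else
    #include <inttypes.h>      /* everything else: the system header is sufficient */
    #endif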
+#endif // _MSC_VER ] + +#ifndef _MSC_INTTYPES_H_ // [ +#define _MSC_INTTYPES_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include "ms_stdint.h" + +// 7.8 Format conversion of integer types + +typedef struct { + intmax_t quot; + intmax_t rem; +} imaxdiv_t; + +// 7.8.1 Macros for format specifiers + +#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 + +// The fprintf macros for signed integers are: +#define PRId8 "d" +#define PRIi8 "i" +#define PRIdLEAST8 "d" +#define PRIiLEAST8 "i" +#define PRIdFAST8 "d" +#define PRIiFAST8 "i" + +#define PRId16 "hd" +#define PRIi16 "hi" +#define PRIdLEAST16 "hd" +#define PRIiLEAST16 "hi" +#define PRIdFAST16 "hd" +#define PRIiFAST16 "hi" + +#define PRId32 "I32d" +#define PRIi32 "I32i" +#define PRIdLEAST32 "I32d" +#define PRIiLEAST32 "I32i" +#define PRIdFAST32 "I32d" +#define PRIiFAST32 "I32i" + +#define PRId64 "I64d" +#define PRIi64 "I64i" +#define PRIdLEAST64 "I64d" +#define PRIiLEAST64 "I64i" +#define PRIdFAST64 "I64d" +#define PRIiFAST64 "I64i" + +#define PRIdMAX "I64d" +#define PRIiMAX "I64i" + +#define PRIdPTR "Id" +#define PRIiPTR "Ii" + +// The fprintf macros for unsigned integers are: +#define PRIo8 "o" +#define PRIu8 "u" +#define PRIx8 "x" +#define PRIX8 "X" +#define PRIoLEAST8 "o" +#define PRIuLEAST8 "u" +#define PRIxLEAST8 "x" +#define PRIXLEAST8 "X" +#define PRIoFAST8 "o" +#define PRIuFAST8 "u" +#define PRIxFAST8 "x" +#define PRIXFAST8 "X" + +#define PRIo16 "ho" +#define PRIu16 "hu" +#define PRIx16 "hx" +#define PRIX16 "hX" +#define PRIoLEAST16 "ho" +#define PRIuLEAST16 "hu" +#define PRIxLEAST16 "hx" +#define PRIXLEAST16 "hX" +#define PRIoFAST16 "ho" +#define PRIuFAST16 "hu" +#define PRIxFAST16 "hx" +#define PRIXFAST16 "hX" + +#define PRIo32 "I32o" +#define PRIu32 "I32u" +#define PRIx32 "I32x" +#define PRIX32 "I32X" +#define PRIoLEAST32 "I32o" +#define PRIuLEAST32 "I32u" +#define PRIxLEAST32 "I32x" +#define PRIXLEAST32 "I32X" +#define PRIoFAST32 "I32o" +#define PRIuFAST32 "I32u" +#define PRIxFAST32 "I32x" +#define PRIXFAST32 "I32X" + +#define PRIo64 "I64o" +#define PRIu64 "I64u" +#define PRIx64 "I64x" +#define PRIX64 "I64X" +#define PRIoLEAST64 "I64o" +#define PRIuLEAST64 "I64u" +#define PRIxLEAST64 "I64x" +#define PRIXLEAST64 "I64X" +#define PRIoFAST64 "I64o" +#define PRIuFAST64 "I64u" +#define PRIxFAST64 "I64x" +#define PRIXFAST64 "I64X" + +#define PRIoMAX "I64o" +#define PRIuMAX "I64u" +#define PRIxMAX "I64x" +#define PRIXMAX "I64X" + +#define PRIoPTR "Io" +#define PRIuPTR "Iu" +#define PRIxPTR "Ix" +#define PRIXPTR "IX" + +// The fscanf macros for signed integers are: +#define SCNd8 "d" +#define SCNi8 "i" +#define SCNdLEAST8 "d" +#define SCNiLEAST8 "i" +#define SCNdFAST8 "d" +#define SCNiFAST8 "i" + +#define SCNd16 "hd" +#define SCNi16 "hi" +#define SCNdLEAST16 "hd" +#define SCNiLEAST16 "hi" +#define SCNdFAST16 "hd" +#define SCNiFAST16 "hi" + +#define SCNd32 "ld" +#define SCNi32 "li" +#define SCNdLEAST32 "ld" +#define SCNiLEAST32 "li" +#define SCNdFAST32 "ld" +#define SCNiFAST32 "li" + +#define SCNd64 "I64d" +#define SCNi64 "I64i" +#define SCNdLEAST64 "I64d" +#define SCNiLEAST64 "I64i" +#define SCNdFAST64 "I64d" +#define SCNiFAST64 "I64i" + +#define SCNdMAX "I64d" +#define SCNiMAX "I64i" + +#ifdef _WIN64 // [ +# define SCNdPTR "I64d" +# define SCNiPTR "I64i" +#else // _WIN64 ][ +# define SCNdPTR "ld" +# define SCNiPTR "li" +#endif // _WIN64 ] + +// The fscanf macros for unsigned integers are: +#define SCNo8 "o" +#define SCNu8 "u" +#define SCNx8 "x" +#define SCNX8 "X" +#define SCNoLEAST8 "o" 
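The PRI* and SCN* macros defined in this stretch exist because MSVC's printf/scanf historically required "%I64d"-style length modifiers for 64-bit integers instead of the C99 "%lld". A short, self-contained usage sketch (the values and variable names are illustrative only):

    #include <stdio.h>
    #include <inttypes.h>   /* on MSVC builds, ms_inttypes.h above supplies the same macros */

    int main(void) {
        int64_t  nanos  = 1500000000LL;   /* e.g. a datetime64[ns] offset */
        uint64_t flags  = 255;
        int64_t  parsed = 0;

        /* PRId64 / PRIu64 expand to "I64d" / "I64u" on MSVC, "lld"-style forms elsewhere */
        printf("nanos=%" PRId64 " flags=%" PRIu64 "\n", nanos, flags);

        /* SCNd64 plays the same role for the scanf family */
        if (sscanf("42", "%" SCNd64, &parsed) == 1)
            printf("parsed=%" PRId64 "\n", parsed);
        return 0;
    }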
+#define SCNuLEAST8 "u" +#define SCNxLEAST8 "x" +#define SCNXLEAST8 "X" +#define SCNoFAST8 "o" +#define SCNuFAST8 "u" +#define SCNxFAST8 "x" +#define SCNXFAST8 "X" + +#define SCNo16 "ho" +#define SCNu16 "hu" +#define SCNx16 "hx" +#define SCNX16 "hX" +#define SCNoLEAST16 "ho" +#define SCNuLEAST16 "hu" +#define SCNxLEAST16 "hx" +#define SCNXLEAST16 "hX" +#define SCNoFAST16 "ho" +#define SCNuFAST16 "hu" +#define SCNxFAST16 "hx" +#define SCNXFAST16 "hX" + +#define SCNo32 "lo" +#define SCNu32 "lu" +#define SCNx32 "lx" +#define SCNX32 "lX" +#define SCNoLEAST32 "lo" +#define SCNuLEAST32 "lu" +#define SCNxLEAST32 "lx" +#define SCNXLEAST32 "lX" +#define SCNoFAST32 "lo" +#define SCNuFAST32 "lu" +#define SCNxFAST32 "lx" +#define SCNXFAST32 "lX" + +#define SCNo64 "I64o" +#define SCNu64 "I64u" +#define SCNx64 "I64x" +#define SCNX64 "I64X" +#define SCNoLEAST64 "I64o" +#define SCNuLEAST64 "I64u" +#define SCNxLEAST64 "I64x" +#define SCNXLEAST64 "I64X" +#define SCNoFAST64 "I64o" +#define SCNuFAST64 "I64u" +#define SCNxFAST64 "I64x" +#define SCNXFAST64 "I64X" + +#define SCNoMAX "I64o" +#define SCNuMAX "I64u" +#define SCNxMAX "I64x" +#define SCNXMAX "I64X" + +#ifdef _WIN64 // [ +# define SCNoPTR "I64o" +# define SCNuPTR "I64u" +# define SCNxPTR "I64x" +# define SCNXPTR "I64X" +#else // _WIN64 ][ +# define SCNoPTR "lo" +# define SCNuPTR "lu" +# define SCNxPTR "lx" +# define SCNXPTR "lX" +#endif // _WIN64 ] + +#endif // __STDC_FORMAT_MACROS ] + +// 7.8.2 Functions for greatest-width integer types + +// 7.8.2.1 The imaxabs function +#define imaxabs _abs64 + +// 7.8.2.2 The imaxdiv function + +// This is modified version of div() function from Microsoft's div.c found +// in %MSVC.NET%\crt\src\div.c +#ifdef STATIC_IMAXDIV // [ +static +#else // STATIC_IMAXDIV ][ +_inline +#endif // STATIC_IMAXDIV ] +imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) +{ + imaxdiv_t result; + + result.quot = numer / denom; + result.rem = numer % denom; + + if (numer < 0 && result.rem > 0) { + // did division wrong; must fix up + ++result.quot; + result.rem -= denom; + } + + return result; +} + +// 7.8.2.3 The strtoimax and strtoumax functions +#define strtoimax _strtoi64 +#define strtoumax _strtoui64 + +// 7.8.2.4 The wcstoimax and wcstoumax functions +#define wcstoimax _wcstoi64 +#define wcstoumax _wcstoui64 + + +#endif // _MSC_INTTYPES_H_ ] diff --git a/pandas/src/headers/ms_stdint.h b/pandas/src/headers/ms_stdint.h new file mode 100644 index 00000000..c66fbb81 --- /dev/null +++ b/pandas/src/headers/ms_stdint.h @@ -0,0 +1,247 @@ +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them. 
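The typedef block that follows is the heart of this header: it maps MSVC's __intN builtins (or plain char/short/int on Visual Studio 6-era compilers, per the comment above) onto the C99 exact-width names that the Cython-generated kernels earlier in this patch take for granted, int64_t in particular. A purely illustrative width check, assuming a standard or compatible stdint.h is available on the include path:

    #include <stdio.h>
    #include <stdint.h>   /* standard header; ms_stdint.h provides the same names for old MSVC */

    int main(void) {
        /* the int64-based group/join kernels rely on these widths being exact */
        printf("int8_t:%lu int16_t:%lu int32_t:%lu int64_t:%lu (bytes)\n",
               (unsigned long)sizeof(int8_t),  (unsigned long)sizeof(int16_t),
               (unsigned long)sizeof(int32_t), (unsigned long)sizeof(int64_t));
        return 0;
    }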
+#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer 
types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// WCHAR_MIN and WCHAR_MAX are also defined in +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#define INTMAX_C INT64_C +#define UINTMAX_C UINT64_C + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ] diff --git a/pandas/src/headers/portable.h b/pandas/src/headers/portable.h new file mode 100644 index 00000000..b9868276 --- /dev/null +++ b/pandas/src/headers/portable.h @@ -0,0 +1,8 @@ +#ifndef _PANDAS_PORTABLE_H_ +#define _PANDAS_PORTABLE_H_ + +#if defined(_MSC_VER) +#define strcasecmp( s1, s2 ) _stricmp( s1, s2 ) +#endif + +#endif diff --git a/pandas/src/headers/stdint.h b/pandas/src/headers/stdint.h new file mode 100644 index 00000000..b0fd235a --- /dev/null +++ b/pandas/src/headers/stdint.h @@ -0,0 +1,10 @@ +#ifndef _PANDAS_STDINT_H_ +#define _PANDAS_STDINT_H_ + +#if defined(_MSC_VER) +#include "ms_stdint.h" +#else +#include +#endif + +#endif diff --git a/pandas/src/helper.h b/pandas/src/helper.h new file mode 100644 index 00000000..e97e45f4 --- /dev/null +++ b/pandas/src/helper.h @@ -0,0 +1,16 @@ +#ifndef C_HELPER_H +#define C_HELPER_H + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +#endif diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx new file mode 100644 index 00000000..bd23135b --- /dev/null +++ b/pandas/src/inference.pyx @@ -0,0 +1,1124 @@ +cimport util +from tslib import NaT +from datetime import datetime, timedelta +iNaT = util.get_nat() + +# core.common import for fast inference checks +def is_float(object obj): + return util.is_float_object(obj) + +def is_integer(object obj): + return util.is_integer_object(obj) + +def is_bool(object obj): + return util.is_bool_object(obj) + +def is_complex(object obj): + return util.is_complex_object(obj) + +_TYPE_MAP = { + 'int8': 'integer', + 'int16': 'integer', + 'int32': 'integer', + 'int64': 'integer', + 'i' : 'integer', + 'uint8': 'integer', + 'uint16': 'integer', + 'uint32': 'integer', + 'uint64': 'integer', + 'u' : 'integer', + 'float32': 'floating', + 'float64': 'floating', + 'f' : 'floating', + 'complex128': 'complex', + 'c' : 'complex', + 'string': 'string', + 'S' : 'string', + 'unicode': 'unicode', + 'U' : 'unicode', + 'bool': 'boolean', + 'b' : 
'boolean', + 'datetime64[ns]' : 'datetime64', + 'M' : 'datetime64', + 'timedelta64[ns]' : 'timedelta64', + 'm' : 'timedelta64', +} + +# types only exist on certain platform +try: + np.float128 + _TYPE_MAP['float128'] = 'floating' +except AttributeError: + pass +try: + np.complex256 + _TYPE_MAP['complex256'] = 'complex' +except AttributeError: + pass +try: + np.float16 + _TYPE_MAP['float16'] = 'floating' +except AttributeError: + pass + +def infer_dtype(object _values): + cdef: + Py_ssize_t i, n + object val + ndarray values + + if isinstance(_values, np.ndarray): + values = _values + elif hasattr(_values,'values'): + values = _values.values + else: + if not isinstance(_values, list): + _values = list(_values) + values = list_to_object_array(_values) + + values = getattr(values, 'values', values) + + val_name = values.dtype.name + if val_name in _TYPE_MAP: + return _TYPE_MAP[val_name] + val_kind = values.dtype.kind + if val_kind in _TYPE_MAP: + return _TYPE_MAP[val_kind] + + if values.dtype != np.object_: + values = values.astype('O') + + n = len(values) + if n == 0: + return 'empty' + + # make contiguous + values = values.ravel() + + # try to use a valid value + for i in range(n): + val = util.get_value_1d(values, i) + if not is_null_datetimelike(val): + break + + if util.is_datetime64_object(val) or val is NaT: + if is_datetime64_array(values): + return 'datetime64' + elif is_timedelta_or_timedelta64_array(values): + return 'timedelta' + + elif util.is_integer_object(val): + # a timedelta will show true here as well + if is_timedelta(val): + if is_timedelta_or_timedelta64_array(values): + return 'timedelta' + + if is_integer_array(values): + return 'integer' + elif is_integer_float_array(values): + return 'mixed-integer-float' + elif is_timedelta_or_timedelta64_array(values): + return 'timedelta' + return 'mixed-integer' + + elif is_datetime(val): + if is_datetime_array(values): + return 'datetime' + + elif is_date(val): + if is_date_array(values): + return 'date' + + elif is_time(val): + if is_time_array(values): + return 'time' + + elif util.is_float_object(val): + if is_float_array(values): + return 'floating' + elif is_integer_float_array(values): + return 'mixed-integer-float' + + elif util.is_bool_object(val): + if is_bool_array(values): + return 'boolean' + + elif PyString_Check(val): + if is_string_array(values): + return 'string' + + elif PyUnicode_Check(val): + if is_unicode_array(values): + return 'unicode' + + elif is_timedelta(val): + if is_timedelta_or_timedelta64_array(values): + return 'timedelta' + + elif is_period(val): + if is_period_array(values): + return 'period' + + for i in range(n): + val = util.get_value_1d(values, i) + if util.is_integer_object(val): + return 'mixed-integer' + + return 'mixed' + +def infer_dtype_list(list values): + cdef: + Py_ssize_t i, n = len(values) + pass + + +def is_possible_datetimelike_array(object arr): + # determine if we have a possible datetimelike (or null-like) array + cdef: + Py_ssize_t i, n = len(arr) + bint seen_timedelta = 0, seen_datetime = 0 + object v + + for i in range(n): + v = arr[i] + if util.is_string_object(v): + continue + elif util._checknull(v): + continue + elif is_datetime(v): + seen_datetime=1 + elif is_timedelta(v): + seen_timedelta=1 + else: + return False + return seen_datetime or seen_timedelta + +cdef inline bint is_null_datetimelike(v): + # determine if we have a null for a timedelta/datetime (or integer versions)x + if util._checknull(v): + return True + elif util.is_timedelta64_object(v): + return 
v.view('int64') == iNaT + elif util.is_datetime64_object(v): + return v.view('int64') == iNaT + elif util.is_integer_object(v): + return v == iNaT + elif v is NaT: + return True + return False + +cdef inline bint is_datetime(object o): + return PyDateTime_Check(o) + +cdef inline bint is_date(object o): + return PyDate_Check(o) + +cdef inline bint is_time(object o): + return PyTime_Check(o) + +cdef inline bint is_timedelta(object o): + return PyDelta_Check(o) or util.is_timedelta64_object(o) + +def is_bool_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.bool_): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not util.is_bool_object(objbuf[i]): + return False + return True + else: + return False + +def is_integer(object o): + return util.is_integer_object(o) + +def is_integer_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.integer): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not util.is_integer_object(objbuf[i]): + return False + return True + else: + return False + +def is_integer_float_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.integer): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not (util.is_integer_object(objbuf[i]) or + util.is_float_object(objbuf[i])): + + return False + return True + else: + return False + +def is_float_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.floating): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not util.is_float_object(objbuf[i]): + return False + return True + else: + return False + +def is_string_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, (np.string_, np.unicode_)): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not PyString_Check(objbuf[i]): + return False + return True + else: + return False + +def is_unicode_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.unicode_): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not PyUnicode_Check(objbuf[i]): + return False + return True + else: + return False + + +def is_datetime_array(ndarray[object] values): + cdef int i, null_count = 0, n = len(values) + cdef object v + if n == 0: + return False + + # return False for all nulls + for i in range(n): + v = values[i] + if is_null_datetimelike(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not is_datetime(v): + return False + return null_count != n + +def is_datetime64_array(ndarray values): + cdef int i, null_count = 0, n = len(values) + cdef object v + if n == 0: + return False + + # return False for all nulls + for i in range(n): + v = values[i] + if is_null_datetimelike(v): + # we are a regular null + if 
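The is_*_array checkers in this file share one shape: accept the ndarray dtype outright, otherwise require every element of an object array to pass the scalar test, and reject empty arrays. A pure-Python rendering of that pattern (illustrative name):

import numpy as np

def is_integer_array_sketch(values):
    # Fast path: a genuinely integer-dtyped array qualifies immediately.
    values = np.asarray(values)
    if issubclass(values.dtype.type, np.integer):
        return True
    # Object arrays qualify only if every element is an integer scalar;
    # an empty array never qualifies.
    if values.dtype == np.object_:
        if len(values) == 0:
            return False
        return all(isinstance(v, (int, np.integer)) and not isinstance(v, bool)
                   for v in values)
    return False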
util._checknull(v): + null_count += 1 + elif not util.is_datetime64_object(v): + return False + return null_count != n + +def is_timedelta_array(ndarray values): + cdef int i, null_count = 0, n = len(values) + cdef object v + if n == 0: + return False + for i in range(n): + v = values[i] + if is_null_datetimelike(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not PyDelta_Check(v): + return False + return null_count != n + +def is_timedelta64_array(ndarray values): + cdef int i, null_count = 0, n = len(values) + cdef object v + if n == 0: + return False + for i in range(n): + v = values[i] + if is_null_datetimelike(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not util.is_timedelta64_object(v): + return False + return null_count != n + +def is_timedelta_or_timedelta64_array(ndarray values): + """ infer with timedeltas and/or nat/none """ + cdef int i, null_count = 0, n = len(values) + cdef object v + if n == 0: + return False + for i in range(n): + v = values[i] + if is_null_datetimelike(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not is_timedelta(v): + return False + return null_count != n + +def is_date_array(ndarray[object] values): + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not is_date(values[i]): + return False + return True + +def is_time_array(ndarray[object] values): + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not is_time(values[i]): + return False + return True + +def is_period(object o): + from pandas import Period + return isinstance(o,Period) + +def is_period_array(ndarray[object] values): + cdef int i, n = len(values) + from pandas.tseries.period import Period + + if n == 0: + return False + for i in range(n): + if not isinstance(values[i], Period): + return False + return True + + +cdef extern from "parse_helper.h": + inline int floatify(object, double *result) except -1 + +cdef double fINT64_MAX = INT64_MAX +cdef double fINT64_MIN = INT64_MIN + + +def maybe_convert_numeric(object[:] values, set na_values, + bint convert_empty=True, bint coerce_numeric=False): + ''' + Type inference function-- convert strings to numeric (potentially) and + convert to proper dtype array + ''' + cdef: + int status + Py_ssize_t i, n = values.size + ndarray[float64_t] floats = np.empty(n, dtype='f8') + ndarray[complex128_t] complexes = np.empty(n, dtype='c16') + ndarray[int64_t] ints = np.empty(n, dtype='i8') + ndarray[uint8_t] bools = np.empty(n, dtype='u1') + bint seen_float = False + bint seen_complex = False + bint seen_int = False + bint seen_bool = False + object val + float64_t fval + + for i in range(n): + val = values[i] + + if val in na_values: + floats[i] = complexes[i] = nan + seen_float = True + elif util.is_float_object(val): + floats[i] = complexes[i] = val + seen_float = True + elif util.is_integer_object(val): + floats[i] = ints[i] = val + seen_int = True + elif util.is_bool_object(val): + floats[i] = ints[i] = bools[i] = val + seen_bool = True + elif val is None: + floats[i] = complexes[i] = nan + seen_float = True + elif hasattr(val, '__len__') and len(val) == 0: + if convert_empty or coerce_numeric: + floats[i] = complexes[i] = nan + seen_float = True + else: + raise ValueError('Empty string encountered') + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = True + else: + try: + status = floatify(val, &fval) + floats[i] = fval + if not seen_float: + if '.' 
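The datetime/timedelta array checkers above add one more convention: nulls are tolerated inside the array, but an array consisting entirely of regular nulls is not reported as datetime-like (the null_count != n return). A simplified Python illustration of that rule (simplified null test, illustrative name):

import numpy as np
from datetime import datetime

def is_datetime_array_sketch(values):
    if len(values) == 0:
        return False
    null_count = 0
    for v in values:
        if v is None or (isinstance(v, float) and np.isnan(v)):
            null_count += 1            # tolerated, but counted
        elif not isinstance(v, datetime):
            return False
    return null_count != len(values)   # all-null arrays do not qualify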
in val or fval == INF or fval == NEGINF: + seen_float = True + elif 'inf' in val: # special case to handle +/-inf + seen_float = True + elif fval < fINT64_MAX and fval > fINT64_MIN: + try: + ints[i] = int(val) + except ValueError: + ints[i] = fval + else: + seen_float = True + except: + if not coerce_numeric: + raise + + floats[i] = nan + seen_float = True + + if seen_complex: + return complexes + elif seen_float: + return floats + elif seen_int: + return ints + elif seen_bool: + return bools.view(np.bool_) + return ints + + +def maybe_convert_objects(ndarray[object] objects, bint try_float=0, + bint safe=0, bint convert_datetime=0, + bint convert_timedelta=0): + ''' + Type inference function-- convert object array to proper dtype + ''' + cdef: + Py_ssize_t i, n + ndarray[float64_t] floats + ndarray[complex128_t] complexes + ndarray[int64_t] ints + ndarray[uint8_t] bools + ndarray[int64_t] idatetimes + ndarray[int64_t] itimedeltas + bint seen_float = 0 + bint seen_complex = 0 + bint seen_datetime = 0 + bint seen_timedelta = 0 + bint seen_int = 0 + bint seen_bool = 0 + bint seen_object = 0 + bint seen_null = 0 + bint seen_numeric = 0 + object val, onan + float64_t fval, fnan + + n = len(objects) + + floats = np.empty(n, dtype='f8') + complexes = np.empty(n, dtype='c16') + ints = np.empty(n, dtype='i8') + bools = np.empty(n, dtype=np.uint8) + + if convert_datetime: + datetimes = np.empty(n, dtype='M8[ns]') + idatetimes = datetimes.view(np.int64) + + if convert_timedelta: + timedeltas = np.empty(n, dtype='m8[ns]') + itimedeltas = timedeltas.view(np.int64) + + onan = np.nan + fnan = np.nan + + for i from 0 <= i < n: + val = objects[i] + + if val is None: + seen_null = 1 + floats[i] = complexes[i] = fnan + elif util.is_bool_object(val): + seen_bool = 1 + bools[i] = val + elif util.is_float_object(val): + floats[i] = complexes[i] = val + seen_float = 1 + elif util.is_datetime64_object(val): + if convert_datetime: + idatetimes[i] = convert_to_tsobject(val, None, None).value + seen_datetime = 1 + else: + seen_object = 1 + # objects[i] = val.astype('O') + break + elif is_timedelta(val): + if convert_timedelta: + itimedeltas[i] = convert_to_timedelta64(val, 'ns', False) + seen_timedelta = 1 + else: + seen_object = 1 + break + elif util.is_integer_object(val): + seen_int = 1 + floats[i] = val + complexes[i] = val + if not seen_null: + try: + ints[i] = val + except OverflowError: + seen_object = 1 + break + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = 1 + elif PyDateTime_Check(val) or util.is_datetime64_object(val): + if convert_datetime: + seen_datetime = 1 + idatetimes[i] = convert_to_tsobject(val, None, None).value + else: + seen_object = 1 + break + elif try_float and not util.is_string_object(val): + # this will convert Decimal objects + try: + floats[i] = float(val) + complexes[i] = complex(val) + seen_float = 1 + except Exception: + seen_object = 1 + break + else: + seen_object = 1 + break + + seen_numeric = seen_complex or seen_float or seen_int + + if not seen_object: + + if not safe: + if seen_null: + if not seen_bool and not seen_datetime and not seen_timedelta: + if seen_complex: + return complexes + elif seen_float or seen_int: + return floats + else: + if not seen_bool: + if seen_datetime: + if not seen_numeric: + return datetimes + elif seen_timedelta: + if not seen_numeric: + return timedeltas + else: + if seen_complex: + return complexes + elif seen_float: + return floats + elif seen_int: + return ints + elif not seen_datetime and not seen_numeric and not 
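maybe_convert_numeric above fills float/complex/int/bool buffers in parallel and returns the widest kind actually seen (complex, then float, then int, then bool). A rough pure-Python analogue of the float-versus-int decision, ignoring the complex and bool branches (the name and the simplified NA handling are illustrative):

import numpy as np

def convert_numeric_sketch(values, na_values=frozenset(['', 'NA'])):
    floats = np.empty(len(values), dtype='f8')
    seen_float = False
    for i, val in enumerate(values):
        if val is None or val in na_values:
            floats[i] = np.nan         # NA forces a float result
            seen_float = True
        else:
            floats[i] = float(val)
            if isinstance(val, float) or '.' in str(val) or 'inf' in str(val):
                seen_float = True
    return floats if seen_float else floats.astype('i8')

# convert_numeric_sketch(['1', '2', '3']) -> array([1, 2, 3])
# convert_numeric_sketch(['1.5', 'NA'])   -> array([1.5, nan])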
seen_timedelta: + return bools.view(np.bool_) + + else: + # don't cast int to float, etc. + if seen_null: + if not seen_bool and not seen_datetime and not seen_timedelta: + if seen_complex: + if not seen_int: + return complexes + elif seen_float: + if not seen_int: + return floats + else: + if not seen_bool: + if seen_datetime: + if not seen_numeric: + return datetimes + elif seen_timedelta: + if not seen_numeric: + return timedeltas + else: + if seen_complex: + if not seen_int: + return complexes + elif seen_float: + if not seen_int: + return floats + elif seen_int: + return ints + elif not seen_datetime and not seen_numeric and not seen_timedelta: + return bools.view(np.bool_) + + return objects + + +def convert_sql_column(x): + return maybe_convert_objects(x, try_float=1) + +def try_parse_dates(ndarray[object] values, parser=None, + dayfirst=False,default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + if default is None: # GH2618 + date=datetime.now() + default=datetime(date.year,date.month,1) + + try: + from dateutil.parser import parse + parse_date = lambda x: parse(x, dayfirst=dayfirst,default=default) + except ImportError: # pragma: no cover + def parse_date(s): + try: + return datetime.strptime(s, '%m/%d/%Y') + except Exception: + return s + # EAFP here + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values + else: + parse_date = parser + + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # raise if passed parser and it failed + raise + + return result + +def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, + date_parser=None, time_parser=None, + dayfirst=False,default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + from datetime import date, time, datetime, timedelta + + n = len(dates) + if len(times) != n: + raise ValueError('Length of dates and times must be equal') + result = np.empty(n, dtype='O') + + if date_parser is None: + if default is None: # GH2618 + date=datetime.now() + default=datetime(date.year,date.month,1) + + try: + from dateutil.parser import parse + parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) + except ImportError: # pragma: no cover + def parse_date(s): + try: + return date.strptime(s, '%m/%d/%Y') + except Exception: + return s + else: + parse_date = date_parser + + if time_parser is None: + try: + from dateutil.parser import parse + parse_time = lambda x: parse(x) + except ImportError: # pragma: no cover + def parse_time(s): + try: + return time.strptime(s, '%H:%M:%S') + except Exception: + return s + + else: + parse_time = time_parser + + for i from 0 <= i < n: + d = parse_date(str(dates[i])) + t = parse_time(str(times[i])) + result[i] = datetime(d.year, d.month, d.day, + t.hour, t.minute, t.second) + + return result + + +def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, + ndarray[object] days): + cdef: + Py_ssize_t i, n + ndarray[object] result + + from datetime import datetime + + n = len(years) + if len(months) != n or len(days) != n: + raise ValueError('Length of years/months/days must all be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) + + return result + +def 
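try_parse_dates above (GH2618) anchors dateutil's parser at the first day of the current month, so partial dates such as "June 2014" resolve to day 1 rather than inheriting today's day-of-month; dateutil is optional, and the code falls back to a strptime-based parser when it is missing. For example, assuming dateutil is installed:

from datetime import datetime
from dateutil.parser import parse

now = datetime.now()
default = datetime(now.year, now.month, 1)   # same default try_parse_dates builds
parse("June 2014", default=default)          # -> datetime(2014, 6, 1, 0, 0)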
try_parse_datetime_components(ndarray[object] years, + ndarray[object] months, + ndarray[object] days, + ndarray[object] hours, + ndarray[object] minutes, + ndarray[object] seconds): + + cdef: + Py_ssize_t i, n + ndarray[object] result + int secs + double float_secs + double micros + + from datetime import datetime + + n = len(years) + if (len(months) != n or len(days) != n or len(hours) != n or + len(minutes) != n or len(seconds) != n): + raise ValueError('Length of all datetime components must be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + float_secs = float(seconds[i]) + secs = int(float_secs) + + micros = float_secs - secs + if micros > 0: + micros = micros * 1000000 + + result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), + int(hours[i]), int(minutes[i]), secs, + int(micros)) + + return result + +def sanitize_objects(ndarray[object] values, set na_values, + convert_empty=True): + cdef: + Py_ssize_t i, n + object val, onan + Py_ssize_t na_count = 0 + dict memo = {} + + n = len(values) + onan = np.nan + + for i from 0 <= i < n: + val = values[i] + if (convert_empty and val == '') or (val in na_values): + values[i] = onan + na_count += 1 + elif val in memo: + values[i] = memo[val] + else: + memo[val] = val + + return na_count + +def maybe_convert_bool(ndarray[object] arr, + true_values=None, false_values=None): + cdef: + Py_ssize_t i, n + ndarray[uint8_t] result + object val + set true_vals, false_vals + int na_count = 0 + + n = len(arr) + result = np.empty(n, dtype=np.uint8) + + # the defaults + true_vals = set(('True', 'TRUE', 'true')) + false_vals = set(('False', 'FALSE', 'false')) + + if true_values is not None: + true_vals = true_vals | set(true_values) + + if false_values is not None: + false_vals = false_vals | set(false_values) + + for i from 0 <= i < n: + val = arr[i] + + if cpython.PyBool_Check(val): + if val is True: + result[i] = 1 + else: + result[i] = 0 + elif val in true_vals: + result[i] = 1 + elif val in false_vals: + result[i] = 0 + elif PyFloat_Check(val): + result[i] = UINT8_MAX + na_count += 1 + else: + return arr + + if na_count > 0: + mask = result == UINT8_MAX + arr = result.view(np.bool_).astype(object) + np.putmask(arr, mask, np.nan) + return arr + else: + return result.view(np.bool_) + + +def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, + bint convert=1): + ''' + Substitute for np.vectorize with pandas-friendly dtype inference + + Parameters + ---------- + arr : ndarray + f : function + + Returns + ------- + mapped : ndarray + ''' + cdef: + Py_ssize_t i, n + ndarray[object] result + object val + + n = len(arr) + result = np.empty(n, dtype=object) + for i in range(n): + if mask[i]: + val = util.get_value_at(arr, i) + else: + val = f(util.get_value_at(arr, i)) + + # unbox 0-dim arrays, GH #690 + if is_array(val) and PyArray_NDIM(val) == 0: + # is there a faster way to unbox? 
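maybe_convert_bool above converts only when every element is a recognised boolean token (or a float NA); a single unrecognised value makes it hand the original array back untouched. A Python mirror of that all-or-nothing rule (illustrative name):

import numpy as np

def maybe_convert_bool_sketch(arr, true_values=(), false_values=()):
    true_vals = {'True', 'TRUE', 'true'} | set(true_values)
    false_vals = {'False', 'FALSE', 'false'} | set(false_values)
    out, has_na = [], False
    for val in arr:
        if isinstance(val, bool):
            out.append(val)
        elif val in true_vals:
            out.append(True)
        elif val in false_vals:
            out.append(False)
        elif isinstance(val, float):
            out.append(np.nan)         # NA: keep it, but result stays object dtype
            has_na = True
        else:
            return arr                 # not convertible: return the input unchanged
    return np.array(out, dtype=object if has_na else bool)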
+ val = val.item() + + result[i] = val + + if convert: + return maybe_convert_objects(result, + try_float=0, + convert_datetime=0, + convert_timedelta=0) + + return result + +def map_infer(ndarray arr, object f, bint convert=1): + ''' + Substitute for np.vectorize with pandas-friendly dtype inference + + Parameters + ---------- + arr : ndarray + f : function + + Returns + ------- + mapped : ndarray + ''' + cdef: + Py_ssize_t i, n + ndarray[object] result + object val + + n = len(arr) + result = np.empty(n, dtype=object) + for i in range(n): + val = f(util.get_value_at(arr, i)) + + # unbox 0-dim arrays, GH #690 + if is_array(val) and PyArray_NDIM(val) == 0: + # is there a faster way to unbox? + val = val.item() + + result[i] = val + + if convert: + return maybe_convert_objects(result, + try_float=0, + convert_datetime=0, + convert_timedelta=0) + + return result + + +def to_object_array(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + list row + + n = len(rows) + + k = 0 + for i from 0 <= i < n: + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + for i from 0 <= i < n: + row = rows[i] + + for j from 0 <= j < len(row): + result[i, j] = row[j] + + return result + +def tuples_to_object_array(ndarray[object] tuples): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + tuple tup + + n = len(tuples) + k = len(tuples[0]) + result = np.empty((n, k), dtype=object) + for i in range(n): + tup = tuples[i] + for j in range(k): + result[i, j] = tup[j] + + return result + +def to_object_array_tuples(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + tuple row + + n = len(rows) + + k = 0 + for i from 0 <= i < n: + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + try: + for i in range(n): + row = rows[i] + for j from 0 <= j < len(row): + result[i, j] = row[j] + except Exception: + # upcast any subclasses to tuple + for i in range(n): + row = tuple(rows[i]) + for j from 0 <= j < len(row): + result[i, j] = row[j] + + return result + + +def fast_multiget(dict mapping, ndarray keys, default=np.nan): + cdef: + Py_ssize_t i, n = len(keys) + object val + ndarray[object] output = np.empty(n, dtype='O') + + if n == 0: + # kludge, for Series + return np.empty(0, dtype='f8') + + keys = getattr(keys, 'values', keys) + + for i in range(n): + val = util.get_value_1d(keys, i) + if val in mapping: + output[i] = mapping[val] + else: + output[i] = default + + return maybe_convert_objects(output) diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx new file mode 100644 index 00000000..91102a2f --- /dev/null +++ b/pandas/src/join.pyx @@ -0,0 +1,241 @@ +def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, + Py_ssize_t max_groups): + cdef: + Py_ssize_t i, j, k, count = 0 + ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc + + # NA group in location 0 + + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + count += lc * rc + + # group 0 is the NA group + cdef: + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] 
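inner_join, started above, works on integer group codes: it group-sorts both key arrays, makes one pass over the group counts to size the output, then a second pass writing the cartesian product of matching rows per group. A conceptual, hash-based Python version showing the pairs it produces; the Cython code additionally orders the output by group and skips the NA group:

import numpy as np
from collections import defaultdict

def inner_join_sketch(left, right):
    right_pos = defaultdict(list)
    for j, key in enumerate(right):
        right_pos[key].append(j)
    left_indexer, right_indexer = [], []
    for i, key in enumerate(left):
        for j in right_pos.get(key, ()):          # cartesian product per key
            left_indexer.append(i)
            right_indexer.append(j)
    return (np.array(left_indexer, dtype=np.int64),
            np.array(right_indexer, dtype=np.int64))

# inner_join_sketch([0, 1, 1], [1, 1, 2]) -> (array([1, 1, 2, 2]), array([0, 1, 0, 1]))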
+ + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + return (_get_result_indexer(left_sorter, left_indexer), + _get_result_indexer(right_sorter, right_indexer)) + +def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, + Py_ssize_t max_groups, sort=True): + cdef: + Py_ssize_t i, j, k, count = 0 + ndarray[int64_t] left_count, right_count + ndarray left_sorter, right_sorter, rev + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc + + # NA group in location 0 + + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + if right_count[i] > 0: + count += left_count[i] * right_count[i] + else: + count += left_count[i] + + # group 0 is the NA group + cdef: + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + right_indexer[position + j] = -1 + position += lc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + left_indexer = _get_result_indexer(left_sorter, left_indexer) + right_indexer = _get_result_indexer(right_sorter, right_indexer) + + if not sort: + if left_sorter.dtype != np.int_: + left_sorter = left_sorter.astype(np.int_) + + rev = np.empty(len(left), dtype=np.int_) + rev.put(left_sorter, np.arange(len(left))) + + right_indexer = right_indexer.take(rev) + left_indexer = left_indexer.take(rev) + + return left_indexer, right_indexer + + + +def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, + Py_ssize_t max_groups): + cdef: + Py_ssize_t i, j, k, count = 0 + ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc + + # NA group in location 0 + + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + count += lc * rc + else: + count += lc + rc + + # group 0 is the NA group + cdef: + int64_t left_pos = 0, right_pos = 0 + Py_ssize_t offset, position = 0 + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + right_indexer[position + j] = -1 + position += 
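left_outer_join and full_outer_join differ from inner_join only in the empty-count branches: a left row whose key has no match on the right is still emitted, with -1 in the right indexer (and symmetrically for the full join). A conceptual Python version of the left-outer case (illustrative and unsorted, like the sketch above):

import numpy as np
from collections import defaultdict

def left_outer_join_sketch(left, right):
    right_pos = defaultdict(list)
    for j, key in enumerate(right):
        right_pos[key].append(j)
    left_indexer, right_indexer = [], []
    for i, key in enumerate(left):
        for j in right_pos.get(key, [-1]):        # no match: emit the -1 marker
            left_indexer.append(i)
            right_indexer.append(j)
    return (np.array(left_indexer, dtype=np.int64),
            np.array(right_indexer, dtype=np.int64))

# left_outer_join_sketch([0, 1], [1, 1]) -> (array([0, 1, 1]), array([-1, 0, 1]))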
lc + elif lc == 0: + for j in range(rc): + left_indexer[position + j] = -1 + right_indexer[position + j] = right_pos + j + position += rc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + return (_get_result_indexer(left_sorter, left_indexer), + _get_result_indexer(right_sorter, right_indexer)) + + + +def _get_result_indexer(sorter, indexer): + if indexer.dtype != np.int_: + indexer = indexer.astype(np.int_) + if len(sorter) > 0: + res = sorter.take(indexer) + np.putmask(res, indexer == -1, -1) + else: + # length-0 case + res = np.empty(len(indexer), dtype=np.int64) + res.fill(-1) + + return res + + + +def ffill_indexer(ndarray[int64_t] indexer): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] result + int64_t val, last_obs + + result = np.empty(n, dtype=np.int64) + last_obs = -1 + + for i in range(n): + val = indexer[i] + if val == -1: + result[i] = last_obs + else: + result[i] = val + last_obs = val + + return result + + +def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, + int64_t max_group): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] result, last_obs + int64_t gid, val + + result = np.empty(n, dtype=np.int64) + + last_obs = np.empty(max_group, dtype=np.int64) + last_obs.fill(-1) + + for i in range(n): + gid = group_ids[i] + val = indexer[i] + if val == -1: + result[i] = last_obs[gid] + else: + result[i] = val + last_obs[gid] = val + + return result + diff --git a/pandas/src/khash.pxd b/pandas/src/khash.pxd new file mode 100644 index 00000000..a8fd51a6 --- /dev/null +++ b/pandas/src/khash.pxd @@ -0,0 +1,124 @@ +from cpython cimport PyObject +from numpy cimport int64_t, int32_t, uint32_t, float64_t + +cdef extern from "khash_python.h": + ctypedef uint32_t khint_t + ctypedef khint_t khiter_t + + ctypedef struct kh_pymap_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + PyObject **keys + size_t *vals + + inline kh_pymap_t* kh_init_pymap() + inline void kh_destroy_pymap(kh_pymap_t*) + inline void kh_clear_pymap(kh_pymap_t*) + inline khint_t kh_get_pymap(kh_pymap_t*, PyObject*) + inline void kh_resize_pymap(kh_pymap_t*, khint_t) + inline khint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*) + inline void kh_del_pymap(kh_pymap_t*, khint_t) + + bint kh_exist_pymap(kh_pymap_t*, khiter_t) + + ctypedef struct kh_pyset_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + PyObject **keys + size_t *vals + + inline kh_pyset_t* kh_init_pyset() + inline void kh_destroy_pyset(kh_pyset_t*) + inline void kh_clear_pyset(kh_pyset_t*) + inline khint_t kh_get_pyset(kh_pyset_t*, PyObject*) + inline void kh_resize_pyset(kh_pyset_t*, khint_t) + inline khint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*) + inline void kh_del_pyset(kh_pyset_t*, khint_t) + + bint kh_exist_pyset(kh_pyset_t*, khiter_t) + + ctypedef char* kh_cstr_t + + ctypedef struct kh_str_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + kh_cstr_t *keys + size_t *vals + + inline kh_str_t* kh_init_str() + inline void kh_destroy_str(kh_str_t*) + inline void kh_clear_str(kh_str_t*) + inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) + inline void kh_resize_str(kh_str_t*, khint_t) + inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) + inline void kh_del_str(kh_str_t*, khint_t) + + bint kh_exist_str(kh_str_t*, khiter_t) + + + ctypedef struct kh_int64_t: + khint_t 
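ffill_indexer and ffill_by_group above forward-fill the -1 entries of a take-indexer from the last observed position, the grouped variant keeping one last-observation slot per group. A direct Python mirror of the grouped version (illustrative name):

import numpy as np

def ffill_by_group_sketch(indexer, group_ids, max_group):
    last_obs = np.full(max_group, -1, dtype=np.int64)   # per-group last value
    result = np.empty(len(indexer), dtype=np.int64)
    for i, (val, gid) in enumerate(zip(indexer, group_ids)):
        if val == -1:
            result[i] = last_obs[gid]     # fill from this group's history
        else:
            result[i] = val
            last_obs[gid] = val
    return result

# ffill_by_group_sketch([0, -1, 2, -1], [0, 0, 1, 1], 2) -> array([0, 0, 2, 2])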
n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int64_t *keys + size_t *vals + + inline kh_int64_t* kh_init_int64() + inline void kh_destroy_int64(kh_int64_t*) + inline void kh_clear_int64(kh_int64_t*) + inline khint_t kh_get_int64(kh_int64_t*, int64_t) + inline void kh_resize_int64(kh_int64_t*, khint_t) + inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*) + inline void kh_del_int64(kh_int64_t*, khint_t) + + bint kh_exist_int64(kh_int64_t*, khiter_t) + + ctypedef struct kh_float64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + float64_t *keys + size_t *vals + + inline kh_float64_t* kh_init_float64() + inline void kh_destroy_float64(kh_float64_t*) + inline void kh_clear_float64(kh_float64_t*) + inline khint_t kh_get_float64(kh_float64_t*, float64_t) + inline void kh_resize_float64(kh_float64_t*, khint_t) + inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*) + inline void kh_del_float64(kh_float64_t*, khint_t) + + bint kh_exist_float64(kh_float64_t*, khiter_t) + + ctypedef struct kh_int32_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int32_t *keys + size_t *vals + + inline kh_int32_t* kh_init_int32() + inline void kh_destroy_int32(kh_int32_t*) + inline void kh_clear_int32(kh_int32_t*) + inline khint_t kh_get_int32(kh_int32_t*, int32_t) + inline void kh_resize_int32(kh_int32_t*, khint_t) + inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*) + inline void kh_del_int32(kh_int32_t*, khint_t) + + bint kh_exist_int32(kh_int32_t*, khiter_t) + + # sweep factorize + + ctypedef struct kh_strbox_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + kh_cstr_t *keys + PyObject **vals + + inline kh_strbox_t* kh_init_strbox() + inline void kh_destroy_strbox(kh_strbox_t*) + inline void kh_clear_strbox(kh_strbox_t*) + inline khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) + inline void kh_resize_strbox(kh_strbox_t*, khint_t) + inline khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) + inline void kh_del_strbox(kh_strbox_t*, khint_t) + + bint kh_exist_strbox(kh_strbox_t*, khiter_t) + diff --git a/pandas/src/klib/khash.h b/pandas/src/klib/khash.h new file mode 100644 index 00000000..4350ff06 --- /dev/null +++ b/pandas/src/klib/khash.h @@ -0,0 +1,578 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.6" + +#include +#include +#include + + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khuint64_t; +typedef signed long khint64_t; +#else +typedef unsigned long long khuint64_t; +typedef signed long long khint64_t; +#endif + +typedef double khfloat64_t; + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1) +#define __ac_isdel(flag, i) (0) +#define __ac_iseither(flag, i) __ac_isempty(flag, i) +#define __ac_set_isdel_false(flag, i) (0) +#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU))) +#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU))) +#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) +#define __ac_set_isdel_true(flag, i) (0) + +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#endif + +#define __ac_fsize(m) ((m) < 32? 
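The __ac_* macros above keep one "empty" bit per bucket packed into 32-bit flag words (this variant never marks buckets as deleted), and __ac_inc derives an odd probe step from the hash so that double hashing visits every slot of the power-of-two table. A small Python rendering of those bit tricks, with flags modelled as a list of 32-bit ints (illustrative names):

def is_empty(flags, i):
    return (flags[i >> 5] >> (i & 0x1F)) & 1        # bit set == bucket empty

def set_not_empty(flags, i):
    flags[i >> 5] &= ~(1 << (i & 0x1F))             # clear the empty bit

def probe_increment(k, mask):
    return (((k >> 3) ^ (k << 3)) | 1) & mask       # odd step for a 2**n table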
1 : (m)>>5) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + inc) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
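kroundup32 rounds the requested bucket count up to the next power of two, and kh_get then probes from hash & mask in steps of the odd increment until it hits the key or an empty slot. Pure-Python mirrors of both, with the flag words simplified to a per-bucket boolean list (illustrative names):

def kroundup32(x):
    x -= 1
    x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16
    return x + 1                         # e.g. kroundup32(5) == 8

def kh_get_sketch(keys, empty, key, hash_func):
    n_buckets = len(keys)                # assumed to be a power of two
    mask = n_buckets - 1
    k = hash_func(key)
    i = k & mask
    inc = (((k >> 3) ^ (k << 3)) | 1) & mask
    last = i
    while not empty[i] and keys[i] != key:
        i = (i + inc) & mask
        if i == last:
            return n_buckets             # wrapped around: key not present
    return n_buckets if empty[i] else i  # n_buckets plays the role of kh_end()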
*/ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isempty_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + inc = __ac_inc(k, mask); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + inc) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + 
__ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) + +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static PANDAS_INLINE khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name(void) + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. 
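The string hash above is the classic X31 hash (h = 31*h + c) and the 64-bit key hash folds the value down to 32 bits with shifts and xors. Python equivalents, masked to 32 bits the way khint_t is (illustrative names):

def x31_hash(s):
    h = 0
    for ch in s.encode():
        h = ((h << 5) - h + ch) & 0xFFFFFFFF     # (h << 5) - h == 31 * h
    return h

def int64_hash(key):
    return ((key >> 33) ^ key ^ (key << 11)) & 0xFFFFFFFF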
+ @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_UINT64(name) \ + KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_UINT64(name, khval_t) \ + KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + + +#define kh_exist_str(h, k) (kh_exist(h, k)) +#define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_int32(h, k) (kh_exist(h, k)) + +KHASH_MAP_INIT_STR(str, size_t) +KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_INT64(int64, size_t) + + +#endif /* __AC_KHASH_H */ diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h new file mode 100644 index 00000000..d3ef48de --- /dev/null +++ b/pandas/src/klib/khash_python.h @@ -0,0 +1,49 @@ +#include + +#include "khash.h" + +// kludge + +#define kh_float64_hash_func _Py_HashDouble +#define kh_float64_hash_equal kh_int64_hash_equal + +#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) + +KHASH_MAP_INIT_FLOAT64(float64, size_t) + + +int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { + int result = PyObject_RichCompareBool(a, b, Py_EQ); + if (result < 0) { + PyErr_Clear(); + return 0; + } + return result; +} + + +#define kh_python_hash_func(key) (PyObject_Hash(key)) +#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) + + +// Python object + +typedef PyObject* kh_pyobject_t; + +#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \ + KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \ + kh_python_hash_func, kh_python_hash_equal) + +KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t) + +#define KHASH_SET_INIT_PYOBJECT(name) \ + KHASH_INIT(name, kh_pyobject_t, char, 0, \ + kh_python_hash_func, kh_python_hash_equal) + +KHASH_SET_INIT_PYOBJECT(pyset) + +#define kh_exist_pymap(h, k) (kh_exist(h, k)) +#define kh_exist_pyset(h, k) (kh_exist(h, k)) + +KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) diff --git a/pandas/src/klib/ktypes.h b/pandas/src/klib/ktypes.h new file mode 100644 index 00000000..981f1737 --- /dev/null +++ b/pandas/src/klib/ktypes.h @@ -0,0 +1,6 @@ +#ifndef __KTYPES_H +#define __KTYPES_H + +/* compipler specific configuration */ + +#endif /* __KTYPES_H */ diff --git a/pandas/src/klib/kvec.h b/pandas/src/klib/kvec.h new file mode 100644 index 00000000..032962e5 --- /dev/null +++ b/pandas/src/klib/kvec.h @@ -0,0 +1,151 @@ +/* The MIT License 
+ + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. + +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include +#include +#include + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) + +#define kv_copy(type, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, v, x) do { \ + if ((v)->n == (v)->m) { \ + (v)->m = (v)->m? (v)->m<<1 : 2; \ + (v)->a = (type*)realloc((v)->a, sizeof(type) * (v)->m); \ + } \ + (v)->a[(v)->n++] = (x); \ + } while (0) + +#define kv_pushp(type, v) (((v).n == (v).m)? \ + ((v).m = ((v).m? (v).m<<1 : 2), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : 0), ((v).a + ((v).n++)) + +#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ + ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : (v).n <= (size_t)(i)? (v).n = (i) \ + : 0), (v).a[(i)] + +// #define kv_int64_push(v, x) (kv_push(int64_t, (v), (x))) + +typedef struct { + size_t n, m; + int64_t* a; +} kv_int64_t; + +typedef struct { + size_t n, m; + double* a; +} kv_double; + +typedef struct { + size_t n, m; + PyObject** a; +} kv_object_t; + +void PANDAS_INLINE kv_object_push(kv_object_t *v, PyObject *x) { + do { + if (v->n == v->m) { + v->m = v->m? 
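kv_push above grows the backing array geometrically: capacity starts at 2 and doubles whenever it is exhausted, so pushes are amortised O(1). A Python sketch of that policy with the vector modelled as a dict of n (length), m (capacity) and a (storage); names are illustrative:

def kv_push_sketch(vec, x):
    if vec['n'] == vec['m']:
        vec['m'] = vec['m'] * 2 if vec['m'] else 2            # 0 -> 2 -> 4 -> ...
        vec['a'].extend([None] * (vec['m'] - len(vec['a'])))  # stand-in for realloc
    vec['a'][vec['n']] = x
    vec['n'] += 1

# v = {'n': 0, 'm': 0, 'a': []}
# for i in range(5): kv_push_sketch(v, i)    # capacity grows 2, 4, 8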
v->m<<1 : 2; + v->a = (PyObject**)realloc(v->a, sizeof(PyObject*) * v->m); + } + v->a[v->n++] = x; + } while (0); + // kv_push(PyObject*, v, x); + Py_INCREF(x); +} + +void PANDAS_INLINE kv_int64_push(kv_int64_t *v, int64_t x) { + kv_push(int64_t, v, x); +} + +void PANDAS_INLINE kv_double_push(kv_double *v, double x) { + kv_push(double, v, x); +} + +void PANDAS_INLINE kv_object_destroy(kv_object_t *v) { + int i; + for (i = 0; i < v->n; ++i) + { + Py_XDECREF(v->a[i]); + } + free(v->a); +} + + +#endif diff --git a/pandas/src/msgpack/pack.h b/pandas/src/msgpack/pack.h new file mode 100644 index 00000000..bb939d93 --- /dev/null +++ b/pandas/src/msgpack/pack.h @@ -0,0 +1,108 @@ +/* + * MessagePack for Python packing routine + * + * Copyright (C) 2009 Naoki INADA + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "sysdep.h" +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _MSC_VER +#define inline __inline +#endif + +typedef struct msgpack_packer { + char *buf; + size_t length; + size_t buf_size; +} msgpack_packer; + +typedef struct Packer Packer; + +static inline int msgpack_pack_short(msgpack_packer* pk, short d); +static inline int msgpack_pack_int(msgpack_packer* pk, int d); +static inline int msgpack_pack_long(msgpack_packer* pk, long d); +static inline int msgpack_pack_long_long(msgpack_packer* pk, long long d); +static inline int msgpack_pack_unsigned_short(msgpack_packer* pk, unsigned short d); +static inline int msgpack_pack_unsigned_int(msgpack_packer* pk, unsigned int d); +static inline int msgpack_pack_unsigned_long(msgpack_packer* pk, unsigned long d); +static inline int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d); + +static inline int msgpack_pack_uint8(msgpack_packer* pk, uint8_t d); +static inline int msgpack_pack_uint16(msgpack_packer* pk, uint16_t d); +static inline int msgpack_pack_uint32(msgpack_packer* pk, uint32_t d); +static inline int msgpack_pack_uint64(msgpack_packer* pk, uint64_t d); +static inline int msgpack_pack_int8(msgpack_packer* pk, int8_t d); +static inline int msgpack_pack_int16(msgpack_packer* pk, int16_t d); +static inline int msgpack_pack_int32(msgpack_packer* pk, int32_t d); +static inline int msgpack_pack_int64(msgpack_packer* pk, int64_t d); + +static inline int msgpack_pack_float(msgpack_packer* pk, float d); +static inline int msgpack_pack_double(msgpack_packer* pk, double d); + +static inline int msgpack_pack_nil(msgpack_packer* pk); +static inline int msgpack_pack_true(msgpack_packer* pk); +static inline int msgpack_pack_false(msgpack_packer* pk); + +static inline int msgpack_pack_array(msgpack_packer* pk, unsigned int n); + +static inline int msgpack_pack_map(msgpack_packer* pk, unsigned int n); + +static inline int msgpack_pack_raw(msgpack_packer* pk, size_t l); +static inline int msgpack_pack_raw_body(msgpack_packer* pk, const void* b, size_t l); + +static inline int msgpack_pack_write(msgpack_packer* pk, const char *data, size_t l) +{ + char* buf = 
pk->buf; + size_t bs = pk->buf_size; + size_t len = pk->length; + + if (len + l > bs) { + bs = (len + l) * 2; + buf = (char*)realloc(buf, bs); + if (!buf) return -1; + } + memcpy(buf + len, data, l); + len += l; + + pk->buf = buf; + pk->buf_size = bs; + pk->length = len; + return 0; +} + +#define msgpack_pack_inline_func(name) \ + static inline int msgpack_pack ## name + +#define msgpack_pack_inline_func_cint(name) \ + static inline int msgpack_pack ## name + +#define msgpack_pack_user msgpack_packer* + +#define msgpack_pack_append_buffer(user, buf, len) \ + return msgpack_pack_write(user, (const char*)buf, len) + +#include "pack_template.h" + +#ifdef __cplusplus +} +#endif diff --git a/pandas/src/msgpack/pack_template.h b/pandas/src/msgpack/pack_template.h new file mode 100644 index 00000000..65c959dd --- /dev/null +++ b/pandas/src/msgpack/pack_template.h @@ -0,0 +1,771 @@ +/* + * MessagePack packing routine template + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined(__LITTLE_ENDIAN__) +#define TAKE8_8(d) ((uint8_t*)&d)[0] +#define TAKE8_16(d) ((uint8_t*)&d)[0] +#define TAKE8_32(d) ((uint8_t*)&d)[0] +#define TAKE8_64(d) ((uint8_t*)&d)[0] +#elif defined(__BIG_ENDIAN__) +#define TAKE8_8(d) ((uint8_t*)&d)[0] +#define TAKE8_16(d) ((uint8_t*)&d)[1] +#define TAKE8_32(d) ((uint8_t*)&d)[3] +#define TAKE8_64(d) ((uint8_t*)&d)[7] +#endif + +#ifndef msgpack_pack_inline_func +#error msgpack_pack_inline_func template is not defined +#endif + +#ifndef msgpack_pack_user +#error msgpack_pack_user type is not defined +#endif + +#ifndef msgpack_pack_append_buffer +#error msgpack_pack_append_buffer callback is not defined +#endif + + +/* + * Integer + */ + +#define msgpack_pack_real_uint8(x, d) \ +do { \ + if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \ + } else { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_8(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ +} while(0) + +#define msgpack_pack_real_uint16(x, d) \ +do { \ + if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \ + } else if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } \ +} while(0) + +#define msgpack_pack_real_uint32(x, d) \ +do { \ + if(d < (1<<8)) { \ + if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \ + } else { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else { \ + if(d < (1<<16)) { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; 
_msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_uint64(x, d) \ +do { \ + if(d < (1ULL<<8)) { \ + if(d < (1ULL<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \ + } else { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else { \ + if(d < (1ULL<<16)) { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else if(d < (1ULL<<32)) { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } else { \ + /* unsigned 64 */ \ + unsigned char buf[9]; \ + buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \ + msgpack_pack_append_buffer(x, buf, 9); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_int8(x, d) \ +do { \ + if(d < -(1<<5)) { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_8(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \ + } \ +} while(0) + +#define msgpack_pack_real_int16(x, d) \ +do { \ + if(d < -(1<<5)) { \ + if(d < -(1<<7)) { \ + /* signed 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_16(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \ + } else { \ + if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_int32(x, d) \ +do { \ + if(d < -(1<<5)) { \ + if(d < -(1<<15)) { \ + /* signed 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xd2; _msgpack_store32(&buf[1], (int32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } else if(d < -(1<<7)) { \ + /* signed 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_32(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } else if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \ + } else { \ + if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else if(d < (1<<16)) { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } \ + } \ +} while(0) + +#define msgpack_pack_real_int64(x, d) \ +do { \ + if(d < -(1LL<<5)) { \ + if(d < -(1LL<<15)) { \ + if(d < -(1LL<<31)) { \ + /* signed 64 */ \ + unsigned char buf[9]; \ + buf[0] = 0xd3; _msgpack_store64(&buf[1], d); \ + msgpack_pack_append_buffer(x, buf, 9); \ + } else { \ + /* signed 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xd2; 
_msgpack_store32(&buf[1], (int32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } \ + } else { \ + if(d < -(1<<7)) { \ + /* signed 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } else { \ + /* signed 8 */ \ + unsigned char buf[2] = {0xd0, TAKE8_64(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } \ + } \ + } else if(d < (1<<7)) { \ + /* fixnum */ \ + msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \ + } else { \ + if(d < (1LL<<16)) { \ + if(d < (1<<8)) { \ + /* unsigned 8 */ \ + unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \ + msgpack_pack_append_buffer(x, buf, 2); \ + } else { \ + /* unsigned 16 */ \ + unsigned char buf[3]; \ + buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ + msgpack_pack_append_buffer(x, buf, 3); \ + } \ + } else { \ + if(d < (1LL<<32)) { \ + /* unsigned 32 */ \ + unsigned char buf[5]; \ + buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ + msgpack_pack_append_buffer(x, buf, 5); \ + } else { \ + /* unsigned 64 */ \ + unsigned char buf[9]; \ + buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \ + msgpack_pack_append_buffer(x, buf, 9); \ + } \ + } \ + } \ +} while(0) + + +#ifdef msgpack_pack_inline_func_fixint + +msgpack_pack_inline_func_fixint(_uint8)(msgpack_pack_user x, uint8_t d) +{ + unsigned char buf[2] = {0xcc, TAKE8_8(d)}; + msgpack_pack_append_buffer(x, buf, 2); +} + +msgpack_pack_inline_func_fixint(_uint16)(msgpack_pack_user x, uint16_t d) +{ + unsigned char buf[3]; + buf[0] = 0xcd; _msgpack_store16(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 3); +} + +msgpack_pack_inline_func_fixint(_uint32)(msgpack_pack_user x, uint32_t d) +{ + unsigned char buf[5]; + buf[0] = 0xce; _msgpack_store32(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 5); +} + +msgpack_pack_inline_func_fixint(_uint64)(msgpack_pack_user x, uint64_t d) +{ + unsigned char buf[9]; + buf[0] = 0xcf; _msgpack_store64(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 9); +} + +msgpack_pack_inline_func_fixint(_int8)(msgpack_pack_user x, int8_t d) +{ + unsigned char buf[2] = {0xd0, TAKE8_8(d)}; + msgpack_pack_append_buffer(x, buf, 2); +} + +msgpack_pack_inline_func_fixint(_int16)(msgpack_pack_user x, int16_t d) +{ + unsigned char buf[3]; + buf[0] = 0xd1; _msgpack_store16(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 3); +} + +msgpack_pack_inline_func_fixint(_int32)(msgpack_pack_user x, int32_t d) +{ + unsigned char buf[5]; + buf[0] = 0xd2; _msgpack_store32(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 5); +} + +msgpack_pack_inline_func_fixint(_int64)(msgpack_pack_user x, int64_t d) +{ + unsigned char buf[9]; + buf[0] = 0xd3; _msgpack_store64(&buf[1], d); + msgpack_pack_append_buffer(x, buf, 9); +} + +#undef msgpack_pack_inline_func_fixint +#endif + + +msgpack_pack_inline_func(_uint8)(msgpack_pack_user x, uint8_t d) +{ + msgpack_pack_real_uint8(x, d); +} + +msgpack_pack_inline_func(_uint16)(msgpack_pack_user x, uint16_t d) +{ + msgpack_pack_real_uint16(x, d); +} + +msgpack_pack_inline_func(_uint32)(msgpack_pack_user x, uint32_t d) +{ + msgpack_pack_real_uint32(x, d); +} + +msgpack_pack_inline_func(_uint64)(msgpack_pack_user x, uint64_t d) +{ + msgpack_pack_real_uint64(x, d); +} + +msgpack_pack_inline_func(_int8)(msgpack_pack_user x, int8_t d) +{ + msgpack_pack_real_int8(x, d); +} + +msgpack_pack_inline_func(_int16)(msgpack_pack_user x, int16_t d) +{ + msgpack_pack_real_int16(x, d); +} + +msgpack_pack_inline_func(_int32)(msgpack_pack_user x, int32_t d) +{ + msgpack_pack_real_int32(x, 
d); +} + +msgpack_pack_inline_func(_int64)(msgpack_pack_user x, int64_t d) +{ + msgpack_pack_real_int64(x, d); +} + + +#ifdef msgpack_pack_inline_func_cint + +msgpack_pack_inline_func_cint(_short)(msgpack_pack_user x, short d) +{ +#if defined(SIZEOF_SHORT) +#if SIZEOF_SHORT == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_SHORT == 4 + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(SHRT_MAX) +#if SHRT_MAX == 0x7fff + msgpack_pack_real_int16(x, d); +#elif SHRT_MAX == 0x7fffffff + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(short) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(short) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_int)(msgpack_pack_user x, int d) +{ +#if defined(SIZEOF_INT) +#if SIZEOF_INT == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_INT == 4 + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(INT_MAX) +#if INT_MAX == 0x7fff + msgpack_pack_real_int16(x, d); +#elif INT_MAX == 0x7fffffff + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(int) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(int) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_long)(msgpack_pack_user x, long d) +{ +#if defined(SIZEOF_LONG) +#if SIZEOF_LONG == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_LONG == 4 + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(LONG_MAX) +#if LONG_MAX == 0x7fffL + msgpack_pack_real_int16(x, d); +#elif LONG_MAX == 0x7fffffffL + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(long) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(long) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_long_long)(msgpack_pack_user x, long long d) +{ +#if defined(SIZEOF_LONG_LONG) +#if SIZEOF_LONG_LONG == 2 + msgpack_pack_real_int16(x, d); +#elif SIZEOF_LONG_LONG == 4 + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#elif defined(LLONG_MAX) +#if LLONG_MAX == 0x7fffL + msgpack_pack_real_int16(x, d); +#elif LLONG_MAX == 0x7fffffffL + msgpack_pack_real_int32(x, d); +#else + msgpack_pack_real_int64(x, d); +#endif + +#else +if(sizeof(long long) == 2) { + msgpack_pack_real_int16(x, d); +} else if(sizeof(long long) == 4) { + msgpack_pack_real_int32(x, d); +} else { + msgpack_pack_real_int64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_short)(msgpack_pack_user x, unsigned short d) +{ +#if defined(SIZEOF_SHORT) +#if SIZEOF_SHORT == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_SHORT == 4 + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(USHRT_MAX) +#if USHRT_MAX == 0xffffU + msgpack_pack_real_uint16(x, d); +#elif USHRT_MAX == 0xffffffffU + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned short) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned short) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_int)(msgpack_pack_user x, unsigned int d) +{ +#if 
defined(SIZEOF_INT) +#if SIZEOF_INT == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_INT == 4 + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(UINT_MAX) +#if UINT_MAX == 0xffffU + msgpack_pack_real_uint16(x, d); +#elif UINT_MAX == 0xffffffffU + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned int) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned int) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_long)(msgpack_pack_user x, unsigned long d) +{ +#if defined(SIZEOF_LONG) +#if SIZEOF_LONG == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_LONG == 4 + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(ULONG_MAX) +#if ULONG_MAX == 0xffffUL + msgpack_pack_real_uint16(x, d); +#elif ULONG_MAX == 0xffffffffUL + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned long) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned long) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +msgpack_pack_inline_func_cint(_unsigned_long_long)(msgpack_pack_user x, unsigned long long d) +{ +#if defined(SIZEOF_LONG_LONG) +#if SIZEOF_LONG_LONG == 2 + msgpack_pack_real_uint16(x, d); +#elif SIZEOF_LONG_LONG == 4 + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#elif defined(ULLONG_MAX) +#if ULLONG_MAX == 0xffffUL + msgpack_pack_real_uint16(x, d); +#elif ULLONG_MAX == 0xffffffffUL + msgpack_pack_real_uint32(x, d); +#else + msgpack_pack_real_uint64(x, d); +#endif + +#else +if(sizeof(unsigned long long) == 2) { + msgpack_pack_real_uint16(x, d); +} else if(sizeof(unsigned long long) == 4) { + msgpack_pack_real_uint32(x, d); +} else { + msgpack_pack_real_uint64(x, d); +} +#endif +} + +#undef msgpack_pack_inline_func_cint +#endif + + + +/* + * Float + */ + +msgpack_pack_inline_func(_float)(msgpack_pack_user x, float d) +{ + union { float f; uint32_t i; } mem; + mem.f = d; + unsigned char buf[5]; + buf[0] = 0xca; _msgpack_store32(&buf[1], mem.i); + msgpack_pack_append_buffer(x, buf, 5); +} + +msgpack_pack_inline_func(_double)(msgpack_pack_user x, double d) +{ + union { double f; uint64_t i; } mem; + mem.f = d; + unsigned char buf[9]; + buf[0] = 0xcb; +#if defined(__arm__) && !(__ARM_EABI__) // arm-oabi + // https://github.com/msgpack/msgpack-perl/pull/1 + mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL); +#endif + _msgpack_store64(&buf[1], mem.i); + msgpack_pack_append_buffer(x, buf, 9); +} + + +/* + * Nil + */ + +msgpack_pack_inline_func(_nil)(msgpack_pack_user x) +{ + static const unsigned char d = 0xc0; + msgpack_pack_append_buffer(x, &d, 1); +} + + +/* + * Boolean + */ + +msgpack_pack_inline_func(_true)(msgpack_pack_user x) +{ + static const unsigned char d = 0xc3; + msgpack_pack_append_buffer(x, &d, 1); +} + +msgpack_pack_inline_func(_false)(msgpack_pack_user x) +{ + static const unsigned char d = 0xc2; + msgpack_pack_append_buffer(x, &d, 1); +} + + +/* + * Array + */ + +msgpack_pack_inline_func(_array)(msgpack_pack_user x, unsigned int n) +{ + if(n < 16) { + unsigned char d = 0x90 | n; + msgpack_pack_append_buffer(x, &d, 1); + } else if(n < 65536) { + unsigned char buf[3]; + buf[0] = 0xdc; _msgpack_store16(&buf[1], (uint16_t)n); + msgpack_pack_append_buffer(x, buf, 3); + } else { 
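        /* n >= 65536: "array 32" header, the 0xdd marker below followed by a
           big-endian uint32 element count */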
+ unsigned char buf[5]; + buf[0] = 0xdd; _msgpack_store32(&buf[1], (uint32_t)n); + msgpack_pack_append_buffer(x, buf, 5); + } +} + + +/* + * Map + */ + +msgpack_pack_inline_func(_map)(msgpack_pack_user x, unsigned int n) +{ + if(n < 16) { + unsigned char d = 0x80 | n; + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); + } else if(n < 65536) { + unsigned char buf[3]; + buf[0] = 0xde; _msgpack_store16(&buf[1], (uint16_t)n); + msgpack_pack_append_buffer(x, buf, 3); + } else { + unsigned char buf[5]; + buf[0] = 0xdf; _msgpack_store32(&buf[1], (uint32_t)n); + msgpack_pack_append_buffer(x, buf, 5); + } +} + + +/* + * Raw + */ + +msgpack_pack_inline_func(_raw)(msgpack_pack_user x, size_t l) +{ + if(l < 32) { + unsigned char d = 0xa0 | (uint8_t)l; + msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); + } else if(l < 65536) { + unsigned char buf[3]; + buf[0] = 0xda; _msgpack_store16(&buf[1], (uint16_t)l); + msgpack_pack_append_buffer(x, buf, 3); + } else { + unsigned char buf[5]; + buf[0] = 0xdb; _msgpack_store32(&buf[1], (uint32_t)l); + msgpack_pack_append_buffer(x, buf, 5); + } +} + +msgpack_pack_inline_func(_raw_body)(msgpack_pack_user x, const void* b, size_t l) +{ + msgpack_pack_append_buffer(x, (const unsigned char*)b, l); +} + +#undef msgpack_pack_inline_func +#undef msgpack_pack_user +#undef msgpack_pack_append_buffer + +#undef TAKE8_8 +#undef TAKE8_16 +#undef TAKE8_32 +#undef TAKE8_64 + +#undef msgpack_pack_real_uint8 +#undef msgpack_pack_real_uint16 +#undef msgpack_pack_real_uint32 +#undef msgpack_pack_real_uint64 +#undef msgpack_pack_real_int8 +#undef msgpack_pack_real_int16 +#undef msgpack_pack_real_int32 +#undef msgpack_pack_real_int64 + diff --git a/pandas/src/msgpack/sysdep.h b/pandas/src/msgpack/sysdep.h new file mode 100644 index 00000000..4fedbd8b --- /dev/null +++ b/pandas/src/msgpack/sysdep.h @@ -0,0 +1,195 @@ +/* + * MessagePack system dependencies + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
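For orientation, a short illustrative sketch (not part of the imported sources) of what the container and raw headers defined in pack_template.h above actually emit for small sizes; it assumes pack.h and pack_template.h are included so that msgpack_packer and the msgpack_pack_* helpers are available:

    msgpack_packer pk = { NULL, 0, 0 };  /* buf, length, buf_size; grown on demand by msgpack_pack_write() */

    msgpack_pack_map(&pk, 3);     /* n < 16: a single "fixmap" byte, 0x80 | 3 == 0x83 */
    msgpack_pack_array(&pk, 20);  /* 16 <= n < 65536: 0xdc then 0x00 0x14 (big-endian uint16 count) */
    msgpack_pack_raw(&pk, 300);   /* 32 <= l < 65536: 0xda then 0x01 0x2c; the 300 payload bytes
                                     follow via msgpack_pack_raw_body() */
    free(pk.buf);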
+ */ +#ifndef MSGPACK_SYSDEP_H__ +#define MSGPACK_SYSDEP_H__ + +#include +#include +#if defined(_MSC_VER) && _MSC_VER < 1600 +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#elif defined(_MSC_VER) // && _MSC_VER >= 1600 +#include +#else +#include +#include +#endif + +#ifdef _WIN32 +#define _msgpack_atomic_counter_header +typedef long _msgpack_atomic_counter_t; +#define _msgpack_sync_decr_and_fetch(ptr) InterlockedDecrement(ptr) +#define _msgpack_sync_incr_and_fetch(ptr) InterlockedIncrement(ptr) +#elif defined(__GNUC__) && ((__GNUC__*10 + __GNUC_MINOR__) < 41) +#define _msgpack_atomic_counter_header "gcc_atomic.h" +#else +typedef unsigned int _msgpack_atomic_counter_t; +#define _msgpack_sync_decr_and_fetch(ptr) __sync_sub_and_fetch(ptr, 1) +#define _msgpack_sync_incr_and_fetch(ptr) __sync_add_and_fetch(ptr, 1) +#endif + +#ifdef _WIN32 + +#ifdef __cplusplus +/* numeric_limits::min,max */ +#ifdef max +#undef max +#endif +#ifdef min +#undef min +#endif +#endif + +#else +#include /* __BYTE_ORDER */ +#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define __LITTLE_ENDIAN__ +#elif __BYTE_ORDER == __BIG_ENDIAN +#define __BIG_ENDIAN__ +#elif _WIN32 +#define __LITTLE_ENDIAN__ +#endif +#endif + + +#ifdef __LITTLE_ENDIAN__ + +#ifdef _WIN32 +# if defined(ntohs) +# define _msgpack_be16(x) ntohs(x) +# elif defined(_byteswap_ushort) || (defined(_MSC_VER) && _MSC_VER >= 1400) +# define _msgpack_be16(x) ((uint16_t)_byteswap_ushort((unsigned short)x)) +# else +# define _msgpack_be16(x) ( \ + ((((uint16_t)x) << 8) ) | \ + ((((uint16_t)x) >> 8) ) ) +# endif +#else +# define _msgpack_be16(x) ntohs(x) +#endif + +#ifdef _WIN32 +# if defined(ntohl) +# define _msgpack_be32(x) ntohl(x) +# elif defined(_byteswap_ulong) || (defined(_MSC_VER) && _MSC_VER >= 1400) +# define _msgpack_be32(x) ((uint32_t)_byteswap_ulong((unsigned long)x)) +# else +# define _msgpack_be32(x) \ + ( ((((uint32_t)x) << 24) ) | \ + ((((uint32_t)x) << 8) & 0x00ff0000U ) | \ + ((((uint32_t)x) >> 8) & 0x0000ff00U ) | \ + ((((uint32_t)x) >> 24) ) ) +# endif +#else +# define _msgpack_be32(x) ntohl(x) +#endif + +#if defined(_byteswap_uint64) || (defined(_MSC_VER) && _MSC_VER >= 1400) +# define _msgpack_be64(x) (_byteswap_uint64(x)) +#elif defined(bswap_64) +# define _msgpack_be64(x) bswap_64(x) +#elif defined(__DARWIN_OSSwapInt64) +# define _msgpack_be64(x) __DARWIN_OSSwapInt64(x) +#else +#define _msgpack_be64(x) \ + ( ((((uint64_t)x) << 56) ) | \ + ((((uint64_t)x) << 40) & 0x00ff000000000000ULL ) | \ + ((((uint64_t)x) << 24) & 0x0000ff0000000000ULL ) | \ + ((((uint64_t)x) << 8) & 0x000000ff00000000ULL ) | \ + ((((uint64_t)x) >> 8) & 0x00000000ff000000ULL ) | \ + ((((uint64_t)x) >> 24) & 0x0000000000ff0000ULL ) | \ + ((((uint64_t)x) >> 40) & 0x000000000000ff00ULL ) | \ + ((((uint64_t)x) >> 56) ) ) +#endif + +#define _msgpack_load16(cast, from) ((cast)( \ + (((uint16_t)((uint8_t*)(from))[0]) << 8) | \ + (((uint16_t)((uint8_t*)(from))[1]) ) )) + +#define _msgpack_load32(cast, from) ((cast)( \ + (((uint32_t)((uint8_t*)(from))[0]) << 24) | \ + (((uint32_t)((uint8_t*)(from))[1]) << 16) | \ + (((uint32_t)((uint8_t*)(from))[2]) << 8) | \ + (((uint32_t)((uint8_t*)(from))[3]) ) )) + +#define _msgpack_load64(cast, from) ((cast)( \ + (((uint64_t)((uint8_t*)(from))[0]) << 56) | \ + 
(((uint64_t)((uint8_t*)(from))[1]) << 48) | \ + (((uint64_t)((uint8_t*)(from))[2]) << 40) | \ + (((uint64_t)((uint8_t*)(from))[3]) << 32) | \ + (((uint64_t)((uint8_t*)(from))[4]) << 24) | \ + (((uint64_t)((uint8_t*)(from))[5]) << 16) | \ + (((uint64_t)((uint8_t*)(from))[6]) << 8) | \ + (((uint64_t)((uint8_t*)(from))[7]) ) )) + +#else + +#define _msgpack_be16(x) (x) +#define _msgpack_be32(x) (x) +#define _msgpack_be64(x) (x) + +#define _msgpack_load16(cast, from) ((cast)( \ + (((uint16_t)((uint8_t*)from)[0]) << 8) | \ + (((uint16_t)((uint8_t*)from)[1]) ) )) + +#define _msgpack_load32(cast, from) ((cast)( \ + (((uint32_t)((uint8_t*)from)[0]) << 24) | \ + (((uint32_t)((uint8_t*)from)[1]) << 16) | \ + (((uint32_t)((uint8_t*)from)[2]) << 8) | \ + (((uint32_t)((uint8_t*)from)[3]) ) )) + +#define _msgpack_load64(cast, from) ((cast)( \ + (((uint64_t)((uint8_t*)from)[0]) << 56) | \ + (((uint64_t)((uint8_t*)from)[1]) << 48) | \ + (((uint64_t)((uint8_t*)from)[2]) << 40) | \ + (((uint64_t)((uint8_t*)from)[3]) << 32) | \ + (((uint64_t)((uint8_t*)from)[4]) << 24) | \ + (((uint64_t)((uint8_t*)from)[5]) << 16) | \ + (((uint64_t)((uint8_t*)from)[6]) << 8) | \ + (((uint64_t)((uint8_t*)from)[7]) ) )) +#endif + + +#define _msgpack_store16(to, num) \ + do { uint16_t val = _msgpack_be16(num); memcpy(to, &val, 2); } while(0) +#define _msgpack_store32(to, num) \ + do { uint32_t val = _msgpack_be32(num); memcpy(to, &val, 4); } while(0) +#define _msgpack_store64(to, num) \ + do { uint64_t val = _msgpack_be64(num); memcpy(to, &val, 8); } while(0) + +/* +#define _msgpack_load16(cast, from) \ + ({ cast val; memcpy(&val, (char*)from, 2); _msgpack_be16(val); }) +#define _msgpack_load32(cast, from) \ + ({ cast val; memcpy(&val, (char*)from, 4); _msgpack_be32(val); }) +#define _msgpack_load64(cast, from) \ + ({ cast val; memcpy(&val, (char*)from, 8); _msgpack_be64(val); }) +*/ + + +#endif /* msgpack/sysdep.h */ + diff --git a/pandas/src/msgpack/unpack.h b/pandas/src/msgpack/unpack.h new file mode 100644 index 00000000..3dc88e5f --- /dev/null +++ b/pandas/src/msgpack/unpack.h @@ -0,0 +1,235 @@ +/* + * MessagePack for Python unpacking routine + * + * Copyright (C) 2009 Naoki INADA + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
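A small sketch (illustrative only) of the guarantee the sysdep.h macros above provide: multi-byte values are always stored big-endian, whatever the host byte order, and the load macros reverse that:

    unsigned char wire[4];
    uint32_t v = 0x11223344u;

    _msgpack_store32(wire, v);                        /* wire = {0x11, 0x22, 0x33, 0x44} on any host */
    uint32_t back = _msgpack_load32(uint32_t, wire);  /* back == 0x11223344 again */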
+ */ + +#define MSGPACK_EMBED_STACK_SIZE (1024) +#include "unpack_define.h" + +typedef struct unpack_user { + int use_list; + PyObject *object_hook; + bool has_pairs_hook; + PyObject *list_hook; + const char *encoding; + const char *unicode_errors; +} unpack_user; + + +#define msgpack_unpack_struct(name) \ + struct template ## name + +#define msgpack_unpack_func(ret, name) \ + static inline ret template ## name + +#define msgpack_unpack_callback(name) \ + template_callback ## name + +#define msgpack_unpack_object PyObject* + +#define msgpack_unpack_user unpack_user + +typedef int (*execute_fn)(msgpack_unpack_struct(_context)* ctx, const char* data, size_t len, size_t* off); + +struct template_context; +typedef struct template_context template_context; + +static inline msgpack_unpack_object template_callback_root(unpack_user* u) +{ + return NULL; +} + +static inline int template_callback_uint16(unpack_user* u, uint16_t d, msgpack_unpack_object* o) +{ + PyObject *p = PyInt_FromLong((long)d); + if (!p) + return -1; + *o = p; + return 0; +} +static inline int template_callback_uint8(unpack_user* u, uint8_t d, msgpack_unpack_object* o) +{ + return template_callback_uint16(u, d, o); +} + + +static inline int template_callback_uint32(unpack_user* u, uint32_t d, msgpack_unpack_object* o) +{ + PyObject *p; + if (d > LONG_MAX) { + p = PyLong_FromUnsignedLong((unsigned long)d); + } else { + p = PyInt_FromLong((long)d); + } + if (!p) + return -1; + *o = p; + return 0; +} + +static inline int template_callback_uint64(unpack_user* u, uint64_t d, msgpack_unpack_object* o) +{ + PyObject *p = PyLong_FromUnsignedLongLong(d); + if (!p) + return -1; + *o = p; + return 0; +} + +static inline int template_callback_int32(unpack_user* u, int32_t d, msgpack_unpack_object* o) +{ + PyObject *p = PyInt_FromLong(d); + if (!p) + return -1; + *o = p; + return 0; +} + +static inline int template_callback_int16(unpack_user* u, int16_t d, msgpack_unpack_object* o) +{ + return template_callback_int32(u, d, o); +} + +static inline int template_callback_int8(unpack_user* u, int8_t d, msgpack_unpack_object* o) +{ + return template_callback_int32(u, d, o); +} + +static inline int template_callback_int64(unpack_user* u, int64_t d, msgpack_unpack_object* o) +{ + PyObject *p = PyLong_FromLongLong(d); + if (!p) + return -1; + *o = p; + return 0; +} + +static inline int template_callback_double(unpack_user* u, double d, msgpack_unpack_object* o) +{ + PyObject *p = PyFloat_FromDouble(d); + if (!p) + return -1; + *o = p; + return 0; +} + +static inline int template_callback_float(unpack_user* u, float d, msgpack_unpack_object* o) +{ + return template_callback_double(u, d, o); +} + +static inline int template_callback_nil(unpack_user* u, msgpack_unpack_object* o) +{ Py_INCREF(Py_None); *o = Py_None; return 0; } + +static inline int template_callback_true(unpack_user* u, msgpack_unpack_object* o) +{ Py_INCREF(Py_True); *o = Py_True; return 0; } + +static inline int template_callback_false(unpack_user* u, msgpack_unpack_object* o) +{ Py_INCREF(Py_False); *o = Py_False; return 0; } + +static inline int template_callback_array(unpack_user* u, unsigned int n, msgpack_unpack_object* o) +{ + PyObject *p = u->use_list ? 
PyList_New(n) : PyTuple_New(n); + + if (!p) + return -1; + *o = p; + return 0; +} + +static inline int template_callback_array_item(unpack_user* u, unsigned int current, msgpack_unpack_object* c, msgpack_unpack_object o) +{ + if (u->use_list) + PyList_SET_ITEM(*c, current, o); + else + PyTuple_SET_ITEM(*c, current, o); + return 0; +} + +static inline int template_callback_array_end(unpack_user* u, msgpack_unpack_object* c) +{ + if (u->list_hook) { + PyObject *new_c = PyEval_CallFunction(u->list_hook, "(O)", *c); + if (!new_c) + return -1; + Py_DECREF(*c); + *c = new_c; + } + return 0; +} + +static inline int template_callback_map(unpack_user* u, unsigned int n, msgpack_unpack_object* o) +{ + PyObject *p; + if (u->has_pairs_hook) { + p = PyList_New(n); // Or use tuple? + } + else { + p = PyDict_New(); + } + if (!p) + return -1; + *o = p; + return 0; +} + +static inline int template_callback_map_item(unpack_user* u, unsigned int current, msgpack_unpack_object* c, msgpack_unpack_object k, msgpack_unpack_object v) +{ + if (u->has_pairs_hook) { + msgpack_unpack_object item = PyTuple_Pack(2, k, v); + if (!item) + return -1; + Py_DECREF(k); + Py_DECREF(v); + PyList_SET_ITEM(*c, current, item); + return 0; + } + else if (PyDict_SetItem(*c, k, v) == 0) { + Py_DECREF(k); + Py_DECREF(v); + return 0; + } + return -1; +} + +static inline int template_callback_map_end(unpack_user* u, msgpack_unpack_object* c) +{ + if (u->object_hook) { + PyObject *new_c = PyEval_CallFunction(u->object_hook, "(O)", *c); + if (!new_c) + return -1; + + Py_DECREF(*c); + *c = new_c; + } + return 0; +} + +static inline int template_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o) +{ + PyObject *py; + if(u->encoding) { + py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors); + } else { + py = PyBytes_FromStringAndSize(p, l); + } + if (!py) + return -1; + *o = py; + return 0; +} + +#include "unpack_template.h" diff --git a/pandas/src/msgpack/unpack_define.h b/pandas/src/msgpack/unpack_define.h new file mode 100644 index 00000000..959d3519 --- /dev/null +++ b/pandas/src/msgpack/unpack_define.h @@ -0,0 +1,93 @@ +/* + * MessagePack unpacking routine template + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
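A sketch (illustrative, not from the imported sources) of how the unpack_user fields declared at the top of unpack.h steer the callbacks above:

    unpack_user u;
    u.use_list = 1;            /* template_callback_array builds a PyList (0 would give a PyTuple) */
    u.has_pairs_hook = 0;      /* maps become a PyDict; nonzero builds a list of (key, value) tuples */
    u.object_hook = NULL;      /* if set, template_callback_map_end calls it on every finished map */
    u.list_hook = NULL;        /* if set, template_callback_array_end calls it on every finished array */
    u.encoding = "utf-8";      /* raw bytes go through PyUnicode_Decode() instead of PyBytes_FromStringAndSize() */
    u.unicode_errors = "strict";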
+ */ +#ifndef MSGPACK_UNPACK_DEFINE_H__ +#define MSGPACK_UNPACK_DEFINE_H__ + +#include "msgpack/sysdep.h" +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifndef MSGPACK_EMBED_STACK_SIZE +#define MSGPACK_EMBED_STACK_SIZE 32 +#endif + + +typedef enum { + CS_HEADER = 0x00, // nil + + //CS_ = 0x01, + //CS_ = 0x02, // false + //CS_ = 0x03, // true + + //CS_ = 0x04, + //CS_ = 0x05, + //CS_ = 0x06, + //CS_ = 0x07, + + //CS_ = 0x08, + //CS_ = 0x09, + CS_FLOAT = 0x0a, + CS_DOUBLE = 0x0b, + CS_UINT_8 = 0x0c, + CS_UINT_16 = 0x0d, + CS_UINT_32 = 0x0e, + CS_UINT_64 = 0x0f, + CS_INT_8 = 0x10, + CS_INT_16 = 0x11, + CS_INT_32 = 0x12, + CS_INT_64 = 0x13, + + //CS_ = 0x14, + //CS_ = 0x15, + //CS_BIG_INT_16 = 0x16, + //CS_BIG_INT_32 = 0x17, + //CS_BIG_FLOAT_16 = 0x18, + //CS_BIG_FLOAT_32 = 0x19, + CS_RAW_16 = 0x1a, + CS_RAW_32 = 0x1b, + CS_ARRAY_16 = 0x1c, + CS_ARRAY_32 = 0x1d, + CS_MAP_16 = 0x1e, + CS_MAP_32 = 0x1f, + + //ACS_BIG_INT_VALUE, + //ACS_BIG_FLOAT_VALUE, + ACS_RAW_VALUE, +} msgpack_unpack_state; + + +typedef enum { + CT_ARRAY_ITEM, + CT_MAP_KEY, + CT_MAP_VALUE, +} msgpack_container_type; + + +#ifdef __cplusplus +} +#endif + +#endif /* msgpack/unpack_define.h */ + diff --git a/pandas/src/msgpack/unpack_template.h b/pandas/src/msgpack/unpack_template.h new file mode 100644 index 00000000..83b6918d --- /dev/null +++ b/pandas/src/msgpack/unpack_template.h @@ -0,0 +1,492 @@ +/* + * MessagePack unpacking routine template + * + * Copyright (C) 2008-2010 FURUHASHI Sadayuki + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
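One detail worth noting before the parser template: the numeric values of the CS_* states above are the low five bits of the matching wire-format marker byte, which is what lets unpack_template.h below switch states with a simple mask (its NEXT_CS macro). A tiny illustrative example:

    unsigned char marker = 0xcd;        /* "uint 16" marker as emitted by pack_template.h */
    unsigned int state = marker & 0x1f; /* 0x0d == CS_UINT_16 */
    /* likewise 0xca -> CS_FLOAT, 0xda -> CS_RAW_16, 0xdc -> CS_ARRAY_16, 0xde -> CS_MAP_16 */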
+ */ + +#ifndef msgpack_unpack_func +#error msgpack_unpack_func template is not defined +#endif + +#ifndef msgpack_unpack_callback +#error msgpack_unpack_callback template is not defined +#endif + +#ifndef msgpack_unpack_struct +#error msgpack_unpack_struct template is not defined +#endif + +#ifndef msgpack_unpack_struct_decl +#define msgpack_unpack_struct_decl(name) msgpack_unpack_struct(name) +#endif + +#ifndef msgpack_unpack_object +#error msgpack_unpack_object type is not defined +#endif + +#ifndef msgpack_unpack_user +#error msgpack_unpack_user type is not defined +#endif + +#ifndef USE_CASE_RANGE +#if !defined(_MSC_VER) +#define USE_CASE_RANGE +#endif +#endif + +msgpack_unpack_struct_decl(_stack) { + msgpack_unpack_object obj; + size_t size; + size_t count; + unsigned int ct; + msgpack_unpack_object map_key; +}; + +msgpack_unpack_struct_decl(_context) { + msgpack_unpack_user user; + unsigned int cs; + unsigned int trail; + unsigned int top; + /* + msgpack_unpack_struct(_stack)* stack; + unsigned int stack_size; + msgpack_unpack_struct(_stack) embed_stack[MSGPACK_EMBED_STACK_SIZE]; + */ + msgpack_unpack_struct(_stack) stack[MSGPACK_EMBED_STACK_SIZE]; +}; + + +msgpack_unpack_func(void, _init)(msgpack_unpack_struct(_context)* ctx) +{ + ctx->cs = CS_HEADER; + ctx->trail = 0; + ctx->top = 0; + /* + ctx->stack = ctx->embed_stack; + ctx->stack_size = MSGPACK_EMBED_STACK_SIZE; + */ + ctx->stack[0].obj = msgpack_unpack_callback(_root)(&ctx->user); +} + +/* +msgpack_unpack_func(void, _destroy)(msgpack_unpack_struct(_context)* ctx) +{ + if(ctx->stack_size != MSGPACK_EMBED_STACK_SIZE) { + free(ctx->stack); + } +} +*/ + +msgpack_unpack_func(msgpack_unpack_object, _data)(msgpack_unpack_struct(_context)* ctx) +{ + return (ctx)->stack[0].obj; +} + + +template +msgpack_unpack_func(int, _execute)(msgpack_unpack_struct(_context)* ctx, const char* data, size_t len, size_t* off) +{ + assert(len >= *off); + + const unsigned char* p = (unsigned char*)data + *off; + const unsigned char* const pe = (unsigned char*)data + len; + const void* n = NULL; + + unsigned int trail = ctx->trail; + unsigned int cs = ctx->cs; + unsigned int top = ctx->top; + msgpack_unpack_struct(_stack)* stack = ctx->stack; + /* + unsigned int stack_size = ctx->stack_size; + */ + msgpack_unpack_user* user = &ctx->user; + + msgpack_unpack_object obj; + msgpack_unpack_struct(_stack)* c = NULL; + + int ret; + +#define construct_cb(name) \ + construct && msgpack_unpack_callback(name) + +#define push_simple_value(func) \ + if(construct_cb(func)(user, &obj) < 0) { goto _failed; } \ + goto _push +#define push_fixed_value(func, arg) \ + if(construct_cb(func)(user, arg, &obj) < 0) { goto _failed; } \ + goto _push +#define push_variable_value(func, base, pos, len) \ + if(construct_cb(func)(user, \ + (const char*)base, (const char*)pos, len, &obj) < 0) { goto _failed; } \ + goto _push + +#define again_fixed_trail(_cs, trail_len) \ + trail = trail_len; \ + cs = _cs; \ + goto _fixed_trail_again +#define again_fixed_trail_if_zero(_cs, trail_len, ifzero) \ + trail = trail_len; \ + if(trail == 0) { goto ifzero; } \ + cs = _cs; \ + goto _fixed_trail_again + +#define start_container(func, count_, ct_) \ + if(top >= MSGPACK_EMBED_STACK_SIZE) { goto _failed; } /* FIXME */ \ + if(construct_cb(func)(user, count_, &stack[top].obj) < 0) { goto _failed; } \ + if((count_) == 0) { obj = stack[top].obj; \ + if (construct_cb(func##_end)(user, &obj) < 0) { goto _failed; } \ + goto _push; } \ + stack[top].ct = ct_; \ + stack[top].size = count_; \ + stack[top].count 
= 0; \ + ++top; \ + /*printf("container %d count %d stack %d\n",stack[top].obj,count_,top);*/ \ + /*printf("stack push %d\n", top);*/ \ + /* FIXME \ + if(top >= stack_size) { \ + if(stack_size == MSGPACK_EMBED_STACK_SIZE) { \ + size_t csize = sizeof(msgpack_unpack_struct(_stack)) * MSGPACK_EMBED_STACK_SIZE; \ + size_t nsize = csize * 2; \ + msgpack_unpack_struct(_stack)* tmp = (msgpack_unpack_struct(_stack)*)malloc(nsize); \ + if(tmp == NULL) { goto _failed; } \ + memcpy(tmp, ctx->stack, csize); \ + ctx->stack = stack = tmp; \ + ctx->stack_size = stack_size = MSGPACK_EMBED_STACK_SIZE * 2; \ + } else { \ + size_t nsize = sizeof(msgpack_unpack_struct(_stack)) * ctx->stack_size * 2; \ + msgpack_unpack_struct(_stack)* tmp = (msgpack_unpack_struct(_stack)*)realloc(ctx->stack, nsize); \ + if(tmp == NULL) { goto _failed; } \ + ctx->stack = stack = tmp; \ + ctx->stack_size = stack_size = stack_size * 2; \ + } \ + } \ + */ \ + goto _header_again + +#define NEXT_CS(p) \ + ((unsigned int)*p & 0x1f) + +#ifdef USE_CASE_RANGE +#define SWITCH_RANGE_BEGIN switch(*p) { +#define SWITCH_RANGE(FROM, TO) case FROM ... TO: +#define SWITCH_RANGE_DEFAULT default: +#define SWITCH_RANGE_END } +#else +#define SWITCH_RANGE_BEGIN { if(0) { +#define SWITCH_RANGE(FROM, TO) } else if(FROM <= *p && *p <= TO) { +#define SWITCH_RANGE_DEFAULT } else { +#define SWITCH_RANGE_END } } +#endif + + if(p == pe) { goto _out; } + do { + switch(cs) { + case CS_HEADER: + SWITCH_RANGE_BEGIN + SWITCH_RANGE(0x00, 0x7f) // Positive Fixnum + push_fixed_value(_uint8, *(uint8_t*)p); + SWITCH_RANGE(0xe0, 0xff) // Negative Fixnum + push_fixed_value(_int8, *(int8_t*)p); + SWITCH_RANGE(0xc0, 0xdf) // Variable + switch(*p) { + case 0xc0: // nil + push_simple_value(_nil); + //case 0xc1: // string + // again_terminal_trail(NEXT_CS(p), p+1); + case 0xc2: // false + push_simple_value(_false); + case 0xc3: // true + push_simple_value(_true); + //case 0xc4: + //case 0xc5: + //case 0xc6: + //case 0xc7: + //case 0xc8: + //case 0xc9: + case 0xca: // float + case 0xcb: // double + case 0xcc: // unsigned int 8 + case 0xcd: // unsigned int 16 + case 0xce: // unsigned int 32 + case 0xcf: // unsigned int 64 + case 0xd0: // signed int 8 + case 0xd1: // signed int 16 + case 0xd2: // signed int 32 + case 0xd3: // signed int 64 + again_fixed_trail(NEXT_CS(p), 1 << (((unsigned int)*p) & 0x03)); + //case 0xd4: + //case 0xd5: + //case 0xd6: // big integer 16 + //case 0xd7: // big integer 32 + //case 0xd8: // big float 16 + //case 0xd9: // big float 32 + case 0xda: // raw 16 + case 0xdb: // raw 32 + case 0xdc: // array 16 + case 0xdd: // array 32 + case 0xde: // map 16 + case 0xdf: // map 32 + again_fixed_trail(NEXT_CS(p), 2 << (((unsigned int)*p) & 0x01)); + default: + goto _failed; + } + SWITCH_RANGE(0xa0, 0xbf) // FixRaw + again_fixed_trail_if_zero(ACS_RAW_VALUE, ((unsigned int)*p & 0x1f), _raw_zero); + SWITCH_RANGE(0x90, 0x9f) // FixArray + start_container(_array, ((unsigned int)*p) & 0x0f, CT_ARRAY_ITEM); + SWITCH_RANGE(0x80, 0x8f) // FixMap + start_container(_map, ((unsigned int)*p) & 0x0f, CT_MAP_KEY); + + SWITCH_RANGE_DEFAULT + goto _failed; + SWITCH_RANGE_END + // end CS_HEADER + + + _fixed_trail_again: + ++p; + + default: + if((size_t)(pe - p) < trail) { goto _out; } + n = p; p += trail - 1; + switch(cs) { + //case CS_ + //case CS_ + case CS_FLOAT: { + union { uint32_t i; float f; } mem; + mem.i = _msgpack_load32(uint32_t,n); + push_fixed_value(_float, mem.f); } + case CS_DOUBLE: { + union { uint64_t i; double f; } mem; + mem.i = _msgpack_load64(uint64_t,n); 
+#if defined(__arm__) && !(__ARM_EABI__) // arm-oabi + // https://github.com/msgpack/msgpack-perl/pull/1 + mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL); +#endif + push_fixed_value(_double, mem.f); } + case CS_UINT_8: + push_fixed_value(_uint8, *(uint8_t*)n); + case CS_UINT_16: + push_fixed_value(_uint16, _msgpack_load16(uint16_t,n)); + case CS_UINT_32: + push_fixed_value(_uint32, _msgpack_load32(uint32_t,n)); + case CS_UINT_64: + push_fixed_value(_uint64, _msgpack_load64(uint64_t,n)); + + case CS_INT_8: + push_fixed_value(_int8, *(int8_t*)n); + case CS_INT_16: + push_fixed_value(_int16, _msgpack_load16(int16_t,n)); + case CS_INT_32: + push_fixed_value(_int32, _msgpack_load32(int32_t,n)); + case CS_INT_64: + push_fixed_value(_int64, _msgpack_load64(int64_t,n)); + + //case CS_ + //case CS_ + //case CS_BIG_INT_16: + // again_fixed_trail_if_zero(ACS_BIG_INT_VALUE, _msgpack_load16(uint16_t,n), _big_int_zero); + //case CS_BIG_INT_32: + // again_fixed_trail_if_zero(ACS_BIG_INT_VALUE, _msgpack_load32(uint32_t,n), _big_int_zero); + //case ACS_BIG_INT_VALUE: + //_big_int_zero: + // // FIXME + // push_variable_value(_big_int, data, n, trail); + + //case CS_BIG_FLOAT_16: + // again_fixed_trail_if_zero(ACS_BIG_FLOAT_VALUE, _msgpack_load16(uint16_t,n), _big_float_zero); + //case CS_BIG_FLOAT_32: + // again_fixed_trail_if_zero(ACS_BIG_FLOAT_VALUE, _msgpack_load32(uint32_t,n), _big_float_zero); + //case ACS_BIG_FLOAT_VALUE: + //_big_float_zero: + // // FIXME + // push_variable_value(_big_float, data, n, trail); + + case CS_RAW_16: + again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load16(uint16_t,n), _raw_zero); + case CS_RAW_32: + again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load32(uint32_t,n), _raw_zero); + case ACS_RAW_VALUE: + _raw_zero: + push_variable_value(_raw, data, n, trail); + + case CS_ARRAY_16: + start_container(_array, _msgpack_load16(uint16_t,n), CT_ARRAY_ITEM); + case CS_ARRAY_32: + /* FIXME security guard */ + start_container(_array, _msgpack_load32(uint32_t,n), CT_ARRAY_ITEM); + + case CS_MAP_16: + start_container(_map, _msgpack_load16(uint16_t,n), CT_MAP_KEY); + case CS_MAP_32: + /* FIXME security guard */ + start_container(_map, _msgpack_load32(uint32_t,n), CT_MAP_KEY); + + default: + goto _failed; + } + } + +_push: + if(top == 0) { goto _finish; } + c = &stack[top-1]; + switch(c->ct) { + case CT_ARRAY_ITEM: + if(construct_cb(_array_item)(user, c->count, &c->obj, obj) < 0) { goto _failed; } + if(++c->count == c->size) { + obj = c->obj; + if (construct_cb(_array_end)(user, &obj) < 0) { goto _failed; } + --top; + /*printf("stack pop %d\n", top);*/ + goto _push; + } + goto _header_again; + case CT_MAP_KEY: + c->map_key = obj; + c->ct = CT_MAP_VALUE; + goto _header_again; + case CT_MAP_VALUE: + if(construct_cb(_map_item)(user, c->count, &c->obj, c->map_key, obj) < 0) { goto _failed; } + if(++c->count == c->size) { + obj = c->obj; + if (construct_cb(_map_end)(user, &obj) < 0) { goto _failed; } + --top; + /*printf("stack pop %d\n", top);*/ + goto _push; + } + c->ct = CT_MAP_KEY; + goto _header_again; + + default: + goto _failed; + } + +_header_again: + cs = CS_HEADER; + ++p; + } while(p != pe); + goto _out; + + +_finish: + if (!construct) + msgpack_unpack_callback(_nil)(user, &obj); + stack[0].obj = obj; + ++p; + ret = 1; + /*printf("-- finish --\n"); */ + goto _end; + +_failed: + /*printf("** FAILED **\n"); */ + ret = -1; + goto _end; + +_out: + ret = 0; + goto _end; + +_end: + ctx->cs = cs; + ctx->trail = trail; + ctx->top = top; + *off = p - (const unsigned char*)data; + 
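    /* ret is 1 when a complete top-level object has been built (left in
       stack[0].obj), 0 when the buffer ran out mid-object, and -1 on a
       malformed stream; cs, trail, top and *off are saved above so a later
       call can resume exactly where this buffer ended. */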
+ return ret; +#undef construct_cb +} + +#undef SWITCH_RANGE_BEGIN +#undef SWITCH_RANGE +#undef SWITCH_RANGE_DEFAULT +#undef SWITCH_RANGE_END +#undef push_simple_value +#undef push_fixed_value +#undef push_variable_value +#undef again_fixed_trail +#undef again_fixed_trail_if_zero +#undef start_container + +template +msgpack_unpack_func(int, _container_header)(msgpack_unpack_struct(_context)* ctx, const char* data, size_t len, size_t* off) +{ + assert(len >= *off); + uint32_t size; + const unsigned char *const p = (unsigned char*)data + *off; + +#define inc_offset(inc) \ + if (len - *off < inc) \ + return 0; \ + *off += inc; + + switch (*p) { + case var_offset: + inc_offset(3); + size = _msgpack_load16(uint16_t, p + 1); + break; + case var_offset + 1: + inc_offset(5); + size = _msgpack_load32(uint32_t, p + 1); + break; +#ifdef USE_CASE_RANGE + case fixed_offset + 0x0 ... fixed_offset + 0xf: +#else + case fixed_offset + 0x0: + case fixed_offset + 0x1: + case fixed_offset + 0x2: + case fixed_offset + 0x3: + case fixed_offset + 0x4: + case fixed_offset + 0x5: + case fixed_offset + 0x6: + case fixed_offset + 0x7: + case fixed_offset + 0x8: + case fixed_offset + 0x9: + case fixed_offset + 0xa: + case fixed_offset + 0xb: + case fixed_offset + 0xc: + case fixed_offset + 0xd: + case fixed_offset + 0xe: + case fixed_offset + 0xf: +#endif + ++*off; + size = ((unsigned int)*p) & 0x0f; + break; + default: + PyErr_SetString(PyExc_ValueError, "Unexpected type header on stream"); + return -1; + } + msgpack_unpack_callback(_uint32)(&ctx->user, size, &ctx->stack[0].obj); + return 1; +} + +#undef SWITCH_RANGE_BEGIN +#undef SWITCH_RANGE +#undef SWITCH_RANGE_DEFAULT +#undef SWITCH_RANGE_END + +static const execute_fn template_construct = &template_execute; +static const execute_fn template_skip = &template_execute; +static const execute_fn read_array_header = &template_container_header<0x90, 0xdc>; +static const execute_fn read_map_header = &template_container_header<0x80, 0xde>; + +#undef msgpack_unpack_func +#undef msgpack_unpack_callback +#undef msgpack_unpack_struct +#undef msgpack_unpack_object +#undef msgpack_unpack_user + +#undef NEXT_CS + +/* vim: set ts=4 sw=4 noexpandtab */ diff --git a/pandas/src/numpy.pxd b/pandas/src/numpy.pxd new file mode 100644 index 00000000..9ab3b9b1 --- /dev/null +++ b/pandas/src/numpy.pxd @@ -0,0 +1,984 @@ +# NumPy static imports for Cython +# +# If any of the PyArray_* functions are called, import_array must be +# called first. +# +# This also defines backwards-compatability buffer acquisition +# code for use in Python 2.x (or Python <= 2.5 when NumPy starts +# implementing PEP-3118 directly). +# +# Because of laziness, the format string of the buffer is statically +# allocated. Increase the size if this is not enough, or submit a +# patch to do this properly. 
+# +# Author: Dag Sverre Seljebotn +# + +DEF _buffer_format_string_len = 255 + +cimport cpython.buffer as pybuf +from cpython.ref cimport Py_INCREF, Py_XDECREF +from cpython.object cimport PyObject +cimport libc.stdlib as stdlib +cimport libc.stdio as stdio + +cdef extern from "Python.h": + ctypedef int Py_intptr_t + +cdef extern from "numpy/arrayobject.h": + ctypedef Py_intptr_t npy_intp + ctypedef size_t npy_uintp + + cdef enum NPY_TYPES: + NPY_BOOL + NPY_BYTE + NPY_UBYTE + NPY_SHORT + NPY_USHORT + NPY_INT + NPY_UINT + NPY_LONG + NPY_ULONG + NPY_LONGLONG + NPY_ULONGLONG + NPY_FLOAT + NPY_DOUBLE + NPY_LONGDOUBLE + NPY_CFLOAT + NPY_CDOUBLE + NPY_CLONGDOUBLE + NPY_OBJECT + NPY_STRING + NPY_UNICODE + NPY_VOID + NPY_NTYPES + NPY_NOTYPE + + NPY_INT8 + NPY_INT16 + NPY_INT32 + NPY_INT64 + NPY_INT128 + NPY_INT256 + NPY_UINT8 + NPY_UINT16 + NPY_UINT32 + NPY_UINT64 + NPY_UINT128 + NPY_UINT256 + NPY_FLOAT16 + NPY_FLOAT32 + NPY_FLOAT64 + NPY_FLOAT80 + NPY_FLOAT96 + NPY_FLOAT128 + NPY_FLOAT256 + NPY_COMPLEX32 + NPY_COMPLEX64 + NPY_COMPLEX128 + NPY_COMPLEX160 + NPY_COMPLEX192 + NPY_COMPLEX256 + NPY_COMPLEX512 + + NPY_DATETIME + NPY_TIMEDELTA + + NPY_INTP + + ctypedef enum NPY_ORDER: + NPY_ANYORDER + NPY_CORDER + NPY_FORTRANORDER + + ctypedef enum NPY_CLIPMODE: + NPY_CLIP + NPY_WRAP + NPY_RAISE + + ctypedef enum NPY_SCALARKIND: + NPY_NOSCALAR, + NPY_BOOL_SCALAR, + NPY_INTPOS_SCALAR, + NPY_INTNEG_SCALAR, + NPY_FLOAT_SCALAR, + NPY_COMPLEX_SCALAR, + NPY_OBJECT_SCALAR + + ctypedef enum NPY_SORTKIND: + NPY_QUICKSORT + NPY_HEAPSORT + NPY_MERGESORT + + ctypedef enum NPY_SEARCHSIDE: + NPY_SEARCHLEFT + NPY_SEARCHRIGHT + + enum: + NPY_C_CONTIGUOUS + NPY_F_CONTIGUOUS + NPY_CONTIGUOUS + NPY_FORTRAN + NPY_OWNDATA + NPY_FORCECAST + NPY_ENSURECOPY + NPY_ENSUREARRAY + NPY_ELEMENTSTRIDES + NPY_ALIGNED + NPY_NOTSWAPPED + NPY_WRITEABLE + NPY_UPDATEIFCOPY + NPY_ARR_HAS_DESCR + + NPY_BEHAVED + NPY_BEHAVED_NS + NPY_CARRAY + NPY_CARRAY_RO + NPY_FARRAY + NPY_FARRAY_RO + NPY_DEFAULT + + NPY_IN_ARRAY + NPY_OUT_ARRAY + NPY_INOUT_ARRAY + NPY_IN_FARRAY + NPY_OUT_FARRAY + NPY_INOUT_FARRAY + + NPY_UPDATE_ALL + + cdef enum: + NPY_MAXDIMS + + npy_intp NPY_MAX_ELSIZE + + ctypedef void (*PyArray_VectorUnaryFunc)(void *, void *, npy_intp, void *, void *) + + ctypedef class numpy.dtype [object PyArray_Descr]: + # Use PyDataType_* macros when possible, however there are no macros + # for accessing some of the fields, so some are defined. Please + # ask on cython-dev if you need more. + cdef int type_num + cdef int itemsize "elsize" + cdef char byteorder + cdef object fields + cdef tuple names + + ctypedef extern class numpy.flatiter [object PyArrayIterObject]: + # Use through macros + pass + + ctypedef extern class numpy.broadcast [object PyArrayMultiIterObject]: + # Use through macros + pass + + ctypedef struct PyArrayObject: + # For use in situations where ndarray can't replace PyArrayObject*, + # like PyArrayObject**. + pass + + ctypedef class numpy.ndarray [object PyArrayObject]: + cdef __cythonbufferdefaults__ = {"mode": "strided"} + + cdef: + # Only taking a few of the most commonly used and stable fields. + # One should use PyArray_* macros instead to access the C fields. + char *data + int ndim "nd" + npy_intp *shape "dimensions" + npy_intp *strides + dtype descr + PyObject* base + + # Note: This syntax (function definition in pxd files) is an + # experimental exception made for __getbuffer__ and __releasebuffer__ + # -- the details of this may change. 
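    # The two special methods below implement the PEP 3118 buffer protocol for
    # ndarray: __getbuffer__ fills a Py_buffer with the data pointer, shape,
    # strides, itemsize and a struct-style format string (heap-allocated only
    # for record dtypes), and __releasebuffer__ frees what __getbuffer__
    # allocated.  This is what lets Cython modules that cimport this file use
    # np.ndarray[...] buffer access.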
+ def __getbuffer__(ndarray self, Py_buffer* info, int flags): + # This implementation of getbuffer is geared towards Cython + # requirements, and does not yet fullfill the PEP. + # In particular strided access is always provided regardless + # of flags + + if info == NULL: return + + cdef int copy_shape, i, ndim + cdef int endian_detector = 1 + cdef bint little_endian = ((&endian_detector)[0] != 0) + + ndim = PyArray_NDIM(self) + + if sizeof(npy_intp) != sizeof(Py_ssize_t): + copy_shape = 1 + else: + copy_shape = 0 + + if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + raise ValueError(u"ndarray is not C contiguous") + + if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + raise ValueError(u"ndarray is not Fortran contiguous") + + info.buf = PyArray_DATA(self) + info.ndim = ndim + if copy_shape: + # Allocate new buffer for strides and shape info. + # This is allocated as one block, strides first. + info.strides = stdlib.malloc(sizeof(Py_ssize_t) * ndim * 2) + info.shape = info.strides + ndim + for i in range(ndim): + info.strides[i] = PyArray_STRIDES(self)[i] + info.shape[i] = PyArray_DIMS(self)[i] + else: + info.strides = PyArray_STRIDES(self) + info.shape = PyArray_DIMS(self) + info.suboffsets = NULL + info.itemsize = PyArray_ITEMSIZE(self) + info.readonly = not PyArray_ISWRITEABLE(self) + + cdef int t + cdef char* f = NULL + cdef dtype descr = self.descr + cdef list stack + cdef int offset + + cdef bint hasfields = PyDataType_HASFIELDS(descr) + + if not hasfields and not copy_shape: + # do not call releasebuffer + info.obj = None + else: + # need to call releasebuffer + info.obj = self + + if not hasfields: + t = descr.type_num + if ((descr.byteorder == '>' and little_endian) or + (descr.byteorder == '<' and not little_endian)): + raise ValueError(u"Non-native byte order not supported") + if t == NPY_BYTE: f = "b" + elif t == NPY_UBYTE: f = "B" + elif t == NPY_SHORT: f = "h" + elif t == NPY_USHORT: f = "H" + elif t == NPY_INT: f = "i" + elif t == NPY_UINT: f = "I" + elif t == NPY_LONG: f = "l" + elif t == NPY_ULONG: f = "L" + elif t == NPY_LONGLONG: f = "q" + elif t == NPY_ULONGLONG: f = "Q" + elif t == NPY_FLOAT: f = "f" + elif t == NPY_DOUBLE: f = "d" + elif t == NPY_LONGDOUBLE: f = "g" + elif t == NPY_CFLOAT: f = "Zf" + elif t == NPY_CDOUBLE: f = "Zd" + elif t == NPY_CLONGDOUBLE: f = "Zg" + elif t == NPY_OBJECT: f = "O" + else: + raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + info.format = f + return + else: + info.format = stdlib.malloc(_buffer_format_string_len) + info.format[0] = '^' # Native data types, manual alignment + offset = 0 + f = _util_dtypestring(descr, info.format + 1, + info.format + _buffer_format_string_len, + &offset) + f[0] = 0 # Terminate format string + + def __releasebuffer__(ndarray self, Py_buffer* info): + if PyArray_HASFIELDS(self): + stdlib.free(info.format) + if sizeof(npy_intp) != sizeof(Py_ssize_t): + stdlib.free(info.strides) + # info.shape was stored after info.strides in the same block + + + ctypedef signed char npy_bool + + ctypedef signed char npy_byte + ctypedef signed short npy_short + ctypedef signed int npy_int + ctypedef signed long npy_long + ctypedef signed long long npy_longlong + + ctypedef unsigned char npy_ubyte + ctypedef unsigned short npy_ushort + ctypedef unsigned int npy_uint + ctypedef unsigned long npy_ulong + ctypedef unsigned long long npy_ulonglong + + ctypedef float 
npy_float + ctypedef double npy_double + ctypedef long double npy_longdouble + + ctypedef signed char npy_int8 + ctypedef signed short npy_int16 + ctypedef signed int npy_int32 + ctypedef signed long long npy_int64 + ctypedef signed long long npy_int96 + ctypedef signed long long npy_int128 + + ctypedef unsigned char npy_uint8 + ctypedef unsigned short npy_uint16 + ctypedef unsigned int npy_uint32 + ctypedef unsigned long long npy_uint64 + ctypedef unsigned long long npy_uint96 + ctypedef unsigned long long npy_uint128 + + ctypedef float npy_float16 + ctypedef float npy_float32 + ctypedef double npy_float64 + ctypedef long double npy_float80 + ctypedef long double npy_float96 + ctypedef long double npy_float128 + + ctypedef struct npy_cfloat: + double real + double imag + + ctypedef struct npy_cdouble: + double real + double imag + + ctypedef struct npy_clongdouble: + double real + double imag + + ctypedef struct npy_complex64: + double real + double imag + + ctypedef struct npy_complex128: + double real + double imag + + ctypedef struct npy_complex160: + double real + double imag + + ctypedef struct npy_complex192: + double real + double imag + + ctypedef struct npy_complex256: + double real + double imag + + ctypedef struct PyArray_Dims: + npy_intp *ptr + int len + + void import_array() + + # + # Macros from ndarrayobject.h + # + bint PyArray_CHKFLAGS(ndarray m, int flags) + bint PyArray_ISCONTIGUOUS(ndarray m) + bint PyArray_ISWRITEABLE(ndarray m) + bint PyArray_ISALIGNED(ndarray m) + + int PyArray_NDIM(ndarray) + bint PyArray_ISONESEGMENT(ndarray) + bint PyArray_ISFORTRAN(ndarray) + int PyArray_FORTRANIF(ndarray) + + void* PyArray_DATA(ndarray) + char* PyArray_BYTES(ndarray) + npy_intp* PyArray_DIMS(ndarray) + npy_intp* PyArray_STRIDES(ndarray) + npy_intp PyArray_DIM(ndarray, size_t) + npy_intp PyArray_STRIDE(ndarray, size_t) + + # object PyArray_BASE(ndarray) wrong refcount semantics + # dtype PyArray_DESCR(ndarray) wrong refcount semantics + int PyArray_FLAGS(ndarray) + npy_intp PyArray_ITEMSIZE(ndarray) + int PyArray_TYPE(ndarray arr) + + object PyArray_GETITEM(ndarray arr, void *itemptr) + int PyArray_SETITEM(ndarray arr, void *itemptr, object obj) + + bint PyTypeNum_ISBOOL(int) + bint PyTypeNum_ISUNSIGNED(int) + bint PyTypeNum_ISSIGNED(int) + bint PyTypeNum_ISINTEGER(int) + bint PyTypeNum_ISFLOAT(int) + bint PyTypeNum_ISNUMBER(int) + bint PyTypeNum_ISSTRING(int) + bint PyTypeNum_ISCOMPLEX(int) + bint PyTypeNum_ISPYTHON(int) + bint PyTypeNum_ISFLEXIBLE(int) + bint PyTypeNum_ISUSERDEF(int) + bint PyTypeNum_ISEXTENDED(int) + bint PyTypeNum_ISOBJECT(int) + + bint PyDataType_ISBOOL(dtype) + bint PyDataType_ISUNSIGNED(dtype) + bint PyDataType_ISSIGNED(dtype) + bint PyDataType_ISINTEGER(dtype) + bint PyDataType_ISFLOAT(dtype) + bint PyDataType_ISNUMBER(dtype) + bint PyDataType_ISSTRING(dtype) + bint PyDataType_ISCOMPLEX(dtype) + bint PyDataType_ISPYTHON(dtype) + bint PyDataType_ISFLEXIBLE(dtype) + bint PyDataType_ISUSERDEF(dtype) + bint PyDataType_ISEXTENDED(dtype) + bint PyDataType_ISOBJECT(dtype) + bint PyDataType_HASFIELDS(dtype) + + bint PyArray_ISBOOL(ndarray) + bint PyArray_ISUNSIGNED(ndarray) + bint PyArray_ISSIGNED(ndarray) + bint PyArray_ISINTEGER(ndarray) + bint PyArray_ISFLOAT(ndarray) + bint PyArray_ISNUMBER(ndarray) + bint PyArray_ISSTRING(ndarray) + bint PyArray_ISCOMPLEX(ndarray) + bint PyArray_ISPYTHON(ndarray) + bint PyArray_ISFLEXIBLE(ndarray) + bint PyArray_ISUSERDEF(ndarray) + bint PyArray_ISEXTENDED(ndarray) + bint PyArray_ISOBJECT(ndarray) + bint 
PyArray_HASFIELDS(ndarray) + + bint PyArray_ISVARIABLE(ndarray) + + bint PyArray_SAFEALIGNEDCOPY(ndarray) + bint PyArray_ISNBO(ndarray) + bint PyArray_IsNativeByteOrder(ndarray) + bint PyArray_ISNOTSWAPPED(ndarray) + bint PyArray_ISBYTESWAPPED(ndarray) + + bint PyArray_FLAGSWAP(ndarray, int) + + bint PyArray_ISCARRAY(ndarray) + bint PyArray_ISCARRAY_RO(ndarray) + bint PyArray_ISFARRAY(ndarray) + bint PyArray_ISFARRAY_RO(ndarray) + bint PyArray_ISBEHAVED(ndarray) + bint PyArray_ISBEHAVED_RO(ndarray) + + + bint PyDataType_ISNOTSWAPPED(dtype) + bint PyDataType_ISBYTESWAPPED(dtype) + + bint PyArray_DescrCheck(object) + + bint PyArray_Check(object) + bint PyArray_CheckExact(object) + + # Cannot be supported due to out arg: + # bint PyArray_HasArrayInterfaceType(object, dtype, object, object&) + # bint PyArray_HasArrayInterface(op, out) + + + bint PyArray_IsZeroDim(object) + # Cannot be supported due to ## ## in macro: + # bint PyArray_IsScalar(object, verbatim work) + bint PyArray_CheckScalar(object) + bint PyArray_IsPythonNumber(object) + bint PyArray_IsPythonScalar(object) + bint PyArray_IsAnyScalar(object) + bint PyArray_CheckAnyScalar(object) + ndarray PyArray_GETCONTIGUOUS(ndarray) + bint PyArray_SAMESHAPE(ndarray, ndarray) + npy_intp PyArray_SIZE(ndarray) + npy_intp PyArray_NBYTES(ndarray) + + object PyArray_FROM_O(object) + object PyArray_FROM_OF(object m, int flags) + bint PyArray_FROM_OT(object m, int type) + bint PyArray_FROM_OTF(object m, int type, int flags) + object PyArray_FROMANY(object m, int type, int min, int max, int flags) + object PyArray_ZEROS(int nd, npy_intp* dims, int type, int fortran) + object PyArray_EMPTY(int nd, npy_intp* dims, int type, int fortran) + void PyArray_FILLWBYTE(object, int val) + npy_intp PyArray_REFCOUNT(object) + object PyArray_ContiguousFromAny(op, int, int min_depth, int max_depth) + unsigned char PyArray_EquivArrTypes(ndarray a1, ndarray a2) + bint PyArray_EquivByteorders(int b1, int b2) + object PyArray_SimpleNew(int nd, npy_intp* dims, int typenum) + object PyArray_SimpleNewFromData(int nd, npy_intp* dims, int typenum, void* data) + #object PyArray_SimpleNewFromDescr(int nd, npy_intp* dims, dtype descr) + object PyArray_ToScalar(void* data, ndarray arr) + + void* PyArray_GETPTR1(ndarray m, npy_intp i) + void* PyArray_GETPTR2(ndarray m, npy_intp i, npy_intp j) + void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) + void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) + + void PyArray_XDECREF_ERR(ndarray) + # Cannot be supported due to out arg + # void PyArray_DESCR_REPLACE(descr) + + + object PyArray_Copy(ndarray) + object PyArray_FromObject(object op, int type, int min_depth, int max_depth) + object PyArray_ContiguousFromObject(object op, int type, int min_depth, int max_depth) + object PyArray_CopyFromObject(object op, int type, int min_depth, int max_depth) + + object PyArray_Cast(ndarray mp, int type_num) + object PyArray_Take(ndarray ap, object items, int axis) + object PyArray_Put(ndarray ap, object items, object values) + + void PyArray_ITER_RESET(flatiter it) nogil + void PyArray_ITER_NEXT(flatiter it) nogil + void PyArray_ITER_GOTO(flatiter it, npy_intp* destination) nogil + void PyArray_ITER_GOTO1D(flatiter it, npy_intp ind) nogil + void* PyArray_ITER_DATA(flatiter it) nogil + bint PyArray_ITER_NOTDONE(flatiter it) nogil + + void PyArray_MultiIter_RESET(broadcast multi) nogil + void PyArray_MultiIter_NEXT(broadcast multi) nogil + void PyArray_MultiIter_GOTO(broadcast multi, npy_intp dest) 
nogil + void PyArray_MultiIter_GOTO1D(broadcast multi, npy_intp ind) nogil + void* PyArray_MultiIter_DATA(broadcast multi, npy_intp i) nogil + void PyArray_MultiIter_NEXTi(broadcast multi, npy_intp i) nogil + bint PyArray_MultiIter_NOTDONE(broadcast multi) nogil + + # Functions from __multiarray_api.h + + # Functions taking dtype and returning object/ndarray are disabled + # for now as they steal dtype references. I'm conservative and disable + # more than is probably needed until it can be checked further. + int PyArray_SetNumericOps (object) + object PyArray_GetNumericOps () + int PyArray_INCREF (ndarray) + int PyArray_XDECREF (ndarray) + void PyArray_SetStringFunction (object, int) + dtype PyArray_DescrFromType (int) + object PyArray_TypeObjectFromType (int) + char * PyArray_Zero (ndarray) + char * PyArray_One (ndarray) + #object PyArray_CastToType (ndarray, dtype, int) + int PyArray_CastTo (ndarray, ndarray) + int PyArray_CastAnyTo (ndarray, ndarray) + int PyArray_CanCastSafely (int, int) + npy_bool PyArray_CanCastTo (dtype, dtype) + int PyArray_ObjectType (object, int) + dtype PyArray_DescrFromObject (object, dtype) + #ndarray* PyArray_ConvertToCommonType (object, int *) + dtype PyArray_DescrFromScalar (object) + dtype PyArray_DescrFromTypeObject (object) + npy_intp PyArray_Size (object) + #object PyArray_Scalar (void *, dtype, object) + #object PyArray_FromScalar (object, dtype) + void PyArray_ScalarAsCtype (object, void *) + #int PyArray_CastScalarToCtype (object, void *, dtype) + #int PyArray_CastScalarDirect (object, dtype, void *, int) + object PyArray_ScalarFromObject (object) + #PyArray_VectorUnaryFunc * PyArray_GetCastFunc (dtype, int) + object PyArray_FromDims (int, int *, int) + #object PyArray_FromDimsAndDataAndDescr (int, int *, dtype, char *) + #object PyArray_FromAny (object, dtype, int, int, int, object) + object PyArray_EnsureArray (object) + object PyArray_EnsureAnyArray (object) + #object PyArray_FromFile (stdio.FILE *, dtype, npy_intp, char *) + #object PyArray_FromString (char *, npy_intp, dtype, npy_intp, char *) + #object PyArray_FromBuffer (object, dtype, npy_intp, npy_intp) + #object PyArray_FromIter (object, dtype, npy_intp) + object PyArray_Return (ndarray) + #object PyArray_GetField (ndarray, dtype, int) + #int PyArray_SetField (ndarray, dtype, int, object) + object PyArray_Byteswap (ndarray, npy_bool) + object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER) + int PyArray_MoveInto (ndarray, ndarray) + int PyArray_CopyInto (ndarray, ndarray) + int PyArray_CopyAnyInto (ndarray, ndarray) + int PyArray_CopyObject (ndarray, object) + object PyArray_NewCopy (ndarray, NPY_ORDER) + object PyArray_ToList (ndarray) + object PyArray_ToString (ndarray, NPY_ORDER) + int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *) + int PyArray_Dump (object, object, int) + object PyArray_Dumps (object, int) + int PyArray_ValidType (int) + void PyArray_UpdateFlags (ndarray, int) + object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object) + #object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object) + #dtype PyArray_DescrNew (dtype) + dtype PyArray_DescrNewFromType (int) + double PyArray_GetPriority (object, double) + object PyArray_IterNew (object) + object PyArray_MultiIterNew (int, ...) 
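Editorial note: the PyArray_MultiIter_* macros and PyArray_MultiIterNew declared here expose NumPy's broadcasting multi-iterator to Cython code. As a rough orientation only (this snippet is not part of the imported source and uses the public Python API rather than the C API), np.broadcast is the Python-level counterpart of the same machinery; the PyArray_MultiIterNew1..5 helpers defined later in this file simply forward fixed argument counts to the varargs C call.

import numpy as np

# Walk two arrays in lockstep under broadcasting rules, which is what the
# multi-iterator created by PyArray_MultiIterNew does at the C level.
a = np.array([[1], [2], [3]])   # shape (3, 1)
b = np.array([10, 20])          # shape (2,)
it = np.broadcast(a, b)         # broadcast shape is (3, 2)
print(it.shape)                 # (3, 2)
print(list(it)[:3])             # pairs (1, 10), (1, 20), (2, 10), ...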
+ + int PyArray_PyIntAsInt (object) + npy_intp PyArray_PyIntAsIntp (object) + int PyArray_Broadcast (broadcast) + void PyArray_FillObjectArray (ndarray, object) + int PyArray_FillWithScalar (ndarray, object) + npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *) + dtype PyArray_DescrNewByteorder (dtype, char) + object PyArray_IterAllButAxis (object, int *) + #object PyArray_CheckFromAny (object, dtype, int, int, int, object) + #object PyArray_FromArray (ndarray, dtype, int) + object PyArray_FromInterface (object) + object PyArray_FromStructInterface (object) + #object PyArray_FromArrayAttr (object, dtype, object) + #NPY_SCALARKIND PyArray_ScalarKind (int, ndarray*) + int PyArray_CanCoerceScalar (int, int, NPY_SCALARKIND) + object PyArray_NewFlagsObject (object) + npy_bool PyArray_CanCastScalar (type, type) + #int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t) + int PyArray_RemoveSmallest (broadcast) + int PyArray_ElementStrides (object) + void PyArray_Item_INCREF (char *, dtype) + void PyArray_Item_XDECREF (char *, dtype) + object PyArray_FieldNames (object) + object PyArray_Transpose (ndarray, PyArray_Dims *) + object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE) + object PyArray_PutTo (ndarray, object, object, NPY_CLIPMODE) + object PyArray_PutMask (ndarray, object, object) + object PyArray_Repeat (ndarray, object, int) + object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE) + int PyArray_Sort (ndarray, int, NPY_SORTKIND) + object PyArray_ArgSort (ndarray, int, NPY_SORTKIND) + object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE) + object PyArray_ArgMax (ndarray, int, ndarray) + object PyArray_ArgMin (ndarray, int, ndarray) + object PyArray_Reshape (ndarray, object) + object PyArray_Newshape (ndarray, PyArray_Dims *, NPY_ORDER) + object PyArray_Squeeze (ndarray) + #object PyArray_View (ndarray, dtype, type) + object PyArray_SwapAxes (ndarray, int, int) + object PyArray_Max (ndarray, int, ndarray) + object PyArray_Min (ndarray, int, ndarray) + object PyArray_Ptp (ndarray, int, ndarray) + object PyArray_Mean (ndarray, int, int, ndarray) + object PyArray_Trace (ndarray, int, int, int, int, ndarray) + object PyArray_Diagonal (ndarray, int, int, int) + object PyArray_Clip (ndarray, object, object, ndarray) + object PyArray_Conjugate (ndarray, ndarray) + object PyArray_Nonzero (ndarray) + object PyArray_Std (ndarray, int, int, ndarray, int) + object PyArray_Sum (ndarray, int, int, ndarray) + object PyArray_CumSum (ndarray, int, int, ndarray) + object PyArray_Prod (ndarray, int, int, ndarray) + object PyArray_CumProd (ndarray, int, int, ndarray) + object PyArray_All (ndarray, int, ndarray) + object PyArray_Any (ndarray, int, ndarray) + object PyArray_Compress (ndarray, object, int, ndarray) + object PyArray_Flatten (ndarray, NPY_ORDER) + object PyArray_Ravel (ndarray, NPY_ORDER) + npy_intp PyArray_MultiplyList (npy_intp *, int) + int PyArray_MultiplyIntList (int *, int) + void * PyArray_GetPtr (ndarray, npy_intp*) + int PyArray_CompareLists (npy_intp *, npy_intp *, int) + #int PyArray_AsCArray (object*, void *, npy_intp *, int, dtype) + #int PyArray_As1D (object*, char **, int *, int) + #int PyArray_As2D (object*, char ***, int *, int *, int) + int PyArray_Free (object, void *) + #int PyArray_Converter (object, object*) + int PyArray_IntpFromSequence (object, npy_intp *, int) + object PyArray_Concatenate (object, int) + object PyArray_InnerProduct (object, object) + object PyArray_MatrixProduct (object, object) + 
object PyArray_CopyAndTranspose (object) + object PyArray_Correlate (object, object, int) + int PyArray_TypestrConvert (int, int) + #int PyArray_DescrConverter (object, dtype*) + #int PyArray_DescrConverter2 (object, dtype*) + int PyArray_IntpConverter (object, PyArray_Dims *) + #int PyArray_BufferConverter (object, chunk) + int PyArray_AxisConverter (object, int *) + int PyArray_BoolConverter (object, npy_bool *) + int PyArray_ByteorderConverter (object, char *) + int PyArray_OrderConverter (object, NPY_ORDER *) + unsigned char PyArray_EquivTypes (dtype, dtype) + #object PyArray_Zeros (int, npy_intp *, dtype, int) + #object PyArray_Empty (int, npy_intp *, dtype, int) + object PyArray_Where (object, object, object) + object PyArray_Arange (double, double, double, int) + #object PyArray_ArangeObj (object, object, object, dtype) + int PyArray_SortkindConverter (object, NPY_SORTKIND *) + object PyArray_LexSort (object, int) + object PyArray_Round (ndarray, int, ndarray) + unsigned char PyArray_EquivTypenums (int, int) + int PyArray_RegisterDataType (dtype) + int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *) + int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND) + #void PyArray_InitArrFuncs (PyArray_ArrFuncs *) + object PyArray_IntTupleFromIntp (int, npy_intp *) + int PyArray_TypeNumFromName (char *) + int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *) + #int PyArray_OutputConverter (object, ndarray*) + object PyArray_BroadcastToShape (object, npy_intp *, int) + void _PyArray_SigintHandler (int) + void* _PyArray_GetSigintBuf () + #int PyArray_DescrAlignConverter (object, dtype*) + #int PyArray_DescrAlignConverter2 (object, dtype*) + int PyArray_SearchsideConverter (object, void *) + object PyArray_CheckAxis (ndarray, int *, int) + npy_intp PyArray_OverflowMultiplyList (npy_intp *, int) + int PyArray_CompareString (char *, char *, size_t) + + +# Typedefs that matches the runtime dtype objects in +# the numpy module. + +# The ones that are commented out needs an IFDEF function +# in Cython to enable them only on the right systems. 
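Editorial note: the remark above about IFDEF-gated typedefs refers to the aliases in the list that follows; the extended-precision names such as float96/float128 and complex192/complex256 exist at runtime only on platforms whose C long double actually provides that width. A quick way to see this from Python (an illustration, not part of the imported source):

import numpy as np

# Extended-precision aliases are created only when the platform supports
# them, which is why the matching ctypedefs below remain commented out.
for name in ("float96", "float128", "complex192", "complex256"):
    print(name, hasattr(np, name))

# np.longdouble always exists, but its width varies by platform/compiler
# (it may effectively alias float64, float96 or float128).
print(np.dtype(np.longdouble).itemsize)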
+ +ctypedef npy_int8 int8_t +ctypedef npy_int16 int16_t +ctypedef npy_int32 int32_t +ctypedef npy_int64 int64_t +#ctypedef npy_int96 int96_t +#ctypedef npy_int128 int128_t + +ctypedef npy_uint8 uint8_t +ctypedef npy_uint16 uint16_t +ctypedef npy_uint32 uint32_t +ctypedef npy_uint64 uint64_t +#ctypedef npy_uint96 uint96_t +#ctypedef npy_uint128 uint128_t + +ctypedef npy_float16 float16_t +ctypedef npy_float32 float32_t +ctypedef npy_float64 float64_t +#ctypedef npy_float80 float80_t +#ctypedef npy_float128 float128_t + +ctypedef float complex complex64_t +ctypedef double complex complex128_t + +# The int types are mapped a bit surprising -- +# numpy.int corresponds to 'l' and numpy.long to 'q' +ctypedef npy_long int_t +ctypedef npy_longlong long_t +ctypedef npy_longlong longlong_t + +ctypedef npy_ulong uint_t +ctypedef npy_ulonglong ulong_t +ctypedef npy_ulonglong ulonglong_t + +ctypedef npy_intp intp_t +ctypedef npy_uintp uintp_t + +ctypedef npy_double float_t +ctypedef npy_double double_t +ctypedef npy_longdouble longdouble_t + +ctypedef npy_cfloat cfloat_t +ctypedef npy_cdouble cdouble_t +ctypedef npy_clongdouble clongdouble_t + +ctypedef npy_cdouble complex_t + +cdef inline object PyArray_MultiIterNew1(a): + return PyArray_MultiIterNew(1, a) + +cdef inline object PyArray_MultiIterNew2(a, b): + return PyArray_MultiIterNew(2, a, b) + +cdef inline object PyArray_MultiIterNew3(a, b, c): + return PyArray_MultiIterNew(3, a, b, c) + +cdef inline object PyArray_MultiIterNew4(a, b, c, d): + return PyArray_MultiIterNew(4, a, b, c, d) + +cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + return PyArray_MultiIterNew(5, a, b, c, d, e) + +cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: + # Recursive utility function used in __getbuffer__ to get format + # string. The new location in the format string is returned. + + cdef dtype child + cdef int delta_offset + cdef tuple i + cdef int endian_detector = 1 + cdef bint little_endian = ((&endian_detector)[0] != 0) + cdef tuple fields + + for childname in descr.names: + fields = descr.fields[childname] + child, new_offset = fields + + if (end - f) - (new_offset - offset[0]) < 15: + raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + + if ((child.byteorder == '>' and little_endian) or + (child.byteorder == '<' and not little_endian)): + raise ValueError(u"Non-native byte order not supported") + # One could encode it in the format string and have Cython + # complain instead, BUT: < and > in format strings also imply + # standardized sizes for datatypes, and we rely on native in + # order to avoid reencoding data types based on their size. + # + # A proper PEP 3118 exporter for other clients than Cython + # must deal properly with this! 
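        # Editorial note, not part of the imported source: as a concrete
        # illustration of what this routine produces, a packed structured
        # dtype such as np.dtype([('a', 'i4'), ('b', 'f8')]) yields "id"
        # after the leading '^' written by __getbuffer__ above (on the usual
        # platforms where a C int is 32 bits). 'x' pad bytes are emitted
        # below only where a field's offset leaves a gap, and field names /
        # "T{...}" struct markers are dropped because Cython ignores them.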
+ + # Output padding bytes + while offset[0] < new_offset: + f[0] = 120 # "x"; pad byte + f += 1 + offset[0] += 1 + + offset[0] += child.itemsize + + if not PyDataType_HASFIELDS(child): + t = child.type_num + if end - f < 5: + raise RuntimeError(u"Format string allocated too short.") + + # Until ticket #99 is fixed, use integers to avoid warnings + if t == NPY_BYTE: f[0] = 98 #"b" + elif t == NPY_UBYTE: f[0] = 66 #"B" + elif t == NPY_SHORT: f[0] = 104 #"h" + elif t == NPY_USHORT: f[0] = 72 #"H" + elif t == NPY_INT: f[0] = 105 #"i" + elif t == NPY_UINT: f[0] = 73 #"I" + elif t == NPY_LONG: f[0] = 108 #"l" + elif t == NPY_ULONG: f[0] = 76 #"L" + elif t == NPY_LONGLONG: f[0] = 113 #"q" + elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + elif t == NPY_FLOAT: f[0] = 102 #"f" + elif t == NPY_DOUBLE: f[0] = 100 #"d" + elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + elif t == NPY_OBJECT: f[0] = 79 #"O" + else: + raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + f += 1 + else: + # Cython ignores struct boundary information ("T{...}"), + # so don't output it + f = _util_dtypestring(child, f, end, offset) + return f + + +# +# ufunc API +# + +cdef extern from "numpy/ufuncobject.h": + + ctypedef void (*PyUFuncGenericFunction) (char **, npy_intp *, npy_intp *, void *) + + ctypedef extern class numpy.ufunc [object PyUFuncObject]: + cdef: + int nin, nout, nargs + int identity + PyUFuncGenericFunction *functions + void **data + int ntypes + int check_return + char *name + char *types + char *doc + void *ptr + PyObject *obj + PyObject *userloops + + cdef enum: + PyUFunc_Zero + PyUFunc_One + PyUFunc_None + UFUNC_ERR_IGNORE + UFUNC_ERR_WARN + UFUNC_ERR_RAISE + UFUNC_ERR_CALL + UFUNC_ERR_PRINT + UFUNC_ERR_LOG + UFUNC_MASK_DIVIDEBYZERO + UFUNC_MASK_OVERFLOW + UFUNC_MASK_UNDERFLOW + UFUNC_MASK_INVALID + UFUNC_SHIFT_DIVIDEBYZERO + UFUNC_SHIFT_OVERFLOW + UFUNC_SHIFT_UNDERFLOW + UFUNC_SHIFT_INVALID + UFUNC_FPE_DIVIDEBYZERO + UFUNC_FPE_OVERFLOW + UFUNC_FPE_UNDERFLOW + UFUNC_FPE_INVALID + UFUNC_ERR_DEFAULT + UFUNC_ERR_DEFAULT2 + + object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *, + void **, char *, int, int, int, int, char *, char *, int) + int PyUFunc_RegisterLoopForType(ufunc, int, + PyUFuncGenericFunction, int *, void *) + int PyUFunc_GenericFunction \ + (ufunc, PyObject *, PyObject *, PyArrayObject **) + void PyUFunc_f_f_As_d_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_d_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_f_f \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_g_g \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_F_F_As_D_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_F_F \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_D_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_G_G \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_O_O \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_ff_f_As_dd_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_ff_f \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_dd_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_gg_g \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_FF_F_As_DD_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_DD_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_FF_F \ + (char 
**, npy_intp *, npy_intp *, void *) + void PyUFunc_GG_G \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_OO_O \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_O_O_method \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_OO_O_method \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_On_Om \ + (char **, npy_intp *, npy_intp *, void *) + int PyUFunc_GetPyValues \ + (char *, int *, int *, PyObject **) + int PyUFunc_checkfperr \ + (int, PyObject *, int *) + void PyUFunc_clearfperr() + int PyUFunc_getfperr() + int PyUFunc_handlefperr \ + (int, PyObject *, int, int *) + int PyUFunc_ReplaceLoopBySignature \ + (ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *) + object PyUFunc_FromFuncAndDataAndSignature \ + (PyUFuncGenericFunction *, void **, char *, int, int, int, + int, char *, char *, int, char *) + + void import_ufunc() + + +cdef inline void set_array_base(ndarray arr, object base): + cdef PyObject* baseptr + if base is None: + baseptr = NULL + else: + Py_INCREF(base) # important to do this before decref below! + baseptr = base + Py_XDECREF(arr.base) + arr.base = baseptr + +cdef inline object get_array_base(ndarray arr): + if arr.base is NULL: + return None + else: + return arr.base diff --git a/pandas/src/numpy_helper.h b/pandas/src/numpy_helper.h new file mode 100644 index 00000000..69b849de --- /dev/null +++ b/pandas/src/numpy_helper.h @@ -0,0 +1,185 @@ +#include "Python.h" +#include "numpy/arrayobject.h" +#include "numpy/arrayscalars.h" +#include "helper.h" + +#define PANDAS_FLOAT 0 +#define PANDAS_INT 1 +#define PANDAS_BOOL 2 +#define PANDAS_STRING 3 +#define PANDAS_OBJECT 4 +#define PANDAS_DATETIME 5 + +PANDAS_INLINE int +infer_type(PyObject* obj) { + if (PyBool_Check(obj)) { + return PANDAS_BOOL; + } + else if (PyArray_IsIntegerScalar(obj)) { + return PANDAS_INT; + } + else if (PyArray_IsScalar(obj, Datetime)) { + return PANDAS_DATETIME; + } + else if (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)) { + return PANDAS_FLOAT; + } + else if (PyString_Check(obj) || PyUnicode_Check(obj)) { + return PANDAS_STRING; + } + else { + return PANDAS_OBJECT; + } +} + +PANDAS_INLINE npy_int64 +get_nat(void) { + return NPY_MIN_INT64; +} + +PANDAS_INLINE npy_datetime +get_datetime64_value(PyObject* obj) { + return ((PyDatetimeScalarObject*) obj)->obval; + +} + +PANDAS_INLINE int +is_integer_object(PyObject* obj) { + return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); +// return PyArray_IsIntegerScalar(obj); +} + +PANDAS_INLINE int +is_float_object(PyObject* obj) { + return (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)); +} +PANDAS_INLINE int +is_complex_object(PyObject* obj) { + return (PyComplex_Check(obj) || PyArray_IsScalar(obj, ComplexFloating)); +} + +PANDAS_INLINE int +is_bool_object(PyObject* obj) { + return (PyBool_Check(obj) || PyArray_IsScalar(obj, Bool)); +} + +PANDAS_INLINE int +is_string_object(PyObject* obj) { + return (PyString_Check(obj) || PyUnicode_Check(obj)); +} + +PANDAS_INLINE int +is_datetime64_object(PyObject *obj) { + return PyArray_IsScalar(obj, Datetime); +} + +PANDAS_INLINE int +is_timedelta64_object(PyObject *obj) { + return PyArray_IsScalar(obj, Timedelta); +} + +PANDAS_INLINE int +assign_value_1d(PyArrayObject* ap, Py_ssize_t _i, PyObject* v) { + npy_intp i = (npy_intp) _i; + char *item = (char *) PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 0); + return PyArray_DESCR(ap)->f->setitem(v, item, ap); +} + +PANDAS_INLINE PyObject* +get_value_1d(PyArrayObject* ap, Py_ssize_t i) { + char *item = 
(char *) PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 0); + return PyArray_Scalar(item, PyArray_DESCR(ap), (PyObject*) ap); +} + + +PANDAS_INLINE char* +get_c_string(PyObject* obj) { +#if PY_VERSION_HEX >= 0x03000000 + PyObject* enc_str = PyUnicode_AsEncodedString(obj, "utf-8", "error"); + + char *ret; + ret = PyBytes_AS_STRING(enc_str); + + // TODO: memory leak here + + // Py_XDECREF(enc_str); + return ret; +#else + return PyString_AsString(obj); +#endif +} + +PANDAS_INLINE PyObject* +char_to_string(char* data) { +#if PY_VERSION_HEX >= 0x03000000 + return PyUnicode_FromString(data); +#else + return PyString_FromString(data); +#endif +} + +// PANDAS_INLINE int +// is_string(PyObject* obj) { +// #if PY_VERSION_HEX >= 0x03000000 +// return PyUnicode_Check(obj); +// #else +// return PyString_Check(obj); +// #endif + +PyObject* sarr_from_data(PyArray_Descr *descr, int length, void* data) { + PyArrayObject *result; + npy_intp dims[1] = {length}; + Py_INCREF(descr); // newfromdescr steals a reference to descr + result = (PyArrayObject*) PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, + NULL, data, 0, NULL); + + // Returned array doesn't own data by default + result->flags |= NPY_OWNDATA; + + return (PyObject*) result; +} + + +void transfer_object_column(char *dst, char *src, size_t stride, + size_t length) { + int i; + size_t sz = sizeof(PyObject*); + + for (i = 0; i < length; ++i) + { + // uninitialized data + + // Py_XDECREF(*((PyObject**) dst)); + + memcpy(dst, src, sz); + Py_INCREF(*((PyObject**) dst)); + src += sz; + dst += stride; + } +} + +void set_array_owndata(PyArrayObject *ao) { + ao->flags |= NPY_OWNDATA; +} + +void set_array_not_contiguous(PyArrayObject *ao) { + ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS); +} + + +// PANDAS_INLINE PyObject* +// get_base_ndarray(PyObject* ap) { +// // if (!ap || (NULL == ap)) { +// // Py_RETURN_NONE; +// // } + +// while (!PyArray_CheckExact(ap)) { +// ap = PyArray_BASE((PyArrayObject*) ap); +// if (ap == Py_None) Py_RETURN_NONE; +// } +// // PyArray_BASE is a borrowed reference +// if(ap) { +// Py_INCREF(ap); +// } +// return ap; +// } diff --git a/pandas/src/offsets.pyx b/pandas/src/offsets.pyx new file mode 100644 index 00000000..096198c8 --- /dev/null +++ b/pandas/src/offsets.pyx @@ -0,0 +1,367 @@ + +ctypedef enum time_res: + r_min = 0 + r_microsecond + r_second + r_minute + r_hour + r_day + r_month + r_year + r_max = 98 + r_invalid = 99 + + +cdef conversion_factor(time_res res1, time_res res2): + cdef: + time_res min_res, max_res + int64_t factor + + min_res = min(res1, res2) + max_res = max(res1, res2) + factor = 1 + + if min_res == max_res: + return factor + + while min_res < max_res: + if min_res < r_microsecond: + raise "Cannot convert from less than us" + elif min_res == r_microsecond: + factor *= 1000000 + min_res = r_second + elif min_res == r_second: + factor *= 60 + min_res = r_minute + elif min_res == r_minute: + factor *= 60 + min_res = r_hour + elif min_res == r_hour: + factor *= 24 + min_res = r_day + else: + raise "Cannot convert to month or year" + + return factor + +# Logic to generate ranges +# ----------------------------------------------------------------------------- + +cdef inline int64_t weekend_adjustment(int64_t dow, int bkwd): + if dow > 4: # sat or sun? + if bkwd: # roll back 1 or 2 days + return (4 - dow) + else: # roll forward 2 or 1 days + return (7 - dow) + return 0 + +cdef int64_t us_in_day = conversion_factor(r_microsecond, r_day) + +cdef class _Offset: + """ + Base class to generate timestamps. 
Set the anchor, and then move offsets + with next & prev. Retrieve timestamp with ts attribute. + """ + cdef: + int64_t t, dow, biz, dayoffset + object start + _TSObject ts + + def __cinit__(self): + self.t=0 + self.dow=0 + self.biz=0 + self.dayoffset=0 + + cpdef anchor(self, object start=None): + if start is not None: + self.start = start + self.ts = convert_to_tsobject(self.start, None, None) + self._setup() + + cdef _setup(self): + pass + + cpdef next(self): + pass + + cpdef __next__(self): + """wrapper around next""" + return self.next() + + cpdef prev(self): + pass + + cdef int64_t _ts(self): + """ + Access the current timestamp value, with a possible weekday + adjustment. + """ + cdef int64_t adj + + if self.biz != 0: + adj = weekend_adjustment(self.dow, self.biz < 0) + return self.t + us_in_day * adj + else: + return self.t + + cdef int64_t _get_anchor(self): + """ + Retrieve an anchor relating to current offset we're on. + """ + return self.t - self.dayoffset * us_in_day + + property ts: + def __get__(self): + return self._ts() + +cdef class YearOffset(_Offset): + """ + Generate annual timestamps from provided start time; apply dayoffset to + each timestamp. If biz > 0, we choose the next business day at each time; + previous if < 0. + + Parameters + ---------- + dayoffset : int + biz : int + """ + cdef: + int64_t y, ly + + def __init__(self, int64_t dayoffset=0, int64_t biz=0, object anchor=None): + self.dayoffset = dayoffset + self.biz = biz + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + + self.t = ts.value + self.dayoffset * us_in_day + self.y = ts.dts.year + + self.ly = (ts.dts.month > 2 or + ts.dts.month == 2 and ts.dts.day == 29) + + if self.biz != 0: + self.dow = (ts_dayofweek(ts) + self.dayoffset) % 7 + + cpdef next(self): + cdef int64_t days + + days = 365 + is_leapyear(self.y + self.ly) + + self.t += days * us_in_day + self.y += 1 + + if self.biz != 0: + self.dow = (self.dow + days) % 7 + + cpdef prev(self): + cdef int64_t days + + days = 365 + is_leapyear(self.y - (1-self.ly)) + + self.t -= days * us_in_day + self.y -= 1 + + if self.biz != 0: + self.dow = (self.dow - days) % 7 + +cdef class MonthOffset(_Offset): + """ + Generate monthly timestamps from provided start time, and apply dayoffset + to each timestamp. Stride to construct strided timestamps (eg quarterly). + If biz > 0, we choose the next business day at each time; previous if < 0. 
+ + Parameters + ---------- + dayoffset : int + stride : int, > 0 + biz : int + """ + cdef: + Py_ssize_t stride, ly, m + int64_t y + + def __init__(self, int64_t dayoffset=0, Py_ssize_t stride=1, + int64_t biz=0, object anchor=None): + self.dayoffset = dayoffset + self.stride = stride + self.biz = biz + + if stride <= 0: + raise ValueError("Stride must be positive") + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + + self.t = ts.value + (self.dayoffset * us_in_day) + + # for day counting + self.m = ts.dts.month - 1 + self.y = ts.dts.year + self.ly = is_leapyear(self.y) + + if self.biz != 0: + self.dow = (ts_dayofweek(ts) + self.dayoffset) % 7 + + cpdef next(self): + cdef: + int64_t tmp, days + Py_ssize_t j + + days = 0 + for j in range(0, self.stride): + if self.m >= 12: + self.m -= 12 + self.y += 1 + self.ly = is_leapyear(self.y) + days += days_per_month_table[self.ly][self.m] + self.m += 1 + + self.t += days * us_in_day + + if self.biz != 0: + self.dow = (self.dow + days) % 7 + + cpdef prev(self): + cdef: + int64_t tmp, days + Py_ssize_t j + + days = 0 + for j in range(0, self.stride): + self.m -= 1 + if self.m < 0: + self.m += 12 + self.y -= 1 + self.ly = is_leapyear(self.y) + days += days_per_month_table[self.ly][self.m] + + self.t -= days * us_in_day + + if self.biz != 0: + self.dow = (self.dow - days) % 7 + +cdef class DayOfMonthOffset(_Offset): + """ + Generate relative monthly timestamps from month & year of provided start + time. For example, fridays of the third week of each month (week=3, day=4); + or, thursdays of the last week of each month (week=-1, day=3). + + Parameters + ---------- + week : int + day : int, 0 to 6 + """ + cdef: + Py_ssize_t ly, m + int64_t y, day, week + + def __init__(self, int64_t week=0, int64_t day=0, object anchor=None): + self.week = week + self.day = day + + if self.day < 0 or self.day > 6: + raise ValueError("Day offset must be 0 to 6") + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + + # rewind to beginning of month + self.t = ts.value - (ts.dts.day - 1) * us_in_day + self.dow = dayofweek(ts.dts.year, ts.dts.month, 1) + + # for day counting + self.m = ts.dts.month - 1 + self.y = ts.dts.year + self.ly = is_leapyear(self.y) + + cpdef next(self): + cdef: + int64_t tmp, days + + days = days_per_month_table[self.ly][self.m] + self.t += days * us_in_day + self.dow = (self.dow + days) % 7 + + self.m += 1 + if self.m >= 12: + self.m -= 12 + self.y += 1 + self.ly = is_leapyear(self.y) + + cpdef prev(self): + cdef: + int64_t tmp, days + + days = days_per_month_table[self.ly][(self.m - 1) % 12] + self.t -= days * us_in_day + self.dow = (self.dow - days) % 7 + + self.m -= 1 + if self.m < 0: + self.m += 12 + self.y -= 1 + self.ly = is_leapyear(self.y) + + cdef int64_t _ts(self): + """ + Overwrite default adjustment + """ + cdef int64_t adj = (self.week * 7) + (self.day - self.dow) % 7 + return self.t + us_in_day * adj + +cdef class DayOffset(_Offset): + """ + Generate daily timestamps beginning with first valid time >= start time. If + biz != 0, we skip weekends. Stride, to construct weekly timestamps. 
+ + Parameters + ---------- + stride : int, > 0 + biz : boolean + """ + cdef: + Py_ssize_t stride + + def __init__(self, int64_t stride=1, int64_t biz=0, object anchor=None): + self.stride = stride + self.biz = biz + + if self.stride <= 0: + raise ValueError("Stride must be positive") + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + self.t = ts.value + if self.biz != 0: + self.dow = ts_dayofweek(ts) + + cpdef next(self): + self.t += (self.stride * us_in_day) + if self.biz != 0: + self.dow = (self.dow + self.stride) % 7 + if self.dow >= 5: + self.t += (7 - self.dow) * us_in_day + self.dow = 0 + + cpdef prev(self): + self.t -= (self.stride * us_in_day) + if self.biz != 0: + self.dow = (self.dow - self.stride) % 7 + if self.dow >= 5: + self.t += (4 - self.dow) * us_in_day + self.dow = 4 diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h new file mode 100644 index 00000000..763cbc03 --- /dev/null +++ b/pandas/src/parse_helper.h @@ -0,0 +1,246 @@ +#include +#include + +static double xstrtod(const char *p, char **q, char decimal, char sci, + int skip_trailing); + +int to_double(char *item, double *p_value, char sci, char decimal) +{ + char *p_end; + + *p_value = xstrtod(item, &p_end, decimal, sci, 1); + + return (errno == 0) && (!*p_end); +} + +#if PY_VERSION_HEX < 0x02060000 + #define PyBytes_Check PyString_Check + #define PyBytes_AS_STRING PyString_AS_STRING +#endif + +int floatify(PyObject* str, double *result) { + int status; + char *data; + PyObject* tmp = NULL; + const char sci = 'E'; + const char dec = '.'; + + if (PyBytes_Check(str)) { + data = PyBytes_AS_STRING(str); + } else if (PyUnicode_Check(str)) { + tmp = PyUnicode_AsUTF8String(str); + data = PyBytes_AS_STRING(tmp); + } else { + PyErr_SetString(PyExc_TypeError, "Invalid object type"); + return -1; + } + + status = to_double(data, result, sci, dec); + + if (!status) { + /* handle inf/-inf */ + if (0 == strcmp(data, "-inf")) { + *result = -HUGE_VAL; + } else if (0 == strcmp(data, "inf")) { + *result = HUGE_VAL; + } else { + PyErr_SetString(PyExc_ValueError, "Unable to parse string"); + Py_XDECREF(tmp); + return -1; + } + } + + Py_XDECREF(tmp); + return 0; + +/* +#if PY_VERSION_HEX >= 0x03000000 + return PyFloat_FromString(str); +#else + return PyFloat_FromString(str, NULL); +#endif +*/ + +} + + +// --------------------------------------------------------------------------- +// Implementation of xstrtod + +// +// strtod.c +// +// Convert string to double +// +// Copyright (C) 2002 Michael Ringgaard. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. Neither the name of the project nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// ----------------------------------------------------------------------- +// Modifications by Warren Weckesser, March 2011: +// * Rename strtod() to xstrtod(). +// * Added decimal and sci arguments. +// * Skip trailing spaces. +// * Commented out the other functions. +// + +PANDAS_INLINE void lowercase(char *p) { + for ( ; *p; ++p) *p = tolower(*p); +} + +PANDAS_INLINE void uppercase(char *p) { + for ( ; *p; ++p) *p = toupper(*p); +} + + +static double xstrtod(const char *str, char **endptr, char decimal, + char sci, int skip_trailing) +{ + double number; + int exponent; + int negative; + char *p = (char *) str; + double p10; + int n; + int num_digits; + int num_decimals; + + errno = 0; + + // Skip leading whitespace + while (isspace(*p)) p++; + + // Handle optional sign + negative = 0; + switch (*p) + { + case '-': negative = 1; // Fall through to increment position + case '+': p++; + } + + number = 0.; + exponent = 0; + num_digits = 0; + num_decimals = 0; + + // Process string of digits + while (isdigit(*p)) + { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + } + + // Process decimal part + if (*p == decimal) + { + p++; + + while (isdigit(*p)) + { + number = number * 10. 
+ (*p - '0'); + p++; + num_digits++; + num_decimals++; + } + + exponent -= num_decimals; + } + + if (num_digits == 0) + { + errno = ERANGE; + return 0.0; + } + + // Correct for sign + if (negative) number = -number; + + // Process an exponent string + if (toupper(*p) == toupper(sci)) + { + // Handle optional sign + negative = 0; + switch (*++p) + { + case '-': negative = 1; // Fall through to increment pos + case '+': p++; + } + + // Process string of digits + n = 0; + while (isdigit(*p)) + { + n = n * 10 + (*p - '0'); + p++; + } + + if (negative) + exponent -= n; + else + exponent += n; + } + + + if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) + { + + errno = ERANGE; + return HUGE_VAL; + } + + // Scale the result + p10 = 10.; + n = exponent; + if (n < 0) n = -n; + while (n) + { + if (n & 1) + { + if (exponent < 0) + number /= p10; + else + number *= p10; + } + n >>= 1; + p10 *= p10; + } + + + if (number == HUGE_VAL) { + errno = ERANGE; + } + + if (skip_trailing) { + // Skip trailing whitespace + while (isspace(*p)) p++; + } + + if (endptr) *endptr = p; + + + return number; +} diff --git a/pandas/src/parser/.gitignore b/pandas/src/parser/.gitignore new file mode 100644 index 00000000..f07e771a --- /dev/null +++ b/pandas/src/parser/.gitignore @@ -0,0 +1,2 @@ +!*.c +test* \ No newline at end of file diff --git a/pandas/src/parser/Makefile b/pandas/src/parser/Makefile new file mode 100644 index 00000000..ec88eaf4 --- /dev/null +++ b/pandas/src/parser/Makefile @@ -0,0 +1,13 @@ +PYTHONBASE = /Library/Frameworks/EPD64.framework/Versions/Current +NUMPY_INC = /Library/Frameworks/EPD64.framework/Versions/7.1/lib/python2.7/site-packages/numpy/core/include +PYTHON_INC = -I$(PYTHONBASE)/include/python2.7 -I$(NUMPY_INC) +PYTHON_LINK = -L$(PYTHONBASE)/lib -lpython + +SOURCES = conversions.c parser.c str_to.c + +check-syntax: + gcc -g $(PYTHON_INC) -o /dev/null -S ${CHK_SOURCES} + +test: $(SOURCES) + gcc $(PYTHON_INC) -o test $(SOURCES) + ./test \ No newline at end of file diff --git a/pandas/src/parser/io.c b/pandas/src/parser/io.c new file mode 100644 index 00000000..e6d54bd5 --- /dev/null +++ b/pandas/src/parser/io.c @@ -0,0 +1,281 @@ +#include "io.h" + + /* + On-disk FILE, uncompressed + */ + + +void *new_file_source(char *fname, size_t buffer_size) { + file_source *fs = (file_source *) malloc(sizeof(file_source)); + fs->fp = fopen(fname, "rb"); + + if (fs->fp == NULL) { + free(fs); + return NULL; + } + setbuf(fs->fp, NULL); + + fs->initial_file_pos = ftell(fs->fp); + + // Only allocate this heap memory if we are not memory-mapping the file + fs->buffer = (char*) malloc((buffer_size + 1) * sizeof(char)); + + if (fs->buffer == NULL) { + return NULL; + } + + memset(fs->buffer, 0, buffer_size + 1); + fs->buffer[buffer_size] = '\0'; + + return (void *) fs; +} + + +// XXX handle on systems without the capability + + +/* + * void *new_file_buffer(FILE *f, int buffer_size) + * + * Allocate a new file_buffer. + * Returns NULL if the memory allocation fails or if the call to mmap fails. + * + * buffer_size is ignored. 
+ */ + + +void* new_rd_source(PyObject *obj) { + rd_source *rds = (rd_source *) malloc(sizeof(rd_source)); + + /* hold on to this object */ + Py_INCREF(obj); + rds->obj = obj; + rds->buffer = NULL; + rds->position = 0; + + return (void*) rds; +} + +/* + + Cleanup callbacks + + */ + +int del_file_source(void *fs) { + // fseek(FS(fs)->fp, FS(fs)->initial_file_pos, SEEK_SET); + if (fs == NULL) + return 0; + + /* allocated on the heap */ + free(FS(fs)->buffer); + fclose(FS(fs)->fp); + free(fs); + + return 0; +} + +int del_rd_source(void *rds) { + Py_XDECREF(RDS(rds)->obj); + Py_XDECREF(RDS(rds)->buffer); + free(rds); + + return 0; +} + +/* + + IO callbacks + + */ + + +void* buffer_file_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status) { + file_source *src = FS(source); + + *bytes_read = fread((void*) src->buffer, sizeof(char), nbytes, + src->fp); + + if (*bytes_read == 0) { + *status = REACHED_EOF; + } else { + *status = 0; + } + + return (void*) src->buffer; + +} + + +void* buffer_rd_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status) { + PyGILState_STATE state; + PyObject *result, *func, *args, *tmp; + + void *retval; + + size_t length; + rd_source *src = RDS(source); + + /* delete old object */ + Py_XDECREF(src->buffer); + args = Py_BuildValue("(i)", nbytes); + + state = PyGILState_Ensure(); + func = PyObject_GetAttrString(src->obj, "read"); + /* printf("%s\n", PyBytes_AsString(PyObject_Repr(func))); */ + + /* TODO: does this release the GIL? */ + result = PyObject_CallObject(func, args); + Py_XDECREF(args); + Py_XDECREF(func); + + /* PyObject_Print(PyObject_Type(result), stdout, 0); */ + if (result == NULL) { + PyGILState_Release(state); + *bytes_read = 0; + *status = CALLING_READ_FAILED; + return NULL; + } + else if (!PyBytes_Check(result)) { + tmp = PyUnicode_AsUTF8String(result); + Py_XDECREF(result); + result = tmp; + } + + length = PySequence_Length(result); + + if (length == 0) + *status = REACHED_EOF; + else + *status = 0; + + /* hang on to the Python object */ + src->buffer = result; + retval = (void*) PyBytes_AsString(result); + + + PyGILState_Release(state); + + /* TODO: more error handling */ + *bytes_read = length; + + return retval; +} + + +#ifdef HAVE_MMAP + +#include +#include + +void *new_mmap(char *fname) +{ + struct stat buf; + int fd; + memory_map *mm; + /* off_t position; */ + off_t filesize; + + mm = (memory_map *) malloc(sizeof(memory_map)); + mm->fp = fopen(fname, "rb"); + + fd = fileno(mm->fp); + if (fstat(fd, &buf) == -1) { + fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", errno); + return NULL; + } + filesize = buf.st_size; /* XXX This might be 32 bits. */ + + + if (mm == NULL) { + /* XXX Eventually remove this print statement. */ + fprintf(stderr, "new_file_buffer: malloc() failed.\n"); + return NULL; + } + mm->size = (off_t) filesize; + mm->line_number = 0; + + mm->fileno = fd; + mm->position = ftell(mm->fp); + mm->last_pos = (off_t) filesize; + + mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); + if (mm->memmap == NULL) { + /* XXX Eventually remove this print statement. */ + fprintf(stderr, "new_file_buffer: mmap() failed.\n"); + free(mm); + mm = NULL; + } + + return (void*) mm; +} + + +int del_mmap(void *src) +{ + munmap(MM(src)->memmap, MM(src)->size); + + fclose(MM(src)->fp); + + /* + * With a memory mapped file, there is no need to do + * anything if restore == RESTORE_INITIAL. 
+ */ + /* if (restore == RESTORE_FINAL) { */ + /* fseek(FB(fb)->file, FB(fb)->current_pos, SEEK_SET); */ + /* } */ + free(src); + + return 0; +} + +void* buffer_mmap_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status) { + void *retval; + memory_map *src = MM(source); + + if (src->position == src->last_pos) { + *bytes_read = 0; + *status = REACHED_EOF; + return NULL; + } + + retval = src->memmap + src->position; + + if (src->position + nbytes > src->last_pos) { + // fewer than nbytes remaining + *bytes_read = src->last_pos - src->position; + } else { + *bytes_read = nbytes; + } + + *status = 0; + + /* advance position in mmap data structure */ + src->position += *bytes_read; + + return retval; +} + +#else + +/* kludgy */ + +void *new_mmap(char *fname) { + return NULL; +} + +int del_mmap(void *src) { + return 0; +} + +/* don't use this! */ + +void* buffer_mmap_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status) { + return NULL; +} + +#endif diff --git a/pandas/src/parser/io.h b/pandas/src/parser/io.h new file mode 100644 index 00000000..f5831ad9 --- /dev/null +++ b/pandas/src/parser/io.h @@ -0,0 +1,85 @@ +#include "Python.h" +#include "tokenizer.h" + + +typedef struct _file_source { + /* The file being read. */ + FILE *fp; + + char *buffer; + /* Size of the file, in bytes. */ + /* off_t size; */ + + /* file position when the file_buffer was created. */ + off_t initial_file_pos; + + /* Offset in the file of the data currently in the buffer. */ + off_t buffer_file_pos; + + /* Actual number of bytes in the current buffer. (Can be less than buffer_size.) */ + off_t last_pos; + + /* Size (in bytes) of the buffer. */ + // off_t buffer_size; + + /* Pointer to the buffer. */ + // char *buffer; + +} file_source; + +#define FS(source) ((file_source *)source) + +#if !defined(_WIN32) +#define HAVE_MMAP +#endif + +typedef struct _memory_map { + + FILE *fp; + + /* Size of the file, in bytes. */ + off_t size; + + /* file position when the file_buffer was created. */ + off_t initial_file_pos; + + int line_number; + + int fileno; + off_t position; + off_t last_pos; + char *memmap; + +} memory_map; + +#define MM(src) ((memory_map*) src) + +void *new_mmap(char *fname); + +int del_mmap(void *src); + +void* buffer_mmap_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status); + + +typedef struct _rd_source { + PyObject* obj; + PyObject* buffer; + size_t position; +} rd_source; + +#define RDS(source) ((rd_source *)source) + +void *new_file_source(char *fname, size_t buffer_size); + +void *new_rd_source(PyObject *obj); + +int del_file_source(void *src); +int del_rd_source(void *src); + +void* buffer_file_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status); + +void* buffer_rd_bytes(void *source, size_t nbytes, + size_t *bytes_read, int *status); + diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c new file mode 100644 index 00000000..1e957648 --- /dev/null +++ b/pandas/src/parser/tokenizer.c @@ -0,0 +1,2232 @@ +/* + +Copyright (c) 2012, Lambda Foundry, Inc., except where noted + +Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause +BSD + +See LICENSE for the license + +*/ + + /* + Low-level ascii-file processing for pandas. Combines some elements from + Python's built-in csv module and Warren Weckesser's textreader project on + GitHub. See Python Software Foundation License and BSD licenses for these. 
+ + */ + + +#include "tokenizer.h" + +#include +#include +#include + + +#define READ_ERROR_OUT_OF_MEMORY 1 + + +/* +* restore: +* RESTORE_NOT (0): +* Free memory, but leave the file position wherever it +* happend to be. +* RESTORE_INITIAL (1): +* Restore the file position to the location at which +* the file_buffer was created. +* RESTORE_FINAL (2): +* Put the file position at the next byte after the +* data read from the file_buffer. +*/ +#define RESTORE_NOT 0 +#define RESTORE_INITIAL 1 +#define RESTORE_FINAL 2 + + + + +static void *safe_realloc(void *buffer, size_t size) { + void *result; + // OS X is weird + // http://stackoverflow.com/questions/9560609/ + // different-realloc-behaviour-in-linux-and-osx + + result = realloc(buffer, size); + + if (result != NULL) { + // errno gets set to 12 on my OS Xmachine in some cases even when the + // realloc succeeds. annoying + errno = 0; + } else { + return buffer; + } + + return result; +} + + +void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { + // column i, starting at 0 + self->words = parser->words; + self->col = i; + self->line_start = parser->line_start + start; +} + +coliter_t *coliter_new(parser_t *self, int i) { + // column i, starting at 0 + coliter_t *iter = (coliter_t*) malloc(sizeof(coliter_t)); + + if (NULL == iter) { + return NULL; + } + + coliter_setup(iter, self, i, 0); + return iter; +} + + + + /* int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error); */ + /* uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); */ + + +static void free_if_not_null(void *ptr) { + if (ptr != NULL) free(ptr); + } + + + + /* + + Parser / tokenizer + + */ + + +static void *grow_buffer(void *buffer, int length, int *capacity, + int space, int elsize, int *error) { + int cap = *capacity; + + // Can we fit potentially nbytes tokens (+ null terminators) in the stream? + while (length + space > cap) { + cap = cap? 
cap << 1 : 2; + + buffer = safe_realloc(buffer, elsize * cap); + + if (buffer == NULL) { + // TODO: error codes + *error = -1; + } + } + + // sigh, multiple return values + *capacity = cap; + *error = 0; + return buffer; + } + + +void parser_set_default_options(parser_t *self) { + self->decimal = '.'; + self->sci = 'E'; + + // For tokenization + self->state = START_RECORD; + + self->delimiter = ','; // XXX + self->delim_whitespace = 0; + + self->doublequote = 0; + self->quotechar = '"'; + self->escapechar = 0; + + self->lineterminator = '\0'; /* NUL->standard logic */ + + self->skipinitialspace = 0; + self->quoting = QUOTE_MINIMAL; + self->allow_embedded_newline = 1; + self->strict = 0; + + self->expected_fields = -1; + self->error_bad_lines = 0; + self->warn_bad_lines = 0; + + self->commentchar = '#'; + self->thousands = '\0'; + + self->skipset = NULL; + self->skip_footer = 0; +} + +int get_parser_memory_footprint(parser_t *self) { + return 0; +} + +parser_t* parser_new() { + return (parser_t*) calloc(1, sizeof(parser_t)); +} + + + +int parser_clear_data_buffers(parser_t *self) { + free_if_not_null(self->stream); + free_if_not_null(self->words); + free_if_not_null(self->word_starts); + free_if_not_null(self->line_start); + free_if_not_null(self->line_fields); + + return 0; +} + +int parser_cleanup(parser_t *self) { + if (self->cb_cleanup == NULL) { + return 0; + } + + if (self->cb_cleanup(self->source) < 0) { + return -1; + } + + if (parser_clear_data_buffers(self) < 0) { + return -1; + } + + // XXX where to put this + free_if_not_null(self->error_msg); + free_if_not_null(self->warn_msg); + + if (self->skipset != NULL) + kh_destroy_int64((kh_int64_t*) self->skipset); + + return 0; +} + + + +int parser_init(parser_t *self) { + int sz; + + /* + Initialize data buffers + */ + + self->stream = NULL; + self->words = NULL; + self->word_starts = NULL; + self->line_start = NULL; + self->line_fields = NULL; + + // token stream + self->stream = (char*) malloc(STREAM_INIT_SIZE * sizeof(char)); + if (self->stream == NULL) { + return PARSER_OUT_OF_MEMORY; + } + self->stream_cap = STREAM_INIT_SIZE; + self->stream_len = 0; + + // word pointers and metadata + sz = STREAM_INIT_SIZE / 10; + sz = sz? sz : 1; + self->words = (char**) malloc(sz * sizeof(char*)); + self->word_starts = (int*) malloc(sz * sizeof(int)); + self->words_cap = sz; + self->words_len = 0; + + // line pointers and metadata + self->line_start = (int*) malloc(sz * sizeof(int)); + + self->line_fields = (int*) malloc(sz * sizeof(int)); + + self->lines_cap = sz; + self->lines = 0; + self->file_lines = 0; + + if (self->stream == NULL || self->words == NULL || + self->word_starts == NULL || self->line_start == NULL || + self->line_fields == NULL) { + + parser_cleanup(self); + + return PARSER_OUT_OF_MEMORY; + } + + /* amount of bytes buffered */ + self->datalen = 0; + self->datapos = 0; + + self->line_start[0] = 0; + self->line_fields[0] = 0; + + self->pword_start = self->stream; + self->word_start = 0; + + self->state = START_RECORD; + + self->error_msg = NULL; + self->warn_msg = NULL; + + self->commentchar = '\0'; + + return 0; +} + + +void parser_free(parser_t *self) { + // opposite of parser_init + parser_cleanup(self); + free(self); +} + +static int make_stream_space(parser_t *self, size_t nbytes) { + int i, status, cap; + void *orig_ptr; + + // Can we fit potentially nbytes tokens (+ null terminators) in the stream? 
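    /* Editorial note, not part of the imported source: grow_buffer() above
     * doubles the capacity (2, 4, 8, ...) until length + space fits, so
     * appends into these buffers stay amortized O(1). Because realloc may
     * move the block, the code below re-derives self->words[i] and
     * pword_start from the saved word_starts offsets whenever self->stream
     * changes address. */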
+ + /* TRACE(("maybe growing buffers\n")); */ + + /* + TOKEN STREAM + */ + + orig_ptr = (void *) self->stream; + self->stream = (char*) grow_buffer((void *) self->stream, + self->stream_len, + &self->stream_cap, nbytes * 2, + sizeof(char), &status); + + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc sets errno when moving buffer? + if (self->stream != orig_ptr) { + // uff + /* TRACE(("Moving word pointers\n")) */ + + self->pword_start = self->stream + self->word_start; + + for (i = 0; i < self->words_len; ++i) + { + self->words[i] = self->stream + self->word_starts[i]; + } + } + + + /* + WORD VECTORS + */ + + cap = self->words_cap; + self->words = (char**) grow_buffer((void *) self->words, + self->words_len, + &self->words_cap, nbytes, + sizeof(char*), &status); + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + + // realloc took place + if (cap != self->words_cap) { + self->word_starts = (int*) safe_realloc((void *) self->word_starts, + sizeof(int) * self->words_cap); + if (self->word_starts == NULL) { + return PARSER_OUT_OF_MEMORY; + } + } + + + /* + LINE VECTORS + */ + /* + printf("Line_start: "); + + for (j = 0; j < self->lines + 1; ++j) { + printf("%d ", self->line_fields[j]); + } + printf("\n"); + + printf("lines_cap: %d\n", self->lines_cap); + */ + cap = self->lines_cap; + self->line_start = (int*) grow_buffer((void *) self->line_start, + self->lines + 1, + &self->lines_cap, nbytes, + sizeof(int), &status); + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc took place + if (cap != self->lines_cap) { + self->line_fields = (int*) safe_realloc((void *) self->line_fields, + sizeof(int) * self->lines_cap); + + if (self->line_fields == NULL) { + return PARSER_OUT_OF_MEMORY; + } + } + + /* TRACE(("finished growing buffers\n")); */ + + return 0; +} + + +static int push_char(parser_t *self, char c) { + /* TRACE(("pushing %c \n", c)) */ + self->stream[self->stream_len++] = c; + return 0; +} + +static int P_INLINE end_field(parser_t *self) { + // XXX cruft + self->numeric_field = 0; + + // null terminate token + push_char(self, '\0'); + + // set pointer and metadata + self->words[self->words_len] = self->pword_start; + + TRACE(("Char diff: %d\n", self->pword_start - self->words[0])); + + TRACE(("Saw word %s at: %d. 
Total: %d\n", + self->pword_start, self->word_start, self->words_len + 1)) + + self->word_starts[self->words_len] = self->word_start; + self->words_len++; + + // increment line field count + self->line_fields[self->lines]++; + + // New field begin in stream + self->pword_start = self->stream + self->stream_len; + self->word_start = self->stream_len; + + return 0; +} + + +static void append_warning(parser_t *self, const char *msg) { + int ex_length; + int length = strlen(msg); + + if (self->warn_msg == NULL) { + self->warn_msg = (char*) malloc(length + 1); + strcpy(self->warn_msg, msg); + } else { + ex_length = strlen(self->warn_msg); + self->warn_msg = (char*) safe_realloc(self->warn_msg, + ex_length + length + 1); + strcpy(self->warn_msg + ex_length, msg); + } +} + +static int end_line(parser_t *self) { + int fields; + khiter_t k; /* for hash set detection */ + int ex_fields = self->expected_fields; + char *msg; + + fields = self->line_fields[self->lines]; + + TRACE(("Line end, nfields: %d\n", fields)); + + if (self->lines > 0) { + if (self->expected_fields >= 0) { + ex_fields = self->expected_fields; + } else { + ex_fields = self->line_fields[self->lines - 1]; + } + } + + if (self->skipset != NULL) { + k = kh_get_int64((kh_int64_t*) self->skipset, self->file_lines); + + if (k != ((kh_int64_t*)self->skipset)->n_buckets) { + TRACE(("Skipping row %d\n", self->file_lines)); + // increment file line count + self->file_lines++; + + // skip the tokens from this bad line + self->line_start[self->lines] += fields; + + // reset field count + self->line_fields[self->lines] = 0; + return 0; + } + } + + /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ + + if (!(self->lines <= self->header_end + 1) + && (self->expected_fields < 0 && fields > ex_fields)) { + // increment file line count + self->file_lines++; + + // skip the tokens from this bad line + self->line_start[self->lines] += fields; + + // reset field count + self->line_fields[self->lines] = 0; + + // file_lines is now the _actual_ file line number (starting at 1) + + if (self->error_bad_lines) { + self->error_msg = (char*) malloc(100); + sprintf(self->error_msg, "Expected %d fields in line %d, saw %d\n", + ex_fields, self->file_lines, fields); + + TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); + + return -1; + } else { + // simply skip bad lines + if (self->warn_bad_lines) { + // pass up error message + msg = (char*) malloc(100); + sprintf(msg, "Skipping line %d: expected %d fields, saw %d\n", + self->file_lines, ex_fields, fields); + append_warning(self, msg); + free(msg); + } + } + } + else { + /* missing trailing delimiters */ + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { + + /* Might overrun the buffer when closing fields */ + if (make_stream_space(self, ex_fields - fields) < 0) { + self->error_msg = "out of memory"; + return -1; + } + + while (fields < ex_fields){ + end_field(self); + /* printf("Prior word: %s\n", self->words[self->words_len - 2]); */ + fields++; + } + } + + // increment both line counts + self->file_lines++; + + self->lines++; + + /* coliter_t it; */ + /* coliter_setup(&it, self, 5, self->lines - 1); */ + /* printf("word at column 5: %s\n", COLITER_NEXT(it)); */ + + // good line, set new start point + self->line_start[self->lines] = (self->line_start[self->lines - 1] + + fields); + + TRACE(("new line start: %d\n", self->line_start[self->lines])); + + // new line start with 0 fields + self->line_fields[self->lines] = 0; + } + + 
TRACE(("Finished line, at %d\n", self->lines)); + + return 0; +} + + + +int parser_add_skiprow(parser_t *self, int64_t row) { + khiter_t k; + kh_int64_t *set; + int ret = 0; + + if (self->skipset == NULL) { + self->skipset = (void*) kh_init_int64(); + } + + set = (kh_int64_t*) self->skipset; + + k = kh_put_int64(set, row, &ret); + set->keys[k] = row; + + return 0; +} + +static int parser_buffer_bytes(parser_t *self, size_t nbytes) { + int status; + size_t bytes_read; + void *src = self->source; + + status = 0; + self->datapos = 0; + self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); + self->datalen = bytes_read; + + if (status != REACHED_EOF && self->data == NULL) { + self->error_msg = (char*) malloc(200); + + if (status == CALLING_READ_FAILED) { + sprintf(self->error_msg, ("Calling read(nbytes) on source failed. " + "Try engine='python'.")); + } else { + sprintf(self->error_msg, "Unknown error in IO callback"); + } + return -1; + } + + TRACE(("datalen: %d\n", self->datalen)); + + return status; +} + + +/* + + Tokenization macros and state machine code + +*/ + +// printf("pushing %c\n", c); + +#if defined(VERBOSE) +#define PUSH_CHAR(c) \ + printf("Pushing %c, slen now: %d\n", c, slen); \ + *stream++ = c; \ + slen++; +#else +#define PUSH_CHAR(c) \ + *stream++ = c; \ + slen++; +#endif + + + +// This is a little bit of a hack but works for now + +#define END_FIELD() \ + self->stream_len = slen; \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; + +#define END_LINE_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + \ + } + +#define END_LINE_AND_FIELD_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + \ + } + +#define END_LINE() END_LINE_STATE(START_RECORD) + +#define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) + +typedef int (*parser_op)(parser_t *self, size_t line_limit); + +#define _TOKEN_CLEANUP() \ + self->stream_len = slen; \ + self->datapos = i; \ + TRACE(("datapos: %d, datalen: %d\n", self->datapos, self->datalen)); + + + +int tokenize_delimited(parser_t *self, size_t line_limit) +{ + int i, slen, start_lines; + char c; + char *stream; + char *buf = self->data + self->datapos; + + + start_lines = self->lines; + + if (make_stream_space(self, self->datalen - self->datapos) < 0) { + self->error_msg = "out of memory"; + return -1; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + + TRACE(("%s\n", buf)); + + for (i = self->datapos; i < self->datalen; ++i) + { + // Next character in file + c = *buf++; + + TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); + + switch(self->state) { + + case START_RECORD: + // start of record + + if (c == '\n') { + // \n\r possible? 
+ END_LINE(); + break; + } else if (c == '\r') { + self->state = EAT_CRNL; + break; + } else if (c == self->commentchar) { + self->state = EAT_LINE_COMMENT; + break; + } + + /* normal character - handle as START_FIELD */ + self->state = START_FIELD; + /* fallthru */ + + case START_FIELD: + /* expecting field */ + if (c == '\n') { + END_FIELD(); + END_LINE(); + } else if (c == '\r') { + END_FIELD(); + self->state = EAT_CRNL; + } + else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + /* start quoted field */ + self->state = IN_QUOTED_FIELD; + } + else if (c == self->escapechar) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + else if (c == ' ' && self->skipinitialspace) + /* ignore space at start of field */ + ; + else if (c == self->delimiter) { + /* save empty field */ + END_FIELD(); + } + else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } + else { + /* begin new unquoted field */ + if (self->quoting == QUOTE_NONNUMERIC) + self->numeric_field = 1; + + // TRACE(("pushing %c", c)); + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case ESCAPED_CHAR: + /* if (c == '\0') */ + /* c = '\n'; */ + + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + + case EAT_LINE_COMMENT: + if (c == '\n') { + self->file_lines++; + self->state = START_RECORD; + } else if (c == '\r') { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; + + case IN_FIELD: + /* in unquoted field */ + if (c == '\n') { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } else if (c == '\r') { + END_FIELD(); + self->state = EAT_CRNL; + } + else if (c == self->escapechar) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + else if (c == self->delimiter) { + // End of field. End of line not reached yet + END_FIELD(); + self->state = START_FIELD; + } + else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } + else { + /* normal character - save in field */ + PUSH_CHAR(c); + } + break; + + case IN_QUOTED_FIELD: + /* in quoted field */ + if (c == self->escapechar) { + /* Possible escape character */ + self->state = ESCAPE_IN_QUOTED_FIELD; + } + else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + if (self->doublequote) { + /* doublequote; " represented by "" */ + self->state = QUOTE_IN_QUOTED_FIELD; + } + else { + /* end of quote part of field */ + self->state = IN_FIELD; + } + } + else { + /* normal character - save in field */ + PUSH_CHAR(c); + } + break; + + case ESCAPE_IN_QUOTED_FIELD: + /* if (c == '\0') */ + /* c = '\n'; */ + + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + break; + + case QUOTE_IN_QUOTED_FIELD: + /* doublequote - seen a quote in an quoted field */ + if (self->quoting != QUOTE_NONE && c == self->quotechar) { + /* save "" as " */ + + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } + else if (c == self->delimiter) { + // End of field. 
End of line not reached yet + + END_FIELD(); + self->state = START_FIELD; + } + else if (c == '\n') { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } + else if (c == '\r') { + END_FIELD(); + self->state = EAT_CRNL; + } + else if (!self->strict) { + PUSH_CHAR(c); + self->state = IN_FIELD; + } + else { + self->error_msg = (char*) malloc(50); + sprintf(self->error_msg, "'%c' expected after '%c'", + self->delimiter, self->quotechar); + goto parsingerror; + } + break; + + case EAT_COMMENT: + if (c == '\n') { + END_LINE(); + } else if (c == '\r') { + self->state = EAT_CRNL; + } + break; + + case EAT_CRNL: + if (c == '\n') { + END_LINE(); + /* self->state = START_RECORD; */ + } else if (c == self->delimiter){ + // Handle \r-delimited files + END_LINE_AND_FIELD_STATE(START_FIELD); + } else { + /* \r line terminator */ + + /* UGH. we don't actually want to consume the token. fix this later */ + self->stream_len = slen; + if (end_line(self) < 0) { + goto parsingerror; + } + stream = self->stream + self->stream_len; + slen = self->stream_len; + self->state = START_RECORD; + + /* HACK, let's try this one again */ + --i; buf--; + if (line_limit > 0 && self->lines == start_lines + line_limit) { + goto linelimit; + } + + } + break; + + case EAT_CRNL_NOP: /* inside an ignored comment line */ + self->state = START_RECORD; + /* \r line terminator -- parse this character again */ + if (c != '\n' && c != self->delimiter) { + --i; + --buf; + } + break; + + default: + break; + + } + } + + _TOKEN_CLEANUP(); + + TRACE(("Finished tokenizing input\n")) + + return 0; + +parsingerror: + i++; + _TOKEN_CLEANUP(); + + return -1; + +linelimit: + i++; + _TOKEN_CLEANUP(); + + return 0; +} + +/* custom line terminator */ +int tokenize_delim_customterm(parser_t *self, size_t line_limit) +{ + + int i, slen, start_lines; + char c; + char *stream; + char *buf = self->data + self->datapos; + + + start_lines = self->lines; + + if (make_stream_space(self, self->datalen - self->datapos) < 0) { + self->error_msg = "out of memory"; + return -1; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + + TRACE(("%s\n", buf)); + + for (i = self->datapos; i < self->datalen; ++i) + { + // Next character in file + c = *buf++; + + TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); + + switch(self->state) { + case START_RECORD: + // start of record + if (c == self->lineterminator) { + // \n\r possible? 
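+                /* Editorial annotation (added comment, not in the upstream
+                   source): this variant recognises only self->lineterminator
+                   as end-of-record; there are no EAT_CRNL states, so stray
+                   '\r' or '\n' bytes that are not the chosen terminator are
+                   stored as ordinary field characters. */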
+ END_LINE(); + break; + } + /* normal character - handle as START_FIELD */ + self->state = START_FIELD; + /* fallthru */ + case START_FIELD: + /* expecting field */ + if (c == self->lineterminator) { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } + else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + /* start quoted field */ + self->state = IN_QUOTED_FIELD; + } + else if (c == self->escapechar) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + else if (c == ' ' && self->skipinitialspace) + /* ignore space at start of field */ + ; + else if (c == self->delimiter) { + /* save empty field */ + END_FIELD(); + } + else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } + else { + /* begin new unquoted field */ + if (self->quoting == QUOTE_NONNUMERIC) + self->numeric_field = 1; + + // TRACE(("pushing %c", c)); + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case ESCAPED_CHAR: + /* if (c == '\0') */ + /* c = '\n'; */ + + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + + case IN_FIELD: + /* in unquoted field */ + if (c == self->lineterminator) { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } + else if (c == self->escapechar) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + else if (c == self->delimiter) { + // End of field. End of line not reached yet + END_FIELD(); + self->state = START_FIELD; + } + else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } + else { + /* normal character - save in field */ + PUSH_CHAR(c); + } + break; + + case IN_QUOTED_FIELD: + /* in quoted field */ + if (c == self->escapechar) { + /* Possible escape character */ + self->state = ESCAPE_IN_QUOTED_FIELD; + } + else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + if (self->doublequote) { + /* doublequote; " represented by "" */ + self->state = QUOTE_IN_QUOTED_FIELD; + } + else { + /* end of quote part of field */ + self->state = IN_FIELD; + } + } + else { + /* normal character - save in field */ + PUSH_CHAR(c); + } + break; + + case ESCAPE_IN_QUOTED_FIELD: + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + break; + + case QUOTE_IN_QUOTED_FIELD: + /* doublequote - seen a quote in an quoted field */ + if (self->quoting != QUOTE_NONE && c == self->quotechar) { + /* save "" as " */ + + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } + else if (c == self->delimiter) { + // End of field. 
End of line not reached yet + + END_FIELD(); + self->state = START_FIELD; + } + else if (c == self->lineterminator) { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } + else if (!self->strict) { + PUSH_CHAR(c); + self->state = IN_FIELD; + } + else { + self->error_msg = (char*) malloc(50); + sprintf(self->error_msg, "'%c' expected after '%c'", + self->delimiter, self->quotechar); + goto parsingerror; + } + break; + + case EAT_COMMENT: + if (c == self->lineterminator) { + END_LINE(); + } + break; + + default: + break; + + } + } + + _TOKEN_CLEANUP(); + + TRACE(("Finished tokenizing input\n")) + + return 0; + +parsingerror: + i++; + _TOKEN_CLEANUP(); + + return -1; + +linelimit: + i++; + _TOKEN_CLEANUP(); + + return 0; +} + +int tokenize_whitespace(parser_t *self, size_t line_limit) +{ + int i, slen, start_lines; + char c; + char *stream; + char *buf = self->data + self->datapos; + + start_lines = self->lines; + + if (make_stream_space(self, self->datalen - self->datapos) < 0) { + self->error_msg = "out of memory"; + return -1; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + + TRACE(("%s\n", buf)); + + for (i = self->datapos; i < self->datalen; ++i) + { + // Next character in file + c = *buf++; + + TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); + + switch(self->state) { + + case EAT_WHITESPACE: + if (!IS_WHITESPACE(c)) { + // END_FIELD(); + self->state = START_FIELD; + // Fall through to subsequent state + } else { + // if whitespace char, keep slurping + break; + } + + case START_RECORD: + // start of record + if (c == '\n') { + // \n\r possible? + END_LINE(); + break; + } else if (c == '\r') { + self->state = EAT_CRNL; + break; + } else if (IS_WHITESPACE(c)) { + self->state = EAT_WHITESPACE; + break; + } else { + /* normal character - handle as START_FIELD */ + self->state = START_FIELD; + } + /* fallthru */ + case START_FIELD: + /* expecting field */ + if (c == '\n') { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } else if (c == '\r') { + END_FIELD(); + self->state = EAT_CRNL; + } + else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + /* start quoted field */ + self->state = IN_QUOTED_FIELD; + } + else if (c == self->escapechar) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + /* else if (c == ' ' && self->skipinitialspace) */ + /* /\* ignore space at start of field *\/ */ + /* ; */ + else if (IS_WHITESPACE(c)) { + self->state = EAT_WHITESPACE; + } + else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } + else { + /* begin new unquoted field */ + if (self->quoting == QUOTE_NONNUMERIC) + self->numeric_field = 1; + + // TRACE(("pushing %c", c)); + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case ESCAPED_CHAR: + /* if (c == '\0') */ + /* c = '\n'; */ + + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + + case IN_FIELD: + /* in unquoted field */ + if (c == '\n') { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } else if (c == '\r') { + END_FIELD(); + self->state = EAT_CRNL; + } + else if (c == self->escapechar) { + /* possible escaped character */ + self->state = ESCAPED_CHAR; + } + else if (IS_WHITESPACE(c)) { + // End of field. 
End of line not reached yet + END_FIELD(); + self->state = EAT_WHITESPACE; + } + else if (c == self->commentchar) { + END_FIELD(); + self->state = EAT_COMMENT; + } + else { + /* normal character - save in field */ + PUSH_CHAR(c); + } + break; + + case IN_QUOTED_FIELD: + /* in quoted field */ + if (c == self->escapechar) { + /* Possible escape character */ + self->state = ESCAPE_IN_QUOTED_FIELD; + } + else if (c == self->quotechar && + self->quoting != QUOTE_NONE) { + if (self->doublequote) { + /* doublequote; " represented by "" */ + self->state = QUOTE_IN_QUOTED_FIELD; + } + else { + /* end of quote part of field */ + self->state = IN_FIELD; + } + } + else { + /* normal character - save in field */ + PUSH_CHAR(c); + } + break; + + case ESCAPE_IN_QUOTED_FIELD: + /* if (c == '\0') */ + /* c = '\n'; */ + + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + break; + + case QUOTE_IN_QUOTED_FIELD: + /* doublequote - seen a quote in an quoted field */ + if (self->quoting != QUOTE_NONE && c == self->quotechar) { + /* save "" as " */ + + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } + else if (IS_WHITESPACE(c)) { + // End of field. End of line not reached yet + + END_FIELD(); + self->state = EAT_WHITESPACE; + } + else if (c == '\n') { + END_FIELD(); + END_LINE(); + /* self->state = START_RECORD; */ + } + else if (c == '\r') { + END_FIELD(); + self->state = EAT_CRNL; + } + else if (!self->strict) { + PUSH_CHAR(c); + self->state = IN_FIELD; + } + else { + self->error_msg = (char*) malloc(50); + sprintf(self->error_msg, "'%c' expected after '%c'", + self->delimiter, self->quotechar); + goto parsingerror; + } + break; + + case EAT_CRNL: + if (c == '\n') { + END_LINE(); + /* self->state = START_RECORD; */ + } else if (IS_WHITESPACE(c)){ + // Handle \r-delimited files + END_LINE_STATE(EAT_WHITESPACE); + } else { + /* XXX + * first character of a new record--need to back up and reread + * to handle properly... + */ + i--; buf--; /* back up one character (HACK!) 
*/ + END_LINE_STATE(START_RECORD); + } + break; + + case EAT_COMMENT: + if (c == '\n') { + END_LINE(); + } else if (c == '\r') { + self->state = EAT_CRNL; + } + break; + + default: + break; + + + } + + } + + _TOKEN_CLEANUP(); + + TRACE(("Finished tokenizing input\n")) + + return 0; + +parsingerror: + i++; + _TOKEN_CLEANUP(); + + return -1; + +linelimit: + i++; + _TOKEN_CLEANUP(); + + return 0; +} + + +static int parser_handle_eof(parser_t *self) { + TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) + if (self->datalen == 0 && (self->state != START_RECORD)) { + // test cases needed here + // TODO: empty field at end of line + TRACE(("handling eof\n")); + + if (self->state == IN_FIELD || self->state == START_FIELD) { + if (end_field(self) < 0) + return -1; + } else if (self->state == QUOTE_IN_QUOTED_FIELD) { + if (end_field(self) < 0) + return -1; + } else if (self->state == IN_QUOTED_FIELD) { + self->error_msg = (char*) malloc(100); + sprintf(self->error_msg, "EOF inside string starting at line %d", + self->file_lines); + return -1; + } + + if (end_line(self) < 0) + return -1; + + return 0; + } + else if (self->datalen == 0 && (self->state == START_RECORD)) { + return 0; + } + + return -1; +} + +int parser_consume_rows(parser_t *self, size_t nrows) { + int i, offset, word_deletions, char_count; + + if (nrows > self->lines) { + nrows = self->lines; + } + + /* do nothing */ + if (nrows == 0) + return 0; + + /* cannot guarantee that nrows + 1 has been observed */ + word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; + char_count = (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1); + + TRACE(("Deleting %d words, %d chars\n", word_deletions, char_count)); + + /* move stream, only if something to move */ + if (char_count < self->stream_len) { + memmove((void*) self->stream, (void*) (self->stream + char_count), + self->stream_len - char_count); + } + /* buffer counts */ + self->stream_len -= char_count; + + /* move token metadata */ + for (i = 0; i < self->words_len - word_deletions; ++i) { + offset = i + word_deletions; + + self->words[i] = self->words[offset] - char_count; + self->word_starts[i] = self->word_starts[offset] - char_count; + } + self->words_len -= word_deletions; + + /* move current word pointer to stream */ + self->pword_start -= char_count; + self->word_start -= char_count; + /* + printf("Line_start: "); + for (i = 0; i < self->lines + 1; ++i) { + printf("%d ", self->line_fields[i]); + } + printf("\n"); + */ + /* move line metadata */ + for (i = 0; i < self->lines - nrows + 1; ++i) + { + offset = i + nrows; + self->line_start[i] = self->line_start[offset] - word_deletions; + + /* TRACE(("First word in line %d is now %s\n", i, */ + /* self->words[self->line_start[i]])); */ + + self->line_fields[i] = self->line_fields[offset]; + } + self->lines -= nrows; + /* self->line_fields[self->lines] = 0; */ + + return 0; +} + +static size_t _next_pow2(size_t sz) { + size_t result = 1; + while (result < sz) result *= 2; + return result; +} + +int parser_trim_buffers(parser_t *self) { + /* + Free memory + */ + size_t new_cap; + + /* trim stream */ + new_cap = _next_pow2(self->stream_len) + 1; + if (new_cap < self->stream_cap) { + self->stream = safe_realloc((void*) self->stream, new_cap); + self->stream_cap = new_cap; + } + + /* trim words, word_starts */ + new_cap = _next_pow2(self->words_len) + 1; + if (new_cap < self->words_cap) { + self->words = (char**) safe_realloc((void*) self->words, + new_cap * 
sizeof(char*)); + self->word_starts = (int*) safe_realloc((void*) self->word_starts, + new_cap * sizeof(int)); + self->words_cap = new_cap; + } + + /* trim line_start, line_fields */ + new_cap = _next_pow2(self->lines) + 1; + if (new_cap < self->lines_cap) { + self->line_start = (int*) safe_realloc((void*) self->line_start, + new_cap * sizeof(int)); + self->line_fields = (int*) safe_realloc((void*) self->line_fields, + new_cap * sizeof(int)); + self->lines_cap = new_cap; + } + + return 0; +} + +void debug_print_parser(parser_t *self) { + int j, line; + char *token; + + for (line = 0; line < self->lines; ++line) + { + printf("(Parsed) Line %d: ", line); + + for (j = 0; j < self->line_fields[j]; ++j) + { + token = self->words[j + self->line_start[line]]; + printf("%s ", token); + } + printf("\n"); + } +} + +int clear_parsed_lines(parser_t *self, size_t nlines) { + // TODO. move data up in stream, shift relevant word pointers + + return 0; +} + + +/* + nrows : number of rows to tokenize (or until reach EOF) + all : tokenize all the data vs. certain number of rows + */ + +int _tokenize_helper(parser_t *self, size_t nrows, int all) { + parser_op tokenize_bytes; + + int status = 0; + int start_lines = self->lines; + + if (self->delim_whitespace) { + tokenize_bytes = tokenize_whitespace; + } else if (self->lineterminator == '\0') { + tokenize_bytes = tokenize_delimited; + } else { + tokenize_bytes = tokenize_delim_customterm; + } + + if (self->state == FINISHED) { + return 0; + } + + TRACE(("Asked to tokenize %d rows\n", (int) nrows)); + + while (1) { + if (!all && self->lines - start_lines >= nrows) + break; + + if (self->datapos == self->datalen) { + status = parser_buffer_bytes(self, self->chunksize); + + if (status == REACHED_EOF) { + // close out last line + status = parser_handle_eof(self); + self->state = FINISHED; + break; + } else if (status != 0) { + return status; + } + } + + TRACE(("Trying to process %d bytes\n", self->datalen - self->datapos)); + /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */ + + status = tokenize_bytes(self, nrows); + + /* debug_print_parser(self); */ + + if (status < 0) { + // XXX + TRACE(("Status %d returned from tokenize_bytes, breaking\n", + status)); + status = -1; + break; + } + } + TRACE(("leaving tokenize_helper\n")); + return status; +} + +int tokenize_nrows(parser_t *self, size_t nrows) { + int status = _tokenize_helper(self, nrows, 0); + return status; +} + +int tokenize_all_rows(parser_t *self) { + int status = _tokenize_helper(self, -1, 1); + return status; +} + +void test_count_lines(char *fname) { + clock_t start = clock(); + + char *buffer, *tmp; + size_t bytes, lines = 0; + int i; + FILE *fp = fopen(fname, "rb"); + + buffer = (char*) malloc(CHUNKSIZE * sizeof(char)); + + while(1) { + tmp = buffer; + bytes = fread((void *) buffer, sizeof(char), CHUNKSIZE, fp); + // printf("Read %d bytes\n", bytes); + + if (bytes == 0) { + break; + } + + for (i = 0; i < bytes; ++i) + { + if (*tmp++ == '\n') { + lines++; + } + } + } + + + printf("Saw %d lines\n", (int) lines); + + free(buffer); + fclose(fp); + + printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC); +} + + + +// forward declaration +static double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); + + +P_INLINE void lowercase(char *p) { + for ( ; *p; ++p) *p = tolower(*p); +} + +P_INLINE void uppercase(char *p) { + for ( ; *p; ++p) *p = toupper(*p); +} + + +/* + * `item` must be the nul-terminated string that is to be + * 
converted to a double. + * + * To be successful, to_double() must use *all* the characters + * in `item`. E.g. "1.q25" will fail. Leading and trailing + * spaces are allowed. + * + * `sci` is the scientific notation exponent character, usually + * either 'E' or 'D'. Case is ignored. + * + * `decimal` is the decimal point character, usually either + * '.' or ','. + * + */ + +int to_double(char *item, double *p_value, char sci, char decimal, char tsep) +{ + char *p_end; + + *p_value = xstrtod(item, &p_end, decimal, sci, tsep, TRUE); + + return (errno == 0) && (!*p_end); +} + + +int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal) +{ + char *p_end; + + *p_real = xstrtod(item, &p_end, decimal, sci, '\0', FALSE); + if (*p_end == '\0') { + *p_imag = 0.0; + return errno == 0; + } + if (*p_end == 'i' || *p_end == 'j') { + *p_imag = *p_real; + *p_real = 0.0; + ++p_end; + } + else { + if (*p_end == '+') { + ++p_end; + } + *p_imag = xstrtod(p_end, &p_end, decimal, sci, '\0', FALSE); + if (errno || ((*p_end != 'i') && (*p_end != 'j'))) { + return FALSE; + } + ++p_end; + } + while(*p_end == ' ') { + ++p_end; + } + return *p_end == '\0'; +} + + +int P_INLINE to_longlong(char *item, long long *p_value) +{ + char *p_end; + + // Try integer conversion. We explicitly give the base to be 10. If + // we used 0, strtoll() would convert '012' to 10, because the leading 0 in + // '012' signals an octal number in C. For a general purpose reader, that + // would be a bug, not a feature. + *p_value = strtoll(item, &p_end, 10); + + // Allow trailing spaces. + while (isspace(*p_end)) ++p_end; + + return (errno == 0) && (!*p_end); +} + +int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep) +{ + int i, pos, status, n = strlen(item), count = 0; + char *tmp; + char *p_end; + + for (i = 0; i < n; ++i) + { + if (*(item + i) == tsep) { + count++; + } + } + + if (count == 0) { + return to_longlong(item, p_value); + } + + tmp = (char*) malloc((n - count + 1) * sizeof(char)); + if (tmp == NULL) { + return 0; + } + + pos = 0; + for (i = 0; i < n; ++i) + { + if (item[i] != tsep) + tmp[pos++] = item[i]; + } + + tmp[pos] = '\0'; + + status = to_longlong(tmp, p_value); + free(tmp); + + return status; +} + +int to_boolean(char *item, uint8_t *val) { + char *tmp; + int i, status = 0; + + static const char *tstrs[1] = {"TRUE"}; + static const char *fstrs[1] = {"FALSE"}; + + tmp = malloc(sizeof(char) * (strlen(item) + 1)); + strcpy(tmp, item); + uppercase(tmp); + + for (i = 0; i < 1; ++i) + { + if (strcmp(tmp, tstrs[i]) == 0) { + *val = 1; + goto done; + } + } + + for (i = 0; i < 1; ++i) + { + if (strcmp(tmp, fstrs[i]) == 0) { + *val = 0; + goto done; + } + } + + status = -1; + +done: + free(tmp); + return status; +} + +// #define TEST + +#ifdef TEST + +int main(int argc, char *argv[]) +{ + double x, y; + long long xi; + int status; + char *s; + + //s = "0.10e-3-+5.5e2i"; + // s = "1-0j"; + // status = to_complex(s, &x, &y, 'e', '.'); + s = "123,789"; + status = to_longlong_thousands(s, &xi, ','); + printf("s = '%s'\n", s); + printf("status = %d\n", status); + printf("x = %d\n", (int) xi); + + // printf("x = %lg, y = %lg\n", x, y); + + return 0; +} +#endif + +// --------------------------------------------------------------------------- +// Implementation of xstrtod + +// +// strtod.c +// +// Convert string to double +// +// Copyright (C) 2002 Michael Ringgaard. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// 3. Neither the name of the project nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// ----------------------------------------------------------------------- +// Modifications by Warren Weckesser, March 2011: +// * Rename strtod() to xstrtod(). +// * Added decimal and sci arguments. +// * Skip trailing spaces. +// * Commented out the other functions. +// Modifications by Richard T Guy, August 2013: +// * Add tsep argument for thousands separator +// + +static double xstrtod(const char *str, char **endptr, char decimal, + char sci, char tsep, int skip_trailing) +{ + double number; + int exponent; + int negative; + char *p = (char *) str; + double p10; + int n; + int num_digits; + int num_decimals; + + errno = 0; + + // Skip leading whitespace + while (isspace(*p)) p++; + + // Handle optional sign + negative = 0; + switch (*p) + { + case '-': negative = 1; // Fall through to increment position + case '+': p++; + } + + number = 0.; + exponent = 0; + num_digits = 0; + num_decimals = 0; + + // Process string of digits + while (isdigit(*p)) + { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + + p += (tsep != '\0' & *p == tsep); + } + + // Process decimal part + if (*p == decimal) + { + p++; + + while (isdigit(*p)) + { + number = number * 10. 
+ (*p - '0'); + p++; + num_digits++; + num_decimals++; + } + + exponent -= num_decimals; + } + + if (num_digits == 0) + { + errno = ERANGE; + return 0.0; + } + + // Correct for sign + if (negative) number = -number; + + // Process an exponent string + if (toupper(*p) == toupper(sci)) + { + // Handle optional sign + negative = 0; + switch (*++p) + { + case '-': negative = 1; // Fall through to increment pos + case '+': p++; + } + + // Process string of digits + n = 0; + while (isdigit(*p)) + { + n = n * 10 + (*p - '0'); + p++; + } + + if (negative) + exponent -= n; + else + exponent += n; + } + + + if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) + { + + errno = ERANGE; + return HUGE_VAL; + } + + // Scale the result + p10 = 10.; + n = exponent; + if (n < 0) n = -n; + while (n) + { + if (n & 1) + { + if (exponent < 0) + number /= p10; + else + number *= p10; + } + n >>= 1; + p10 *= p10; + } + + + if (number == HUGE_VAL) { + errno = ERANGE; + } + + if (skip_trailing) { + // Skip trailing whitespace + while (isspace(*p)) p++; + } + + if (endptr) *endptr = p; + + + return number; +} + +/* +float strtof(const char *str, char **endptr) +{ + return (float) strtod(str, endptr); +} + + +long double strtold(const char *str, char **endptr) +{ + return strtod(str, endptr); +} + +double atof(const char *str) +{ + return strtod(str, NULL); +} +*/ + +// End of xstrtod code +// --------------------------------------------------------------------------- + +int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, + int *error, char tsep) +{ + const char *p = (const char *) p_item; + int isneg = 0; + int64_t number = 0; + int d; + + // Skip leading spaces. + while (isspace(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + isneg = 1; + ++p; + } + else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + // Error... + *error = ERROR_NO_DIGITS; + return 0; + } + + if (isneg) { + // If number is greater than pre_min, at least one more digit + // can be processed without overflowing. + int dig_pre_min = -(int_min % 10); + int64_t pre_min = int_min / 10; + + // Process the digits. + d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit(d)) { + break; + } + if ((number > pre_min) || + ((number == pre_min) && (d - '0' <= dig_pre_min))) { + + number = number * 10 - (d - '0'); + d = *++p; + } + else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } else { + while (isdigit(d)) { + if ((number > pre_min) || + ((number == pre_min) && (d - '0' <= dig_pre_min))) { + + number = number * 10 - (d - '0'); + d = *++p; + } + else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } + } + else { + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + int64_t pre_max = int_max / 10; + int dig_pre_max = int_max % 10; + + //printf("pre_max = %lld dig_pre_max = %d\n", pre_max, dig_pre_max); + + // Process the digits. 
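+        /* Editorial annotation (added comment, not in the upstream source),
+           worked example: with int_max = 9223372036854775807 (INT64_MAX),
+           pre_max is 922337203685477580 and dig_pre_max is 7, so another
+           digit d can be appended exactly when number < pre_max, or when
+           number == pre_max and d - '0' <= 7; anything larger would
+           overflow. */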
+ d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit(d)) { + break; + } + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + + number = number * 10 + (d - '0'); + d = *++p; + + } + else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } else { + while (isdigit(d)) { + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + + number = number * 10 + (d - '0'); + d = *++p; + + } + else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } + } + + // Skip trailing spaces. + while (isspace(*p)) { + ++p; + } + + // Did we use up all the characters? + if (*p) { + *error = ERROR_INVALID_CHARS; + return 0; + } + + *error = 0; + return number; +} + + +uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error) +{ + int d, dig_pre_max; + uint64_t pre_max; + const char *p = (const char *) p_item; + uint64_t number = 0; + + // Skip leading spaces. + while (isspace(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + *error = ERROR_MINUS_SIGN; + return 0; + } + if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + // Error... + *error = ERROR_NO_DIGITS; + return 0; + } + + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + pre_max = uint_max / 10; + dig_pre_max = uint_max % 10; + + // Process the digits. + d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + *error = ERROR_OVERFLOW; + return 0; + } + } + + // Skip trailing spaces. + while (isspace(*p)) { + ++p; + } + + // Did we use up all the characters? + if (*p) { + *error = ERROR_INVALID_CHARS; + return 0; + } + + *error = 0; + return number; +} diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h new file mode 100644 index 00000000..6af63c07 --- /dev/null +++ b/pandas/src/parser/tokenizer.h @@ -0,0 +1,266 @@ +/* + +Copyright (c) 2012, Lambda Foundry, Inc., except where noted + +Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause +BSD + +See LICENSE for the license + +*/ + +#ifndef _PARSER_COMMON_H_ +#define _PARSER_COMMON_H_ + +#include +#include +#include +#include +#include + +#include + +#define ERROR_OK 0 +#define ERROR_NO_DIGITS 1 +#define ERROR_OVERFLOW 2 +#define ERROR_INVALID_CHARS 3 +#define ERROR_MINUS_SIGN 4 + +#if defined(_MSC_VER) +#include "../headers/ms_stdint.h" +#else +#include +#endif + +#include "khash.h" + +#define CHUNKSIZE 1024*256 +#define KB 1024 +#define MB 1024 * KB +#define STREAM_INIT_SIZE 32 + +#define REACHED_EOF 1 +#define CALLING_READ_FAILED 2 + +#ifndef P_INLINE + #if defined(__GNUC__) + #define P_INLINE __inline__ + #elif defined(_MSC_VER) + #define P_INLINE + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define P_INLINE inline + #else + #define P_INLINE + #endif +#endif + +#if defined(_MSC_VER) +#define strtoll _strtoi64 +#endif + +/* + + C flat file parsing low level code for pandas / NumPy + + */ + +#define FALSE 0 +#define TRUE 1 + +/* Maximum number of columns in a file. */ +#define MAX_NUM_COLUMNS 2000 + +/* Maximum number of characters in single field. */ + +#define FIELD_BUFFER_SIZE 2000 + + +/* + * Common set of error types for the read_rows() and tokenize() + * functions. 
+ */ + +#define ERROR_OUT_OF_MEMORY 1 +#define ERROR_INVALID_COLUMN_INDEX 10 +#define ERROR_CHANGED_NUMBER_OF_FIELDS 12 +#define ERROR_TOO_MANY_CHARS 21 +#define ERROR_TOO_MANY_FIELDS 22 +#define ERROR_NO_DATA 23 + + +/* #define VERBOSE */ + +#if defined(VERBOSE) +#define TRACE(X) printf X; +#else +#define TRACE(X) +#endif + + +#define PARSER_OUT_OF_MEMORY -1 + + +/* + * XXX Might want to couple count_rows() with read_rows() to avoid duplication + * of some file I/O. + */ + +/* + * WORD_BUFFER_SIZE determines the maximum amount of non-delimiter + * text in a row. + */ +#define WORD_BUFFER_SIZE 4000 + + +typedef enum { + START_RECORD, + START_FIELD, + ESCAPED_CHAR, + IN_FIELD, + IN_QUOTED_FIELD, + ESCAPE_IN_QUOTED_FIELD, + QUOTE_IN_QUOTED_FIELD, + EAT_CRNL, + EAT_CRNL_NOP, + EAT_WHITESPACE, + EAT_COMMENT, + EAT_LINE_COMMENT, + FINISHED +} ParserState; + +typedef enum { + QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE +} QuoteStyle; + + +typedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, + int *status); +typedef int (*io_cleanup)(void *src); + +typedef struct parser_t { + void *source; + io_callback cb_io; + io_cleanup cb_cleanup; + + int chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int datalen; // amount of data available + int datapos; + + // where to write out tokenized data + char *stream; + int stream_len; + int stream_cap; + + // Store words in (potentially ragged) matrix for now, hmm + char **words; + int *word_starts; // where we are in the stream + int words_len; + int words_cap; + + char *pword_start; // pointer to stream start of current field + int word_start; // position start of current field + + int *line_start; // position in words for start of line + int *line_fields; // Number of fields in each line + int lines; // Number of (good) lines observed + int file_lines; // Number of file lines observed (including bad or skipped) + int lines_cap; // Vector capacity + + // Tokenizing stuff + ParserState state; + int doublequote; /* is " represented by ""? */ + char delimiter; /* field separator */ + int delim_whitespace; /* delimit by consuming space/tabs instead */ + char quotechar; /* quote character */ + char escapechar; /* escape character */ + char lineterminator; + int skipinitialspace; /* ignore spaces following delimiter? 
*/ + int quoting; /* style of quoting to write */ + + // krufty, hmm =/ + int numeric_field; + + char commentchar; + int allow_embedded_newline; + int strict; /* raise exception on bad CSV */ + + int expected_fields; + int error_bad_lines; + int warn_bad_lines; + + // floating point options + char decimal; + char sci; + + // thousands separator (comma, period) + char thousands; + + int header; // Boolean: 1: has header, 0: no header + int header_start; // header row start + int header_end; // header row end + + void *skipset; + int skip_footer; + + // error handling + char *warn_msg; + char *error_msg; +} parser_t; + + + + +typedef struct coliter_t { + char **words; + int *line_start; + int col; +} coliter_t; + +void coliter_setup(coliter_t *self, parser_t *parser, int i, int start); +coliter_t *coliter_new(parser_t *self, int i); + +/* #define COLITER_NEXT(iter) iter->words[iter->line_start[iter->line++] + iter->col] */ +// #define COLITER_NEXT(iter) iter.words[iter.line_start[iter.line++] + iter.col] + +#define COLITER_NEXT(iter) iter.words[*iter.line_start++ + iter.col] + +parser_t* parser_new(); + +int parser_init(parser_t *self); + +int parser_consume_rows(parser_t *self, size_t nrows); + +int parser_trim_buffers(parser_t *self); + +int parser_add_skiprow(parser_t *self, int64_t row); + +void parser_free(parser_t *self); + +void parser_set_default_options(parser_t *self); + +void debug_print_parser(parser_t *self); + +int tokenize_nrows(parser_t *self, size_t nrows); + +int tokenize_all_rows(parser_t *self); + +/* + + Have parsed / type-converted a chunk of data and want to free memory from the + token stream + + */ +int clear_parsed_lines(parser_t *self, size_t nlines); + +int64_t str_to_int64(const char *p_item, int64_t int_min, + int64_t int_max, int *error, char tsep); +uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); + +int P_INLINE to_double(char *item, double *p_value, char sci, char decimal, char tsep); +int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); +int P_INLINE to_longlong(char *item, long long *p_value); +int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); +int P_INLINE to_boolean(char *item, uint8_t *val); + +#endif // _PARSER_COMMON_H_ diff --git a/pandas/src/period.c b/pandas/src/period.c new file mode 100644 index 00000000..5a744de4 --- /dev/null +++ b/pandas/src/period.c @@ -0,0 +1,1441 @@ +#include "period.h" + + +/* + * Borrowed and derived code from scikits.timeseries that we will expose via + * Cython to pandas. This primarily concerns period representation and + * frequency conversion routines. 
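+ *
+ * Editorial summary (added annotation, not in the upstream source): in
+ * outline, get_asfreq_func() converts between any two frequencies by
+ * composing a "source -> daily" routine with a "daily -> target" routine
+ * through transform_via_day(); the relation argument ('S' or 'E') selects
+ * the start or the end of the source span, and sub-daily resolutions are
+ * scaled with the daytime conversion factor matrix built below.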
+ */ + +/* see end of file for stuff pandas uses (search for 'pandas') */ + +/* ------------------------------------------------------------------ + * Code derived from scikits.timeseries + * ------------------------------------------------------------------*/ + +static int mod_compat(int x, int m) { + int result = x % m; + if (result < 0) return result + m; + return result; +} + +static int floordiv(int x, int divisor) { + if (x < 0) { + if (mod_compat(x, divisor)) { + return x / divisor - 1; + } + else return x / divisor; + } else { + return x / divisor; + } +} + +static asfreq_info NULL_AF_INFO; + +/* Table with day offsets for each month (0-based, without and with leap) */ +static int month_offset[2][13] = { + { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }, + { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 } +}; + +/* Table of number of days in a month (0-based, without and with leap) */ +static int days_in_month[2][12] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } +}; + +/* Return 1/0 iff year points to a leap year in calendar. */ +static int dInfoCalc_Leapyear(npy_int64 year, int calendar) +{ + if (calendar == GREGORIAN_CALENDAR) { + return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); + } else { + return (year % 4 == 0); + } +} + +/* Return the day of the week for the given absolute date. */ +static int dInfoCalc_DayOfWeek(npy_int64 absdate) +{ + int day_of_week; + + if (absdate >= 1) { + day_of_week = (absdate - 1) % 7; + } else { + day_of_week = 6 - ((-absdate) % 7); + } + return day_of_week; +} + +static int monthToQuarter(int month) { return ((month-1)/3)+1; } + +/* Return the year offset, that is the absolute date of the day + 31.12.(year-1) in the given calendar. + + Note: + For the Julian calendar we shift the absdate (which is measured + using the Gregorian Epoch) value by two days because the Epoch + (0001-01-01) in the Julian calendar lies 2 days before the Epoch in + the Gregorian calendar. */ +static int dInfoCalc_YearOffset(npy_int64 year, int calendar) +{ + year--; + if (calendar == GREGORIAN_CALENDAR) { + if (year >= 0 || -1/4 == -1) + return year*365 + year/4 - year/100 + year/400; + else + return year*365 + (year-3)/4 - (year-99)/100 + (year-399)/400; + } + else if (calendar == JULIAN_CALENDAR) { + if (year >= 0 || -1/4 == -1) + return year*365 + year/4 - 2; + else + return year*365 + (year-3)/4 - 2; + } + Py_Error(PyExc_ValueError, "unknown calendar"); + onError: + return INT_ERR_CODE; +} + +/* Set the instance's value using the given date and time. calendar may be set + * to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to indicate the calendar + * to be used. */ + +static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, + int year, int month, int day, int hour, int minute, double second, + int calendar) +{ + + /* Calculate the absolute date */ + { + int leap; + npy_int64 absdate; + int yearoffset; + + /* Range check */ + Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), + PyExc_ValueError, + "year out of range: %i", + year); + + /* Is it a leap year ? 
*/ + leap = dInfoCalc_Leapyear(year, calendar); + + /* Negative month values indicate months relative to the years end */ + if (month < 0) month += 13; + Py_AssertWithArg(month >= 1 && month <= 12, + PyExc_ValueError, + "month out of range (1-12): %i", + month); + + /* Negative values indicate days relative to the months end */ + if (day < 0) day += days_in_month[leap][month - 1] + 1; + Py_AssertWithArg(day >= 1 && day <= days_in_month[leap][month - 1], + PyExc_ValueError, + "day out of range: %i", + day); + + yearoffset = dInfoCalc_YearOffset(year, calendar); + if (PyErr_Occurred()) goto onError; + + absdate = day + month_offset[leap][month - 1] + yearoffset; + + dinfo->absdate = absdate; + + dinfo->year = year; + dinfo->month = month; + dinfo->quarter = ((month-1)/3)+1; + dinfo->day = day; + + dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); + dinfo->day_of_year = (short)(absdate - yearoffset); + + dinfo->calendar = calendar; + } + + /* Calculate the absolute time */ + { + Py_AssertWithArg(hour >= 0 && hour <= 23, + PyExc_ValueError, + "hour out of range (0-23): %i", + hour); + Py_AssertWithArg(minute >= 0 && minute <= 59, + PyExc_ValueError, + "minute out of range (0-59): %i", + minute); + Py_AssertWithArg(second >= (double)0.0 && + (second < (double)60.0 || + (hour == 23 && minute == 59 && + second < (double)61.0)), + PyExc_ValueError, + "second out of range (0.0 - <60.0; <61.0 for 23:59): %f", + second); + + dinfo->abstime = (double)(hour*3600 + minute*60) + second; + + dinfo->hour = hour; + dinfo->minute = minute; + dinfo->second = second; + } + return 0; + + onError: + return INT_ERR_CODE; +} + +/* Sets the date part of the date_info struct using the indicated + calendar. + + XXX This could also be done using some integer arithmetics rather + than with this iterative approach... 
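+
+   Editorial sketch (illustrative only, not part of the upstream code): for
+   the Gregorian branch a closed-form decomposition is possible.  Using this
+   file's convention that absdate 1 is 0001-01-01, shift to a March-based
+   day count and invert it with the usual "civil from days" arithmetic:
+
+       z   = absdate + 305;                  // days since 0000-03-01, z > 0
+       era = z / 146097;                     // 400-year eras
+       doe = z - era * 146097;               // day of era, [0, 146096]
+       yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365;   // [0, 399]
+       y   = yoe + era * 400;                // March-based year
+       doy = doe - (365*yoe + yoe/4 - yoe/100);                 // [0, 365]
+       mp  = (5*doy + 2) / 153;              // 0 = March ... 11 = February
+       day = doy - (153*mp + 2)/5 + 1;       // [1, 31]
+       month = mp < 10 ? mp + 3 : mp - 9;    // [1, 12]
+       year  = y + (month <= 2);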
*/ +static +int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, + npy_int64 absdate, int calendar) +{ + register npy_int64 year; + npy_int64 yearoffset; + int leap,dayoffset; + int *monthoffset; + + /* Approximate year */ + if (calendar == GREGORIAN_CALENDAR) { + year = (npy_int64)(((double)absdate) / 365.2425); + } else if (calendar == JULIAN_CALENDAR) { + year = (npy_int64)(((double)absdate) / 365.25); + } else { + Py_Error(PyExc_ValueError, "unknown calendar"); + } + + if (absdate > 0) year++; + + /* Apply corrections to reach the correct year */ + while (1) { + /* Calculate the year offset */ + yearoffset = dInfoCalc_YearOffset(year, calendar); + if (PyErr_Occurred()) + goto onError; + + /* Backward correction: absdate must be greater than the + yearoffset */ + if (yearoffset >= absdate) { + year--; + continue; + } + + dayoffset = absdate - yearoffset; + leap = dInfoCalc_Leapyear(year,calendar); + + /* Forward correction: non leap years only have 365 days */ + if (dayoffset > 365 && !leap) { + year++; + continue; + } + break; + } + + dinfo->year = year; + dinfo->calendar = calendar; + + /* Now iterate to find the month */ + monthoffset = month_offset[leap]; + { + register int month; + + for (month = 1; month < 13; month++) { + if (monthoffset[month] >= dayoffset) + break; + } + + dinfo->month = month; + dinfo->quarter = monthToQuarter(month); + dinfo->day = dayoffset - month_offset[leap][month-1]; + } + + + dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); + dinfo->day_of_year = dayoffset; + dinfo->absdate = absdate; + + return 0; + + onError: + return INT_ERR_CODE; +} + +/////////////////////////////////////////////// + +// frequency specifc conversion routines +// each function must take an integer fromDate and +// a char relation ('S' or 'E' for 'START' or 'END') +/////////////////////////////////////////////////////////////////////// + +// helpers for frequency conversion routines // + +static int daytime_conversion_factors[][2] = { + { FR_DAY, 1 }, + { FR_HR, 24 }, + { FR_MIN, 60 }, + { FR_SEC, 60 }, + { FR_MS, 1000 }, + { FR_US, 1000 }, + { FR_NS, 1000 }, + { 0, 0 } +}; + +static npy_int64** daytime_conversion_factor_matrix = NULL; + +PANDAS_INLINE static int max_value(int a, int b) { + return a > b ? a : b; +} + +PANDAS_INLINE static int min_value(int a, int b) { + return a < b ? 
a : b; +} + +PANDAS_INLINE static int get_freq_group(int freq) { + return (freq/1000)*1000; +} + +PANDAS_INLINE static int get_freq_group_index(int freq) { + return freq/1000; +} + +static int calc_conversion_factors_matrix_size() { + int matrix_size = 0; + int index; + for (index=0;; index++) { + int period_value = get_freq_group_index(daytime_conversion_factors[index][0]); + if (period_value == 0) { + break; + } + matrix_size = max_value(matrix_size, period_value); + } + return matrix_size + 1; +} + +static void alloc_conversion_factors_matrix(int matrix_size) { + int row_index; + int column_index; + daytime_conversion_factor_matrix = malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); + for (row_index = 0; row_index < matrix_size; row_index++) { + daytime_conversion_factor_matrix[row_index] = malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); + for (column_index = 0; column_index < matrix_size; column_index++) { + daytime_conversion_factor_matrix[row_index][column_index] = 0; + } + } +} + +static npy_int64 calculate_conversion_factor(int start_value, int end_value) { + npy_int64 conversion_factor = 0; + int index; + for (index=0;; index++) { + int freq_group = daytime_conversion_factors[index][0]; + + if (freq_group == 0) { + conversion_factor = 0; + break; + } + + if (freq_group == start_value) { + conversion_factor = 1; + } else { + conversion_factor *= daytime_conversion_factors[index][1]; + } + + if (freq_group == end_value) { + break; + } + } + return conversion_factor; +} + +static void populate_conversion_factors_matrix() { + int row_index_index; + int row_value, row_index; + int column_index_index; + int column_value, column_index; + + for (row_index_index = 0;; row_index_index++) { + row_value = daytime_conversion_factors[row_index_index][0]; + if (row_value == 0) { + break; + } + row_index = get_freq_group_index(row_value); + for (column_index_index = row_index_index;; column_index_index++) { + column_value = daytime_conversion_factors[column_index_index][0]; + if (column_value == 0) { + break; + } + column_index = get_freq_group_index(column_value); + + daytime_conversion_factor_matrix[row_index][column_index] = calculate_conversion_factor(row_value, column_value); + } + } +} + +void initialize_daytime_conversion_factor_matrix() { + if (daytime_conversion_factor_matrix == NULL) { + int matrix_size = calc_conversion_factors_matrix_size(); + alloc_conversion_factors_matrix(matrix_size); + populate_conversion_factors_matrix(); + } +} + +PANDAS_INLINE npy_int64 get_daytime_conversion_factor(int from_index, int to_index) +{ + return daytime_conversion_factor_matrix[min_value(from_index, to_index)][max_value(from_index, to_index)]; +} + +PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, asfreq_info *af_info, int atEnd) +{ + if (atEnd) { + return (ordinal + 1) * af_info->intraday_conversion_factor - 1; + } else { + return ordinal * af_info->intraday_conversion_factor; + } +} + +PANDAS_INLINE npy_int64 downsample_daytime(npy_int64 ordinal, asfreq_info *af_info, int atEnd) +{ + return ordinal / (af_info->intraday_conversion_factor); +} + +PANDAS_INLINE static npy_int64 transform_via_day(npy_int64 ordinal, char relation, asfreq_info *af_info, freq_conv_func first_func, freq_conv_func second_func) { + //printf("transform_via_day(%ld, %ld, %d)\n", ordinal, af_info->intraday_conversion_factor, af_info->intraday_conversion_upsample); + npy_int64 result; + + result = (*first_func)(ordinal, relation, af_info); + result = (*second_func)(result, relation, 
af_info); + + return result; +} + +static npy_int64 DtoB_weekday(npy_int64 absdate) { + return (((absdate) / 7) * 5) + (absdate) % 7 - BDAY_OFFSET; +} + +static npy_int64 DtoB_WeekendToMonday(npy_int64 absdate, int day_of_week) { + if (day_of_week > 4) { + //change to Monday after weekend + absdate += (7 - day_of_week); + } + return DtoB_weekday(absdate); +} + +static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { + if (day_of_week > 4) { + //change to friday before weekend + absdate -= (day_of_week - 4); + } + return DtoB_weekday(absdate); +} + +static npy_int64 absdate_from_ymd(int y, int m, int d) { + struct date_info tempDate; + if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, GREGORIAN_CALENDAR)) { + return INT_ERR_CODE; + } + return tempDate.absdate; +} + +//************ FROM DAILY *************** + +static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + struct date_info dinfo; + ordinal = downsample_daytime(ordinal, af_info, 0); + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + if (dinfo.month > af_info->to_a_year_end) { + return (npy_int64)(dinfo.year + 1 - BASE_YEAR); + } + else { + return (npy_int64)(dinfo.year - BASE_YEAR); + } +} + +static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, int *quarter) { + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + if (af_info->to_q_year_end != 12) { + dinfo.month -= af_info->to_q_year_end; + if (dinfo.month <= 0) { dinfo.month += 12; } + else { dinfo.year += 1; } + dinfo.quarter = monthToQuarter(dinfo.month); + } + + *year = dinfo.year; + *quarter = dinfo.quarter; + + return 0; +} + +static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + int year, quarter; + + ordinal = downsample_daytime(ordinal, af_info, 0); + + if (DtoQ_yq(ordinal, af_info, &year, &quarter) == INT_ERR_CODE) { + return INT_ERR_CODE; + } + + return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); +} + +static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + struct date_info dinfo; + + ordinal = downsample_daytime(ordinal, af_info, 0); + + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); +} + +static npy_int64 asfreq_DTtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + ordinal = downsample_daytime(ordinal, af_info, 0); + return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end))/7 + 1 - WEEK_OFFSET; +} + +static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + struct date_info dinfo; + + ordinal = downsample_daytime(ordinal, af_info, 0); + + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + + if (relation == 'S') { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } else { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } +} + +// all intra day calculations are now done within one function +static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return downsample_daytime(ordinal, af_info, relation == 'E'); +} + +static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return upsample_daytime(ordinal, af_info, relation == 'E'); 
+} +//************ FROM BUSINESS *************** + +static npy_int64 asfreq_BtoDT(npy_int64 ordinal, char relation, asfreq_info *af_info) +{ + ordinal += BDAY_OFFSET; + ordinal = (((ordinal - 1) / 5) * 7 + + mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); + + return upsample_daytime(ordinal, af_info, relation != 'S'); +} + +static npy_int64 asfreq_BtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, asfreq_DTtoA); +} + +static npy_int64 asfreq_BtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, asfreq_DTtoQ); +} + +static npy_int64 asfreq_BtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, asfreq_DTtoM); +} + +static npy_int64 asfreq_BtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, asfreq_DTtoW); +} + +//************ FROM WEEKLY *************** + +static npy_int64 asfreq_WtoDT(npy_int64 ordinal, char relation, asfreq_info *af_info) { + ordinal += WEEK_OFFSET; + if (relation != 'S') { + ordinal += 1; + } + + ordinal = ordinal * 7 - 6 + af_info->from_week_end - ORD_OFFSET; + + if (relation != 'S') { + ordinal -= 1; + } + + return upsample_daytime(ordinal, af_info, relation != 'S'); +} + +static npy_int64 asfreq_WtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoA); +} + +static npy_int64 asfreq_WtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoQ); +} + +static npy_int64 asfreq_WtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoM); +} + +static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, asfreq_DTtoW); +} + +static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } + else { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } +} + +//************ FROM MONTHLY *************** +static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { + *y = floordiv(ordinal, 12) + BASE_YEAR; + *m = mod_compat(ordinal, 12) + 1; +} + + +static npy_int64 asfreq_MtoDT(npy_int64 ordinal, char relation, asfreq_info* af_info) { + npy_int64 absdate; + int y, m; + + if (relation == 'E') { + ordinal += 1; + } + MtoD_ym(ordinal, &y, &m); + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + ordinal = absdate - ORD_OFFSET; + + if (relation == 'E') { + ordinal -= 1; + } + + return upsample_daytime(ordinal, af_info, relation != 'S'); +} + +static npy_int64 asfreq_MtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, asfreq_DTtoA); +} + +static npy_int64 asfreq_MtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, asfreq_DTtoQ); +} + +static npy_int64 
asfreq_MtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, asfreq_DTtoW); +} + +static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + struct date_info dinfo; + + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } + else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } +} + +//************ FROM QUARTERLY *************** + +static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { + *y = floordiv(ordinal, 4) + BASE_YEAR; + *m = mod_compat(ordinal, 4) * 3 + 1; + + if (af_info->from_q_year_end != 12) { + *m += af_info->from_q_year_end; + if (*m > 12) { *m -= 12; } + else { *y -= 1; } + } +} + +static npy_int64 asfreq_QtoDT(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + npy_int64 absdate; + int y, m; + + if (relation == 'E') { + ordinal += 1; + } + + QtoD_ym(ordinal, &y, &m, af_info); + + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + + if (relation == 'E') { + absdate -= 1; + } + + return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); +} + +static npy_int64 asfreq_QtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, asfreq_DTtoQ); +} + +static npy_int64 asfreq_QtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, asfreq_DTtoA); +} + +static npy_int64 asfreq_QtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, asfreq_DTtoM); +} + +static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, asfreq_DTtoW); +} + +static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } + else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } +} + + +//************ FROM ANNUAL *************** + +static npy_int64 asfreq_AtoDT(npy_int64 year, char relation, asfreq_info *af_info) { + npy_int64 absdate; + int month = (af_info->from_a_year_end) % 12; + + // start from 1970 + year += BASE_YEAR; + + month += 1; + + if (af_info->from_a_year_end != 12) { + year -= 1; + } + + if (relation == 'E') { + year += 1; + } + + absdate = absdate_from_ymd(year, month, 1); + + if (absdate == INT_ERR_CODE) { + return INT_ERR_CODE; + } + + if (relation == 'E') { + absdate -= 1; + } + + return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); +} + +static npy_int64 asfreq_AtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoA); +} + +static npy_int64 asfreq_AtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoQ); +} + +static npy_int64 asfreq_AtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return 
transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoM); +} + +static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, asfreq_DTtoW); +} + +static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } + else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } +} + +static npy_int64 nofunc(npy_int64 ordinal, char relation, asfreq_info *af_info) { return INT_ERR_CODE; } +static npy_int64 no_op(npy_int64 ordinal, char relation, asfreq_info *af_info) { return ordinal; } + +// end of frequency specific conversion routines + +static int calc_a_year_end(int freq, int group) { + int result = (freq - group) % 12; + if (result == 0) {return 12;} + else {return result;} +} + +static int calc_week_end(int freq, int group) { + return freq - group; +} + +void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) { + int fromGroup = get_freq_group(fromFreq); + int toGroup = get_freq_group(toFreq); + + af_info->intraday_conversion_factor = + get_daytime_conversion_factor( + get_freq_group_index(max_value(fromGroup, FR_DAY)), + get_freq_group_index(max_value(toGroup, FR_DAY)) + ); + + //printf("get_asfreq_info(%d, %d) %ld, %d\n", fromFreq, toFreq, af_info->intraday_conversion_factor, af_info->intraday_conversion_upsample); + + switch(fromGroup) + { + case FR_WK: + af_info->from_week_end = calc_week_end(fromFreq, fromGroup); + break; + case FR_ANN: + af_info->from_a_year_end = calc_a_year_end(fromFreq, fromGroup); + break; + case FR_QTR: + af_info->from_q_year_end = calc_a_year_end(fromFreq, fromGroup); + break; + } + + switch(toGroup) + { + case FR_WK: + af_info->to_week_end = calc_week_end(toFreq, toGroup); + break; + case FR_ANN: + af_info->to_a_year_end = calc_a_year_end(toFreq, toGroup); + break; + case FR_QTR: + af_info->to_q_year_end = calc_a_year_end(toFreq, toGroup); + break; + } +} + + +freq_conv_func get_asfreq_func(int fromFreq, int toFreq) +{ + int fromGroup = get_freq_group(fromFreq); + int toGroup = get_freq_group(toFreq); + + if (fromGroup == FR_UND) { fromGroup = FR_DAY; } + + switch(fromGroup) + { + case FR_ANN: + switch(toGroup) + { + case FR_ANN: return &asfreq_AtoA; + case FR_QTR: return &asfreq_AtoQ; + case FR_MTH: return &asfreq_AtoM; + case FR_WK: return &asfreq_AtoW; + case FR_BUS: return &asfreq_AtoB; + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: + case FR_MS: + case FR_US: + case FR_NS: + return &asfreq_AtoDT; + + default: return &nofunc; + } + + case FR_QTR: + switch(toGroup) + { + case FR_ANN: return &asfreq_QtoA; + case FR_QTR: return &asfreq_QtoQ; + case FR_MTH: return &asfreq_QtoM; + case FR_WK: return &asfreq_QtoW; + case FR_BUS: return &asfreq_QtoB; + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: + case FR_MS: + case FR_US: + case FR_NS: + return &asfreq_QtoDT; + default: return &nofunc; + } + + case FR_MTH: + switch(toGroup) + { + case FR_ANN: return &asfreq_MtoA; + case FR_QTR: return &asfreq_MtoQ; + case FR_MTH: return &no_op; + case FR_WK: return &asfreq_MtoW; + case FR_BUS: return &asfreq_MtoB; + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: + case FR_MS: + case FR_US: + case FR_NS: + return &asfreq_MtoDT; + 
default: return &nofunc; + } + + case FR_WK: + switch(toGroup) + { + case FR_ANN: return &asfreq_WtoA; + case FR_QTR: return &asfreq_WtoQ; + case FR_MTH: return &asfreq_WtoM; + case FR_WK: return &asfreq_WtoW; + case FR_BUS: return &asfreq_WtoB; + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: + case FR_MS: + case FR_US: + case FR_NS: + return &asfreq_WtoDT; + default: return &nofunc; + } + + case FR_BUS: + switch(toGroup) + { + case FR_ANN: return &asfreq_BtoA; + case FR_QTR: return &asfreq_BtoQ; + case FR_MTH: return &asfreq_BtoM; + case FR_WK: return &asfreq_BtoW; + case FR_BUS: return &no_op; + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: + case FR_MS: + case FR_US: + case FR_NS: + return &asfreq_BtoDT; + default: return &nofunc; + } + + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: + case FR_MS: + case FR_US: + case FR_NS: + switch(toGroup) + { + case FR_ANN: return &asfreq_DTtoA; + case FR_QTR: return &asfreq_DTtoQ; + case FR_MTH: return &asfreq_DTtoM; + case FR_WK: return &asfreq_DTtoW; + case FR_BUS: return &asfreq_DTtoB; + case FR_DAY: + case FR_HR: + case FR_MIN: + case FR_SEC: + case FR_MS: + case FR_US: + case FR_NS: + if (fromGroup > toGroup) { + return &asfreq_DownsampleWithinDay; + } else { + return &asfreq_UpsampleWithinDay; + } + default: return &nofunc; + } + + default: return &nofunc; + } +} + +double get_abs_time(int freq, npy_int64 date_ordinal, npy_int64 ordinal) { + //printf("get_abs_time %d %lld %lld\n", freq, date_ordinal, ordinal); + + int freq_index, day_index, base_index; + npy_int64 per_day, start_ord; + double unit, result; + + if (freq <= FR_DAY) { + return 0; + } + + freq_index = get_freq_group_index(freq); + day_index = get_freq_group_index(FR_DAY); + base_index = get_freq_group_index(FR_SEC); + + //printf(" indices: day %d, freq %d, base %d\n", day_index, freq_index, base_index); + + per_day = get_daytime_conversion_factor(day_index, freq_index); + unit = get_daytime_conversion_factor(freq_index, base_index); + + //printf(" per_day: %lld, unit: %f\n", per_day, unit); + + if (base_index < freq_index) { + unit = 1 / unit; + //printf(" corrected unit: %f\n", unit); + } + + start_ord = date_ordinal * per_day; + //printf("start_ord: %lld\n", start_ord); + result = (double) ( unit * (ordinal - start_ord)); + //printf(" result: %f\n", result); + return result; +} + +/* Sets the time part of the DateTime object. */ +static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, + double abstime) +{ + int inttime; + int hour,minute; + double second; + + inttime = (int)abstime; + hour = inttime / 3600; + minute = (inttime % 3600) / 60; + second = abstime - (double)(hour*3600 + minute*60); + + dinfo->hour = hour; + dinfo->minute = minute; + dinfo->second = second; + + dinfo->abstime = abstime; + + return 0; +} + +/* Set the instance's value using the given date and time. calendar + may be set to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to + indicate the calendar to be used. 
*/ +static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, + npy_int64 absdate, + double abstime, + int calendar) +{ + /* Bounds check */ + Py_AssertWithArg(abstime >= 0.0 && abstime <= SECONDS_PER_DAY, + PyExc_ValueError, + "abstime out of range (0.0 - 86400.0): %f", + abstime); + + /* Calculate the date */ + if (dInfoCalc_SetFromAbsDate(dinfo, absdate, calendar)) goto onError; + + /* Calculate the time */ + if (dInfoCalc_SetFromAbsTime(dinfo, abstime)) goto onError; + + return 0; +onError: + return INT_ERR_CODE; +} + +/* ------------------------------------------------------------------ + * New pandas API-helper code, to expose to cython + * ------------------------------------------------------------------*/ + +npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation) +{ + npy_int64 val; + freq_conv_func func; + asfreq_info finfo; + + func = get_asfreq_func(freq1, freq2); + + get_asfreq_info(freq1, freq2, &finfo); + + //printf("\n%x %d %d %ld %ld\n", func, freq1, freq2, finfo.intraday_conversion_factor, -finfo.intraday_conversion_factor); + + val = (*func)(period_ordinal, relation, &finfo); + + if (val == INT_ERR_CODE) { + //Py_Error(PyExc_ValueError, "Unable to convert to desired frequency."); + goto onError; + } + return val; +onError: + return INT_ERR_CODE; +} + + +/* generate an ordinal in period space */ +npy_int64 get_period_ordinal(int year, int month, int day, + int hour, int minute, int second, int microseconds, int picoseconds, + int freq) +{ + npy_int64 absdays, delta, seconds; + npy_int64 weeks, days; + npy_int64 ordinal, day_adj; + int freq_group, fmonth, mdiff; + freq_group = get_freq_group(freq); + + if (freq == FR_SEC || freq == FR_MS || freq == FR_US || freq == FR_NS) { + + absdays = absdate_from_ymd(year, month, day); + delta = (absdays - ORD_OFFSET); + seconds = (npy_int64)(delta * 86400 + hour * 3600 + minute * 60 + second); + + switch(freq) { + case FR_MS: + return seconds * 1000 + microseconds / 1000; + + case FR_US: + return seconds * 1000000 + microseconds; + + case FR_NS: + return seconds * 1000000000 + microseconds * 1000 + picoseconds / 1000; + } + + return seconds; + } + + if (freq == FR_MIN) { + absdays = absdate_from_ymd(year, month, day); + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*1440 + hour*60 + minute); + } + + if (freq == FR_HR) { + if ((absdays = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) + { + goto onError; + } + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*24 + hour); + } + + if (freq == FR_DAY) + { + return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); + } + + if (freq == FR_UND) + { + return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); + } + + if (freq == FR_BUS) + { + if((days = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) + { + goto onError; + } + // calculate the current week assuming sunday as last day of a week + weeks = (days - BASE_WEEK_TO_DAY_OFFSET) / DAYS_PER_WEEK; + // calculate the current weekday (in range 1 .. 7) + delta = (days - BASE_WEEK_TO_DAY_OFFSET) % DAYS_PER_WEEK + 1; + // return the number of business days in full weeks plus the business days in the last - possible partial - week + return (npy_int64)(weeks * BUSINESS_DAYS_PER_WEEK) + + (delta <= BUSINESS_DAYS_PER_WEEK + ? 
delta + : BUSINESS_DAYS_PER_WEEK + 1) + - BDAY_OFFSET; + } + + if (freq_group == FR_WK) + { + if((ordinal = (npy_int64)absdate_from_ymd(year, month, day)) == INT_ERR_CODE) + { + goto onError; + } + day_adj = freq - FR_WK; + return (ordinal - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET; + } + + if (freq == FR_MTH) + { + return (year - BASE_YEAR) * 12 + month - 1; + } + + if (freq_group == FR_QTR) + { + fmonth = freq - FR_QTR; + if (fmonth == 0) fmonth = 12; + + mdiff = month - fmonth; + if (mdiff < 0) mdiff += 12; + if (month >= fmonth) mdiff += 12; + + return (year - BASE_YEAR) * 4 + (mdiff - 1) / 3; + } + + if (freq_group == FR_ANN) + { + fmonth = freq - FR_ANN; + if (fmonth == 0) fmonth = 12; + if (month <= fmonth) { + return year - BASE_YEAR; + } + else { + return year - BASE_YEAR + 1; + } + } + + Py_Error(PyExc_RuntimeError, "Unable to generate frequency ordinal"); + +onError: + return INT_ERR_CODE; +} + +/* + Returns the proleptic Gregorian ordinal of the date, as an integer. + This corresponds to the number of days since Jan., 1st, 1AD. + When the instance has a frequency less than daily, the proleptic date + is calculated for the last day of the period. + */ + +npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) +{ + asfreq_info af_info; + freq_conv_func toDaily = NULL; + + if (freq == FR_DAY) + return period_ordinal + ORD_OFFSET; + + toDaily = get_asfreq_func(freq, FR_DAY); + get_asfreq_info(freq, FR_DAY, &af_info); + + return toDaily(period_ordinal, 'E', &af_info) + ORD_OFFSET; +} + +char *str_replace(const char *s, const char *old, const char *new) { + char *ret; + int i, count = 0; + size_t newlen = strlen(new); + size_t oldlen = strlen(old); + + for (i = 0; s[i] != '\0'; i++) { + if (strstr(&s[i], old) == &s[i]) { + count++; + i += oldlen - 1; + } + } + + ret = PyArray_malloc(i + 1 + count * (newlen - oldlen)); + if (ret == NULL) {return (char *)PyErr_NoMemory();} + + i = 0; + while (*s) { + if (strstr(s, old) == s) { + strcpy(&ret[i], new); + i += newlen; + s += oldlen; + } else { + ret[i++] = *s++; + } + } + ret[i] = '\0'; + + return ret; +} + +// function to generate a nice string representation of the period +// object, originally from DateObject_strftime + +char* c_strftime(struct date_info *tmp, char *fmt) { + struct tm c_date; + char* result; + struct date_info dinfo = *tmp; + int result_len = strlen(fmt) + 50; + + c_date.tm_sec = (int)dinfo.second; + c_date.tm_min = dinfo.minute; + c_date.tm_hour = dinfo.hour; + c_date.tm_mday = dinfo.day; + c_date.tm_mon = dinfo.month - 1; + c_date.tm_year = dinfo.year - 1900; + c_date.tm_wday = (dinfo.day_of_week + 1) % 7; + c_date.tm_yday = dinfo.day_of_year - 1; + c_date.tm_isdst = -1; + + result = malloc(result_len * sizeof(char)); + + strftime(result, result_len, fmt, &c_date); + + return result; +} + +int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { + asfreq_info af_info; + int qtr_freq; + npy_int64 daily_ord; + npy_int64 (*toDaily)(npy_int64, char, asfreq_info*) = NULL; + + toDaily = get_asfreq_func(freq, FR_DAY); + get_asfreq_info(freq, FR_DAY, &af_info); + + daily_ord = toDaily(ordinal, 'E', &af_info); + + if (get_freq_group(freq) == FR_QTR) { + qtr_freq = freq; + } else { qtr_freq = FR_QTR; } + get_asfreq_info(FR_DAY, qtr_freq, &af_info); + + if(DtoQ_yq(daily_ord, &af_info, year, quarter) == INT_ERR_CODE) + return -1; + + return 0; +} + + + + + +static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { + asfreq_info af_info; + int qtr_freq; + + ordinal = 
get_python_ordinal(ordinal, freq) - ORD_OFFSET; + + if (get_freq_group(freq) == FR_QTR) + qtr_freq = freq; + else + qtr_freq = FR_QTR; + + get_asfreq_info(FR_DAY, qtr_freq, &af_info); + + if (DtoQ_yq(ordinal, &af_info, year, quarter) == INT_ERR_CODE) + return INT_ERR_CODE; + + if ((qtr_freq % 1000) > 12) + *year -= 1; + + return 0; +} + +static int _ISOWeek(struct date_info *dinfo) +{ + int week; + + /* Estimate */ + week = (dinfo->day_of_year-1) - dinfo->day_of_week + 3; + if (week >= 0) week = week / 7 + 1; + + /* Verify */ + if (week < 0) { + /* The day lies in last week of the previous year */ + if ((week > -2) || + (week == -2 && dInfoCalc_Leapyear(dinfo->year-1, dinfo->calendar))) + week = 53; + else + week = 52; + } else if (week == 53) { + /* Check if the week belongs to year or year+1 */ + if (31-dinfo->day + dinfo->day_of_week < 3) { + week = 1; + } + } + + return week; +} + +int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) +{ + npy_int64 absdate = get_python_ordinal(ordinal, freq); + double abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal); + + while (abstime < 0) { + abstime += 86400; + absdate -= 1; + } + while (abstime >= 86400) { + abstime -= 86400; + absdate += 1; + } + + if(dInfoCalc_SetFromAbsDateTime(dinfo, absdate, + abstime, GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + + return 0; +} + +int pyear(npy_int64 ordinal, int freq) { + struct date_info dinfo; + get_date_info(ordinal, freq, &dinfo); + return dinfo.year; +} + +int pqyear(npy_int64 ordinal, int freq) { + int year, quarter; + if( _quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) + return INT_ERR_CODE; + return year; +} + +int pquarter(npy_int64 ordinal, int freq) { + int year, quarter; + if(_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) + return INT_ERR_CODE; + return quarter; +} + +int pmonth(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.month; +} + +int pday(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day; +} + +int pweekday(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day_of_week; +} + +int pday_of_week(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day_of_week; +} + +int pday_of_year(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day_of_year; +} + +int pweek(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return _ISOWeek(&dinfo); +} + +int phour(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.hour; +} + +int pminute(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.minute; +} + +int psecond(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return (int)dinfo.second; +} diff --git a/pandas/src/period.h b/pandas/src/period.h new file 
mode 100644 index 00000000..55c3722e --- /dev/null +++ b/pandas/src/period.h @@ -0,0 +1,169 @@ +/* + * Borrowed and derived code from scikits.timeseries that we will expose via + * Cython to pandas. This primarily concerns interval representation and + * frequency conversion routines. + */ + +#ifndef C_PERIOD_H +#define C_PERIOD_H + +#include +#include "helper.h" +#include "numpy/ndarraytypes.h" +#include "headers/stdint.h" +#include "limits.h" + +/* + * declarations from period here + */ + +#define GREGORIAN_CALENDAR 0 +#define JULIAN_CALENDAR 1 + +#define SECONDS_PER_DAY ((double) 86400.0) + +#define Py_AssertWithArg(x,errortype,errorstr,a1) {if (!(x)) {PyErr_Format(errortype,errorstr,a1);goto onError;}} +#define Py_Error(errortype,errorstr) {PyErr_SetString(errortype,errorstr);goto onError;} + +/*** FREQUENCY CONSTANTS ***/ + +// HIGHFREQ_ORIG is the datetime ordinal from which to begin the second +// frequency ordinal sequence + +// typedef int64_t npy_int64; +// begins second ordinal at 1/1/1970 unix epoch + +// #define HIGHFREQ_ORIG 62135683200LL +#define BASE_YEAR 1970 +#define ORD_OFFSET 719163LL // days until 1970-01-01 +#define BDAY_OFFSET 513689LL // days until 1970-01-01 +#define WEEK_OFFSET 102737LL +#define BASE_WEEK_TO_DAY_OFFSET 1 // difference between day 0 and end of week in days +#define DAYS_PER_WEEK 7 +#define BUSINESS_DAYS_PER_WEEK 5 +#define HIGHFREQ_ORIG 0 // ORD_OFFSET * 86400LL // days until 1970-01-01 + +#define FR_ANN 1000 /* Annual */ +#define FR_ANNDEC FR_ANN /* Annual - December year end*/ +#define FR_ANNJAN 1001 /* Annual - January year end*/ +#define FR_ANNFEB 1002 /* Annual - February year end*/ +#define FR_ANNMAR 1003 /* Annual - March year end*/ +#define FR_ANNAPR 1004 /* Annual - April year end*/ +#define FR_ANNMAY 1005 /* Annual - May year end*/ +#define FR_ANNJUN 1006 /* Annual - June year end*/ +#define FR_ANNJUL 1007 /* Annual - July year end*/ +#define FR_ANNAUG 1008 /* Annual - August year end*/ +#define FR_ANNSEP 1009 /* Annual - September year end*/ +#define FR_ANNOCT 1010 /* Annual - October year end*/ +#define FR_ANNNOV 1011 /* Annual - November year end*/ + +/* The standard quarterly frequencies with various fiscal year ends + eg, Q42005 for Q@OCT runs Aug 1, 2005 to Oct 31, 2005 */ +#define FR_QTR 2000 /* Quarterly - December year end (default quarterly) */ +#define FR_QTRDEC FR_QTR /* Quarterly - December year end */ +#define FR_QTRJAN 2001 /* Quarterly - January year end */ +#define FR_QTRFEB 2002 /* Quarterly - February year end */ +#define FR_QTRMAR 2003 /* Quarterly - March year end */ +#define FR_QTRAPR 2004 /* Quarterly - April year end */ +#define FR_QTRMAY 2005 /* Quarterly - May year end */ +#define FR_QTRJUN 2006 /* Quarterly - June year end */ +#define FR_QTRJUL 2007 /* Quarterly - July year end */ +#define FR_QTRAUG 2008 /* Quarterly - August year end */ +#define FR_QTRSEP 2009 /* Quarterly - September year end */ +#define FR_QTROCT 2010 /* Quarterly - October year end */ +#define FR_QTRNOV 2011 /* Quarterly - November year end */ + +#define FR_MTH 3000 /* Monthly */ + +#define FR_WK 4000 /* Weekly */ +#define FR_WKSUN FR_WK /* Weekly - Sunday end of week */ +#define FR_WKMON 4001 /* Weekly - Monday end of week */ +#define FR_WKTUE 4002 /* Weekly - Tuesday end of week */ +#define FR_WKWED 4003 /* Weekly - Wednesday end of week */ +#define FR_WKTHU 4004 /* Weekly - Thursday end of week */ +#define FR_WKFRI 4005 /* Weekly - Friday end of week */ +#define FR_WKSAT 4006 /* Weekly - Saturday end of week */ + +#define FR_BUS 5000 /* 
Business days */ +#define FR_DAY 6000 /* Daily */ +#define FR_HR 7000 /* Hourly */ +#define FR_MIN 8000 /* Minutely */ +#define FR_SEC 9000 /* Secondly */ +#define FR_MS 10000 /* Millisecondly */ +#define FR_US 11000 /* Microsecondly */ +#define FR_NS 12000 /* Nanosecondly */ + +#define FR_UND -10000 /* Undefined */ + +#define INT_ERR_CODE INT32_MIN + +#define MEM_CHECK(item) if (item == NULL) { return PyErr_NoMemory(); } +#define ERR_CHECK(item) if (item == NULL) { return NULL; } + +typedef struct asfreq_info { + int from_week_end; // day the week ends on in the "from" frequency + int to_week_end; // day the week ends on in the "to" frequency + + int from_a_year_end; // month the year ends on in the "from" frequency + int to_a_year_end; // month the year ends on in the "to" frequency + + int from_q_year_end; // month the year ends on in the "from" frequency + int to_q_year_end; // month the year ends on in the "to" frequency + + npy_int64 intraday_conversion_factor; +} asfreq_info; + + +typedef struct date_info { + npy_int64 absdate; + double abstime; + + double second; + int minute; + int hour; + int day; + int month; + int quarter; + int year; + int day_of_week; + int day_of_year; + int calendar; +} date_info; + +typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info*); + +/* + * new pandas API helper functions here + */ + +npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation); + +npy_int64 get_period_ordinal(int year, int month, int day, + int hour, int minute, int second, int microseconds, int picoseconds, + int freq); + +npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq); + +int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo); +freq_conv_func get_asfreq_func(int fromFreq, int toFreq); +void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info); + +int pyear(npy_int64 ordinal, int freq); +int pqyear(npy_int64 ordinal, int freq); +int pquarter(npy_int64 ordinal, int freq); +int pmonth(npy_int64 ordinal, int freq); +int pday(npy_int64 ordinal, int freq); +int pweekday(npy_int64 ordinal, int freq); +int pday_of_week(npy_int64 ordinal, int freq); +int pday_of_year(npy_int64 ordinal, int freq); +int pweek(npy_int64 ordinal, int freq); +int phour(npy_int64 ordinal, int freq); +int pminute(npy_int64 ordinal, int freq); +int psecond(npy_int64 ordinal, int freq); + +double getAbsTime(int freq, npy_int64 dailyDate, npy_int64 originalDate); +char *c_strftime(struct date_info *dinfo, char *fmt); +int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); + +void initialize_daytime_conversion_factor_matrix(); +#endif diff --git a/pandas/src/properties.pyx b/pandas/src/properties.pyx new file mode 100644 index 00000000..e619a3b6 --- /dev/null +++ b/pandas/src/properties.pyx @@ -0,0 +1,65 @@ +from cpython cimport PyDict_Contains, PyDict_GetItem, PyDict_GetItem + + +cdef class cache_readonly(object): + + cdef readonly: + object func, name, allow_setting + + def __init__(self, func=None, allow_setting=False): + if func is not None: + self.func = func + self.name = func.__name__ + self.allow_setting = allow_setting + + def __call__(self, func, doc=None): + self.func = func + self.name = func.__name__ + return self + + def __get__(self, obj, typ): + # Get the cache or set a default one if needed + + cache = getattr(obj, '_cache', None) + if cache is None: + try: + cache = obj._cache = {} + except (AttributeError): + return + + if PyDict_Contains(cache, self.name): + # not necessary to Py_INCREF + val = 
PyDict_GetItem(cache, self.name) + else: + val = self.func(obj) + PyDict_SetItem(cache, self.name, val) + return val + + def __set__(self, obj, value): + + if not self.allow_setting: + raise Exception("cannot set values for [%s]" % self.name) + + # Get the cache or set a default one if needed + cache = getattr(obj, '_cache', None) + if cache is None: + try: + cache = obj._cache = {} + except (AttributeError): + return + + PyDict_SetItem(cache, self.name, value) + +cdef class AxisProperty(object): + cdef: + Py_ssize_t axis + + def __init__(self, axis=0): + self.axis = axis + + def __get__(self, obj, type): + cdef list axes = obj._data.axes + return axes[self.axis] + + def __set__(self, obj, value): + obj._set_axis(self.axis, value) diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx new file mode 100644 index 00000000..a22e7e63 --- /dev/null +++ b/pandas/src/reduce.pyx @@ -0,0 +1,596 @@ +#cython=False +from numpy cimport * +import numpy as np + +from distutils.version import LooseVersion + +is_numpy_prior_1_6_2 = LooseVersion(np.__version__) < '1.6.2' + +cdef class Reducer: + ''' + Performs generic reduction operation on a C or Fortran-contiguous ndarray + while avoiding ndarray construction overhead + ''' + cdef: + Py_ssize_t increment, chunksize, nresults + object arr, dummy, f, labels, typ, index + + def __init__(self, object arr, object f, axis=1, dummy=None, + labels=None): + n, k = arr.shape + + if axis == 0: + if not arr.flags.f_contiguous: + arr = arr.copy('F') + + self.nresults = k + self.chunksize = n + self.increment = n * arr.dtype.itemsize + else: + if not arr.flags.c_contiguous: + arr = arr.copy('C') + + self.nresults = n + self.chunksize = k + self.increment = k * arr.dtype.itemsize + + + self.f = f + self.arr = arr + self.typ = None + self.labels = labels + self.dummy, index = self._check_dummy(dummy=dummy) + + self.labels = labels + self.index = index + + def _check_dummy(self, dummy=None): + cdef object index + + if dummy is None: + dummy = np.empty(self.chunksize, dtype=self.arr.dtype) + index = None + + # our ref is stolen later since we are creating this array + # in cython, so increment first + Py_INCREF(dummy) + else: + # we passed a series-like + if hasattr(dummy,'values'): + + self.typ = type(dummy) + index = getattr(dummy,'index',None) + dummy = dummy.values + + if dummy.dtype != self.arr.dtype: + raise ValueError('Dummy array must be same dtype') + if len(dummy) != self.chunksize: + raise ValueError('Dummy array must be length %d' % + self.chunksize) + + return dummy, index + + def get_result(self): + cdef: + char* dummy_buf + ndarray arr, result, chunk + Py_ssize_t i, incr + flatiter it + object res, name, labels, index + object cached_typ = None + + arr = self.arr + chunk = self.dummy + dummy_buf = chunk.data + chunk.data = arr.data + labels = self.labels + index = self.index + incr = self.increment + + try: + for i in range(self.nresults): + + if labels is not None: + name = util.get_value_at(labels, i) + else: + name = None + + # create the cached type + # each time just reassign the data + if i == 0: + + if self.typ is not None: + + # recreate with the index if supplied + if index is not None: + + cached_typ = self.typ(chunk, index=index, name=name) + + else: + + # use the passsed typ, sans index + cached_typ = self.typ(chunk, name=name) + + # use the cached_typ if possible + if cached_typ is not None: + object.__setattr__(cached_typ._data._block, 'values', chunk) + object.__setattr__(cached_typ, 'name', name) + res = self.f(cached_typ) + else: + res = 
self.f(chunk) + + if hasattr(res,'values'): + res = res.values + + if i == 0: + result = self._get_result_array(res) + it = PyArray_IterNew(result) + + PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) + chunk.data = chunk.data + self.increment + PyArray_ITER_NEXT(it) + except Exception, e: + if hasattr(e, 'args'): + e.args = e.args + (i,) + raise + finally: + # so we don't free the wrong memory + chunk.data = dummy_buf + + if result.dtype == np.object_: + result = maybe_convert_objects(result) + + return result + + def _get_result_array(self, object res): + try: + assert(not isinstance(res, np.ndarray)) + assert(not (isinstance(res, list) and len(res) == len(self.dummy))) + + result = np.empty(self.nresults, dtype='O') + result[0] = res + except Exception: + raise ValueError('function does not reduce') + return result + + +cdef class SeriesBinGrouper: + ''' + Performs grouping operation according to bin edges, rather than labels + ''' + cdef: + Py_ssize_t nresults, ngroups + bint passed_dummy + + cdef public: + object arr, index, dummy_arr, dummy_index, values, f, bins, typ, name + + def __init__(self, object series, object f, object bins, object dummy): + n = len(series) + + self.bins = bins + self.f = f + + values = series.values + if not values.flags.c_contiguous: + values = values.copy('C') + self.arr = values + self.index = series.index + self.typ = type(series) + self.name = getattr(series,'name',None) + + self.dummy_arr, self.dummy_index = self._check_dummy(dummy) + self.passed_dummy = dummy is not None + + # kludge for #1688 + if len(bins) > 0 and bins[-1] == len(series): + self.ngroups = len(bins) + else: + self.ngroups = len(bins) + 1 + + def _check_dummy(self, dummy=None): + if dummy is None: + values = np.empty(0, dtype=self.arr.dtype) + index = None + else: + values = dummy.values + if values.dtype != self.arr.dtype: + raise ValueError('Dummy array must be same dtype') + if not values.flags.contiguous: + values = values.copy() + index = dummy.index + + return values, index + + def get_result(self): + cdef: + ndarray arr, result + ndarray[int64_t] counts + Py_ssize_t i, n, group_size + object res + bint initialized = 0 + Slider vslider, islider + object gin, typ, name + object cached_typ = None + + counts = np.zeros(self.ngroups, dtype=np.int64) + + if self.ngroups > 0: + counts[0] = self.bins[0] + for i in range(1, self.ngroups): + if i == self.ngroups - 1: + counts[i] = len(self.arr) - self.bins[i-1] + else: + counts[i] = self.bins[i] - self.bins[i-1] + + group_size = 0 + n = len(self.arr) + name = self.name + + vslider = Slider(self.arr, self.dummy_arr) + islider = Slider(self.index, self.dummy_index) + + gin = self.dummy_index._engine + + try: + for i in range(self.ngroups): + group_size = counts[i] + + islider.set_length(group_size) + vslider.set_length(group_size) + + if cached_typ is None: + cached_typ = self.typ(vslider.buf, index=islider.buf, + name=name) + else: + object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ, '_index', islider.buf) + object.__setattr__(cached_typ, 'name', name) + + res = self.f(cached_typ) + res = _extract_result(res) + if not initialized: + result = self._get_result_array(res) + initialized = 1 + + util.assign_value_1d(result, i, res) + + islider.advance(group_size) + vslider.advance(group_size) + + gin.clear_mapping() + except: + raise + finally: + # so we don't free the wrong memory + islider.reset() + vslider.reset() + + if result.dtype == np.object_: + result = maybe_convert_objects(result) 
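+ # result now holds one reduced value per bin; counts holds each bin's size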
+ + return result, counts + + def _get_result_array(self, object res): + try: + assert(not isinstance(res, np.ndarray)) + assert(not (isinstance(res, list) and len(res) == len(self.dummy_arr))) + + result = np.empty(self.ngroups, dtype='O') + except Exception: + raise ValueError('function does not reduce') + return result + + +cdef class SeriesGrouper: + ''' + Performs generic grouping operation while avoiding ndarray construction + overhead + ''' + cdef: + Py_ssize_t nresults, ngroups + bint passed_dummy + + cdef public: + object arr, index, dummy_arr, dummy_index, f, labels, values, typ, name + + def __init__(self, object series, object f, object labels, + Py_ssize_t ngroups, object dummy): + n = len(series) + + self.labels = labels + self.f = f + + values = series.values + if not values.flags.c_contiguous: + values = values.copy('C') + self.arr = values + self.index = series.index + self.typ = type(series) + self.name = getattr(series,'name',None) + + self.dummy_arr, self.dummy_index = self._check_dummy(dummy) + self.passed_dummy = dummy is not None + self.ngroups = ngroups + + def _check_dummy(self, dummy=None): + if dummy is None: + values = np.empty(0, dtype=self.arr.dtype) + index = None + else: + values = dummy.values + if dummy.dtype != self.arr.dtype: + raise ValueError('Dummy array must be same dtype') + if not values.flags.contiguous: + values = values.copy() + index = dummy.index + + return values, index + + def get_result(self): + cdef: + ndarray arr, result + ndarray[int64_t] labels, counts + Py_ssize_t i, n, group_size, lab + object res + bint initialized = 0 + Slider vslider, islider + object gin, typ, name + object cached_typ = None + + labels = self.labels + counts = np.zeros(self.ngroups, dtype=np.int64) + group_size = 0 + n = len(self.arr) + name = self.name + + vslider = Slider(self.arr, self.dummy_arr) + islider = Slider(self.index, self.dummy_index) + + gin = self.dummy_index._engine + + try: + for i in range(n): + group_size += 1 + + lab = labels[i] + + if i == n - 1 or lab != labels[i + 1]: + if lab == -1: + islider.advance(group_size) + vslider.advance(group_size) + group_size = 0 + continue + + islider.set_length(group_size) + vslider.set_length(group_size) + + if cached_typ is None: + cached_typ = self.typ(vslider.buf, index=islider.buf, + name=name) + else: + object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ, '_index', islider.buf) + object.__setattr__(cached_typ, 'name', name) + + res = self.f(cached_typ) + res = _extract_result(res) + if not initialized: + result = self._get_result_array(res) + initialized = 1 + + util.assign_value_1d(result, lab, res) + counts[lab] = group_size + islider.advance(group_size) + vslider.advance(group_size) + + group_size = 0 + + gin.clear_mapping() + + except: + raise + finally: + # so we don't free the wrong memory + islider.reset() + vslider.reset() + + if result.dtype == np.object_: + result = maybe_convert_objects(result) + + return result, counts + + def _get_result_array(self, object res): + try: + assert(not isinstance(res, np.ndarray)) + assert(not (isinstance(res, list) and len(res) == len(self.dummy_arr))) + + result = np.empty(self.ngroups, dtype='O') + except Exception: + raise ValueError('function does not reduce') + return result + +cdef inline _extract_result(object res): + ''' extract the result object, it might be a 0-dim ndarray + or a len-1 0-dim, or a scalar ''' + if hasattr(res,'values'): + res = res.values + if not np.isscalar(res): + if isinstance(res, 
np.ndarray): + if res.ndim == 0: + res = res.item() + elif res.ndim == 1 and len(res) == 1: + res = res[0] + return res + +cdef class Slider: + ''' + Only handles contiguous data for now + ''' + cdef: + ndarray values, buf + Py_ssize_t stride, orig_len, orig_stride + char *orig_data + + def __init__(self, object values, object buf): + assert(values.ndim == 1) + if not values.flags.contiguous: + values = values.copy() + + assert(values.dtype == buf.dtype) + self.values = values + self.buf = buf + self.stride = values.strides[0] + + self.orig_data = self.buf.data + self.orig_len = self.buf.shape[0] + self.orig_stride = self.buf.strides[0] + + self.buf.data = self.values.data + self.buf.strides[0] = self.stride + + cpdef advance(self, Py_ssize_t k): + self.buf.data = self.buf.data + self.stride * k + + cdef move(self, int start, int end): + ''' + For slicing + ''' + self.buf.data = self.values.data + self.stride * start + self.buf.shape[0] = end - start + + cpdef set_length(self, Py_ssize_t length): + self.buf.shape[0] = length + + cpdef reset(self): + self.buf.shape[0] = self.orig_len + self.buf.data = self.orig_data + self.buf.strides[0] = self.orig_stride + + +class InvalidApply(Exception): + pass + +def apply_frame_axis0(object frame, object f, object names, + ndarray[int64_t] starts, ndarray[int64_t] ends): + cdef: + BlockSlider slider + Py_ssize_t i, n = len(starts) + list results + object piece + dict item_cache + + if frame.index._has_complex_internals: + raise InvalidApply('Cannot modify frame index internals') + + + results = [] + + # Need to infer if our low-level mucking is going to cause a segfault + if n > 0: + chunk = frame[starts[0]:ends[0]] + shape_before = chunk.shape + try: + result = f(chunk) + if result is chunk: + raise InvalidApply('Function unsafe for fast apply') + except: + raise InvalidApply('Let this error raise above us') + + slider = BlockSlider(frame) + + mutated = False + item_cache = slider.dummy._item_cache + gin = slider.dummy.index._engine # f7u12 + try: + for i in range(n): + slider.move(starts[i], ends[i]) + + item_cache.clear() # ugh + gin.clear_mapping() + + object.__setattr__(slider.dummy, 'name', names[i]) + piece = f(slider.dummy) + + # I'm paying the price for index-sharing, ugh + try: + if piece.index is slider.dummy.index: + piece = piece.copy() + else: + mutated = True + except AttributeError: + pass + results.append(piece) + finally: + slider.reset() + + return results, mutated + +cdef class BlockSlider: + ''' + Only capable of sliding on axis=0 + ''' + + cdef public: + object frame, dummy + int nblocks + Slider idx_slider + list blocks + + cdef: + char **base_ptrs + + def __init__(self, frame): + self.frame = frame + self.dummy = frame[:0] + + self.blocks = [b.values for b in self.dummy._data.blocks] + + for x in self.blocks: + util.set_array_not_contiguous(x) + + self.nblocks = len(self.blocks) + self.idx_slider = Slider(self.frame.index, self.dummy.index) + + self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) + for i, block in enumerate(self.blocks): + self.base_ptrs[i] = ( block).data + + def __dealloc__(self): + free(self.base_ptrs) + + cpdef move(self, int start, int end): + cdef: + ndarray arr + + # move blocks + for i in range(self.nblocks): + arr = self.blocks[i] + + # axis=1 is the frame's axis=0 + arr.data = self.base_ptrs[i] + arr.strides[1] * start + arr.shape[1] = end - start + + self.idx_slider.move(start, end) + + cdef reset(self): + cdef: + ndarray arr + + # move blocks + for i in range(self.nblocks): + arr = 
self.blocks[i] + + # axis=1 is the frame's axis=0 + arr.data = self.base_ptrs[i] + arr.shape[1] = 0 + + self.idx_slider.reset() + + +def reduce(arr, f, axis=0, dummy=None, labels=None): + if labels._has_complex_internals: + raise Exception('Cannot use shortcut') + + reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels) + return reducer.get_result() diff --git a/pandas/src/skiplist.h b/pandas/src/skiplist.h new file mode 100644 index 00000000..57b32005 --- /dev/null +++ b/pandas/src/skiplist.h @@ -0,0 +1,281 @@ + +/* + Flexibly-sized, indexable skiplist data structure for maintaining a sorted + list of values + + Port of Wes McKinney's Cython version of Raymond Hettinger's original pure + Python recipe (http://rhettinger.wordpress.com/2010/02/06/lost-knowledge/) + */ + +// #include +// #include + + +#include +#include +#include +#include + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +PANDAS_INLINE static float __skiplist_nanf(void) +{ + const union { int __i; float __f;} __bint = {0x7fc00000UL}; + return __bint.__f; +} +#define PANDAS_NAN ((double) __skiplist_nanf()) + + +static PANDAS_INLINE double Log2(double val) { + return log(val) / log(2.); +} + +typedef struct node_t node_t; + +struct node_t { + double value; + int is_nil; + int levels; + node_t **next; + int *width; + int ref_count; +}; + +typedef struct { + node_t *head; + int size, maxlevels; + node_t **tmp_chain; + int *tmp_steps; +} skiplist_t; + +static PANDAS_INLINE double urand(void) { + return rand() / ((double) RAND_MAX + 1); +} + +static PANDAS_INLINE int int_min(int a, int b) { + return a < b ? 
a : b; +} + +static PANDAS_INLINE node_t *node_init(double value, int levels) { + node_t *result; + result = (node_t*) calloc(1, sizeof(node_t)); + + result->value = value; + result->levels = levels; + result->is_nil = 0; + result->ref_count = 0; + + result->next = (node_t**) malloc(levels * sizeof(node_t*)); + result->width = (int*) malloc(levels * sizeof(int)); + + return result; +} + +// do this ourselves + +static PANDAS_INLINE void node_incref(node_t *node) { + node->ref_count += 1; +} + +static PANDAS_INLINE void node_decref(node_t *node) { + node->ref_count -= 1; +} + +static void node_destroy(node_t *node) { + int i; + if (node) { + if (node->ref_count == 1) { + for (i = 0; i < node->levels; ++i) { + node_destroy(node->next[i]); + } + free(node->next); + free(node->width); + // printf("Reference count was 1, freeing\n"); + free(node); + } + else { + node_decref(node); + } + // pretty sure that freeing the struct above will be enough + } +} + +static PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) { + skiplist_t *result; + node_t *NIL, *head; + int maxlevels, i; + + maxlevels = Log2((double) expected_size); + result = (skiplist_t*) calloc(1, sizeof(skiplist_t)); + result->tmp_chain = (node_t**) malloc(maxlevels * sizeof(node_t*)); + result->tmp_steps = (int*) malloc(maxlevels * sizeof(int)); + result->maxlevels = maxlevels; + + head = result->head = node_init(PANDAS_NAN, maxlevels); + node_incref(head); + + NIL = node_init(0, 0); + NIL->is_nil = 1; + + for (i = 0; i < maxlevels; ++i) + { + head->next[i] = NIL; + head->width[i] = 1; + node_incref(NIL); + } + + return result; +} + +static PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) { + if (skp) { + node_destroy(skp->head); + free(skp->tmp_steps); + free(skp->tmp_chain); + free(skp); + } +} + + +// 1 if left < right, 0 if left == right, -1 if left > right + +static PANDAS_INLINE int _node_cmp(node_t* node, double value){ + if (node->is_nil || node->value > value) { + return -1; + } + else if (node->value < value) { + return 1; + } + else { + return 0; + } +} + +static PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { + node_t *node; + int level; + + if (i < 0 || i >= skp->size) { + *ret = 0; + return 0; + } + + node = skp->head; + i++; + for (level = skp->maxlevels - 1; level >= 0; --level) + { + while (node->width[level] <= i) + { + i = i - node->width[level]; + node = node->next[level]; + } + } + + *ret = 1; + return node->value; +} + +static PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { + node_t *node, *prevnode, *newnode, *next_at_level; + int *steps_at_level; + int size, steps, level; + node_t **chain; + + chain = skp->tmp_chain; + + steps_at_level = skp->tmp_steps; + memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); + + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) + { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) >= 0) { + steps_at_level[level] += node->width[level]; + node = next_at_level; + next_at_level = node->next[level]; + } + chain[level] = node; + } + + size = int_min(skp->maxlevels, 1 - ((int) Log2(urand()))); + + newnode = node_init(value, size); + steps = 0; + + for (level = 0; level < size; ++level) { + prevnode = chain[level]; + newnode->next[level] = prevnode->next[level]; + + prevnode->next[level] = newnode; + node_incref(newnode); // increment the reference count + + newnode->width[level] = prevnode->width[level] - steps; + prevnode->width[level] = steps + 1; + + steps += 
steps_at_level[level]; + } + + for (level = size; level < skp->maxlevels; ++level) { + chain[level]->width[level] += 1; + } + + skp->size++; + + return 1; +} + +static PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { + int level, size; + node_t *node, *prevnode, *tmpnode, *next_at_level; + node_t **chain; + + chain = skp->tmp_chain; + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) + { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) > 0) { + node = next_at_level; + next_at_level = node->next[level]; + } + chain[level] = node; + } + + if (value != chain[0]->next[0]->value) { + return 0; + } + + size = chain[0]->next[0]->levels; + + for (level = 0; level < size; ++level) { + prevnode = chain[level]; + + tmpnode = prevnode->next[level]; + + prevnode->width[level] += tmpnode->width[level] - 1; + prevnode->next[level] = tmpnode->next[level]; + + tmpnode->next[level] = NULL; + node_destroy(tmpnode); // decrement refcount or free + } + + for (level = size; level < skp->maxlevels; ++level) { + chain[level]->width[level] -= 1; + } + + skp->size--; + return 1; +} diff --git a/pandas/src/skiplist.pxd b/pandas/src/skiplist.pxd new file mode 100644 index 00000000..c1221c47 --- /dev/null +++ b/pandas/src/skiplist.pxd @@ -0,0 +1,21 @@ +cdef extern from "skiplist.h": + ctypedef struct node_t: + double value + int is_nil + int levels + node_t **next + int *width + int ref_count + + ctypedef struct skiplist_t: + node_t *head + int size, maxlevels + node_t **tmp_chain + int *tmp_steps + + inline skiplist_t* skiplist_init(int) + inline void skiplist_destroy(skiplist_t*) + inline double skiplist_get(skiplist_t*, int, int*) + inline int skiplist_insert(skiplist_t*, double) + inline int skiplist_remove(skiplist_t*, double) + diff --git a/pandas/src/skiplist.pyx b/pandas/src/skiplist.pyx new file mode 100644 index 00000000..4e00fd27 --- /dev/null +++ b/pandas/src/skiplist.pyx @@ -0,0 +1,153 @@ +# Cython version of IndexableSkiplist, for implementing moving median +# with O(log n) updates +# Original author: Raymond Hettinger +# Original license: MIT +# Link: http://code.activestate.com/recipes/576930/ + +# Cython version: Wes McKinney + +cdef extern from "numpy/arrayobject.h": + + void import_array() + +cdef extern from "math.h": + double log(double x) + +# MSVC does not have log2! + +cdef double Log2(double x): + return log(x) / log(2.) + +cimport numpy as np +from numpy cimport * +import numpy as np + +from random import random + +# initialize numpy +import_array() + +# TODO: optimize this, make less messy + +cdef class Node: + cdef public: + double_t value + list next + list width + + def __init__(self, double_t value, list next, list width): + self.value = value + self.next = next + self.width = width + +# Singleton terminator node +NIL = Node(np.inf, [], []) + +cdef class IndexableSkiplist: + ''' + Sorted collection supporting O(lg n) insertion, removal, and + lookup by rank. 
+ ''' + cdef: + Py_ssize_t size, maxlevels + Node head + + def __init__(self, expected_size=100): + self.size = 0 + self.maxlevels = int(1 + Log2(expected_size)) + self.head = Node(np.NaN, [NIL] * self.maxlevels, [1] * self.maxlevels) + + def __len__(self): + return self.size + + def __getitem__(self, i): + return self.get(i) + + cpdef get(self, Py_ssize_t i): + cdef Py_ssize_t level + cdef Node node + + node = self.head + i += 1 + + for level in range(self.maxlevels - 1, -1, -1): + while node.width[level] <= i: + i -= node.width[level] + node = node.next[level] + + + return node.value + + cpdef insert(self, double value): + cdef Py_ssize_t level, steps, d + cdef Node node, prevnode, newnode, next_at_level, tmp + cdef list chain, steps_at_level + + # find first node on each level where node.next[levels].value > value + chain = [None] * self.maxlevels + steps_at_level = [0] * self.maxlevels + node = self.head + + for level in range(self.maxlevels - 1, -1, -1): + next_at_level = node.next[level] + + while next_at_level.value <= value: + steps_at_level[level] = (steps_at_level[level] + + node.width[level]) + node = next_at_level + next_at_level = node.next[level] + + chain[level] = node + + # insert a link to the newnode at each level + d = min(self.maxlevels, 1 - int(Log2(random()))) + newnode = Node(value, [None] * d, [None] * d) + steps = 0 + + for level in range(d): + prevnode = chain[level] + newnode.next[level] = prevnode.next[level] + prevnode.next[level] = newnode + newnode.width[level] = (prevnode.width[level] - steps) + prevnode.width[level] = steps + 1 + steps += steps_at_level[level] + + for level in range(d, self.maxlevels): + ( chain[level]).width[level] += 1 + + self.size += 1 + + cpdef remove(self, double value): + cdef Py_ssize_t level, d + cdef Node node, prevnode, tmpnode, next_at_level + cdef list chain + + # find first node on each level where node.next[levels].value >= value + chain = [None] * self.maxlevels + node = self.head + + for level in range(self.maxlevels - 1, -1, -1): + next_at_level = node.next[level] + while next_at_level.value < value: + node = next_at_level + next_at_level = node.next[level] + + chain[level] = node + + if value != ( ( ( chain[0]).next)[0]).value: + raise KeyError('Not Found') + + # remove one link at each level + d = len(( ( ( chain[0]).next)[0]).next) + + for level in range(d): + prevnode = chain[level] + tmpnode = prevnode.next[level] + prevnode.width[level] += tmpnode.width[level] - 1 + prevnode.next[level] = tmpnode.next[level] + + for level in range(d, self.maxlevels): + tmpnode = chain[level] + tmpnode.width[level] -= 1 + + self.size -= 1 diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx new file mode 100644 index 00000000..579d473c --- /dev/null +++ b/pandas/src/sparse.pyx @@ -0,0 +1,1190 @@ +from numpy cimport ndarray, int32_t, float64_t +cimport numpy as np + +cimport cython + +import numpy as np +import operator +import sys + +np.import_array() +np.import_ufunc() + +#------------------------------------------------------------------------------- +# Preamble stuff + +cdef float64_t NaN = np.NaN +cdef float64_t INF = np.inf + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +#------------------------------------------------------------------------------- + + +cdef class SparseIndex: + ''' + Abstract superclass for sparse index types + ''' + def __init__(self): + raise NotImplementedError + + +cdef class IntIndex(SparseIndex): + ''' + Object 
for holding exact integer sparse indexing information + + Parameters + ---------- + length : integer + indices : array-like + Contains integers corresponding to + ''' + cdef readonly: + Py_ssize_t length, npoints + ndarray indices + + def __init__(self, Py_ssize_t length, indices): + self.length = length + self.indices = np.ascontiguousarray(indices, dtype=np.int32) + self.npoints = len(self.indices) + + def __reduce__(self): + args = (self.length, self.indices) + return (IntIndex, args) + + def __repr__(self): + output = 'IntIndex\n' + output += 'Indices: %s\n' % repr(self.indices) + return output + + def check_integrity(self): + ''' + Only needs to be strictly ascending and nothing less than 0 or greater than + total length + ''' + pass + + def equals(self, other): + if not isinstance(other, IntIndex): + return False + + if self is other: + return True + + same_length = self.length == other.length + same_indices = np.array_equal(self.indices, other.indices) + return same_length and same_indices + + @property + def ngaps(self): + return self.length - self.npoints + + def to_int_index(self): + return self + + def to_block_index(self): + locs, lens = get_blocks(self.indices) + return BlockIndex(self.length, locs, lens) + + cpdef IntIndex intersect(self, SparseIndex y_): + cdef: + Py_ssize_t out_length, xi, yi = 0 + int32_t xind + ndarray[int32_t, ndim=1] xindices, yindices + list new_list = [] + IntIndex y + + # if is one already, returns self + y = y_.to_int_index() + + if self.length != y.length: + raise Exception('Indices must reference same underlying length') + + xindices = self.indices + yindices = y.indices + + for xi from 0 <= xi < self.npoints: + xind = xindices[xi] + + while yi < y.npoints and yindices[yi] < xind: + yi += 1 + + if yi >= y.npoints: + break + + # TODO: would a two-pass algorithm be faster?
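+ # index present in both x and y; keep it in the intersection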
+ if yindices[yi] == xind: + new_list.append(xind) + + return IntIndex(self.length, new_list) + + cpdef IntIndex make_union(self, SparseIndex y_): + cdef: + Py_ssize_t out_length, i, xi, yi + int32_t xind + ndarray[int32_t, ndim=1] xindices, yindices + list new_list = [] + IntIndex x, y + + x = self + + # if is one already, returns self + y = y_.to_int_index() + + if self.length != y.length: + raise Exception('Indices must reference same underlying length') + + xindices = self.indices + yindices = y.indices + + xi = yi = 0 + while True: + if xi == x.npoints: + while yi < y.npoints: + new_list.append(yindices[yi]) + yi += 1 + break + elif yi == y.npoints: + while xi < x.npoints: + new_list.append(xindices[xi]) + xi += 1 + break + + xind = xindices[xi] + yind = yindices[yi] + + if xind == yind: + new_list.append(xind) + xi += 1 + yi += 1 + elif xind < yind: + new_list.append(xind) + xi += 1 + else: + new_list.append(yind) + yi += 1 + + return IntIndex(x.length, new_list) + + @cython.wraparound(False) + cpdef lookup(self, Py_ssize_t index): + cdef: + Py_ssize_t res, n, cum_len = 0 + ndarray[int32_t, ndim=1] inds + + inds = self.indices + res = inds.searchsorted(index) + if res == self.npoints: + return -1 + elif inds[res] == index: + return res + else: + return -1 + + cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values, + float64_t fill_value, SparseIndex other_): + cdef: + Py_ssize_t i = 0, j = 0 + IntIndex other + ndarray[float64_t, ndim=1] result + ndarray[int32_t, ndim=1] sinds, oinds + + other = other_.to_int_index() + + oinds = other.indices + sinds = self.indices + + result = np.empty(other.npoints, dtype=np.float64) + result.fill(fill_value) + + for 0 <= i < other.npoints: + while oinds[i] > sinds[j] and j < self.npoints: + j += 1 + + if j == self.npoints: + break + + if oinds[i] < sinds[j]: + continue + elif oinds[i] == sinds[j]: + result[i] = values[j] + j += 1 + + return result + + cpdef put(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices, object to_put): + pass + + cpdef take(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices): + pass + +cpdef get_blocks(ndarray[int32_t, ndim=1] indices): + cdef: + Py_ssize_t i, npoints + int32_t block, length = 1, cur, prev + list locs = [], lens = [] + + npoints = len(indices) + + # just handle the special empty case separately + if npoints == 0: + return [], [] + + # TODO: two-pass algorithm faster? 
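+ # single pass: grow the current run while indices stay consecutive and emit a (location, length) block at each gap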
+ prev = block = indices[0] + for i from 1 <= i < npoints: + cur = indices[i] + if cur - prev > 1: + # new block + locs.append(block) + lens.append(length) + block = cur + length = 1 + else: + # same block, increment length + length += 1 + + prev = cur + + locs.append(block) + lens.append(length) + return locs, lens + +#------------------------------------------------------------------------------- +# BlockIndex + +cdef class BlockIndex(SparseIndex): + ''' + Object for holding block-based sparse indexing information + + Parameters + ---------- + ''' + cdef readonly: + Py_ssize_t nblocks, npoints, length + ndarray blocs, blengths + + cdef: + object __weakref__ # need to be picklable + int32_t* locbuf, *lenbuf + + def __init__(self, length, blocs, blengths): + + self.blocs = np.ascontiguousarray(blocs, dtype=np.int32) + self.blengths = np.ascontiguousarray(blengths, dtype=np.int32) + + # in case we need + self.locbuf = self.blocs.data + self.lenbuf = self.blengths.data + + self.length = length + self.nblocks = len(self.blocs) + self.npoints = self.blengths.sum() + + # self.block_start = blocs + # self.block_end = blocs + blengths + + self.check_integrity() + + def __reduce__(self): + args = (self.length, self.blocs, self.blengths) + return (BlockIndex, args) + + def __repr__(self): + output = 'BlockIndex\n' + output += 'Block locations: %s\n' % repr(self.blocs) + output += 'Block lengths: %s' % repr(self.blengths) + + return output + + @property + def ngaps(self): + return self.length - self.npoints + + cpdef check_integrity(self): + ''' + Check: + - Locations are in ascending order + - No overlapping blocks + - Blocks to not start after end of index, nor extend beyond end + ''' + cdef: + Py_ssize_t i + ndarray[int32_t, ndim=1] blocs, blengths + + blocs = self.blocs + blengths = self.blengths + + if len(blocs) != len(blengths): + raise ValueError('block bound arrays must be same length') + + for i from 0 <= i < self.nblocks: + if i > 0: + if blocs[i] <= blocs[i-1]: + raise ValueError('Locations not in ascending order') + + if i < self.nblocks - 1: + if blocs[i] + blengths[i] > blocs[i + 1]: + raise ValueError('Block %d overlaps' % i) + else: + if blocs[i] + blengths[i] > self.length: + raise ValueError('Block %d extends beyond end' % i) + + # no zero-length blocks + if blengths[i] == 0: + raise ValueError('Zero-length block %d' % i) + + def equals(self, other): + if not isinstance(other, BlockIndex): + return False + + if self is other: + return True + + same_length = self.length == other.length + same_blocks = (np.array_equal(self.blocs, other.blocs) and + np.array_equal(self.blengths, other.blengths)) + return same_length and same_blocks + + def to_block_index(self): + return self + + def to_int_index(self): + cdef: + Py_ssize_t i = 0, j, b + int32_t offset + ndarray[int32_t, ndim=1] indices + + indices = np.empty(self.npoints, dtype=np.int32) + + for b from 0 <= b < self.nblocks: + offset = self.locbuf[b] + + for j from 0 <= j < self.lenbuf[b]: + indices[i] = offset + j + i += 1 + + return IntIndex(self.length, indices) + + cpdef BlockIndex intersect(self, SparseIndex other): + ''' + Intersect two BlockIndex objects + + Parameters + ---------- + + Returns + ------- + intersection : BlockIndex + ''' + cdef: + BlockIndex y + ndarray[int32_t, ndim=1] xloc, xlen, yloc, ylen + + list out_blocs = [] + list out_blengths = [] + + Py_ssize_t xi = 0, yi = 0 + int32_t cur_loc, cur_length, diff + + y = other.to_block_index() + + if self.length != y.length: + raise Exception('Indices must reference 
same underlying length') + + xloc = self.blocs + xlen = self.blengths + yloc = y.blocs + ylen = y.blengths + + while True: + # we are done (or possibly never began) + if xi >= self.nblocks or yi >= y.nblocks: + break + + # completely symmetric...would like to avoid code dup but oh well + if xloc[xi] >= yloc[yi]: + cur_loc = xloc[xi] + diff = xloc[xi] - yloc[yi] + + if ylen[yi] <= diff: + # have to skip this block + yi += 1 + continue + + if ylen[yi] - diff < xlen[xi]: + # take end of y block, move onward + cur_length = ylen[yi] - diff + yi += 1 + else: + # take end of x block + cur_length = xlen[xi] + xi += 1 + + else: # xloc[xi] < yloc[yi] + cur_loc = yloc[yi] + diff = yloc[yi] - xloc[xi] + + if xlen[xi] <= diff: + # have to skip this block + xi += 1 + continue + + if xlen[xi] - diff < ylen[yi]: + # take end of x block, move onward + cur_length = xlen[xi] - diff + xi += 1 + else: + # take end of y block + cur_length = ylen[yi] + yi += 1 + + out_blocs.append(cur_loc) + out_blengths.append(cur_length) + + return BlockIndex(self.length, out_blocs, out_blengths) + + cpdef BlockIndex make_union(self, SparseIndex y): + ''' + Combine together two BlockIndex objects, accepting indices if contained + in one or the other + + Parameters + ---------- + other : SparseIndex + + Notes + ----- + union is a protected keyword in Cython, hence make_union + + Returns + ------- + union : BlockIndex + ''' + return BlockUnion(self, y.to_block_index()).result + + cpdef lookup(self, Py_ssize_t index): + ''' + + Returns -1 if not found + ''' + cdef: + Py_ssize_t i, cum_len + ndarray[int32_t, ndim=1] locs, lens + + locs = self.blocs + lens = self.blengths + + if self.nblocks == 0: + return -1 + elif index < locs[0]: + return -1 + + cum_len = 0 + for i from 0 <= i < self.nblocks: + if index >= locs[i] and index < locs[i] + lens[i]: + return cum_len + index - locs[i] + cum_len += lens[i] + + return -1 + + cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values, + float64_t fill_value, SparseIndex other_): + cdef: + Py_ssize_t i = 0, j = 0, ocur, ocurlen + BlockIndex other + ndarray[float64_t, ndim=1] result + ndarray[int32_t, ndim=1] slocs, slens, olocs, olens + + other = other_.to_block_index() + + olocs = other.blocs + olens = other.blengths + slocs = self.blocs + slens = self.blengths + + result = np.empty(other.npoints, dtype=np.float64) + + for 0 <= i < other.nblocks: + ocur = olocs[i] + ocurlen = olens[i] + + while slocs[j] + slens[j] < ocur: + j += 1 + + cpdef put(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices, object to_put): + pass + + cpdef take(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices): + pass + + +cdef class BlockMerge(object): + ''' + Object-oriented approach makes sharing state between recursive functions a + lot easier and reduces code duplication + ''' + cdef: + BlockIndex x, y, result + ndarray xstart, xlen, xend, ystart, ylen, yend + int32_t xi, yi # block indices + + def __init__(self, BlockIndex x, BlockIndex y): + self.x = x + self.y = y + + if x.length != y.length: + raise Exception('Indices must reference same underlying length') + + self.xstart = self.x.blocs + self.ystart = self.y.blocs + + self.xend = self.x.blocs + self.x.blengths + self.yend = self.y.blocs + self.y.blengths + + # self.xlen = self.x.blengths + # self.ylen = self.y.blengths + + self.xi = 0 + self.yi = 0 + + self.result = self._make_merged_blocks() + + cdef _make_merged_blocks(self): + raise NotImplementedError + + cdef _set_current_indices(self, int32_t 
xi, int32_t yi, bint mode): + if mode == 0: + self.xi = xi + self.yi = yi + else: + self.xi = yi + self.yi = xi + +cdef class BlockIntersection(BlockMerge): + ''' + not done yet + ''' + pass + +cdef class BlockUnion(BlockMerge): + ''' + Object-oriented approach makes sharing state between recursive functions a + lot easier and reduces code duplication + ''' + + cdef _make_merged_blocks(self): + cdef: + ndarray[int32_t, ndim=1] xstart, xend, ystart, yend + int32_t nstart, nend, diff + list out_blocs = [], out_blengths = [] + + xstart = self.xstart + xend = self.xend + ystart = self.ystart + yend = self.yend + + while True: + # we are done (or possibly never began) + if self.xi >= self.x.nblocks and self.yi >= self.y.nblocks: + break + elif self.yi >= self.y.nblocks: + # through with y, just pass through x blocks + nstart = xstart[self.xi] + nend = xend[self.xi] + self.xi += 1 + elif self.xi >= self.x.nblocks: + # through with x, just pass through y blocks + nstart = ystart[self.yi] + nend = yend[self.yi] + self.yi += 1 + else: + # find end of new block + if xstart[self.xi] < ystart[self.yi]: + nstart = xstart[self.xi] + nend = self._find_next_block_end(0) + else: + nstart = ystart[self.yi] + nend = self._find_next_block_end(1) + + out_blocs.append(nstart) + out_blengths.append(nend - nstart) + + return BlockIndex(self.x.length, out_blocs, out_blengths) + + cdef int32_t _find_next_block_end(self, bint mode) except -1: + ''' + Wow, this got complicated in a hurry + + mode 0: block started in index x + mode 1: block started in index y + ''' + cdef: + ndarray[int32_t, ndim=1] xstart, xend, ystart, yend + int32_t xi, yi, xnblocks, ynblocks, nend + + if mode != 0 and mode != 1: + raise Exception('Mode must be 0 or 1') + + # so symmetric code will work + if mode == 0: + xstart = self.xstart + xend = self.xend + xi = self.xi + + ystart = self.ystart + yend = self.yend + yi = self.yi + ynblocks = self.y.nblocks + else: + xstart = self.ystart + xend = self.yend + xi = self.yi + + ystart = self.xstart + yend = self.xend + yi = self.xi + ynblocks = self.x.nblocks + + nend = xend[xi] + + # print 'here xi=%d, yi=%d, mode=%d, nend=%d' % (self.xi, self.yi, + # mode, nend) + + # done with y? 
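+        # nend is where the current x block stops; either y is exhausted, the
+        # x block ends before the next y block, or the blocks overlap and the
+        # search continues from the other index with the roles swapped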
+ if yi == ynblocks: + self._set_current_indices(xi + 1, yi, mode) + return nend + elif nend < ystart[yi]: + # block ends before y block + self._set_current_indices(xi + 1, yi, mode) + return nend + else: + while yi < ynblocks and nend > yend[yi]: + yi += 1 + + self._set_current_indices(xi + 1, yi, mode) + + if yi == ynblocks: + return nend + + if nend < ystart[yi]: + # we're done, return the block end + return nend + else: + # merge blocks, continue searching + # this also catches the case where blocks + return self._find_next_block_end(1 - mode) + + +#------------------------------------------------------------------------------- +# Sparse arithmetic + +ctypedef float64_t (* double_func)(float64_t a, float64_t b) + +cdef inline tuple sparse_nancombine(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex, + double_func op): + # faster to convert to IntIndex + return int_nanop(x, xindex.to_int_index(), + y, yindex.to_int_index(), op) + + # if isinstance(xindex, BlockIndex): + # return block_nanop(x, xindex.to_block_index(), + # y, yindex.to_block_index(), op) + # elif isinstance(xindex, IntIndex): + # return int_nanop(x, xindex.to_int_index(), + # y, yindex.to_int_index(), op) + + +cdef inline tuple sparse_combine(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill, + double_func op): + if isinstance(xindex, BlockIndex): + return block_op(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill, op) + elif isinstance(xindex, IntIndex): + return int_op(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill, op) + +# NaN-based arithmetic operation-- no handling of fill values +# TODO: faster to convert everything to dense? + +@cython.boundscheck(False) +cdef inline tuple block_nanop(ndarray x_, BlockIndex xindex, + ndarray y_, BlockIndex yindex, + double_func op): + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0, obp = 0 # block positions + Py_ssize_t xblock = 0, yblock = 0, outblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + out_index = xindex.intersect(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # walk the two SparseVectors, adding matched locations... + for out_i from 0 <= out_i < out_index.npoints: + + # I have a feeling this is inefficient + + # walk x + while xindex.locbuf[xblock] + xbp < out_index.locbuf[outblock] + obp: + xbp += 1 + xi += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + # walk y + while yindex.locbuf[yblock] + ybp < out_index.locbuf[outblock] + obp: + ybp += 1 + yi += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + out[out_i] = op(x[xi], y[yi]) + + # advance. 
strikes me as too complicated + xi += 1 + yi += 1 + + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + obp += 1 + if obp == out_index.lenbuf[outblock]: + outblock += 1 + obp = 0 + + return out, out_index + +@cython.boundscheck(False) +cdef inline tuple int_nanop(ndarray x_, IntIndex xindex, + ndarray y_, IntIndex yindex, + double_func op): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.intersect(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... + for out_i from 0 <= out_i < out_index.npoints: + + # walk x + while xindices[xi] < out_indices[out_i]: + xi += 1 + + # walk y + while yindices[yi] < out_indices[out_i]: + yi += 1 + + out[out_i] = op(x[xi], y[yi]) + + # advance + xi += 1 + yi += 1 + + return out, out_index + + +@cython.boundscheck(False) +cdef inline tuple block_op(ndarray x_, BlockIndex xindex, float64_t xfill, + ndarray y_, BlockIndex yindex, float64_t yfill, + double_func op): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
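+    # out_index is the union of the two block indices, so every output slot is
+    # filled: matched positions combine x and y values, positions present on
+    # only one side pair that value with the other side's fill value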
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = op(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index + + +@cython.boundscheck(False) +cdef inline tuple int_op(ndarray x_, IntIndex xindex, float64_t xfill, + ndarray y_, IntIndex yindex, float64_t yfill, + double_func op): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
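+    # same approach as block_op above, but walking flat integer index arrays
+    # instead of block locations and lengths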
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = op(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + + return out, out_index + +cdef inline float64_t __add(float64_t a, float64_t b): + return a + b + +cdef inline float64_t __sub(float64_t a, float64_t b): + return a - b + +cdef inline float64_t __rsub(float64_t a, float64_t b): + return b - a + +cdef inline float64_t __div(float64_t a, float64_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return a / b + +cdef inline float64_t __rdiv(float64_t a, float64_t b): + return __div(b, a) + +cdef inline float64_t __floordiv(float64_t a, float64_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return a // b + +cdef inline float64_t __rfloordiv(float64_t a, float64_t b): + return __floordiv(b, a) + +cdef inline float64_t __mul(float64_t a, float64_t b): + return a * b +cdef inline float64_t __eq(float64_t a, float64_t b): + return a == b +cdef inline float64_t __ne(float64_t a, float64_t b): + return a != b +cdef inline float64_t __lt(float64_t a, float64_t b): + return a < b +cdef inline float64_t __gt(float64_t a, float64_t b): + return a > b + +cdef inline float64_t __pow(float64_t a, float64_t b): + # NaN + if a != a or b != b: + return NaN + return a ** b + +cdef inline float64_t __rpow(float64_t a, float64_t b): + return __pow(b, a) + + +# This probably needs to be "templated" to achieve maximum performance. 
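+# Illustrative sketch: the nan-based wrappers below align values on the
+# *intersection* of the two sparse indices, while the fill-value wrappers
+# align on the *union*, substituting xfill/yfill where one side has no data.
+# The two hypothetical, dict-based helpers below model that behaviour in pure
+# Python and are not used by the module:
+
+def _nancombine_dense(x_vals, x_locs, y_vals, y_locs, op):
+    # result is defined only where both inputs have data (index intersection)
+    xmap, ymap = dict(zip(x_locs, x_vals)), dict(zip(y_locs, y_vals))
+    out_locs = sorted(set(x_locs) & set(y_locs))
+    return [op(xmap[i], ymap[i]) for i in out_locs], out_locs
+
+def _combine_dense(x_vals, x_locs, xfill, y_vals, y_locs, yfill, op):
+    # result covers the index union; the absent side contributes its fill value
+    xmap, ymap = dict(zip(x_locs, x_vals)), dict(zip(y_locs, y_vals))
+    out_locs = sorted(set(x_locs) | set(y_locs))
+    return ([op(xmap.get(i, xfill), ymap.get(i, yfill)) for i in out_locs],
+            out_locs)
+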
+# TODO: quantify performance boost to "templating" + +cpdef sparse_nanadd(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __add) + +cpdef sparse_nansub(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __sub) + +cpdef sparse_nanrsub(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rsub) + +cpdef sparse_nanmul(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __mul) + +cpdef sparse_nandiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __div) + +cpdef sparse_nanrdiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rdiv) + +sparse_nantruediv = sparse_nandiv +sparse_nanrtruediv = sparse_nanrdiv + +cpdef sparse_nanfloordiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __floordiv) + +cpdef sparse_nanrfloordiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rfloordiv) + +cpdef sparse_nanpow(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __pow) + +cpdef sparse_nanrpow(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rpow) + +cpdef sparse_add(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __add) + +cpdef sparse_sub(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __sub) + +cpdef sparse_rsub(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __rsub) + +cpdef sparse_mul(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __mul) + +cpdef sparse_div(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __div) + +cpdef sparse_rdiv(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __rdiv) + +sparse_truediv = sparse_div +sparse_rtruediv = sparse_rdiv + +cpdef sparse_floordiv(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __floordiv) + +cpdef sparse_rfloordiv(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __rfloordiv) + +cpdef sparse_pow(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __pow) + +cpdef sparse_rpow(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, 
yfill, __rpow) + + +#------------------------------------------------------------------------------- +# Indexing operations + +def get_reindexer(ndarray[object, ndim=1] values, dict index_map): + cdef object idx + cdef Py_ssize_t i + cdef Py_ssize_t new_length = len(values) + cdef ndarray[int32_t, ndim=1] indexer + + indexer = np.empty(new_length, dtype=np.int32) + + for i in range(new_length): + idx = values[i] + if idx in index_map: + indexer[i] = index_map[idx] + else: + indexer[i] = -1 + + return indexer + +# def reindex_block(ndarray[float64_t, ndim=1] values, +# BlockIndex sparse_index, +# ndarray[int32_t, ndim=1] indexer): +# cdef: +# Py_ssize_t i, length +# ndarray[float64_t, ndim=1] out + +# out = np.empty(length, dtype=np.float64) + +# for i from 0 <= i < length: +# if indexer[i] == -1: +# pass + + +# cdef class SparseCruncher(object): +# ''' +# Class to acquire float pointer for convenient operations on sparse data +# structures +# ''' +# cdef: +# SparseIndex index +# float64_t* buf + +# def __init__(self, ndarray[float64_t, ndim=1, mode='c'] values, +# SparseIndex index): + +# self.index = index +# self.buf = values.data + + +def reindex_integer(ndarray[float64_t, ndim=1] values, + IntIndex sparse_index, + ndarray[int32_t, ndim=1] indexer): + pass diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx new file mode 100644 index 00000000..bff07042 --- /dev/null +++ b/pandas/src/testing.pyx @@ -0,0 +1,141 @@ +import numpy as np + +from pandas import compat +from pandas.core.common import isnull + +cdef NUMERIC_TYPES = ( + bool, + int, + float, + np.bool, + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + np.float64, +) + +cdef bint is_comparable_as_number(obj): + return isinstance(obj, NUMERIC_TYPES) + +cdef bint isiterable(obj): + return hasattr(obj, '__iter__') + +cdef bint has_length(obj): + return hasattr(obj, '__len__') + +cdef bint is_dictlike(obj): + return hasattr(obj, 'keys') and hasattr(obj, '__getitem__') + +cdef bint decimal_almost_equal(double desired, double actual, int decimal): + # Code from + # http://docs.scipy.org/doc/numpy/reference/generated + # /numpy.testing.assert_almost_equal.html + return abs(desired - actual) < (0.5 * 10.0 ** -decimal) + +cpdef assert_dict_equal(a, b, bint compare_keys=True): + assert is_dictlike(a) and is_dictlike(b), ( + "Cannot compare dict objects, one or both is not dict-like" + ) + + a_keys = frozenset(a.keys()) + b_keys = frozenset(b.keys()) + + if compare_keys: + assert a_keys == b_keys + + for k in a_keys: + assert_almost_equal(a[k], b[k]) + + return True + +cpdef assert_almost_equal(a, b, bint check_less_precise=False): + cdef: + int decimal + Py_ssize_t i, na, nb + double fa, fb + + if isinstance(a, dict) or isinstance(b, dict): + return assert_dict_equal(a, b) + + if (isinstance(a, compat.string_types) or + isinstance(b, compat.string_types)): + assert a == b, "%r != %r" % (a, b) + return True + + if isiterable(a): + assert isiterable(b), ( + "First object is iterable, second isn't: %r != %r" % (a, b) + ) + assert has_length(a) and has_length(b), ( + "Can't compare objects without length, one or both is invalid: " + "(%r, %r)" % (a, b) + ) + + na, nb = len(a), len(b) + assert na == nb, ( + "Length of two iterators not the same: %r != %r" % (na, nb) + ) + if isinstance(a, np.ndarray) and isinstance(b, np.ndarray): + try: + if np.array_equal(a, b): + return True + except: + pass + + for i in xrange(na): + assert_almost_equal(a[i], b[i], 
check_less_precise) + + return True + elif isiterable(b): + assert False, ( + "Second object is iterable, first isn't: %r != %r" % (a, b) + ) + + if isnull(a): + assert isnull(b), ( + "First object is null, second isn't: %r != %r" % (a, b) + ) + return True + elif isnull(b): + assert isnull(a), ( + "First object is not null, second is null: %r != %r" % (a, b) + ) + return True + + if is_comparable_as_number(a): + assert is_comparable_as_number(b), ( + "First object is numeric, second is not: %r != %r" % (a, b) + ) + + decimal = 5 + + # deal with differing dtypes + if check_less_precise: + decimal = 3 + + if np.isinf(a): + assert np.isinf(b), "First object is inf, second isn't" + else: + fa, fb = a, b + + # case for zero + if abs(fa) < 1e-5: + if not decimal_almost_equal(fa, fb, decimal): + assert False, ( + '(very low values) expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) + ) + else: + if not decimal_almost_equal(1, fb / fa, decimal): + assert False, 'expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) + + else: + assert a == b, "%r != %r" % (a, b) + + return True diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h new file mode 100644 index 00000000..4d7af3dd --- /dev/null +++ b/pandas/src/ujson/lib/ultrajson.h @@ -0,0 +1,313 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + * Copyright (c) 1988-1993 The Regents of the University of California. + * Copyright (c) 1994 Sun Microsystems, Inc. +*/ + +/* +Ultra fast JSON encoder and decoder +Developed by Jonas Tarnstrom (jonas@esn.me). + +Encoder notes: +------------------ + +:: Cyclic references :: +Cyclic referenced objects are not detected. 
+Set JSONObjectEncoder.recursionMax to suitable value or make sure input object +tree doesn't have cyclic references. + +*/ + +#ifndef __ULTRAJSON_H__ +#define __ULTRAJSON_H__ + +#include +#include + +// Don't output any extra whitespaces when encoding +#define JSON_NO_EXTRA_WHITESPACE + +// Max decimals to encode double floating point numbers with +#ifndef JSON_DOUBLE_MAX_DECIMALS +#define JSON_DOUBLE_MAX_DECIMALS 15 +#endif + +// Max recursion depth, default for encoder +#ifndef JSON_MAX_RECURSION_DEPTH +#define JSON_MAX_RECURSION_DEPTH 1024 +#endif + +// Max recursion depth, default for decoder +#ifndef JSON_MAX_OBJECT_DEPTH +#define JSON_MAX_OBJECT_DEPTH 1024 +#endif + +/* +Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */ +#ifndef JSON_MAX_STACK_BUFFER_SIZE +#define JSON_MAX_STACK_BUFFER_SIZE 131072 +#endif + +#ifdef _WIN32 + +typedef __int64 JSINT64; +typedef unsigned __int64 JSUINT64; + +typedef __int32 JSINT32; +typedef unsigned __int32 JSUINT32; +typedef unsigned __int8 JSUINT8; +typedef unsigned __int16 JSUTF16; +typedef unsigned __int32 JSUTF32; +typedef __int64 JSLONG; + +#define EXPORTFUNCTION __declspec(dllexport) + +#define FASTCALL_MSVC __fastcall +#define FASTCALL_ATTR +#define INLINE_PREFIX __inline + +#else + +#include +typedef int64_t JSINT64; +typedef uint64_t JSUINT64; + +typedef int32_t JSINT32; +typedef uint32_t JSUINT32; + +#define FASTCALL_MSVC + +#if !defined __x86_64__ +#define FASTCALL_ATTR __attribute__((fastcall)) +#else +#define FASTCALL_ATTR +#endif + +#define INLINE_PREFIX inline + +typedef uint8_t JSUINT8; +typedef uint16_t JSUTF16; +typedef uint32_t JSUTF32; + +typedef int64_t JSLONG; + +#define EXPORTFUNCTION +#endif + +#if !(defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__)) + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define __LITTLE_ENDIAN__ +#else + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define __BIG_ENDIAN__ +#endif + +#endif + +#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) +#error "Endianess not supported" +#endif + +enum JSTYPES +{ + JT_NULL, // NULL + JT_TRUE, //boolean true + JT_FALSE, //boolean false + JT_INT, //(JSINT32 (signed 32-bit)) + JT_LONG, //(JSINT64 (signed 64-bit)) + JT_DOUBLE, //(double) + JT_UTF8, //(char 8-bit) + JT_ARRAY, // Array structure + JT_OBJECT, // Key/Value structure + JT_INVALID, // Internal, do not return nor expect +}; + +typedef void * JSOBJ; +typedef void * JSITER; + +typedef struct __JSONTypeContext +{ + int type; + void *encoder; + void *prv; +} JSONTypeContext; + +/* +Function pointer declarations, suitable for implementing UltraJSON */ +typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); +typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); +typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); +typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); +typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, size_t *outLen); +typedef void *(*JSPFN_MALLOC)(size_t size); +typedef void (*JSPFN_FREE)(void *pptr); +typedef void *(*JSPFN_REALLOC)(void *base, size_t size); + +typedef struct __JSONObjectEncoder +{ + void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); + void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc); + const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen); + JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); + double (*getDoubleValue)(JSOBJ 
obj, JSONTypeContext *tc); + + /* + Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) + Implementor should setup iteration state in ti->prv + */ + JSPFN_ITERBEGIN iterBegin; + + /* + Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items. + Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this + */ + JSPFN_ITERNEXT iterNext; + + /* + Ends the iteration of an iteratable object. + Any iteration state stored in ti->prv can be freed here + */ + JSPFN_ITEREND iterEnd; + + /* + Returns a reference to the value object of an iterator + The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + */ + JSPFN_ITERGETVALUE iterGetValue; + + /* + Return name of iterator. + The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + */ + JSPFN_ITERGETNAME iterGetName; + + /* + Release a value as indicated by setting ti->release = 1 in the previous getValue call. + The ti->prv array should contain the necessary context to release the value + */ + void (*releaseObject)(JSOBJ obj); + + /* Library functions + Set to NULL to use STDLIB malloc,realloc,free */ + JSPFN_MALLOC malloc; + JSPFN_REALLOC realloc; + JSPFN_FREE free; + + /* + Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/ + int recursionMax; + + /* + Configuration for max decimals of double floating poiunt numbers to encode (0-9) */ + int doublePrecision; + + /* + If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */ + int forceASCII; + + /* + If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */ + int encodeHTMLChars; + + /* + Set to an error message if error occured */ + const char *errorMsg; + JSOBJ errorObj; + + /* Buffer stuff */ + char *start; + char *offset; + char *end; + int heap; + int level; + +} JSONObjectEncoder; + + +/* +Encode an object structure into JSON. + +Arguments: +obj - An anonymous type representing the object +enc - Function definitions for querying JSOBJ type +buffer - Preallocated buffer to store result in. If NULL function allocates own buffer +cbBuffer - Length of buffer (ignored if buffer is NULL) + +Returns: +Encoded JSON object as a null terminated char string. + +NOTE: +If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer. +Life cycle of the provided buffer must still be handled by caller. + +If the return value doesn't equal the specified buffer caller must release the memory using +JSONObjectEncoder.free or free() as specified when calling this function. 
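+
+If an error occurs during encoding, errorMsg is set to a descriptive message
+and errorObj references the offending object; callers should check
+JSONObjectEncoder.errorMsg after the call before using the result.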
+*/ +EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer); + + + +typedef struct __JSONObjectDecoder +{ + JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end); + int (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value); + int (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value); + JSOBJ (*newTrue)(void *prv); + JSOBJ (*newFalse)(void *prv); + JSOBJ (*newNull)(void *prv); + JSOBJ (*newObject)(void *prv, void *decoder); + JSOBJ (*endObject)(void *prv, JSOBJ obj); + JSOBJ (*newArray)(void *prv, void *decoder); + JSOBJ (*endArray)(void *prv, JSOBJ obj); + JSOBJ (*newInt)(void *prv, JSINT32 value); + JSOBJ (*newLong)(void *prv, JSINT64 value); + JSOBJ (*newDouble)(void *prv, double value); + void (*releaseObject)(void *prv, JSOBJ obj, void *decoder); + JSPFN_MALLOC malloc; + JSPFN_FREE free; + JSPFN_REALLOC realloc; + char *errorStr; + char *errorOffset; + int preciseFloat; + void *prv; +} JSONObjectDecoder; + +EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); + +#endif diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c new file mode 100644 index 00000000..bae075b4 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -0,0 +1,929 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the ESN Social Software AB nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms +* Copyright (c) 1988-1993 The Regents of the University of California. +* Copyright (c) 1994 Sun Microsystems, Inc. 
+*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef TRUE +#define TRUE 1 +#define FALSE 0 +#endif +#ifndef NULL +#define NULL 0 +#endif + +struct DecoderState +{ + char *start; + char *end; + wchar_t *escStart; + wchar_t *escEnd; + int escHeap; + int lastType; + JSUINT32 objDepth; + void *prv; + JSONObjectDecoder *dec; +}; + +JSOBJ FASTCALL_MSVC decode_any( struct DecoderState *ds) FASTCALL_ATTR; +typedef JSOBJ (*PFN_DECODER)( struct DecoderState *ds); + +static JSOBJ SetError( struct DecoderState *ds, int offset, const char *message) +{ + ds->dec->errorOffset = ds->start + offset; + ds->dec->errorStr = (char *) message; + return NULL; +} + +static void ClearError( struct DecoderState *ds) +{ + ds->dec->errorOffset = 0; + ds->dec->errorStr = NULL; +} + +double createDouble(double intNeg, double intValue, double frcValue, int frcDecimalCount) +{ + static const double g_pow10[] = {1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001,0.0000001, 0.00000001, 0.000000001, 0.0000000001, 0.00000000001, 0.000000000001, 0.0000000000001, 0.00000000000001, 0.000000000000001}; + return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) +{ + char *end; + double value; + errno = 0; + + value = strtod(ds->start, &end); + + if (errno == ERANGE) + { + return SetError(ds, -1, "Range error when decoding numeric as double"); + } + + ds->start = end; + return ds->dec->newDouble(ds->prv, value); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric (struct DecoderState *ds) +{ + int intNeg = 1; + int mantSize = 0; + JSUINT64 intValue; + int chr; + int decimalCount = 0; + double frcValue = 0.0; + double expNeg; + double expValue; + char *offset = ds->start; + + JSUINT64 overflowLimit = LLONG_MAX; + + if (*(offset) == '-') + { + offset ++; + intNeg = -1; + overflowLimit = LLONG_MIN; + } + + // Scan integer part + intValue = 0; + + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + //FIXME: Check for arithemtic overflow here + //PERF: Don't do 64-bit arithmetic here unless we know we have to + intValue = intValue * 10ULL + (JSLONG) (chr - 48); + + if (intValue > overflowLimit) + { + return SetError(ds, -1, overflowLimit == LLONG_MAX ? 
"Value is too big" : "Value is too small"); + } + + offset ++; + mantSize ++; + break; + } + case '.': + { + offset ++; + goto DECODE_FRACTION; + break; + } + case 'e': + case 'E': + { + offset ++; + goto DECODE_EXPONENT; + break; + } + + default: + { + goto BREAK_INT_LOOP; + break; + } + } + } + +BREAK_INT_LOOP: + + ds->lastType = JT_INT; + ds->start = offset; + + if ((intValue >> 31)) + { + return ds->dec->newLong(ds->prv, (JSINT64) (intValue * (JSINT64) intNeg)); + } + else + { + return ds->dec->newInt(ds->prv, (JSINT32) (intValue * intNeg)); + } + +DECODE_FRACTION: + + if (ds->dec->preciseFloat) + { + return decodePreciseFloat(ds); + } + + // Scan fraction part + frcValue = 0.0; + for (;;) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) + { + frcValue = frcValue * 10.0 + (double) (chr - 48); + decimalCount ++; + } + offset ++; + break; + } + case 'e': + case 'E': + { + offset ++; + goto DECODE_EXPONENT; + break; + } + default: + { + goto BREAK_FRC_LOOP; + } + } + } + +BREAK_FRC_LOOP: + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + return ds->dec->newDouble (ds->prv, createDouble( (double) intNeg, (double) intValue, frcValue, decimalCount)); + +DECODE_EXPONENT: + if (ds->dec->preciseFloat) + { + return decodePreciseFloat(ds); + } + + expNeg = 1.0; + + if (*(offset) == '-') + { + expNeg = -1.0; + offset ++; + } + else + if (*(offset) == '+') + { + expNeg = +1.0; + offset ++; + } + + expValue = 0.0; + + for (;;) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + expValue = expValue * 10.0 + (double) (chr - 48); + offset ++; + break; + } + default: + { + goto BREAK_EXP_LOOP; + } + } + } + +BREAK_EXP_LOOP: + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + return ds->dec->newDouble (ds->prv, createDouble( (double) intNeg, (double) intValue , frcValue, decimalCount) * pow(10.0, expValue * expNeg)); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'r') + goto SETERROR; + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_TRUE; + ds->start = offset; + return ds->dec->newTrue(ds->prv); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'true'"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'a') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 's') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_FALSE; + ds->start = offset; + return ds->dec->newFalse(ds->prv); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'false'"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); + +SETERROR: + return SetError(ds, -1, "Unexpected 
character found when decoding 'null'"); +} + +FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) +{ + char *offset; + + for (offset = ds->start; (ds->end - offset) > 0; offset ++) + { + switch (*offset) + { + case ' ': + case '\t': + case '\r': + case '\n': + break; + + default: + ds->start = offset; + return; + } + } + + if (offset == ds->end) + { + ds->start = ds->end; + } +} + +enum DECODESTRINGSTATE +{ + DS_ISNULL = 0x32, + DS_ISQUOTE, + DS_ISESCAPE, + DS_UTFLENERROR, + +}; + +static const JSUINT8 g_decoderLookup[256] = +{ + /* 0x00 */ DS_ISNULL, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x20 */ 1, 1, DS_ISQUOTE, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, DS_ISESCAPE, 1, 1, 1, + /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, +}; + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) +{ + JSUTF16 sur[2] = { 0 }; + int iSur = 0; + int index; + wchar_t *escOffset; + wchar_t *escStart; + size_t escLen = (ds->escEnd - ds->escStart); + JSUINT8 *inputOffset; + JSUINT8 oct; + JSUTF32 ucs; + ds->lastType = JT_INVALID; + ds->start ++; + + if ( (size_t) (ds->end - ds->start) > escLen) + { + size_t newSize = (ds->end - ds->start); + + if (ds->escHeap) + { + if (newSize > (UINT_MAX / sizeof(wchar_t))) + { + return SetError(ds, -1, "Could not reserve memory block"); + } + escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t)); + if (!escStart) + { + ds->dec->free(ds->escStart); + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escStart = escStart; + } + else + { + wchar_t *oldStart = ds->escStart; + ds->escHeap = 1; + if (newSize > (UINT_MAX / sizeof(wchar_t))) + { + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t)); + if (!ds->escStart) + { + return SetError(ds, -1, "Could not reserve memory block"); + } + memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); + } + + ds->escEnd = ds->escStart + newSize; + } + + escOffset = ds->escStart; + inputOffset = (JSUINT8 *) ds->start; + + for (;;) + { + switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) + { + case DS_ISNULL: + { + return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); + } + case DS_ISQUOTE: + { + ds->lastType = JT_UTF8; + inputOffset ++; + ds->start += ( (char *) inputOffset - (ds->start)); + return ds->dec->newString(ds->prv, ds->escStart, escOffset); + } + case DS_UTFLENERROR: + { + return SetError (ds, -1, "Invalid UTF-8 sequence length when decoding 'string'"); + } + case DS_ISESCAPE: + inputOffset ++; + switch (*inputOffset) + { + case '\\': *(escOffset++) = 
L'\\'; inputOffset++; continue; + case '\"': *(escOffset++) = L'\"'; inputOffset++; continue; + case '/': *(escOffset++) = L'/'; inputOffset++; continue; + case 'b': *(escOffset++) = L'\b'; inputOffset++; continue; + case 'f': *(escOffset++) = L'\f'; inputOffset++; continue; + case 'n': *(escOffset++) = L'\n'; inputOffset++; continue; + case 'r': *(escOffset++) = L'\r'; inputOffset++; continue; + case 't': *(escOffset++) = L'\t'; inputOffset++; continue; + + case 'u': + { + int index; + inputOffset ++; + + for (index = 0; index < 4; index ++) + { + switch (*inputOffset) + { + case '\0': return SetError (ds, -1, "Unterminated unicode escape sequence when decoding 'string'"); + default: return SetError (ds, -1, "Unexpected character in unicode escape sequence when decoding 'string'"); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0'); + break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); + break; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); + break; + } + + inputOffset ++; + } + + if (iSur == 0) + { + if((sur[iSur] & 0xfc00) == 0xd800) + { + // First of a surrogate pair, continue parsing + iSur ++; + break; + } + (*escOffset++) = (wchar_t) sur[iSur]; + iSur = 0; + } + else + { + // Decode pair + if ((sur[1] & 0xfc00) != 0xdc00) + { + return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'"); + } +#if WCHAR_MAX == 0xffff + (*escOffset++) = (wchar_t) sur[0]; + (*escOffset++) = (wchar_t) sur[1]; +#else + (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); +#endif + iSur = 0; + } + break; + } + + case '\0': return SetError(ds, -1, "Unterminated escape sequence when decoding 'string'"); + default: return SetError(ds, -1, "Unrecognized escape sequence when decoding 'string'"); + } + break; + + case 1: + { + *(escOffset++) = (wchar_t) (*inputOffset++); + break; + } + + case 2: + { + ucs = (*inputOffset++) & 0x1f; + ucs <<= 6; + if (((*inputOffset) & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + ucs |= (*inputOffset++) & 0x3f; + if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 3: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x0f; + + for (index = 0; index < 2; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 4: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x07; + + for (index = 0; index < 3; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'"); + +#if WCHAR_MAX == 0xffff + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(escOffset++) = (wchar_t) (ucs >> 10) + 
0xd800; + *(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00; + } + else + { + *(escOffset++) = (wchar_t) ucs; + } +#else + *(escOffset++) = (wchar_t) ucs; +#endif + break; + } + } + } +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) +{ + JSOBJ itemValue; + JSOBJ newObj; + int len; + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); + } + + newObj = ds->dec->newArray(ds->prv, ds->dec); + len = 0; + + ds->lastType = JT_INVALID; + ds->start ++; + + for (;;) + { + SkipWhitespace(ds); + + if ((*ds->start) == ']') + { + ds->objDepth--; + if (len == 0) + { + ds->start ++; + return ds->dec->endArray(ds->prv, newObj); + } + + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError(ds, -1, "Unexpected character found when decoding array value (1)"); + } + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } + + if (!ds->dec->arrayAddItem (ds->prv, newObj, itemValue)) + { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case ']': + { + ds->objDepth--; + return ds->dec->endArray(ds->prv, newObj); + } + case ',': + break; + + default: + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError(ds, -1, "Unexpected character found when decoding array value (2)"); + } + + len ++; + } +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object( struct DecoderState *ds) +{ + JSOBJ itemName; + JSOBJ itemValue; + JSOBJ newObj; + + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); + } + + newObj = ds->dec->newObject(ds->prv, ds->dec); + + ds->start ++; + + for (;;) + { + SkipWhitespace(ds); + + if ((*ds->start) == '}') + { + ds->objDepth--; + ds->start ++; + return ds->dec->endObject(ds->prv, newObj); + } + + ds->lastType = JT_INVALID; + itemName = decode_any(ds); + + if (itemName == NULL) + { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } + + if (ds->lastType != JT_UTF8) + { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError(ds, -1, "Key name of object must be 'string' when decoding 'object'"); + } + + SkipWhitespace(ds); + + if (*(ds->start++) != ':') + { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError(ds, -1, "No ':' found when decoding object value"); + } + + SkipWhitespace(ds); + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return NULL; + } + + if (!ds->dec->objectAddKey (ds->prv, newObj, itemName, itemValue)) + { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + ds->dec->releaseObject(ds->prv, itemValue, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case '}': + { + ds->objDepth--; + return ds->dec->endObject(ds->prv, newObj); + } + case ',': + break; + + default: + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError(ds, -1, "Unexpected character found when decoding object value"); + } + } +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) +{ + for (;;) + { + switch (*ds->start) + { + case '\"': + return decode_string (ds); 
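+      /* a leading digit or '-' starts a numeric value */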
+ case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return decode_numeric (ds); + + case '[': return decode_array (ds); + case '{': return decode_object (ds); + case 't': return decode_true (ds); + case 'f': return decode_false (ds); + case 'n': return decode_null (ds); + + case ' ': + case '\t': + case '\r': + case '\n': + // White space + ds->start ++; + break; + + default: + return SetError(ds, -1, "Expected object or value"); + } + } +} + +JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer) +{ + /* + FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */ + char *locale; + struct DecoderState ds; + wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSOBJ ret; + + ds.start = (char *) buffer; + ds.end = ds.start + cbBuffer; + + ds.escStart = escBuffer; + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escHeap = 0; + ds.prv = dec->prv; + ds.dec = dec; + ds.dec->errorStr = NULL; + ds.dec->errorOffset = NULL; + ds.objDepth = 0; + + ds.dec = dec; + + locale = setlocale(LC_NUMERIC, NULL); + if (strcmp(locale, "C")) + { + locale = strdup(locale); + if (!locale) + { + return SetError(&ds, -1, "Could not reserve memory block"); + } + setlocale(LC_NUMERIC, "C"); + ret = decode_any (&ds); + setlocale(LC_NUMERIC, locale); + free(locale); + } + else + { + ret = decode_any (&ds); + } + + if (ds.escHeap) + { + dec->free(ds.escStart); + } + + SkipWhitespace(&ds); + + if (ds.start != ds.end && ret) + { + dec->releaseObject(ds.prv, ret, ds.dec); + return SetError(&ds, -1, "Trailing data"); + } + + return ret; +} diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c new file mode 100644 index 00000000..5e2a226a --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -0,0 +1,947 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + * Copyright (c) 1988-1993 The Regents of the University of California. + * Copyright (c) 1994 Sun Microsystems, Inc. +*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include +#include + +#include + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* +Worst cases being: + +Control characters (ASCII < 32) +0x00 (1 byte) input => \u0000 output (6 bytes) +1 * 6 => 6 (6 bytes required) + +or UTF-16 surrogate pairs +4 bytes input in UTF-8 => \uXXXX\uYYYY (12 bytes). + +4 * 6 => 24 bytes (12 bytes required) + +The extra 2 bytes are for the quotes around the string + +*/ +#define RESERVE_STRING(_len) (2 + ((_len) * 6)) + +static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; +static const char g_hexChars[] = "0123456789abcdef"; +static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/"; + +/* +FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands. +Needs a cleanup and more documentation */ + +/* +Table for pure ascii output escaping all characters above 127 to \uXXXX */ +static const JSUINT8 g_asciiOutputTable[256] = +{ +/* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, +/* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, +/* 0x20 */ 1, 1, 20, 1, 1, 1, 29, 1, 1, 1, 1, 1, 1, 1, 1, 24, +/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 29, 1, 29, 1, +/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1, +/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 +}; + +static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message) +{ + enc->errorMsg = message; + enc->errorObj = obj; +} + +/* +FIXME: Keep track of how big these get across several encoder calls and try to make an estimate +That way we won't run our head into the wall each call */ +void Buffer_Realloc (JSONObjectEncoder *enc, size_t cbNeeded) +{ + size_t curSize = enc->end - enc->start; + size_t newSize = curSize * 2; + size_t offset = enc->offset - enc->start; + + while (newSize < curSize + cbNeeded) + { + newSize *= 2; + } + + if (enc->heap) + { + enc->start = (char *) enc->realloc (enc->start, newSize); + if (!enc->start) + { + SetError (NULL, enc, "Could not reserve memory block"); + return; + } + } + else + { + char *oldStart = enc->start; + enc->heap = 1; + enc->start = (char *) enc->malloc (newSize); + if (!enc->start) + { + SetError (NULL, enc, "Could not reserve memory block"); + 
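+ /* Allocation failure is reported through enc->errorMsg rather than a return value; callers are expected to check enc->errorMsg after each Buffer_Reserve(). */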
return; + } + memcpy (enc->start, oldStart, offset); + } + enc->offset = enc->start + offset; + enc->end = enc->start + newSize; +} + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (char *outputOffset, unsigned short value) +{ + *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; + *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; + *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; + *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; +} + +int Buffer_EscapeStringUnvalidated (JSONObjectEncoder *enc, const char *io, const char *end) +{ + char *of = (char *) enc->offset; + + for (;;) + { + switch (*io) + { + case 0x00: + { + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + break; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + } + case '\"': (*of++) = '\\'; (*of++) = '\"'; break; + case '\\': (*of++) = '\\'; (*of++) = '\\'; break; + case '/': (*of++) = '\\'; (*of++) = '/'; break; + case '\b': (*of++) = '\\'; (*of++) = 'b'; break; + case '\f': (*of++) = '\\'; (*of++) = 'f'; break; + case '\n': (*of++) = '\\'; (*of++) = 'n'; break; + case '\r': (*of++) = '\\'; (*of++) = 'r'; break; + case '\t': (*of++) = '\\'; (*of++) = 't'; break; + + case 0x26: // '/' + case 0x3c: // '<' + case 0x3e: // '>' + { + if (enc->encodeHTMLChars) + { + // Fall through to \u00XX case below. + } + else + { + // Same as default case below. + (*of++) = (*io); + break; + } + } + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + break; + } + default: (*of++) = (*io); break; + } + io++; + } +} + +int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + JSUTF32 ucs; + char *of = (char *) enc->offset; + + for (;;) + { + JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io]; + + switch (utflen) + { + case 0: + { + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + io ++; + continue; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + } + + case 1: + { + *(of++)= (*io++); + continue; + } + + case 2: + { + JSUTF32 in; + JSUTF16 in16; + + if (end - io < 1) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + in = (JSUTF32) in16; + +#ifdef __LITTLE_ENDIAN__ + ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); +#else + ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); +#endif + + if (ucs < 0x80) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 2 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 2; + break; + } + + case 3: + { + JSUTF32 in; + JSUTF16 in16; + JSUINT8 in8; + + if (end - io < 2) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + memcpy(&in8, io + 2, 
sizeof(JSUINT8)); +#ifdef __LITTLE_ENDIAN__ + in = (JSUTF32) in16; + in |= in8 << 16; + ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16); +#else + in = in16 << 8; + in |= in8; + ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + + if (ucs < 0x800) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 3; + break; + } + case 4: + { + JSUTF32 in; + + if (end - io < 3) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in, io, sizeof(JSUTF32)); +#ifdef __LITTLE_ENDIAN__ + ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); +#else + ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + if (ucs < 0x10000) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 4 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 4; + break; + } + + + case 5: + case 6: + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unsupported UTF-8 sequence length when encoding string"); + return FALSE; + } + + case 29: + { + if (enc->encodeHTMLChars) + { + // Fall through to \u00XX case 30 below. + } + else + { + // Same as case 1 above. + *(of++) = (*io++); + continue; + } + } + + case 30: + { + // \uXXXX encode + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + io ++; + continue; + } + case 10: + case 12: + case 14: + case 16: + case 18: + case 20: + case 22: + case 24: + { + *(of++) = *( (char *) (g_escapeChars + utflen + 0)); + *(of++) = *( (char *) (g_escapeChars + utflen + 1)); + io ++; + continue; + } + // This can never happen, it's here to make L4 VC++ happy + default: + { + ucs = 0; + break; + } + } + + /* + If the character is a UTF8 sequence of length > 1 we end up here */ + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short) (ucs >> 10) + 0xd800); + of += 4; + + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short) (ucs & 0x3ff) + 0xdc00); + of += 4; + } + else + { + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short) ucs); + of += 4; + } + } +} + +#define Buffer_Reserve(__enc, __len) \ + if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + + +#define Buffer_AppendCharUnchecked(__enc, __chr) \ + *((__enc)->offset++) = __chr; \ + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char* begin, char* end) +{ + char aux; + while (end > begin) + aux = *end, *end-- = *begin, *begin++ = aux; +} + +void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) +{ + char* wstr; + JSUINT32 uvalue = (value < 0) ? -value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. + + do *wstr++ = (char)(48 + (uvalue % 10)); while(uvalue /= 10); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) +{ + char* wstr; + JSUINT64 uvalue = (value < 0) ? 
-value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. + + do *wstr++ = (char)(48 + (uvalue % 10ULL)); while(uvalue /= 10ULL); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, double value) +{ + /* if input is beyond the thresholds, revert to exponential */ + const double thres_max = (double) 1e16 - 1; + const double thres_min = (double) 1e-15; + char precision_str[20]; + int count; + double diff = 0.0; + char* str = enc->offset; + char* wstr = str; + unsigned long long whole; + double tmp; + unsigned long long frac; + int neg; + double pow10; + + if (value == HUGE_VAL || value == -HUGE_VAL) + { + SetError (obj, enc, "Invalid Inf value when encoding double"); + return FALSE; + } + + if (!(value == value)) + { + SetError (obj, enc, "Invalid Nan value when encoding double"); + return FALSE; + } + + /* we'll work in positive values and deal with the + negative sign issue later */ + neg = 0; + if (value < 0) + { + neg = 1; + value = -value; + } + + /* + for very large or small numbers switch back to native sprintf for + exponentials. anyone want to write code to replace this? */ + if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) + { + precision_str[0] = '%'; + precision_str[1] = '.'; +#if defined(_WIN32) && defined(_MSC_VER) + sprintf_s(precision_str+2, sizeof(precision_str)-2, "%ug", enc->doublePrecision); + enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, neg ? -value : value); +#else + snprintf(precision_str+2, sizeof(precision_str)-2, "%ug", enc->doublePrecision); + enc->offset += snprintf(str, enc->end - enc->offset, precision_str, neg ? -value : value); +#endif + return TRUE; + } + + pow10 = g_pow10[enc->doublePrecision]; + + whole = (unsigned long long) value; + tmp = (value - whole) * pow10; + frac = (unsigned long long)(tmp); + diff = tmp - frac; + + if (diff > 0.5) + { + ++frac; + /* handle rollover, e.g. case 0.99 with prec 1 is 1.0 */ + if (frac >= pow10) + { + frac = 0; + ++whole; + } + } + else + if (diff == 0.5 && ((frac == 0) || (frac & 1))) + { + /* if halfway, round up if odd, OR + if last digit is 0. That last part is strange */ + ++frac; + } + + if (enc->doublePrecision == 0) + { + diff = value - whole; + + if (diff > 0.5) + { + /* greater than 0.5, round up, e.g. 1.6 -> 2 */ + ++whole; + } + else + if (diff == 0.5 && (whole & 1)) + { + /* exactly 0.5 and ODD, then round up */ + /* 1.5 -> 2, but 2.5 -> 2 */ + ++whole; + } + + //vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 + } + else + if (frac) + { + count = enc->doublePrecision; + // now do fractional part, as an unsigned number + // we know it is not 0 but we can have leading zeros, these + // should be removed + while (!(frac % 10)) + { + --count; + frac /= 10; + } + //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 + + // now do fractional part, as an unsigned number + do + { + --count; + *wstr++ = (char)(48 + (frac % 10)); + } while (frac /= 10); + // add extra 0s + while (count-- > 0) + { + *wstr++ = '0'; + } + // add decimal + *wstr++ = '.'; + } + else + { + *wstr++ = '0'; + *wstr++ = '.'; + } + + // do whole part + // Take care of sign + // Conversion. Number is reversed. 
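+ /* Informal worked example: with doublePrecision == 5 and value == 3.14159, the fractional digits are emitted in reverse ("95141"), then the decimal point, then the reversed whole part ("3"), leaving "95141.3" in the buffer; the strreverse() below flips it into "3.14159" before enc->offset is advanced. */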
+ do *wstr++ = (char)(48 + (whole % 10)); while (whole /= 10); + + if (neg) + { + *wstr++ = '-'; + } + strreverse(str, wstr-1); + enc->offset += (wstr - (enc->offset)); + + return TRUE; +} + +/* +FIXME: +Handle integration functions returning NULL here */ + +/* +FIXME: +Perhaps implement recursion detection */ + +void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) +{ + const char *value; + char *objName; + int count; + JSOBJ iterObj; + size_t szlen; + JSONTypeContext tc; + tc.encoder = enc; + + if (enc->level > enc->recursionMax) + { + SetError (obj, enc, "Maximum recursion level reached"); + return; + } + + /* + This reservation must hold + + length of _name as encoded worst case + + maxLength of double to string OR maxLength of JSLONG to string + */ + + Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); + if (enc->errorMsg) + { + return; + } + + if (name) + { + Buffer_AppendCharUnchecked(enc, '\"'); + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) + { + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) + { + return; + } + } + + Buffer_AppendCharUnchecked(enc, '\"'); + + Buffer_AppendCharUnchecked (enc, ':'); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + enc->beginTypeContext(obj, &tc); + + switch (tc.type) + { + case JT_INVALID: + { + return; + } + + case JT_ARRAY: + { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '['); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (buffer, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + + enc->level ++; + encode (iterObj, enc, NULL, 0); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, ']'); + break; + } + + case JT_OBJECT: + { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '{'); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + objName = enc->iterGetName(obj, &tc, &szlen); + + enc->level ++; + encode (iterObj, enc, objName, szlen); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, '}'); + break; + } + + case JT_LONG: + { + Buffer_AppendLongUnchecked (enc, enc->getLongValue(obj, &tc)); + break; + } + + case JT_INT: + { + Buffer_AppendIntUnchecked (enc, enc->getIntValue(obj, &tc)); + break; + } + + case JT_TRUE: + { + Buffer_AppendCharUnchecked (enc, 't'); + Buffer_AppendCharUnchecked (enc, 'r'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + case JT_FALSE: + { + Buffer_AppendCharUnchecked (enc, 'f'); + Buffer_AppendCharUnchecked (enc, 'a'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 's'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + + case JT_NULL: + { + Buffer_AppendCharUnchecked (enc, 'n'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 'l'); + break; + } + + case JT_DOUBLE: + { + if (!Buffer_AppendDoubleUnchecked (obj, enc, enc->getDoubleValue(obj, &tc))) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + break; + } + + case JT_UTF8: + { + value = enc->getStringValue(obj, &tc, 
&szlen); + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + if (enc->errorMsg) + { + enc->endTypeContext(obj, &tc); + return; + } + Buffer_AppendCharUnchecked (enc, '\"'); + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + + Buffer_AppendCharUnchecked (enc, '\"'); + break; + } + } + + enc->endTypeContext(obj, &tc); + enc->level --; +} + +char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer) +{ + char *locale; + enc->malloc = enc->malloc ? enc->malloc : malloc; + enc->free = enc->free ? enc->free : free; + enc->realloc = enc->realloc ? enc->realloc : realloc; + enc->errorMsg = NULL; + enc->errorObj = NULL; + enc->level = 0; + + if (enc->recursionMax < 1) + { + enc->recursionMax = JSON_MAX_RECURSION_DEPTH; + } + + if (enc->doublePrecision < 0 || + enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) + { + enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; + } + + if (_buffer == NULL) + { + _cbBuffer = 32768; + enc->start = (char *) enc->malloc (_cbBuffer); + if (!enc->start) + { + SetError(obj, enc, "Could not reserve memory block"); + return NULL; + } + enc->heap = 1; + } + else + { + enc->start = _buffer; + enc->heap = 0; + } + + enc->end = enc->start + _cbBuffer; + enc->offset = enc->start; + + locale = setlocale(LC_NUMERIC, NULL); + if (strcmp(locale, "C")) + { + locale = strdup(locale); + if (!locale) + { + SetError(NULL, enc, "Could not reserve memory block"); + return NULL; + } + setlocale(LC_NUMERIC, "C"); + encode (obj, enc, NULL, 0); + setlocale(LC_NUMERIC, locale); + free(locale); + } + else + { + encode (obj, enc, NULL, 0); + } + + Buffer_Reserve(enc, 1); + if (enc->errorMsg) + { + return NULL; + } + Buffer_AppendCharUnchecked(enc, '\0'); + + return enc->start; +} diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c new file mode 100644 index 00000000..9c1b4feb --- /dev/null +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -0,0 +1,736 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + * Copyright (c) 1988-1993 The Regents of the University of California. + * Copyright (c) 1994 Sun Microsystems, Inc. +*/ + +#include "py_defines.h" +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#define NO_IMPORT_ARRAY +#include +#include + + +//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +#define PRINTMARK() + +typedef struct __PyObjectDecoder +{ + JSONObjectDecoder dec; + + void* npyarr; // Numpy context buffer + void* npyarr_addr; // Ref to npyarr ptr to track DECREF calls + npy_intp curdim; // Current array dimension + + PyArray_Descr* dtype; +} PyObjectDecoder; + +typedef struct __NpyArrContext +{ + PyObject* ret; + PyObject* labels[2]; + PyArray_Dims shape; + + PyObjectDecoder* dec; + + npy_intp i; + npy_intp elsize; + npy_intp elcount; +} NpyArrContext; + +// Numpy handling based on numpy internal code, specifically the function +// PyArray_FromIter. + +// numpy related functions are inter-dependent so declare them all here, +// to ensure the compiler catches any errors + +// standard numpy array handling +JSOBJ Object_npyNewArray(void *prv, void* decoder); +JSOBJ Object_npyEndArray(void *prv, JSOBJ obj); +int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value); + +// for more complex dtypes (object and string) fill a standard Python list +// and convert to a numpy array when done. 
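+ /* Rough flow, as sketched from the handlers below: Object_npyArrayAddItem() rebinds the decoder's newArray / arrayAddItem / endArray callbacks to these *ArrayList variants when it sniffs an object or variable-length dtype; items then accumulate in a plain Python list, and Object_npyEndArrayList() converts the list with PyArray_FROM_O() and restores the original numpy callbacks. */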
+JSOBJ Object_npyNewArrayList(void *prv, void* decoder); +JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj); +int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value); + +// labelled support, encode keys and values of JS object into separate numpy +// arrays +JSOBJ Object_npyNewObject(void *prv, void* decoder); +JSOBJ Object_npyEndObject(void *prv, JSOBJ obj); +int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value); + +// free the numpy context buffer +void Npy_releaseContext(NpyArrContext* npyarr) +{ + PRINTMARK(); + if (npyarr) + { + if (npyarr->shape.ptr) + { + PyObject_Free(npyarr->shape.ptr); + } + if (npyarr->dec) + { + npyarr->dec->npyarr = NULL; + npyarr->dec->curdim = 0; + } + Py_XDECREF(npyarr->labels[0]); + Py_XDECREF(npyarr->labels[1]); + Py_XDECREF(npyarr->ret); + PyObject_Free(npyarr); + } +} + +JSOBJ Object_npyNewArray(void *prv, void* _decoder) +{ + NpyArrContext* npyarr; + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + if (decoder->curdim <= 0) + { + // start of array - initialise the context buffer + npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + decoder->npyarr_addr = npyarr; + + if (!npyarr) + { + PyErr_NoMemory(); + return NULL; + } + + npyarr->dec = decoder; + npyarr->labels[0] = npyarr->labels[1] = NULL; + + npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp)*NPY_MAXDIMS); + npyarr->shape.len = 1; + npyarr->ret = NULL; + + npyarr->elsize = 0; + npyarr->elcount = 4; + npyarr->i = 0; + } + else + { + // starting a new dimension continue the current array (and reshape after) + npyarr = (NpyArrContext*) decoder->npyarr; + if (decoder->curdim >= npyarr->shape.len) + { + npyarr->shape.len++; + } + } + + npyarr->shape.ptr[decoder->curdim] = 0; + decoder->curdim++; + return npyarr; +} + +PyObject* Npy_returnLabelled(NpyArrContext* npyarr) +{ + PyObject* ret = npyarr->ret; + npy_intp i; + + if (npyarr->labels[0] || npyarr->labels[1]) + { + // finished decoding, build tuple with values and labels + ret = PyTuple_New(npyarr->shape.len+1); + for (i = 0; i < npyarr->shape.len; i++) + { + if (npyarr->labels[i]) + { + PyTuple_SET_ITEM(ret, i+1, npyarr->labels[i]); + npyarr->labels[i] = NULL; + } + else + { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(ret, i+1, Py_None); + } + } + PyTuple_SET_ITEM(ret, 0, npyarr->ret); + } + + return ret; +} + +JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) +{ + PyObject *ret; + char* new_data; + NpyArrContext* npyarr = (NpyArrContext*) obj; + int emptyType = NPY_DEFAULT_TYPE; + npy_intp i; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + ret = npyarr->ret; + i = npyarr->i; + + npyarr->dec->curdim--; + + if (i == 0 || !npyarr->ret) { + // empty array would not have been initialised so do it now. 
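+ /* e.g. decoding "[]" in numpy mode never reaches Object_npyArrayAddItem, so a zero-length array is allocated here, using the caller-supplied dtype when one was given and NPY_DEFAULT_TYPE otherwise. */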
+ if (npyarr->dec->dtype) + { + emptyType = npyarr->dec->dtype->type_num; + } + npyarr->ret = ret = PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); + } + else if (npyarr->dec->curdim <= 0) + { + // realloc to final size + new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); + if (new_data == NULL) { + PyErr_NoMemory(); + Npy_releaseContext(npyarr); + return NULL; + } + ((PyArrayObject*) ret)->data = (void*) new_data; + // PyArray_BYTES(ret) = new_data; + } + + if (npyarr->dec->curdim <= 0) + { + // finished decoding array, reshape if necessary + if (npyarr->shape.len > 1) + { + npyarr->ret = PyArray_Newshape((PyArrayObject*) ret, &npyarr->shape, NPY_ANYORDER); + Py_DECREF(ret); + } + + ret = Npy_returnLabelled(npyarr); + + npyarr->ret = NULL; + Npy_releaseContext(npyarr); + } + + return ret; +} + +int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) +{ + PyObject* type; + PyArray_Descr* dtype; + npy_intp i; + char *new_data, *item; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + + i = npyarr->i; + + npyarr->shape.ptr[npyarr->dec->curdim-1]++; + + if (PyArray_Check((PyObject*)value)) + { + // multidimensional array, keep decoding values. + return 1; + } + + if (!npyarr->ret) + { + // Array not initialised yet. + // We do it here so we can 'sniff' the data type if none was provided + if (!npyarr->dec->dtype) + { + type = PyObject_Type(value); + if(!PyArray_DescrConverter(type, &dtype)) + { + Py_DECREF(type); + goto fail; + } + Py_INCREF(dtype); + Py_DECREF(type); + } + else + { + dtype = PyArray_DescrNew(npyarr->dec->dtype); + } + + // If it's an object or string then fill a Python list and subsequently + // convert. Otherwise we would need to somehow mess about with + // reference counts when renewing memory. + npyarr->elsize = dtype->elsize; + if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) + { + Py_XDECREF(dtype); + + if (npyarr->dec->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + npyarr->elcount = 0; + npyarr->ret = PyList_New(0); + if (!npyarr->ret) + { + goto fail; + } + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArrayList; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayListAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArrayList; + return Object_npyArrayListAddItem(prv, obj, value); + } + + npyarr->ret = PyArray_NewFromDescr(&PyArray_Type, dtype, 1, + &npyarr->elcount, NULL,NULL, 0, NULL); + + if (!npyarr->ret) + { + goto fail; + } + } + + if (i >= npyarr->elcount) { + // Grow PyArray_DATA(ret): + // this is similar for the strategy for PyListObject, but we use + // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... + if (npyarr->elsize == 0) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + + npyarr->elcount = (i >> 1) + (i < 4 ? 
4 : 2) + i; + if (npyarr->elcount <= NPY_MAX_INTP/npyarr->elsize) { + new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), npyarr->elcount * npyarr->elsize); + } + else { + PyErr_NoMemory(); + goto fail; + } + ((PyArrayObject*) npyarr->ret)->data = (void*) new_data; + + // PyArray_BYTES(npyarr->ret) = new_data; + } + + PyArray_DIMS(npyarr->ret)[0] = i + 1; + + if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL + || PyArray_SETITEM(npyarr->ret, item, value) == -1) { + goto fail; + } + + Py_DECREF( (PyObject *) value); + npyarr->i++; + return 1; + +fail: + + Npy_releaseContext(npyarr); + return 0; +} + +JSOBJ Object_npyNewArrayList(void *prv, void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, "nesting not supported for object or variable length dtypes"); + Npy_releaseContext(decoder->npyarr); + return NULL; +} + +JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) +{ + PyObject *list, *ret; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + // convert decoded list to numpy array + list = (PyObject *) npyarr->ret; + npyarr->ret = PyArray_FROM_O(list); + + ret = Npy_returnLabelled(npyarr); + npyarr->ret = list; + + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArray; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArray; + Npy_releaseContext(npyarr); + return ret; +} + +int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) +{ + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + PyList_Append((PyObject*) npyarr->ret, value); + Py_DECREF( (PyObject *) value); + npyarr->elcount++; + return 1; +} + + +JSOBJ Object_npyNewObject(void *prv, void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + if (decoder->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "labels only supported up to 2 dimensions"); + return NULL; + } + + return ((JSONObjectDecoder*)decoder)->newArray(prv, decoder); +} + +JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) +{ + PyObject *list; + npy_intp labelidx; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + labelidx = npyarr->dec->curdim-1; + + list = npyarr->labels[labelidx]; + if (list) + { + npyarr->labels[labelidx] = PyArray_FROM_O(list); + Py_DECREF(list); + } + + return (PyObject*) ((JSONObjectDecoder*)npyarr->dec)->endArray(prv, obj); +} + +int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PyObject *label; + npy_intp labelidx; + // add key to label array, value to values array + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + + label = (PyObject*) name; + labelidx = npyarr->dec->curdim-1; + + if (!npyarr->labels[labelidx]) + { + npyarr->labels[labelidx] = PyList_New(0); + } + + // only fill label array once, assumes all column labels are the same + // for 2-dimensional arrays. 
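+ /* Hypothetical example: with numpy and labelled decoding, an input such as {"r0": {"c0": 1, "c1": 2}, "r1": {"c0": 3, "c1": 4}} should come back roughly as a (values, row_labels, column_labels) tuple -- outer keys are collected into labels[0] and inner keys into labels[1], which Npy_returnLabelled() then appends after the value array. */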
+ if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) + { + PyList_Append(npyarr->labels[labelidx], label); + } + + if(((JSONObjectDecoder*)npyarr->dec)->arrayAddItem(prv, obj, value)) + { + Py_DECREF(label); + return 1; + } + return 0; +} + +int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PyDict_SetItem (obj, name, value); + Py_DECREF( (PyObject *) name); + Py_DECREF( (PyObject *) value); + return 1; +} + +int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) +{ + PyList_Append(obj, value); + Py_DECREF( (PyObject *) value); + return 1; +} + +JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) +{ + return PyUnicode_FromWideChar (start, (end - start)); +} + +JSOBJ Object_newTrue(void *prv) +{ + Py_RETURN_TRUE; +} + +JSOBJ Object_newFalse(void *prv) +{ + Py_RETURN_FALSE; +} + +JSOBJ Object_newNull(void *prv) +{ + Py_RETURN_NONE; +} + +JSOBJ Object_newObject(void *prv, void* decoder) +{ + return PyDict_New(); +} + +JSOBJ Object_endObject(void *prv, JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newArray(void *prv, void* decoder) +{ + return PyList_New(0); +} + +JSOBJ Object_endArray(void *prv, JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newInteger(void *prv, JSINT32 value) +{ + return PyInt_FromLong( (long) value); +} + +JSOBJ Object_newLong(void *prv, JSINT64 value) +{ + return PyLong_FromLongLong (value); +} + +JSOBJ Object_newDouble(void *prv, double value) +{ + return PyFloat_FromDouble(value); +} + +static void Object_releaseObject(void *prv, JSOBJ obj, void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + if (obj != decoder->npyarr_addr) + { + Py_XDECREF( ((PyObject *)obj)); + } +} + +static char *g_kwlist[] = {"obj", "precise_float", "numpy", "labelled", "dtype", NULL}; + +PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *ret; + PyObject *sarg; + PyObject *arg; + PyObject *opreciseFloat = NULL; + JSONObjectDecoder *decoder; + PyObjectDecoder pyDecoder; + PyArray_Descr *dtype = NULL; + int numpy = 0, labelled = 0; + + JSONObjectDecoder dec = + { + Object_newString, + Object_objectAddKey, + Object_arrayAddItem, + Object_newTrue, + Object_newFalse, + Object_newNull, + Object_newObject, + Object_endObject, + Object_newArray, + Object_endArray, + Object_newInteger, + Object_newLong, + Object_newDouble, + Object_releaseObject, + PyObject_Malloc, + PyObject_Free, + PyObject_Realloc + }; + + dec.preciseFloat = 0; + dec.prv = NULL; + + pyDecoder.dec = dec; + pyDecoder.curdim = 0; + pyDecoder.npyarr = NULL; + pyDecoder.npyarr_addr = NULL; + + decoder = (JSONObjectDecoder*) &pyDecoder; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg, &opreciseFloat, &numpy, &labelled, PyArray_DescrConverter2, &dtype)) + { + Npy_releaseContext(pyDecoder.npyarr); + return NULL; + } + + if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) + { + decoder->preciseFloat = 1; + } + + if (PyString_Check(arg)) + { + sarg = arg; + } + else + if (PyUnicode_Check(arg)) + { + sarg = PyUnicode_AsUTF8String(arg); + if (sarg == NULL) + { + //Exception raised above us by codec according to docs + return NULL; + } + } + else + { + PyErr_Format(PyExc_TypeError, "Expected String or Unicode"); + return NULL; + } + + decoder->errorStr = NULL; + decoder->errorOffset = NULL; + + if (numpy) + { + pyDecoder.dtype = dtype; + decoder->newArray = Object_npyNewArray; + decoder->endArray = Object_npyEndArray; + decoder->arrayAddItem = Object_npyArrayAddItem; + + if (labelled) + { + 
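+ /* Only when both numpy and labelled are requested are JSON objects routed through the numpy handlers (keys into the label arrays, values into the data array); with numpy alone, objects still decode to plain dicts via Object_newObject / Object_objectAddKey above. */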
decoder->newObject = Object_npyNewObject; + decoder->endObject = Object_npyEndObject; + decoder->objectAddKey = Object_npyObjectAddKey; + } + } + + ret = JSON_DecodeObject(decoder, PyString_AS_STRING(sarg), PyString_GET_SIZE(sarg)); + + if (sarg != arg) + { + Py_DECREF(sarg); + } + + if (PyErr_Occurred()) + { + if (ret) + { + Py_DECREF( (PyObject *) ret); + } + Npy_releaseContext(pyDecoder.npyarr); + return NULL; + } + + if (decoder->errorStr) + { + /* + FIXME: It's possible to give a much nicer error message here with actual failing element in input etc*/ + + PyErr_Format (PyExc_ValueError, "%s", decoder->errorStr); + + if (ret) + { + Py_DECREF( (PyObject *) ret); + } + Npy_releaseContext(pyDecoder.npyarr); + + return NULL; + } + + return ret; +} + +PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *read; + PyObject *string; + PyObject *result; + PyObject *file = NULL; + PyObject *argtuple; + + if (!PyArg_ParseTuple (args, "O", &file)) + { + return NULL; + } + + if (!PyObject_HasAttrString (file, "read")) + { + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + read = PyObject_GetAttrString (file, "read"); + + if (!PyCallable_Check (read)) { + Py_XDECREF(read); + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + string = PyObject_CallObject (read, NULL); + Py_XDECREF(read); + + if (string == NULL) + { + return NULL; + } + + argtuple = PyTuple_Pack(1, string); + + result = JSONToObj (self, argtuple, kwargs); + + Py_XDECREF(argtuple); + Py_XDECREF(string); + + if (result == NULL) { + return NULL; + } + + return result; +} diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c new file mode 100644 index 00000000..f6cb5b98 --- /dev/null +++ b/pandas/src/ujson/python/objToJSON.c @@ -0,0 +1,2103 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the ESN Social Software AB nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms +* Copyright (c) 1988-1993 The Regents of the University of California. +* Copyright (c) 1994 Sun Microsystems, Inc. +*/ +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY + +#include "py_defines.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static PyObject* type_decimal; + +#define NPY_JSON_BUFSIZE 32768 + +static PyTypeObject* cls_dataframe; +static PyTypeObject* cls_series; +static PyTypeObject* cls_index; +static PyTypeObject* cls_nat; + +typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, void *outValue, size_t *_outLen); + +#if (PY_VERSION_HEX < 0x02050000) +typedef ssize_t Py_ssize_t; +#endif + +typedef struct __NpyArrContext +{ + PyObject *array; + char* dataptr; + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) + npy_intp dim; + npy_intp stride; + npy_intp ndim; + npy_intp index[NPY_MAXDIMS]; + int type_num; + PyArray_GetItemFunc* getitem; + + char** rowLabels; + char** columnLabels; +} NpyArrContext; + +typedef struct __TypeContext +{ + JSPFN_ITERBEGIN iterBegin; + JSPFN_ITEREND iterEnd; + JSPFN_ITERNEXT iterNext; + JSPFN_ITERGETNAME iterGetName; + JSPFN_ITERGETVALUE iterGetValue; + PFN_PyTypeToJSON PyTypeToJSON; + PyObject *newObj; + PyObject *dictObj; + Py_ssize_t index; + Py_ssize_t size; + PyObject *itemValue; + PyObject *itemName; + PyObject *attrList; + PyObject *iterator; + + JSINT64 longValue; + + char *cStr; + NpyArrContext *npyarr; + int transpose; + char** rowLabels; + char** columnLabels; + npy_intp rowLabelsLen; + npy_intp columnLabelsLen; +} TypeContext; + +typedef struct __PyObjectEncoder +{ + JSONObjectEncoder enc; + + // pass through the NpyArrContext when encoding multi-dimensional arrays + NpyArrContext* npyCtxtPassthru; + + // pass through a request for a specific encoding context + int requestType; + TypeContext* requestTypeContext; + + int datetimeIso; + PANDAS_DATETIMEUNIT datetimeUnit; + + // output format style for pandas data types + int outputFormat; + int originalOutputFormat; + + PyObject *defaultHandler; +} PyObjectEncoder; + +#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) + +struct PyDictIterState +{ + PyObject *keys; + size_t i; + size_t sz; +}; + +enum PANDAS_FORMAT +{ + SPLIT, + RECORDS, + INDEX, + COLUMNS, + VALUES +}; + +//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +#define PRINTMARK() + +// import_array() compat +#if (PY_VERSION_HEX >= 0x03000000) +void *initObjToJSON(void) +#else +void initObjToJSON(void) +#endif +{ + PyObject *mod_pandas; + PyObject *mod_tslib; + PyObject* mod_decimal = PyImport_ImportModule("decimal"); + type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); + Py_INCREF(type_decimal); + Py_DECREF(mod_decimal); + + PyDateTime_IMPORT; + + mod_pandas = PyImport_ImportModule("pandas"); + if (mod_pandas) + { + cls_dataframe = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "DataFrame"); + cls_index = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "Index"); + cls_series = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "Series"); + Py_DECREF(mod_pandas); + } + + mod_tslib = 
PyImport_ImportModule("pandas.tslib"); + if (mod_tslib) + { + cls_nat = (PyTypeObject*) PyObject_GetAttrString(mod_tslib, "NaTType"); + Py_DECREF(mod_tslib); + } + + /* Initialise numpy API and use 2/3 compatible return */ + import_array(); + return NUMPY_IMPORT_ARRAY_RETVAL; +} + +TypeContext* createTypeContext() +{ + TypeContext *pc; + + pc = PyObject_Malloc(sizeof(TypeContext)); + if (!pc) + { + PyErr_NoMemory(); + return NULL; + } + pc->newObj = NULL; + pc->dictObj = NULL; + pc->itemValue = NULL; + pc->itemName = NULL; + pc->attrList = NULL; + pc->index = 0; + pc->size = 0; + pc->longValue = 0; + pc->cStr = NULL; + pc->npyarr = NULL; + pc->rowLabels = NULL; + pc->columnLabels = NULL; + pc->transpose = 0; + pc->rowLabelsLen = 0; + pc->columnLabelsLen = 0; + + return pc; +} + +static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT32 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT64 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + *((JSINT64 *) outValue) = GET_TC(tc)->longValue; + return NULL; +} + +static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); + return NULL; +} + +static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((double *) outValue) = PyFloat_AsDouble (obj); + return NULL; +} + +static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *_outLen = PyString_GET_SIZE(obj); + return PyString_AS_STRING(obj); +} + +static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *newObj = PyUnicode_EncodeUTF8 (PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), NULL); + + GET_TC(tc)->newObj = newObj; + + *_outLen = PyString_GET_SIZE(newObj); + return PyString_AS_STRING(newObj); +} + +static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + int base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; + + if (((PyObjectEncoder*) tc->encoder)->datetimeIso) + { + PRINTMARK(); + *_outLen = (size_t) get_datetime_iso_8601_strlen(0, base); + GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen)); + if (!GET_TC(tc)->cStr) + { + PyErr_NoMemory(); + ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; + return NULL; + } + + if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 0, base, -1, NPY_UNSAFE_CASTING)) + { + PRINTMARK(); + *_outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; + } + else + { + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); + ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; + PyObject_Free(GET_TC(tc)->cStr); + return NULL; + } + } + else + { + PRINTMARK(); + *((JSINT64*)outValue) = pandas_datetimestruct_to_datetime(base, dts); + return NULL; + } +} + +static void *NpyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + pandas_datetimestruct dts; + 
PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *) _obj; + PRINTMARK(); + + pandas_datetime_to_datetimestruct(obj->obval, obj->obmeta.base, &dts); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); +} + +static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + pandas_datetimestruct dts; + PyObject *obj = (PyObject *) _obj; + + PRINTMARK(); + + if (!convert_pydatetime_to_datetimestruct(obj, &dts, NULL, 1)) + { + PRINTMARK(); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + } + else + { + if (!PyErr_Occurred()) + { + PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); + } + ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; + return NULL; + } +} + +static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + pandas_datetimestruct dts; + PyObject *obj = (PyObject *) _obj; + + PRINTMARK(); + + pandas_datetime_to_datetimestruct(PyLong_AsLongLong(obj), PANDAS_FR_ns, &dts); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); +} + +static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *str; + PyObject *tmp; + + str = PyObject_CallMethod(obj, "isoformat", NULL); + if (str == NULL) { + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, "Failed to convert time"); + return NULL; + } + if (PyUnicode_Check(str)) + { + tmp = str; + str = PyUnicode_AsUTF8String(str); + Py_DECREF(tmp); + } + outValue = (void *) PyString_AS_STRING (str); + *outLen = strlen ((char *) outValue); + Py_DECREF(str); + return outValue; +} + +void requestDateEncoding(PyObject* obj, PyObjectEncoder* pyenc) +{ + if (obj == Py_None) { + pyenc->requestType = JT_NULL; + return; + } + + if (pyenc->datetimeIso) + { + pyenc->requestType = JT_UTF8; + } + else + { + pyenc->requestType = JT_LONG; + } + pyenc->requestTypeContext = createTypeContext(); + pyenc->requestTypeContext->PyTypeToJSON = NpyDatetime64ToJSON; +} + + +//============================================================================= +// Numpy array iteration functions +//============================================================================= +int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) +{ + return 0; +} + +void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) +{ + PyArrayObject *obj; + NpyArrContext *npyarr; + + if (GET_TC(tc)->newObj) + { + obj = (PyArrayObject *) GET_TC(tc)->newObj; + } + else + { + obj = (PyArrayObject *) _obj; + } + + if (PyArray_SIZE(obj) > 0) + { + PRINTMARK(); + npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + GET_TC(tc)->npyarr = npyarr; + + if (!npyarr) + { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + npyarr->array = (PyObject*) obj; + npyarr->getitem = (PyArray_GetItemFunc*) PyArray_DESCR(obj)->f->getitem; + npyarr->dataptr = PyArray_DATA(obj); + npyarr->ndim = PyArray_NDIM(obj) - 1; + npyarr->curdim = 0; + npyarr->type_num = PyArray_DESCR(obj)->type_num; + + if (GET_TC(tc)->transpose) + { + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; + } + else + { + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; + } + + npyarr->columnLabels = GET_TC(tc)->columnLabels; + npyarr->rowLabels = 
GET_TC(tc)->rowLabels; + } + else + { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + } + PRINTMARK(); +} + +void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + + if (npyarr) + { + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + } + GET_TC(tc)->itemValue = NULL; + + PyObject_Free(npyarr); + } + PRINTMARK(); +} + +void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); +} + +void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + // finished this dimension, reset the data pointer + npyarr = GET_TC(tc)->npyarr; + npyarr->curdim--; + npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; + npyarr->stridedim -= npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->dataptr += npyarr->stride; + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } +} + +int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + + PRINTMARK(); + + npyarr = GET_TC(tc)->npyarr; + + if (PyErr_Occurred()) + { + return 0; + } + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + return 0; + } + +#if NPY_API_VERSION < 0x00000007 + if(PyTypeNum_ISDATETIME(npyarr->type_num)) + { + GET_TC(tc)->itemValue = PyArray_ToScalar(npyarr->dataptr, npyarr->array); + } + else + { + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + } +#else + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + if(PyTypeNum_ISDATETIME(npyarr->type_num)) + { + requestDateEncoding(GET_TC(tc)->itemValue, (PyObjectEncoder*) tc->encoder); + } +#endif + + npyarr->dataptr += npyarr->stride; + npyarr->index[npyarr->stridedim]++; + return 1; +} + +int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + + if (PyErr_Occurred()) + { + PRINTMARK(); + return 0; + } + + if (npyarr->curdim >= npyarr->ndim || npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + // innermost dimension, start retrieving item values + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + return NpyArr_iterNextItem(_obj, tc); + } + + // dig a dimension deeper + npyarr->index[npyarr->stridedim]++; + + npyarr->curdim++; + npyarr->stridedim += npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->index[npyarr->stridedim] = 0; + + ((PyObjectEncoder*) tc->encoder)->npyCtxtPassthru = npyarr; + GET_TC(tc)->itemValue = npyarr->array; + return 1; +} + +JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + JSONObjectEncoder* enc = (JSONObjectEncoder*) tc->encoder; + NpyArrContext* npyarr; + npy_intp idx; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) + { + idx = npyarr->index[npyarr->stridedim] - 1; + *outLen = strlen(npyarr->columnLabels[idx]); + memcpy(enc->offset, npyarr->columnLabels[idx], sizeof(char)*(*outLen)); + enc->offset += *outLen; + *outLen = 0; + return NULL; + } + else + { + idx = 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1; + *outLen = strlen(npyarr->rowLabels[idx]); + memcpy(enc->offset, npyarr->rowLabels[idx], sizeof(char)*(*outLen)); + enc->offset += *outLen; + *outLen = 0; + return NULL; + } +} + +//============================================================================= +// Tuple iteration functions +// itemValue is borrowed reference, no ref counting +//============================================================================= +void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyTuple_GET_SIZE( (PyObject *) obj); + GET_TC(tc)->itemValue = NULL; +} + +int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + PyObject *item; + + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + return 0; + } + + item = PyTuple_GET_ITEM (obj, GET_TC(tc)->index); + + GET_TC(tc)->itemValue = item; + GET_TC(tc)->index ++; + return 1; +} + +void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// Iterator iteration functions +// itemValue is borrowed reference, no ref counting +//============================================================================= +void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->itemValue = NULL; + GET_TC(tc)->iterator = PyObject_GetIter(obj); +} + +int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + PyObject *item; + + if (GET_TC(tc)->itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + item = PyIter_Next(GET_TC(tc)->iterator); + + if (item == NULL) + { + return 0; + } + + GET_TC(tc)->itemValue = item; + return 1; +} + +void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + if (GET_TC(tc)->iterator) + { + Py_DECREF(GET_TC(tc)->iterator); + GET_TC(tc)->iterator = NULL; + } +} + +JSOBJ Iter_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Iter_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// Dir iteration functions +// itemName ref is borrowed from PyObject_Dir (attrList). No refcount +// itemValue ref is from PyObject_GetAttr. 
Ref counted +//============================================================================= +void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->attrList = PyObject_Dir(obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); + PRINTMARK(); +} + +void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + + Py_DECREF( (PyObject *) GET_TC(tc)->attrList); + PRINTMARK(); +} + +int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *itemValue = GET_TC(tc)->itemValue; + PyObject *itemName = GET_TC(tc)->itemName; + PyObject* attr; + PyObject* attrName; + char* attrStr; + + if (itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = itemValue = NULL; + } + + if (itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = itemName = NULL; + } + + for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index ++) + { + attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); +#if PY_MAJOR_VERSION >= 3 + attr = PyUnicode_AsUTF8String(attrName); +#else + attr = attrName; + Py_INCREF(attr); +#endif + attrStr = PyString_AS_STRING(attr); + + if (attrStr[0] == '_') + { + PRINTMARK(); + Py_DECREF(attr); + continue; + } + + itemValue = PyObject_GetAttr(obj, attrName); + if (itemValue == NULL) + { + PyErr_Clear(); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + if (PyCallable_Check(itemValue)) + { + Py_DECREF(itemValue); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index ++; + + PRINTMARK(); + itemName = attr; + break; + } + + if (itemName == NULL) + { + GET_TC(tc)->index = GET_TC(tc)->size; + GET_TC(tc)->itemValue = NULL; + return 0; + } + + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index ++; + + PRINTMARK(); + return 1; +} + +JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + PRINTMARK(); + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + + +//============================================================================= +// List iteration functions +// itemValue is borrowed from object (which is list). 
No refcounting +//============================================================================= +void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE( (PyObject *) obj); +} + +int List_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->itemValue = PyList_GET_ITEM (obj, GET_TC(tc)->index); + GET_TC(tc)->index ++; + return 1; +} + +void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// pandas Index iteration functions +//============================================================================= +void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->cStr) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->cStr) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); +} + +JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; +} + +//============================================================================= +// pandas Series iteration functions +//============================================================================= +void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series + if (!GET_TC(tc)->cStr) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->cStr) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char)*6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + enc->outputFormat = enc->originalOutputFormat; + PRINTMARK(); +} + +JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return 
GET_TC(tc)->itemValue; +} + +char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; +} + +//============================================================================= +// pandas DataFrame iteration functions +//============================================================================= +void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->cStr) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->cStr) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->cStr, "columns", sizeof(char)*8); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char)*6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + enc->outputFormat = enc->originalOutputFormat; + PRINTMARK(); +} + +JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; +} + +//============================================================================= +// Dict iteration functions +// itemName might converted to string (Python_Str). Do refCounting +// itemValue is borrowed from object (which is dict). 
No refCounting +//============================================================================= +void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + PRINTMARK(); +} + +int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ +#if PY_MAJOR_VERSION >= 3 + PyObject* itemNameTmp; +#endif + + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + + + if (!PyDict_Next ( (PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) + { + PRINTMARK(); + return 0; + } + + if (PyUnicode_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); + } + else + if (!PyString_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); +#if PY_MAJOR_VERSION >= 3 + itemNameTmp = GET_TC(tc)->itemName; + GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); + Py_DECREF(itemNameTmp); +#endif + } + else + { + Py_INCREF(GET_TC(tc)->itemName); + } + PRINTMARK(); + return 1; +} + +void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + Py_DECREF(GET_TC(tc)->dictObj); + PRINTMARK(); +} + +JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + +void NpyArr_freeLabels(char** labels, npy_intp len) +{ + npy_intp i; + + if (labels) + { + for (i = 0; i < len; i++) + { + PyObject_Free(labels[i]); + } + PyObject_Free(labels); + } +} + +char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_intp num) +{ + // NOTE this function steals a reference to labels. 
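  // Illustrative sketch only: each label is rendered once, up front, into a
  // ready-to-emit JSON object key (quoted if needed and terminated with ':'),
  // so the row/column writers can simply prepend the cached string. At the
  // Python level this is what backs the labelled orients; for a hypothetical
  // DataFrame df with index ['a', 'b'] and a single column 'x' holding [1, 2]:
  //
  //   df.to_json(orient='index')    # -> '{"a":{"x":1},"b":{"x":2}}'
  //   df.to_json(orient='columns')  # -> '{"x":{"a":1,"b":2}}'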
+ PyArrayObject* labelsTmp = NULL; + PyObject* item = NULL; + npy_intp i, stride, len, need_quotes; + char** ret; + char *dataptr, *cLabel, *origend, *origst, *origoffset; + char labelBuffer[NPY_JSON_BUFSIZE]; + PyArray_GetItemFunc* getitem; + int type_num; + PRINTMARK(); + + if (PyArray_SIZE(labels) < num) + { + PyErr_SetString(PyExc_ValueError, "Label array sizes do not match corresponding data shape"); + Py_DECREF(labels); + return 0; + } + + ret = PyObject_Malloc(sizeof(char*)*num); + if (!ret) + { + PyErr_NoMemory(); + Py_DECREF(labels); + return 0; + } + + for (i = 0; i < num; i++) + { + ret[i] = NULL; + } + + origst = enc->start; + origend = enc->end; + origoffset = enc->offset; + + stride = PyArray_STRIDE(labels, 0); + dataptr = PyArray_DATA(labels); + getitem = (PyArray_GetItemFunc*) PyArray_DESCR(labels)->f->getitem; + type_num = PyArray_DESCR(labels)->type_num; + + for (i = 0; i < num; i++) + { +#if NPY_API_VERSION < 0x00000007 + if(PyTypeNum_ISDATETIME(type_num)) + { + item = PyArray_ToScalar(dataptr, labels); + } + else + { + item = getitem(dataptr, labels); + } +#else + item = getitem(dataptr, labels); + if(PyTypeNum_ISDATETIME(type_num)) + { + requestDateEncoding(item, (PyObjectEncoder*) enc); + } +#endif + if (!item) + { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); + Py_DECREF(item); + + if (PyErr_Occurred() || enc->errorMsg) + { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + need_quotes = ((*cLabel) != '"'); + len = enc->offset - cLabel + 1 + 2 * need_quotes; + ret[i] = PyObject_Malloc(sizeof(char)*len); + + if (!ret[i]) + { + PyErr_NoMemory(); + ret = 0; + break; + } + + if (need_quotes) + { + ret[i][0] = '"'; + memcpy(ret[i]+1, cLabel, sizeof(char)*(len-4)); + ret[i][len-3] = '"'; + } + else + { + memcpy(ret[i], cLabel, sizeof(char)*(len-2)); + } + ret[i][len-2] = ':'; + ret[i][len-1] = '\0'; + dataptr += stride; + } + + enc->start = origst; + enc->end = origend; + enc->offset = origoffset; + + Py_DECREF(labels); + return ret; +} + +void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) +{ + PyObject *obj, *exc, *toDictFunc, *defaultObj; + TypeContext *pc; + PyObjectEncoder *enc; + double val; + PRINTMARK(); + if (!_obj) { + tc->type = JT_INVALID; + return; + } + + obj = (PyObject*) _obj; + enc = (PyObjectEncoder*) tc->encoder; + + if (enc->requestType) + { + PRINTMARK(); + tc->type = enc->requestType; + tc->prv = enc->requestTypeContext; + + enc->requestType = 0; + enc->requestTypeContext = NULL; + return; + } + + pc = createTypeContext(); + if (!pc) + { + tc->type = JT_INVALID; + return; + } + tc->prv = pc; + + if (PyIter_Check(obj) || PyArray_Check(obj)) + { + PRINTMARK(); + goto ISITERABLE; + } + + if (PyBool_Check(obj)) + { + PRINTMARK(); + tc->type = (obj == Py_True) ? 
JT_TRUE : JT_FALSE; + return; + } + else + if (PyLong_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + GET_TC(tc)->longValue = PyLong_AsLongLong(obj); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + goto INVALID; + } + + return; + } + else + if (PyInt_Check(obj)) + { + PRINTMARK(); + +#ifdef _LP64 + pc->PyTypeToJSON = PyIntToINT64; tc->type = JT_LONG; +#else + pc->PyTypeToJSON = PyIntToINT32; tc->type = JT_INT; +#endif + return; + } + else + if (PyFloat_Check(obj)) + { + PRINTMARK(); + val = PyFloat_AS_DOUBLE (obj); + if (npy_isnan(val) || npy_isinf(val)) + { + tc->type = JT_NULL; + } + else + { + pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; + } + return; + } + else + if (PyString_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyStringToUTF8; tc->type = JT_UTF8; + return; + } + else + if (PyUnicode_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyUnicodeToUTF8; tc->type = JT_UTF8; + return; + } + else + if (obj == Py_None) + { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + else + if (PyObject_IsInstance(obj, type_decimal)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; + return; + } + else + if (PyDateTime_Check(obj) || PyDate_Check(obj)) + { + if (PyObject_TypeCheck(obj, cls_nat)) + { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + + PRINTMARK(); + pc->PyTypeToJSON = PyDateTimeToJSON; + if (enc->datetimeIso) + { + PRINTMARK(); + tc->type = JT_UTF8; + } + else + { + PRINTMARK(); + tc->type = JT_LONG; + } + return; + } + else + if (PyTime_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyTimeToJSON; tc->type = JT_UTF8; + return; + } + else + if (PyArray_IsScalar(obj, Datetime)) + { + PRINTMARK(); + if (((PyDatetimeScalarObject*) obj)->obval == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + + PRINTMARK(); + pc->PyTypeToJSON = NpyDateTimeToJSON; + if (enc->datetimeIso) + { + tc->type = JT_UTF8; + } + else + { + tc->type = JT_LONG; + } + return; + } + else + if (PyArray_IsScalar(obj, Integer)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + goto INVALID; + } + + return; + } + else + if (PyArray_IsScalar(obj, Bool)) + { + PRINTMARK(); + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_BOOL)); + tc->type = (GET_TC(tc)->longValue) ? 
JT_TRUE : JT_FALSE; + return; + } + else + if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) + { + PRINTMARK(); + pc->PyTypeToJSON = NpyFloatToDOUBLE; tc->type = JT_DOUBLE; + return; + } + +ISITERABLE: + + if (PyObject_TypeCheck(obj, cls_index)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Index_iterBegin; + pc->iterEnd = Index_iterEnd; + pc->iterNext = Index_iterNext; + pc->iterGetValue = Index_iterGetValue; + pc->iterGetName = Index_iterGetName; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, cls_series)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Series_iterBegin; + pc->iterEnd = Series_iterEnd; + pc->iterNext = Series_iterNext; + pc->iterGetValue = Series_iterGetValue; + pc->iterGetName = Series_iterGetName; + return; + } + + pc->newObj = PyObject_GetAttrString(obj, "values"); + + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + goto INVALID; + } + } + else + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyArray_Check(obj)) + { + if (enc->npyCtxtPassthru) + { + PRINTMARK(); + pc->npyarr = enc->npyCtxtPassthru; + tc->type = (pc->npyarr->columnLabels ? 
JT_OBJECT : JT_ARRAY); + pc->iterBegin = NpyArrPassThru_iterBegin; + pc->iterEnd = NpyArrPassThru_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + enc->npyCtxtPassthru = NULL; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, cls_dataframe)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } + + PRINTMARK(); + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + if (enc->outputFormat == VALUES) + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + else + if (enc->outputFormat == RECORDS) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + goto INVALID; + } + } + else + if (enc->outputFormat == INDEX) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "index"), "values"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + goto INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + } + else + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + goto INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "index"), "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + pc->transpose = 1; + } + return; + } + else + if (PyDict_Check(obj)) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); + + return; + } + else + if (PyList_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } + else + if (PyTuple_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = 
Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } + else + if (PyAnySet_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Iter_iterBegin; + pc->iterEnd = Iter_iterEnd; + pc->iterNext = Iter_iterNext; + pc->iterGetValue = Iter_iterGetValue; + pc->iterGetName = Iter_iterGetName; + return; + } + + toDictFunc = PyObject_GetAttrString(obj, "toDict"); + + if (toDictFunc) + { + PyObject* tuple = PyTuple_New(0); + PyObject* toDictResult = PyObject_Call(toDictFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toDictFunc); + + if (toDictResult == NULL) + { + PyErr_Clear(); + tc->type = JT_NULL; + return; + } + + if (!PyDict_Check(toDictResult)) + { + Py_DECREF(toDictResult); + tc->type = JT_NULL; + return; + } + + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = toDictResult; + return; + } + + PyErr_Clear(); + + if (enc->defaultHandler) + { + PRINTMARK(); + defaultObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); + if (defaultObj == NULL || PyErr_Occurred()) + { + if (!PyErr_Occurred()) + { + PyErr_SetString(PyExc_TypeError, "Failed to execute default handler"); + } + goto INVALID; + } + encode (defaultObj, enc, NULL, 0); + Py_DECREF(defaultObj); + goto INVALID; + } + + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dir_iterBegin; + pc->iterEnd = Dir_iterEnd; + pc->iterNext = Dir_iterNext; + pc->iterGetValue = Dir_iterGetValue; + pc->iterGetName = Dir_iterGetName; + return; + +INVALID: + tc->type = JT_INVALID; + PyObject_Free(tc->prv); + tc->prv = NULL; + return; +} + +void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + Py_XDECREF(GET_TC(tc)->newObj); + NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); + + PyObject_Free(GET_TC(tc)->cStr); + PyObject_Free(tc->prv); + tc->prv = NULL; +} + +const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) +{ + return GET_TC(tc)->PyTypeToJSON (obj, tc, NULL, _outLen); +} + +JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT64 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + +JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT32 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + +double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) +{ + double ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + +static void Object_releaseObject(JSOBJ _obj) +{ + Py_DECREF( (PyObject *) _obj); +} + +void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterBegin(obj, tc); +} + +int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterNext(obj, tc); +} + +void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterEnd(obj, tc); +} + +JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterGetValue(obj, tc); +} + +char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return GET_TC(tc)->iterGetName(obj, tc, outLen); +} + +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", 
"encode_html_chars", "orient", "date_unit", "iso_dates", "default_handler", NULL}; + + char buffer[65536]; + char *ret; + PyObject *newobj; + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; + int idoublePrecision = 10; // default double precision setting + PyObject *oencodeHTMLChars = NULL; + char *sOrient = NULL; + char *sdateFormat = NULL; + PyObject *oisoDates = 0; + PyObject *odefHandler = 0; + + PyObjectEncoder pyEncoder = + { + { + Object_beginTypeContext, + Object_endTypeContext, + Object_getStringValue, + Object_getLongValue, + Object_getIntValue, + Object_getDoubleValue, + Object_iterBegin, + Object_iterNext, + Object_iterEnd, + Object_iterGetValue, + Object_iterGetName, + Object_releaseObject, + PyObject_Malloc, + PyObject_Realloc, + PyObject_Free, + -1, //recursionMax + idoublePrecision, + 1, //forceAscii + 0, //encodeHTMLChars + } + }; + JSONObjectEncoder* encoder = (JSONObjectEncoder*) &pyEncoder; + + pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.requestType = 0; + pyEncoder.requestTypeContext = NULL; + pyEncoder.datetimeIso = 0; + pyEncoder.datetimeUnit = PANDAS_FR_ms; + pyEncoder.outputFormat = COLUMNS; + pyEncoder.defaultHandler = 0; + + PRINTMARK(); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOO", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, &oisoDates, &odefHandler)) + { + return NULL; + } + + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) + { + encoder->forceASCII = 0; + } + + if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) + { + encoder->encodeHTMLChars = 1; + } + + if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) + { + PyErr_Format ( + PyExc_ValueError, + "Invalid value '%d' for option 'double_precision', max is '%u'", + idoublePrecision, + JSON_DOUBLE_MAX_DECIMALS); + return NULL; + } + encoder->doublePrecision = idoublePrecision; + + if (sOrient != NULL) + { + if (strcmp(sOrient, "records") == 0) + { + pyEncoder.outputFormat = RECORDS; + } + else + if (strcmp(sOrient, "index") == 0) + { + pyEncoder.outputFormat = INDEX; + } + else + if (strcmp(sOrient, "split") == 0) + { + pyEncoder.outputFormat = SPLIT; + } + else + if (strcmp(sOrient, "values") == 0) + { + pyEncoder.outputFormat = VALUES; + } + else + if (strcmp(sOrient, "columns") != 0) + { + PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'orient'", sOrient); + return NULL; + } + } + + if (sdateFormat != NULL) + { + if (strcmp(sdateFormat, "s") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_s; + } + else + if (strcmp(sdateFormat, "ms") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_ms; + } + else + if (strcmp(sdateFormat, "us") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_us; + } + else + if (strcmp(sdateFormat, "ns") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_ns; + } + else + { + PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'date_unit'", sdateFormat); + return NULL; + } + } + + if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) + { + pyEncoder.datetimeIso = 1; + } + + + if (odefHandler != NULL && odefHandler != Py_None) + { + if (!PyCallable_Check(odefHandler)) + { + PyErr_SetString (PyExc_TypeError, "Default handler is not callable"); + return NULL; + } + pyEncoder.defaultHandler = odefHandler; + } + + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; + PRINTMARK(); + ret = JSON_EncodeObject (oinput, encoder, buffer, sizeof (buffer)); + PRINTMARK(); + + if (PyErr_Occurred()) + { + PRINTMARK(); + return NULL; + } + + if 
(encoder->errorMsg) + { + PRINTMARK(); + if (ret != buffer) + { + encoder->free (ret); + } + + PyErr_Format (PyExc_OverflowError, "%s", encoder->errorMsg); + return NULL; + } + + newobj = PyString_FromString (ret); + + if (ret != buffer) + { + encoder->free (ret); + } + + PRINTMARK(); + + return newobj; +} + +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *data; + PyObject *file; + PyObject *string; + PyObject *write; + PyObject *argtuple; + + PRINTMARK(); + + if (!PyArg_ParseTuple (args, "OO", &data, &file)) + { + return NULL; + } + + if (!PyObject_HasAttrString (file, "write")) + { + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + write = PyObject_GetAttrString (file, "write"); + + if (!PyCallable_Check (write)) + { + Py_XDECREF(write); + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + argtuple = PyTuple_Pack(1, data); + + string = objToJSON (self, argtuple, kwargs); + + if (string == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(argtuple); + + argtuple = PyTuple_Pack (1, string); + if (argtuple == NULL) + { + Py_XDECREF(write); + return NULL; + } + if (PyObject_CallObject (write, argtuple) == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(write); + Py_DECREF(argtuple); + Py_XDECREF(string); + + PRINTMARK(); + + Py_RETURN_NONE; +} diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/src/ujson/python/py_defines.h new file mode 100644 index 00000000..31291421 --- /dev/null +++ b/pandas/src/ujson/python/py_defines.h @@ -0,0 +1,52 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. 
+ +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + * Copyright (c) 1988-1993 The Regents of the University of California. + * Copyright (c) 1994 Sun Microsystems, Inc. +*/ + +#include + +#if PY_MAJOR_VERSION >= 3 + +#define PyInt_Check PyLong_Check +#define PyInt_AS_LONG PyLong_AsLong +#define PyInt_FromLong PyLong_FromLong + +#define PyString_Check PyBytes_Check +#define PyString_GET_SIZE PyBytes_GET_SIZE +#define PyString_AS_STRING PyBytes_AS_STRING + +#define PyString_FromString PyUnicode_FromString + +#endif diff --git a/pandas/src/ujson/python/ujson.c b/pandas/src/ujson/python/ujson.c new file mode 100644 index 00000000..2eb8a80c --- /dev/null +++ b/pandas/src/ujson/python/ujson.c @@ -0,0 +1,112 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the ESN Social Software AB nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms +* Copyright (c) 1988-1993 The Regents of the University of California. +* Copyright (c) 1994 Sun Microsystems, Inc. +*/ + +#include "py_defines.h" +#include "version.h" + +/* objToJSON */ +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs); +void initObjToJSON(void); + +/* JSONToObj */ +PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs); + +/* objToJSONFile */ +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs); + +/* JSONFileToObj */ +PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs); + + +#define ENCODER_HELP_TEXT "Use ensure_ascii=false to output UTF-8. Pass in double_precision to alter the maximum digit precision of doubles. Set encode_html_chars=True to encode < > & as unicode escape sequences." 
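/*
 * Usage sketch (illustrative; the extension is built into pandas and is
 * typically importable as pandas.json, though the exact import path depends
 * on the build):
 *
 *   >>> import pandas.json as json
 *   >>> json.dumps({'a': [1, 2.5, None]})
 *   '{"a":[1,2.5,null]}'
 *   >>> json.loads('{"a":[1,2.5,null]}')
 *   {'a': [1, 2.5, None]}
 *
 * encode/dumps and decode/loads in the method table below are aliases for
 * the same C entry points (objToJSON and JSONToObj respectively).
 */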
+ +static PyMethodDef ujsonMethods[] = { + {"encode", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. " ENCODER_HELP_TEXT}, + {"decode", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure. Use precise_float=True to use high precision float decoder."}, + {"dumps", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. " ENCODER_HELP_TEXT}, + {"loads", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure. Use precise_float=True to use high precision float decoder."}, + {"dump", (PyCFunction) objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. " ENCODER_HELP_TEXT}, + {"load", (PyCFunction) JSONFileToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as file to dict object structure. Use precise_float=True to use high precision float decoder."}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + +#if PY_MAJOR_VERSION >= 3 + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "_pandasujson", + 0, /* m_doc */ + -1, /* m_size */ + ujsonMethods, /* m_methods */ + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; + +#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void) +#define PYMODULE_CREATE() PyModule_Create(&moduledef) +#define MODINITERROR return NULL + +#else + +#define PYMODINITFUNC PyMODINIT_FUNC initjson(void) +#define PYMODULE_CREATE() Py_InitModule("json", ujsonMethods) +#define MODINITERROR return + +#endif + +PYMODINITFUNC +{ + PyObject *module; + PyObject *version_string; + + initObjToJSON(); + module = PYMODULE_CREATE(); + + if (module == NULL) + { + MODINITERROR; + } + + version_string = PyString_FromString (UJSON_VERSION); + PyModule_AddObject (module, "__version__", version_string); + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif +} diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h new file mode 100644 index 00000000..0ccfbfe7 --- /dev/null +++ b/pandas/src/ujson/python/version.h @@ -0,0 +1,38 @@ +/* +Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + * Copyright (c) 1988-1993 The Regents of the University of California. + * Copyright (c) 1994 Sun Microsystems, Inc. +*/ + +#define UJSON_VERSION "1.33" diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd new file mode 100644 index 00000000..cc1921e6 --- /dev/null +++ b/pandas/src/util.pxd @@ -0,0 +1,84 @@ +from numpy cimport ndarray +cimport numpy as cnp +cimport cpython + +cdef extern from "numpy_helper.h": + inline void set_array_owndata(ndarray ao) + inline void set_array_not_contiguous(ndarray ao) + + inline int is_integer_object(object) + inline int is_float_object(object) + inline int is_complex_object(object) + inline int is_bool_object(object) + inline int is_string_object(object) + inline int is_datetime64_object(object) + inline int is_timedelta64_object(object) + inline int assign_value_1d(ndarray, Py_ssize_t, object) except -1 + inline cnp.int64_t get_nat() + inline object get_value_1d(ndarray, Py_ssize_t) + inline int floatify(object, double*) except -1 + inline char *get_c_string(object) + inline object char_to_string(char*) + inline void transfer_object_column(char *dst, char *src, size_t stride, + size_t length) + object sarr_from_data(cnp.dtype, int length, void* data) + +cdef inline object get_value_at(ndarray arr, object loc): + cdef: + Py_ssize_t i, sz + void* data_ptr + if is_float_object(loc): + casted = int(loc) + if casted == loc: + loc = casted + i = loc + sz = cnp.PyArray_SIZE(arr) + + if i < 0 and sz > 0: + i += sz + elif i >= sz or sz == 0: + raise IndexError('index out of bounds') + + return get_value_1d(arr, i) + +cdef inline set_value_at(ndarray arr, object loc, object value): + cdef: + Py_ssize_t i, sz + if is_float_object(loc): + casted = int(loc) + if casted == loc: + loc = casted + i = loc + sz = cnp.PyArray_SIZE(arr) + + if i < 0: + i += sz + elif i >= sz: + raise IndexError('index out of bounds') + + assign_value_1d(arr, i, value) + +cdef inline int is_contiguous(ndarray arr): + return cnp.PyArray_CHKFLAGS(arr, cnp.NPY_C_CONTIGUOUS) + +cdef inline is_array(object o): + return cnp.PyArray_Check(o) + + +cdef inline bint _checknull(object val): + try: + return val is None or (cpython.PyFloat_Check(val) and val != val) + except ValueError: + return False + +cdef inline bint _checknull_old(object val): + import numpy as np + cdef double INF = np.inf + cdef double NEGINF = -INF + try: + return val is None or val != val or val == INF or val == NEGINF + except ValueError: + return False + +cdef inline bint _checknan(object val): + return not cnp.PyArray_Check(val) and val != val diff --git a/pandas/stats/__init__.py b/pandas/stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/pandas/stats/api.py b/pandas/stats/api.py new file mode 100644 index 00000000..3732f9ed --- /dev/null +++ b/pandas/stats/api.py @@ -0,0 +1,9 @@ +""" +Common namespace of statistical functions +""" + +# pylint: disable-msg=W0611,W0614,W0401 + +from pandas.stats.moments import * +from pandas.stats.interface import ols +from pandas.stats.fama_macbeth import fama_macbeth diff --git a/pandas/stats/common.py b/pandas/stats/common.py new file mode 100644 index 00000000..c30b3e7a --- /dev/null +++ b/pandas/stats/common.py @@ -0,0 +1,41 @@ + +_WINDOW_TYPES = { + 0: 'full_sample', + 1: 'rolling', + 2: 'expanding' +} +# also allow 'rolling' as key +_WINDOW_TYPES.update((v, v) for k,v in list(_WINDOW_TYPES.items())) +_ADDITIONAL_CLUSTER_TYPES = set(("entity", "time")) + +def _get_cluster_type(cluster_type): + # this was previous behavior + if cluster_type is None: + return cluster_type + try: + return _get_window_type(cluster_type) + except ValueError: + final_type = str(cluster_type).lower().replace("_", " ") + if final_type in _ADDITIONAL_CLUSTER_TYPES: + return final_type + raise ValueError('Unrecognized cluster type: %s' % cluster_type) + +def _get_window_type(window_type): + # e.g., 0, 1, 2 + final_type = _WINDOW_TYPES.get(window_type) + # e.g., 'full_sample' + final_type = final_type or _WINDOW_TYPES.get(str(window_type).lower().replace(" ", "_")) + if final_type is None: + raise ValueError('Unrecognized window type: %s' % window_type) + return final_type + +def banner(text, width=80): + """ + + """ + toFill = width - len(text) + + left = toFill // 2 + right = toFill - left + + return '%s%s%s' % ('-' * left, text, '-' * right) diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py new file mode 100644 index 00000000..38fb5894 --- /dev/null +++ b/pandas/stats/fama_macbeth.py @@ -0,0 +1,226 @@ +from pandas.core.base import StringMixin +from pandas.compat import StringIO, range + +import numpy as np + +from pandas.core.api import Series, DataFrame +import pandas.stats.common as common +from pandas.util.decorators import cache_readonly + + +def fama_macbeth(**kwargs): + """Runs Fama-MacBeth regression. 
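
    Dispatches on window_type: with no window_type a full-sample FamaMacBeth
    fit is returned, otherwise a MovingFamaMacBeth. A minimal, hypothetical
    call (y and x follow the same panel conventions as panel OLS; see the
    parameters below):

    # y: DataFrame of returns, x: dict of factor DataFrames
    result = fama_macbeth(y=y, x=x, nw_lags_beta=2)
    result.summary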
+ + Parameters + ---------- + Takes the same arguments as a panel OLS, in addition to: + + nw_lags_beta: int + Newey-West adjusts the betas by the given lags + """ + window_type = kwargs.get('window_type') + if window_type is None: + klass = FamaMacBeth + else: + klass = MovingFamaMacBeth + + return klass(**kwargs) + + +class FamaMacBeth(StringMixin): + def __init__(self, y, x, intercept=True, nw_lags=None, + nw_lags_beta=None, + entity_effects=False, time_effects=False, x_effects=None, + cluster=None, dropped_dummies={}, verbose=False): + self._nw_lags_beta = nw_lags_beta + + from pandas.stats.plm import MovingPanelOLS + self._ols_result = MovingPanelOLS( + y=y, x=x, window_type='rolling', window=1, + intercept=intercept, + nw_lags=nw_lags, entity_effects=entity_effects, + time_effects=time_effects, x_effects=x_effects, cluster=cluster, + dropped_dummies=dropped_dummies, verbose=verbose) + + self._cols = self._ols_result._x.columns + + @cache_readonly + def _beta_raw(self): + return self._ols_result._beta_raw + + @cache_readonly + def _stats(self): + return _calc_t_stat(self._beta_raw, self._nw_lags_beta) + + @cache_readonly + def _mean_beta_raw(self): + return self._stats[0] + + @cache_readonly + def _std_beta_raw(self): + return self._stats[1] + + @cache_readonly + def _t_stat_raw(self): + return self._stats[2] + + def _make_result(self, result): + return Series(result, index=self._cols) + + @cache_readonly + def mean_beta(self): + return self._make_result(self._mean_beta_raw) + + @cache_readonly + def std_beta(self): + return self._make_result(self._std_beta_raw) + + @cache_readonly + def t_stat(self): + return self._make_result(self._t_stat_raw) + + @cache_readonly + def _results(self): + return { + 'mean_beta': self._mean_beta_raw, + 'std_beta': self._std_beta_raw, + 't_stat': self._t_stat_raw, + } + + @cache_readonly + def _coef_table(self): + buffer = StringIO() + buffer.write('%13s %13s %13s %13s %13s %13s\n' % + ('Variable', 'Beta', 'Std Err', 't-stat', 'CI 2.5%', 'CI 97.5%')) + template = '%13s %13.4f %13.4f %13.2f %13.4f %13.4f\n' + + for i, name in enumerate(self._cols): + if i and not (i % 5): + buffer.write('\n' + common.banner('')) + + mean_beta = self._results['mean_beta'][i] + std_beta = self._results['std_beta'][i] + t_stat = self._results['t_stat'][i] + ci1 = mean_beta - 1.96 * std_beta + ci2 = mean_beta + 1.96 * std_beta + + values = '(%s)' % name, mean_beta, std_beta, t_stat, ci1, ci2 + + buffer.write(template % values) + + if self._nw_lags_beta is not None: + buffer.write('\n') + buffer.write('*** The Std Err, t-stat are Newey-West ' + 'adjusted with Lags %5d\n' % self._nw_lags_beta) + + return buffer.getvalue() + + def __unicode__(self): + return self.summary + + @cache_readonly + def summary(self): + template = """ +----------------------Summary of Fama-MacBeth Analysis------------------------- + +Formula: Y ~ %(formulaRHS)s +# betas : %(nu)3d + +----------------------Summary of Estimated Coefficients------------------------ +%(coefTable)s +--------------------------------End of Summary--------------------------------- +""" + params = { + 'formulaRHS': ' + '.join(self._cols), + 'nu': len(self._beta_raw), + 'coefTable': self._coef_table, + } + + return template % params + + +class MovingFamaMacBeth(FamaMacBeth): + def __init__(self, y, x, window_type='rolling', window=10, + intercept=True, nw_lags=None, nw_lags_beta=None, + entity_effects=False, time_effects=False, x_effects=None, + cluster=None, dropped_dummies={}, verbose=False): + self._window_type = 
common._get_window_type(window_type) + self._window = window + + FamaMacBeth.__init__( + self, y=y, x=x, intercept=intercept, + nw_lags=nw_lags, nw_lags_beta=nw_lags_beta, + entity_effects=entity_effects, time_effects=time_effects, + x_effects=x_effects, cluster=cluster, + dropped_dummies=dropped_dummies, verbose=verbose) + + self._index = self._ols_result._index + self._T = len(self._index) + + @property + def _is_rolling(self): + return self._window_type == 'rolling' + + def _calc_stats(self): + mean_betas = [] + std_betas = [] + t_stats = [] + + # XXX + + mask = self._ols_result._rolling_ols_call[2] + obs_total = mask.astype(int).cumsum() + + start = self._window - 1 + betas = self._beta_raw + for i in range(start, self._T): + if self._is_rolling: + begin = i - start + else: + begin = 0 + + B = betas[max(obs_total[begin] - 1, 0): obs_total[i]] + mean_beta, std_beta, t_stat = _calc_t_stat(B, self._nw_lags_beta) + mean_betas.append(mean_beta) + std_betas.append(std_beta) + t_stats.append(t_stat) + + return np.array([mean_betas, std_betas, t_stats]) + + _stats = cache_readonly(_calc_stats) + + def _make_result(self, result): + return DataFrame(result, index=self._result_index, columns=self._cols) + + @cache_readonly + def _result_index(self): + mask = self._ols_result._rolling_ols_call[2] + # HACK XXX + return self._index[mask.cumsum() >= self._window] + + @cache_readonly + def _results(self): + return { + 'mean_beta': self._mean_beta_raw[-1], + 'std_beta': self._std_beta_raw[-1], + 't_stat': self._t_stat_raw[-1], + } + + +def _calc_t_stat(beta, nw_lags_beta): + N = len(beta) + B = beta - beta.mean(0) + C = np.dot(B.T, B) / N + + if nw_lags_beta is not None: + for i in range(nw_lags_beta + 1): + + cov = np.dot(B[i:].T, B[:(N - i)]) / N + weight = i / (nw_lags_beta + 1) + C += 2 * (1 - weight) * cov + + mean_beta = beta.mean(0) + std_beta = np.sqrt(np.diag(C)) / np.sqrt(N) + t_stat = mean_beta / std_beta + + return mean_beta, std_beta, t_stat diff --git a/pandas/stats/interface.py b/pandas/stats/interface.py new file mode 100644 index 00000000..6d7bf329 --- /dev/null +++ b/pandas/stats/interface.py @@ -0,0 +1,135 @@ +from pandas.core.api import Series, DataFrame, Panel, MultiIndex +from pandas.stats.ols import OLS, MovingOLS +from pandas.stats.plm import PanelOLS, MovingPanelOLS, NonPooledPanelOLS +import pandas.stats.common as common + + +def ols(**kwargs): + """Returns the appropriate OLS object depending on whether you need + simple or panel OLS, and a full-sample or rolling/expanding OLS. + + Will be a normal linear regression or a (pooled) panel regression depending + on the type of the inputs: + + y : Series, x : DataFrame -> OLS + y : Series, x : dict of DataFrame -> OLS + y : DataFrame, x : DataFrame -> PanelOLS + y : DataFrame, x : dict of DataFrame/Panel -> PanelOLS + y : Series with MultiIndex, x : Panel/DataFrame + MultiIndex -> PanelOLS + + Parameters + ---------- + y: Series or DataFrame + See above for types + x: Series, DataFrame, dict of Series, dict of DataFrame, Panel + weights : Series or ndarray + The weights are presumed to be (proportional to) the inverse of the + variance of the observations. That is, if the variables are to be + transformed by 1/sqrt(W) you must supply weights = 1/W + intercept: bool + True if you want an intercept. Defaults to True. + nw_lags: None or int + Number of Newey-West lags. Defaults to None. + nw_overlap: bool + Whether there are overlaps in the NW lags. Defaults to False. 
+ window_type: {'full sample', 'rolling', 'expanding'} + 'full sample' by default + window: int + size of window (for rolling/expanding OLS). If window passed and no + explicit window_type, 'rolling" will be used as the window_type + + Panel OLS options: + pool: bool + Whether to run pooled panel regression. Defaults to true. + entity_effects: bool + Whether to account for entity fixed effects. Defaults to false. + time_effects: bool + Whether to account for time fixed effects. Defaults to false. + x_effects: list + List of x's to account for fixed effects. Defaults to none. + dropped_dummies: dict + Key is the name of the variable for the fixed effect. + Value is the value of that variable for which we drop the dummy. + + For entity fixed effects, key equals 'entity'. + + By default, the first dummy is dropped if no dummy is specified. + cluster: {'time', 'entity'} + cluster variances + + Examples + -------- + # Run simple OLS. + result = ols(y=y, x=x) + + # Run rolling simple OLS with window of size 10. + result = ols(y=y, x=x, window_type='rolling', window=10) + print(result.beta) + + result = ols(y=y, x=x, nw_lags=1) + + # Set up LHS and RHS for data across all items + y = A + x = {'B' : B, 'C' : C} + + # Run panel OLS. + result = ols(y=y, x=x) + + # Run expanding panel OLS with window 10 and entity clustering. + result = ols(y=y, x=x, cluster='entity', window_type='expanding', window=10) + + Returns + ------- + The appropriate OLS object, which allows you to obtain betas and various + statistics, such as std err, t-stat, etc. + """ + pool = kwargs.get('pool') + if 'pool' in kwargs: + del kwargs['pool'] + + window_type = kwargs.get('window_type') + window = kwargs.get('window') + + if window_type is None: + if window is None: + window_type = 'full_sample' + else: + window_type = 'rolling' + else: + window_type = common._get_window_type(window_type) + + if window_type != 'full_sample': + kwargs['window_type'] = common._get_window_type(window_type) + + y = kwargs.get('y') + x = kwargs.get('x') + + panel = False + if isinstance(y, DataFrame) or (isinstance(y, Series) and + isinstance(y.index, MultiIndex)): + panel = True + if isinstance(x, Panel): + panel = True + + if window_type == 'full_sample': + for rolling_field in ('window_type', 'window', 'min_periods'): + if rolling_field in kwargs: + del kwargs[rolling_field] + + if panel: + if pool is False: + klass = NonPooledPanelOLS + else: + klass = PanelOLS + else: + klass = OLS + else: + if panel: + if pool is False: + klass = NonPooledPanelOLS + else: + klass = MovingPanelOLS + else: + klass = MovingOLS + + return klass(**kwargs) diff --git a/pandas/stats/math.py b/pandas/stats/math.py new file mode 100644 index 00000000..505415be --- /dev/null +++ b/pandas/stats/math.py @@ -0,0 +1,130 @@ +# pylint: disable-msg=E1103 +# pylint: disable-msg=W0212 + +from __future__ import division + +from pandas.compat import range +import numpy as np +import numpy.linalg as linalg + + +def rank(X, cond=1.0e-12): + """ + Return the rank of a matrix X based on its generalized inverse, + not the SVD. 
+ """ + X = np.asarray(X) + if len(X.shape) == 2: + import scipy.linalg as SL + D = SL.svdvals(X) + result = np.add.reduce(np.greater(D / D.max(), cond)) + return int(result.astype(np.int32)) + else: + return int(not np.alltrue(np.equal(X, 0.))) + + +def solve(a, b): + """Returns the solution of A X = B.""" + try: + return linalg.solve(a, b) + except linalg.LinAlgError: + return np.dot(linalg.pinv(a), b) + + +def inv(a): + """Returns the inverse of A.""" + try: + return np.linalg.inv(a) + except linalg.LinAlgError: + return np.linalg.pinv(a) + + +def is_psd(m): + eigvals = linalg.eigvals(m) + return np.isreal(eigvals).all() and (eigvals >= 0).all() + + +def newey_west(m, max_lags, nobs, df, nw_overlap=False): + """ + Compute Newey-West adjusted covariance matrix, taking into account + specified number of leads / lags + + Parameters + ---------- + m : (N x K) + max_lags : int + nobs : int + Number of observations in model + df : int + Degrees of freedom in explanatory variables + nw_overlap : boolean, default False + Assume data is overlapping + + Returns + ------- + ndarray (K x K) + + Reference + --------- + Newey, W. K. & West, K. D. (1987) A Simple, Positive + Semi-definite, Heteroskedasticity and Autocorrelation Consistent + Covariance Matrix, Econometrica, vol. 55(3), 703-708 + """ + Xeps = np.dot(m.T, m) + for lag in range(1, max_lags + 1): + auto_cov = np.dot(m[:-lag].T, m[lag:]) + weight = lag / (max_lags + 1) + if nw_overlap: + weight = 0 + bb = auto_cov + auto_cov.T + dd = (1 - weight) * bb + Xeps += dd + + Xeps *= nobs / (nobs - df) + + if nw_overlap and not is_psd(Xeps): + new_max_lags = int(np.ceil(max_lags * 1.5)) +# print('nw_overlap is True and newey_west generated a non positive ' +# 'semidefinite matrix, so using newey_west with max_lags of %d.' 
+# % new_max_lags) + return newey_west(m, new_max_lags, nobs, df) + + return Xeps + + +def calc_F(R, r, beta, var_beta, nobs, df): + """ + Computes the standard F-test statistic for linear restriction + hypothesis testing + + Parameters + ---------- + R: ndarray (N x N) + Restriction matrix + r: ndarray (N x 1) + Restriction vector + beta: ndarray (N x 1) + Estimated model coefficients + var_beta: ndarray (N x N) + Variance covariance matrix of regressors + nobs: int + Number of observations in model + df: int + Model degrees of freedom + + Returns + ------- + F value, (q, df_resid), p value + """ + from scipy.stats import f + + hyp = np.dot(R, beta.reshape(len(beta), 1)) - r + RSR = np.dot(R, np.dot(var_beta, R.T)) + + q = len(r) + + F = np.dot(hyp.T, np.dot(inv(RSR), hyp)).squeeze() / q + + p_value = 1 - f.cdf(F, q, nobs - df) + + return F, (q, nobs - df), p_value diff --git a/pandas/stats/misc.py b/pandas/stats/misc.py new file mode 100644 index 00000000..ef663b25 --- /dev/null +++ b/pandas/stats/misc.py @@ -0,0 +1,386 @@ +from numpy import NaN +from pandas import compat +import numpy as np + +from pandas.core.api import Series, DataFrame, isnull, notnull +from pandas.core.series import remove_na +from pandas.compat import zip + + +def zscore(series): + return (series - series.mean()) / np.std(series, ddof=0) + + +def correl_ts(frame1, frame2): + """ + Pairwise correlation of columns of two DataFrame objects + + Parameters + ---------- + + Returns + ------- + y : Series + """ + results = {} + for col, series in compat.iteritems(frame1): + if col in frame2: + other = frame2[col] + + idx1 = series.valid().index + idx2 = other.valid().index + + common_index = idx1.intersection(idx2) + + seriesStand = zscore(series.reindex(common_index)) + otherStand = zscore(other.reindex(common_index)) + results[col] = (seriesStand * otherStand).mean() + + return Series(results) + + +def correl_xs(frame1, frame2): + return correl_ts(frame1.T, frame2.T) + +def percentileofscore(a, score, kind='rank'): + """The percentile rank of a score relative to a list of scores. + + A `percentileofscore` of, for example, 80% means that 80% of the + scores in `a` are below the given score. In the case of gaps or + ties, the exact definition depends on the optional keyword, `kind`. + + Parameters + ---------- + a: array like + Array of scores to which `score` is compared. + score: int or float + Score that is compared to the elements in `a`. + kind: {'rank', 'weak', 'strict', 'mean'}, optional + This optional parameter specifies the interpretation of the + resulting score: + + - "rank": Average percentage ranking of score. In case of + multiple matches, average the percentage rankings of + all matching scores. + - "weak": This kind corresponds to the definition of a cumulative + distribution function. A percentileofscore of 80% + means that 80% of values are less than or equal + to the provided score. + - "strict": Similar to "weak", except that only values that are + strictly less than the given score are counted. + - "mean": The average of the "weak" and "strict" scores, often used in + testing. See + + http://en.wikipedia.org/wiki/Percentile_rank + + Returns + ------- + pcos : float + Percentile-position of score (0-100) relative to `a`. 
+ + Examples + -------- + Three-quarters of the given values lie below a given score: + + >>> percentileofscore([1, 2, 3, 4], 3) + 75.0 + + With multiple matches, note how the scores of the two matches, 0.6 + and 0.8 respectively, are averaged: + + >>> percentileofscore([1, 2, 3, 3, 4], 3) + 70.0 + + Only 2/5 values are strictly less than 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') + 40.0 + + But 4/5 values are less than or equal to 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') + 80.0 + + The average between the weak and the strict scores is + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') + 60.0 + + """ + a = np.array(a) + n = len(a) + + if kind == 'rank': + if not(np.any(a == score)): + a = np.append(a, score) + a_len = np.array(lrange(len(a))) + else: + a_len = np.array(lrange(len(a))) + 1.0 + + a = np.sort(a) + idx = [a == score] + pct = (np.mean(a_len[idx]) / n) * 100.0 + return pct + + elif kind == 'strict': + return sum(a < score) / float(n) * 100 + elif kind == 'weak': + return sum(a <= score) / float(n) * 100 + elif kind == 'mean': + return (sum(a < score) + sum(a <= score)) * 50 / float(n) + else: + raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'") + +def percentileRank(frame, column=None, kind='mean'): + """ + Return score at percentile for each point in time (cross-section) + + Parameters + ---------- + frame: DataFrame + column: string or Series, optional + Column name or specific Series to compute percentiles for. + If not provided, percentiles are computed for all values at each + point in time. Note that this can take a LONG time. + kind: {'rank', 'weak', 'strict', 'mean'}, optional + This optional parameter specifies the interpretation of the + resulting score: + + - "rank": Average percentage ranking of score. In case of + multiple matches, average the percentage rankings of + all matching scores. + - "weak": This kind corresponds to the definition of a cumulative + distribution function. A percentileofscore of 80% + means that 80% of values are less than or equal + to the provided score. + - "strict": Similar to "weak", except that only values that are + strictly less than the given score are counted. + - "mean": The average of the "weak" and "strict" scores, often used in + testing. 
See + + http://en.wikipedia.org/wiki/Percentile_rank + + Returns + ------- + TimeSeries or DataFrame, depending on input + """ + fun = lambda xs, score: percentileofscore(remove_na(xs), + score, kind=kind) + + results = {} + framet = frame.T + if column is not None: + if isinstance(column, Series): + for date, xs in compat.iteritems(frame.T): + results[date] = fun(xs, column.get(date, NaN)) + else: + for date, xs in compat.iteritems(frame.T): + results[date] = fun(xs, xs[column]) + results = Series(results) + else: + for column in frame.columns: + for date, xs in compat.iteritems(framet): + results.setdefault(date, {})[column] = fun(xs, xs[column]) + results = DataFrame(results).T + return results + + +def bucket(series, k, by=None): + """ + Produce DataFrame representing quantiles of a Series + + Parameters + ---------- + series : Series + k : int + number of quantiles + by : Series or same-length array + bucket by value + + Returns + ------- + DataFrame + """ + if by is None: + by = series + else: + by = by.reindex(series.index) + + split = _split_quantile(by, k) + mat = np.empty((len(series), k), dtype=float) * np.NaN + + for i, v in enumerate(split): + mat[:, i][v] = series.take(v) + + return DataFrame(mat, index=series.index, columns=np.arange(k) + 1) + + +def _split_quantile(arr, k): + arr = np.asarray(arr) + mask = np.isfinite(arr) + order = arr[mask].argsort() + n = len(arr) + + return np.array_split(np.arange(n)[mask].take(order), k) + + +def bucketcat(series, cats): + """ + Produce DataFrame representing quantiles of a Series + + Parameters + ---------- + series : Series + cat : Series or same-length array + bucket by category; mutually exclusive with 'by' + + Returns + ------- + DataFrame + """ + if not isinstance(series, Series): + series = Series(series, index=np.arange(len(series))) + + cats = np.asarray(cats) + + unique_labels = np.unique(cats) + unique_labels = unique_labels[com.notnull(unique_labels)] + + # group by + data = {} + + for label in unique_labels: + data[label] = series[cats == label] + + return DataFrame(data, columns=unique_labels) + + +def bucketpanel(series, bins=None, by=None, cat=None): + """ + Bucket data by two Series to create summary panel + + Parameters + ---------- + series : Series + bins : tuple (length-2) + e.g. 
(2, 2) + by : tuple of Series + bucket by value + cat : tuple of Series + bucket by category; mutually exclusive with 'by' + + Returns + ------- + DataFrame + """ + use_by = by is not None + use_cat = cat is not None + + if use_by and use_cat: + raise Exception('must specify by or cat, but not both') + elif use_by: + if len(by) != 2: + raise Exception('must provide two bucketing series') + + xby, yby = by + xbins, ybins = bins + + return _bucketpanel_by(series, xby, yby, xbins, ybins) + + elif use_cat: + xcat, ycat = cat + return _bucketpanel_cat(series, xcat, ycat) + else: + raise Exception('must specify either values or categories ' + 'to bucket by') + + +def _bucketpanel_by(series, xby, yby, xbins, ybins): + xby = xby.reindex(series.index) + yby = yby.reindex(series.index) + + xlabels = _bucket_labels(xby.reindex(series.index), xbins) + ylabels = _bucket_labels(yby.reindex(series.index), ybins) + + labels = _uniquify(xlabels, ylabels, xbins, ybins) + + mask = com.isnull(labels) + labels[mask] = -1 + + unique_labels = np.unique(labels) + bucketed = bucketcat(series, labels) + + _ulist = list(labels) + index_map = dict((x, _ulist.index(x)) for x in unique_labels) + + def relabel(key): + pos = index_map[key] + + xlab = xlabels[pos] + ylab = ylabels[pos] + + return '%sx%s' % (int(xlab) if com.notnull(xlab) else 'NULL', + int(ylab) if com.notnull(ylab) else 'NULL') + + return bucketed.rename(columns=relabel) + + +def _bucketpanel_cat(series, xcat, ycat): + xlabels, xmapping = _intern(xcat) + ylabels, ymapping = _intern(ycat) + + shift = 10 ** (np.ceil(np.log10(ylabels.max()))) + labels = xlabels * shift + ylabels + + sorter = labels.argsort() + sorted_labels = labels.take(sorter) + sorted_xlabels = xlabels.take(sorter) + sorted_ylabels = ylabels.take(sorter) + + unique_labels = np.unique(labels) + unique_labels = unique_labels[com.notnull(unique_labels)] + + locs = sorted_labels.searchsorted(unique_labels) + xkeys = sorted_xlabels.take(locs) + ykeys = sorted_ylabels.take(locs) + + stringified = ['(%s, %s)' % arg + for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))] + + result = bucketcat(series, labels) + result.columns = stringified + + return result + + +def _intern(values): + # assumed no NaN values + values = np.asarray(values) + + uniqued = np.unique(values) + labels = uniqued.searchsorted(values) + return labels, uniqued + + +def _uniquify(xlabels, ylabels, xbins, ybins): + # encode the stuff, create unique label + shifter = 10 ** max(xbins, ybins) + _xpiece = xlabels * shifter + _ypiece = ylabels + + return _xpiece + _ypiece + + +def _bucket_labels(series, k): + arr = np.asarray(series) + mask = np.isfinite(arr) + order = arr[mask].argsort() + n = len(series) + + split = np.array_split(np.arange(n)[mask].take(order), k) + + mat = np.empty(n, dtype=float) * np.NaN + for i, v in enumerate(split): + mat[v] = i + + return mat + 1 diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py new file mode 100644 index 00000000..e5d96ee6 --- /dev/null +++ b/pandas/stats/moments.py @@ -0,0 +1,991 @@ +""" +Provides rolling statistical moments and related descriptive +statistics implemented in Cython +""" +from __future__ import division + +from functools import wraps +from collections import defaultdict + +from numpy import NaN +import numpy as np + +from pandas.core.api import DataFrame, Series, Panel, notnull +import pandas.algos as algos +import pandas.core.common as pdcom + +from pandas.util.decorators import Substitution, Appender + +__all__ = ['rolling_count', 'rolling_max', 
'rolling_min', + 'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov', + 'rolling_corr', 'rolling_var', 'rolling_skew', 'rolling_kurt', + 'rolling_quantile', 'rolling_median', 'rolling_apply', + 'rolling_corr_pairwise', 'rolling_window', + 'ewma', 'ewmvar', 'ewmstd', 'ewmvol', 'ewmcorr', 'ewmcov', + 'expanding_count', 'expanding_max', 'expanding_min', + 'expanding_sum', 'expanding_mean', 'expanding_std', + 'expanding_cov', 'expanding_corr', 'expanding_var', + 'expanding_skew', 'expanding_kurt', 'expanding_quantile', + 'expanding_median', 'expanding_apply', 'expanding_corr_pairwise'] + +#------------------------------------------------------------------------------ +# Docs + +# The order of arguments for the _doc_template is: +# (header, args, kwargs, returns, notes) + +_doc_template = """ +%s + +Parameters +---------- +%s%s +Returns +------- +%s +%s +""" + +_roll_kw = """window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. +min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). +freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. +center : boolean, default False + Set the labels at the center of the window. +how : string, default '%s' + Method for down- or re-sampling +""" + +_roll_notes = r""" +Notes +----- +By default, the result is set to the right edge of the window. This can be +changed to the center of the window by setting ``center=True``. + +The `freq` keyword is used to conform time series data to a specified +frequency by resampling the data. This is done with the default parameters +of :meth:`~pandas.Series.resample` (i.e. using the `mean`). +""" + + +_ewm_kw = r"""com : float. optional + Center of mass: :math:`\alpha = 1 / (1 + com)`, +span : float, optional + Specify decay in terms of span, :math:`\alpha = 2 / (span + 1)` +halflife : float, optional + Specify decay in terms of halflife, :math:`\alpha = 1 - exp(log(0.5) / halflife)` +min_periods : int, default 0 + Number of observations in sample to require (only affects + beginning) +freq : None or string alias / date offset object, default=None + Frequency to conform to before computing statistic +adjust : boolean, default True + Divide by decaying adjustment factor in beginning periods to account for + imbalance in relative weightings (viewing EWMA as a moving average) +how : string, default 'mean' + Method for down- or re-sampling +""" + +_ewm_notes = r""" +Notes +----- +Either center of mass or span must be specified + +EWMA is sometimes specified using a "span" parameter `s`, we have that the +decay parameter :math:`\alpha` is related to the span as +:math:`\alpha = 2 / (s + 1) = 1 / (1 + c)` + +where `c` is the center of mass. Given a span, the associated center of mass is +:math:`c = (s - 1) / 2` + +So a "20-day EWMA" would have center 9.5. +""" + +_expanding_kw = """min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). +freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. 
+""" + + +_type_of_input_retval = "y : type of input argument" + +_flex_retval = """y : type depends on inputs + DataFrame / DataFrame -> DataFrame (matches on columns) or Panel (pairwise) + DataFrame / Series -> Computes result for each column + Series / Series -> Series""" + +_pairwise_retval = "y : Panel whose items are df1.index values" + +_unary_arg = "arg : Series, DataFrame\n" + +_binary_arg_flex = """arg1 : Series, DataFrame, or ndarray +arg2 : Series, DataFrame, or ndarray, optional + if not supplied then will default to arg1 and produce pairwise output +""" + +_binary_arg = """arg1 : Series, DataFrame, or ndarray +arg2 : Series, DataFrame, or ndarray +""" + +_pairwise_arg = """df1 : DataFrame +df2 : DataFrame +""" + +_pairwise_kw = """pairwise : bool, default False + If False then only matching columns between arg1 and arg2 will be used and + the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the output + will be a Panel in the case of DataFrame inputs. In the case of missing + elements, only complete pairwise observations will be used. +""" + +_bias_kw = r"""bias : boolean, default False + Use a standard estimation bias correction +""" + + +def rolling_count(arg, window, freq=None, center=False, how=None): + """ + Rolling count of number of non-NaN observations inside provided window. + + Parameters + ---------- + arg : DataFrame or numpy ndarray-like + window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. + center : boolean, default False + Whether the label should correspond with center of window + how : string, default 'mean' + Method for down- or re-sampling + + Returns + ------- + rolling_count : type of caller + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + """ + arg = _conv_timerule(arg, freq, how) + window = min(window, len(arg)) + + return_hook, values = _process_data_structure(arg, kill_inf=False) + + converted = np.isfinite(values).astype(float) + result = rolling_sum(converted, window, min_periods=1, + center=center) # already converted + + # putmask here? 
+ result[np.isnan(result)] = 0 + + return return_hook(result) + + +@Substitution("Unbiased moving covariance.", _binary_arg_flex, + _roll_kw%'None'+_pairwise_kw, _flex_retval, _roll_notes) +@Appender(_doc_template) +def rolling_cov(arg1, arg2=None, window=None, min_periods=None, freq=None, + center=False, pairwise=None, how=None): + if window is None and isinstance(arg2, (int, float)): + window = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset + elif arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) + + def _get_cov(X, Y): + adj_window = min(window, len(X), len(Y)) + mean = lambda x: rolling_mean(x, adj_window, min_periods, center=center) + count = rolling_count(X + Y, adj_window, center=center) + bias_adj = count / (count - 1) + return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj + rs = _flex_binary_moment(arg1, arg2, _get_cov, pairwise=bool(pairwise)) + return rs + + +@Substitution("Moving sample correlation.", _binary_arg_flex, + _roll_kw%'None'+_pairwise_kw, _flex_retval, _roll_notes) +@Appender(_doc_template) +def rolling_corr(arg1, arg2=None, window=None, min_periods=None, freq=None, + center=False, pairwise=None, how=None): + if window is None and isinstance(arg2, (int, float)): + window = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset + elif arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) + + def _get_corr(a, b): + adj_window = min(window, len(a), len(b)) + num = rolling_cov(a, b, adj_window, min_periods, freq=freq, + center=center) + den = (rolling_std(a, adj_window, min_periods, freq=freq, + center=center) * + rolling_std(b, adj_window, min_periods, freq=freq, + center=center)) + return num / den + + return _flex_binary_moment(arg1, arg2, _get_corr, pairwise=bool(pairwise)) + + +def _flex_binary_moment(arg1, arg2, f, pairwise=False): + if not (isinstance(arg1,(np.ndarray, Series, DataFrame)) and + isinstance(arg2,(np.ndarray, Series, DataFrame))): + raise TypeError("arguments to moment function must be of type " + "np.ndarray/Series/DataFrame") + + if isinstance(arg1, (np.ndarray, Series)) and \ + isinstance(arg2, (np.ndarray,Series)): + X, Y = _prep_binary(arg1, arg2) + return f(X, Y) + elif isinstance(arg1, DataFrame): + results = {} + if isinstance(arg2, DataFrame): + X, Y = arg1.align(arg2, join='outer') + if pairwise is False: + X = X + 0 * Y + Y = Y + 0 * X + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + elif pairwise is True: + results = defaultdict(dict) + for i, k1 in enumerate(arg1.columns): + for j, k2 in enumerate(arg2.columns): + if j 1: + result = np.apply_along_axis(calc, axis, values) + else: + result = calc(values) + + rs = return_hook(result) + if center: + rs = _center_window(rs, window, axis) + return rs + + +def _center_window(rs, window, axis): + if axis > rs.ndim-1: + raise ValueError("Requested axis is larger then no. of argument " + "dimensions") + + offset = int((window - 1) / 2.) 
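    # The centering offset is (window - 1) // 2: a window of 3 gives an offset
    # of 1, a window of 5 an offset of 2, and the statistic computed over the
    # trailing window ending at position i is relabelled to position i - offset.
    # A hedged sketch of the effect (sample data assumed, not part of the patch):
    #   >>> rolling_mean(Series([1., 2., 3., 4., 5.]), 3, center=True)
    #   # -> NaN, 2.0, 3.0, 4.0, NaN   (versus NaN, NaN, 2.0, 3.0, 4.0 uncentered)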
+ if isinstance(rs, (Series, DataFrame, Panel)): + rs = rs.shift(-offset, axis=axis) + else: + rs_indexer = [slice(None)] * rs.ndim + rs_indexer[axis] = slice(None, -offset) + + lead_indexer = [slice(None)] * rs.ndim + lead_indexer[axis] = slice(offset, None) + + na_indexer = [slice(None)] * rs.ndim + na_indexer[axis] = slice(-offset, None) + + rs[tuple(rs_indexer)] = np.copy(rs[tuple(lead_indexer)]) + rs[tuple(na_indexer)] = np.nan + return rs + + +def _process_data_structure(arg, kill_inf=True): + if isinstance(arg, DataFrame): + return_hook = lambda v: type(arg)(v, index=arg.index, + columns=arg.columns) + values = arg.values + elif isinstance(arg, Series): + values = arg.values + return_hook = lambda v: Series(v, arg.index) + else: + return_hook = lambda v: v + values = arg + + if not issubclass(values.dtype.type, float): + values = values.astype(float) + + if kill_inf: + values = values.copy() + values[np.isinf(values)] = np.NaN + + return return_hook, values + +#------------------------------------------------------------------------------ +# Exponential moving moments + + +def _get_center_of_mass(com, span, halflife): + valid_count = len([x for x in [com, span, halflife] if x is not None]) + if valid_count > 1: + raise Exception("com, span, and halflife are mutually exclusive") + + if span is not None: + # convert span to center of mass + com = (span - 1) / 2. + elif halflife is not None: + # convert halflife to center of mass + decay = 1 - np.exp(np.log(0.5) / halflife) + com = 1 / decay - 1 + elif com is None: + raise Exception("Must pass one of com, span, or halflife") + + return float(com) + + +@Substitution("Exponentially-weighted moving average", _unary_arg, _ewm_kw, + _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) +def ewma(arg, com=None, span=None, halflife=None, min_periods=0, freq=None, + adjust=True, how=None): + com = _get_center_of_mass(com, span, halflife) + arg = _conv_timerule(arg, freq, how) + + def _ewma(v): + result = algos.ewma(v, com, int(adjust)) + first_index = _first_valid_index(v) + result[first_index: first_index + min_periods] = NaN + return result + + return_hook, values = _process_data_structure(arg) + output = np.apply_along_axis(_ewma, 0, values) + return return_hook(output) + + +def _first_valid_index(arr): + # argmax scans from left + return notnull(arr).argmax() if len(arr) else 0 + + +@Substitution("Exponentially-weighted moving variance", _unary_arg, + _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) +def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, + freq=None, how=None): + com = _get_center_of_mass(com, span, halflife) + arg = _conv_timerule(arg, freq, how) + moment2nd = ewma(arg * arg, com=com, min_periods=min_periods) + moment1st = ewma(arg, com=com, min_periods=min_periods) + + result = moment2nd - moment1st ** 2 + if not bias: + result *= (1.0 + 2.0 * com) / (2.0 * com) + + return result + + +@Substitution("Exponentially-weighted moving std", _unary_arg, + _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) +def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False): + result = ewmvar(arg, com=com, span=span, halflife=halflife, + min_periods=min_periods, bias=bias) + return _zsqrt(result) + +ewmvol = ewmstd + + +@Substitution("Exponentially-weighted moving covariance", _binary_arg_flex, + _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) +def ewmcov(arg1, arg2=None, com=None, span=None, 
halflife=None, min_periods=0, + bias=False, freq=None, pairwise=None, how=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and com is None: + com = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) + + def _get_ewmcov(X, Y): + mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods) + return (mean(X * Y) - mean(X) * mean(Y)) + result = _flex_binary_moment(arg1, arg2, _get_ewmcov, + pairwise=bool(pairwise)) + if not bias: + com = _get_center_of_mass(com, span, halflife) + result *= (1.0 + 2.0 * com) / (2.0 * com) + + return result + + +@Substitution("Exponentially-weighted moving correlation", _binary_arg_flex, + _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) +def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, + freq=None, pairwise=None, how=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and com is None: + com = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + arg1 = _conv_timerule(arg1, freq, how) + arg2 = _conv_timerule(arg2, freq, how) + + def _get_ewmcorr(X, Y): + mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods) + var = lambda x: ewmvar(x, com=com, span=span, halflife=halflife, min_periods=min_periods, + bias=True) + return (mean(X * Y) - mean(X) * mean(Y)) / _zsqrt(var(X) * var(Y)) + result = _flex_binary_moment(arg1, arg2, _get_ewmcorr, + pairwise=bool(pairwise)) + return result + + +def _zsqrt(x): + result = np.sqrt(x) + mask = x < 0 + + if isinstance(x, DataFrame): + if mask.values.any(): + result[mask] = 0 + else: + if mask.any(): + result[mask] = 0 + + return result + + +def _prep_binary(arg1, arg2): + if not isinstance(arg2, type(arg1)): + raise Exception('Input arrays must be of the same type!') + + # mask out values, this also makes a common index... + X = arg1 + 0 * arg2 + Y = arg2 + 0 * arg1 + + return X, Y + +#---------------------------------------------------------------------- +# Python interface to Cython functions + + +def _conv_timerule(arg, freq, how): + + types = (DataFrame, Series) + if freq is not None and isinstance(arg, types): + # Conform to whatever frequency needed. 
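        # With a freq given, the input is resampled before the moving window is
        # applied; when how is None this falls through to resample's default
        # aggregation, the mean, as the Notes sections above describe. A hedged
        # sketch of a call (hourly_series is an assumed, illustrative name):
        #   >>> rolling_mean(hourly_series, 3, freq='D')  # resample to daily,
        #   ...                                           # then a 3-day window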
+ arg = arg.resample(freq, how=how) + + return arg + + +def _require_min_periods(p): + def _check_func(minp, window): + if minp is None: + return window + else: + return max(p, minp) + return _check_func + + +def _use_window(minp, window): + if minp is None: + return window + else: + return minp + + +def _rolling_func(func, desc, check_minp=_use_window, how=None): + if how is None: + how_arg_str = 'None' + else: + how_arg_str = "'%s"%how + + @Substitution(desc, _unary_arg, _roll_kw%how_arg_str, _type_of_input_retval, + _roll_notes) + @Appender(_doc_template) + @wraps(func) + def f(arg, window, min_periods=None, freq=None, center=False, how=how, + **kwargs): + def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): + minp = check_minp(minp, window) + return func(arg, window, minp, **kwds) + return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, + center=center, how=how, **kwargs) + + return f + +rolling_max = _rolling_func(algos.roll_max2, 'Moving maximum.', how='max') +rolling_min = _rolling_func(algos.roll_min2, 'Moving minimum.', how='min') +rolling_sum = _rolling_func(algos.roll_sum, 'Moving sum.') +rolling_mean = _rolling_func(algos.roll_mean, 'Moving mean.') +rolling_median = _rolling_func(algos.roll_median_cython, 'Moving median.', + how='median') + +_ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw)) +rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation.', + check_minp=_require_min_periods(1)) +rolling_var = _rolling_func(algos.roll_var, 'Unbiased moving variance.', + check_minp=_require_min_periods(1)) +rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.', + check_minp=_require_min_periods(3)) +rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.', + check_minp=_require_min_periods(4)) + + +def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, + center=False): + """Moving quantile. + + Parameters + ---------- + arg : Series, DataFrame + window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. + quantile : float + 0 <= quantile <= 1 + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. + center : boolean, default False + Whether the label should correspond with center of window + + Returns + ------- + y : type of input argument + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + """ + + def call_cython(arg, window, minp, args=(), kwargs={}): + minp = _use_window(minp, window) + return algos.roll_quantile(arg, window, minp, quantile) + return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, + center=center) + + +def rolling_apply(arg, window, func, min_periods=None, freq=None, + center=False, args=(), kwargs={}): + """Generic moving function application. + + Parameters + ---------- + arg : Series, DataFrame + window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. 
+ func : function + Must produce a single value from an ndarray input + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. + center : boolean, default False + Whether the label should correspond with center of window + args : tuple + Passed on to func + kwargs : dict + Passed on to func + + Returns + ------- + y : type of input argument + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + """ + def call_cython(arg, window, minp, args, kwargs): + minp = _use_window(minp, window) + return algos.roll_generic(arg, window, minp, func, args, kwargs) + return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, + center=center, args=args, kwargs=kwargs) + + +def rolling_window(arg, window=None, win_type=None, min_periods=None, + freq=None, center=False, mean=True, + axis=0, how=None, **kwargs): + """ + Applies a moving window of type ``window_type`` and size ``window`` + on the data. + + Parameters + ---------- + arg : Series, DataFrame + window : int or ndarray + Weighting window specification. If the window is an integer, then it is + treated as the window length and win_type is required + win_type : str, default None + Window type (see Notes) + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. + center : boolean, default False + Whether the label should correspond with center of window + mean : boolean, default True + If True computes weighted mean, else weighted sum + axis : {0, 1}, default 0 + how : string, default 'mean' + Method for down- or re-sampling + + Returns + ------- + y : type of input argument + + Notes + ----- + The recognized window types are: + + * ``boxcar`` + * ``triang`` + * ``blackman`` + * ``hamming`` + * ``bartlett`` + * ``parzen`` + * ``bohman`` + * ``blackmanharris`` + * ``nuttall`` + * ``barthann`` + * ``kaiser`` (needs beta) + * ``gaussian`` (needs std) + * ``general_gaussian`` (needs power, width) + * ``slepian`` (needs width). + + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
+ """ + if isinstance(window, (list, tuple, np.ndarray)): + if win_type is not None: + raise ValueError(('Do not specify window type if using custom ' + 'weights')) + window = pdcom._asarray_tuplesafe(window).astype(float) + elif pdcom.is_integer(window): # window size + if win_type is None: + raise ValueError('Must specify window type') + try: + import scipy.signal as sig + except ImportError: + raise ImportError('Please install scipy to generate window weight') + win_type = _validate_win_type(win_type, kwargs) # may pop from kwargs + window = sig.get_window(win_type, window).astype(float) + else: + raise ValueError('Invalid window %s' % str(window)) + + minp = _use_window(min_periods, len(window)) + + arg = _conv_timerule(arg, freq, how) + return_hook, values = _process_data_structure(arg) + + f = lambda x: algos.roll_window(x, window, minp, avg=mean) + result = np.apply_along_axis(f, axis, values) + + rs = return_hook(result) + if center: + rs = _center_window(rs, len(window), axis) + return rs + + +def _validate_win_type(win_type, kwargs): + # may pop from kwargs + arg_map = {'kaiser': ['beta'], + 'gaussian': ['std'], + 'general_gaussian': ['power', 'width'], + 'slepian': ['width']} + if win_type in arg_map: + return tuple([win_type] + + _pop_args(win_type, arg_map[win_type], kwargs)) + return win_type + + +def _pop_args(win_type, arg_names, kwargs): + msg = '%s window requires %%s' % win_type + all_args = [] + for n in arg_names: + if n not in kwargs: + raise ValueError(msg % n) + all_args.append(kwargs.pop(n)) + return all_args + + +def _expanding_func(func, desc, check_minp=_use_window): + @Substitution(desc, _unary_arg, _expanding_kw, _type_of_input_retval, "") + @Appender(_doc_template) + @wraps(func) + def f(arg, min_periods=1, freq=None, center=False, **kwargs): + window = len(arg) + + def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): + minp = check_minp(minp, window) + return func(arg, window, minp, **kwds) + return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, + center=center, **kwargs) + + return f + +expanding_max = _expanding_func(algos.roll_max2, 'Expanding maximum.') +expanding_min = _expanding_func(algos.roll_min2, 'Expanding minimum.') +expanding_sum = _expanding_func(algos.roll_sum, 'Expanding sum.') +expanding_mean = _expanding_func(algos.roll_mean, 'Expanding mean.') +expanding_median = _expanding_func( + algos.roll_median_cython, 'Expanding median.') + +expanding_std = _expanding_func(_ts_std, + 'Unbiased expanding standard deviation.', + check_minp=_require_min_periods(2)) +expanding_var = _expanding_func(algos.roll_var, 'Unbiased expanding variance.', + check_minp=_require_min_periods(2)) +expanding_skew = _expanding_func( + algos.roll_skew, 'Unbiased expanding skewness.', + check_minp=_require_min_periods(3)) +expanding_kurt = _expanding_func( + algos.roll_kurt, 'Unbiased expanding kurtosis.', + check_minp=_require_min_periods(4)) + + +def expanding_count(arg, freq=None, center=False): + """ + Expanding count of number of non-NaN observations. + + Parameters + ---------- + arg : DataFrame or numpy ndarray-like + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. + center : boolean, default False + Whether the label should correspond with center of window. 
+ + Returns + ------- + expanding_count : type of caller + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + """ + return rolling_count(arg, len(arg), freq=freq, center=center) + + +def expanding_quantile(arg, quantile, min_periods=1, freq=None, + center=False): + """Expanding quantile. + + Parameters + ---------- + arg : Series, DataFrame + quantile : float + 0 <= quantile <= 1 + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. + center : boolean, default False + Whether the label should correspond with center of window. + + Returns + ------- + y : type of input argument + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + """ + return rolling_quantile(arg, len(arg), quantile, min_periods=min_periods, + freq=freq, center=center) + + +@Substitution("Unbiased expanding covariance.", _binary_arg_flex, + _expanding_kw+_pairwise_kw, _flex_retval, "") +@Appender(_doc_template) +def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, center=False, + pairwise=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and min_periods is None: + min_periods = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + window = len(arg1) + len(arg2) + return rolling_cov(arg1, arg2, window, + min_periods=min_periods, freq=freq, + center=center, pairwise=pairwise) + + +@Substitution("Expanding sample correlation.", _binary_arg_flex, + _expanding_kw+_pairwise_kw, _flex_retval, "") +@Appender(_doc_template) +def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, center=False, + pairwise=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and min_periods is None: + min_periods = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + window = len(arg1) + len(arg2) + return rolling_corr(arg1, arg2, window, + min_periods=min_periods, + freq=freq, center=center, pairwise=pairwise) + + +@Substitution("Deprecated. Use expanding_corr(..., pairwise=True) instead.\n\n" + "Pairwise expanding sample correlation", _pairwise_arg, + _expanding_kw, _pairwise_retval, "") +@Appender(_doc_template) +def expanding_corr_pairwise(df1, df2=None, min_periods=1, freq=None, + center=False): + import warnings + warnings.warn("expanding_corr_pairwise is deprecated, use expanding_corr(..., pairwise=True)", FutureWarning) + return expanding_corr(df1, df2, min_periods=min_periods, + freq=freq, center=center, pairwise=True) + + +def expanding_apply(arg, func, min_periods=1, freq=None, center=False, + args=(), kwargs={}): + """Generic expanding function application. + + Parameters + ---------- + arg : Series, DataFrame + func : function + Must produce a single value from an ndarray input + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). 
+ freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. + center : boolean, default False + Whether the label should correspond with center of window. + args : tuple + Passed on to func + kwargs : dict + Passed on to func + + Returns + ------- + y : type of input argument + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + """ + window = len(arg) + return rolling_apply(arg, window, func, min_periods=min_periods, freq=freq, + center=center, args=args, kwargs=kwargs) diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py new file mode 100644 index 00000000..9d22068c --- /dev/null +++ b/pandas/stats/ols.py @@ -0,0 +1,1363 @@ +""" +Ordinary least squares regression +""" + +# pylint: disable-msg=W0201 + +from pandas.compat import zip, range, StringIO +from itertools import starmap +from pandas import compat +import numpy as np + +from pandas.core.api import DataFrame, Series, isnull +from pandas.core.base import StringMixin +from pandas.core.common import _ensure_float64 +from pandas.core.index import MultiIndex +from pandas.core.panel import Panel +from pandas.util.decorators import cache_readonly + +import pandas.stats.common as scom +import pandas.stats.math as math +import pandas.stats.moments as moments + +_FP_ERR = 1e-8 + + +class OLS(StringMixin): + """ + Runs a full sample ordinary least squares regression. + + Parameters + ---------- + y : Series + x : Series, DataFrame, dict of Series + intercept : bool + True if you want an intercept. + weights : array-like, optional + 1d array of weights. If you supply 1/W then the variables are pre- + multiplied by 1/sqrt(W). If no weights are supplied the default value + is 1 and WLS reults are the same as OLS. + nw_lags : None or int + Number of Newey-West lags. + nw_overlap : boolean, default False + Assume data is overlapping when computing Newey-West estimator + + """ + _panel_model = False + + def __init__(self, y, x, intercept=True, weights=None, nw_lags=None, + nw_overlap=False): + try: + import statsmodels.api as sm + except ImportError: + import scikits.statsmodels.api as sm + + self._x_orig = x + self._y_orig = y + self._weights_orig = weights + self._intercept = intercept + self._nw_lags = nw_lags + self._nw_overlap = nw_overlap + + (self._y, self._x, self._weights, self._x_filtered, + self._index, self._time_has_obs) = self._prepare_data() + + if self._weights is not None: + self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0) + self._y_trans = self._y * np.sqrt(self._weights) + self.sm_ols = sm.WLS(self._y.get_values(), + self._x.get_values(), + weights=self._weights.values).fit() + else: + self._x_trans = self._x + self._y_trans = self._y + self.sm_ols = sm.OLS(self._y.get_values(), + self._x.get_values()).fit() + + def _prepare_data(self): + """ + Cleans the input for single OLS. + + Parameters + ---------- + lhs: Series + Dependent variable in the regression. + rhs: dict, whose values are Series, DataFrame, or dict + Explanatory variables of the regression. + + Returns + ------- + Series, DataFrame + Cleaned lhs and rhs + """ + (filt_lhs, filt_rhs, filt_weights, + pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig, + self._weights_orig) + if self._intercept: + filt_rhs['intercept'] = 1. 
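            # The pre-filtered regressors (used later for fitted values over the
            # original index) receive the same constant column on the next line,
            # so both design matrices carry the 'intercept' column that beta is
            # indexed by. A hedged sketch of a fit with an intercept -- the data
            # names are assumed, not part of this patch:
            #   >>> model = OLS(y=returns, x={'mkt': market_returns})
            #   >>> model.beta['intercept'], model.beta['mkt']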
+ pre_filt_rhs['intercept'] = 1. + + if hasattr(filt_weights,'to_dense'): + filt_weights = filt_weights.to_dense() + + return (filt_lhs, filt_rhs, filt_weights, + pre_filt_rhs, index, valid) + + @property + def nobs(self): + return self._nobs + + @property + def _nobs(self): + return len(self._y) + + @property + def nw_lags(self): + return self._nw_lags + + @property + def x(self): + """Returns the filtered x used in the regression.""" + return self._x + + @property + def y(self): + """Returns the filtered y used in the regression.""" + return self._y + + @cache_readonly + def _beta_raw(self): + """Runs the regression and returns the beta.""" + return self.sm_ols.params + + @cache_readonly + def beta(self): + """Returns the betas in Series form.""" + return Series(self._beta_raw, index=self._x.columns) + + @cache_readonly + def _df_raw(self): + """Returns the degrees of freedom.""" + return math.rank(self._x.values) + + @cache_readonly + def df(self): + """Returns the degrees of freedom. + + This equals the rank of the X matrix. + """ + return self._df_raw + + @cache_readonly + def _df_model_raw(self): + """Returns the raw model degrees of freedom.""" + return self.sm_ols.df_model + + @cache_readonly + def df_model(self): + """Returns the degrees of freedom of the model.""" + return self._df_model_raw + + @cache_readonly + def _df_resid_raw(self): + """Returns the raw residual degrees of freedom.""" + return self.sm_ols.df_resid + + @cache_readonly + def df_resid(self): + """Returns the degrees of freedom of the residuals.""" + return self._df_resid_raw + + @cache_readonly + def _f_stat_raw(self): + """Returns the raw f-stat value.""" + from scipy.stats import f + + cols = self._x.columns + + if self._nw_lags is None: + F = self._r2_raw / (self._r2_raw - self._r2_adj_raw) + + q = len(cols) + if 'intercept' in cols: + q -= 1 + + shape = q, self.df_resid + p_value = 1 - f.cdf(F, shape[0], shape[1]) + return F, shape, p_value + + k = len(cols) + R = np.eye(k) + r = np.zeros((k, 1)) + + try: + intercept = cols.get_loc('intercept') + R = np.concatenate((R[0: intercept], R[intercept + 1:])) + r = np.concatenate((r[0: intercept], r[intercept + 1:])) + except KeyError: + # no intercept + pass + + return math.calc_F(R, r, self._beta_raw, self._var_beta_raw, + self._nobs, self.df) + + @cache_readonly + def f_stat(self): + """Returns the f-stat value.""" + return f_stat_to_dict(self._f_stat_raw) + + def f_test(self, hypothesis): + """Runs the F test, given a joint hypothesis. The hypothesis is + represented by a collection of equations, in the form + + A*x_1+B*x_2=C + + You must provide the coefficients even if they're 1. No spaces. + + The equations can be passed as either a single string or a + list of strings. + + Examples + -------- + o = ols(...) 
+ o.f_test('1*x1+2*x2=0,1*x3=0') + o.f_test(['1*x1+2*x2=0','1*x3=0']) + """ + + x_names = self._x.columns + + R = [] + r = [] + + if isinstance(hypothesis, str): + eqs = hypothesis.split(',') + elif isinstance(hypothesis, list): + eqs = hypothesis + else: # pragma: no cover + raise Exception('hypothesis must be either string or list') + for equation in eqs: + row = np.zeros(len(x_names)) + lhs, rhs = equation.split('=') + for s in lhs.split('+'): + ss = s.split('*') + coeff = float(ss[0]) + x_name = ss[1] + + if x_name not in x_names: + raise Exception('no coefficient named %s' % x_name) + idx = x_names.get_loc(x_name) + row[idx] = coeff + rhs = float(rhs) + + R.append(row) + r.append(rhs) + + R = np.array(R) + q = len(r) + r = np.array(r).reshape(q, 1) + + result = math.calc_F(R, r, self._beta_raw, self._var_beta_raw, + self._nobs, self.df) + + return f_stat_to_dict(result) + + @cache_readonly + def _p_value_raw(self): + """Returns the raw p values.""" + from scipy.stats import t + + return 2 * t.sf(np.fabs(self._t_stat_raw), + self._df_resid_raw) + + @cache_readonly + def p_value(self): + """Returns the p values.""" + return Series(self._p_value_raw, index=self.beta.index) + + @cache_readonly + def _r2_raw(self): + """Returns the raw r-squared values.""" + if self._use_centered_tss: + return 1 - self.sm_ols.ssr / self.sm_ols.centered_tss + else: + return 1 - self.sm_ols.ssr / self.sm_ols.uncentered_tss + + @property + def _use_centered_tss(self): + # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR + return self._intercept + + @cache_readonly + def r2(self): + """Returns the r-squared values.""" + return self._r2_raw + + @cache_readonly + def _r2_adj_raw(self): + """Returns the raw r-squared adjusted values.""" + return self.sm_ols.rsquared_adj + + @cache_readonly + def r2_adj(self): + """Returns the r-squared adjusted values.""" + return self._r2_adj_raw + + @cache_readonly + def _resid_raw(self): + """Returns the raw residuals.""" + return self.sm_ols.resid + + @cache_readonly + def resid(self): + """Returns the residuals.""" + return Series(self._resid_raw, index=self._x.index) + + @cache_readonly + def _rmse_raw(self): + """Returns the raw rmse values.""" + return np.sqrt(self.sm_ols.mse_resid) + + @cache_readonly + def rmse(self): + """Returns the rmse value.""" + return self._rmse_raw + + @cache_readonly + def _std_err_raw(self): + """Returns the raw standard err values.""" + return np.sqrt(np.diag(self._var_beta_raw)) + + @cache_readonly + def std_err(self): + """Returns the standard err values of the betas.""" + return Series(self._std_err_raw, index=self.beta.index) + + @cache_readonly + def _t_stat_raw(self): + """Returns the raw t-stat value.""" + return self._beta_raw / self._std_err_raw + + @cache_readonly + def t_stat(self): + """Returns the t-stat values of the betas.""" + return Series(self._t_stat_raw, index=self.beta.index) + + @cache_readonly + def _var_beta_raw(self): + """ + Returns the raw covariance of beta. 
+ """ + x = self._x.values + y = self._y.values + + xx = np.dot(x.T, x) + + if self._nw_lags is None: + return math.inv(xx) * (self._rmse_raw ** 2) + else: + resid = y - np.dot(x, self._beta_raw) + m = (x.T * resid).T + + xeps = math.newey_west(m, self._nw_lags, self._nobs, self._df_raw, + self._nw_overlap) + + xx_inv = math.inv(xx) + return np.dot(xx_inv, np.dot(xeps, xx_inv)) + + @cache_readonly + def var_beta(self): + """Returns the variance-covariance matrix of beta.""" + return DataFrame(self._var_beta_raw, index=self.beta.index, + columns=self.beta.index) + + @cache_readonly + def _y_fitted_raw(self): + """Returns the raw fitted y values.""" + if self._weights is None: + X = self._x_filtered.values + else: + # XXX + return self.sm_ols.fittedvalues + + b = self._beta_raw + return np.dot(X, b) + + @cache_readonly + def y_fitted(self): + """Returns the fitted y values. This equals BX.""" + if self._weights is None: + index = self._x_filtered.index + orig_index = index + else: + index = self._y.index + orig_index = self._y_orig.index + + result = Series(self._y_fitted_raw, index=index) + return result.reindex(orig_index) + + @cache_readonly + def _y_predict_raw(self): + """Returns the raw predicted y values.""" + return self._y_fitted_raw + + @cache_readonly + def y_predict(self): + """Returns the predicted y values. + + For in-sample, this is same as y_fitted.""" + return self.y_fitted + + def predict(self, beta=None, x=None, fill_value=None, + fill_method=None, axis=0): + """ + Parameters + ---------- + beta : Series + x : Series or DataFrame + fill_value : scalar or dict, default None + fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + axis : {0, 1}, default 0 + See DataFrame.fillna for more details + + Notes + ----- + 1. If both fill_value and fill_method are None then NaNs are dropped + (this is the default behavior) + 2. An intercept will be automatically added to the new_y_values if + the model was fitted using an intercept + + Returns + ------- + Series of predicted values + """ + if beta is None and x is None: + return self.y_predict + + if beta is None: + beta = self.beta + else: + beta = beta.reindex(self.beta.index) + if isnull(beta).any(): + raise ValueError('Must supply betas for same variables') + + if x is None: + x = self._x + orig_x = x + else: + orig_x = x + if fill_value is None and fill_method is None: + x = x.dropna(how='any') + else: + x = x.fillna(value=fill_value, method=fill_method, axis=axis) + if isinstance(x, Series): + x = DataFrame({'x': x}) + if self._intercept: + x['intercept'] = 1. 
+ + x = x.reindex(columns=self._x.columns) + + rs = np.dot(x.values, beta.values) + return Series(rs, x.index).reindex(orig_x.index) + + RESULT_FIELDS = ['r2', 'r2_adj', 'df', 'df_model', 'df_resid', 'rmse', + 'f_stat', 'beta', 'std_err', 't_stat', 'p_value', 'nobs'] + + @cache_readonly + def _results(self): + results = {} + for result in self.RESULT_FIELDS: + results[result] = getattr(self, result) + + return results + + @cache_readonly + def _coef_table(self): + buf = StringIO() + + buf.write('%14s %10s %10s %10s %10s %10s %10s\n' % + ('Variable', 'Coef', 'Std Err', 't-stat', + 'p-value', 'CI 2.5%', 'CI 97.5%')) + buf.write(scom.banner('')) + coef_template = '\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f' + + results = self._results + + beta = results['beta'] + + for i, name in enumerate(beta.index): + if i and not (i % 5): + buf.write('\n' + scom.banner('')) + + std_err = results['std_err'][name] + CI1 = beta[name] - 1.96 * std_err + CI2 = beta[name] + 1.96 * std_err + + t_stat = results['t_stat'][name] + p_value = results['p_value'][name] + + line = coef_template % (name, + beta[name], std_err, t_stat, p_value, CI1, CI2) + + buf.write(line) + + if self.nw_lags is not None: + buf.write('\n') + buf.write('*** The calculations are Newey-West ' + 'adjusted with lags %5d\n' % self.nw_lags) + + return buf.getvalue() + + @cache_readonly + def summary_as_matrix(self): + """Returns the formatted results of the OLS as a DataFrame.""" + results = self._results + beta = results['beta'] + data = {'beta': results['beta'], + 't-stat': results['t_stat'], + 'p-value': results['p_value'], + 'std err': results['std_err']} + return DataFrame(data, beta.index).T + + @cache_readonly + def summary(self): + """ + This returns the formatted result of the OLS computation + """ + template = """ +%(bannerTop)s + +Formula: Y ~ %(formula)s + +Number of Observations: %(nobs)d +Number of Degrees of Freedom: %(df)d + +R-squared: %(r2)10.4f +Adj R-squared: %(r2_adj)10.4f + +Rmse: %(rmse)10.4f + +F-stat %(f_stat_shape)s: %(f_stat)10.4f, p-value: %(f_stat_p_value)10.4f + +Degrees of Freedom: model %(df_model)d, resid %(df_resid)d + +%(bannerCoef)s +%(coef_table)s +%(bannerEnd)s +""" + coef_table = self._coef_table + + results = self._results + + f_stat = results['f_stat'] + + bracketed = ['<%s>' % str(c) for c in results['beta'].index] + + formula = StringIO() + formula.write(bracketed[0]) + tot = len(bracketed[0]) + line = 1 + for coef in bracketed[1:]: + tot = tot + len(coef) + 3 + + if tot // (68 * line): + formula.write('\n' + ' ' * 12) + line += 1 + + formula.write(' + ' + coef) + + params = { + 'bannerTop': scom.banner('Summary of Regression Analysis'), + 'bannerCoef': scom.banner('Summary of Estimated Coefficients'), + 'bannerEnd': scom.banner('End of Summary'), + 'formula': formula.getvalue(), + 'r2': results['r2'], + 'r2_adj': results['r2_adj'], + 'nobs': results['nobs'], + 'df': results['df'], + 'df_model': results['df_model'], + 'df_resid': results['df_resid'], + 'coef_table': coef_table, + 'rmse': results['rmse'], + 'f_stat': f_stat['f-stat'], + 'f_stat_shape': '(%d, %d)' % (f_stat['DF X'], f_stat['DF Resid']), + 'f_stat_p_value': f_stat['p-value'], + } + + return template % params + + def __unicode__(self): + return self.summary + + @cache_readonly + def _time_obs_count(self): + # XXX + return self._time_has_obs.astype(int) + + @property + def _total_times(self): + return self._time_has_obs.sum() + + +class MovingOLS(OLS): + """ + Runs a rolling/expanding simple OLS. 
+ + Parameters + ---------- + y : Series + x : Series, DataFrame, or dict of Series + weights : array-like, optional + 1d array of weights. If None, equivalent to an unweighted OLS. + window_type : {'full sample', 'rolling', 'expanding'} + Default expanding + window : int + size of window (for rolling/expanding OLS) + min_periods : int + Threshold of non-null data points to require. + If None, defaults to size of window. + intercept : bool + True if you want an intercept. + nw_lags : None or int + Number of Newey-West lags. + nw_overlap : boolean, default False + Assume data is overlapping when computing Newey-West estimator + + """ + def __init__(self, y, x, weights=None, window_type='expanding', + window=None, min_periods=None, intercept=True, + nw_lags=None, nw_overlap=False): + + self._args = dict(intercept=intercept, nw_lags=nw_lags, + nw_overlap=nw_overlap) + + OLS.__init__(self, y=y, x=x, weights=weights, **self._args) + + self._set_window(window_type, window, min_periods) + + def _set_window(self, window_type, window, min_periods): + self._window_type = scom._get_window_type(window_type) + + if self._is_rolling: + if window is None: + raise AssertionError("Must specify window.") + if min_periods is None: + min_periods = window + else: + window = len(self._x) + if min_periods is None: + min_periods = 1 + + self._window = int(window) + self._min_periods = min_periods + +#------------------------------------------------------------------------------ +# "Public" results + + @cache_readonly + def beta(self): + """Returns the betas in Series/DataFrame form.""" + return DataFrame(self._beta_raw, + index=self._result_index, + columns=self._x.columns) + + @cache_readonly + def rank(self): + return Series(self._rank_raw, index=self._result_index) + + @cache_readonly + def df(self): + """Returns the degrees of freedom.""" + return Series(self._df_raw, index=self._result_index) + + @cache_readonly + def df_model(self): + """Returns the model degrees of freedom.""" + return Series(self._df_model_raw, index=self._result_index) + + @cache_readonly + def df_resid(self): + """Returns the residual degrees of freedom.""" + return Series(self._df_resid_raw, index=self._result_index) + + @cache_readonly + def f_stat(self): + """Returns the f-stat value.""" + f_stat_dicts = dict((date, f_stat_to_dict(f_stat)) + for date, f_stat in zip(self.beta.index, + self._f_stat_raw)) + + return DataFrame(f_stat_dicts).T + + def f_test(self, hypothesis): + raise NotImplementedError('must use full sample') + + @cache_readonly + def forecast_mean(self): + return Series(self._forecast_mean_raw, index=self._result_index) + + @cache_readonly + def forecast_vol(self): + return Series(self._forecast_vol_raw, index=self._result_index) + + @cache_readonly + def p_value(self): + """Returns the p values.""" + cols = self.beta.columns + return DataFrame(self._p_value_raw, columns=cols, + index=self._result_index) + + @cache_readonly + def r2(self): + """Returns the r-squared values.""" + return Series(self._r2_raw, index=self._result_index) + + @cache_readonly + def resid(self): + """Returns the residuals.""" + return Series(self._resid_raw[self._valid_obs_labels], + index=self._result_index) + + @cache_readonly + def r2_adj(self): + """Returns the r-squared adjusted values.""" + index = self.r2.index + + return Series(self._r2_adj_raw, index=index) + + @cache_readonly + def rmse(self): + """Returns the rmse values.""" + return Series(self._rmse_raw, index=self._result_index) + + @cache_readonly + def std_err(self): + 
"""Returns the standard err values.""" + return DataFrame(self._std_err_raw, columns=self.beta.columns, + index=self._result_index) + + @cache_readonly + def t_stat(self): + """Returns the t-stat value.""" + return DataFrame(self._t_stat_raw, columns=self.beta.columns, + index=self._result_index) + + @cache_readonly + def var_beta(self): + """Returns the covariance of beta.""" + result = {} + result_index = self._result_index + for i in range(len(self._var_beta_raw)): + dm = DataFrame(self._var_beta_raw[i], columns=self.beta.columns, + index=self.beta.columns) + result[result_index[i]] = dm + + return Panel.from_dict(result, intersect=False) + + @cache_readonly + def y_fitted(self): + """Returns the fitted y values.""" + return Series(self._y_fitted_raw[self._valid_obs_labels], + index=self._result_index) + + @cache_readonly + def y_predict(self): + """Returns the predicted y values.""" + return Series(self._y_predict_raw[self._valid_obs_labels], + index=self._result_index) + +#------------------------------------------------------------------------------ +# "raw" attributes, calculations + + @property + def _is_rolling(self): + return self._window_type == 'rolling' + + @cache_readonly + def _beta_raw(self): + """Runs the regression and returns the beta.""" + beta, indices, mask = self._rolling_ols_call + + return beta[indices] + + @cache_readonly + def _result_index(self): + return self._index[self._valid_indices] + + @property + def _valid_indices(self): + return self._rolling_ols_call[1] + + @cache_readonly + def _rolling_ols_call(self): + return self._calc_betas(self._x_trans, self._y_trans) + + def _calc_betas(self, x, y): + N = len(self._index) + K = len(self._x.columns) + + betas = np.empty((N, K), dtype=float) + betas[:] = np.NaN + + valid = self._time_has_obs + enough = self._enough_obs + window = self._window + + # Use transformed (demeaned) Y, X variables + cum_xx = self._cum_xx(x) + cum_xy = self._cum_xy(x, y) + + for i in range(N): + if not valid[i] or not enough[i]: + continue + + xx = cum_xx[i] + xy = cum_xy[i] + if self._is_rolling and i >= window: + xx = xx - cum_xx[i - window] + xy = xy - cum_xy[i - window] + + betas[i] = math.solve(xx, xy) + + mask = -np.isnan(betas).any(axis=1) + have_betas = np.arange(N)[mask] + + return betas, have_betas, mask + + def _rolling_rank(self): + dates = self._index + window = self._window + + ranks = np.empty(len(dates), dtype=float) + ranks[:] = np.NaN + for i, date in enumerate(dates): + if self._is_rolling and i >= window: + prior_date = dates[i - window + 1] + else: + prior_date = dates[0] + + x_slice = self._x.truncate(before=prior_date, after=date).values + + if len(x_slice) == 0: + continue + + ranks[i] = math.rank(x_slice) + + return ranks + + def _cum_xx(self, x): + dates = self._index + K = len(x.columns) + valid = self._time_has_obs + cum_xx = [] + + slicer = lambda df, dt: df.truncate(dt, dt).values + if not self._panel_model: + _get_index = x.index.get_loc + + def slicer(df, dt): + i = _get_index(dt) + return df.values[i:i + 1, :] + + last = np.zeros((K, K)) + + for i, date in enumerate(dates): + if not valid[i]: + cum_xx.append(last) + continue + + x_slice = slicer(x, date) + xx = last = last + np.dot(x_slice.T, x_slice) + cum_xx.append(xx) + + return cum_xx + + def _cum_xy(self, x, y): + dates = self._index + valid = self._time_has_obs + cum_xy = [] + + x_slicer = lambda df, dt: df.truncate(dt, dt).values + if not self._panel_model: + _get_index = x.index.get_loc + + def x_slicer(df, dt): + i = _get_index(dt) + return 
df.values[i:i + 1] + + _y_get_index = y.index.get_loc + _values = y.values + if isinstance(y.index, MultiIndex): + def y_slicer(df, dt): + loc = _y_get_index(dt) + return _values[loc] + else: + def y_slicer(df, dt): + i = _y_get_index(dt) + return _values[i:i + 1] + + last = np.zeros(len(x.columns)) + for i, date in enumerate(dates): + if not valid[i]: + cum_xy.append(last) + continue + + x_slice = x_slicer(x, date) + y_slice = y_slicer(y, date) + + xy = last = last + np.dot(x_slice.T, y_slice) + cum_xy.append(xy) + + return cum_xy + + @cache_readonly + def _rank_raw(self): + rank = self._rolling_rank() + return rank[self._valid_indices] + + @cache_readonly + def _df_raw(self): + """Returns the degrees of freedom.""" + return self._rank_raw + + @cache_readonly + def _df_model_raw(self): + """Returns the raw model degrees of freedom.""" + return self._df_raw - 1 + + @cache_readonly + def _df_resid_raw(self): + """Returns the raw residual degrees of freedom.""" + return self._nobs - self._df_raw + + @cache_readonly + def _f_stat_raw(self): + """Returns the raw f-stat value.""" + from scipy.stats import f + + items = self.beta.columns + nobs = self._nobs + df = self._df_raw + df_resid = nobs - df + + # var_beta has not been newey-west adjusted + if self._nw_lags is None: + F = self._r2_raw / (self._r2_raw - self._r2_adj_raw) + + q = len(items) + if 'intercept' in items: + q -= 1 + + def get_result_simple(Fst, d): + return Fst, (q, d), 1 - f.cdf(Fst, q, d) + + # Compute the P-value for each pair + result = starmap(get_result_simple, zip(F, df_resid)) + + return list(result) + + K = len(items) + R = np.eye(K) + r = np.zeros((K, 1)) + + try: + intercept = items.get_loc('intercept') + R = np.concatenate((R[0: intercept], R[intercept + 1:])) + r = np.concatenate((r[0: intercept], r[intercept + 1:])) + except KeyError: + # no intercept + pass + + def get_result(beta, vcov, n, d): + return math.calc_F(R, r, beta, vcov, n, d) + + results = starmap(get_result, + zip(self._beta_raw, self._var_beta_raw, nobs, df)) + + return list(results) + + @cache_readonly + def _p_value_raw(self): + """Returns the raw p values.""" + from scipy.stats import t + + result = [2 * t.sf(a, b) + for a, b in zip(np.fabs(self._t_stat_raw), + self._df_resid_raw)] + + return np.array(result) + + @cache_readonly + def _resid_stats(self): + uncentered_sst = [] + sst = [] + sse = [] + + Yreg = self._y + Y = self._y_trans + X = self._x_trans + weights = self._weights + + dates = self._index + window = self._window + for n, index in enumerate(self._valid_indices): + if self._is_rolling and index >= window: + prior_date = dates[index - window + 1] + else: + prior_date = dates[0] + + date = dates[index] + beta = self._beta_raw[n] + + X_slice = X.truncate(before=prior_date, after=date).values + Y_slice = _y_converter(Y.truncate(before=prior_date, after=date)) + + resid = Y_slice - np.dot(X_slice, beta) + + if weights is not None: + Y_slice = _y_converter(Yreg.truncate(before=prior_date, + after=date)) + weights_slice = weights.truncate(prior_date, date) + demeaned = Y_slice - np.average(Y_slice, weights=weights_slice) + SS_total = (weights_slice * demeaned ** 2).sum() + else: + SS_total = ((Y_slice - Y_slice.mean()) ** 2).sum() + + SS_err = (resid ** 2).sum() + SST_uncentered = (Y_slice ** 2).sum() + + sse.append(SS_err) + sst.append(SS_total) + uncentered_sst.append(SST_uncentered) + + return { + 'sse': np.array(sse), + 'centered_tss': np.array(sst), + 'uncentered_tss': np.array(uncentered_sst), + } + + @cache_readonly + def 
_rmse_raw(self): + """Returns the raw rmse values.""" + return np.sqrt(self._resid_stats['sse'] / self._df_resid_raw) + + @cache_readonly + def _r2_raw(self): + rs = self._resid_stats + + if self._use_centered_tss: + return 1 - rs['sse'] / rs['centered_tss'] + else: + return 1 - rs['sse'] / rs['uncentered_tss'] + + @cache_readonly + def _r2_adj_raw(self): + """Returns the raw r-squared adjusted values.""" + nobs = self._nobs + factors = (nobs - 1) / (nobs - self._df_raw) + return 1 - (1 - self._r2_raw) * factors + + @cache_readonly + def _resid_raw(self): + """Returns the raw residuals.""" + return (self._y.values - self._y_fitted_raw) + + @cache_readonly + def _std_err_raw(self): + """Returns the raw standard err values.""" + results = [] + for i in range(len(self._var_beta_raw)): + results.append(np.sqrt(np.diag(self._var_beta_raw[i]))) + + return np.array(results) + + @cache_readonly + def _t_stat_raw(self): + """Returns the raw t-stat value.""" + return self._beta_raw / self._std_err_raw + + @cache_readonly + def _var_beta_raw(self): + """Returns the raw covariance of beta.""" + x = self._x_trans + y = self._y_trans + dates = self._index + nobs = self._nobs + rmse = self._rmse_raw + beta = self._beta_raw + df = self._df_raw + window = self._window + cum_xx = self._cum_xx(self._x) + + results = [] + for n, i in enumerate(self._valid_indices): + xx = cum_xx[i] + date = dates[i] + + if self._is_rolling and i >= window: + xx = xx - cum_xx[i - window] + prior_date = dates[i - window + 1] + else: + prior_date = dates[0] + + x_slice = x.truncate(before=prior_date, after=date) + y_slice = y.truncate(before=prior_date, after=date) + xv = x_slice.values + yv = np.asarray(y_slice) + + if self._nw_lags is None: + result = math.inv(xx) * (rmse[n] ** 2) + else: + resid = yv - np.dot(xv, beta[n]) + m = (xv.T * resid).T + + xeps = math.newey_west(m, self._nw_lags, nobs[n], df[n], + self._nw_overlap) + + xx_inv = math.inv(xx) + result = np.dot(xx_inv, np.dot(xeps, xx_inv)) + + results.append(result) + + return np.array(results) + + @cache_readonly + def _forecast_mean_raw(self): + """Returns the raw covariance of beta.""" + nobs = self._nobs + window = self._window + + # x should be ones + dummy = DataFrame(index=self._y.index) + dummy['y'] = 1 + + cum_xy = self._cum_xy(dummy, self._y) + + results = [] + for n, i in enumerate(self._valid_indices): + sumy = cum_xy[i] + + if self._is_rolling and i >= window: + sumy = sumy - cum_xy[i - window] + + results.append(sumy[0] / nobs[n]) + + return np.array(results) + + @cache_readonly + def _forecast_vol_raw(self): + """Returns the raw covariance of beta.""" + beta = self._beta_raw + window = self._window + dates = self._index + x = self._x + + results = [] + for n, i in enumerate(self._valid_indices): + date = dates[i] + if self._is_rolling and i >= window: + prior_date = dates[i - window + 1] + else: + prior_date = dates[0] + + x_slice = x.truncate(prior_date, date).values + x_demeaned = x_slice - x_slice.mean(0) + x_cov = np.dot(x_demeaned.T, x_demeaned) / (len(x_slice) - 1) + + B = beta[n] + result = np.dot(B, np.dot(x_cov, B)) + results.append(np.sqrt(result)) + + return np.array(results) + + @cache_readonly + def _y_fitted_raw(self): + """Returns the raw fitted y values.""" + return (self._x.values * self._beta_matrix(lag=0)).sum(1) + + @cache_readonly + def _y_predict_raw(self): + """Returns the raw predicted y values.""" + return (self._x.values * self._beta_matrix(lag=1)).sum(1) + + @cache_readonly + def _results(self): + results = {} + for result in 
self.RESULT_FIELDS: + value = getattr(self, result) + if isinstance(value, Series): + value = value[self.beta.index[-1]] + elif isinstance(value, DataFrame): + value = value.xs(self.beta.index[-1]) + else: # pragma: no cover + raise Exception('Problem retrieving %s' % result) + results[result] = value + + return results + + @cache_readonly + def _window_time_obs(self): + window_obs = moments.rolling_sum(self._time_obs_count > 0, + self._window, min_periods=1) + + window_obs[np.isnan(window_obs)] = 0 + return window_obs.astype(int) + + @cache_readonly + def _nobs_raw(self): + if self._is_rolling: + window = self._window + else: + # expanding case + window = len(self._index) + + result = moments.rolling_sum(self._time_obs_count, window, + min_periods=1) + + return result.astype(int) + + def _beta_matrix(self, lag=0): + if lag < 0: + raise AssertionError("'lag' must be greater than or equal to 0, " + "input was {0}".format(lag)) + + betas = self._beta_raw + + labels = np.arange(len(self._y)) - lag + indexer = self._valid_obs_labels.searchsorted(labels, side='left') + indexer[indexer == len(betas)] = len(betas) - 1 + + beta_matrix = betas[indexer] + beta_matrix[labels < self._valid_obs_labels[0]] = np.NaN + + return beta_matrix + + @cache_readonly + def _valid_obs_labels(self): + dates = self._index[self._valid_indices] + return self._y.index.searchsorted(dates) + + @cache_readonly + def _nobs(self): + return self._nobs_raw[self._valid_indices] + + @property + def nobs(self): + return Series(self._nobs, index=self._result_index) + + @cache_readonly + def _enough_obs(self): + # XXX: what's the best way to determine where to start? + return self._nobs_raw >= max(self._min_periods, + len(self._x.columns) + 1) + + +def _safe_update(d, other): + """ + Combine dictionaries with non-overlapping keys + """ + for k, v in compat.iteritems(other): + if k in d: + raise Exception('Duplicate regressor: %s' % k) + + d[k] = v + + +def _filter_data(lhs, rhs, weights=None): + """ + Cleans the input for single OLS. + + Parameters + ---------- + lhs : Series + Dependent variable in the regression. + rhs : dict, whose values are Series, DataFrame, or dict + Explanatory variables of the regression. + weights : array-like, optional + 1d array of weights. If None, equivalent to an unweighted OLS. 
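+
+ Rows where ``lhs``, any column of ``rhs``, or ``weights`` is missing are
+ dropped before fitting; ``pre_filt_rhs`` only requires complete ``rhs``
+ rows, so observations with known regressors but a missing ``lhs`` remain
+ available for prediction.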
+ + Returns + ------- + Series, DataFrame + Cleaned lhs and rhs + """ + if not isinstance(lhs, Series): + if len(lhs) != len(rhs): + raise AssertionError("length of lhs must equal length of rhs") + lhs = Series(lhs, index=rhs.index) + + rhs = _combine_rhs(rhs) + lhs = DataFrame({'__y__': lhs}, dtype=float) + pre_filt_rhs = rhs.dropna(how='any') + + combined = rhs.join(lhs, how='outer') + if weights is not None: + combined['__weights__'] = weights + + valid = (combined.count(1) == len(combined.columns)).values + index = combined.index + combined = combined[valid] + + if weights is not None: + filt_weights = combined.pop('__weights__') + else: + filt_weights = None + + filt_lhs = combined.pop('__y__') + filt_rhs = combined + + if hasattr(filt_weights,'to_dense'): + filt_weights = filt_weights.to_dense() + + return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights, + pre_filt_rhs.to_dense(), index, valid) + + +def _combine_rhs(rhs): + """ + Glue input X variables together while checking for potential + duplicates + """ + series = {} + + if isinstance(rhs, Series): + series['x'] = rhs + elif isinstance(rhs, DataFrame): + series = rhs.copy() + elif isinstance(rhs, dict): + for name, value in compat.iteritems(rhs): + if isinstance(value, Series): + _safe_update(series, {name: value}) + elif isinstance(value, (dict, DataFrame)): + _safe_update(series, value) + else: # pragma: no cover + raise Exception('Invalid RHS data type: %s' % type(value)) + else: # pragma: no cover + raise Exception('Invalid RHS type: %s' % type(rhs)) + + if not isinstance(series, DataFrame): + series = DataFrame(series, dtype=float) + + return series + +# A little kludge so we can use this method for both +# MovingOLS and MovingPanelOLS + + +def _y_converter(y): + y = y.values.squeeze() + if y.ndim == 0: # pragma: no cover + return np.array([y]) + else: + return y + + +def f_stat_to_dict(result): + f_stat, shape, p_value = result + + result = {} + result['f-stat'] = f_stat + result['DF X'] = shape[0] + result['DF Resid'] = shape[1] + result['p-value'] = p_value + + return result diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py new file mode 100644 index 00000000..3c671194 --- /dev/null +++ b/pandas/stats/plm.py @@ -0,0 +1,814 @@ +""" +Linear regression objects for panel data +""" + +# pylint: disable-msg=W0231 +# pylint: disable-msg=E1101,E1103 + +from __future__ import division +from pandas.compat import range +from pandas import compat +import warnings + +import numpy as np + +from pandas.core.panel import Panel +from pandas.core.frame import DataFrame +from pandas.core.reshape import get_dummies +from pandas.core.series import Series +from pandas.core.sparse import SparsePanel +from pandas.stats.ols import OLS, MovingOLS +import pandas.stats.common as com +import pandas.stats.math as math +from pandas.util.decorators import cache_readonly + + +class PanelOLS(OLS): + """Implements panel OLS. 
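+
+ The input panel (dates x entities) is stacked into a long, MultiIndexed
+ frame, optional entity / time fixed-effect dummies are appended, and a
+ single pooled regression is run on the stacked data.
+
+ A minimal usage sketch (the variable names and toy data below are
+ illustrative assumptions, not taken from the library or its tests):
+
+ >>> import numpy as np
+ >>> from pandas import DataFrame, bdate_range, ols
+ >>> idx = bdate_range('2000-01-03', periods=6)
+ >>> y = DataFrame(np.random.randn(6, 2), index=idx, columns=['ent1', 'ent2'])
+ >>> x = {'f1': DataFrame(np.random.randn(6, 2), index=idx, columns=['ent1', 'ent2'])}
+ >>> model = ols(y=y, x=x, entity_effects=True)  # panel-shaped input reaches this class
+ >>> model.beta  # coefficients for 'f1', the retained entity dummy and the intercept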
+ + See ols function docs + """ + _panel_model = True + + def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, + entity_effects=False, time_effects=False, x_effects=None, + cluster=None, dropped_dummies=None, verbose=False, + nw_overlap=False): + self._x_orig = x + self._y_orig = y + self._weights = weights + + self._intercept = intercept + self._nw_lags = nw_lags + self._nw_overlap = nw_overlap + self._entity_effects = entity_effects + self._time_effects = time_effects + self._x_effects = x_effects + self._dropped_dummies = dropped_dummies or {} + self._cluster = com._get_cluster_type(cluster) + self._verbose = verbose + + (self._x, self._x_trans, + self._x_filtered, self._y, + self._y_trans) = self._prepare_data() + + self._index = self._x.index.levels[0] + + self._T = len(self._index) + + def log(self, msg): + if self._verbose: # pragma: no cover + print(msg) + + def _prepare_data(self): + """Cleans and stacks input data into DataFrame objects + + If time effects is True, then we turn off intercepts and omit an item + from every (entity and x) fixed effect. + + Otherwise: + - If we have an intercept, we omit an item from every fixed effect. + - Else, we omit an item from every fixed effect except one of them. + + The categorical variables will get dropped from x. + """ + (x, x_filtered, y, weights, cat_mapping) = self._filter_data() + + self.log('Adding dummies to X variables') + x = self._add_dummies(x, cat_mapping) + + self.log('Adding dummies to filtered X variables') + x_filtered = self._add_dummies(x_filtered, cat_mapping) + + if self._x_effects: + x = x.drop(self._x_effects, axis=1) + x_filtered = x_filtered.drop(self._x_effects, axis=1) + + if self._time_effects: + x_regressor = x.sub(x.mean(level=0), level=0) + + unstacked_y = y.unstack() + y_regressor = unstacked_y.sub(unstacked_y.mean(1), axis=0).stack() + y_regressor.index = y.index + + elif self._intercept: + # only add intercept when no time effects + self.log('Adding intercept') + x = x_regressor = add_intercept(x) + x_filtered = add_intercept(x_filtered) + y_regressor = y + else: + self.log('No intercept added') + x_regressor = x + y_regressor = y + + if weights is not None: + if not y_regressor.index.equals(weights.index): + raise AssertionError("y_regressor and weights must have the " + "same index") + if not x_regressor.index.equals(weights.index): + raise AssertionError("x_regressor and weights must have the " + "same index") + + rt_weights = np.sqrt(weights) + y_regressor = y_regressor * rt_weights + x_regressor = x_regressor.mul(rt_weights, axis=0) + + return x, x_regressor, x_filtered, y, y_regressor + + def _filter_data(self): + """ + + """ + data = self._x_orig + cat_mapping = {} + + if isinstance(data, DataFrame): + data = data.to_panel() + else: + if isinstance(data, Panel): + data = data.copy() + + if not isinstance(data, SparsePanel): + data, cat_mapping = self._convert_x(data) + + if not isinstance(data, Panel): + data = Panel.from_dict(data, intersect=True) + + x_names = data.items + + if self._weights is not None: + data['__weights__'] = self._weights + + # Filter x's without y (so we can make a prediction) + filtered = data.to_frame() + + # Filter all data together using to_frame + + # convert to DataFrame + y = self._y_orig + if isinstance(y, Series): + y = y.unstack() + + data['__y__'] = y + data_long = data.to_frame() + + x_filt = filtered.filter(x_names) + x = data_long.filter(x_names) + y = data_long['__y__'] + + if self._weights is not None and not self._weights.empty: + weights = 
data_long['__weights__'] + else: + weights = None + + return x, x_filt, y, weights, cat_mapping + + def _convert_x(self, x): + # Converts non-numeric data in x to floats. x_converted is the + # DataFrame with converted values, and x_conversion is a dict that + # provides the reverse mapping. For example, if 'A' was converted to 0 + # for x named 'variety', then x_conversion['variety'][0] is 'A'. + x_converted = {} + cat_mapping = {} + # x can be either a dict or a Panel, but in Python 3, dicts don't have + # .iteritems + iteritems = getattr(x, 'iteritems', x.items) + for key, df in iteritems(): + if not isinstance(df, DataFrame): + raise AssertionError("all input items must be DataFrames, " + "at least one is of " + "type {0}".format(type(df))) + + if _is_numeric(df): + x_converted[key] = df + else: + try: + df = df.astype(float) + except (TypeError, ValueError): + values = df.values + distinct_values = sorted(set(values.flat)) + cat_mapping[key] = dict(enumerate(distinct_values)) + new_values = np.searchsorted(distinct_values, values) + x_converted[key] = DataFrame(new_values, index=df.index, + columns=df.columns) + + if len(cat_mapping) == 0: + x_converted = x + + return x_converted, cat_mapping + + def _add_dummies(self, panel, mapping): + """ + Add entity and / or categorical dummies to input X DataFrame + + Returns + ------- + DataFrame + """ + panel = self._add_entity_effects(panel) + panel = self._add_categorical_dummies(panel, mapping) + + return panel + + def _add_entity_effects(self, panel): + """ + Add entity dummies to panel + + Returns + ------- + DataFrame + """ + from pandas.core.reshape import make_axis_dummies + + if not self._entity_effects: + return panel + + self.log('-- Adding entity fixed effect dummies') + + dummies = make_axis_dummies(panel, 'minor') + + if not self._use_all_dummies: + if 'entity' in self._dropped_dummies: + to_exclude = str(self._dropped_dummies.get('entity')) + else: + to_exclude = dummies.columns[0] + + if to_exclude not in dummies.columns: + raise Exception('%s not in %s' % (to_exclude, + dummies.columns)) + + self.log('-- Excluding dummy for entity: %s' % to_exclude) + + dummies = dummies.filter(dummies.columns - [to_exclude]) + + dummies = dummies.add_prefix('FE_') + panel = panel.join(dummies) + + return panel + + def _add_categorical_dummies(self, panel, cat_mappings): + """ + Add categorical dummies to panel + + Returns + ------- + DataFrame + """ + if not self._x_effects: + return panel + + dropped_dummy = (self._entity_effects and not self._use_all_dummies) + + for effect in self._x_effects: + self.log('-- Adding fixed effect dummies for %s' % effect) + + dummies = get_dummies(panel[effect]) + + val_map = cat_mappings.get(effect) + if val_map: + val_map = dict((v, k) for k, v in compat.iteritems(val_map)) + + if dropped_dummy or not self._use_all_dummies: + if effect in self._dropped_dummies: + to_exclude = mapped_name = self._dropped_dummies.get( + effect) + + if val_map: + mapped_name = val_map[to_exclude] + else: + to_exclude = mapped_name = dummies.columns[0] + + if mapped_name not in dummies.columns: # pragma: no cover + raise Exception('%s not in %s' % (to_exclude, + dummies.columns)) + + self.log( + '-- Excluding dummy for %s: %s' % (effect, to_exclude)) + + dummies = dummies.filter(dummies.columns - [mapped_name]) + dropped_dummy = True + + dummies = _convertDummies(dummies, cat_mappings.get(effect)) + dummies = dummies.add_prefix('%s_' % effect) + panel = panel.join(dummies) + + return panel + + @property + def 
_use_all_dummies(self): + """ + In the case of using an intercept or including time fixed + effects, completely partitioning the sample would make the X + not full rank. + """ + return (not self._intercept and not self._time_effects) + + @cache_readonly + def _beta_raw(self): + """Runs the regression and returns the beta.""" + X = self._x_trans.values + Y = self._y_trans.values.squeeze() + + beta, _, _, _ = np.linalg.lstsq(X, Y) + + return beta + + @cache_readonly + def beta(self): + return Series(self._beta_raw, index=self._x.columns) + + @cache_readonly + def _df_model_raw(self): + """Returns the raw model degrees of freedom.""" + return self._df_raw - 1 + + @cache_readonly + def _df_resid_raw(self): + """Returns the raw residual degrees of freedom.""" + return self._nobs - self._df_raw + + @cache_readonly + def _df_raw(self): + """Returns the degrees of freedom.""" + df = math.rank(self._x_trans.values) + if self._time_effects: + df += self._total_times + + return df + + @cache_readonly + def _r2_raw(self): + Y = self._y_trans.values.squeeze() + X = self._x_trans.values + + resid = Y - np.dot(X, self._beta_raw) + + SSE = (resid ** 2).sum() + + if self._use_centered_tss: + SST = ((Y - np.mean(Y)) ** 2).sum() + else: + SST = (Y ** 2).sum() + + return 1 - SSE / SST + + @property + def _use_centered_tss(self): + # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR + return self._intercept or self._entity_effects or self._time_effects + + @cache_readonly + def _r2_adj_raw(self): + """Returns the raw r-squared adjusted values.""" + nobs = self._nobs + factors = (nobs - 1) / (nobs - self._df_raw) + return 1 - (1 - self._r2_raw) * factors + + @cache_readonly + def _resid_raw(self): + Y = self._y.values.squeeze() + X = self._x.values + return Y - np.dot(X, self._beta_raw) + + @cache_readonly + def resid(self): + return self._unstack_vector(self._resid_raw) + + @cache_readonly + def _rmse_raw(self): + """Returns the raw rmse values.""" + # X = self._x.values + # Y = self._y.values.squeeze() + + X = self._x_trans.values + Y = self._y_trans.values.squeeze() + + resid = Y - np.dot(X, self._beta_raw) + ss = (resid ** 2).sum() + return np.sqrt(ss / (self._nobs - self._df_raw)) + + @cache_readonly + def _var_beta_raw(self): + cluster_axis = None + if self._cluster == 'time': + cluster_axis = 0 + elif self._cluster == 'entity': + cluster_axis = 1 + + x = self._x + y = self._y + + if self._time_effects: + xx = _xx_time_effects(x, y) + else: + xx = np.dot(x.values.T, x.values) + + return _var_beta_panel(y, x, self._beta_raw, xx, + self._rmse_raw, cluster_axis, self._nw_lags, + self._nobs, self._df_raw, self._nw_overlap) + + @cache_readonly + def _y_fitted_raw(self): + """Returns the raw fitted y values.""" + return np.dot(self._x.values, self._beta_raw) + + @cache_readonly + def y_fitted(self): + return self._unstack_vector(self._y_fitted_raw, index=self._x.index) + + def _unstack_vector(self, vec, index=None): + if index is None: + index = self._y_trans.index + panel = DataFrame(vec, index=index, columns=['dummy']) + return panel.to_panel()['dummy'] + + def _unstack_y(self, vec): + unstacked = self._unstack_vector(vec) + return unstacked.reindex(self.beta.index) + + @cache_readonly + def _time_obs_count(self): + return self._y_trans.count(level=0).values + + @cache_readonly + def _time_has_obs(self): + return self._time_obs_count > 0 + + @property + def _nobs(self): + return len(self._y) + + +def _convertDummies(dummies, mapping): + # cleans up the names of the generated dummies + new_items = [] + 
for item in dummies.columns: + if not mapping: + var = str(item) + if isinstance(item, float): + var = '%g' % item + + new_items.append(var) + else: + # renames the dummies if a conversion dict is provided + new_items.append(mapping[int(item)]) + + dummies = DataFrame(dummies.values, index=dummies.index, + columns=new_items) + + return dummies + + +def _is_numeric(df): + for col in df: + if df[col].dtype.name == 'object': + return False + + return True + + +def add_intercept(panel, name='intercept'): + """ + Add column of ones to input panel + + Parameters + ---------- + panel: Panel / DataFrame + name: string, default 'intercept'] + + Returns + ------- + New object (same type as input) + """ + panel = panel.copy() + panel[name] = 1. + + return panel.consolidate() + + +class MovingPanelOLS(MovingOLS, PanelOLS): + """Implements rolling/expanding panel OLS. + + See ols function docs + """ + _panel_model = True + + def __init__(self, y, x, weights=None, + window_type='expanding', window=None, + min_periods=None, + min_obs=None, + intercept=True, + nw_lags=None, nw_overlap=False, + entity_effects=False, + time_effects=False, + x_effects=None, + cluster=None, + dropped_dummies=None, + verbose=False): + + self._args = dict(intercept=intercept, + nw_lags=nw_lags, + nw_overlap=nw_overlap, + entity_effects=entity_effects, + time_effects=time_effects, + x_effects=x_effects, + cluster=cluster, + dropped_dummies=dropped_dummies, + verbose=verbose) + + PanelOLS.__init__(self, y=y, x=x, weights=weights, + **self._args) + + self._set_window(window_type, window, min_periods) + + if min_obs is None: + min_obs = len(self._x.columns) + 1 + + self._min_obs = min_obs + + @cache_readonly + def resid(self): + return self._unstack_y(self._resid_raw) + + @cache_readonly + def y_fitted(self): + return self._unstack_y(self._y_fitted_raw) + + @cache_readonly + def y_predict(self): + """Returns the predicted y values.""" + return self._unstack_y(self._y_predict_raw) + + def lagged_y_predict(self, lag=1): + """ + Compute forecast Y value lagging coefficient by input number + of time periods + + Parameters + ---------- + lag : int + + Returns + ------- + DataFrame + """ + x = self._x.values + betas = self._beta_matrix(lag=lag) + return self._unstack_y((betas * x).sum(1)) + + @cache_readonly + def _rolling_ols_call(self): + return self._calc_betas(self._x_trans, self._y_trans) + + @cache_readonly + def _df_raw(self): + """Returns the degrees of freedom.""" + df = self._rolling_rank() + + if self._time_effects: + df += self._window_time_obs + + return df[self._valid_indices] + + @cache_readonly + def _var_beta_raw(self): + """Returns the raw covariance of beta.""" + x = self._x + y = self._y + + dates = x.index.levels[0] + + cluster_axis = None + if self._cluster == 'time': + cluster_axis = 0 + elif self._cluster == 'entity': + cluster_axis = 1 + + nobs = self._nobs + rmse = self._rmse_raw + beta = self._beta_raw + df = self._df_raw + window = self._window + + if not self._time_effects: + # Non-transformed X + cum_xx = self._cum_xx(x) + + results = [] + for n, i in enumerate(self._valid_indices): + if self._is_rolling and i >= window: + prior_date = dates[i - window + 1] + else: + prior_date = dates[0] + + date = dates[i] + + x_slice = x.truncate(prior_date, date) + y_slice = y.truncate(prior_date, date) + + if self._time_effects: + xx = _xx_time_effects(x_slice, y_slice) + else: + xx = cum_xx[i] + if self._is_rolling and i >= window: + xx = xx - cum_xx[i - window] + + result = _var_beta_panel(y_slice, x_slice, beta[n], 
xx, rmse[n], + cluster_axis, self._nw_lags, + nobs[n], df[n], self._nw_overlap) + + results.append(result) + + return np.array(results) + + @cache_readonly + def _resid_raw(self): + beta_matrix = self._beta_matrix(lag=0) + + Y = self._y.values.squeeze() + X = self._x.values + resid = Y - (X * beta_matrix).sum(1) + + return resid + + @cache_readonly + def _y_fitted_raw(self): + x = self._x.values + betas = self._beta_matrix(lag=0) + return (betas * x).sum(1) + + @cache_readonly + def _y_predict_raw(self): + """Returns the raw predicted y values.""" + x = self._x.values + betas = self._beta_matrix(lag=1) + return (betas * x).sum(1) + + def _beta_matrix(self, lag=0): + if lag < 0: + raise AssertionError("'lag' must be greater than or equal to 0, " + "input was {0}".format(lag)) + + index = self._y_trans.index + major_labels = index.labels[0] + labels = major_labels - lag + indexer = self._valid_indices.searchsorted(labels, side='left') + + beta_matrix = self._beta_raw[indexer] + beta_matrix[labels < self._valid_indices[0]] = np.NaN + + return beta_matrix + + @cache_readonly + def _enough_obs(self): + # XXX: what's the best way to determine where to start? + # TODO: write unit tests for this + + rank_threshold = len(self._x.columns) + 1 + if self._min_obs < rank_threshold: # pragma: no cover + warnings.warn('min_obs is smaller than rank of X matrix') + + enough_observations = self._nobs_raw >= self._min_obs + enough_time_periods = self._window_time_obs >= self._min_periods + return enough_time_periods & enough_observations + + +def create_ols_dict(attr): + def attr_getter(self): + d = {} + for k, v in compat.iteritems(self.results): + result = getattr(v, attr) + d[k] = result + + return d + + return attr_getter + + +def create_ols_attr(attr): + return property(create_ols_dict(attr)) + + +class NonPooledPanelOLS(object): + """Implements non-pooled panel OLS. + + Parameters + ---------- + y : DataFrame + x : Series, DataFrame, or dict of Series + intercept : bool + True if you want an intercept. + nw_lags : None or int + Number of Newey-West lags. 
+ window_type : {'full_sample', 'rolling', 'expanding'} + 'full_sample' by default + window : int + size of window (for rolling/expanding OLS) + """ + + ATTRIBUTES = [ + 'beta', + 'df', + 'df_model', + 'df_resid', + 'f_stat', + 'p_value', + 'r2', + 'r2_adj', + 'resid', + 'rmse', + 'std_err', + 'summary_as_matrix', + 't_stat', + 'var_beta', + 'x', + 'y', + 'y_fitted', + 'y_predict' + ] + + def __init__(self, y, x, window_type='full_sample', window=None, + min_periods=None, intercept=True, nw_lags=None, + nw_overlap=False): + + for attr in self.ATTRIBUTES: + setattr(self.__class__, attr, create_ols_attr(attr)) + + results = {} + + for entity in y: + entity_y = y[entity] + + entity_x = {} + for x_var in x: + entity_x[x_var] = x[x_var][entity] + + from pandas.stats.interface import ols + results[entity] = ols(y=entity_y, + x=entity_x, + window_type=window_type, + window=window, + min_periods=min_periods, + intercept=intercept, + nw_lags=nw_lags, + nw_overlap=nw_overlap) + + self.results = results + + +def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, + nw_lags, nobs, df, nw_overlap): + from pandas.core.frame import group_agg + xx_inv = math.inv(xx) + + yv = y.values + + if cluster_axis is None: + if nw_lags is None: + return xx_inv * (rmse ** 2) + else: + resid = yv - np.dot(x.values, beta) + m = (x.values.T * resid).T + + xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap) + + return np.dot(xx_inv, np.dot(xeps, xx_inv)) + else: + Xb = np.dot(x.values, beta).reshape((len(x.values), 1)) + resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid']) + + if cluster_axis == 1: + x = x.swaplevel(0, 1).sortlevel(0) + resid = resid.swaplevel(0, 1).sortlevel(0) + + m = group_agg(x.values * resid.values, x.index._bounds, + lambda x: np.sum(x, axis=0)) + + if nw_lags is None: + nw_lags = 0 + + xox = 0 + for i in range(len(x.index.levels[0])): + xox += math.newey_west(m[i: i + 1], nw_lags, + nobs, df, nw_overlap) + + return np.dot(xx_inv, np.dot(xox, xx_inv)) + + +def _xx_time_effects(x, y): + """ + Returns X'X - (X'T) (T'T)^-1 (T'X) + """ + # X'X + xx = np.dot(x.values.T, x.values) + xt = x.sum(level=0).values + + count = y.unstack().count(1).values + selector = count > 0 + + # X'X - (T'T)^-1 (T'X) + xt = xt[selector] + count = count[selector] + + return xx - np.dot(xt.T / count, xt) diff --git a/pandas/stats/tests/__init__.py b/pandas/stats/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/stats/tests/common.py b/pandas/stats/tests/common.py new file mode 100644 index 00000000..717eb512 --- /dev/null +++ b/pandas/stats/tests/common.py @@ -0,0 +1,160 @@ +# pylint: disable-msg=W0611,W0402 + +from datetime import datetime +import string +import nose + +import numpy as np + +from pandas import DataFrame, bdate_range +from pandas.util.testing import assert_almost_equal # imported in other tests +import pandas.util.testing as tm + +N = 100 +K = 4 + +start = datetime(2007, 1, 1) +DATE_RANGE = bdate_range(start, periods=N) + +COLS = ['Col' + c for c in string.ascii_uppercase[:K]] + + +def makeDataFrame(): + data = DataFrame(np.random.randn(N, K), + columns=COLS, + index=DATE_RANGE) + + return data + + +def getBasicDatasets(): + A = makeDataFrame() + B = makeDataFrame() + C = makeDataFrame() + + return A, B, C + + +def check_for_scipy(): + try: + import scipy + except ImportError: + raise nose.SkipTest('no scipy') + + +def check_for_statsmodels(): + _have_statsmodels = True + try: + import statsmodels.api as sm + except ImportError: + try: + import 
scikits.statsmodels.api as sm + except ImportError: + raise nose.SkipTest('no statsmodels') + + +class BaseTest(tm.TestCase): + def setUp(self): + check_for_scipy() + check_for_statsmodels() + + self.A, self.B, self.C = getBasicDatasets() + + self.createData1() + self.createData2() + self.createData3() + + def createData1(self): + date = datetime(2007, 1, 1) + date2 = datetime(2007, 1, 15) + date3 = datetime(2007, 1, 22) + + A = self.A.copy() + B = self.B.copy() + C = self.C.copy() + + A['ColA'][date] = np.NaN + B['ColA'][date] = np.NaN + C['ColA'][date] = np.NaN + C['ColA'][date2] = np.NaN + + # truncate data to save time + A = A[:30] + B = B[:30] + C = C[:30] + + self.panel_y = A + self.panel_x = {'B': B, 'C': C} + + self.series_panel_y = A.filter(['ColA']) + self.series_panel_x = {'B': B.filter(['ColA']), + 'C': C.filter(['ColA'])} + self.series_y = A['ColA'] + self.series_x = {'B': B['ColA'], + 'C': C['ColA']} + + def createData2(self): + y_data = [[1, np.NaN], + [2, 3], + [4, 5]] + y_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)] + y_cols = ['A', 'B'] + self.panel_y2 = DataFrame(np.array(y_data), index=y_index, + columns=y_cols) + + x1_data = [[6, np.NaN], + [7, 8], + [9, 30], + [11, 12]] + x1_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4)] + x1_cols = ['A', 'B'] + x1 = DataFrame(np.array(x1_data), index=x1_index, + columns=x1_cols) + + x2_data = [[13, 14, np.NaN], + [15, np.NaN, np.NaN], + [16, 17, 48], + [19, 20, 21], + [22, 23, 24]] + x2_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5)] + x2_cols = ['C', 'A', 'B'] + x2 = DataFrame(np.array(x2_data), index=x2_index, + columns=x2_cols) + + self.panel_x2 = {'x1': x1, 'x2': x2} + + def createData3(self): + y_data = [[1, 2], + [3, 4]] + y_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2)] + y_cols = ['A', 'B'] + self.panel_y3 = DataFrame(np.array(y_data), index=y_index, + columns=y_cols) + + x1_data = [['A', 'B'], + ['C', 'A']] + x1_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2)] + x1_cols = ['A', 'B'] + x1 = DataFrame(np.array(x1_data), index=x1_index, + columns=x1_cols) + + x2_data = [['foo', 'bar'], + ['baz', 'foo']] + x2_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2)] + x2_cols = ['A', 'B'] + x2 = DataFrame(np.array(x2_data), index=x2_index, + columns=x2_cols) + + self.panel_x3 = {'x1': x1, 'x2': x2} diff --git a/pandas/stats/tests/test_fama_macbeth.py b/pandas/stats/tests/test_fama_macbeth.py new file mode 100644 index 00000000..dd2f1963 --- /dev/null +++ b/pandas/stats/tests/test_fama_macbeth.py @@ -0,0 +1,64 @@ +from pandas import DataFrame, Panel +from pandas.stats.api import fama_macbeth +from .common import assert_almost_equal, BaseTest + +from pandas.compat import range +from pandas import compat +import numpy as np + + +class TestFamaMacBeth(BaseTest): + def testFamaMacBethRolling(self): + # self.checkFamaMacBethExtended('rolling', self.panel_x, self.panel_y, + # nw_lags_beta=2) + + # df = DataFrame(np.random.randn(50, 10)) + x = dict((k, DataFrame(np.random.randn(50, 10))) for k in 'abcdefg') + x = Panel.from_dict(x) + y = (DataFrame(np.random.randn(50, 10)) + + DataFrame(0.01 * np.random.randn(50, 10))) + self.checkFamaMacBethExtended('rolling', x, y, nw_lags_beta=2) + self.checkFamaMacBethExtended('expanding', x, y, nw_lags_beta=2) + + def checkFamaMacBethExtended(self, window_type, x, y, **kwds): + window = 25 + + result = fama_macbeth(y=y, 
x=x, window_type=window_type, window=window, + **kwds) + self._check_stuff_works(result) + + index = result._index + time = len(index) + + for i in range(time - window + 1): + if window_type == 'rolling': + start = index[i] + else: + start = index[0] + + end = index[i + window - 1] + + x2 = {} + for k, v in compat.iteritems(x): + x2[k] = v.truncate(start, end) + y2 = y.truncate(start, end) + + reference = fama_macbeth(y=y2, x=x2, **kwds) + assert_almost_equal(reference._stats, result._stats[:, i]) + + static = fama_macbeth(y=y2, x=x2, **kwds) + self._check_stuff_works(static) + + def _check_stuff_works(self, result): + # does it work? + attrs = ['mean_beta', 'std_beta', 't_stat'] + for attr in attrs: + getattr(result, attr) + + # does it work? + result.summary + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_math.py b/pandas/stats/tests/test_math.py new file mode 100644 index 00000000..1d1288e1 --- /dev/null +++ b/pandas/stats/tests/test_math.py @@ -0,0 +1,67 @@ +import nose + +from datetime import datetime +from numpy.random import randn +import numpy as np + +from pandas.core.api import Series, DataFrame, date_range +from pandas.util.testing import assert_almost_equal +import pandas.core.datetools as datetools +import pandas.stats.moments as mom +import pandas.util.testing as tm +import pandas.stats.math as pmath +import pandas.tests.test_series as ts +from pandas import ols + +N, K = 100, 10 + +_have_statsmodels = True +try: + import statsmodels.api as sm +except ImportError: + try: + import scikits.statsmodels.api as sm + except ImportError: + _have_statsmodels = False + + +class TestMath(tm.TestCase): + + _nan_locs = np.arange(20, 40) + _inf_locs = np.array([]) + + def setUp(self): + arr = randn(N) + arr[self._nan_locs] = np.NaN + + self.arr = arr + self.rng = date_range(datetime(2009, 1, 1), periods=N) + + self.series = Series(arr.copy(), index=self.rng) + + self.frame = DataFrame(randn(N, K), index=self.rng, + columns=np.arange(K)) + + def test_rank_1d(self): + self.assertEqual(1, pmath.rank(self.series)) + self.assertEqual(0, pmath.rank(Series(0, self.series.index))) + + def test_solve_rect(self): + if not _have_statsmodels: + raise nose.SkipTest("no statsmodels") + + b = Series(np.random.randn(N), self.frame.index) + result = pmath.solve(self.frame, b) + expected = ols(y=b, x=self.frame, intercept=False).beta + self.assertTrue(np.allclose(result, expected)) + + def test_inv_illformed(self): + singular = DataFrame(np.array([[1, 1], [2, 2]])) + rs = pmath.inv(singular) + expected = np.array([[0.1, 0.2], [0.1, 0.2]]) + self.assertTrue(np.allclose(rs, expected)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py new file mode 100644 index 00000000..8f20a4d4 --- /dev/null +++ b/pandas/stats/tests/test_moments.py @@ -0,0 +1,1064 @@ +import nose +import sys +import functools + +from datetime import datetime +from numpy.random import randn +import numpy as np + +from pandas import Series, DataFrame, bdate_range, isnull, notnull +from pandas.util.testing import ( + assert_almost_equal, assert_series_equal, assert_frame_equal +) +import pandas.core.datetools as datetools +import pandas.stats.moments as mom +import pandas.util.testing as tm +from pandas.compat import range, zip, PY3, StringIO + +N, K = 100, 10 
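+
+ # The tests below repeatedly exercise one convention: a rolling statistic
+ # over ``window`` observations is NaN until ``min_periods`` valid points
+ # have been seen (for most functions ``min_periods`` defaults to the
+ # window size).  A minimal sketch, for illustration only (it is not used
+ # by the tests):
+ #
+ #   >>> import numpy as np
+ #   >>> import pandas.stats.moments as mom
+ #   >>> mom.rolling_mean(np.arange(5, dtype=float), 3)
+ #   array([ nan,  nan,   1.,   2.,   3.])
+ #   >>> mom.rolling_mean(np.arange(5, dtype=float), 3, min_periods=1)
+ #   array([ 0. ,  0.5,  1. ,  2. ,  3. ])
+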
+ + +class TestMoments(tm.TestCase): + + _multiprocess_can_split_ = True + + _nan_locs = np.arange(20, 40) + _inf_locs = np.array([]) + + def setUp(self): + arr = randn(N) + arr[self._nan_locs] = np.NaN + + self.arr = arr + self.rng = bdate_range(datetime(2009, 1, 1), periods=N) + + self.series = Series(arr.copy(), index=self.rng) + + self.frame = DataFrame(randn(N, K), index=self.rng, + columns=np.arange(K)) + + def test_centered_axis_validation(self): + # ok + mom.rolling_mean(Series(np.ones(10)),3,center=True ,axis=0) + # bad axis + self.assertRaises(ValueError, mom.rolling_mean,Series(np.ones(10)),3,center=True ,axis=1) + + # ok ok + mom.rolling_mean(DataFrame(np.ones((10,10))),3,center=True ,axis=0) + mom.rolling_mean(DataFrame(np.ones((10,10))),3,center=True ,axis=1) + # bad axis + self.assertRaises(ValueError, mom.rolling_mean,DataFrame(np.ones((10,10))),3,center=True ,axis=2) + + def test_rolling_sum(self): + self._check_moment_func(mom.rolling_sum, np.sum) + + def test_rolling_count(self): + counter = lambda x: np.isfinite(x).astype(float).sum() + self._check_moment_func(mom.rolling_count, counter, + has_min_periods=False, + preserve_nan=False, + fill_value=0) + + def test_rolling_mean(self): + self._check_moment_func(mom.rolling_mean, np.mean) + + def test_cmov_mean(self): + tm._skip_if_no_scipy() + try: + from scikits.timeseries.lib import cmov_mean + except ImportError: + raise nose.SkipTest("no scikits.timeseries") + + vals = np.random.randn(10) + xp = cmov_mean(vals, 5) + + rs = mom.rolling_mean(vals, 5, center=True) + assert_almost_equal(xp.compressed(), rs[2:-2]) + assert_almost_equal(xp.mask, np.isnan(rs)) + + xp = Series(rs) + rs = mom.rolling_mean(Series(vals), 5, center=True) + assert_series_equal(xp, rs) + + def test_cmov_window(self): + tm._skip_if_no_scipy() + try: + from scikits.timeseries.lib import cmov_window + except ImportError: + raise nose.SkipTest("no scikits.timeseries") + + vals = np.random.randn(10) + xp = cmov_window(vals, 5, 'boxcar') + + rs = mom.rolling_window(vals, 5, 'boxcar', center=True) + assert_almost_equal(xp.compressed(), rs[2:-2]) + assert_almost_equal(xp.mask, np.isnan(rs)) + + xp = Series(rs) + rs = mom.rolling_window(Series(vals), 5, 'boxcar', center=True) + assert_series_equal(xp, rs) + + def test_cmov_window_corner(self): + tm._skip_if_no_scipy() + try: + from scikits.timeseries.lib import cmov_window + except ImportError: + raise nose.SkipTest("no scikits.timeseries") + + # all nan + vals = np.empty(10, dtype=float) + vals.fill(np.nan) + rs = mom.rolling_window(vals, 5, 'boxcar', center=True) + self.assertTrue(np.isnan(rs).all()) + + # empty + vals = np.array([]) + rs = mom.rolling_window(vals, 5, 'boxcar', center=True) + self.assertEqual(len(rs), 0) + + # shorter than window + vals = np.random.randn(5) + rs = mom.rolling_window(vals, 10, 'boxcar') + self.assertTrue(np.isnan(rs).all()) + self.assertEqual(len(rs), 5) + + def test_cmov_window_frame(self): + tm._skip_if_no_scipy() + try: + from scikits.timeseries.lib import cmov_window + except ImportError: + raise nose.SkipTest("no scikits.timeseries") + + # DataFrame + vals = np.random.randn(10, 2) + xp = cmov_window(vals, 5, 'boxcar') + rs = mom.rolling_window(DataFrame(vals), 5, 'boxcar', center=True) + assert_frame_equal(DataFrame(xp), rs) + + def test_cmov_window_na_min_periods(self): + tm._skip_if_no_scipy() + try: + from scikits.timeseries.lib import cmov_window + except ImportError: + raise nose.SkipTest("no scikits.timeseries") + + # min_periods + vals = 
Series(np.random.randn(10)) + vals[4] = np.nan + vals[8] = np.nan + + xp = mom.rolling_mean(vals, 5, min_periods=4, center=True) + rs = mom.rolling_window(vals, 5, 'boxcar', min_periods=4, center=True) + + assert_series_equal(xp, rs) + + def test_cmov_window_regular(self): + tm._skip_if_no_scipy() + try: + from scikits.timeseries.lib import cmov_window + except ImportError: + raise nose.SkipTest("no scikits.timeseries") + + win_types = ['triang', 'blackman', 'hamming', 'bartlett', 'bohman', + 'blackmanharris', 'nuttall', 'barthann'] + for wt in win_types: + vals = np.random.randn(10) + xp = cmov_window(vals, 5, wt) + + rs = mom.rolling_window(Series(vals), 5, wt, center=True) + assert_series_equal(Series(xp), rs) + + def test_cmov_window_special(self): + tm._skip_if_no_scipy() + try: + from scikits.timeseries.lib import cmov_window + except ImportError: + raise nose.SkipTest("no scikits.timeseries") + + win_types = ['kaiser', 'gaussian', 'general_gaussian', 'slepian'] + kwds = [{'beta': 1.}, {'std': 1.}, {'power': 2., 'width': 2.}, + {'width': 0.5}] + + for wt, k in zip(win_types, kwds): + vals = np.random.randn(10) + xp = cmov_window(vals, 5, (wt,) + tuple(k.values())) + + rs = mom.rolling_window(Series(vals), 5, wt, center=True, + **k) + assert_series_equal(Series(xp), rs) + + def test_rolling_median(self): + self._check_moment_func(mom.rolling_median, np.median) + + def test_rolling_min(self): + self._check_moment_func(mom.rolling_min, np.min) + + a = np.array([1, 2, 3, 4, 5]) + b = mom.rolling_min(a, window=100, min_periods=1) + assert_almost_equal(b, np.ones(len(a))) + + self.assertRaises(ValueError, mom.rolling_min, np.array([1, + 2, 3]), window=3, min_periods=5) + + def test_rolling_max(self): + self._check_moment_func(mom.rolling_max, np.max) + + a = np.array([1, 2, 3, 4, 5]) + b = mom.rolling_max(a, window=100, min_periods=1) + assert_almost_equal(a, b) + + self.assertRaises(ValueError, mom.rolling_max, np.array([1, + 2, 3]), window=3, min_periods=5) + + def test_rolling_quantile(self): + qs = [.1, .5, .9] + + def scoreatpercentile(a, per): + values = np.sort(a, axis=0) + + idx = per / 1. * (values.shape[0] - 1) + return values[int(idx)] + + for q in qs: + def f(x, window, min_periods=None, freq=None, center=False): + return mom.rolling_quantile(x, window, q, + min_periods=min_periods, + freq=freq, + center=center) + + def alt(x): + return scoreatpercentile(x, q) + + self._check_moment_func(f, alt) + + def test_rolling_apply(self): + ser = Series([]) + assert_series_equal( + ser, mom.rolling_apply(ser, 10, lambda x: x.mean())) + + def roll_mean(x, window, min_periods=None, freq=None, center=False): + return mom.rolling_apply(x, window, + lambda x: x[np.isfinite(x)].mean(), + min_periods=min_periods, + freq=freq, + center=center) + self._check_moment_func(roll_mean, np.mean) + + def test_rolling_apply_out_of_bounds(self): + # #1850 + arr = np.arange(4) + + # it works! 
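+ # The window (10) is larger than the 4-element array, so every result is
+ # NaN; with min_periods=1 the second call yields partial sums instead.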
+ result = mom.rolling_apply(arr, 10, np.sum) + self.assertTrue(isnull(result).all()) + + result = mom.rolling_apply(arr, 10, np.sum, min_periods=1) + assert_almost_equal(result, result) + + def test_rolling_std(self): + self._check_moment_func(mom.rolling_std, + lambda x: np.std(x, ddof=1)) + self._check_moment_func(functools.partial(mom.rolling_std, ddof=0), + lambda x: np.std(x, ddof=0)) + + def test_rolling_std_1obs(self): + result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), + 1, min_periods=1) + expected = np.zeros(5) + + assert_almost_equal(result, expected) + + result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]), + 3, min_periods=2) + self.assertTrue(np.isnan(result[2])) + + def test_rolling_std_neg_sqrt(self): + # unit test from Bottleneck + + # Test move_nanstd for neg sqrt. + + a = np.array([0.0011448196318903589, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767]) + b = mom.rolling_std(a, window=3) + self.assertTrue(np.isfinite(b[2:]).all()) + + b = mom.ewmstd(a, span=3) + self.assertTrue(np.isfinite(b[2:]).all()) + + def test_rolling_var(self): + self._check_moment_func(mom.rolling_var, + lambda x: np.var(x, ddof=1), + test_stable=True) + self._check_moment_func(functools.partial(mom.rolling_var, ddof=0), + lambda x: np.var(x, ddof=0)) + + def test_rolling_skew(self): + try: + from scipy.stats import skew + except ImportError: + raise nose.SkipTest('no scipy') + self._check_moment_func(mom.rolling_skew, + lambda x: skew(x, bias=False)) + + def test_rolling_kurt(self): + try: + from scipy.stats import kurtosis + except ImportError: + raise nose.SkipTest('no scipy') + self._check_moment_func(mom.rolling_kurt, + lambda x: kurtosis(x, bias=False)) + + def test_fperr_robustness(self): + # TODO: remove this once python 2.5 out of picture + if PY3: + raise nose.SkipTest("doesn't work on python 3") + + # #2114 + data = '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a@\xaa\xaa\xaa\xaa\xaa\xaa\x02@8\x8e\xe38\x8e\xe3\xe8?z\t\xed%\xb4\x97\xd0?\xa2\x0c<\xdd\x9a\x1f\xb6?\x82\xbb\xfa&y\x7f\x9d?\xac\'\xa7\xc4P\xaa\x83?\x90\xdf\xde\xb0k8j?`\xea\xe9u\xf2zQ?*\xe37\x9d\x98N7?\xe2.\xf5&v\x13\x1f?\xec\xc9\xf8\x19\xa4\xb7\x04?\x90b\xf6w\x85\x9f\xeb>\xb5A\xa4\xfaXj\xd2>F\x02\xdb\xf8\xcb\x8d\xb8>.\xac<\xfb\x87^\xa0>\xe8:\xa6\xf9_\xd3\x85>\xfb?\xe2cUU\xfd?\xfc\x7fA\xed8\x8e\xe3?\xa5\xaa\xac\x91\xf6\x12\xca?n\x1cs\xb6\xf9a\xb1?\xe8%D\xf3L-\x97?5\xddZD\x11\xe7~?#>\xe7\x82\x0b\x9ad?\xd9R4Y\x0fxK?;7x;\nP2?N\xf4JO\xb8j\x18?4\xf81\x8a%G\x00?\x9a\xf5\x97\r2\xb4\xe5>\xcd\x9c\xca\xbcB\xf0\xcc>3\x13\x87(\xd7J\xb3>\x99\x19\xb4\xe0\x1e\xb9\x99>ff\xcd\x95\x14&\x81>\x88\x88\xbc\xc7p\xddf>`\x0b\xa6_\x96|N>@\xb2n\xea\x0eS4>U\x98\x938i\x19\x1b>\x8eeb\xd0\xf0\x10\x02>\xbd\xdc-k\x96\x16\xe8=(\x93\x1e\xf2\x0e\x0f\xd0=\xe0n\xd3Bii\xb5=*\xe9\x19Y\x8c\x8c\x9c=\xc6\xf0\xbb\x90]\x08\x83=]\x96\xfa\xc0|`i=>d\xfc\xd5\xfd\xeaP=R0\xfb\xc7\xa7\x8e6=\xc2\x95\xf9_\x8a\x13\x1e=\xd6c\xa6\xea\x06\r\x04=r\xda\xdd8\t\xbc\xea<\xf6\xe6\x93\xd0\xb0\xd2\xd1<\x9d\xdeok\x96\xc3\xb7<&~\xea9s\xaf\x9f\xb8\x02@\xc6\xd2&\xfd\xa8\xf5\xe8?\xd9\xe1\x19\xfe\xc5\xa3\xd0?v\x82"\xa8\xb2/\xb6?\x9dX\x835\xee\x94\x9d?h\x90W\xce\x9e\xb8\x83?\x8a\xc0th~Kj?\\\x80\xf8\x9a\xa9\x87Q?%\xab\xa0\xce\x8c_7?1\xe4\x80\x13\x11*\x1f? 
\x98\x00\r\xb6\xc6\x04?\x80u\xabf\x9d\xb3\xeb>UNrD\xbew\xd2>\x1c\x13C[\xa8\x9f\xb8>\x12b\xd7m-\x1fQ@\xe3\x85>\xe6\x91)l\x00/m>Da\xc6\xf2\xaatS>\x05\xd7]\xee\xe3\xf09>' + + arr = np.frombuffer(data, dtype='= 0).all()) + + result = mom.rolling_mean(arr, 2) + self.assertTrue((result[1:] >= 0).all()) + + result = mom.rolling_var(arr, 2) + self.assertTrue((result[1:] >= 0).all()) + + # #2527, ugh + arr = np.array([0.00012456, 0.0003, 0]) + result = mom.rolling_mean(arr, 1) + self.assertTrue(result[-1] >= 0) + + result = mom.rolling_mean(-arr, 1) + self.assertTrue(result[-1] <= 0) + + def _check_moment_func(self, func, static_comp, window=50, + has_min_periods=True, + has_center=True, + has_time_rule=True, + preserve_nan=True, + fill_value=None, + test_stable=False): + + self._check_ndarray(func, static_comp, window=window, + has_min_periods=has_min_periods, + preserve_nan=preserve_nan, + has_center=has_center, + fill_value=fill_value, + test_stable=test_stable) + + self._check_structures(func, static_comp, + has_min_periods=has_min_periods, + has_time_rule=has_time_rule, + fill_value=fill_value, + has_center=has_center) + + def _check_ndarray(self, func, static_comp, window=50, + has_min_periods=True, + preserve_nan=True, + has_center=True, + fill_value=None, + test_stable=False, + test_window=True): + + result = func(self.arr, window) + assert_almost_equal(result[-1], + static_comp(self.arr[-50:])) + + if preserve_nan: + assert(np.isnan(result[self._nan_locs]).all()) + + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + + if has_min_periods: + result = func(arr, 50, min_periods=30) + assert_almost_equal(result[-1], static_comp(arr[10:-10])) + + # min_periods is working correctly + result = func(arr, 20, min_periods=15) + self.assertTrue(np.isnan(result[23])) + self.assertFalse(np.isnan(result[24])) + + self.assertFalse(np.isnan(result[-6])) + self.assertTrue(np.isnan(result[-5])) + + arr2 = randn(20) + result = func(arr2, 10, min_periods=5) + self.assertTrue(isnull(result[3])) + self.assertTrue(notnull(result[4])) + + # min_periods=0 + result0 = func(arr, 20, min_periods=0) + result1 = func(arr, 20, min_periods=1) + assert_almost_equal(result0, result1) + else: + result = func(arr, 50) + assert_almost_equal(result[-1], static_comp(arr[10:-10])) + + if has_center: + if has_min_periods: + result = func(arr, 20, min_periods=15, center=True) + expected = func(arr, 20, min_periods=15) + else: + result = func(arr, 20, center=True) + expected = func(arr, 20) + + assert_almost_equal(result[1], expected[10]) + if fill_value is None: + self.assertTrue(np.isnan(result[-9:]).all()) + else: + self.assertTrue((result[-9:] == 0).all()) + if has_min_periods: + self.assertTrue(np.isnan(expected[23])) + self.assertTrue(np.isnan(result[14])) + self.assertTrue(np.isnan(expected[-5])) + self.assertTrue(np.isnan(result[-14])) + + if test_stable: + result = func(self.arr + 1e9, window) + assert_almost_equal(result[-1], + static_comp(self.arr[-50:] + 1e9)) + + # Test window larger than array, #7297 + if test_window: + if has_min_periods: + for minp in (0, len(self.arr)-1, len(self.arr)): + result = func(self.arr, len(self.arr)+1, min_periods=minp) + expected = func(self.arr, len(self.arr), min_periods=minp) + nan_mask = np.isnan(result) + self.assertTrue(np.array_equal(nan_mask, + np.isnan(expected))) + nan_mask = ~nan_mask + assert_almost_equal(result[nan_mask], expected[nan_mask]) + else: + result = func(self.arr, len(self.arr)+1) + expected = func(self.arr, len(self.arr)) + 
nan_mask = np.isnan(result) + self.assertTrue(np.array_equal(nan_mask, np.isnan(expected))) + nan_mask = ~nan_mask + assert_almost_equal(result[nan_mask], expected[nan_mask]) + + + + + def _check_structures(self, func, static_comp, + has_min_periods=True, has_time_rule=True, + has_center=True, + fill_value=None): + + series_result = func(self.series, 50) + tm.assert_isinstance(series_result, Series) + + frame_result = func(self.frame, 50) + self.assertEqual(type(frame_result), DataFrame) + + # check time_rule works + if has_time_rule: + win = 25 + minp = 10 + + if has_min_periods: + series_result = func(self.series[::2], win, min_periods=minp, + freq='B') + frame_result = func(self.frame[::2], win, min_periods=minp, + freq='B') + else: + series_result = func(self.series[::2], win, freq='B') + frame_result = func(self.frame[::2], win, freq='B') + + last_date = series_result.index[-1] + prev_date = last_date - 24 * datetools.bday + + trunc_series = self.series[::2].truncate(prev_date, last_date) + trunc_frame = self.frame[::2].truncate(prev_date, last_date) + + assert_almost_equal(series_result[-1], static_comp(trunc_series)) + + assert_almost_equal(frame_result.xs(last_date), + trunc_frame.apply(static_comp)) + + if has_center: + if has_min_periods: + minp = 10 + series_xp = func(self.series, 25, min_periods=minp).shift(-12) + frame_xp = func(self.frame, 25, min_periods=minp).shift(-12) + + series_rs = func(self.series, 25, min_periods=minp, + center=True) + frame_rs = func(self.frame, 25, min_periods=minp, + center=True) + + else: + series_xp = func(self.series, 25).shift(-12) + frame_xp = func(self.frame, 25).shift(-12) + + series_rs = func(self.series, 25, center=True) + frame_rs = func(self.frame, 25, center=True) + + if fill_value is not None: + series_xp = series_xp.fillna(fill_value) + frame_xp = frame_xp.fillna(fill_value) + assert_series_equal(series_xp, series_rs) + assert_frame_equal(frame_xp, frame_rs) + + def test_ewma(self): + self._check_ew(mom.ewma) + + arr = np.zeros(1000) + arr[5] = 1 + result = mom.ewma(arr, span=100, adjust=False).sum() + self.assertTrue(np.abs(result - 1) < 1e-2) + + def test_ewma_nan_handling(self): + s = Series([1.] 
+ [np.nan] * 5 + [1.]) + + result = mom.ewma(s, com=5) + assert_almost_equal(result, [1] * len(s)) + + def test_ewmvar(self): + self._check_ew(mom.ewmvar) + + def test_ewmvol(self): + self._check_ew(mom.ewmvol) + + def test_ewma_span_com_args(self): + A = mom.ewma(self.arr, com=9.5) + B = mom.ewma(self.arr, span=20) + assert_almost_equal(A, B) + + self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20) + self.assertRaises(Exception, mom.ewma, self.arr) + + def test_ewma_halflife_arg(self): + A = mom.ewma(self.arr, com=13.932726172912965) + B = mom.ewma(self.arr, halflife=10.0) + assert_almost_equal(A, B) + + self.assertRaises(Exception, mom.ewma, self.arr, span=20, halflife=50) + self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, halflife=50) + self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20, halflife=50) + self.assertRaises(Exception, mom.ewma, self.arr) + + def test_ew_empty_arrays(self): + arr = np.array([], dtype=np.float64) + + funcs = [mom.ewma, mom.ewmvol, mom.ewmvar] + for f in funcs: + result = f(arr, 3) + assert_almost_equal(result, arr) + + def _check_ew(self, func): + self._check_ew_ndarray(func) + self._check_ew_structures(func) + + def _check_ew_ndarray(self, func, preserve_nan=False): + result = func(self.arr, com=10) + if preserve_nan: + assert(np.isnan(result[self._nan_locs]).all()) + + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + + # ??? check something + + # pass in ints + result2 = func(np.arange(50), span=10) + self.assertEqual(result2.dtype, np.float_) + + def _check_ew_structures(self, func): + series_result = func(self.series, com=10) + tm.assert_isinstance(series_result, Series) + frame_result = func(self.frame, com=10) + self.assertEqual(type(frame_result), DataFrame) + + # binary moments + def test_rolling_cov(self): + A = self.series + B = A + randn(len(A)) + + result = mom.rolling_cov(A, B, 50, min_periods=25) + assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) + + def test_rolling_cov_pairwise(self): + self._check_pairwise_moment(mom.rolling_cov, 10, min_periods=5) + + def test_rolling_corr(self): + A = self.series + B = A + randn(len(A)) + + result = mom.rolling_corr(A, B, 50, min_periods=25) + assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) + + # test for correct bias correction + a = tm.makeTimeSeries() + b = tm.makeTimeSeries() + a[:5] = np.nan + b[:10] = np.nan + + result = mom.rolling_corr(a, b, len(a), min_periods=1) + assert_almost_equal(result[-1], a.corr(b)) + + def test_rolling_corr_pairwise(self): + self._check_pairwise_moment(mom.rolling_corr, 10, min_periods=5) + + def _check_pairwise_moment(self, func, *args, **kwargs): + panel = func(self.frame, *args, **kwargs) + + actual = panel.ix[:, 1, 5] + expected = func(self.frame[1], self.frame[5], *args, **kwargs) + tm.assert_series_equal(actual, expected) + + def test_flex_binary_moment(self): + # GH3155 + # don't blow the stack + self.assertRaises(TypeError, mom._flex_binary_moment,5,6,None) + + def test_corr_sanity(self): + #GH 3155 + df = DataFrame( + np.array( + [[ 0.87024726, 0.18505595], + [ 0.64355431, 0.3091617 ], + [ 0.92372966, 0.50552513], + [ 0.00203756, 0.04520709], + [ 0.84780328, 0.33394331], + [ 0.78369152, 0.63919667]]) + ) + + res = mom.rolling_corr(df[0],df[1],5,center=True) + self.assertTrue(all([np.abs(np.nan_to_num(x)) <=1 for x in res])) + + # and some fuzzing + for i in range(10): + df = DataFrame(np.random.rand(30,2)) + res = 
mom.rolling_corr(df[0],df[1],5,center=True) + try: + self.assertTrue(all([np.abs(np.nan_to_num(x)) <=1 for x in res])) + except: + print(res) + + + def test_flex_binary_frame(self): + def _check(method): + series = self.frame[1] + + res = method(series, self.frame, 10) + res2 = method(self.frame, series, 10) + exp = self.frame.apply(lambda x: method(series, x, 10)) + + tm.assert_frame_equal(res, exp) + tm.assert_frame_equal(res2, exp) + + frame2 = self.frame.copy() + frame2.values[:] = np.random.randn(*frame2.shape) + + res3 = method(self.frame, frame2, 10) + exp = DataFrame(dict((k, method(self.frame[k], frame2[k], 10)) + for k in self.frame)) + tm.assert_frame_equal(res3, exp) + + methods = [mom.rolling_corr, mom.rolling_cov] + for meth in methods: + _check(meth) + + def test_ewmcov(self): + self._check_binary_ew(mom.ewmcov) + + def test_ewmcov_pairwise(self): + self._check_pairwise_moment(mom.ewmcov, span=10, min_periods=5) + + def test_ewmcorr(self): + self._check_binary_ew(mom.ewmcorr) + + def test_ewmcorr_pairwise(self): + self._check_pairwise_moment(mom.ewmcorr, span=10, min_periods=5) + + def _check_binary_ew(self, func): + A = Series(randn(50), index=np.arange(50)) + B = A[2:] + randn(48) + + A[:10] = np.NaN + B[-10:] = np.NaN + + result = func(A, B, 20, min_periods=5) + + self.assertTrue(np.isnan(result.values[:15]).all()) + self.assertFalse(np.isnan(result.values[15:]).any()) + + self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5) + + def test_expanding_apply(self): + ser = Series([]) + assert_series_equal(ser, mom.expanding_apply(ser, lambda x: x.mean())) + + def expanding_mean(x, min_periods=1, freq=None): + return mom.expanding_apply(x, + lambda x: x.mean(), + min_periods=min_periods, + freq=freq) + self._check_expanding(expanding_mean, np.mean) + + def test_expanding_apply_args_kwargs(self): + def mean_w_arg(x, const): + return np.mean(x) + const + + df = DataFrame(np.random.rand(20, 3)) + + expected = mom.expanding_apply(df, np.mean) + 20. 
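The span/com/halflife equivalences asserted in test_ewma_span_com_args and test_ewma_halflife_arg above all reduce to a single decay factor. A minimal sketch of that mapping, assuming pandas' documented convention alpha = 1/(1 + com) = 2/(span + 1) = 1 - exp(log(0.5)/halflife); the helper names below are illustrative only and not part of this suite:

import numpy as np

def com_from_span(span):
    # com and span describe the same decay: alpha = 2/(span + 1) = 1/(1 + com)
    return (span - 1.0) / 2.0

def com_from_halflife(halflife):
    # halflife h satisfies (1 - alpha) ** h == 0.5
    alpha = 1.0 - np.exp(np.log(0.5) / halflife)
    return 1.0 / alpha - 1.0

# reproduces the constants used in the tests above
assert com_from_span(20) == 9.5
assert abs(com_from_halflife(10.0) - 13.932726172912965) < 1e-9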
+ + assert_frame_equal(mom.expanding_apply(df, mean_w_arg, args=(20,)), + expected) + assert_frame_equal(mom.expanding_apply(df, mean_w_arg, + kwargs={'const' : 20}), + expected) + + + def test_expanding_corr(self): + A = self.series.dropna() + B = (A + randn(len(A)))[:-5] + + result = mom.expanding_corr(A, B) + + rolling_result = mom.rolling_corr(A, B, len(A), min_periods=1) + + assert_almost_equal(rolling_result, result) + + def test_expanding_count(self): + result = mom.expanding_count(self.series) + assert_almost_equal(result, mom.rolling_count(self.series, + len(self.series))) + + def test_expanding_quantile(self): + result = mom.expanding_quantile(self.series, 0.5) + + rolling_result = mom.rolling_quantile(self.series, + len(self.series), + 0.5, min_periods=1) + + assert_almost_equal(result, rolling_result) + + def test_expanding_cov(self): + A = self.series + B = (A + randn(len(A)))[:-5] + + result = mom.expanding_cov(A, B) + + rolling_result = mom.rolling_cov(A, B, len(A), min_periods=1) + + assert_almost_equal(rolling_result, result) + + def test_expanding_max(self): + self._check_expanding(mom.expanding_max, np.max, preserve_nan=False) + + def test_expanding_cov_pairwise(self): + result = mom.expanding_cov(self.frame) + + rolling_result = mom.rolling_cov(self.frame, len(self.frame), + min_periods=1) + + for i in result.items: + assert_almost_equal(result[i], rolling_result[i]) + + def test_expanding_corr_pairwise(self): + result = mom.expanding_corr(self.frame) + + rolling_result = mom.rolling_corr(self.frame, len(self.frame), + min_periods=1) + + for i in result.items: + assert_almost_equal(result[i], rolling_result[i]) + + def test_expanding_cov_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = mom.expanding_cov(s1, s2) + expected = Series([None, None, 2.0]) + assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = mom.expanding_cov(s1, s2a) + assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = mom.expanding_cov(s1, s2) + expected = Series([None, None, None, 4.5]) + assert_series_equal(result, expected) + + def test_expanding_corr_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = mom.expanding_corr(s1, s2) + expected = Series([None, None, 1.0]) + assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = mom.expanding_corr(s1, s2a) + assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = mom.expanding_corr(s1, s2) + expected = Series([None, None, None, 1.]) + assert_series_equal(result, expected) + + def test_rolling_cov_diff_length(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = mom.rolling_cov(s1, s2, window=3, min_periods=2) + expected = Series([None, None, 2.0]) + assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = mom.rolling_cov(s1, s2a, window=3, min_periods=2) + assert_series_equal(result, expected) + + def test_rolling_corr_diff_length(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = mom.rolling_corr(s1, s2, window=3, min_periods=2) + expected = Series([None, None, 1.0]) + assert_series_equal(result, expected) + + s2a = Series([1, 
None, 3], index=[0, 1, 2]) + result = mom.rolling_corr(s1, s2a, window=3, min_periods=2) + assert_series_equal(result, expected) + + def test_expanding_cov_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame([[1,5], [3, 2], [3,9]], columns=['A','B']) + df1a = DataFrame([[1,5], [3,9]], index=[0,2], columns=['A','B']) + df2 = DataFrame([[5,6], [None,None], [2,1]], columns=['X','Y']) + df2a = DataFrame([[5,6], [2,1]], index=[0,2], columns=['X','Y']) + result1 = mom.expanding_cov(df1, df2, pairwise=True)[2] + result2 = mom.expanding_cov(df1, df2a, pairwise=True)[2] + result3 = mom.expanding_cov(df1a, df2, pairwise=True)[2] + result4 = mom.expanding_cov(df1a, df2a, pairwise=True)[2] + expected = DataFrame([[-3., -5.], [-6., -10.]], index=['A','B'], columns=['X','Y']) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + assert_frame_equal(result4, expected) + + def test_expanding_corr_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame([[1,2], [3, 2], [3,4]], columns=['A','B']) + df1a = DataFrame([[1,2], [3,4]], index=[0,2], columns=['A','B']) + df2 = DataFrame([[5,6], [None,None], [2,1]], columns=['X','Y']) + df2a = DataFrame([[5,6], [2,1]], index=[0,2], columns=['X','Y']) + result1 = mom.expanding_corr(df1, df2, pairwise=True)[2] + result2 = mom.expanding_corr(df1, df2a, pairwise=True)[2] + result3 = mom.expanding_corr(df1a, df2, pairwise=True)[2] + result4 = mom.expanding_corr(df1a, df2a, pairwise=True)[2] + expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], index=['A','B'], columns=['X','Y']) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + assert_frame_equal(result4, expected) + + def test_rolling_skew_edge_cases(self): + + all_nan = Series([np.NaN] * 5) + + # yields all NaN (0 variance) + d = Series([1] * 5) + x = mom.rolling_skew(d, window=5) + assert_series_equal(all_nan, x) + + # yields all NaN (window too small) + d = Series(np.random.randn(5)) + x = mom.rolling_skew(d, window=2) + assert_series_equal(all_nan, x) + + # yields [NaN, NaN, NaN, 0.177994, 1.548824] + d = Series([-1.50837035, -0.1297039 , 0.19501095, + 1.73508164, 0.41941401]) + expected = Series([np.NaN, np.NaN, np.NaN, + 0.177994, 1.548824]) + x = mom.rolling_skew(d, window=4) + assert_series_equal(expected, x) + + def test_rolling_kurt_edge_cases(self): + + all_nan = Series([np.NaN] * 5) + + # yields all NaN (0 variance) + d = Series([1] * 5) + x = mom.rolling_kurt(d, window=5) + assert_series_equal(all_nan, x) + + # yields all NaN (window too small) + d = Series(np.random.randn(5)) + x = mom.rolling_kurt(d, window=3) + assert_series_equal(all_nan, x) + + # yields [NaN, NaN, NaN, 1.224307, 2.671499] + d = Series([-1.50837035, -0.1297039 , 0.19501095, + 1.73508164, 0.41941401]) + expected = Series([np.NaN, np.NaN, np.NaN, + 1.224307, 2.671499]) + x = mom.rolling_kurt(d, window=4) + assert_series_equal(expected, x) + + def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, + has_time_rule=True, preserve_nan=True): + result = func(self.arr) + + assert_almost_equal(result[10], + static_comp(self.arr[:11])) + + if preserve_nan: + assert(np.isnan(result[self._nan_locs]).all()) + + arr = randn(50) + + if has_min_periods: + result = func(arr, min_periods=30) + assert(np.isnan(result[:29]).all()) + assert_almost_equal(result[-1], static_comp(arr[:50])) + + # min_periods is working correctly + result = func(arr, min_periods=15) + 
self.assertTrue(np.isnan(result[13])) + self.assertFalse(np.isnan(result[14])) + + arr2 = randn(20) + result = func(arr2, min_periods=5) + self.assertTrue(isnull(result[3])) + self.assertTrue(notnull(result[4])) + + # min_periods=0 + result0 = func(arr, min_periods=0) + result1 = func(arr, min_periods=1) + assert_almost_equal(result0, result1) + else: + result = func(arr) + assert_almost_equal(result[-1], static_comp(arr[:50])) + + def _check_expanding_structures(self, func): + series_result = func(self.series) + tm.assert_isinstance(series_result, Series) + frame_result = func(self.frame) + self.assertEqual(type(frame_result), DataFrame) + + def _check_expanding(self, func, static_comp, has_min_periods=True, + has_time_rule=True, + preserve_nan=True): + self._check_expanding_ndarray(func, static_comp, + has_min_periods=has_min_periods, + has_time_rule=has_time_rule, + preserve_nan=preserve_nan) + self._check_expanding_structures(func) + + def test_rolling_max_gh6297(self): + """Replicate result expected in GH #6297""" + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 2 datapoints on one of the days + indices.append(datetime(1975, 1, 3, 6, 0)) + series = Series(range(1, 7), index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + expected = Series([1.0, 2.0, 6.0, 4.0, 5.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D') + assert_series_equal(expected, x) + + def test_rolling_max_how_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be max + expected = Series([0.0, 1.0, 2.0, 3.0, 20.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D') + assert_series_equal(expected, x) + + # Now specify median (10.0) + expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D', how='median') + assert_series_equal(expected, x) + + # Now specify mean (4+10+20)/3 + v = (4.0+10.0+20.0)/3.0 + expected = Series([0.0, 1.0, 2.0, 3.0, v], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_max(series, window=1, freq='D', how='mean') + assert_series_equal(expected, x) + + + def test_rolling_min_how_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be min + expected = Series([0.0, 1.0, 2.0, 3.0, 4.0], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_min(series, window=1, freq='D') + assert_series_equal(expected, x) + + def test_rolling_median_how_resample(self): + + indices = [datetime(1975, 1, i) for i in range(1, 6)] + # So that we can have 
3 datapoints on last day (4, 10, and 20) + indices.append(datetime(1975, 1, 5, 1)) + indices.append(datetime(1975, 1, 5, 2)) + series = Series(list(range(0, 5)) + [10, 20], index=indices) + # Use floats instead of ints as values + series = series.map(lambda x: float(x)) + # Sort chronologically + series = series.sort_index() + + # Default how should be median + expected = Series([0.0, 1.0, 2.0, 3.0, 10], + index=[datetime(1975, 1, i, 0) + for i in range(1, 6)]) + x = mom.rolling_median(series, window=1, freq='D') + assert_series_equal(expected, x) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py new file mode 100644 index 00000000..c6caadad --- /dev/null +++ b/pandas/stats/tests/test_ols.py @@ -0,0 +1,890 @@ +""" +Unit test suite for OLS and PanelOLS classes +""" + +# pylint: disable-msg=W0212 + +from __future__ import division + +from datetime import datetime +from pandas import compat +from distutils.version import LooseVersion +import nose +import numpy as np +from numpy.testing.decorators import slow + +from pandas import date_range, bdate_range +from pandas.core.panel import Panel +from pandas import DataFrame, Index, Series, notnull, datetools +from pandas.stats.api import ols +from pandas.stats.ols import _filter_data +from pandas.stats.plm import NonPooledPanelOLS, PanelOLS +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assertRaisesRegexp) +import pandas.util.testing as tm +import pandas.compat as compat +from .common import BaseTest + +_have_statsmodels = True +try: + import statsmodels.api as sm +except ImportError: + try: + import scikits.statsmodels.api as sm + except ImportError: + _have_statsmodels = False + + +def _check_repr(obj): + repr(obj) + str(obj) + + +def _compare_ols_results(model1, model2): + tm.assert_isinstance(model1, type(model2)) + + if hasattr(model1, '_window_type'): + _compare_moving_ols(model1, model2) + else: + _compare_fullsample_ols(model1, model2) + + +def _compare_fullsample_ols(model1, model2): + assert_series_equal(model1.beta, model2.beta) + + +def _compare_moving_ols(model1, model2): + assert_frame_equal(model1.beta, model2.beta) + + +class TestOLS(BaseTest): + + _multiprocess_can_split_ = True + + # TODO: Add tests for OLS y predict + # TODO: Right now we just check for consistency between full-sample and + # rolling/expanding results of the panel OLS. We should also cross-check + # with trusted implementations of panel OLS (e.g. R). + # TODO: Add tests for non pooled OLS. 
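As context for the comparisons that checkOLS performs below, a minimal sketch of the full-sample equivalence being exercised, assuming statsmodels is importable; the data and variable names here are illustrative rather than part of the suite:

import numpy as np
import statsmodels.api as sm
from pandas import DataFrame, Series
from pandas.stats.api import ols

x = DataFrame(np.random.randn(30, 2), columns=['A', 'B'])
y = Series(np.random.randn(30))

result = ols(y=y, x=x)  # pandas OLS wrapper; adds an intercept by default
reference = sm.OLS(y.values,
                   sm.add_constant(x.values, prepend=False)).fit()

# the raw betas (including the trailing intercept) should match the
# statsmodels fit, which is exactly what checkOLS asserts
np.testing.assert_almost_equal(reference.params, result._beta_raw)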
+ + @classmethod + def setUpClass(cls): + super(TestOLS, cls).setUpClass() + try: + import matplotlib as mpl + mpl.use('Agg', warn=False) + except ImportError: + pass + + if not _have_statsmodels: + raise nose.SkipTest("no statsmodels") + + def testOLSWithDatasets_ccard(self): + self.checkDataSet(sm.datasets.ccard.load(), skip_moving=True) + self.checkDataSet(sm.datasets.cpunish.load(), skip_moving=True) + self.checkDataSet(sm.datasets.longley.load(), skip_moving=True) + self.checkDataSet(sm.datasets.stackloss.load(), skip_moving=True) + + @slow + def testOLSWithDatasets_copper(self): + self.checkDataSet(sm.datasets.copper.load()) + + @slow + def testOLSWithDatasets_scotland(self): + self.checkDataSet(sm.datasets.scotland.load()) + + # degenerate case fails on some platforms + # self.checkDataSet(datasets.ccard.load(), 39, 49) # one col in X all + # 0s + + def testWLS(self): + # WLS centered SS changed (fixed) in 0.5.0 + sm_version = sm.version.version + if sm_version < LooseVersion('0.5.0'): + raise nose.SkipTest("WLS centered SS not fixed in statsmodels" + " version {0}".format(sm_version)) + + X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D']) + Y = Series(np.random.randn(30)) + weights = X.std(1) + + self._check_wls(X, Y, weights) + + weights.ix[[5, 15]] = np.nan + Y[[2, 21]] = np.nan + self._check_wls(X, Y, weights) + + def _check_wls(self, x, y, weights): + result = ols(y=y, x=x, weights=1 / weights) + + combined = x.copy() + combined['__y__'] = y + combined['__weights__'] = weights + combined = combined.dropna() + + endog = combined.pop('__y__').values + aweights = combined.pop('__weights__').values + exog = sm.add_constant(combined.values, prepend=False) + + sm_result = sm.WLS(endog, exog, weights=1 / aweights).fit() + + assert_almost_equal(sm_result.params, result._beta_raw) + assert_almost_equal(sm_result.resid, result._resid_raw) + + self.checkMovingOLS('rolling', x, y, weights=weights) + self.checkMovingOLS('expanding', x, y, weights=weights) + + def checkDataSet(self, dataset, start=None, end=None, skip_moving=False): + exog = dataset.exog[start: end] + endog = dataset.endog[start: end] + x = DataFrame(exog, index=np.arange(exog.shape[0]), + columns=np.arange(exog.shape[1])) + y = Series(endog, index=np.arange(len(endog))) + + self.checkOLS(exog, endog, x, y) + + if not skip_moving: + self.checkMovingOLS('rolling', x, y) + self.checkMovingOLS('rolling', x, y, nw_lags=0) + self.checkMovingOLS('expanding', x, y, nw_lags=0) + self.checkMovingOLS('rolling', x, y, nw_lags=1) + self.checkMovingOLS('expanding', x, y, nw_lags=1) + self.checkMovingOLS('expanding', x, y, nw_lags=1, nw_overlap=True) + + def checkOLS(self, exog, endog, x, y): + reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit() + result = ols(y=y, x=x) + + # check that sparse version is the same + sparse_result = ols(y=y.to_sparse(), x=x.to_sparse()) + _compare_ols_results(result, sparse_result) + + assert_almost_equal(reference.params, result._beta_raw) + assert_almost_equal(reference.df_model, result._df_model_raw) + assert_almost_equal(reference.df_resid, result._df_resid_raw) + assert_almost_equal(reference.fvalue, result._f_stat_raw[0]) + assert_almost_equal(reference.pvalues, result._p_value_raw) + assert_almost_equal(reference.rsquared, result._r2_raw) + assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw) + assert_almost_equal(reference.resid, result._resid_raw) + assert_almost_equal(reference.bse, result._std_err_raw) + assert_almost_equal(reference.tvalues, 
result._t_stat_raw) + assert_almost_equal(reference.cov_params(), result._var_beta_raw) + assert_almost_equal(reference.fittedvalues, result._y_fitted_raw) + + _check_non_raw_results(result) + + def checkMovingOLS(self, window_type, x, y, weights=None, **kwds): + window = sm.tools.tools.rank(x.values) * 2 + + moving = ols(y=y, x=x, weights=weights, window_type=window_type, + window=window, **kwds) + + # check that sparse version is the same + sparse_moving = ols(y=y.to_sparse(), x=x.to_sparse(), + weights=weights, + window_type=window_type, + window=window, **kwds) + _compare_ols_results(moving, sparse_moving) + + index = moving._index + + for n, i in enumerate(moving._valid_indices): + if window_type == 'rolling' and i >= window: + prior_date = index[i - window + 1] + else: + prior_date = index[0] + + date = index[i] + + x_iter = {} + for k, v in compat.iteritems(x): + x_iter[k] = v.truncate(before=prior_date, after=date) + y_iter = y.truncate(before=prior_date, after=date) + + static = ols(y=y_iter, x=x_iter, weights=weights, **kwds) + + self.compare(static, moving, event_index=i, + result_index=n) + + _check_non_raw_results(moving) + + FIELDS = ['beta', 'df', 'df_model', 'df_resid', 'f_stat', 'p_value', + 'r2', 'r2_adj', 'rmse', 'std_err', 't_stat', + 'var_beta'] + + def compare(self, static, moving, event_index=None, + result_index=None): + + index = moving._index + + # Check resid if we have a time index specified + if event_index is not None: + ref = static._resid_raw[-1] + + label = index[event_index] + + res = moving.resid[label] + + assert_almost_equal(ref, res) + + ref = static._y_fitted_raw[-1] + res = moving.y_fitted[label] + + assert_almost_equal(ref, res) + + # Check y_fitted + + for field in self.FIELDS: + attr = '_%s_raw' % field + + ref = getattr(static, attr) + res = getattr(moving, attr) + + if result_index is not None: + res = res[result_index] + + assert_almost_equal(ref, res) + + def test_ols_object_dtype(self): + df = DataFrame(np.random.randn(20, 2), dtype=object) + model = ols(y=df[0], x=df[1]) + summary = repr(model) + + +class TestOLSMisc(tm.TestCase): + + _multiprocess_can_split_ = True + + ''' + For test coverage with faux data + ''' + @classmethod + def setUpClass(cls): + super(TestOLSMisc, cls).setUpClass() + if not _have_statsmodels: + raise nose.SkipTest("no statsmodels") + + def test_f_test(self): + x = tm.makeTimeDataFrame() + y = x.pop('A') + + model = ols(y=y, x=x) + + hyp = '1*B+1*C+1*D=0' + result = model.f_test(hyp) + + hyp = ['1*B=0', + '1*C=0', + '1*D=0'] + result = model.f_test(hyp) + assert_almost_equal(result['f-stat'], model.f_stat['f-stat']) + + self.assertRaises(Exception, model.f_test, '1*A=0') + + def test_r2_no_intercept(self): + y = tm.makeTimeSeries() + x = tm.makeTimeDataFrame() + + x_with = x.copy() + x_with['intercept'] = 1. + + model1 = ols(y=y, x=x) + model2 = ols(y=y, x=x_with, intercept=False) + assert_series_equal(model1.beta, model2.beta) + + # TODO: can we infer whether the intercept is there... 
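# Why the next assertion expects R^2 to differ (a sketch, assuming the
# statsmodels-style convention): with intercept=False and a hand-added
# constant column the betas and fitted values coincide, but the total sum
# of squares used for R^2 is no longer centered:
#   R^2_centered   = 1 - SS_resid / sum((y - mean(y)) ** 2)   # intercept detected
#   R^2_uncentered = 1 - SS_resid / sum(y ** 2)               # intercept=False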
+ self.assertNotEqual(model1.r2, model2.r2) + + # rolling + + model1 = ols(y=y, x=x, window=20) + model2 = ols(y=y, x=x_with, window=20, intercept=False) + assert_frame_equal(model1.beta, model2.beta) + self.assertTrue((model1.r2 != model2.r2).all()) + + def test_summary_many_terms(self): + x = DataFrame(np.random.randn(100, 20)) + y = np.random.randn(100) + model = ols(y=y, x=x) + model.summary + + def test_y_predict(self): + y = tm.makeTimeSeries() + x = tm.makeTimeDataFrame() + model1 = ols(y=y, x=x) + assert_series_equal(model1.y_predict, model1.y_fitted) + assert_almost_equal(model1._y_predict_raw, model1._y_fitted_raw) + + def test_predict(self): + y = tm.makeTimeSeries() + x = tm.makeTimeDataFrame() + model1 = ols(y=y, x=x) + assert_series_equal(model1.predict(), model1.y_predict) + assert_series_equal(model1.predict(x=x), model1.y_predict) + assert_series_equal(model1.predict(beta=model1.beta), model1.y_predict) + + exog = x.copy() + exog['intercept'] = 1. + rs = Series(np.dot(exog.values, model1.beta.values), x.index) + assert_series_equal(model1.y_predict, rs) + + x2 = x.reindex(columns=x.columns[::-1]) + assert_series_equal(model1.predict(x=x2), model1.y_predict) + + x3 = x2 + 10 + pred3 = model1.predict(x=x3) + x3['intercept'] = 1. + x3 = x3.reindex(columns=model1.beta.index) + expected = Series(np.dot(x3.values, model1.beta.values), x3.index) + assert_series_equal(expected, pred3) + + beta = Series(0., model1.beta.index) + pred4 = model1.predict(beta=beta) + assert_series_equal(Series(0., pred4.index), pred4) + + def test_predict_longer_exog(self): + exogenous = {"1998": "4760", "1999": "5904", "2000": "4504", + "2001": "9808", "2002": "4241", "2003": "4086", + "2004": "4687", "2005": "7686", "2006": "3740", + "2007": "3075", "2008": "3753", "2009": "4679", + "2010": "5468", "2011": "7154", "2012": "4292", + "2013": "4283", "2014": "4595", "2015": "9194", + "2016": "4221", "2017": "4520"} + endogenous = {"1998": "691", "1999": "1580", "2000": "80", + "2001": "1450", "2002": "555", "2003": "956", + "2004": "877", "2005": "614", "2006": "468", + "2007": "191"} + + endog = Series(endogenous) + exog = Series(exogenous) + model = ols(y=endog, x=exog) + + pred = model.y_predict + self.assertTrue(pred.index.equals(exog.index)) + + def test_longpanel_series_combo(self): + wp = tm.makePanel() + lp = wp.to_frame() + + y = lp.pop('ItemA') + model = ols(y=y, x=lp, entity_effects=True, window=20) + self.assertTrue(notnull(model.beta.values).all()) + tm.assert_isinstance(model, PanelOLS) + model.summary + + def test_series_rhs(self): + y = tm.makeTimeSeries() + x = tm.makeTimeSeries() + model = ols(y=y, x=x) + expected = ols(y=y, x={'x': x}) + assert_series_equal(model.beta, expected.beta) + + # GH 5233/5250 + assert_series_equal(model.y_predict, model.predict(x=x)) + + def test_various_attributes(self): + # just make sure everything "works". 
test correctness elsewhere + + x = DataFrame(np.random.randn(100, 5)) + y = np.random.randn(100) + model = ols(y=y, x=x, window=20) + + series_attrs = ['rank', 'df', 'forecast_mean', 'forecast_vol'] + + for attr in series_attrs: + value = getattr(model, attr) + tm.assert_isinstance(value, Series) + + # works + model._results + + def test_catch_regressor_overlap(self): + df1 = tm.makeTimeDataFrame().ix[:, ['A', 'B']] + df2 = tm.makeTimeDataFrame().ix[:, ['B', 'C', 'D']] + y = tm.makeTimeSeries() + + data = {'foo': df1, 'bar': df2} + self.assertRaises(Exception, ols, y=y, x=data) + + def test_plm_ctor(self): + y = tm.makeTimeDataFrame() + x = {'a': tm.makeTimeDataFrame(), + 'b': tm.makeTimeDataFrame()} + + model = ols(y=y, x=x, intercept=False) + model.summary + + model = ols(y=y, x=Panel(x)) + model.summary + + def test_plm_attrs(self): + y = tm.makeTimeDataFrame() + x = {'a': tm.makeTimeDataFrame(), + 'b': tm.makeTimeDataFrame()} + + rmodel = ols(y=y, x=x, window=10) + model = ols(y=y, x=x) + model.resid + rmodel.resid + + def test_plm_lagged_y_predict(self): + y = tm.makeTimeDataFrame() + x = {'a': tm.makeTimeDataFrame(), + 'b': tm.makeTimeDataFrame()} + + model = ols(y=y, x=x, window=10) + result = model.lagged_y_predict(2) + + def test_plm_f_test(self): + y = tm.makeTimeDataFrame() + x = {'a': tm.makeTimeDataFrame(), + 'b': tm.makeTimeDataFrame()} + + model = ols(y=y, x=x) + + hyp = '1*a+1*b=0' + result = model.f_test(hyp) + + hyp = ['1*a=0', + '1*b=0'] + result = model.f_test(hyp) + assert_almost_equal(result['f-stat'], model.f_stat['f-stat']) + + def test_plm_exclude_dummy_corner(self): + y = tm.makeTimeDataFrame() + x = {'a': tm.makeTimeDataFrame(), + 'b': tm.makeTimeDataFrame()} + + model = ols( + y=y, x=x, entity_effects=True, dropped_dummies={'entity': 'D'}) + model.summary + + self.assertRaises(Exception, ols, y=y, x=x, entity_effects=True, + dropped_dummies={'entity': 'E'}) + + def test_columns_tuples_summary(self): + # #1837 + X = DataFrame(np.random.randn(10, 2), columns=[('a', 'b'), ('c', 'd')]) + Y = Series(np.random.randn(10)) + + # it works! 
+ model = ols(y=Y, x=X) + model.summary + + +class TestPanelOLS(BaseTest): + + _multiprocess_can_split_ = True + + FIELDS = ['beta', 'df', 'df_model', 'df_resid', 'f_stat', + 'p_value', 'r2', 'r2_adj', 'rmse', 'std_err', + 't_stat', 'var_beta'] + + _other_fields = ['resid', 'y_fitted'] + + def testFiltering(self): + result = ols(y=self.panel_y2, x=self.panel_x2) + + x = result._x + index = x.index.get_level_values(0) + index = Index(sorted(set(index))) + exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3)]) + self.assertTrue + (exp_index.equals(index)) + + index = x.index.get_level_values(1) + index = Index(sorted(set(index))) + exp_index = Index(['A', 'B']) + self.assertTrue(exp_index.equals(index)) + + x = result._x_filtered + index = x.index.get_level_values(0) + index = Index(sorted(set(index))) + exp_index = Index([datetime(2000, 1, 1), + datetime(2000, 1, 3), + datetime(2000, 1, 4)]) + self.assertTrue(exp_index.equals(index)) + + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + + exp_x = [[6, 14, 1], + [9, 17, 1], + [30, 48, 1]] + assert_almost_equal(exp_x, result._x.values) + + exp_x_filtered = [[6, 14, 1], + [9, 17, 1], + [30, 48, 1], + [11, 20, 1], + [12, 21, 1]] + assert_almost_equal(exp_x_filtered, result._x_filtered.values) + + self.assertTrue(result._x_filtered.index.levels[0].equals( + result.y_fitted.index)) + + def test_wls_panel(self): + y = tm.makeTimeDataFrame() + x = Panel({'x1': tm.makeTimeDataFrame(), + 'x2': tm.makeTimeDataFrame()}) + + y.ix[[1, 7], 'A'] = np.nan + y.ix[[6, 15], 'B'] = np.nan + y.ix[[3, 20], 'C'] = np.nan + y.ix[[5, 11], 'D'] = np.nan + + stack_y = y.stack() + stack_x = DataFrame(dict((k, v.stack()) + for k, v in compat.iteritems(x))) + + weights = x.std('items') + stack_weights = weights.stack() + + stack_y.index = stack_y.index._tuple_index + stack_x.index = stack_x.index._tuple_index + stack_weights.index = stack_weights.index._tuple_index + + result = ols(y=y, x=x, weights=1 / weights) + expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights) + + assert_almost_equal(result.beta, expected.beta) + + for attr in ['resid', 'y_fitted']: + rvals = getattr(result, attr).stack().values + evals = getattr(expected, attr).values + assert_almost_equal(rvals, evals) + + def testWithTimeEffects(self): + result = ols(y=self.panel_y2, x=self.panel_x2, time_effects=True) + + assert_almost_equal(result._y_trans.values.flat, [0, -0.5, 0.5]) + + exp_x = [[0, 0], [-10.5, -15.5], [10.5, 15.5]] + assert_almost_equal(result._x_trans.values, exp_x) + + # _check_non_raw_results(result) + + def testWithEntityEffects(self): + result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True) + + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + + exp_x = DataFrame([[0., 6., 14., 1.], [0, 9, 17, 1], [1, 30, 48, 1]], + index=result._x.index, columns=['FE_B', 'x1', 'x2', + 'intercept'], + dtype=float) + tm.assert_frame_equal(result._x, exp_x.ix[:, result._x.columns]) + # _check_non_raw_results(result) + + def testWithEntityEffectsAndDroppedDummies(self): + result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True, + dropped_dummies={'entity': 'B'}) + + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + exp_x = DataFrame([[1., 6., 14., 1.], [1, 9, 17, 1], [0, 30, 48, 1]], + index=result._x.index, columns=['FE_A', 'x1', 'x2', + 'intercept'], + dtype=float) + tm.assert_frame_equal(result._x, exp_x.ix[:, result._x.columns]) + # _check_non_raw_results(result) + + def testWithXEffects(self): + result = ols(y=self.panel_y2, 
x=self.panel_x2, x_effects=['x1']) + + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + + res = result._x + exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]], + columns=['x1_30', 'x1_9', 'x2', 'intercept'], + index=res.index, dtype=float) + assert_frame_equal(res, exp_x.reindex(columns=res.columns)) + + def testWithXEffectsAndDroppedDummies(self): + result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'], + dropped_dummies={'x1': 30}) + + res = result._x + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]], + columns=['x1_6', 'x1_9', 'x2', 'intercept'], + index=res.index, dtype=float) + + assert_frame_equal(res, exp_x.reindex(columns=res.columns)) + + def testWithXEffectsAndConversion(self): + result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2']) + + assert_almost_equal(result._y.values.flat, [1, 2, 3, 4]) + exp_x = [[0, 0, 0, 1, 1], [1, 0, 0, 0, 1], [0, 1, 1, 0, 1], + [0, 0, 0, 1, 1]] + assert_almost_equal(result._x.values, exp_x) + + exp_index = Index(['x1_B', 'x1_C', 'x2_baz', 'x2_foo', 'intercept']) + self.assertTrue(exp_index.equals(result._x.columns)) + + # _check_non_raw_results(result) + + def testWithXEffectsAndConversionAndDroppedDummies(self): + result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2'], + dropped_dummies={'x2': 'foo'}) + + assert_almost_equal(result._y.values.flat, [1, 2, 3, 4]) + exp_x = [[0, 0, 0, 0, 1], [1, 0, 1, 0, 1], [0, 1, 0, 1, 1], + [0, 0, 0, 0, 1]] + assert_almost_equal(result._x.values, exp_x) + + exp_index = Index(['x1_B', 'x1_C', 'x2_bar', 'x2_baz', 'intercept']) + self.assertTrue(exp_index.equals(result._x.columns)) + + # _check_non_raw_results(result) + + def testForSeries(self): + self.checkForSeries(self.series_panel_x, self.series_panel_y, + self.series_x, self.series_y) + + self.checkForSeries(self.series_panel_x, self.series_panel_y, + self.series_x, self.series_y, nw_lags=0) + + self.checkForSeries(self.series_panel_x, self.series_panel_y, + self.series_x, self.series_y, nw_lags=1, + nw_overlap=True) + + def testRolling(self): + self.checkMovingOLS(self.panel_x, self.panel_y) + + def testRollingWithFixedEffects(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + entity_effects=True) + self.checkMovingOLS(self.panel_x, self.panel_y, intercept=False, + entity_effects=True) + + def testRollingWithTimeEffects(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + time_effects=True) + + def testRollingWithNeweyWest(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + nw_lags=1) + + def testRollingWithEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + cluster='entity') + def testUnknownClusterRaisesValueError(self): + assertRaisesRegexp(ValueError, "Unrecognized cluster.*ridiculous", + self.checkMovingOLS, self.panel_x, self.panel_y, + cluster='ridiculous') + def testRollingWithTimeEffectsAndEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + time_effects=True, cluster='entity') + + def testRollingWithTimeCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + cluster='time') + + def testRollingWithNeweyWestAndEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + nw_lags=1, cluster='entity') + + def testRollingWithNeweyWestAndTimeEffectsAndEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + nw_lags=1, cluster='entity', + time_effects=True) + + def testExpanding(self): + self.checkMovingOLS( + self.panel_x, 
self.panel_y, window_type='expanding') + + def testNonPooled(self): + self.checkNonPooled(y=self.panel_y, x=self.panel_x) + self.checkNonPooled(y=self.panel_y, x=self.panel_x, + window_type='rolling', window=25, min_periods=10) + def testUnknownWindowType(self): + assertRaisesRegexp(ValueError, "window.*ridiculous", + self.checkNonPooled, y=self.panel_y, x=self.panel_x, + window_type='ridiculous', window=25, min_periods=10) + + def checkNonPooled(self, x, y, **kwds): + # For now, just check that it doesn't crash + result = ols(y=y, x=x, pool=False, **kwds) + + _check_repr(result) + for attr in NonPooledPanelOLS.ATTRIBUTES: + _check_repr(getattr(result, attr)) + + def checkMovingOLS(self, x, y, window_type='rolling', **kwds): + window = 25 # must be larger than rank of x + + moving = ols(y=y, x=x, window_type=window_type, + window=window, **kwds) + + index = moving._index + + for n, i in enumerate(moving._valid_indices): + if window_type == 'rolling' and i >= window: + prior_date = index[i - window + 1] + else: + prior_date = index[0] + + date = index[i] + + x_iter = {} + for k, v in compat.iteritems(x): + x_iter[k] = v.truncate(before=prior_date, after=date) + y_iter = y.truncate(before=prior_date, after=date) + + static = ols(y=y_iter, x=x_iter, **kwds) + + self.compare(static, moving, event_index=i, + result_index=n) + + _check_non_raw_results(moving) + + def checkForSeries(self, x, y, series_x, series_y, **kwds): + # Consistency check with simple OLS. + result = ols(y=y, x=x, **kwds) + reference = ols(y=series_y, x=series_x, **kwds) + + self.compare(reference, result) + + def compare(self, static, moving, event_index=None, + result_index=None): + + # Check resid if we have a time index specified + if event_index is not None: + staticSlice = _period_slice(static, -1) + movingSlice = _period_slice(moving, event_index) + + ref = static._resid_raw[staticSlice] + res = moving._resid_raw[movingSlice] + + assert_almost_equal(ref, res) + + ref = static._y_fitted_raw[staticSlice] + res = moving._y_fitted_raw[movingSlice] + + assert_almost_equal(ref, res) + + # Check y_fitted + + for field in self.FIELDS: + attr = '_%s_raw' % field + + ref = getattr(static, attr) + res = getattr(moving, attr) + + if result_index is not None: + res = res[result_index] + + assert_almost_equal(ref, res) + + def test_auto_rolling_window_type(self): + data = tm.makeTimeDataFrame() + y = data.pop('A') + + window_model = ols(y=y, x=data, window=20, min_periods=10) + rolling_model = ols(y=y, x=data, window=20, min_periods=10, + window_type='rolling') + + assert_frame_equal(window_model.beta, rolling_model.beta) + + +def _check_non_raw_results(model): + _check_repr(model) + _check_repr(model.resid) + _check_repr(model.summary_as_matrix) + _check_repr(model.y_fitted) + _check_repr(model.y_predict) + + +def _period_slice(panelModel, i): + index = panelModel._x_trans.index + period = index.levels[0][i] + + L, R = index.get_major_bounds(period, period) + + return slice(L, R) + + +class TestOLSFilter(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + date_index = date_range(datetime(2009, 12, 11), periods=3, + freq=datetools.bday) + ts = Series([3, 1, 4], index=date_index) + self.TS1 = ts + + date_index = date_range(datetime(2009, 12, 11), periods=5, + freq=datetools.bday) + ts = Series([1, 5, 9, 2, 6], index=date_index) + self.TS2 = ts + + date_index = date_range(datetime(2009, 12, 11), periods=3, + freq=datetools.bday) + ts = Series([5, np.nan, 3], index=date_index) + self.TS3 = ts + + date_index = 
date_range(datetime(2009, 12, 11), periods=5, + freq=datetools.bday) + ts = Series([np.nan, 5, 8, 9, 7], index=date_index) + self.TS4 = ts + + data = {'x1': self.TS2, 'x2': self.TS4} + self.DF1 = DataFrame(data=data) + + data = {'x1': self.TS2, 'x2': self.TS4} + self.DICT1 = data + + def testFilterWithSeriesRHS(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS1, {'x1': self.TS2}, None) + self.tsAssertEqual(self.TS1, lhs) + self.tsAssertEqual(self.TS2[:3], rhs['x1']) + self.tsAssertEqual(self.TS2, rhs_pre['x1']) + + def testFilterWithSeriesRHS2(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS2, {'x1': self.TS1}, None) + self.tsAssertEqual(self.TS2[:3], lhs) + self.tsAssertEqual(self.TS1, rhs['x1']) + self.tsAssertEqual(self.TS1, rhs_pre['x1']) + + def testFilterWithSeriesRHS3(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS3, {'x1': self.TS4}, None) + exp_lhs = self.TS3[2:3] + exp_rhs = self.TS4[2:3] + exp_rhs_pre = self.TS4[1:] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs, rhs['x1']) + self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1']) + + def testFilterWithDataFrameRHS(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS1, self.DF1, None) + exp_lhs = self.TS1[1:] + exp_rhs1 = self.TS2[1:3] + exp_rhs2 = self.TS4[1:3] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs1, rhs['x1']) + self.tsAssertEqual(exp_rhs2, rhs['x2']) + + def testFilterWithDictRHS(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS1, self.DICT1, None) + exp_lhs = self.TS1[1:] + exp_rhs1 = self.TS2[1:3] + exp_rhs2 = self.TS4[1:3] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs1, rhs['x1']) + self.tsAssertEqual(exp_rhs2, rhs['x2']) + + def tsAssertEqual(self, ts1, ts2): + self.assert_numpy_array_equal(ts1, ts2) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_var.py b/pandas/stats/tests/test_var.py new file mode 100644 index 00000000..ab5709d0 --- /dev/null +++ b/pandas/stats/tests/test_var.py @@ -0,0 +1,195 @@ +from __future__ import print_function +from numpy.testing import run_module_suite, assert_equal, TestCase + +from pandas.util.testing import assert_almost_equal + +from pandas.compat import range +import nose +import unittest + +raise nose.SkipTest('skipping this for now') + +try: + import statsmodels.tsa.var as sm_var + import statsmodels as sm +except ImportError: + import scikits.statsmodels.tsa.var as sm_var + import scikits.statsmodels as sm + + +import pandas.stats.var as _pvar +reload(_pvar) +from pandas.stats.var import VAR + +try: + import rpy2.robjects as robj + from rpy2.robjects import r + from rpy2.robjects.packages import importr + import pandas.rpy.common as rpy + vars = importr('vars') + urca = importr('urca') +except ImportError: + pass + +DECIMAL_6 = 6 +DECIMAL_5 = 5 +DECIMAL_4 = 4 +DECIMAL_3 = 3 +DECIMAL_2 = 2 + + +class CheckVAR(object): + def test_params(self): + assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_3) + + def test_neqs(self): + assert_equal(self.res1.neqs, self.res2.neqs) + + def test_nobs(self): + assert_equal(self.res1.avobs, self.res2.nobs) + + def test_df_eq(self): + assert_equal(self.res1.df_eq, self.res2.df_eq) + + def test_rmse(self): + results = self.res1.results + for i in range(len(results)): + assert_almost_equal(results[i].mse_resid ** .5, + 
eval('self.res2.rmse_' + str(i + 1)), DECIMAL_6) + + def test_rsquared(self): + results = self.res1.results + for i in range(len(results)): + assert_almost_equal(results[i].rsquared, + eval('self.res2.rsquared_' + str(i + 1)), DECIMAL_3) + + def test_llf(self): + results = self.res1.results + assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2) + for i in range(len(results)): + assert_almost_equal(results[i].llf, + eval('self.res2.llf_' + str(i + 1)), DECIMAL_2) + + def test_aic(self): + assert_almost_equal(self.res1.aic, self.res2.aic) + + def test_bic(self): + assert_almost_equal(self.res1.bic, self.res2.bic) + + def test_hqic(self): + assert_almost_equal(self.res1.hqic, self.res2.hqic) + + def test_fpe(self): + assert_almost_equal(self.res1.fpe, self.res2.fpe) + + def test_detsig(self): + assert_almost_equal(self.res1.detomega, self.res2.detsig) + + def test_bse(self): + assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4) + + +class Foo(object): + def __init__(self): + data = sm.datasets.macrodata.load() + data = data.data[['realinv', 'realgdp', 'realcons']].view((float, 3)) + data = diff(log(data), axis=0) + self.res1 = VAR2(endog=data).fit(maxlag=2) + from results import results_var + self.res2 = results_var.MacrodataResults() + + +class RVAR(object): + """ + Estimates VAR model using R vars package and rpy + """ + + def __init__(self, data, p=1, type='both'): + self.rdata = data + self.p = p + self.type = type + + self.pydata = rpy.convert_robj(data) + self._estimate = None + self.estimate() + + @property + def aic(self): + pass + + @property + def bic(self): + pass + + @property + def beta(self): + return rpy.convert_robj(r.coef(self._estimate)) + + def summary(self, equation=None): + print(r.summary(self._estimate, equation=equation)) + + def output(self): + print(self._estimate) + + def estimate(self): + self._estimate = r.VAR(self.rdata, p=self.p, type=self.type) + + def plot(self, names=None): + r.plot(model._estimate, names=names) + + def serial_test(self, lags_pt=16, type='PT.asymptotic'): + f = r['serial.test'] + + test = f(self._estimate, **{'lags.pt': lags_pt, + 'type': type}) + + return test + + def data_summary(self): + print(r.summary(self.rdata)) + + +class TestVAR(TestCase): + + def setUp(self): + try: + import rpy2 + except ImportError: + raise nose.SkipTest("No rpy2") + + self.rdata = rpy.load_data('Canada', package='vars', convert=False) + self.data = rpy.load_data('Canada', package='vars', convert=True) + + self.res = VAR(self.data) + self.ref = RVAR(self.rdata) + + def test_foo(self): + pass + +if __name__ == '__main__': + # canada = rpy.load_data('Canada', package='vars', convert=False) + + # model = RVAR(canada, p=1) + + # summary(Canada) + + # plot(Canada, nc=2, xlab="")ppp + + # adf1 <- summary(ur.df(Canada[, "prod"], type = "trend", lags = 2)) + # adf1 + + # adf2 <- summary(ur.df(diff(Canada[, "prod"]), type = "drift", lags = 1)) + # adf2 + + # VARselect(Canada, lag.max = 8, type = "both") + + # Canada <- Canada[, c("prod", "e", "U", "rw")] + + # p1ct <- VAR(Canada, p = 1, type = "both") + # p1ct + + # coefs <- coef(p1ct) + # class(coefs) + + # run_module_suite() + unittest.main() diff --git a/pandas/stats/var.py b/pandas/stats/var.py new file mode 100644 index 00000000..be55507f --- /dev/null +++ b/pandas/stats/var.py @@ -0,0 +1,595 @@ +from __future__ import division + +from pandas.compat import range, lrange, zip, reduce +from pandas import compat +import numpy as np +from pandas.core.base import StringMixin +from pandas.util.decorators 
import cache_readonly +from pandas.core.frame import DataFrame +from pandas.core.panel import Panel +from pandas.core.series import Series +import pandas.stats.common as common +from pandas.stats.math import inv +from pandas.stats.ols import _combine_rhs + + +class VAR(StringMixin): + """ + Estimates VAR(p) regression on multivariate time series data + presented in pandas data structures. + + Parameters + ---------- + data : DataFrame or dict of Series + p : lags to include + + """ + + def __init__(self, data, p=1, intercept=True): + try: + import statsmodels.tsa.vector_ar.api as sm_var + except ImportError: + import scikits.statsmodels.tsa.var as sm_var + + self._data = DataFrame(_combine_rhs(data)) + self._p = p + + self._columns = self._data.columns + self._index = self._data.index + + self._intercept = intercept + + @cache_readonly + def aic(self): + """Returns the Akaike information criterion.""" + return self._ic['aic'] + + @cache_readonly + def bic(self): + """Returns the Bayesian information criterion.""" + return self._ic['bic'] + + @cache_readonly + def beta(self): + """ + Returns a DataFrame, where each column x1 contains the betas + calculated by regressing the x1 column of the VAR input with + the lagged input. + + Returns + ------- + DataFrame + """ + d = dict([(key, value.beta) + for (key, value) in compat.iteritems(self.ols_results)]) + return DataFrame(d) + + def forecast(self, h): + """ + Returns a DataFrame containing the forecasts for 1, 2, ..., n time + steps. Each column x1 contains the forecasts of the x1 column. + + Parameters + ---------- + n: int + Number of time steps ahead to forecast. + + Returns + ------- + DataFrame + """ + forecast = self._forecast_raw(h)[:, 0, :] + return DataFrame(forecast, index=lrange(1, 1 + h), + columns=self._columns) + + def forecast_cov(self, h): + """ + Returns the covariance of the forecast residuals. + + Returns + ------- + DataFrame + """ + return [DataFrame(value, index=self._columns, columns=self._columns) + for value in self._forecast_cov_raw(h)] + + def forecast_std_err(self, h): + """ + Returns the standard errors of the forecast residuals. + + Returns + ------- + DataFrame + """ + return DataFrame(self._forecast_std_err_raw(h), + index=lrange(1, 1 + h), columns=self._columns) + + @cache_readonly + def granger_causality(self): + """Returns the f-stats and p-values from the Granger Causality Test. + + If the data consists of columns x1, x2, x3, then we perform the + following regressions: + + x1 ~ L(x2, x3) + x1 ~ L(x1, x3) + x1 ~ L(x1, x2) + + The f-stats of these results are placed in the 'x1' column of the + returned DataFrame. We then repeat for x2, x3. + + Returns + ------- + Dict, where 'f-stat' returns the DataFrame containing the f-stats, + and 'p-value' returns the DataFrame containing the corresponding + p-values of the f-stats. 
+ """ + from pandas.stats.api import ols + from scipy.stats import f + + d = {} + for col in self._columns: + d[col] = {} + for i in range(1, 1 + self._p): + lagged_data = self._lagged_data[i].filter( + self._columns - [col]) + + for key, value in compat.iteritems(lagged_data): + d[col][_make_param_name(i, key)] = value + + f_stat_dict = {} + p_value_dict = {} + + for col, y in compat.iteritems(self._data): + ssr_full = (self.resid[col] ** 2).sum() + + f_stats = [] + p_values = [] + + for col2 in self._columns: + result = ols(y=y, x=d[col2]) + + resid = result.resid + ssr_reduced = (resid ** 2).sum() + + M = self._p + N = self._nobs + K = self._k * self._p + 1 + f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K)) + f_stats.append(f_stat) + + p_value = f.sf(f_stat, M, N - K) + p_values.append(p_value) + + f_stat_dict[col] = Series(f_stats, self._columns) + p_value_dict[col] = Series(p_values, self._columns) + + f_stat_mat = DataFrame(f_stat_dict) + p_value_mat = DataFrame(p_value_dict) + + return { + 'f-stat': f_stat_mat, + 'p-value': p_value_mat, + } + + @cache_readonly + def ols_results(self): + """ + Returns the results of the regressions: + x_1 ~ L(X) + x_2 ~ L(X) + ... + x_k ~ L(X) + + where X = [x_1, x_2, ..., x_k] + and L(X) represents the columns of X lagged 1, 2, ..., n lags + (n is the user-provided number of lags). + + Returns + ------- + dict + """ + from pandas.stats.api import ols + + d = {} + for i in range(1, 1 + self._p): + for col, series in compat.iteritems(self._lagged_data[i]): + d[_make_param_name(i, col)] = series + + result = dict([(col, ols(y=y, x=d, intercept=self._intercept)) + for col, y in compat.iteritems(self._data)]) + + return result + + @cache_readonly + def resid(self): + """ + Returns the DataFrame containing the residuals of the VAR regressions. + Each column x1 contains the residuals generated by regressing the x1 + column of the input against the lagged input. + + Returns + ------- + DataFrame + """ + d = dict([(col, series.resid) + for (col, series) in compat.iteritems(self.ols_results)]) + return DataFrame(d, index=self._index) + + @cache_readonly + def summary(self): + template = """ +%(banner_top)s + +Number of Observations: %(nobs)d +AIC: %(aic).3f +BIC: %(bic).3f + +%(banner_coef)s +%(coef_table)s +%(banner_end)s +""" + params = { + 'banner_top': common.banner('Summary of VAR'), + 'banner_coef': common.banner('Summary of Estimated Coefficients'), + 'banner_end': common.banner('End of Summary'), + 'coef_table': self.beta, + 'aic': self.aic, + 'bic': self.bic, + 'nobs': self._nobs, + } + + return template % params + + @cache_readonly + def _alpha(self): + """ + Returns array where the i-th element contains the intercept + when regressing the i-th column of self._data with the lagged data. + """ + if self._intercept: + return self._beta_raw[-1] + else: + return np.zeros(self._k) + + @cache_readonly + def _beta_raw(self): + return np.array([list(self.beta[col].values()) for col in self._columns]).T + + def _trans_B(self, h): + """ + Returns 0, 1, ..., (h-1)-th power of transpose of B as defined in + equation (4) on p. 142 of the Stata 11 Time Series reference book. 
+ """ + result = [np.eye(1 + self._k * self._p)] + + row1 = np.zeros((1, 1 + self._k * self._p)) + row1[0, 0] = 1 + + v = self._alpha.reshape((self._k, 1)) + row2 = np.hstack(tuple([v] + self._lag_betas)) + + m = self._k * (self._p - 1) + row3 = np.hstack(( + np.zeros((m, 1)), + np.eye(m), + np.zeros((m, self._k)) + )) + + trans_B = np.vstack((row1, row2, row3)).T + + result.append(trans_B) + + for i in range(2, h): + result.append(np.dot(trans_B, result[i - 1])) + + return result + + @cache_readonly + def _x(self): + values = np.array([ + list(self._lagged_data[i][col].values()) + for i in range(1, 1 + self._p) + for col in self._columns + ]).T + + x = np.hstack((np.ones((len(values), 1)), values))[self._p:] + + return x + + @cache_readonly + def _cov_beta(self): + cov_resid = self._sigma + + x = self._x + + inv_cov_x = inv(np.dot(x.T, x)) + + return np.kron(inv_cov_x, cov_resid) + + def _data_xs(self, i): + """ + Returns the cross-section of the data at the given timestep. + """ + return self._data.values[i] + + def _forecast_cov_raw(self, n): + resid = self._forecast_cov_resid_raw(n) + # beta = self._forecast_cov_beta_raw(n) + + # return [a + b for a, b in zip(resid, beta)] + # TODO: ignore the beta forecast std err until it's verified + + return resid + + def _forecast_cov_beta_raw(self, n): + """ + Returns the covariance of the beta errors for the forecast at + 1, 2, ..., n timesteps. + """ + p = self._p + + values = self._data.values + T = len(values) - self._p - 1 + + results = [] + + for h in range(1, n + 1): + psi = self._psi(h) + trans_B = self._trans_B(h) + + sum = 0 + + cov_beta = self._cov_beta + + for t in range(T + 1): + index = t + p + y = values.take(lrange(index, index - p, -1), axis=0).ravel() + trans_Z = np.hstack(([1], y)) + trans_Z = trans_Z.reshape(1, len(trans_Z)) + + sum2 = 0 + for i in range(h): + ZB = np.dot(trans_Z, trans_B[h - 1 - i]) + + prod = np.kron(ZB, psi[i]) + sum2 = sum2 + prod + + sum = sum + chain_dot(sum2, cov_beta, sum2.T) + + results.append(sum / (T + 1)) + + return results + + def _forecast_cov_resid_raw(self, h): + """ + Returns the covariance of the residual errors for the forecast at + 1, 2, ..., h timesteps. + """ + psi_values = self._psi(h) + sum = 0 + result = [] + for i in range(h): + psi = psi_values[i] + sum = sum + chain_dot(psi, self._sigma, psi.T) + result.append(sum) + + return result + + def _forecast_raw(self, h): + """ + Returns the forecast at 1, 2, ..., h timesteps in the future. + """ + k = self._k + result = [] + for i in range(h): + sum = self._alpha.reshape(1, k) + for j in range(self._p): + beta = self._lag_betas[j] + idx = i - j + if idx > 0: + y = result[idx - 1] + else: + y = self._data_xs(idx - 1) + + sum = sum + np.dot(beta, y.T).T + result.append(sum) + + return np.array(result) + + def _forecast_std_err_raw(self, h): + """ + Returns the standard error of the forecasts + at 1, 2, ..., n timesteps. + """ + return np.array([np.sqrt(np.diag(value)) + for value in self._forecast_cov_raw(h)]) + + @cache_readonly + def _ic(self): + """ + Returns the Akaike/Bayesian information criteria. 
+ """ + RSS = self._rss + k = self._p * (self._k * self._p + 1) + n = self._nobs * self._k + + return {'aic': 2 * k + n * np.log(RSS / n), + 'bic': n * np.log(RSS / n) + k * np.log(n)} + + @cache_readonly + def _k(self): + return len(self._columns) + + @cache_readonly + def _lag_betas(self): + """ + Returns list of B_i, where B_i represents the (k, k) matrix + with the j-th row containing the betas of regressing the j-th + column of self._data with self._data lagged i time steps. + First element is B_1, second element is B_2, etc. + """ + k = self._k + b = self._beta_raw + return [b[k * i: k * (i + 1)].T for i in range(self._p)] + + @cache_readonly + def _lagged_data(self): + return dict([(i, self._data.shift(i)) + for i in range(1, 1 + self._p)]) + + @cache_readonly + def _nobs(self): + return len(self._data) - self._p + + def _psi(self, h): + """ + psi value used for calculating standard error. + + Returns [psi_0, psi_1, ..., psi_(h - 1)] + """ + k = self._k + result = [np.eye(k)] + for i in range(1, h): + result.append(sum( + [np.dot(result[i - j], self._lag_betas[j - 1]) + for j in range(1, 1 + i) + if j <= self._p])) + + return result + + @cache_readonly + def _resid_raw(self): + resid = np.array([self.ols_results[col]._resid_raw + for col in self._columns]) + return resid + + @cache_readonly + def _rss(self): + """Returns the sum of the squares of the residuals.""" + return (self._resid_raw ** 2).sum() + + @cache_readonly + def _sigma(self): + """Returns covariance of resids.""" + k = self._k + n = self._nobs + + resid = self._resid_raw + + return np.dot(resid, resid.T) / (n - k) + + def __unicode__(self): + return self.summary + + +def lag_select(data, max_lags=5, ic=None): + """ + Select number of lags based on a variety of information criteria + + Parameters + ---------- + data : DataFrame-like + max_lags : int + Maximum number of lags to evaluate + ic : {None, 'aic', 'bic', ...} + Choosing None will just display the results + + Returns + ------- + None + """ + pass + + +class PanelVAR(VAR): + """ + Performs Vector Autoregression on panel data. + + Parameters + ---------- + data: Panel or dict of DataFrame + lags: int + """ + def __init__(self, data, lags, intercept=True): + self._data = _prep_panel_data(data) + self._p = lags + self._intercept = intercept + + self._columns = self._data.items + + @cache_readonly + def _nobs(self): + """Returns the number of observations.""" + _, timesteps, entities = self._data.values.shape + return (timesteps - self._p) * entities + + @cache_readonly + def _rss(self): + """Returns the sum of the squares of the residuals.""" + return (self.resid.values ** 2).sum() + + def forecast(self, h): + """ + Returns the forecasts at 1, 2, ..., n timesteps in the future. + """ + forecast = self._forecast_raw(h).T.swapaxes(1, 2) + index = lrange(1, 1 + h) + w = Panel(forecast, items=self._data.items, major_axis=index, + minor_axis=self._data.minor_axis) + return w + + @cache_readonly + def resid(self): + """ + Returns the DataFrame containing the residuals of the VAR regressions. + Each column x1 contains the residuals generated by regressing the x1 + column of the input against the lagged input. 
+ + Returns + ------- + DataFrame + """ + d = dict([(key, value.resid) + for (key, value) in compat.iteritems(self.ols_results)]) + return Panel.fromDict(d) + + def _data_xs(self, i): + return self._data.values[:, i, :].T + + @cache_readonly + def _sigma(self): + """Returns covariance of resids.""" + k = self._k + resid = _drop_incomplete_rows(self.resid.toLong().values) + n = len(resid) + return np.dot(resid.T, resid) / (n - k) + + +def _prep_panel_data(data): + """Converts the given data into a Panel.""" + if isinstance(data, Panel): + return data + + return Panel.fromDict(data) + + +def _drop_incomplete_rows(array): + mask = np.isfinite(array).all(1) + indices = np.arange(len(array))[mask] + return array.take(indices, 0) + + +def _make_param_name(lag, name): + return 'L%d.%s' % (lag, name) + + +def chain_dot(*matrices): + """ + Returns the dot product of the given matrices. + + Parameters + ---------- + matrices: argument list of ndarray + """ + return reduce(lambda x, y: np.dot(y, x), matrices[::-1]) diff --git a/pandas/tests/__init__.py b/pandas/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tests/data/iris.csv b/pandas/tests/data/iris.csv new file mode 100644 index 00000000..c19b9c36 --- /dev/null +++ b/pandas/tests/data/iris.csv @@ -0,0 +1,151 @@ +SepalLength,SepalWidth,PetalLength,PetalWidth,Name +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor 
+6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/pandas/tests/data/mindex_073.pickle b/pandas/tests/data/mindex_073.pickle new file mode 100644 index 0000000000000000000000000000000000000000..c99f51fa289ac53b0301d25dc3dec5cc61b670e0 GIT binary patch literal 670 zcmaJ;%TB{E5R8+Shj~Lw%i{xb>IoqZNSupBKu9@8C`pZ6i6-6PN>t*&B}V;fSf|-m zAc_weC$lp>-DOeX0Ej4t(`j>VBp# zFY45Xi_}LpwO1+(I;5bjD(LmkpbiENyH0&rjc$!P^q%t^+NDVpQ%n0}Yu_wYYZm&e zv|dd#mR?C*KnJ~K1YJ8#wJDB!s5S&Ai} +Length: 0, Freq: D, Timezone: None""" + exp2 = """ +[2011-01-01] +Length: 1, Freq: D, Timezone: None""" + exp3 = """ +[2011-01-01, 2011-01-02] +Length: 2, Freq: D, Timezone: None""" + exp4 = """ +[2011-01-01, ..., 2011-01-03] +Length: 3, Freq: D, Timezone: None""" + exp5 = """ +[2011-01-01 09:00:00+09:00, ..., 2011-01-01 11:00:00+09:00] +Length: 3, Freq: H, Timezone: Asia/Tokyo""" + exp6 = """ +[2011-01-01 09:00:00-05:00, ..., NaT] +Length: 3, Freq: None, Timezone: US/Eastern""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], + [exp1, exp2, exp3, exp4, 
exp5, exp6]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) + + +class TestPeriodIndexOps(Ops): + _allowed = '_allow_period_index_ops' + + def setUp(self): + super(TestPeriodIndexOps, self).setUp() + mask = lambda x: x._allow_datetime_index_ops or x._allow_period_index_ops + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ o for o in self.objs if not mask(o) ] + + def test_ops_properties(self): + self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) + self.check_ops_properties(['qyear'], lambda x: isinstance(x,PeriodIndex)) + + def test_asobject_tolist(self): + idx = pd.period_range(start='2013-01-01', periods=4, freq='M', name='idx') + expected_list = [pd.Period('2013-01-31', freq='M'), pd.Period('2013-02-28', freq='M'), + pd.Period('2013-03-31', freq='M'), pd.Period('2013-04-30', freq='M')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(idx.tolist(), expected_list) + + idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT', '2013-01-04'], freq='D', name='idx') + expected_list = [pd.Period('2013-01-01', freq='D'), pd.Period('2013-01-02', freq='D'), + pd.Period('NaT', freq='D'), pd.Period('2013-01-04', freq='D')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.asobject + self.assertTrue(isinstance(result, Index)) + self.assertEqual(result.dtype, object) + for i in [0, 1, 3]: + self.assertTrue(result[i], expected[i]) + self.assertTrue(result[2].ordinal, pd.tslib.iNaT) + self.assertTrue(result[2].freq, 'D') + self.assertEqual(result.name, expected.name) + + result_list = idx.tolist() + for i in [0, 1, 3]: + self.assertTrue(result_list[i], expected_list[i]) + self.assertTrue(result_list[2].ordinal, pd.tslib.iNaT) + self.assertTrue(result_list[2].freq, 'D') + + def test_minmax(self): + + # monotonic + idx1 = pd.PeriodIndex([pd.NaT, '2011-01-01', '2011-01-02', + '2011-01-03'], freq='D') + self.assertTrue(idx1.is_monotonic) + + # non-monotonic + idx2 = pd.PeriodIndex(['2011-01-01', pd.NaT, '2011-01-03', + '2011-01-02', pd.NaT], freq='D') + self.assertFalse(idx2.is_monotonic) + + for idx in [idx1, idx2]: + self.assertEqual(idx.min(), pd.Period('2011-01-01', freq='D')) + self.assertEqual(idx.max(), pd.Period('2011-01-03', freq='D')) + + for op in ['min', 'max']: + # Return NaT + obj = PeriodIndex([], freq='M') + result = getattr(obj, op)() + self.assertEqual(result.ordinal, tslib.iNaT) + self.assertEqual(result.freq, 'M') + + obj = PeriodIndex([pd.NaT], freq='M') + result = getattr(obj, op)() + self.assertEqual(result.ordinal, tslib.iNaT) + self.assertEqual(result.freq, 'M') + + obj = PeriodIndex([pd.NaT, pd.NaT, pd.NaT], freq='M') + result = getattr(obj, op)() + self.assertEqual(result.ordinal, tslib.iNaT) + self.assertEqual(result.freq, 'M') + + def test_representation(self): + # GH 7601 + idx1 = PeriodIndex([], freq='D') + idx2 = PeriodIndex(['2011-01-01'], freq='D') + idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') + + idx7 = 
pd.period_range('2013Q1', periods=1, freq="Q") + idx8 = pd.period_range('2013Q1', periods=2, freq="Q") + idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + + exp1 = """ +Length: 0, Freq: D""" + exp2 = """ +[2011-01-01] +Length: 1, Freq: D""" + exp3 = """ +[2011-01-01, 2011-01-02] +Length: 2, Freq: D""" + exp4 = """ +[2011-01-01, ..., 2011-01-03] +Length: 3, Freq: D""" + exp5 = """ +[2011, ..., 2013] +Length: 3, Freq: A-DEC""" + exp6 = """ +[2011-01-01 09:00, ..., NaT] +Length: 3, Freq: H""" + exp7 = """ +[2013Q1] +Length: 1, Freq: Q-DEC""" + exp8 = """ +[2013Q1, 2013Q2] +Length: 2, Freq: Q-DEC""" + exp9 = """ +[2013Q1, ..., 2013Q3] +Length: 3, Freq: Q-DEC""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9]): + for func in ['__repr__', '__unicode__', '__str__']: + result = getattr(idx, func)() + self.assertEqual(result, expected) + + +if __name__ == '__main__': + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py new file mode 100644 index 00000000..a195b573 --- /dev/null +++ b/pandas/tests/test_categorical.py @@ -0,0 +1,222 @@ +# pylint: disable=E1101,E1103,W0232 + +from datetime import datetime +from pandas.compat import range, lrange, u +import nose +import re + +import numpy as np + +from pandas.core.categorical import Categorical +from pandas.core.index import Index, Int64Index, MultiIndex +from pandas.core.frame import DataFrame +from pandas.tseries.period import PeriodIndex +from pandas.util.testing import assert_almost_equal +import pandas.core.common as com +from pandas.tseries.period import PeriodIndex + +import pandas.util.testing as tm + + +class TestCategorical(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.factor = Categorical.from_array(['a', 'b', 'b', 'a', + 'a', 'c', 'c', 'c']) + + def test_getitem(self): + self.assertEqual(self.factor[0], 'a') + self.assertEqual(self.factor[-1], 'c') + + subf = self.factor[[0, 1, 2]] + tm.assert_almost_equal(subf.labels, [0, 1, 1]) + + subf = self.factor[np.asarray(self.factor) == 'c'] + tm.assert_almost_equal(subf.labels, [2, 2, 2]) + + def test_constructor_unsortable(self): + raise nose.SkipTest('skipping for now') + + arr = np.array([1, 2, 3, datetime.now()], dtype='O') + + # it works! 
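+        # the mixed int/datetime array has no natural ordering, so the levels
+        # cannot be sorted; constructing the Categorical should still succeed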
+ factor = Categorical.from_array(arr) + + def test_factor_agg(self): + import pandas.core.frame as frame + + arr = np.arange(len(self.factor)) + + f = np.sum + agged = frame.factor_agg(self.factor, arr, f) + labels = self.factor.labels + for i, idx in enumerate(self.factor.levels): + self.assertEqual(f(arr[labels == i]), agged[i]) + + def test_comparisons(self): + result = self.factor[self.factor == 'a'] + expected = self.factor[np.asarray(self.factor) == 'a'] + self.assertTrue(result.equals(expected)) + + result = self.factor[self.factor != 'a'] + expected = self.factor[np.asarray(self.factor) != 'a'] + self.assertTrue(result.equals(expected)) + + result = self.factor[self.factor < 'c'] + expected = self.factor[np.asarray(self.factor) < 'c'] + self.assertTrue(result.equals(expected)) + + result = self.factor[self.factor > 'a'] + expected = self.factor[np.asarray(self.factor) > 'a'] + self.assertTrue(result.equals(expected)) + + result = self.factor[self.factor >= 'b'] + expected = self.factor[np.asarray(self.factor) >= 'b'] + self.assertTrue(result.equals(expected)) + + result = self.factor[self.factor <= 'b'] + expected = self.factor[np.asarray(self.factor) <= 'b'] + self.assertTrue(result.equals(expected)) + + n = len(self.factor) + + other = self.factor[np.random.permutation(n)] + result = self.factor == other + expected = np.asarray(self.factor) == np.asarray(other) + self.assert_numpy_array_equal(result, expected) + + result = self.factor == 'd' + expected = np.repeat(False, len(self.factor)) + self.assert_numpy_array_equal(result, expected) + + def test_na_flags_int_levels(self): + # #1457 + + levels = lrange(10) + labels = np.random.randint(0, 10, 20) + labels[::5] = -1 + + cat = Categorical(labels, levels) + repr(cat) + + self.assert_numpy_array_equal(com.isnull(cat), labels == -1) + + def test_levels_none(self): + factor = Categorical(['a', 'b', 'b', 'a', + 'a', 'c', 'c', 'c']) + self.assertTrue(factor.equals(self.factor)) + + def test_describe(self): + # string type + desc = self.factor.describe() + expected = DataFrame.from_dict(dict(counts=[3, 2, 3], + freqs=[3/8., 2/8., 3/8.], + levels=['a', 'b', 'c']) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + # check an integer one + desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe() + expected = DataFrame.from_dict(dict(counts=[5, 3, 3], + freqs=[5/11., 3/11., 3/11.], + levels=[1,2,3] + ) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + def test_print(self): + expected = [" a", " b", " b", " a", " a", " c", " c", " c", + "Levels (3): Index([a, b, c], dtype=object)"] + expected = "\n".join(expected) + # hack because array_repr changed in numpy > 1.6.x + actual = repr(self.factor) + pat = "Index\(\['a', 'b', 'c']" + sub = "Index([a, b, c]" + actual = re.sub(pat, sub, actual) + + self.assertEqual(actual, expected) + + def test_big_print(self): + factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat') + expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c", + " a", " b", " c", " a", "...", " c", " a", " b", " c", + " a", " b", " c", " a", " b", " c", " a", " b", " c", + "Levels (3): Index([a, b, c], dtype=object)", + "Name: cat, Length: 600" ] + expected = "\n".join(expected) + + # hack because array_repr changed in numpy > 1.6.x + actual = repr(factor) + pat = "Index\(\['a', 'b', 'c']" + sub = "Index([a, b, c]" + actual = re.sub(pat, sub, actual) + + self.assertEqual(actual, expected) + + def test_empty_print(self): + factor = Categorical([], ["a","b","c"], name="cat") + 
expected = ("Categorical([], Name: cat, Levels (3): " + "Index([a, b, c], dtype=object)") + # hack because array_repr changed in numpy > 1.6.x + actual = repr(factor) + pat = "Index\(\['a', 'b', 'c']" + sub = "Index([a, b, c]" + actual = re.sub(pat, sub, actual) + + self.assertEqual(actual, expected) + + factor = Categorical([], ["a","b","c"]) + expected = ("Categorical([], Levels (3): " + "Index([a, b, c], dtype=object)") + # hack because array_repr changed in numpy > 1.6.x + actual = repr(factor) + pat = "Index\(\['a', 'b', 'c']" + sub = "Index([a, b, c]" + actual = re.sub(pat, sub, actual) + + self.assertEqual(actual, expected) + + factor = Categorical([], []) + expected = ("Categorical([], Levels (0): " + "Index([], dtype=object)") + self.assertEqual(repr(factor), expected) + + def test_periodindex(self): + idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', + '2014-03', '2014-03'], freq='M') + cat1 = Categorical.from_array(idx1) + + exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + + self.assert_numpy_array_equal(cat1.labels, exp_arr) + self.assertTrue(cat1.levels.equals(exp_idx)) + + + idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01'], freq='M') + cat2 = Categorical.from_array(idx2) + + exp_arr = np.array([2, 2, 1, 0, 2, 0]) + + self.assert_numpy_array_equal(cat2.labels, exp_arr) + self.assertTrue(cat2.levels.equals(exp_idx)) + + idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', + '2013-08', '2013-07', '2013-05'], freq='M') + cat3 = Categorical.from_array(idx3) + + exp_arr = np.array([6, 5, 4, 3, 2, 1, 0]) + exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', + '2013-10', '2013-11', '2013-12'], freq='M') + + self.assert_numpy_array_equal(cat3.labels, exp_arr) + self.assertTrue(cat3.levels.equals(exp_idx)) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py new file mode 100644 index 00000000..a52be0ee --- /dev/null +++ b/pandas/tests/test_common.py @@ -0,0 +1,876 @@ +from datetime import datetime +import re + +import nose +from nose.tools import assert_equal +import numpy as np +from pandas.tslib import iNaT, NaT +from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp, Float64Index +from pandas import compat +from pandas.compat import range, long, lrange, lmap, u +from pandas.core.common import notnull, isnull, array_equivalent +import pandas.core.common as com +import pandas.util.testing as tm +import pandas.core.config as cf + +_multiprocess_can_split_ = True + + +def test_mut_exclusive(): + msg = "mutually exclusive arguments: '[ab]' and '[ab]'" + with tm.assertRaisesRegexp(TypeError, msg): + com._mut_exclusive(a=1, b=2) + assert com._mut_exclusive(a=1, b=None) == 1 + assert com._mut_exclusive(major=None, major_axis=None) is None + + +def test_is_sequence(): + is_seq = com._is_sequence + assert(is_seq((1, 2))) + assert(is_seq([1, 2])) + assert(not is_seq("abcd")) + assert(not is_seq(u("abcd"))) + assert(not is_seq(np.int64)) + + class A(object): + def __getitem__(self): + return 1 + + assert(not is_seq(A())) + + +def test_notnull(): + assert notnull(1.) 
+ assert not notnull(None) + assert not notnull(np.NaN) + + with cf.option_context("mode.use_inf_as_null", False): + assert notnull(np.inf) + assert notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.all() + + with cf.option_context("mode.use_inf_as_null", True): + assert not notnull(np.inf) + assert not notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.sum() == 2 + + with cf.option_context("mode.use_inf_as_null", False): + for s in [tm.makeFloatSeries(),tm.makeStringSeries(), + tm.makeObjectSeries(),tm.makeTimeSeries(),tm.makePeriodSeries()]: + assert(isinstance(isnull(s), Series)) + +def test_isnull(): + assert not isnull(1.) + assert isnull(None) + assert isnull(np.NaN) + assert not isnull(np.inf) + assert not isnull(-np.inf) + + # series + for s in [tm.makeFloatSeries(),tm.makeStringSeries(), + tm.makeObjectSeries(),tm.makeTimeSeries(),tm.makePeriodSeries()]: + assert(isinstance(isnull(s), Series)) + + # frame + for df in [tm.makeTimeDataFrame(),tm.makePeriodFrame(),tm.makeMixedDataFrame()]: + result = isnull(df) + expected = df.apply(isnull) + tm.assert_frame_equal(result, expected) + + # panel + for p in [ tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) ]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel_equal(result, expected) + + # panel 4d + for p in [ tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D()) ]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel4d_equal(result, expected) + +def test_isnull_lists(): + result = isnull([[False]]) + exp = np.array([[False]]) + assert(np.array_equal(result, exp)) + + result = isnull([[1], [2]]) + exp = np.array([[False], [False]]) + assert(np.array_equal(result, exp)) + + # list of strings / unicode + result = isnull(['foo', 'bar']) + assert(not result.any()) + + result = isnull([u('foo'), u('bar')]) + assert(not result.any()) + +def test_isnull_nat(): + result = isnull([NaT]) + exp = np.array([True]) + assert(np.array_equal(result, exp)) + + result = isnull(np.array([NaT], dtype=object)) + exp = np.array([True]) + assert(np.array_equal(result, exp)) + +def test_isnull_datetime(): + assert (not isnull(datetime.now())) + assert notnull(datetime.now()) + + idx = date_range('1/1/1990', periods=20) + assert(notnull(idx).all()) + + idx = np.asarray(idx) + idx[0] = iNaT + idx = DatetimeIndex(idx) + mask = isnull(idx) + assert(mask[0]) + assert(not mask[1:].any()) + + +class TestIsNull(tm.TestCase): + def test_0d_array(self): + self.assertTrue(isnull(np.array(np.nan))) + self.assertFalse(isnull(np.array(0.0))) + self.assertFalse(isnull(np.array(0))) + # test object dtype + self.assertTrue(isnull(np.array(np.nan, dtype=object))) + self.assertFalse(isnull(np.array(0.0, dtype=object))) + self.assertFalse(isnull(np.array(0, dtype=object))) + + +def test_downcast_conv(): + # test downcasting + + arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) + result = com._possibly_downcast_to_dtype(arr, 'infer') + assert (np.array_equal(result, arr)) + + arr = np.array([8., 8., 8., 8., 8.9999999999995]) + result = com._possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + arr = np.array([8., 8., 8., 8., 9.0000000000005]) + result = com._possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + # conversions + + expected = np.array([1,2]) + for dtype in 
[np.float64,object,np.int64]: + arr = np.array([1.0,2.0],dtype=dtype) + result = com._possibly_downcast_to_dtype(arr,'infer') + tm.assert_almost_equal(result, expected) + + expected = np.array([1.0,2.0,np.nan]) + for dtype in [np.float64,object]: + arr = np.array([1.0,2.0,np.nan],dtype=dtype) + result = com._possibly_downcast_to_dtype(arr,'infer') + tm.assert_almost_equal(result, expected) + + # empties + for dtype in [np.int32,np.float64,np.float32,np.bool_,np.int64,object]: + arr = np.array([],dtype=dtype) + result = com._possibly_downcast_to_dtype(arr,'int64') + tm.assert_almost_equal(result, np.array([],dtype=np.int64)) + assert result.dtype == np.int64 + +def test_array_equivalent(): + assert array_equivalent(np.array([np.nan, np.nan]), + np.array([np.nan, np.nan])) + assert array_equivalent(np.array([np.nan, 1, np.nan]), + np.array([np.nan, 1, np.nan])) + assert array_equivalent(np.array([np.nan, None], dtype='object'), + np.array([np.nan, None], dtype='object')) + assert array_equivalent(np.array([np.nan, 1+1j], dtype='complex'), + np.array([np.nan, 1+1j], dtype='complex')) + assert not array_equivalent(np.array([np.nan, 1+1j], dtype='complex'), + np.array([np.nan, 1+2j], dtype='complex')) + assert not array_equivalent(np.array([np.nan, 1, np.nan]), + np.array([np.nan, 2, np.nan])) + assert not array_equivalent(np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) + assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) + assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + +def test_datetimeindex_from_empty_datetime64_array(): + for unit in [ 'ms', 'us', 'ns' ]: + idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) + assert(len(idx) == 0) + + +def test_nan_to_nat_conversions(): + + df = DataFrame(dict({ + 'A' : np.asarray(lrange(10),dtype='float64'), + 'B' : Timestamp('20010101') })) + df.iloc[3:6,:] = np.nan + result = df.loc[4,'B'].value + assert(result == iNaT) + + s = df['B'].copy() + s._data = s._data.setitem(indexer=tuple([slice(8,9)]),value=np.nan) + assert(isnull(s[8])) + + # numpy < 1.7.0 is wrong + from distutils.version import LooseVersion + if LooseVersion(np.__version__) >= '1.7.0': + assert(s[8].value == np.datetime64('NaT').astype(np.int64)) + + +def test_any_none(): + assert(com._any_none(1, 2, 3, None)) + assert(not com._any_none(1, 2, 3, 4)) + + +def test_all_not_none(): + assert(com._all_not_none(1, 2, 3, 4)) + assert(not com._all_not_none(1, 2, 3, None)) + assert(not com._all_not_none(None, None, None, None)) + + +def test_repr_binary_type(): + import string + letters = string.ascii_letters + btype = compat.binary_type + try: + raw = btype(letters, encoding=cf.get_option('display.encoding')) + except TypeError: + raw = btype(letters) + b = compat.text_type(compat.bytes_to_str(raw)) + res = com.pprint_thing(b, quote_strings=True) + assert_equal(res, repr(b)) + res = com.pprint_thing(b, quote_strings=False) + assert_equal(res, b) + + +def test_rands(): + r = com.rands(10) + assert(len(r) == 10) + + +def test_adjoin(): + data = [['a', 'b', 'c'], + ['dd', 'ee', 'ff'], + ['ggg', 'hhh', 'iii']] + expected = 'a dd ggg\nb ee hhh\nc ff iii' + + adjoined = com.adjoin(2, *data) + + assert(adjoined == expected) + + +def test_iterpairs(): + data = [1, 2, 3, 4] + expected = [(1, 2), + (2, 3), + (3, 4)] + + result = list(com.iterpairs(data)) + + 
assert(result == expected) + + +def test_split_ranges(): + def _bin(x, width): + "return int(x) as a base2 string of given width" + return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1)) + + def test_locs(mask): + nfalse = sum(np.array(mask) == 0) + + remaining = 0 + for s, e in com.split_ranges(mask): + remaining += e - s + + assert 0 not in mask[s:e] + + # make sure the total items covered by the ranges are a complete cover + assert remaining + nfalse == len(mask) + + # exhaustively test all possible mask sequences of length 8 + ncols = 8 + for i in range(2 ** ncols): + cols = lmap(int, list(_bin(i, ncols))) # count up in base2 + mask = [cols[i] == 1 for i in range(len(cols))] + test_locs(mask) + + # base cases + test_locs([]) + test_locs([0]) + test_locs([1]) + + +def test_indent(): + s = 'a b c\nd e f' + result = com.indent(s, spaces=6) + + assert(result == ' a b c\n d e f') + + +def test_banner(): + ban = com.banner('hi') + assert(ban == ('%s\nhi\n%s' % ('=' * 80, '=' * 80))) + + +def test_map_indices_py(): + data = [4, 3, 2, 1] + expected = {4: 0, 3: 1, 2: 2, 1: 3} + + result = com.map_indices_py(data) + + assert(result == expected) + + +def test_union(): + a = [1, 2, 3] + b = [4, 5, 6] + + union = sorted(com.union(a, b)) + + assert((a + b) == union) + + +def test_difference(): + a = [1, 2, 3] + b = [1, 2, 3, 4, 5, 6] + + inter = sorted(com.difference(b, a)) + + assert([4, 5, 6] == inter) + + +def test_intersection(): + a = [1, 2, 3] + b = [1, 2, 3, 4, 5, 6] + + inter = sorted(com.intersection(a, b)) + + assert(a == inter) + + +def test_groupby(): + values = ['foo', 'bar', 'baz', 'baz2', 'qux', 'foo3'] + expected = {'f': ['foo', 'foo3'], + 'b': ['bar', 'baz', 'baz2'], + 'q': ['qux']} + + grouped = com.groupby(values, lambda x: x[0]) + + for k, v in grouped: + assert v == expected[k] + + +def test_is_list_like(): + passes = ([], [1], (1,), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), + Series([]), Series(['a']).str) + fails = (1, '2', object()) + + for p in passes: + assert com.is_list_like(p) + + for f in fails: + assert not com.is_list_like(f) + + +def test_ensure_int32(): + values = np.arange(10, dtype=np.int32) + result = com._ensure_int32(values) + assert(result.dtype == np.int32) + + values = np.arange(10, dtype=np.int64) + result = com._ensure_int32(values) + assert(result.dtype == np.int32) + + +def test_ensure_platform_int(): + + # verify that when we create certain types of indices + # they remain the correct type under platform conversions + from pandas.core.index import Int64Index + + # int64 + x = Int64Index([1, 2, 3], dtype='int64') + assert(x.dtype == np.int64) + + pi = com._ensure_platform_int(x) + assert(pi.dtype == np.int_) + + # int32 + x = Int64Index([1, 2, 3], dtype='int32') + assert(x.dtype == np.int32) + + pi = com._ensure_platform_int(x) + assert(pi.dtype == np.int_) + +# TODO: fix this broken test + +# def test_console_encode(): +# """ +# On Python 2, if sys.stdin.encoding is None (IPython with zmq frontend) +# common.console_encode should encode things as utf-8. 
+# """ +# if compat.PY3: +# raise nose.SkipTest + +# with tm.stdin_encoding(encoding=None): +# result = com.console_encode(u"\u05d0") +# expected = u"\u05d0".encode('utf-8') +# assert (result == expected) + + +def test_is_re(): + passes = re.compile('ad'), + fails = 'x', 2, 3, object() + + for p in passes: + assert com.is_re(p) + + for f in fails: + assert not com.is_re(f) + + +def test_is_recompilable(): + passes = (r'a', u('x'), r'asdf', re.compile('adsf'), + u(r'\u2233\s*'), re.compile(r'')) + fails = 1, [], object() + + for p in passes: + assert com.is_re_compilable(p) + + for f in fails: + assert not com.is_re_compilable(f) + + +class TestTake(tm.TestCase): + # standard incompatible fill error + fill_error = re.compile("Incompatible type for fill_value") + + _multiprocess_can_split_ = True + + def test_1d_with_out(self): + def _test_dtype(dtype, can_hold_na): + data = np.random.randint(0, 2, 4).astype(dtype) + + indexer = [2, 1, 0, 1] + out = np.empty(4, dtype=dtype) + com.take_1d(data, indexer, out=out) + expected = data.take(indexer) + tm.assert_almost_equal(out, expected) + + indexer = [2, 1, 0, -1] + out = np.empty(4, dtype=dtype) + if can_hold_na: + com.take_1d(data, indexer, out=out) + expected = data.take(indexer) + expected[3] = np.nan + tm.assert_almost_equal(out, expected) + else: + with tm.assertRaisesRegexp(TypeError, self.fill_error): + com.take_1d(data, indexer, out=out) + # no exception o/w + data.take(indexer, out=out) + + _test_dtype(np.float64, True) + _test_dtype(np.float32, True) + _test_dtype(np.uint64, False) + _test_dtype(np.uint32, False) + _test_dtype(np.uint16, False) + _test_dtype(np.uint8, False) + _test_dtype(np.int64, False) + _test_dtype(np.int32, False) + _test_dtype(np.int16, False) + _test_dtype(np.int8, False) + _test_dtype(np.object_, True) + _test_dtype(np.bool, False) + + def test_1d_fill_nonna(self): + def _test_dtype(dtype, fill_value, out_dtype): + data = np.random.randint(0, 2, 4).astype(dtype) + + indexer = [2, 1, 0, -1] + + result = com.take_1d(data, indexer, fill_value=fill_value) + assert((result[[0, 1, 2]] == data[[2, 1, 0]]).all()) + assert(result[3] == fill_value) + assert(result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + + result = com.take_1d(data, indexer, fill_value=fill_value) + assert((result[[0, 1, 2, 3]] == data[indexer]).all()) + assert(result.dtype == dtype) + + _test_dtype(np.int8, np.int16(127), np.int8) + _test_dtype(np.int8, np.int16(128), np.int16) + _test_dtype(np.int32, 1, np.int32) + _test_dtype(np.int32, 2.0, np.float64) + _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) + _test_dtype(np.int32, True, np.object_) + _test_dtype(np.int32, '', np.object_) + _test_dtype(np.float64, 1, np.float64) + _test_dtype(np.float64, 2.0, np.float64) + _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) + _test_dtype(np.float64, True, np.object_) + _test_dtype(np.float64, '', np.object_) + _test_dtype(np.complex128, 1, np.complex128) + _test_dtype(np.complex128, 2.0, np.complex128) + _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) + _test_dtype(np.complex128, True, np.object_) + _test_dtype(np.complex128, '', np.object_) + _test_dtype(np.bool_, 1, np.object_) + _test_dtype(np.bool_, 2.0, np.object_) + _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) + _test_dtype(np.bool_, True, np.bool_) + _test_dtype(np.bool_, '', np.object_) + + def test_2d_with_out(self): + def _test_dtype(dtype, can_hold_na): + data = np.random.randint(0, 2, (5, 3)).astype(dtype) + + indexer = [2, 1, 0, 1] + out0 = np.empty((4, 3), dtype=dtype) + out1 = 
np.empty((5, 4), dtype=dtype) + com.take_nd(data, indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 3), dtype=dtype) + out1 = np.empty((5, 4), dtype=dtype) + if can_hold_na: + com.take_nd(data, indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected0[3, :] = np.nan + expected1[:, 3] = np.nan + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + else: + for i, out in enumerate([out0, out1]): + with tm.assertRaisesRegexp(TypeError, self.fill_error): + com.take_nd(data, indexer, out=out, axis=i) + # no exception o/w + data.take(indexer, out=out, axis=i) + + _test_dtype(np.float64, True) + _test_dtype(np.float32, True) + _test_dtype(np.uint64, False) + _test_dtype(np.uint32, False) + _test_dtype(np.uint16, False) + _test_dtype(np.uint8, False) + _test_dtype(np.int64, False) + _test_dtype(np.int32, False) + _test_dtype(np.int16, False) + _test_dtype(np.int8, False) + _test_dtype(np.object_, True) + _test_dtype(np.bool, False) + + def test_2d_fill_nonna(self): + def _test_dtype(dtype, fill_value, out_dtype): + data = np.random.randint(0, 2, (5, 3)).astype(dtype) + + indexer = [2, 1, 0, -1] + + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 2], :] == data[[2, 1, 0], :]).all()) + assert((result[3, :] == fill_value).all()) + assert(result.dtype == out_dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()) + assert((result[:, 3] == fill_value).all()) + assert(result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 2, 3], :] == data[indexer, :]).all()) + assert(result.dtype == dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2, 3]] == data[:, indexer]).all()) + assert(result.dtype == dtype) + + _test_dtype(np.int8, np.int16(127), np.int8) + _test_dtype(np.int8, np.int16(128), np.int16) + _test_dtype(np.int32, 1, np.int32) + _test_dtype(np.int32, 2.0, np.float64) + _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) + _test_dtype(np.int32, True, np.object_) + _test_dtype(np.int32, '', np.object_) + _test_dtype(np.float64, 1, np.float64) + _test_dtype(np.float64, 2.0, np.float64) + _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) + _test_dtype(np.float64, True, np.object_) + _test_dtype(np.float64, '', np.object_) + _test_dtype(np.complex128, 1, np.complex128) + _test_dtype(np.complex128, 2.0, np.complex128) + _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) + _test_dtype(np.complex128, True, np.object_) + _test_dtype(np.complex128, '', np.object_) + _test_dtype(np.bool_, 1, np.object_) + _test_dtype(np.bool_, 2.0, np.object_) + _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) + _test_dtype(np.bool_, True, np.bool_) + _test_dtype(np.bool_, '', np.object_) + + def test_3d_with_out(self): + def _test_dtype(dtype, can_hold_na): + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + + indexer = [2, 1, 0, 1] + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + com.take_nd(data, 
indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + com.take_nd(data, indexer, out=out2, axis=2) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected2 = data.take(indexer, axis=2) + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + tm.assert_almost_equal(out2, expected2) + + indexer = [2, 1, 0, -1] + out0 = np.empty((4, 4, 3), dtype=dtype) + out1 = np.empty((5, 4, 3), dtype=dtype) + out2 = np.empty((5, 4, 4), dtype=dtype) + if can_hold_na: + com.take_nd(data, indexer, out=out0, axis=0) + com.take_nd(data, indexer, out=out1, axis=1) + com.take_nd(data, indexer, out=out2, axis=2) + expected0 = data.take(indexer, axis=0) + expected1 = data.take(indexer, axis=1) + expected2 = data.take(indexer, axis=2) + expected0[3, :, :] = np.nan + expected1[:, 3, :] = np.nan + expected2[:, :, 3] = np.nan + tm.assert_almost_equal(out0, expected0) + tm.assert_almost_equal(out1, expected1) + tm.assert_almost_equal(out2, expected2) + else: + for i, out in enumerate([out0, out1, out2]): + with tm.assertRaisesRegexp(TypeError, self.fill_error): + com.take_nd(data, indexer, out=out, axis=i) + # no exception o/w + data.take(indexer, out=out, axis=i) + + _test_dtype(np.float64, True) + _test_dtype(np.float32, True) + _test_dtype(np.uint64, False) + _test_dtype(np.uint32, False) + _test_dtype(np.uint16, False) + _test_dtype(np.uint8, False) + _test_dtype(np.int64, False) + _test_dtype(np.int32, False) + _test_dtype(np.int16, False) + _test_dtype(np.int8, False) + _test_dtype(np.object_, True) + _test_dtype(np.bool, False) + + def test_3d_fill_nonna(self): + def _test_dtype(dtype, fill_value, out_dtype): + data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) + + indexer = [2, 1, 0, -1] + + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all()) + assert((result[3, :, :] == fill_value).all()) + assert(result.dtype == out_dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all()) + assert((result[:, 3, :] == fill_value).all()) + assert(result.dtype == out_dtype) + + result = com.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all()) + assert((result[:, :, 3] == fill_value).all()) + assert(result.dtype == out_dtype) + + indexer = [2, 1, 0, 1] + + result = com.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all()) + assert(result.dtype == dtype) + + result = com.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all()) + assert(result.dtype == dtype) + + result = com.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all()) + assert(result.dtype == dtype) + + _test_dtype(np.int8, np.int16(127), np.int8) + _test_dtype(np.int8, np.int16(128), np.int16) + _test_dtype(np.int32, 1, np.int32) + _test_dtype(np.int32, 2.0, np.float64) + _test_dtype(np.int32, 3.0 + 4.0j, np.complex128) + _test_dtype(np.int32, True, np.object_) + _test_dtype(np.int32, '', np.object_) + _test_dtype(np.float64, 1, np.float64) + _test_dtype(np.float64, 2.0, np.float64) + _test_dtype(np.float64, 3.0 + 4.0j, np.complex128) + _test_dtype(np.float64, True, np.object_) + _test_dtype(np.float64, '', np.object_) + _test_dtype(np.complex128, 
1, np.complex128) + _test_dtype(np.complex128, 2.0, np.complex128) + _test_dtype(np.complex128, 3.0 + 4.0j, np.complex128) + _test_dtype(np.complex128, True, np.object_) + _test_dtype(np.complex128, '', np.object_) + _test_dtype(np.bool_, 1, np.object_) + _test_dtype(np.bool_, 2.0, np.object_) + _test_dtype(np.bool_, 3.0 + 4.0j, np.object_) + _test_dtype(np.bool_, True, np.bool_) + _test_dtype(np.bool_, '', np.object_) + + def test_1d_other_dtypes(self): + arr = np.random.randn(10).astype(np.float32) + + indexer = [1, 2, 3, -1] + result = com.take_1d(arr, indexer) + expected = arr.take(indexer) + expected[-1] = np.nan + tm.assert_almost_equal(result, expected) + + def test_2d_other_dtypes(self): + arr = np.random.randn(10, 5).astype(np.float32) + + indexer = [1, 2, 3, -1] + + # axis=0 + result = com.take_nd(arr, indexer, axis=0) + expected = arr.take(indexer, axis=0) + expected[-1] = np.nan + tm.assert_almost_equal(result, expected) + + # axis=1 + result = com.take_nd(arr, indexer, axis=1) + expected = arr.take(indexer, axis=1) + expected[:, -1] = np.nan + tm.assert_almost_equal(result, expected) + + def test_1d_bool(self): + arr = np.array([0, 1, 0], dtype=bool) + + result = com.take_1d(arr, [0, 2, 2, 1]) + expected = arr.take([0, 2, 2, 1]) + self.assert_numpy_array_equal(result, expected) + + result = com.take_1d(arr, [0, 2, -1]) + self.assertEqual(result.dtype, np.object_) + + def test_2d_bool(self): + arr = np.array([[0, 1, 0], + [1, 0, 1], + [0, 1, 1]], dtype=bool) + + result = com.take_nd(arr, [0, 2, 2, 1]) + expected = arr.take([0, 2, 2, 1], axis=0) + self.assert_numpy_array_equal(result, expected) + + result = com.take_nd(arr, [0, 2, 2, 1], axis=1) + expected = arr.take([0, 2, 2, 1], axis=1) + self.assert_numpy_array_equal(result, expected) + + result = com.take_nd(arr, [0, 2, -1]) + self.assertEqual(result.dtype, np.object_) + + def test_2d_float32(self): + arr = np.random.randn(4, 3).astype(np.float32) + indexer = [0, 2, -1, 1, -1] + + # axis=0 + result = com.take_nd(arr, indexer, axis=0) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, axis=0, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected[[2, 4], :] = np.nan + tm.assert_almost_equal(result, expected) + + #### this now accepts a float32! # test with float64 out buffer + out = np.empty((len(indexer), arr.shape[1]), dtype='float32') + com.take_nd(arr, indexer, out=out) # it works! 
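+        # for float dtypes, positions where the indexer is -1 are filled with
+        # NaN, which is what the expected arrays above and below encode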
+ + # axis=1 + result = com.take_nd(arr, indexer, axis=1) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, axis=1, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected[:, [2, 4]] = np.nan + tm.assert_almost_equal(result, expected) + + def test_2d_datetime64(self): + # 2005/01/01 - 2006/01/01 + arr = np.random.randint(long(11045376), long(11360736), (5,3))*100000000000 + arr = arr.view(dtype='datetime64[ns]') + indexer = [0, 2, -1, 1, -1] + + # axis=0 + result = com.take_nd(arr, indexer, axis=0) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, axis=0, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected.view(np.int64)[[2, 4], :] = iNaT + tm.assert_almost_equal(result, expected) + + result = com.take_nd(arr, indexer, axis=0, + fill_value=datetime(2007, 1, 1)) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, out=result2, axis=0, + fill_value=datetime(2007, 1, 1)) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=0) + expected[[2, 4], :] = datetime(2007, 1, 1) + tm.assert_almost_equal(result, expected) + + # axis=1 + result = com.take_nd(arr, indexer, axis=1) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, axis=1, out=result2) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected.view(np.int64)[:, [2, 4]] = iNaT + tm.assert_almost_equal(result, expected) + + result = com.take_nd(arr, indexer, axis=1, + fill_value=datetime(2007, 1, 1)) + result2 = np.empty_like(result) + com.take_nd(arr, indexer, out=result2, axis=1, + fill_value=datetime(2007, 1, 1)) + tm.assert_almost_equal(result, result2) + + expected = arr.take(indexer, axis=1) + expected[:, [2, 4]] = datetime(2007, 1, 1) + tm.assert_almost_equal(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_compat.py b/pandas/tests/test_compat.py new file mode 100644 index 00000000..0d38bb23 --- /dev/null +++ b/pandas/tests/test_compat.py @@ -0,0 +1,70 @@ +""" +Testing that functions from compat work as expected +""" + +from pandas.compat import ( + range, zip, map, filter, + lrange, lzip, lmap, lfilter, + builtins +) +import unittest +import nose +import pandas.util.testing as tm + +class TestBuiltinIterators(tm.TestCase): + def check_result(self, actual, expected, lengths): + for (iter_res, list_res), exp, length in zip(actual, expected, lengths): + self.assertNotIsInstance(iter_res, list) + tm.assert_isinstance(list_res, list) + iter_res = list(iter_res) + self.assertEqual(len(list_res), length) + self.assertEqual(len(iter_res), length) + self.assertEqual(iter_res, exp) + self.assertEqual(list_res, exp) + + def test_range(self): + actual1 = range(10) + actual2 = lrange(10) + actual = [actual1, actual2], + expected = list(builtins.range(10)), + lengths = 10, + + actual1 = range(1, 10, 2) + actual2 = lrange(1, 10, 2) + actual += [actual1, actual2], + lengths += 5, + expected += list(builtins.range(1, 10, 2)), + self.check_result(actual, expected, lengths) + + def test_map(self): + func = lambda x, y, z: x + y + z + lst = [builtins.range(10), builtins.range(10), builtins.range(10)] + actual1 = map(func, *lst) + actual2 = lmap(func, *lst) + actual = [actual1, actual2], + expected = list(builtins.map(func, *lst)), + lengths = 10, + self.check_result(actual, expected, lengths) + + + def test_filter(self): + 
func = lambda x: x + lst = list(builtins.range(10)) + actual1 = filter(func, lst) + actual2 = lfilter(func, lst) + actual = [actual1, actual2], + lengths = 9, + expected = list(builtins.filter(func, lst)), + self.check_result(actual, expected, lengths) + + def test_zip(self): + lst = [builtins.range(10), builtins.range(10), builtins.range(10)] + actual = [zip(*lst), lzip(*lst)], + expected = list(builtins.zip(*lst)), + lengths = 10, + self.check_result(actual, expected, lengths) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py new file mode 100644 index 00000000..e60c9d5b --- /dev/null +++ b/pandas/tests/test_config.py @@ -0,0 +1,426 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +import pandas as pd +import unittest +import warnings +import nose + + +class TestConfig(unittest.TestCase): + _multiprocess_can_split_ = True + + def __init__(self, *args): + super(TestConfig, self).__init__(*args) + + from copy import deepcopy + self.cf = pd.core.config + self.gc = deepcopy(getattr(self.cf, '_global_config')) + self.do = deepcopy(getattr(self.cf, '_deprecated_options')) + self.ro = deepcopy(getattr(self.cf, '_registered_options')) + + def setUp(self): + setattr(self.cf, '_global_config', {}) + setattr( + self.cf, 'options', self.cf.DictWrapper(self.cf._global_config)) + setattr(self.cf, '_deprecated_options', {}) + setattr(self.cf, '_registered_options', {}) + + def tearDown(self): + setattr(self.cf, '_global_config', self.gc) + setattr(self.cf, '_deprecated_options', self.do) + setattr(self.cf, '_registered_options', self.ro) + + def test_api(self): + + # the pandas object exposes the user API + self.assertTrue(hasattr(pd, 'get_option')) + self.assertTrue(hasattr(pd, 'set_option')) + self.assertTrue(hasattr(pd, 'reset_option')) + self.assertTrue(hasattr(pd, 'describe_option')) + + def test_is_one_of_factory(self): + v = self.cf.is_one_of_factory([None,12]) + + v(12) + v(None) + self.assertRaises(ValueError,v,1.1) + + def test_register_option(self): + self.cf.register_option('a', 1, 'doc') + + # can't register an already registered option + self.assertRaises(KeyError, self.cf.register_option, 'a', 1, 'doc') + + # can't register an already registered option + self.assertRaises(KeyError, self.cf.register_option, 'a.b.c.d1', 1, + 'doc') + self.assertRaises(KeyError, self.cf.register_option, 'a.b.c.d2', 1, + 'doc') + + # no python keywords + self.assertRaises(ValueError, self.cf.register_option, 'for', 0) + # must be valid identifier (ensure attribute access works) + self.assertRaises(ValueError, self.cf.register_option, + 'Oh my Goddess!', 0) + + # we can register options several levels deep + # without predefining the intermediate steps + # and we can define differently named options + # in the same namespace + self.cf.register_option('k.b.c.d1', 1, 'doc') + self.cf.register_option('k.b.c.d2', 1, 'doc') + + def test_describe_option(self): + self.cf.register_option('a', 1, 'doc') + self.cf.register_option('b', 1, 'doc2') + self.cf.deprecate_option('b') + + self.cf.register_option('c.d.e1', 1, 'doc3') + self.cf.register_option('c.d.e2', 1, 'doc4') + self.cf.register_option('f', 1) + self.cf.register_option('g.h', 1) + self.cf.register_option('k', 2) + self.cf.deprecate_option('g.h', rkey="k") + self.cf.register_option('l', "foo") + + # non-existent keys raise KeyError + self.assertRaises(KeyError, 
self.cf.describe_option, 'no.such.key') + + # we can get the description for any key we registered + self.assertTrue( + 'doc' in self.cf.describe_option('a', _print_desc=False)) + self.assertTrue( + 'doc2' in self.cf.describe_option('b', _print_desc=False)) + self.assertTrue( + 'precated' in self.cf.describe_option('b', _print_desc=False)) + + self.assertTrue( + 'doc3' in self.cf.describe_option('c.d.e1', _print_desc=False)) + self.assertTrue( + 'doc4' in self.cf.describe_option('c.d.e2', _print_desc=False)) + + # if no doc is specified we get a default message + # saying "description not available" + self.assertTrue( + 'vailable' in self.cf.describe_option('f', _print_desc=False)) + self.assertTrue( + 'vailable' in self.cf.describe_option('g.h', _print_desc=False)) + self.assertTrue( + 'precated' in self.cf.describe_option('g.h', _print_desc=False)) + self.assertTrue( + 'k' in self.cf.describe_option('g.h', _print_desc=False)) + + # default is reported + self.assertTrue( + 'foo' in self.cf.describe_option('l', _print_desc=False)) + # current value is reported + self.assertFalse( + 'bar' in self.cf.describe_option('l', _print_desc=False)) + self.cf.set_option("l","bar") + self.assertTrue( + 'bar' in self.cf.describe_option('l', _print_desc=False)) + + def test_case_insensitive(self): + self.cf.register_option('KanBAN', 1, 'doc') + + self.assertTrue( + 'doc' in self.cf.describe_option('kanbaN', _print_desc=False)) + self.assertEqual(self.cf.get_option('kanBaN'), 1) + self.cf.set_option('KanBan', 2) + self.assertEqual(self.cf.get_option('kAnBaN'), 2) + + # gets of non-existent keys fail + self.assertRaises(KeyError, self.cf.get_option, 'no_such_option') + self.cf.deprecate_option('KanBan') + + # testing warning with catch_warning was only added in 2.6 + self.assertTrue(self.cf._is_deprecated('kAnBaN')) + + def test_get_option(self): + self.cf.register_option('a', 1, 'doc') + self.cf.register_option('b.c', 'hullo', 'doc2') + self.cf.register_option('b.b', None, 'doc2') + + # gets of existing keys succeed + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 'hullo') + self.assertTrue(self.cf.get_option('b.b') is None) + + # gets of non-existent keys fail + self.assertRaises(KeyError, self.cf.get_option, 'no_such_option') + + def test_set_option(self): + self.cf.register_option('a', 1, 'doc') + self.cf.register_option('b.c', 'hullo', 'doc2') + self.cf.register_option('b.b', None, 'doc2') + + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 'hullo') + self.assertTrue(self.cf.get_option('b.b') is None) + + self.cf.set_option('a', 2) + self.cf.set_option('b.c', 'wurld') + self.cf.set_option('b.b', 1.1) + + self.assertEqual(self.cf.get_option('a'), 2) + self.assertEqual(self.cf.get_option('b.c'), 'wurld') + self.assertEqual(self.cf.get_option('b.b'), 1.1) + + self.assertRaises(KeyError, self.cf.set_option, 'no.such.key', None) + + + def test_set_option_empty_args(self): + self.assertRaises(ValueError, self.cf.set_option) + + def test_set_option_uneven_args(self): + self.assertRaises(ValueError, self.cf.set_option, 'a.b', 2, 'b.c') + + def test_set_option_invalid_single_argument_type(self): + self.assertRaises(ValueError, self.cf.set_option, 2) + + def test_set_option_multiple(self): + self.cf.register_option('a', 1, 'doc') + self.cf.register_option('b.c', 'hullo', 'doc2') + self.cf.register_option('b.b', None, 'doc2') + + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 
'hullo') + self.assertTrue(self.cf.get_option('b.b') is None) + + self.cf.set_option('a', '2', 'b.c', None, 'b.b', 10.0) + + self.assertEqual(self.cf.get_option('a'), '2') + self.assertTrue(self.cf.get_option('b.c') is None) + self.assertEqual(self.cf.get_option('b.b'), 10.0) + + def test_validation(self): + self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) + self.cf.register_option('b.c', 'hullo', 'doc2', + validator=self.cf.is_text) + self.assertRaises(ValueError, self.cf.register_option, 'a.b.c.d2', + 'NO', 'doc', validator=self.cf.is_int) + + self.cf.set_option('a', 2) # int is_int + self.cf.set_option('b.c', 'wurld') # str is_str + + self.assertRaises( + ValueError, self.cf.set_option, 'a', None) # None not is_int + self.assertRaises(ValueError, self.cf.set_option, 'a', 'ab') + self.assertRaises(ValueError, self.cf.set_option, 'b.c', 1) + + def test_reset_option(self): + self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) + self.cf.register_option('b.c', 'hullo', 'doc2', + validator=self.cf.is_str) + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 'hullo') + + self.cf.set_option('a', 2) + self.cf.set_option('b.c', 'wurld') + self.assertEqual(self.cf.get_option('a'), 2) + self.assertEqual(self.cf.get_option('b.c'), 'wurld') + + self.cf.reset_option('a') + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 'wurld') + self.cf.reset_option('b.c') + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 'hullo') + + def test_reset_option_all(self): + self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) + self.cf.register_option('b.c', 'hullo', 'doc2', + validator=self.cf.is_str) + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 'hullo') + + self.cf.set_option('a', 2) + self.cf.set_option('b.c', 'wurld') + self.assertEqual(self.cf.get_option('a'), 2) + self.assertEqual(self.cf.get_option('b.c'), 'wurld') + + self.cf.reset_option("all") + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b.c'), 'hullo') + + def test_deprecate_option(self): + import sys + self.cf.deprecate_option( + 'foo') # we can deprecate non-existent options + + # testing warning with catch_warning was only added in 2.6 + if sys.version_info[:2] < (2, 6): + raise nose.SkipTest("Need py > 2.6") + + self.assertTrue(self.cf._is_deprecated('foo')) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + try: + self.cf.get_option('foo') + except KeyError: + pass + else: + self.fail("Nonexistent option didn't raise KeyError") + + self.assertEqual(len(w), 1) # should have raised one warning + self.assertTrue( + 'deprecated' in str(w[-1])) # we get the default message + + self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) + self.cf.register_option('b.c', 'hullo', 'doc2') + self.cf.register_option('foo', 'hullo', 'doc2') + + self.cf.deprecate_option('a', removal_ver='nifty_ver') + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + self.cf.get_option('a') + + self.assertEqual(len(w), 1) # should have raised one warning + self.assertTrue( + 'eprecated' in str(w[-1])) # we get the default message + self.assertTrue( + 'nifty_ver' in str(w[-1])) # with the removal_ver quoted + + self.assertRaises( + KeyError, self.cf.deprecate_option, 'a') # can't depr. 
twice + + self.cf.deprecate_option('b.c', 'zounds!') + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + self.cf.get_option('b.c') + + self.assertEqual(len(w), 1) # should have raised one warning + self.assertTrue( + 'zounds!' in str(w[-1])) # we get the custom message + + # test rerouting keys + self.cf.register_option('d.a', 'foo', 'doc2') + self.cf.register_option('d.dep', 'bar', 'doc2') + self.assertEqual(self.cf.get_option('d.a'), 'foo') + self.assertEqual(self.cf.get_option('d.dep'), 'bar') + + self.cf.deprecate_option('d.dep', rkey='d.a') # reroute d.dep to d.a + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + self.assertEqual(self.cf.get_option('d.dep'), 'foo') + + self.assertEqual(len(w), 1) # should have raised one warning + self.assertTrue( + 'eprecated' in str(w[-1])) # we get the custom message + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + self.cf.set_option('d.dep', 'baz') # should overwrite "d.a" + + self.assertEqual(len(w), 1) # should have raised one warning + self.assertTrue( + 'eprecated' in str(w[-1])) # we get the custom message + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + self.assertEqual(self.cf.get_option('d.dep'), 'baz') + + self.assertEqual(len(w), 1) # should have raised one warning + self.assertTrue( + 'eprecated' in str(w[-1])) # we get the custom message + + def test_config_prefix(self): + with self.cf.config_prefix("base"): + self.cf.register_option('a', 1, "doc1") + self.cf.register_option('b', 2, "doc2") + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b'), 2) + + self.cf.set_option('a', 3) + self.cf.set_option('b', 4) + self.assertEqual(self.cf.get_option('a'), 3) + self.assertEqual(self.cf.get_option('b'), 4) + + self.assertEqual(self.cf.get_option('base.a'), 3) + self.assertEqual(self.cf.get_option('base.b'), 4) + self.assertTrue( + 'doc1' in self.cf.describe_option('base.a', _print_desc=False)) + self.assertTrue( + 'doc2' in self.cf.describe_option('base.b', _print_desc=False)) + + self.cf.reset_option('base.a') + self.cf.reset_option('base.b') + + with self.cf.config_prefix("base"): + self.assertEqual(self.cf.get_option('a'), 1) + self.assertEqual(self.cf.get_option('b'), 2) + + def test_callback(self): + k = [None] + v = [None] + + def callback(key): + k.append(key) + v.append(self.cf.get_option(key)) + + self.cf.register_option('d.a', 'foo', cb=callback) + self.cf.register_option('d.b', 'foo', cb=callback) + + del k[-1], v[-1] + self.cf.set_option("d.a", "fooz") + self.assertEqual(k[-1], "d.a") + self.assertEqual(v[-1], "fooz") + + del k[-1], v[-1] + self.cf.set_option("d.b", "boo") + self.assertEqual(k[-1], "d.b") + self.assertEqual(v[-1], "boo") + + del k[-1], v[-1] + self.cf.reset_option("d.b") + self.assertEqual(k[-1], "d.b") + + def test_set_ContextManager(self): + def eq(val): + self.assertEqual(self.cf.get_option("a"), val) + + self.cf.register_option('a', 0) + eq(0) + with self.cf.option_context("a", 15): + eq(15) + with self.cf.option_context("a", 25): + eq(25) + eq(15) + eq(0) + + self.cf.set_option("a", 17) + eq(17) + + def test_attribute_access(self): + holder = [] + + def f(): + options.b = 1 + + def f2(): + options.display = 1 + + def f3(key): + holder.append(True) + + self.cf.register_option('a', 0) + self.cf.register_option('c', 0, cb=f3) + options = self.cf.options + + self.assertEqual(options.a, 0) + with self.cf.option_context("a", 15): + 
self.assertEqual(options.a, 15) + + options.a = 500 + self.assertEqual(self.cf.get_option("a"), 500) + + self.cf.reset_option("a") + self.assertEqual(options.a, self.cf.get_option("a", 0)) + + self.assertRaises(KeyError, f) + self.assertRaises(KeyError, f2) + + # make sure callback kicks when using this form of setting + options.c = 1 + self.assertEqual(len(holder), 1) + diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py new file mode 100644 index 00000000..8d012b87 --- /dev/null +++ b/pandas/tests/test_expressions.py @@ -0,0 +1,419 @@ +from __future__ import print_function +# pylint: disable-msg=W0612,E1101 + +import nose +import re + +from numpy.random import randn + +import operator +import numpy as np +from numpy.testing import assert_array_equal + +from pandas.core.api import DataFrame, Panel +from pandas.computation import expressions as expr +from pandas import compat + +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assert_panel_equal, + assert_panel4d_equal) +import pandas.util.testing as tm +from numpy.testing.decorators import slow + + +if not expr._USE_NUMEXPR: + try: + import numexpr + except ImportError: + msg = "don't have" + else: + msg = "not using" + raise nose.SkipTest("{0} numexpr".format(msg)) + +_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') +_frame2 = DataFrame(randn(100, 4), columns = list('ABCD'), dtype='float64') +_mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') }) +_mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') }) +_integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns = list('ABCD'), dtype='int64') +_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)), + columns=list('ABCD'), dtype='int64') +_frame_panel = Panel(dict(ItemA=_frame.copy(), ItemB=(_frame.copy() + 3), ItemC=_frame.copy(), ItemD=_frame.copy())) +_frame2_panel = Panel(dict(ItemA=_frame2.copy(), ItemB=(_frame2.copy() + 3), + ItemC=_frame2.copy(), ItemD=_frame2.copy())) +_integer_panel = Panel(dict(ItemA=_integer, + ItemB=(_integer + 34).astype('int64'))) +_integer2_panel = Panel(dict(ItemA=_integer2, + ItemB=(_integer2 + 34).astype('int64'))) +_mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3))) +_mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3))) + + +class TestExpressions(tm.TestCase): + + _multiprocess_can_split_ = False + + def setUp(self): + + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.mixed = _mixed.copy() + self.mixed2 = _mixed2.copy() + self.integer = _integer.copy() + self._MIN_ELEMENTS = expr._MIN_ELEMENTS + + def tearDown(self): + expr._MIN_ELEMENTS = self._MIN_ELEMENTS + + @nose.tools.nottest + def run_arithmetic_test(self, df, other, assert_func, check_dtype=False, + test_flex=True): + expr._MIN_ELEMENTS = 0 + operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv', 'pow'] + if not compat.PY3: + operations.append('div') + for arith in operations: + operator_name = arith + if arith == 'div': + operator_name = 'truediv' + + if test_flex: + op = lambda x, y: getattr(df, arith)(y) + op.__name__ = arith + else: + op = getattr(operator, operator_name) + expr.set_use_numexpr(False) + expected = op(df, other) + expr.set_use_numexpr(True) + result = op(df, other) + try: + if check_dtype: + if arith == 
'truediv': + assert expected.dtype.kind == 'f' + assert_func(expected, result) + except Exception: + com.pprint_thing("Failed test with operator %r" % op.__name__) + raise + + def test_integer_arithmetic(self): + self.run_arithmetic_test(self.integer, self.integer, + assert_frame_equal) + self.run_arithmetic_test(self.integer.icol(0), self.integer.icol(0), + assert_series_equal, check_dtype=True) + + @nose.tools.nottest + def run_binary_test(self, df, other, assert_func, + test_flex=False, numexpr_ops=set(['gt', 'lt', 'ge', + 'le', 'eq', 'ne'])): + """ + tests solely that the result is the same whether or not numexpr is + enabled. Need to test whether the function does the correct thing + elsewhere. + """ + expr._MIN_ELEMENTS = 0 + expr.set_test_mode(True) + operations = ['gt', 'lt', 'ge', 'le', 'eq', 'ne'] + for arith in operations: + if test_flex: + op = lambda x, y: getattr(df, arith)(y) + op.__name__ = arith + else: + op = getattr(operator, arith) + expr.set_use_numexpr(False) + expected = op(df, other) + expr.set_use_numexpr(True) + expr.get_test_result() + result = op(df, other) + used_numexpr = expr.get_test_result() + try: + if arith in numexpr_ops: + assert used_numexpr, "Did not use numexpr as expected." + else: + assert not used_numexpr, "Used numexpr unexpectedly." + assert_func(expected, result) + except Exception: + com.pprint_thing("Failed test with operation %r" % arith) + com.pprint_thing("test_flex was %r" % test_flex) + raise + + def run_frame(self, df, other, binary_comp=None, run_binary=True, + **kwargs): + self.run_arithmetic_test(df, other, assert_frame_equal, + test_flex=False, **kwargs) + self.run_arithmetic_test(df, other, assert_frame_equal, test_flex=True, + **kwargs) + if run_binary: + if binary_comp is None: + expr.set_use_numexpr(False) + binary_comp = other + 1 + expr.set_use_numexpr(True) + self.run_binary_test(df, binary_comp, assert_frame_equal, + test_flex=False, **kwargs) + self.run_binary_test(df, binary_comp, assert_frame_equal, + test_flex=True, **kwargs) + + def run_series(self, ser, other, binary_comp=None, **kwargs): + self.run_arithmetic_test(ser, other, assert_series_equal, + test_flex=False, **kwargs) + self.run_arithmetic_test(ser, other, assert_almost_equal, + test_flex=True, **kwargs) + # series doesn't uses vec_compare instead of numexpr... 
+ # if binary_comp is None: + # binary_comp = other + 1 + # self.run_binary_test(ser, binary_comp, assert_frame_equal, test_flex=False, + # **kwargs) + # self.run_binary_test(ser, binary_comp, assert_frame_equal, test_flex=True, + # **kwargs) + + def run_panel(self, panel, other, binary_comp=None, run_binary=True, + assert_func=assert_panel_equal, **kwargs): + self.run_arithmetic_test(panel, other, assert_func, test_flex=False, + **kwargs) + self.run_arithmetic_test(panel, other, assert_func, test_flex=True, + **kwargs) + if run_binary: + if binary_comp is None: + binary_comp = other + 1 + self.run_binary_test(panel, binary_comp, assert_func, + test_flex=False, **kwargs) + self.run_binary_test(panel, binary_comp, assert_func, + test_flex=True, **kwargs) + + def test_integer_arithmetic_frame(self): + self.run_frame(self.integer, self.integer) + + def test_integer_arithmetic_series(self): + self.run_series(self.integer.icol(0), self.integer.icol(0)) + + @slow + def test_integer_panel(self): + self.run_panel(_integer2_panel, np.random.randint(1, 100)) + + def test_float_arithemtic_frame(self): + self.run_frame(self.frame2, self.frame2) + + def test_float_arithmetic_series(self): + self.run_series(self.frame2.icol(0), self.frame2.icol(0)) + + @slow + def test_float_panel(self): + self.run_panel(_frame2_panel, np.random.randn() + 0.1, binary_comp=0.8) + + @slow + def test_panel4d(self): + self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, + assert_func=assert_panel4d_equal, binary_comp=3) + + def test_mixed_arithmetic_frame(self): + # TODO: FIGURE OUT HOW TO GET IT TO WORK... + # can't do arithmetic because comparison methods try to do *entire* + # frame instead of by-column + self.run_frame(self.mixed2, self.mixed2, run_binary=False) + + def test_mixed_arithmetic_series(self): + for col in self.mixed2.columns: + self.run_series(self.mixed2[col], self.mixed2[col], binary_comp=4) + + @slow + def test_mixed_panel(self): + self.run_panel(_mixed2_panel, np.random.randint(1, 100), + binary_comp=-2) + + def test_float_arithemtic(self): + self.run_arithmetic_test(self.frame, self.frame, assert_frame_equal) + self.run_arithmetic_test(self.frame.icol(0), self.frame.icol(0), + assert_series_equal, check_dtype=True) + + def test_mixed_arithmetic(self): + self.run_arithmetic_test(self.mixed, self.mixed, assert_frame_equal) + for col in self.mixed.columns: + self.run_arithmetic_test(self.mixed[col], self.mixed[col], + assert_series_equal) + + def test_integer_with_zeros(self): + self.integer *= np.random.randint(0, 2, size=np.shape(self.integer)) + self.run_arithmetic_test(self.integer, self.integer, assert_frame_equal) + self.run_arithmetic_test(self.integer.icol(0), self.integer.icol(0), + assert_series_equal) + + def test_invalid(self): + + # no op + result = expr._can_use_numexpr(operator.add, None, self.frame, self.frame, 'evaluate') + self.assertFalse(result) + + # mixed + result = expr._can_use_numexpr(operator.add, '+', self.mixed, self.frame, 'evaluate') + self.assertFalse(result) + + # min elements + result = expr._can_use_numexpr(operator.add, '+', self.frame2, self.frame2, 'evaluate') + self.assertFalse(result) + + # ok, we only check on first part of expression + result = expr._can_use_numexpr(operator.add, '+', self.frame, self.frame2, 'evaluate') + self.assertTrue(result) + + def test_binary_ops(self): + + def testit(): + + for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: + + for op, op_str in [('add','+'),('sub','-'),('mul','*'),('div','/'),('pow','**')]: + if op 
== 'div': + op = getattr(operator, 'truediv', None) + else: + op = getattr(operator, op, None) + if op is not None: + result = expr._can_use_numexpr(op, op_str, f, f, 'evaluate') + self.assertNotEqual(result, f._is_mixed_type) + + result = expr.evaluate(op, op_str, f, f, use_numexpr=True) + expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) + assert_array_equal(result,expected.values) + + result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate') + self.assertFalse(result) + + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_boolean_ops(self): + + + def testit(): + for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: + + f11 = f + f12 = f + 1 + + f21 = f2 + f22 = f2 + 1 + + for op, op_str in [('gt','>'),('lt','<'),('ge','>='),('le','<='),('eq','=='),('ne','!=')]: + + op = getattr(operator,op) + + result = expr._can_use_numexpr(op, op_str, f11, f12, 'evaluate') + self.assertNotEqual(result, f11._is_mixed_type) + + result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) + expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) + assert_array_equal(result,expected.values) + + result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate') + self.assertFalse(result) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_where(self): + + def testit(): + for f in [ self.frame, self.frame2, self.mixed, self.mixed2 ]: + + + for cond in [ True, False ]: + + c = np.empty(f.shape,dtype=np.bool_) + c.fill(cond) + result = expr.where(c, f.values, f.values+1) + expected = np.where(c, f.values, f.values+1) + assert_array_equal(result,expected) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_bool_ops_raise_on_arithmetic(self): + df = DataFrame({'a': np.random.rand(10) > 0.5, + 'b': np.random.rand(10) > 0.5}) + names = 'div', 'truediv', 'floordiv', 'pow' + ops = '/', '/', '//', '**' + msg = 'operator %r not implemented for bool dtypes' + for op, name in zip(ops, names): + if not compat.PY3 or name != 'div': + f = getattr(operator, name) + err_msg = re.escape(msg % op) + + with tm.assertRaisesRegexp(NotImplementedError, err_msg): + f(df, df) + + with tm.assertRaisesRegexp(NotImplementedError, err_msg): + f(df.a, df.b) + + with tm.assertRaisesRegexp(NotImplementedError, err_msg): + f(df.a, True) + + with tm.assertRaisesRegexp(NotImplementedError, err_msg): + f(False, df.a) + + with tm.assertRaisesRegexp(TypeError, err_msg): + f(False, df) + + with tm.assertRaisesRegexp(TypeError, err_msg): + f(df, True) + + def test_bool_ops_warn_on_arithmetic(self): + n = 10 + df = DataFrame({'a': np.random.rand(n) > 0.5, + 'b': np.random.rand(n) > 0.5}) + names = 'add', 'mul', 'sub' + ops = '+', '*', '-' + subs = {'+': '|', '*': '&', '-': '^'} + sub_funcs = {'|': 'or_', '&': 'and_', '^': 'xor'} + for op, name in zip(ops, names): + f = getattr(operator, name) + fe = getattr(operator, sub_funcs[subs[op]]) + + with tm.use_numexpr(True, min_elements=5): + with tm.assert_produces_warning(): + r = f(df, df) + e = fe(df, df) + tm.assert_frame_equal(r, e) + + with tm.assert_produces_warning(): + r = f(df.a, df.b) + e = fe(df.a, df.b) + tm.assert_series_equal(r, e) + + with tm.assert_produces_warning(): + r = f(df.a, True) + e = fe(df.a, True) 
+ tm.assert_series_equal(r, e) + + with tm.assert_produces_warning(): + r = f(False, df.a) + e = fe(False, df.a) + tm.assert_series_equal(r, e) + + with tm.assert_produces_warning(): + r = f(False, df) + e = fe(False, df) + tm.assert_frame_equal(r, e) + + with tm.assert_produces_warning(): + r = f(df, True) + e = fe(df, True) + tm.assert_frame_equal(r, e) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py new file mode 100644 index 00000000..5d785df3 --- /dev/null +++ b/pandas/tests/test_format.py @@ -0,0 +1,2944 @@ +from __future__ import print_function +# -*- coding: utf-8 -*- +import re + +from pandas.compat import range, zip, lrange, StringIO, PY3, lzip, u +import pandas.compat as compat +import itertools +import os +import sys +from textwrap import dedent +import warnings + +from numpy import nan +from numpy.random import randn +import numpy as np + +from pandas import DataFrame, Series, Index, _np_version_under1p7, Timestamp, MultiIndex + +import pandas.core.format as fmt +import pandas.util.testing as tm +import pandas.core.common as com +from pandas.util.terminal import get_terminal_size +import pandas +import pandas.tslib as tslib +import pandas as pd +from pandas.core.config import (set_option, get_option, + option_context, reset_option) +from datetime import datetime + +_frame = DataFrame(tm.getSeriesData()) + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + +def has_info_repr(df): + r = repr(df) + c1 = r.split('\n')[0].startswith(", 2. Index, 3. Columns, 4. dtype + return has_info and nv + +def has_horizontally_truncated_repr(df): + try: # Check header row + fst_line = np.array(repr(df).splitlines()[0].split()) + cand_col = np.where(fst_line=='...')[0][0] + except: + return False + # Make sure each row has this ... 
in the same place + r = repr(df) + for ix,l in enumerate(r.splitlines()): + if not r.split()[cand_col] == '...': + return False + return True + +def has_vertically_truncated_repr(df): + r = repr(df) + only_dot_row = False + for row in r.splitlines(): + if re.match('^[\.\ ]+$',row): + only_dot_row = True + return only_dot_row + +def has_truncated_repr(df): + return has_horizontally_truncated_repr(df) or has_vertically_truncated_repr(df) + +def has_doubly_truncated_repr(df): + return has_horizontally_truncated_repr(df) and has_vertically_truncated_repr(df) + +def has_expanded_repr(df): + r = repr(df) + for line in r.split('\n'): + if line.endswith('\\'): + return True + return False + +class TestDataFrameFormatting(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.warn_filters = warnings.filters + warnings.filterwarnings('ignore', + category=FutureWarning, + module=".*format") + + self.frame = _frame.copy() + + def tearDown(self): + warnings.filters = self.warn_filters + + def test_repr_embedded_ndarray(self): + arr = np.empty(10, dtype=[('err', object)]) + for i in range(len(arr)): + arr['err'][i] = np.random.randn(i) + + df = DataFrame(arr) + repr(df['err']) + repr(df) + df.to_string() + + def test_eng_float_formatter(self): + self.frame.ix[5] = 0 + + fmt.set_eng_float_format() + result = repr(self.frame) + + fmt.set_eng_float_format(use_eng_prefix=True) + repr(self.frame) + + fmt.set_eng_float_format(accuracy=0) + repr(self.frame) + self.reset_display_options() + + def test_repr_tuples(self): + buf = StringIO() + + df = DataFrame({'tups': lzip(range(10), range(10))}) + repr(df) + df.to_string(col_space=10, buf=buf) + + def test_repr_truncation(self): + max_len = 20 + with option_context("display.max_colwidth", max_len): + df = DataFrame({'A': np.random.randn(10), + 'B': [tm.rands(np.random.randint(max_len - 1, + max_len + 1)) for i in range(10)]}) + r = repr(df) + r = r[r.find('\n') + 1:] + + _strlen = fmt._strlen_func() + + for line, value in lzip(r.split('\n'), df['B']): + if _strlen(value) + 1 > max_len: + self.assertIn('...', line) + else: + self.assertNotIn('...', line) + + with option_context("display.max_colwidth", 999999): + self.assertNotIn('...', repr(df)) + + with option_context("display.max_colwidth", max_len + 2): + self.assertNotIn('...', repr(df)) + + def test_repr_chop_threshold(self): + df = DataFrame([[0.1, 0.5],[0.5, -0.1]]) + pd.reset_option("display.chop_threshold") # default None + self.assertEqual(repr(df), ' 0 1\n0 0.1 0.5\n1 0.5 -0.1') + + with option_context("display.chop_threshold", 0.2 ): + self.assertEqual(repr(df), ' 0 1\n0 0.0 0.5\n1 0.5 0.0') + + with option_context("display.chop_threshold", 0.6 ): + self.assertEqual(repr(df), ' 0 1\n0 0 0\n1 0 0') + + with option_context("display.chop_threshold", None ): + self.assertEqual(repr(df), ' 0 1\n0 0.1 0.5\n1 0.5 -0.1') + + def test_repr_obeys_max_seq_limit(self): + import pandas.core.common as com + + with option_context("display.max_seq_items",2000): + self.assertTrue(len(com.pprint_thing(lrange(1000))) > 1000) + + with option_context("display.max_seq_items",5): + self.assertTrue(len(com.pprint_thing(lrange(1000)))< 100) + + def test_repr_is_valid_construction_code(self): + import pandas as pd + + # for the case of Index, where the repr is traditional rather then stylized + idx = pd.Index(['a','b']) + res = eval("pd."+repr(idx)) + tm.assert_series_equal(Series(res),Series(idx)) + + def test_repr_should_return_str(self): + # 
http://docs.python.org/py3k/reference/datamodel.html#object.__repr__ + # http://docs.python.org/reference/datamodel.html#object.__repr__ + # "...The return value must be a string object." + + # (str on py2.x, str (unicode) on py3) + + + data = [8, 5, 3, 5] + index1 = [u("\u03c3"), u("\u03c4"), u("\u03c5"), + u("\u03c6")] + cols = [u("\u03c8")] + df = DataFrame(data, columns=cols, index=index1) + self.assertTrue(type(df.__repr__()) == str) # both py2 / 3 + + def test_repr_no_backslash(self): + with option_context('mode.sim_interactive', True): + df = DataFrame(np.random.randn(10, 4)) + self.assertTrue('\\' not in repr(df)) + + def test_expand_frame_repr(self): + df_small = DataFrame('hello', [0], [0]) + df_wide = DataFrame('hello', [0], lrange(10)) + df_tall = DataFrame('hello', lrange(30), lrange(5)) + + with option_context('mode.sim_interactive', True): + with option_context('display.max_columns', 10, + 'display.width',20, + 'display.max_rows', 20, + 'display.show_dimensions', True): + with option_context('display.expand_frame_repr', True): + self.assertFalse(has_truncated_repr(df_small)) + self.assertFalse(has_expanded_repr(df_small)) + self.assertFalse(has_truncated_repr(df_wide)) + self.assertTrue(has_expanded_repr(df_wide)) + self.assertTrue(has_vertically_truncated_repr(df_tall)) + self.assertTrue(has_expanded_repr(df_tall)) + + with option_context('display.expand_frame_repr', False): + self.assertFalse(has_truncated_repr(df_small)) + self.assertFalse(has_expanded_repr(df_small)) + self.assertFalse(has_horizontally_truncated_repr(df_wide)) + self.assertFalse(has_expanded_repr(df_wide)) + self.assertTrue(has_vertically_truncated_repr(df_tall)) + self.assertFalse(has_expanded_repr(df_tall)) + + def test_repr_non_interactive(self): + # in non interactive mode, there can be no dependency on the + # result of terminal auto size detection + df = DataFrame('hello', lrange(1000), lrange(5)) + + with option_context('mode.sim_interactive', False, + 'display.width', 0, + 'display.height', 0, + 'display.max_rows',5000): + self.assertFalse(has_truncated_repr(df)) + self.assertFalse(has_expanded_repr(df)) + + def test_repr_max_columns_max_rows(self): + term_width, term_height = get_terminal_size() + if term_width < 10 or term_height < 10: + raise nose.SkipTest("terminal size too small, " + "{0} x {1}".format(term_width, term_height)) + + def mkframe(n): + index = ['%05d' % i for i in range(n)] + return DataFrame(0, index, index) + + df6 = mkframe(6) + df10 = mkframe(10) + with option_context('mode.sim_interactive', True): + with option_context('display.width', term_width * 2): + with option_context('display.max_rows', 5, + 'display.max_columns', 5): + self.assertFalse(has_expanded_repr(mkframe(4))) + self.assertFalse(has_expanded_repr(mkframe(5))) + self.assertFalse(has_expanded_repr(df6)) + self.assertTrue(has_doubly_truncated_repr(df6)) + + with option_context('display.max_rows', 20, + 'display.max_columns', 10): + # Out off max_columns boundary, but no extending + # since not exceeding width + self.assertFalse(has_expanded_repr(df6)) + self.assertFalse(has_truncated_repr(df6)) + + with option_context('display.max_rows', 9, + 'display.max_columns', 10): + # out vertical bounds can not result in exanded repr + self.assertFalse(has_expanded_repr(df10)) + self.assertTrue(has_vertically_truncated_repr(df10)) + + # width=None in terminal, auto detection + with option_context('display.max_columns', 100, + 'display.max_rows', term_width * 20, + 'display.width', None): + df = mkframe((term_width // 7) - 
2) + self.assertFalse(has_expanded_repr(df)) + df = mkframe((term_width // 7) + 2) + com.pprint_thing(df._repr_fits_horizontal_()) + self.assertTrue(has_expanded_repr(df)) + + def test_to_string_repr_unicode(self): + buf = StringIO() + + unicode_values = [u('\u03c3')] * 10 + unicode_values = np.array(unicode_values, dtype=object) + df = DataFrame({'unicode': unicode_values}) + df.to_string(col_space=10, buf=buf) + + # it works! + repr(df) + + idx = Index(['abc', u('\u03c3a'), 'aegdvg']) + ser = Series(np.random.randn(len(idx)), idx) + rs = repr(ser).split('\n') + line_len = len(rs[0]) + for line in rs[1:]: + try: + line = line.decode(get_option("display.encoding")) + except: + pass + if not line.startswith('dtype:'): + self.assertEqual(len(line), line_len) + + # it works even if sys.stdin in None + _stdin= sys.stdin + try: + sys.stdin = None + repr(df) + finally: + sys.stdin = _stdin + + def test_to_string_unicode_columns(self): + df = DataFrame({u('\u03c3'): np.arange(10.)}) + + buf = StringIO() + df.to_string(buf=buf) + buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + buf.getvalue() + + result = self.frame.to_string() + tm.assert_isinstance(result, compat.text_type) + + def test_to_string_utf8_columns(self): + n = u("\u05d0").encode('utf-8') + + with option_context('display.max_rows', 1): + df = pd.DataFrame([1, 2], columns=[n]) + repr(df) + + def test_to_string_unicode_two(self): + dm = DataFrame({u('c/\u03c3'): []}) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_unicode_three(self): + dm = DataFrame(['\xc2']) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_with_formatters(self): + df = DataFrame({'int': [1, 2, 3], + 'float': [1.0, 2.0, 3.0], + 'object': [(1, 2), True, False]}, + columns=['int', 'float', 'object']) + + formatters = [('int', lambda x: '0x%x' % x), + ('float', lambda x: '[% 4.1f]' % x), + ('object', lambda x: '-%s-' % str(x))] + result = df.to_string(formatters=dict(formatters)) + result2 = df.to_string(formatters=lzip(*formatters)[1]) + self.assertEqual(result, (' int float object\n' + '0 0x1 [ 1.0] -(1, 2)-\n' + '1 0x2 [ 2.0] -True-\n' + '2 0x3 [ 3.0] -False-')) + self.assertEqual(result, result2) + + def test_to_string_with_formatters_unicode(self): + df = DataFrame({u('c/\u03c3'): [1, 2, 3]}) + result = df.to_string(formatters={u('c/\u03c3'): + lambda x: '%s' % x}) + self.assertEqual(result, u(' c/\u03c3\n') + + '0 1\n1 2\n2 3') + + def test_to_string_buffer_all_unicode(self): + buf = StringIO() + + empty = DataFrame({u('c/\u03c3'): Series()}) + nonempty = DataFrame({u('c/\u03c3'): Series([1, 2, 3])}) + + print(empty, file=buf) + print(nonempty, file=buf) + + # this should work + buf.getvalue() + + def test_to_string_with_col_space(self): + df = DataFrame(np.random.random(size=(1, 3))) + c10 = len(df.to_string(col_space=10).split("\n")[1]) + c20 = len(df.to_string(col_space=20).split("\n")[1]) + c30 = len(df.to_string(col_space=30).split("\n")[1]) + self.assertTrue(c10 < c20 < c30) + + def test_to_string_truncate_indices(self): + for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + for column in [ tm.makeStringIndex ]: + for h in [10,20]: + for w in [10,20]: + with option_context("display.expand_frame_repr",False): + df = DataFrame(index=index(h), columns=column(w)) + with option_context("display.max_rows", 15): + if h == 20: + self.assertTrue(has_vertically_truncated_repr(df)) + else: + self.assertFalse(has_vertically_truncated_repr(df)) + with 
option_context("display.max_columns", 15): + if w == 20: + self.assertTrue(has_horizontally_truncated_repr(df)) + else: + self.assertFalse(has_horizontally_truncated_repr(df)) + with option_context("display.max_rows", 15,"display.max_columns", 15): + if h == 20 and w == 20: + self.assertTrue(has_doubly_truncated_repr(df)) + else: + self.assertFalse(has_doubly_truncated_repr(df)) + + def test_to_string_truncate_multilevel(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = pd.DataFrame(index=arrays,columns=arrays) + with option_context("display.max_rows", 7,"display.max_columns", 7): + self.assertTrue(has_doubly_truncated_repr(df)) + + + def test_to_html_with_col_space(self): + def check_with_width(df, col_space): + import re + # check that col_space affects HTML generation + # and be very brittle about it. + html = df.to_html(col_space=col_space) + hdrs = [x for x in html.split("\n") if re.search("\s]", x)] + self.assertTrue(len(hdrs) > 0) + for h in hdrs: + self.assertTrue("min-width" in h) + self.assertTrue(str(col_space) in h) + + df = DataFrame(np.random.random(size=(1, 3))) + + check_with_width(df, 30) + check_with_width(df, 50) + + def test_to_html_with_empty_string_label(self): + # GH3547, to_html regards empty string labels as repeated labels + data = {'c1': ['a', 'b'], 'c2': ['a', ''], 'data': [1, 2]} + df = DataFrame(data).set_index(['c1', 'c2']) + res = df.to_html() + self.assertTrue("rowspan" not in res) + + def test_to_html_unicode(self): + # it works! + df = DataFrame({u('\u03c3'): np.arange(10.)}) + df.to_html() + df = DataFrame({'A': [u('\u03c3')]}) + df.to_html() + + def test_to_html_escaped(self): + a = 'str", + b: ""}, + 'co>l2':{a: "", + b: ""}} + rs = pd.DataFrame(test_dict).to_html() + xp = """ + + + + + + + + + + + + + + + + + + + +
co<l1co>l2
str<ing1 &amp; <type 'str'> <type 'str'>
stri>ng2 &amp; <type 'str'> <type 'str'>
""" + self.assertEqual(xp, rs) + + def test_to_html_escape_disabled(self): + a = 'strbold", + b: "bold"}, + 'co>l2': {a: "bold", + b: "bold"}} + rs = pd.DataFrame(test_dict).to_html(escape=False) + xp = """ + + + + + + + + + + + + + + + + + +
co + co>l2
str + bold bold
stri>ng2 & bold bold
""" + self.assertEqual(xp, rs) + + def test_to_html_multiindex_sparsify_false_multi_sparse(self): + with option_context('display.multi_sparse', False): + index = pd.MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], + names=['foo', None]) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) + + result = df.to_html() + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01
foo
00 0 1
01 2 3
10 4 5
11 6 7
""" + self.assertEqual(result, expected) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], + columns=index[::2], index=index) + + result = df.to_html() + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
foo01
00
foo
00 0 1
01 2 3
10 4 5
11 6 7
""" + self.assertEqual(result, expected) + + def test_to_html_multiindex_sparsify(self): + index = pd.MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], + names=['foo', None]) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) + + result = df.to_html() + expected = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01
foo
00 0 1
1 2 3
10 4 5
1 6 7
""" + self.assertEqual(result, expected) + + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], + columns=index[::2], index=index) + + result = df.to_html() + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
foo01
00
foo
00 0 1
1 2 3
10 4 5
1 6 7
""" + self.assertEqual(result, expected) + + def test_to_html_index_formatter(self): + df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], + columns=['foo', None], index=lrange(4)) + + f = lambda x: 'abcd'[x] + result = df.to_html(formatters={'__index__': f}) + expected = """\ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
fooNone
a 0 1
b 2 3
c 4 5
d 6 7
""" + self.assertEqual(result, expected) + + def test_to_html_regression_GH6098(self): + df = DataFrame({u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')], + u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')], + 'données1': np.random.randn(5), + 'données2': np.random.randn(5)}) + # it works + df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_() + + + + + + def test_to_html_truncate(self): + index = pd.DatetimeIndex(start='20010101',freq='D',periods=20) + df = pd.DataFrame(index=index,columns=range(20)) + fmt.set_option('display.max_rows',8) + fmt.set_option('display.max_columns',4) + result = df._repr_html_() + expected = '''\ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01...1819
2001-01-01 NaN NaN... NaN NaN
2001-01-02 NaN NaN... NaN NaN
2001-01-03 NaN NaN... NaN NaN
2001-01-04 NaN NaN... NaN NaN
..................
2001-01-17 NaN NaN... NaN NaN
2001-01-18 NaN NaN... NaN NaN
2001-01-19 NaN NaN... NaN NaN
2001-01-20 NaN NaN... NaN NaN
+    </tr>
+  </tbody>
+</table>
+<p>20 rows × 20 columns</p>
+</div>
''' + if sys.version_info[0] < 3: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + def test_to_html_truncate_multi_index(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = pd.DataFrame(index=arrays,columns=arrays) + fmt.set_option('display.max_rows',7) + fmt.set_option('display.max_columns',7) + result = df._repr_html_() + expected = '''\ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
barbaz...fooqux
onetwoone...twoonetwo
barone NaN NaN NaN... NaN NaN NaN
two NaN NaN NaN... NaN NaN NaN
bazone NaN NaN NaN... NaN NaN NaN
...........................
footwo NaN NaN NaN... NaN NaN NaN
quxone NaN NaN NaN... NaN NaN NaN
two NaN NaN NaN... NaN NaN NaN
+    </tr>
+  </tbody>
+</table>
+<p>8 rows × 8 columns</p>
+</div>
''' + if sys.version_info[0] < 3: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + def test_to_html_truncate_multi_index_sparse_off(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + df = pd.DataFrame(index=arrays,columns=arrays) + fmt.set_option('display.max_rows',7) + fmt.set_option('display.max_columns',7) + fmt.set_option('display.multi_sparse',False) + result = df._repr_html_() + expected = '''\ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
barbarbaz...fooquxqux
onetwoone...twoonetwo
barone NaN NaN NaN... NaN NaN NaN
bartwo NaN NaN NaN... NaN NaN NaN
bazone NaN NaN NaN... NaN NaN NaN
footwo NaN NaN NaN... NaN NaN NaN
quxone NaN NaN NaN... NaN NaN NaN
quxtwo NaN NaN NaN... NaN NaN NaN
+    </tr>
+  </tbody>
+</table>
+<p>8 rows × 8 columns</p>
+</div>
''' + if sys.version_info[0] < 3: + expected = expected.decode('utf-8') + self.assertEqual(result, expected) + + + + def test_nonunicode_nonascii_alignment(self): + df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) + rep_str = df.to_string() + lines = rep_str.split('\n') + self.assertEqual(len(lines[1]), len(lines[2])) + + def test_unicode_problem_decoding_as_ascii(self): + dm = DataFrame({u('c/\u03c3'): Series({'test': np.NaN})}) + compat.text_type(dm.to_string()) + + def test_string_repr_encoding(self): + filepath = tm.get_data_path('unicode_series.csv') + df = pandas.read_csv(filepath, header=None, encoding='latin1') + repr(df) + repr(df[1]) + + def test_repr_corner(self): + # representing infs poses no problems + df = DataFrame({'foo': np.inf * np.empty(10)}) + foo = repr(df) + + def test_frame_info_encoding(self): + index = ['\'Til There Was You (1997)', + 'ldum klaka (Cold Fever) (1994)'] + fmt.set_option('display.max_rows', 1) + df = DataFrame(columns=['a', 'b', 'c'], index=index) + repr(df) + repr(df.T) + fmt.set_option('display.max_rows', 200) + + def test_pprint_thing(self): + import nose + from pandas.core.common import pprint_thing as pp_t + + if PY3: + raise nose.SkipTest("doesn't work on Python 3") + + self.assertEqual(pp_t('a') , u('a')) + self.assertEqual(pp_t(u('a')) , u('a')) + self.assertEqual(pp_t(None) , 'None') + self.assertEqual(pp_t(u('\u05d0'), quote_strings=True), + u("u'\u05d0'")) + self.assertEqual(pp_t(u('\u05d0'), quote_strings=False), + u('\u05d0')) + self.assertEqual(pp_t((u('\u05d0'), + u('\u05d1')), quote_strings=True), + u("(u'\u05d0', u'\u05d1')")) + self.assertEqual(pp_t((u('\u05d0'), (u('\u05d1'), + u('\u05d2'))), + quote_strings=True), + u("(u'\u05d0', (u'\u05d1', u'\u05d2'))")) + self.assertEqual(pp_t(('foo', u('\u05d0'), (u('\u05d0'), + u('\u05d0'))), + quote_strings=True), + u("(u'foo', u'\u05d0', (u'\u05d0', u'\u05d0'))")) + + # escape embedded tabs in string + # GH #2038 + self.assertTrue(not "\t" in pp_t("a\tb", escape_chars=("\t",))) + + def test_wide_repr(self): + with option_context('mode.sim_interactive', True, 'display.show_dimensions', True): + col = lambda l, k: [tm.rands(k) for _ in range(l)] + max_cols = get_option('display.max_columns') + df = DataFrame([col(max_cols - 1, 25) for _ in range(10)]) + set_option('display.expand_frame_repr', False) + rep_str = repr(df) + + assert "10 rows x %d columns" % (max_cols - 1) in rep_str + set_option('display.expand_frame_repr', True) + wide_repr = repr(df) + self.assertNotEqual(rep_str, wide_repr) + + with option_context('display.width', 120): + wider_repr = repr(df) + self.assertTrue(len(wider_repr) < len(wide_repr)) + + reset_option('display.expand_frame_repr') + + def test_wide_repr_wide_columns(self): + with option_context('mode.sim_interactive', True): + df = DataFrame(randn(5, 3), columns=['a' * 90, 'b' * 90, 'c' * 90]) + rep_str = repr(df) + + self.assertEqual(len(rep_str.splitlines()), 20) + + def test_wide_repr_named(self): + with option_context('mode.sim_interactive', True): + col = lambda l, k: [tm.rands(k) for _ in range(l)] + max_cols = get_option('display.max_columns') + df = DataFrame([col(max_cols-1, 25) for _ in range(10)]) + df.index.name = 'DataFrame Index' + set_option('display.expand_frame_repr', False) + + rep_str = repr(df) + set_option('display.expand_frame_repr', True) + wide_repr = repr(df) + self.assertNotEqual(rep_str, wide_repr) + + with option_context('display.width', 150): + wider_repr = repr(df) + self.assertTrue(len(wider_repr) < len(wide_repr)) + + for 
line in wide_repr.splitlines()[1::13]: + self.assertIn('DataFrame Index', line) + + reset_option('display.expand_frame_repr') + + def test_wide_repr_multiindex(self): + with option_context('mode.sim_interactive', True): + col = lambda l, k: [tm.rands(k) for _ in range(l)] + midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)), + np.array(col(10, 5))]) + max_cols = get_option('display.max_columns') + df = DataFrame([col(max_cols-1, 25) for _ in range(10)], + index=midx) + df.index.names = ['Level 0', 'Level 1'] + set_option('display.expand_frame_repr', False) + rep_str = repr(df) + set_option('display.expand_frame_repr', True) + wide_repr = repr(df) + self.assertNotEqual(rep_str, wide_repr) + + with option_context('display.width', 150): + wider_repr = repr(df) + self.assertTrue(len(wider_repr) < len(wide_repr)) + + for line in wide_repr.splitlines()[1::13]: + self.assertIn('Level 0 Level 1', line) + + reset_option('display.expand_frame_repr') + + def test_wide_repr_multiindex_cols(self): + with option_context('mode.sim_interactive', True): + max_cols = get_option('display.max_columns') + col = lambda l, k: [tm.rands(k) for _ in range(l)] + midx = pandas.MultiIndex.from_arrays([np.array(col(10, 5)), + np.array(col(10, 5))]) + mcols = pandas.MultiIndex.from_arrays([np.array(col(max_cols-1, 3)), + np.array(col(max_cols-1, 3))]) + df = DataFrame([col(max_cols-1, 25) for _ in range(10)], + index=midx, columns=mcols) + df.index.names = ['Level 0', 'Level 1'] + set_option('display.expand_frame_repr', False) + rep_str = repr(df) + set_option('display.expand_frame_repr', True) + wide_repr = repr(df) + self.assertNotEqual(rep_str, wide_repr) + + with option_context('display.width', 150): + wider_repr = repr(df) + self.assertTrue(len(wider_repr) < len(wide_repr)) + + reset_option('display.expand_frame_repr') + + def test_wide_repr_unicode(self): + with option_context('mode.sim_interactive', True): + col = lambda l, k: [tm.randu(k) for _ in range(l)] + max_cols = get_option('display.max_columns') + df = DataFrame([col(max_cols-1, 25) for _ in range(10)]) + set_option('display.expand_frame_repr', False) + rep_str = repr(df) + set_option('display.expand_frame_repr', True) + wide_repr = repr(df) + self.assertNotEqual(rep_str, wide_repr) + + with option_context('display.width', 150): + wider_repr = repr(df) + self.assertTrue(len(wider_repr) < len(wide_repr)) + + reset_option('display.expand_frame_repr') + + def test_wide_repr_wide_long_columns(self): + with option_context('mode.sim_interactive', True): + df = DataFrame( + {'a': ['a' * 30, 'b' * 30], 'b': ['c' * 70, 'd' * 80]}) + + result = repr(df) + self.assertTrue('ccccc' in result) + self.assertTrue('ddddd' in result) + + def test_long_series(self): + n = 1000 + s = Series(np.random.randint(-50,50,n),index=['s%04d' % x for x in range(n)], dtype='int64') + + import re + str_rep = str(s) + nmatches = len(re.findall('dtype',str_rep)) + self.assertEqual(nmatches, 1) + + def test_index_with_nan(self): + # GH 2850 + df = DataFrame({'id1': {0: '1a3', 1: '9h4'}, 'id2': {0: np.nan, 1: 'd67'}, + 'id3': {0: '78d', 1: '79d'}, 'value': {0: 123, 1: 64}}) + + # multi-index + y = df.set_index(['id1', 'id2', 'id3']) + result = y.to_string() + expected = u(' value\nid1 id2 id3 \n1a3 NaN 78d 123\n9h4 d67 79d 64') + self.assertEqual(result, expected) + + # index + y = df.set_index('id2') + result = y.to_string() + expected = u(' id1 id3 value\nid2 \nNaN 1a3 78d 123\nd67 9h4 79d 64') + self.assertEqual(result, expected) + + # with append (this failed in 0.12) + y = 
df.set_index(['id1', 'id2']).set_index('id3', append=True) + result = y.to_string() + expected = u(' value\nid1 id2 id3 \n1a3 NaN 78d 123\n9h4 d67 79d 64') + self.assertEqual(result, expected) + + # all-nan in mi + df2 = df.copy() + df2.ix[:,'id2'] = np.nan + y = df2.set_index('id2') + result = y.to_string() + expected = u(' id1 id3 value\nid2 \nNaN 1a3 78d 123\nNaN 9h4 79d 64') + self.assertEqual(result, expected) + + # partial nan in mi + df2 = df.copy() + df2.ix[:,'id2'] = np.nan + y = df2.set_index(['id2','id3']) + result = y.to_string() + expected = u(' id1 value\nid2 id3 \nNaN 78d 1a3 123\n 79d 9h4 64') + self.assertEqual(result, expected) + + df = DataFrame({'id1': {0: np.nan, 1: '9h4'}, 'id2': {0: np.nan, 1: 'd67'}, + 'id3': {0: np.nan, 1: '79d'}, 'value': {0: 123, 1: 64}}) + + y = df.set_index(['id1','id2','id3']) + result = y.to_string() + expected = u(' value\nid1 id2 id3 \nNaN NaN NaN 123\n9h4 d67 79d 64') + self.assertEqual(result, expected) + + def test_to_string(self): + from pandas import read_table + import re + + # big mixed + biggie = DataFrame({'A': randn(200), + 'B': tm.makeStringIndex(200)}, + index=lrange(200)) + + biggie['A'][:20] = nan + biggie['B'][:20] = nan + s = biggie.to_string() + + buf = StringIO() + retval = biggie.to_string(buf=buf) + self.assertIsNone(retval) + self.assertEqual(buf.getvalue(), s) + + tm.assert_isinstance(s, compat.string_types) + + # print in right order + result = biggie.to_string(columns=['B', 'A'], col_space=17, + float_format='%.5f'.__mod__) + lines = result.split('\n') + header = lines[0].strip().split() + joined = '\n'.join([re.sub('\s+', ' ', x).strip() for x in lines[1:]]) + recons = read_table(StringIO(joined), names=header, + header=None, sep=' ') + tm.assert_series_equal(recons['B'], biggie['B']) + self.assertEqual(recons['A'].count(), biggie['A'].count()) + self.assertTrue((np.abs(recons['A'].dropna() - + biggie['A'].dropna()) < 0.1).all()) + + # expected = ['B', 'A'] + # self.assertEqual(header, expected) + + result = biggie.to_string(columns=['A'], col_space=17) + header = result.split('\n')[0].strip().split() + expected = ['A'] + self.assertEqual(header, expected) + + biggie.to_string(columns=['B', 'A'], + formatters={'A': lambda x: '%.1f' % x}) + + biggie.to_string(columns=['B', 'A'], float_format=str) + biggie.to_string(columns=['B', 'A'], col_space=12, + float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_string() + + def test_to_string_no_header(self): + df = DataFrame({'x': [1, 2, 3], + 'y': [4, 5, 6]}) + + df_s = df.to_string(header=False) + expected = "0 1 4\n1 2 5\n2 3 6" + + assert(df_s == expected) + + def test_to_string_no_index(self): + df = DataFrame({'x': [1, 2, 3], + 'y': [4, 5, 6]}) + + df_s = df.to_string(index=False) + expected = " x y\n 1 4\n 2 5\n 3 6" + + assert(df_s == expected) + + def test_to_string_float_formatting(self): + self.reset_display_options() + fmt.set_option('display.precision', 6, 'display.column_space', + 12, 'display.notebook_repr_html', False) + + df = DataFrame({'x': [0, 0.25, 3456.000, 12e+45, 1.64e+6, + 1.7e+8, 1.253456, np.pi, -1e6]}) + + df_s = df.to_string() + + # Python 2.5 just wants me to be sad. 
And debian 32-bit + # sys.version_info[0] == 2 and sys.version_info[1] < 6: + if _three_digit_exp(): + expected = (' x\n0 0.00000e+000\n1 2.50000e-001\n' + '2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n' + '5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n' + '8 -1.00000e+006') + else: + expected = (' x\n0 0.00000e+00\n1 2.50000e-01\n' + '2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n' + '5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n' + '8 -1.00000e+06') + assert(df_s == expected) + + df = DataFrame({'x': [3234, 0.253]}) + df_s = df.to_string() + + expected = (' x\n' + '0 3234.000\n' + '1 0.253') + assert(df_s == expected) + + self.reset_display_options() + self.assertEqual(get_option("display.precision"), 7) + + df = DataFrame({'x': [1e9, 0.2512]}) + df_s = df.to_string() + # Python 2.5 just wants me to be sad. And debian 32-bit + # sys.version_info[0] == 2 and sys.version_info[1] < 6: + if _three_digit_exp(): + expected = (' x\n' + '0 1.000000e+009\n' + '1 2.512000e-001') + else: + expected = (' x\n' + '0 1.000000e+09\n' + '1 2.512000e-01') + assert(df_s == expected) + + def test_to_string_small_float_values(self): + df = DataFrame({'a': [1.5, 1e-17, -5.5e-7]}) + + result = df.to_string() + # sadness per above + if '%.4g' % 1.7e8 == '1.7e+008': + expected = (' a\n' + '0 1.500000e+000\n' + '1 1.000000e-017\n' + '2 -5.500000e-007') + else: + expected = (' a\n' + '0 1.500000e+00\n' + '1 1.000000e-17\n' + '2 -5.500000e-07') + self.assertEqual(result, expected) + + # but not all exactly zero + df = df * 0 + result = df.to_string() + expected = (' 0\n' + '0 0\n' + '1 0\n' + '2 -0') + + def test_to_string_float_index(self): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(lrange(5), index=index) + + result = df.to_string() + expected = (' 0\n' + '1.5 0\n' + '2.0 1\n' + '3.0 2\n' + '4.0 3\n' + '5.0 4') + self.assertEqual(result, expected) + + def test_to_string_ascii_error(self): + data = [('0 ', + u(' .gitignore '), + u(' 5 '), + ' \xe2\x80\xa2\xe2\x80\xa2\xe2\x80' + '\xa2\xe2\x80\xa2\xe2\x80\xa2')] + df = DataFrame(data) + + # it works! 
+ repr(df) + + def test_to_string_int_formatting(self): + df = DataFrame({'x': [-15, 20, 25, -35]}) + self.assertTrue(issubclass(df['x'].dtype.type, np.integer)) + + output = df.to_string() + expected = (' x\n' + '0 -15\n' + '1 20\n' + '2 25\n' + '3 -35') + self.assertEqual(output, expected) + + def test_to_string_index_formatter(self): + df = DataFrame([lrange(5), lrange(5, 10), lrange(10, 15)]) + + rs = df.to_string(formatters={'__index__': lambda x: 'abc'[x]}) + + xp = """\ + 0 1 2 3 4 +a 0 1 2 3 4 +b 5 6 7 8 9 +c 10 11 12 13 14\ +""" + self.assertEqual(rs, xp) + + def test_to_string_left_justify_cols(self): + self.reset_display_options() + df = DataFrame({'x': [3234, 0.253]}) + df_s = df.to_string(justify='left') + expected = (' x \n' + '0 3234.000\n' + '1 0.253') + assert(df_s == expected) + + def test_to_string_format_na(self): + self.reset_display_options() + df = DataFrame({'A': [np.nan, -1, -2.1234, 3, 4], + 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + result = df.to_string() + + expected = (' A B\n' + '0 NaN NaN\n' + '1 -1.0000 foo\n' + '2 -2.1234 foooo\n' + '3 3.0000 fooooo\n' + '4 4.0000 bar') + self.assertEqual(result, expected) + + df = DataFrame({'A': [np.nan, -1., -2., 3., 4.], + 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + result = df.to_string() + + expected = (' A B\n' + '0 NaN NaN\n' + '1 -1 foo\n' + '2 -2 foooo\n' + '3 3 fooooo\n' + '4 4 bar') + self.assertEqual(result, expected) + + def test_to_string_line_width(self): + df = pd.DataFrame(123, lrange(10, 15), lrange(30)) + s = df.to_string(line_width=80) + self.assertEqual(max(len(l) for l in s.split('\n')), 80) + + def test_show_dimensions(self): + df = pd.DataFrame(123, lrange(10, 15), lrange(30)) + + with option_context('display.max_rows', 10, 'display.max_columns', 40, 'display.width', + 500, 'display.expand_frame_repr', 'info', 'display.show_dimensions', True): + self.assertTrue('5 rows' in str(df)) + self.assertTrue('5 rows' in df._repr_html_()) + with option_context('display.max_rows', 10, 'display.max_columns', 40, 'display.width', + 500, 'display.expand_frame_repr', 'info', 'display.show_dimensions', False): + self.assertFalse('5 rows' in str(df)) + self.assertFalse('5 rows' in df._repr_html_()) + with option_context('display.max_rows', 2, 'display.max_columns', 2, 'display.width', + 500, 'display.expand_frame_repr', 'info', 'display.show_dimensions', 'truncate'): + self.assertTrue('5 rows' in str(df)) + self.assertTrue('5 rows' in df._repr_html_()) + with option_context('display.max_rows', 10, 'display.max_columns', 40, 'display.width', + 500, 'display.expand_frame_repr', 'info', 'display.show_dimensions', 'truncate'): + self.assertFalse('5 rows' in str(df)) + self.assertFalse('5 rows' in df._repr_html_()) + + def test_to_html(self): + # big mixed + biggie = DataFrame({'A': randn(200), + 'B': tm.makeStringIndex(200)}, + index=lrange(200)) + + biggie['A'][:20] = nan + biggie['B'][:20] = nan + s = biggie.to_html() + + buf = StringIO() + retval = biggie.to_html(buf=buf) + self.assertIsNone(retval) + self.assertEqual(buf.getvalue(), s) + + tm.assert_isinstance(s, compat.string_types) + + biggie.to_html(columns=['B', 'A'], col_space=17) + biggie.to_html(columns=['B', 'A'], + formatters={'A': lambda x: '%.1f' % x}) + + biggie.to_html(columns=['B', 'A'], float_format=str) + biggie.to_html(columns=['B', 'A'], col_space=12, + float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_html() + + def test_to_html_filename(self): + biggie = DataFrame({'A': randn(200), + 'B': 
tm.makeStringIndex(200)}, + index=lrange(200)) + + biggie['A'][:20] = nan + biggie['B'][:20] = nan + with tm.ensure_clean('test.html') as path: + biggie.to_html(path) + with open(path, 'r') as f: + s = biggie.to_html() + s2 = f.read() + self.assertEqual(s, s2) + + frame = DataFrame(index=np.arange(200)) + with tm.ensure_clean('test.html') as path: + frame.to_html(path) + with open(path, 'r') as f: + self.assertEqual(frame.to_html(), f.read()) + + def test_to_html_with_no_bold(self): + x = DataFrame({'x': randn(5)}) + ashtml = x.to_html(bold_rows=False) + assert('' not in ashtml[ashtml.find('')]) + + def test_to_html_columns_arg(self): + result = self.frame.to_html(columns=['A']) + self.assertNotIn('
<th>B</th>', result)
\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
CL001
CL10101
0 a b c d
1 e f g h
') + + self.assertEqual(result, expected) + + columns = pandas.MultiIndex.from_tuples(list(zip(range(4), + np.mod(lrange(4), 2)))) + df = pandas.DataFrame([list('abcd'), list('efgh')], columns=columns) + + result = df.to_html(justify='right') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
0123
0101
0 a b c d
1 e f g h
') + + self.assertEqual(result, expected) + + def test_to_html_justify(self): + df = pandas.DataFrame({'A': [6, 30000, 2], + 'B': [1, 2, 70000], + 'C': [223442, 0, 1]}, + columns=['A', 'B', 'C']) + result = df.to_html(justify='left') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
ABC
0 6 1 223442
1 30000 2 0
2 2 70000 1
') + + self.assertEqual(result, expected) + + result = df.to_html(justify='right') + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
ABC
0 6 1 223442
1 30000 2 0
2 2 70000 1
') + self.assertEqual(result, expected) + + def test_to_html_index(self): + index = ['foo', 'bar', 'baz'] + df = pandas.DataFrame({'A': [1, 2, 3], + 'B': [1.2, 3.4, 5.6], + 'C': ['one', 'two', np.NaN]}, + columns=['A', 'B', 'C'], + index=index) + result = df.to_html(index=False) + for i in index: + self.assertNotIn(i, result) + + tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] + df.index = pandas.MultiIndex.from_tuples(tuples) + result = df.to_html(index=False) + for i in ['foo', 'bar', 'car', 'bike']: + self.assertNotIn(i, result) + + def test_repr_html(self): + self.frame._repr_html_() + + fmt.set_option('display.max_rows', 1, 'display.max_columns', 1) + self.frame._repr_html_() + + fmt.set_option('display.notebook_repr_html', False) + self.frame._repr_html_() + + self.reset_display_options() + + df = DataFrame([[1, 2], [3, 4]]) + fmt.set_option('display.show_dimensions', True) + self.assertTrue('2 rows' in df._repr_html_()) + fmt.set_option('display.show_dimensions', False) + self.assertFalse('2 rows' in df._repr_html_()) + + self.reset_display_options() + + def test_repr_html_wide(self): + row = lambda l, k: [tm.rands(k) for _ in range(l)] + max_cols = get_option('display.max_columns') + df = DataFrame([row(max_cols-1, 25) for _ in range(10)]) + reg_repr = df._repr_html_() + assert "..." not in reg_repr + + wide_df = DataFrame([row(max_cols+1, 25) for _ in range(10)]) + wide_repr = wide_df._repr_html_() + assert "..." in wide_repr + + def test_repr_html_wide_multiindex_cols(self): + row = lambda l, k: [tm.rands(k) for _ in range(l)] + max_cols = get_option('display.max_columns') + + tuples = list(itertools.product(np.arange(max_cols//2), ['foo', 'bar'])) + mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols) + reg_repr = df._repr_html_() + assert '...' not in reg_repr + + + tuples = list(itertools.product(np.arange(1+(max_cols//2)), ['foo', 'bar'])) + mcols = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = DataFrame([row(len(mcols), 25) for _ in range(10)], columns=mcols) + wide_repr = df._repr_html_() + assert '...' in wide_repr + + def test_repr_html_long(self): + max_rows = get_option('display.max_rows') + h = max_rows - 1 + df = pandas.DataFrame({'A':np.arange(1,1+h), 'B':np.arange(41, 41+h)}) + reg_repr = df._repr_html_() + assert '..' not in reg_repr + assert str(41 + max_rows // 2) in reg_repr + + h = max_rows + 1 + df = pandas.DataFrame({'A':np.arange(1,1+h), 'B':np.arange(41, 41+h)}) + long_repr = df._repr_html_() + assert '..' in long_repr + assert str(41 + max_rows // 2) not in long_repr + assert u('%d rows ') % h in long_repr + assert u('2 columns') in long_repr + + def test_repr_html_float(self): + max_rows = get_option('display.max_rows') + h = max_rows - 1 + df = pandas.DataFrame({'idx':np.linspace(-10,10,h), 'A':np.arange(1,1+h), 'B': np.arange(41, 41+h) }).set_index('idx') + reg_repr = df._repr_html_() + assert '..' not in reg_repr + assert str(40 + h) in reg_repr + + h = max_rows + 1 + df = pandas.DataFrame({'idx':np.linspace(-10,10,h), 'A':np.arange(1,1+h), 'B': np.arange(41, 41+h) }).set_index('idx') + long_repr = df._repr_html_() + assert '..' 
in long_repr + assert '31' not in long_repr + assert u('%d rows ') % h in long_repr + assert u('2 columns') in long_repr + + def test_repr_html_long_multiindex(self): + max_rows = get_option('display.max_rows') + max_L1 = max_rows//2 + + tuples = list(itertools.product(np.arange(max_L1), ['foo', 'bar'])) + idx = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = DataFrame(np.random.randn(max_L1*2, 2), index=idx, + columns=['A', 'B']) + reg_repr = df._repr_html_() + assert '...' not in reg_repr + + tuples = list(itertools.product(np.arange(max_L1+1), ['foo', 'bar'])) + idx = pandas.MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = DataFrame(np.random.randn((max_L1+1)*2, 2), index=idx, + columns=['A', 'B']) + long_repr = df._repr_html_() + assert '...' in long_repr + + def test_repr_html_long_and_wide(self): + max_cols = get_option('display.max_columns') + max_rows = get_option('display.max_rows') + + h, w = max_rows-1, max_cols-1 + df = pandas.DataFrame(dict((k,np.arange(1,1+h)) for k in np.arange(w))) + assert '...' not in df._repr_html_() + + h, w = max_rows+1, max_cols+1 + df = pandas.DataFrame(dict((k,np.arange(1,1+h)) for k in np.arange(w))) + assert '...' in df._repr_html_() + + def test_info_repr(self): + max_rows = get_option('display.max_rows') + max_cols = get_option('display.max_columns') + # Long + h, w = max_rows+1, max_cols-1 + df = pandas.DataFrame(dict((k,np.arange(1,1+h)) for k in np.arange(w))) + assert has_vertically_truncated_repr(df) + with option_context('display.large_repr', 'info'): + assert has_info_repr(df) + + # Wide + h, w = max_rows-1, max_cols+1 + df = pandas.DataFrame(dict((k,np.arange(1,1+h)) for k in np.arange(w))) + assert has_horizontally_truncated_repr(df) + with option_context('display.large_repr', 'info'): + assert has_info_repr(df) + + def test_info_repr_max_cols(self): + # GH #6939 + df = DataFrame(randn(10, 5)) + with option_context('display.large_repr', 'info', + 'display.max_columns', 1, + 'display.max_info_columns', 4): + self.assertTrue(has_non_verbose_info_repr(df)) + + with option_context('display.large_repr', 'info', + 'display.max_columns', 1, + 'display.max_info_columns', 5): + self.assertFalse(has_non_verbose_info_repr(df)) + + # test verbose overrides + # fmt.set_option('display.max_info_columns', 4) # exceeded + + def test_info_repr_html(self): + max_rows = get_option('display.max_rows') + max_cols = get_option('display.max_columns') + # Long + h, w = max_rows+1, max_cols-1 + df = pandas.DataFrame(dict((k,np.arange(1,1+h)) for k in np.arange(w))) + assert r'<class' not in df._repr_html_() + with option_context('display.large_repr', 'info'): + assert r'<class' in df._repr_html_() + + # Wide + h, w = max_rows-1, max_cols+1 + df = pandas.DataFrame(dict((k,np.arange(1,1+h)) for k in np.arange(w))) + assert ' +
+ + """).strip() + self.assertEqual(result, expected) + + result = df.to_html(classes=["sortable", "draggable"]) + self.assertEqual(result, expected) + + def test_pprint_pathological_object(self): + """ + if the test fails, the stack will overflow and nose crash, + but it won't hang. + """ + class A: + def __getitem__(self, key): + return 3 # obviously simplified + df = pandas.DataFrame([A()]) + repr(df) # just don't dine + + def test_float_trim_zeros(self): + vals = [2.08430917305e+10, 3.52205017305e+10, 2.30674817305e+10, + 2.03954217305e+10, 5.59897817305e+10] + skip = True + for line in repr(DataFrame({'A': vals})).split('\n')[:-2]: + if line.startswith('dtype:'): + continue + if _three_digit_exp(): + self.assertTrue(('+010' in line) or skip) + else: + self.assertTrue(('+10' in line) or skip) + skip = False + + def test_dict_entries(self): + df = DataFrame({'A': [{'a': 1, 'b': 2}]}) + + val = df.to_string() + self.assertTrue("'a': 1" in val) + self.assertTrue("'b': 2" in val) + + def test_to_latex_filename(self): + with tm.ensure_clean('test.tex') as path: + self.frame.to_latex(path) + + with open(path, 'r') as f: + self.assertEqual(self.frame.to_latex(), f.read()) + + def test_to_latex(self): + # it works! + self.frame.to_latex() + + df = DataFrame({'a': [1, 2], + 'b': ['b1', 'b2']}) + withindex_result = df.to_latex() + withindex_expected = r"""\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +""" + self.assertEqual(withindex_result, withindex_expected) + + withoutindex_result = df.to_latex(index=False) + withoutindex_expected = r"""\begin{tabular}{rl} +\toprule + a & b \\ +\midrule + 1 & b1 \\ + 2 & b2 \\ +\bottomrule +\end{tabular} +""" + self.assertEqual(withoutindex_result, withoutindex_expected) + + def test_to_latex_escape(self): + a = 'a' + b = 'b' + + test_dict = {u('co^l1') : {a: "a", + b: "b"}, + u('co$e^x$'): {a: "a", + b: "b"}} + + unescaped_result = pd.DataFrame(test_dict).to_latex(escape=False) + escaped_result = pd.DataFrame(test_dict).to_latex() # default: escape=True + + unescaped_expected = r'''\begin{tabular}{lll} +\toprule +{} & co$e^x$ & co^l1 \\ +\midrule +a & a & a \\ +b & b & b \\ +\bottomrule +\end{tabular} +''' + + escaped_expected = r'''\begin{tabular}{lll} +\toprule +{} & co\$e\textasciicircumx\$ & co\textasciicircuml1 \\ +\midrule +a & a & a \\ +b & b & b \\ +\bottomrule +\end{tabular} +''' + self.assertEqual(unescaped_result, unescaped_expected) + self.assertEqual(escaped_result, escaped_expected) + + def test_to_latex_longtable(self): + self.frame.to_latex(longtable=True) + + df = DataFrame({'a': [1, 2], + 'b': ['b1', 'b2']}) + withindex_result = df.to_latex(longtable=True) + withindex_expected = r"""\begin{longtable}{lrl} +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + self.assertEqual(withindex_result, withindex_expected) + + withoutindex_result = df.to_latex(index=False, longtable=True) + withoutindex_expected = r"""\begin{longtable}{rl} +\toprule + a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot + 1 & b1 \\ + 2 & b2 \\ +\end{longtable} +""" + self.assertEqual(withoutindex_result, withoutindex_expected) + + def test_to_latex_escape_special_chars(self): + special_characters = ['&','%','$','#','_', + '{','}','~','^','\\'] + df = 
DataFrame(data=special_characters) + observed = df.to_latex() + expected = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & \& \\ +1 & \% \\ +2 & \$ \\ +3 & \# \\ +4 & \_ \\ +5 & \{ \\ +6 & \} \\ +7 & \textasciitilde \\ +8 & \textasciicircum \\ +9 & \textbackslash \\ +\bottomrule +\end{tabular} +""" + self.assertEqual(observed, expected) + + def test_to_csv_quotechar(self): + df = DataFrame({'col' : [1,2]}) + expected = """\ +"","col" +"0","1" +"1","2" +""" + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1) # 1=QUOTE_ALL + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + expected = """\ +$$,$col$ +$0$,$1$ +$1$,$2$ +""" + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, quotechar="$") + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, quotechar="$", engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(TypeError, 'quotechar'): + df.to_csv(path, quoting=1, quotechar=None) + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(TypeError, 'quotechar'): + df.to_csv(path, quoting=1, quotechar=None, engine='python') + + def test_to_csv_doublequote(self): + df = DataFrame({'col' : ['a"a', '"bb"']}) + expected = '''\ +"","col" +"0","a""a" +"1","""bb""" +''' + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, doublequote=True, engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + from _csv import Error + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(Error, 'escapechar'): + df.to_csv(path, doublequote=False) # no escapechar set + with tm.ensure_clean('test.csv') as path: + with tm.assertRaisesRegexp(Error, 'escapechar'): + df.to_csv(path, doublequote=False, engine='python') + + def test_to_csv_escapechar(self): + df = DataFrame({'col' : ['a"a', '"bb"']}) + expected = """\ +"","col" +"0","a\\"a" +"1","\\"bb\\"" +""" + with tm.ensure_clean('test.csv') as path: # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=1, doublequote=False, escapechar='\\', + engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + df = DataFrame({'col' : ['a,a', ',bb,']}) + expected = """\ +,col +0,a\\,a +1,\\,bb\\, +""" + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + with tm.ensure_clean('test.csv') as path: + df.to_csv(path, quoting=3, escapechar='\\', engine='python') + with open(path, 'r') as f: + self.assertEqual(f.read(), expected) + + def test_csv_to_string(self): + df = DataFrame({'col' : [1,2]}) + expected = ',col\n0,1\n1,2\n' + self.assertEqual(df.to_csv(), expected) + + +class TestSeriesFormatting(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.ts = tm.makeTimeSeries() + + def 
test_repr_unicode(self): + s = Series([u('\u03c3')] * 10) + repr(s) + + a = Series([u("\u05d0")] * 1000) + a.name = 'title1' + repr(a) + + def test_to_string(self): + buf = StringIO() + + s = self.ts.to_string() + + retval = self.ts.to_string(buf=buf) + self.assertIsNone(retval) + self.assertEqual(buf.getvalue().strip(), s) + + # pass float_format + format = '%.4f'.__mod__ + result = self.ts.to_string(float_format=format) + result = [x.split()[1] for x in result.split('\n')] + expected = [format(x) for x in self.ts] + self.assertEqual(result, expected) + + # empty string + result = self.ts[:0].to_string() + self.assertEqual(result, '') + + result = self.ts[:0].to_string(length=0) + self.assertEqual(result, '') + + # name and length + cp = self.ts.copy() + cp.name = 'foo' + result = cp.to_string(length=True, name=True, dtype=True) + last_line = result.split('\n')[-1].strip() + self.assertEqual(last_line, "Freq: B, Name: foo, Length: %d, dtype: float64" % len(cp)) + + def test_freq_name_separation(self): + s = Series(np.random.randn(10), + index=pd.date_range('1/1/2000', periods=10), name=0) + + result = repr(s) + self.assertTrue('Freq: D, Name: 0' in result) + + def test_to_string_mixed(self): + s = Series(['foo', np.nan, -1.23, 4.56]) + result = s.to_string() + expected = (u('0 foo\n') + + u('1 NaN\n') + + u('2 -1.23\n') + + u('3 4.56')) + self.assertEqual(result, expected) + + # but don't count NAs as floats + s = Series(['foo', np.nan, 'bar', 'baz']) + result = s.to_string() + expected = (u('0 foo\n') + + '1 NaN\n' + + '2 bar\n' + + '3 baz') + self.assertEqual(result, expected) + + s = Series(['foo', 5, 'bar', 'baz']) + result = s.to_string() + expected = (u('0 foo\n') + + '1 5\n' + + '2 bar\n' + + '3 baz') + self.assertEqual(result, expected) + + def test_to_string_float_na_spacing(self): + s = Series([0., 1.5678, 2., -3., 4.]) + s[::2] = np.nan + + result = s.to_string() + expected = (u('0 NaN\n') + + '1 1.5678\n' + + '2 NaN\n' + + '3 -3.0000\n' + + '4 NaN') + self.assertEqual(result, expected) + + def test_unicode_name_in_footer(self): + s = Series([1, 2], name=u('\u05e2\u05d1\u05e8\u05d9\u05ea')) + sf = fmt.SeriesFormatter(s, name=u('\u05e2\u05d1\u05e8\u05d9\u05ea')) + sf._get_footer() # should not raise exception + + def test_float_trim_zeros(self): + vals = [2.08430917305e+10, 3.52205017305e+10, 2.30674817305e+10, + 2.03954217305e+10, 5.59897817305e+10] + for line in repr(Series(vals)).split('\n'): + if line.startswith('dtype:'): + continue + if _three_digit_exp(): + self.assertIn('+010', line) + else: + self.assertIn('+10', line) + + def test_datetimeindex(self): + + from pandas import date_range, NaT + index = date_range('20130102',periods=6) + s = Series(1,index=index) + result = s.to_string() + self.assertTrue('2013-01-02' in result) + + # nat in index + s2 = Series(2, index=[ Timestamp('20130111'), NaT ]) + s = s2.append(s) + result = s.to_string() + self.assertTrue('NaT' in result) + + # nat in summary + result = str(s2.index) + self.assertTrue('NaT' in result) + + def test_timedelta64(self): + + from pandas import date_range + from datetime import datetime, timedelta + + Series(np.array([1100, 20], dtype='timedelta64[ns]')).to_string() + + s = Series(date_range('2012-1-1', periods=3, freq='D')) + + # GH2146 + + # adding NaTs + y = s-s.shift(1) + result = y.to_string() + self.assertTrue('1 days' in result) + self.assertTrue('00:00:00' not in result) + self.assertTrue('NaT' in result) + + # with frac seconds + o = Series([datetime(2012,1,1,microsecond=150)]*3) + y = s-o + 
result = y.to_string() + self.assertTrue('-0 days, 00:00:00.000150' in result) + + # rounding? + o = Series([datetime(2012,1,1,1)]*3) + y = s-o + result = y.to_string() + self.assertTrue('-0 days, 01:00:00' in result) + self.assertTrue('1 days, 23:00:00' in result) + + o = Series([datetime(2012,1,1,1,1)]*3) + y = s-o + result = y.to_string() + self.assertTrue('-0 days, 01:01:00' in result) + self.assertTrue('1 days, 22:59:00' in result) + + o = Series([datetime(2012,1,1,1,1,microsecond=150)]*3) + y = s-o + result = y.to_string() + self.assertTrue('-0 days, 01:01:00.000150' in result) + self.assertTrue('1 days, 22:58:59.999850' in result) + + # neg time + td = timedelta(minutes=5,seconds=3) + s2 = Series(date_range('2012-1-1', periods=3, freq='D')) + td + y = s - s2 + result = y.to_string() + self.assertTrue('-00:05:03' in result) + + td = timedelta(microseconds=550) + s2 = Series(date_range('2012-1-1', periods=3, freq='D')) + td + y = s - td + result = y.to_string() + self.assertTrue('2012-01-01 23:59:59.999450' in result) + + def test_mixed_datetime64(self): + df = DataFrame({'A': [1, 2], + 'B': ['2012-01-01', '2012-01-02']}) + df['B'] = pd.to_datetime(df.B) + + result = repr(df.ix[0]) + self.assertTrue('2012-01-01' in result) + + def test_max_multi_index_display(self): + # GH 7101 + + # doc example (indexing.rst) + + # multi-index + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = list(zip(*arrays)) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + s = Series(randn(8), index=index) + + with option_context("display.max_rows", 10): + self.assertEqual(len(str(s).split('\n')),10) + with option_context("display.max_rows", 3): + self.assertEqual(len(str(s).split('\n')),5) + with option_context("display.max_rows", 2): + self.assertEqual(len(str(s).split('\n')),5) + with option_context("display.max_rows", 1): + self.assertEqual(len(str(s).split('\n')),5) + with option_context("display.max_rows", 0): + self.assertEqual(len(str(s).split('\n')),10) + + # index + s = Series(randn(8), None) + + with option_context("display.max_rows", 10): + self.assertEqual(len(str(s).split('\n')),9) + with option_context("display.max_rows", 3): + self.assertEqual(len(str(s).split('\n')),4) + with option_context("display.max_rows", 2): + self.assertEqual(len(str(s).split('\n')),4) + with option_context("display.max_rows", 1): + self.assertEqual(len(str(s).split('\n')),4) + with option_context("display.max_rows", 0): + self.assertEqual(len(str(s).split('\n')),9) + +class TestEngFormatter(tm.TestCase): + _multiprocess_can_split_ = True + + def test_eng_float_formatter(self): + df = DataFrame({'A': [1.41, 141., 14100, 1410000.]}) + + fmt.set_eng_float_format() + result = df.to_string() + expected = (' A\n' + '0 1.410E+00\n' + '1 141.000E+00\n' + '2 14.100E+03\n' + '3 1.410E+06') + self.assertEqual(result, expected) + + fmt.set_eng_float_format(use_eng_prefix=True) + result = df.to_string() + expected = (' A\n' + '0 1.410\n' + '1 141.000\n' + '2 14.100k\n' + '3 1.410M') + self.assertEqual(result, expected) + + fmt.set_eng_float_format(accuracy=0) + result = df.to_string() + expected = (' A\n' + '0 1E+00\n' + '1 141E+00\n' + '2 14E+03\n' + '3 1E+06') + self.assertEqual(result, expected) + + self.reset_display_options() + + def compare(self, formatter, input, output): + formatted_input = formatter(input) + msg = ("formatting of %s results in '%s', expected '%s'" + % (str(input), formatted_input, output)) + 
self.assertEqual(formatted_input, output, msg) + + def compare_all(self, formatter, in_out): + """ + Parameters: + ----------- + formatter: EngFormatter under test + in_out: list of tuples. Each tuple = (number, expected_formatting) + + It is tested if 'formatter(number) == expected_formatting'. + *number* should be >= 0 because formatter(-number) == fmt is also + tested. *fmt* is derived from *expected_formatting* + """ + for input, output in in_out: + self.compare(formatter, input, output) + self.compare(formatter, -input, "-" + output[1:]) + + def test_exponents_with_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + f = np.sqrt(2) + in_out = [(f * 10 ** -24, " 1.414y"), + (f * 10 ** -23, " 14.142y"), + (f * 10 ** -22, " 141.421y"), + (f * 10 ** -21, " 1.414z"), + (f * 10 ** -20, " 14.142z"), + (f * 10 ** -19, " 141.421z"), + (f * 10 ** -18, " 1.414a"), + (f * 10 ** -17, " 14.142a"), + (f * 10 ** -16, " 141.421a"), + (f * 10 ** -15, " 1.414f"), + (f * 10 ** -14, " 14.142f"), + (f * 10 ** -13, " 141.421f"), + (f * 10 ** -12, " 1.414p"), + (f * 10 ** -11, " 14.142p"), + (f * 10 ** -10, " 141.421p"), + (f * 10 ** -9, " 1.414n"), + (f * 10 ** -8, " 14.142n"), + (f * 10 ** -7, " 141.421n"), + (f * 10 ** -6, " 1.414u"), + (f * 10 ** -5, " 14.142u"), + (f * 10 ** -4, " 141.421u"), + (f * 10 ** -3, " 1.414m"), + (f * 10 ** -2, " 14.142m"), + (f * 10 ** -1, " 141.421m"), + (f * 10 ** 0, " 1.414"), + (f * 10 ** 1, " 14.142"), + (f * 10 ** 2, " 141.421"), + (f * 10 ** 3, " 1.414k"), + (f * 10 ** 4, " 14.142k"), + (f * 10 ** 5, " 141.421k"), + (f * 10 ** 6, " 1.414M"), + (f * 10 ** 7, " 14.142M"), + (f * 10 ** 8, " 141.421M"), + (f * 10 ** 9, " 1.414G"), + (f * 10 ** 10, " 14.142G"), + (f * 10 ** 11, " 141.421G"), + (f * 10 ** 12, " 1.414T"), + (f * 10 ** 13, " 14.142T"), + (f * 10 ** 14, " 141.421T"), + (f * 10 ** 15, " 1.414P"), + (f * 10 ** 16, " 14.142P"), + (f * 10 ** 17, " 141.421P"), + (f * 10 ** 18, " 1.414E"), + (f * 10 ** 19, " 14.142E"), + (f * 10 ** 20, " 141.421E"), + (f * 10 ** 21, " 1.414Z"), + (f * 10 ** 22, " 14.142Z"), + (f * 10 ** 23, " 141.421Z"), + (f * 10 ** 24, " 1.414Y"), + (f * 10 ** 25, " 14.142Y"), + (f * 10 ** 26, " 141.421Y")] + self.compare_all(formatter, in_out) + + def test_exponents_without_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False) + f = np.pi + in_out = [(f * 10 ** -24, " 3.1416E-24"), + (f * 10 ** -23, " 31.4159E-24"), + (f * 10 ** -22, " 314.1593E-24"), + (f * 10 ** -21, " 3.1416E-21"), + (f * 10 ** -20, " 31.4159E-21"), + (f * 10 ** -19, " 314.1593E-21"), + (f * 10 ** -18, " 3.1416E-18"), + (f * 10 ** -17, " 31.4159E-18"), + (f * 10 ** -16, " 314.1593E-18"), + (f * 10 ** -15, " 3.1416E-15"), + (f * 10 ** -14, " 31.4159E-15"), + (f * 10 ** -13, " 314.1593E-15"), + (f * 10 ** -12, " 3.1416E-12"), + (f * 10 ** -11, " 31.4159E-12"), + (f * 10 ** -10, " 314.1593E-12"), + (f * 10 ** -9, " 3.1416E-09"), + (f * 10 ** -8, " 31.4159E-09"), + (f * 10 ** -7, " 314.1593E-09"), + (f * 10 ** -6, " 3.1416E-06"), + (f * 10 ** -5, " 31.4159E-06"), + (f * 10 ** -4, " 314.1593E-06"), + (f * 10 ** -3, " 3.1416E-03"), + (f * 10 ** -2, " 31.4159E-03"), + (f * 10 ** -1, " 314.1593E-03"), + (f * 10 ** 0, " 3.1416E+00"), + (f * 10 ** 1, " 31.4159E+00"), + (f * 10 ** 2, " 314.1593E+00"), + (f * 10 ** 3, " 3.1416E+03"), + (f * 10 ** 4, " 31.4159E+03"), + (f * 10 ** 5, " 314.1593E+03"), + (f * 10 ** 6, " 3.1416E+06"), + (f * 10 ** 7, " 31.4159E+06"), + (f * 10 ** 8, " 314.1593E+06"), + (f * 10 ** 9, " 3.1416E+09"), + 
(f * 10 ** 10, " 31.4159E+09"), + (f * 10 ** 11, " 314.1593E+09"), + (f * 10 ** 12, " 3.1416E+12"), + (f * 10 ** 13, " 31.4159E+12"), + (f * 10 ** 14, " 314.1593E+12"), + (f * 10 ** 15, " 3.1416E+15"), + (f * 10 ** 16, " 31.4159E+15"), + (f * 10 ** 17, " 314.1593E+15"), + (f * 10 ** 18, " 3.1416E+18"), + (f * 10 ** 19, " 31.4159E+18"), + (f * 10 ** 20, " 314.1593E+18"), + (f * 10 ** 21, " 3.1416E+21"), + (f * 10 ** 22, " 31.4159E+21"), + (f * 10 ** 23, " 314.1593E+21"), + (f * 10 ** 24, " 3.1416E+24"), + (f * 10 ** 25, " 31.4159E+24"), + (f * 10 ** 26, " 314.1593E+24")] + self.compare_all(formatter, in_out) + + def test_rounding(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + in_out = [(5.55555, ' 5.556'), + (55.5555, ' 55.556'), + (555.555, ' 555.555'), + (5555.55, ' 5.556k'), + (55555.5, ' 55.556k'), + (555555, ' 555.555k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + in_out = [(5.55555, ' 5.6'), + (55.5555, ' 55.6'), + (555.555, ' 555.6'), + (5555.55, ' 5.6k'), + (55555.5, ' 55.6k'), + (555555, ' 555.6k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) + in_out = [(5.55555, ' 6'), + (55.5555, ' 56'), + (555.555, ' 556'), + (5555.55, ' 6k'), + (55555.5, ' 56k'), + (555555, ' 556k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + result = formatter(0) + self.assertEqual(result, u(' 0.000')) + + +def _three_digit_exp(): + return '%.4g' % 1.7e8 == '1.7e+008' + + +class TestFloatArrayFormatter(tm.TestCase): + + def test_misc(self): + obj = fmt.FloatArrayFormatter(np.array([], dtype=np.float64)) + result = obj.get_result() + self.assertTrue(len(result) == 0) + + def test_format(self): + obj = fmt.FloatArrayFormatter(np.array([12, 0], dtype=np.float64)) + result = obj.get_result() + self.assertEqual(result[0], " 12") + self.assertEqual(result[1], " 0") + + +class TestRepr_timedelta64(tm.TestCase): + @classmethod + def setUpClass(cls): + tm._skip_if_not_numpy17_friendly() + + def test_legacy(self): + delta_1d = pd.to_timedelta(1, unit='D') + delta_0d = pd.to_timedelta(0, unit='D') + delta_1s = pd.to_timedelta(1, unit='s') + delta_500ms = pd.to_timedelta(500, unit='ms') + + self.assertEqual(tslib.repr_timedelta64(delta_1d), "1 days, 00:00:00") + self.assertEqual(tslib.repr_timedelta64(-delta_1d), "-1 days, 00:00:00") + self.assertEqual(tslib.repr_timedelta64(delta_0d), "00:00:00") + self.assertEqual(tslib.repr_timedelta64(delta_1s), "00:00:01") + self.assertEqual(tslib.repr_timedelta64(delta_500ms), "00:00:00.500000") + self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_1s), "1 days, 00:00:01") + self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_500ms), "1 days, 00:00:00.500000") + + def test_short(self): + delta_1d = pd.to_timedelta(1, unit='D') + delta_0d = pd.to_timedelta(0, unit='D') + delta_1s = pd.to_timedelta(1, unit='s') + delta_500ms = pd.to_timedelta(500, unit='ms') + + self.assertEqual(tslib.repr_timedelta64(delta_1d, format='short'), "1 days") + self.assertEqual(tslib.repr_timedelta64(-delta_1d, format='short'), "-1 days") + self.assertEqual(tslib.repr_timedelta64(delta_0d, format='short'), "00:00:00") + self.assertEqual(tslib.repr_timedelta64(delta_1s, format='short'), "00:00:01") + self.assertEqual(tslib.repr_timedelta64(delta_500ms, format='short'), "00:00:00.500000") + self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_1s, format='short'), "1 days, 00:00:01") + 
self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_500ms, format='short'), "1 days, 00:00:00.500000") + + def test_long(self): + delta_1d = pd.to_timedelta(1, unit='D') + delta_0d = pd.to_timedelta(0, unit='D') + delta_1s = pd.to_timedelta(1, unit='s') + delta_500ms = pd.to_timedelta(500, unit='ms') + + self.assertEqual(tslib.repr_timedelta64(delta_1d, format='long'), "1 days, 00:00:00") + self.assertEqual(tslib.repr_timedelta64(-delta_1d, format='long'), "-1 days, 00:00:00") + self.assertEqual(tslib.repr_timedelta64(delta_0d, format='long'), "0 days, 00:00:00") + self.assertEqual(tslib.repr_timedelta64(delta_1s, format='long'), "0 days, 00:00:01") + self.assertEqual(tslib.repr_timedelta64(delta_500ms, format='long'), "0 days, 00:00:00.500000") + self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_1s, format='long'), "1 days, 00:00:01") + self.assertEqual(tslib.repr_timedelta64(delta_1d + delta_500ms, format='long'), "1 days, 00:00:00.500000") + + +class TestTimedelta64Formatter(tm.TestCase): + @classmethod + def setUpClass(cls): + tm._skip_if_not_numpy17_friendly() + + def test_mixed(self): + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + result = fmt.Timedelta64Formatter(x + y).get_result() + self.assertEqual(result[0].strip(), "0 days, 00:00:00") + self.assertEqual(result[1].strip(), "1 days, 00:00:01") + + def test_mixed_neg(self): + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + result = fmt.Timedelta64Formatter(-(x + y)).get_result() + self.assertEqual(result[0].strip(), "0 days, 00:00:00") + self.assertEqual(result[1].strip(), "-1 days, 00:00:01") + + def test_days(self): + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + result = fmt.Timedelta64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "0 days") + self.assertEqual(result[1].strip(), "1 days") + + result = fmt.Timedelta64Formatter(x[1:2]).get_result() + self.assertEqual(result[0].strip(), "1 days") + + def test_days_neg(self): + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + result = fmt.Timedelta64Formatter(-x).get_result() + self.assertEqual(result[0].strip(), "0 days") + self.assertEqual(result[1].strip(), "-1 days") + + def test_subdays(self): + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + result = fmt.Timedelta64Formatter(y).get_result() + self.assertEqual(result[0].strip(), "00:00:00") + self.assertEqual(result[1].strip(), "00:00:01") + + def test_subdays_neg(self): + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + result = fmt.Timedelta64Formatter(-y).get_result() + self.assertEqual(result[0].strip(), "00:00:00") + self.assertEqual(result[1].strip(), "-00:00:01") + + def test_zero(self): + x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit='D') + result = fmt.Timedelta64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "0 days") + + x = pd.to_timedelta(list(range(1)), unit='D') + result = fmt.Timedelta64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "0 days") + + +class TestDatetime64Formatter(tm.TestCase): + def test_mixed(self): + x = pd.Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT]) + result = fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "2013-01-01 00:00:00") + self.assertEqual(result[1].strip(), "2013-01-01 12:00:00") + + def test_dates(self): + x = pd.Series([datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT]) + result 
= fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "2013-01-01") + self.assertEqual(result[1].strip(), "2013-01-02") + + def test_date_nanos(self): + x = pd.Series([Timestamp(200)]) + result = fmt.Datetime64Formatter(x).get_result() + self.assertEqual(result[0].strip(), "1970-01-01 00:00:00.000000200") + + +class TestNaTFormatting(tm.TestCase): + def test_repr(self): + self.assertEqual(repr(pd.NaT), "NaT") + + def test_str(self): + self.assertEqual(str(pd.NaT), "NaT") + + +class TestDatetimeIndexFormat(tm.TestCase): + def test_datetime(self): + formatted = pd.to_datetime([datetime(2003, 1, 1, 12), pd.NaT]).format() + self.assertEqual(formatted[0], "2003-01-01 12:00:00") + self.assertEqual(formatted[1], "NaT") + + def test_date(self): + formatted = pd.to_datetime([datetime(2003, 1, 1), pd.NaT]).format() + self.assertEqual(formatted[0], "2003-01-01") + self.assertEqual(formatted[1], "NaT") + + def test_date_tz(self): + formatted = pd.to_datetime([datetime(2013,1,1)], utc=True).format() + self.assertEqual(formatted[0], "2013-01-01 00:00:00+00:00") + + formatted = pd.to_datetime([datetime(2013,1,1), pd.NaT], utc=True).format() + self.assertEqual(formatted[0], "2013-01-01 00:00:00+00:00") + + def test_date_explict_date_format(self): + formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format(date_format="%m-%d-%Y", na_rep="UT") + self.assertEqual(formatted[0], "02-01-2003") + self.assertEqual(formatted[1], "UT") + + +class TestDatetimeIndexUnicode(tm.TestCase): + def test_dates(self): + text = str(pd.to_datetime([datetime(2013,1,1), datetime(2014,1,1)])) + self.assertTrue("[2013-01-01," in text) + self.assertTrue(", 2014-01-01]" in text) + + def test_mixed(self): + text = str(pd.to_datetime([datetime(2013,1,1), datetime(2014,1,1,12), datetime(2014,1,1)])) + self.assertTrue("[2013-01-01 00:00:00," in text) + self.assertTrue(", 2014-01-01 00:00:00]" in text) + + +class TestStringRepTimestamp(tm.TestCase): + def test_no_tz(self): + dt_date = datetime(2013, 1, 2) + self.assertEqual(str(dt_date), str(Timestamp(dt_date))) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3) + self.assertEqual(str(dt_datetime), str(Timestamp(dt_datetime))) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45) + self.assertEqual(str(dt_datetime_us), str(Timestamp(dt_datetime_us))) + + ts_nanos_only = Timestamp(200) + self.assertEqual(str(ts_nanos_only), "1970-01-01 00:00:00.000000200") + + ts_nanos_micros = Timestamp(1200) + self.assertEqual(str(ts_nanos_micros), "1970-01-01 00:00:00.000001200") + + def test_tz_pytz(self): + tm._skip_if_no_pytz() + + import pytz + + dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) + self.assertEqual(str(dt_date), str(Timestamp(dt_date))) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) + self.assertEqual(str(dt_datetime), str(Timestamp(dt_datetime))) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) + self.assertEqual(str(dt_datetime_us), str(Timestamp(dt_datetime_us))) + + def test_tz_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + utc = dateutil.tz.tzutc() + + dt_date = datetime(2013, 1, 2, tzinfo=utc) + self.assertEqual(str(dt_date), str(Timestamp(dt_date))) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=utc) + self.assertEqual(str(dt_datetime), str(Timestamp(dt_datetime))) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) + self.assertEqual(str(dt_datetime_us), str(Timestamp(dt_datetime_us))) + +if __name__ == '__main__': + import nose + 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py new file mode 100644 index 00000000..1cada8ef --- /dev/null +++ b/pandas/tests/test_frame.py @@ -0,0 +1,14117 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function +# pylint: disable-msg=W0612,E1101 +from copy import deepcopy +from datetime import datetime, timedelta, time +import sys +import operator +import re +import csv +import nose +import functools +import itertools +from itertools import product +from distutils.version import LooseVersion + +from pandas.compat import( + map, zip, range, long, lrange, lmap, lzip, + OrderedDict, cPickle as pickle, u, StringIO +) +from pandas import compat + +from numpy import random, nan +from numpy.random import randn +import numpy as np +import numpy.ma as ma +from numpy.testing import assert_array_equal +import numpy.ma.mrecords as mrecords + +import pandas.core.nanops as nanops +import pandas.core.common as com +import pandas.core.format as fmt +import pandas.core.datetools as datetools +from pandas import (DataFrame, Index, Series, notnull, isnull, + MultiIndex, DatetimeIndex, Timestamp, date_range, read_csv) +import pandas as pd +from pandas.parser import CParserError +from pandas.util.misc import is_little_endian + +from pandas.util.testing import (assert_almost_equal, + assert_series_equal, + assert_frame_equal, + assertRaisesRegexp, + assertRaises, + makeCustomDataframe as mkdf, + ensure_clean) +from pandas.core.indexing import IndexingError +from pandas.core.common import PandasError + +import pandas.util.testing as tm +import pandas.lib as lib + +from numpy.testing.decorators import slow + +#--------------------------------------------------------------------- +# DataFrame test cases + +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] +MIXED_FLOAT_DTYPES = ['float16','float32','float64'] +MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16', + 'int32','int64'] + +def _check_mixed_float(df, dtype = None): + + # float16 are most likely to be upcasted to float32 + dtypes = dict(A = 'float32', B = 'float32', C = 'float16', D = 'float64') + if isinstance(dtype, compat.string_types): + dtypes = dict([ (k,dtype) for k, v in dtypes.items() ]) + elif isinstance(dtype, dict): + dtypes.update(dtype) + if dtypes.get('A'): + assert(df.dtypes['A'] == dtypes['A']) + if dtypes.get('B'): + assert(df.dtypes['B'] == dtypes['B']) + if dtypes.get('C'): + assert(df.dtypes['C'] == dtypes['C']) + if dtypes.get('D'): + assert(df.dtypes['D'] == dtypes['D']) + + +def _check_mixed_int(df, dtype = None): + dtypes = dict(A = 'int32', B = 'uint64', C = 'uint8', D = 'int64') + if isinstance(dtype, compat.string_types): + dtypes = dict([ (k,dtype) for k, v in dtypes.items() ]) + elif isinstance(dtype, dict): + dtypes.update(dtype) + if dtypes.get('A'): + assert(df.dtypes['A'] == dtypes['A']) + if dtypes.get('B'): + assert(df.dtypes['B'] == dtypes['B']) + if dtypes.get('C'): + assert(df.dtypes['C'] == dtypes['C']) + if dtypes.get('D'): + assert(df.dtypes['D'] == dtypes['D']) + + +class CheckIndexing(object): + + _multiprocess_can_split_ = True + + def test_getitem(self): + # slicing + sl = self.frame[:20] + self.assertEqual(20, len(sl.index)) + + # column access + + for _, series in compat.iteritems(sl): + self.assertEqual(20, len(series.index)) + self.assertTrue(tm.equalContents(series.index, sl.index)) + + for key, _ in compat.iteritems(self.frame._series): + 
self.assertIsNotNone(self.frame[key]) + + self.assertNotIn('random', self.frame) + with assertRaisesRegexp(KeyError, 'random'): + self.frame['random'] + + df = self.frame.copy() + df['$10'] = randn(len(df)) + ad = randn(len(df)) + df['@awesome_domain'] = ad + self.assertRaises(KeyError, df.__getitem__, 'df["$10"]') + res = df['@awesome_domain'] + assert_array_equal(ad, res.values) + + def test_getitem_dupe_cols(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) + try: + df[['baf']] + except KeyError: + pass + else: + self.fail("Dataframe failed to raise KeyError") + + def test_get(self): + b = self.frame.get('B') + assert_series_equal(b, self.frame['B']) + + self.assertIsNone(self.frame.get('foo')) + assert_series_equal(self.frame.get('foo', self.frame['B']), + self.frame['B']) + # None + # GH 5652 + for df in [DataFrame(), DataFrame(columns=list('AB')), DataFrame(columns=list('AB'),index=range(3)) ]: + result = df.get(None) + self.assertIsNone(result) + + def test_getitem_iterator(self): + idx = iter(['A', 'B', 'C']) + result = self.frame.ix[:, idx] + expected = self.frame.ix[:, ['A', 'B', 'C']] + assert_frame_equal(result, expected) + + def test_getitem_list(self): + self.frame.columns.name = 'foo' + + result = self.frame[['B', 'A']] + result2 = self.frame[Index(['B', 'A'])] + + expected = self.frame.ix[:, ['B', 'A']] + expected.columns.name = 'foo' + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + self.assertEqual(result.columns.name, 'foo') + + with assertRaisesRegexp(KeyError, 'not in index'): + self.frame[['B', 'A', 'food']] + with assertRaisesRegexp(KeyError, 'not in index'): + self.frame[Index(['B', 'A', 'foo'])] + + # tuples + df = DataFrame(randn(8, 3), + columns=Index([('foo', 'bar'), ('baz', 'qux'), + ('peek', 'aboo')], name=['sth', 'sth2'])) + + result = df[[('foo', 'bar'), ('baz', 'qux')]] + expected = df.ix[:, :2] + assert_frame_equal(result, expected) + self.assertEqual(result.columns.names, ['sth', 'sth2']) + + def test_setitem_list(self): + + self.frame['E'] = 'foo' + data = self.frame[['A', 'B']] + self.frame[['B', 'A']] = data + + assert_series_equal(self.frame['B'], data['A']) + assert_series_equal(self.frame['A'], data['B']) + + with assertRaisesRegexp(ValueError, 'Columns must be same length as key'): + data[['A']] = self.frame[['A', 'B']] + with assertRaisesRegexp(ValueError, 'Length of values does not match ' + 'length of index'): + data['A'] = range(len(data.index) - 1) + + df = DataFrame(0, lrange(3), ['tt1', 'tt2'], dtype=np.int_) + df.ix[1, ['tt1', 'tt2']] = [1, 2] + + result = df.ix[1, ['tt1', 'tt2']] + expected = Series([1, 2], df.columns, dtype=np.int_) + assert_series_equal(result, expected) + + df['tt1'] = df['tt2'] = '0' + df.ix[1, ['tt1', 'tt2']] = ['1', '2'] + result = df.ix[1, ['tt1', 'tt2']] + expected = Series(['1', '2'], df.columns) + assert_series_equal(result, expected) + + def test_setitem_list_not_dataframe(self): + data = np.random.randn(len(self.frame), 2) + self.frame[['A', 'B']] = data + assert_almost_equal(self.frame[['A', 'B']].values, data) + + def test_setitem_list_of_tuples(self): + tuples = lzip(self.frame['A'], self.frame['B']) + self.frame['tuples'] = tuples + + result = self.frame['tuples'] + expected = Series(tuples, index=self.frame.index) + assert_series_equal(result, expected) + + def test_getitem_boolean(self): + # boolean indexing + d = self.tsframe.index[10] + indexer = self.tsframe.index > d + indexer_obj = indexer.astype(object) + + subindex = 
self.tsframe.index[indexer] + subframe = self.tsframe[indexer] + + self.assert_numpy_array_equal(subindex, subframe.index) + with assertRaisesRegexp(ValueError, 'Item wrong length'): + self.tsframe[indexer[:-1]] + + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) + + with tm.assertRaisesRegexp(ValueError, 'boolean values only'): + self.tsframe[self.tsframe] + + # test that Series work + indexer_obj = Series(indexer_obj, self.tsframe.index) + + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) + + # test that Series indexers reindex + import warnings + warnings.filterwarnings(action='ignore', category=UserWarning) + + indexer_obj = indexer_obj.reindex(self.tsframe.index[::-1]) + + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) + + warnings.filterwarnings(action='default', category=UserWarning) + + # test df[df > 0] + for df in [ self.tsframe, self.mixed_frame, self.mixed_float, self.mixed_int ]: + + data = df._get_numeric_data() + bif = df[df > 0] + bifw = DataFrame(dict([ (c,np.where(data[c] > 0, data[c], np.nan)) for c in data.columns ]), + index=data.index, columns=data.columns) + + # add back other columns to compare + for c in df.columns: + if c not in bifw: + bifw[c] = df[c] + bifw = bifw.reindex(columns = df.columns) + + assert_frame_equal(bif, bifw, check_dtype=False) + for c in df.columns: + if bif[c].dtype != bifw[c].dtype: + self.assertEqual(bif[c].dtype, df[c].dtype) + + def test_getitem_boolean_casting(self): + + # don't upcast if we don't need to + df = self.tsframe.copy() + df['E'] = 1 + df['E'] = df['E'].astype('int32') + df['E1'] = df['E'].copy() + df['F'] = 1 + df['F'] = df['F'].astype('int64') + df['F1'] = df['F'].copy() + + casted = df[df>0] + result = casted.get_dtype_counts() + expected = Series({'float64': 4, 'int32' : 2, 'int64' : 2}) + assert_series_equal(result, expected) + + # int block splitting + df.ix[1:3,['E1','F1']] = 0 + casted = df[df>0] + result = casted.get_dtype_counts() + expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1}) + assert_series_equal(result, expected) + + # where dtype conversions + # GH 3733 + df = DataFrame(data = np.random.randn(100, 50)) + df = df.where(df > 0) # create nans + bools = df > 0 + mask = isnull(df) + expected = bools.astype(float).mask(mask) + result = bools.mask(mask) + assert_frame_equal(result,expected) + + def test_getitem_boolean_list(self): + df = DataFrame(np.arange(12).reshape(3, 4)) + + def _checkit(lst): + result = df[lst] + expected = df.ix[df.index[lst]] + assert_frame_equal(result, expected) + + _checkit([True, False, True]) + _checkit([True, True, True]) + _checkit([False, False, False]) + + def test_getitem_boolean_iadd(self): + arr = randn(5, 5) + + df = DataFrame(arr.copy(), columns = ['A','B','C','D','E']) + + df[df < 0] += 1 + arr[arr < 0] += 1 + + assert_almost_equal(df.values, arr) + + def test_boolean_index_empty_corner(self): + # #2096 + blah = DataFrame(np.empty([0, 1]), columns=['A'], + index=DatetimeIndex([])) + + # both of these should succeed trivially + k = np.array([], bool) + + blah[k] + blah[k] = 0 + + def test_getitem_ix_mixed_integer(self): + df = DataFrame(np.random.randn(4, 3), + index=[1, 10, 'C', 'E'], columns=[1, 2, 3]) + + result = df.ix[:-1] + expected = df.ix[df.index[:-1]] + assert_frame_equal(result, expected) + + result = df.ix[[1, 10]] + expected = df.ix[Index([1, 10], dtype=object)] + assert_frame_equal(result, expected) + + def 
test_getitem_setitem_ix_negative_integers(self): + result = self.frame.ix[:, -1] + assert_series_equal(result, self.frame['D']) + + result = self.frame.ix[:, [-1]] + assert_frame_equal(result, self.frame[['D']]) + + result = self.frame.ix[:, [-1, -2]] + assert_frame_equal(result, self.frame[['D', 'C']]) + + self.frame.ix[:, [-1]] = 0 + self.assertTrue((self.frame['D'] == 0).all()) + + df = DataFrame(np.random.randn(8, 4)) + self.assertTrue(isnull(df.ix[:, [-1]].values).all()) + + # #1942 + a = DataFrame(randn(20, 2), index=[chr(x + 65) for x in range(20)]) + a.ix[-1] = a.ix[-2] + + assert_series_equal(a.ix[-1], a.ix[-2]) + + def test_getattr(self): + tm.assert_series_equal(self.frame.A, self.frame['A']) + self.assertRaises(AttributeError, getattr, self.frame, + 'NONEXISTENT_NAME') + + def test_setattr_column(self): + df = DataFrame({'foobar': 1}, index=lrange(10)) + + df.foobar = 5 + self.assertTrue((df.foobar == 5).all()) + + def test_setitem(self): + # not sure what else to do here + series = self.frame['A'][::2] + self.frame['col5'] = series + self.assertIn('col5', self.frame) + tm.assert_dict_equal(series, self.frame['col5'], + compare_keys=False) + + series = self.frame['A'] + self.frame['col6'] = series + tm.assert_dict_equal(series, self.frame['col6'], + compare_keys=False) + + with tm.assertRaises(KeyError): + self.frame[randn(len(self.frame) + 1)] = 1 + + # set ndarray + arr = randn(len(self.frame)) + self.frame['col9'] = arr + self.assertTrue((self.frame['col9'] == arr).all()) + + self.frame['col7'] = 5 + assert((self.frame['col7'] == 5).all()) + + self.frame['col0'] = 3.14 + assert((self.frame['col0'] == 3.14).all()) + + self.frame['col8'] = 'foo' + assert((self.frame['col8'] == 'foo').all()) + + smaller = self.frame[:2] + smaller['col10'] = ['1', '2'] + self.assertEqual(smaller['col10'].dtype, np.object_) + self.assertTrue((smaller['col10'] == ['1', '2']).all()) + + # with a dtype + for dtype in ['int32','int64','float32','float64']: + self.frame[dtype] = np.array(arr,dtype=dtype) + self.assertEqual(self.frame[dtype].dtype.name, dtype) + + # dtype changing GH4204 + df = DataFrame([[0,0]]) + df.iloc[0] = np.nan + expected = DataFrame([[np.nan,np.nan]]) + assert_frame_equal(df,expected) + + df = DataFrame([[0,0]]) + df.loc[0] = np.nan + assert_frame_equal(df,expected) + + def test_setitem_tuple(self): + self.frame['A', 'B'] = self.frame['A'] + assert_series_equal(self.frame['A', 'B'], self.frame['A']) + + def test_setitem_always_copy(self): + s = self.frame['A'].copy() + self.frame['E'] = s + + self.frame['E'][5:10] = nan + self.assertTrue(notnull(s[5:10]).all()) + + def test_setitem_boolean(self): + df = self.frame.copy() + values = self.frame.values + + df[df['A'] > 0] = 4 + values[values[:, 0] > 0] = 4 + assert_almost_equal(df.values, values) + + # test that column reindexing works + series = df['A'] == 4 + series = series.reindex(df.index[::-1]) + df[series] = 1 + values[values[:, 0] == 4] = 1 + assert_almost_equal(df.values, values) + + df[df > 0] = 5 + values[values > 0] = 5 + assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + assert_almost_equal(df.values, values) + + # indexed with same shape but rows-reversed df + df[df[::-1] == 2] = 3 + values[values == 2] = 3 + assert_almost_equal(df.values, values) + + with assertRaisesRegexp(TypeError, 'Must pass DataFrame with boolean ' + 'values 
only'): + df[df * 0] = 2 + + # index with DataFrame + mask = df > np.abs(df) + expected = df.copy() + df[df > np.abs(df)] = nan + expected.values[mask.values] = nan + assert_frame_equal(df, expected) + + # set from DataFrame + expected = df.copy() + df[df > np.abs(df)] = df * 2 + np.putmask(expected.values, mask.values, df.values * 2) + assert_frame_equal(df, expected) + + def test_setitem_cast(self): + self.frame['D'] = self.frame['D'].astype('i8') + self.assertEqual(self.frame['D'].dtype, np.int64) + + # #669, should not cast? + # this is now set to int64, which means a replacement of the column to + # the value dtype (and nothing to do with the existing dtype) + self.frame['B'] = 0 + self.assertEqual(self.frame['B'].dtype, np.int64) + + # cast if pass array of course + self.frame['B'] = np.arange(len(self.frame)) + self.assertTrue(issubclass(self.frame['B'].dtype.type, np.integer)) + + self.frame['foo'] = 'bar' + self.frame['foo'] = 0 + self.assertEqual(self.frame['foo'].dtype, np.int64) + + self.frame['foo'] = 'bar' + self.frame['foo'] = 2.5 + self.assertEqual(self.frame['foo'].dtype, np.float64) + + self.frame['something'] = 0 + self.assertEqual(self.frame['something'].dtype, np.int64) + self.frame['something'] = 2 + self.assertEqual(self.frame['something'].dtype, np.int64) + self.frame['something'] = 2.5 + self.assertEqual(self.frame['something'].dtype, np.float64) + + def test_setitem_boolean_column(self): + expected = self.frame.copy() + mask = self.frame['A'] > 0 + + self.frame.ix[mask, 'B'] = 0 + expected.values[mask.values, 1] = 0 + + assert_frame_equal(self.frame, expected) + + def test_setitem_corner(self): + # corner case + df = DataFrame({'B': [1., 2., 3.], + 'C': ['a', 'b', 'c']}, + index=np.arange(3)) + del df['B'] + df['B'] = [1., 2., 3.] + self.assertIn('B', df) + self.assertEqual(len(df.columns), 2) + + df['A'] = 'beginning' + df['E'] = 'foo' + df['D'] = 'bar' + df[datetime.now()] = 'date' + df[datetime.now()] = 5. + + # what to do when empty frame with index + dm = DataFrame(index=self.frame.index) + dm['A'] = 'foo' + dm['B'] = 'bar' + self.assertEqual(len(dm.columns), 2) + self.assertEqual(dm.values.dtype, np.object_) + + # upcast + dm['C'] = 1 + self.assertEqual(dm['C'].dtype, np.int64) + + dm['E'] = 1. 
+ self.assertEqual(dm['E'].dtype, np.float64) + + # set existing column + dm['A'] = 'bar' + self.assertEqual('bar', dm['A'][0]) + + dm = DataFrame(index=np.arange(3)) + dm['A'] = 1 + dm['foo'] = 'bar' + del dm['foo'] + dm['foo'] = 'bar' + self.assertEqual(dm['foo'].dtype, np.object_) + + dm['coercable'] = ['1', '2', '3'] + self.assertEqual(dm['coercable'].dtype, np.object_) + + def test_setitem_corner2(self): + data = {"title": ['foobar', 'bar', 'foobar'] + ['foobar'] * 17, + "cruft": np.random.random(20)} + + df = DataFrame(data) + ix = df[df['title'] == 'bar'].index + + df.ix[ix, ['title']] = 'foobar' + df.ix[ix, ['cruft']] = 0 + + assert(df.ix[1, 'title'] == 'foobar') + assert(df.ix[1, 'cruft'] == 0) + + def test_setitem_ambig(self): + # difficulties with mixed-type data + from decimal import Decimal + + # created as float type + dm = DataFrame(index=lrange(3), columns=lrange(3)) + + coercable_series = Series([Decimal(1) for _ in range(3)], + index=lrange(3)) + uncoercable_series = Series(['foo', 'bzr', 'baz'], index=lrange(3)) + + dm[0] = np.ones(3) + self.assertEqual(len(dm.columns), 3) + # self.assertIsNone(dm.objects) + + dm[1] = coercable_series + self.assertEqual(len(dm.columns), 3) + # self.assertIsNone(dm.objects) + + dm[2] = uncoercable_series + self.assertEqual(len(dm.columns), 3) + # self.assertIsNotNone(dm.objects) + self.assertEqual(dm[2].dtype, np.object_) + + def test_setitem_clear_caches(self): + # GH #304 + df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1], 'y': [5.1, 6.1, 7.1, 8.1]}, + index=[0, 1, 2, 3]) + df.insert(2, 'z', np.nan) + + # cache it + foo = df['z'] + + df.ix[2:, 'z'] = 42 + + expected = Series([np.nan, np.nan, 42, 42], index=df.index) + self.assertIsNot(df['z'], foo) + assert_series_equal(df['z'], expected) + + def test_setitem_None(self): + # GH #766 + self.frame[None] = self.frame['A'] + assert_series_equal(self.frame.iloc[:,-1], self.frame['A']) + assert_series_equal(self.frame.loc[:,None], self.frame['A']) + assert_series_equal(self.frame[None], self.frame['A']) + repr(self.frame) + + def test_delitem_corner(self): + f = self.frame.copy() + del f['D'] + self.assertEqual(len(f.columns), 3) + self.assertRaises(KeyError, f.__delitem__, 'D') + del f['B'] + self.assertEqual(len(f.columns), 2) + + def test_getitem_fancy_2d(self): + f = self.frame + ix = f.ix + + assert_frame_equal(ix[:, ['B', 'A']], f.reindex(columns=['B', 'A'])) + + subidx = self.frame.index[[5, 4, 1]] + assert_frame_equal(ix[subidx, ['B', 'A']], + f.reindex(index=subidx, columns=['B', 'A'])) + + # slicing rows, etc. + assert_frame_equal(ix[5:10], f[5:10]) + assert_frame_equal(ix[5:10, :], f[5:10]) + assert_frame_equal(ix[:5, ['A', 'B']], + f.reindex(index=f.index[:5], columns=['A', 'B'])) + + # slice rows with labels, inclusive! 
+ expected = ix[5:11] + result = ix[f.index[5]:f.index[10]] + assert_frame_equal(expected, result) + + # slice columns + assert_frame_equal(ix[:, :2], f.reindex(columns=['A', 'B'])) + + # get view + exp = f.copy() + ix[5:10].values[:] = 5 + exp.values[5:10] = 5 + assert_frame_equal(f, exp) + + self.assertRaises(ValueError, ix.__getitem__, f > 0.5) + + def test_slice_floats(self): + index = [52195.504153, 52196.303147, 52198.369883] + df = DataFrame(np.random.rand(3, 2), index=index) + + s1 = df.ix[52195.1:52196.5] + self.assertEqual(len(s1), 2) + + s1 = df.ix[52195.1:52196.6] + self.assertEqual(len(s1), 2) + + s1 = df.ix[52195.1:52198.9] + self.assertEqual(len(s1), 3) + + def test_getitem_fancy_slice_integers_step(self): + df = DataFrame(np.random.randn(10, 5)) + + # this is OK + result = df.ix[:8:2] + df.ix[:8:2] = np.nan + self.assertTrue(isnull(df.ix[:8:2]).values.all()) + + def test_getitem_setitem_integer_slice_keyerrors(self): + df = DataFrame(np.random.randn(10, 5), index=lrange(0, 20, 2)) + + # this is OK + cp = df.copy() + cp.ix[4:10] = 0 + self.assertTrue((cp.ix[4:10] == 0).values.all()) + + # so is this + cp = df.copy() + cp.ix[3:11] = 0 + self.assertTrue((cp.ix[3:11] == 0).values.all()) + + result = df.ix[4:10] + result2 = df.ix[3:11] + expected = df.reindex([4, 6, 8, 10]) + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + # non-monotonic, raise KeyError + df2 = df[::-1] + self.assertRaises(KeyError, df2.ix.__getitem__, slice(3, 11)) + self.assertRaises(KeyError, df2.ix.__setitem__, slice(3, 11), 0) + + def test_setitem_fancy_2d(self): + f = self.frame + ix = f.ix + + # case 1 + frame = self.frame.copy() + expected = frame.copy() + frame.ix[:, ['B', 'A']] = 1 + expected['B'] = 1. + expected['A'] = 1. + assert_frame_equal(frame, expected) + + # case 2 + frame = self.frame.copy() + frame2 = self.frame.copy() + + expected = frame.copy() + + subidx = self.frame.index[[5, 4, 1]] + values = randn(3, 2) + + frame.ix[subidx, ['B', 'A']] = values + frame2.ix[[5, 4, 1], ['B', 'A']] = values + + expected['B'].ix[subidx] = values[:, 0] + expected['A'].ix[subidx] = values[:, 1] + + assert_frame_equal(frame, expected) + assert_frame_equal(frame2, expected) + + # case 3: slicing rows, etc. + frame = self.frame.copy() + + expected1 = self.frame.copy() + frame.ix[5:10] = 1. + expected1.values[5:10] = 1. + assert_frame_equal(frame, expected1) + + expected2 = self.frame.copy() + arr = randn(5, len(frame.columns)) + frame.ix[5:10] = arr + expected2.values[5:10] = arr + assert_frame_equal(frame, expected2) + + # case 4 + frame = self.frame.copy() + frame.ix[5:10, :] = 1. + assert_frame_equal(frame, expected1) + frame.ix[5:10, :] = arr + assert_frame_equal(frame, expected2) + + # case 5 + frame = self.frame.copy() + frame2 = self.frame.copy() + + expected = self.frame.copy() + values = randn(5, 2) + + frame.ix[:5, ['A', 'B']] = values + expected['A'][:5] = values[:, 0] + expected['B'][:5] = values[:, 1] + assert_frame_equal(frame, expected) + + frame2.ix[:5, [0, 1]] = values + assert_frame_equal(frame2, expected) + + # case 6: slice rows with labels, inclusive! + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[frame.index[5]:frame.index[10]] = 5. + expected.values[5:11] = 5 + assert_frame_equal(frame, expected) + + # case 7: slice columns + frame = self.frame.copy() + frame2 = self.frame.copy() + expected = self.frame.copy() + + # slice indices + frame.ix[:, 1:3] = 4. + expected.values[:, 1:3] = 4. 
+ assert_frame_equal(frame, expected) + + # slice with labels + frame.ix[:, 'B':'C'] = 4. + assert_frame_equal(frame, expected) + + # new corner case of boolean slicing / setting + frame = DataFrame(lzip([2, 3, 9, 6, 7], [np.nan] * 5), + columns=['a', 'b']) + lst = [100] + lst.extend([np.nan] * 4) + expected = DataFrame(lzip([100, 3, 9, 6, 7], lst), + columns=['a', 'b']) + frame[frame['a'] == 2] = 100 + assert_frame_equal(frame, expected) + + def test_fancy_getitem_slice_mixed(self): + sliced = self.mixed_frame.ix[:, -3:] + self.assertEqual(sliced['D'].dtype, np.float64) + + # get view with single block + sliced = self.frame.ix[:, -3:] + sliced['C'] = 4. + self.assertTrue((self.frame['C'] == 4).all()) + + def test_fancy_setitem_int_labels(self): + # integer index defers to label-based indexing + + df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) + + tmp = df.copy() + exp = df.copy() + tmp.ix[[0, 2, 4]] = 5 + exp.values[:3] = 5 + assert_frame_equal(tmp, exp) + + tmp = df.copy() + exp = df.copy() + tmp.ix[6] = 5 + exp.values[3] = 5 + assert_frame_equal(tmp, exp) + + tmp = df.copy() + exp = df.copy() + tmp.ix[:, 2] = 5 + exp.values[:, 2] = 5 + assert_frame_equal(tmp, exp) + + def test_fancy_getitem_int_labels(self): + df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) + + result = df.ix[[4, 2, 0], [2, 0]] + expected = df.reindex(index=[4, 2, 0], columns=[2, 0]) + assert_frame_equal(result, expected) + + result = df.ix[[4, 2, 0]] + expected = df.reindex(index=[4, 2, 0]) + assert_frame_equal(result, expected) + + result = df.ix[4] + expected = df.xs(4) + assert_series_equal(result, expected) + + result = df.ix[:, 3] + expected = df[3] + assert_series_equal(result, expected) + + def test_fancy_index_int_labels_exceptions(self): + df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) + + # labels that aren't contained + self.assertRaises(KeyError, df.ix.__setitem__, + ([0, 1, 2], [2, 3, 4]), 5) + + # try to set indices not contained in frame + self.assertRaises(KeyError, + self.frame.ix.__setitem__, + ['foo', 'bar', 'baz'], 1) + self.assertRaises(KeyError, + self.frame.ix.__setitem__, + (slice(None, None), ['E']), 1) + + # partial setting now allows this GH2578 + #self.assertRaises(KeyError, + # self.frame.ix.__setitem__, + # (slice(None, None), 'E'), 1) + + def test_setitem_fancy_mixed_2d(self): + self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5 + result = self.mixed_frame.ix[:5, ['C', 'B', 'A']] + self.assertTrue((result.values == 5).all()) + + self.mixed_frame.ix[5] = np.nan + self.assertTrue(isnull(self.mixed_frame.ix[5]).all()) + + self.mixed_frame.ix[5] = self.mixed_frame.ix[6] + assert_series_equal(self.mixed_frame.ix[5], self.mixed_frame.ix[6]) + + # #1432 + df = DataFrame({1: [1., 2., 3.], + 2: [3, 4, 5]}) + self.assertTrue(df._is_mixed_type) + + df.ix[1] = [5, 10] + + expected = DataFrame({1: [1., 5., 3.], + 2: [3, 10, 5]}) + + assert_frame_equal(df, expected) + + def test_ix_align(self): + b = Series(randn(10)) + b.sort() + df_orig = DataFrame(randn(10, 4)) + df = df_orig.copy() + + df.ix[:, 0] = b + assert_series_equal(df.ix[:, 0].reindex(b.index), b) + + dft = df_orig.T + dft.ix[0, :] = b + assert_series_equal(dft.ix[0, :].reindex(b.index), b) + + df = df_orig.copy() + df.ix[:5, 0] = b + s = df.ix[:5, 0] + assert_series_equal(s, b.reindex(s.index)) + + dft = df_orig.T + dft.ix[0, :5] = b + s = dft.ix[0, :5] + assert_series_equal(s, b.reindex(s.index)) + + df = df_orig.copy() + idx = [0, 1, 3, 5] + df.ix[idx, 0] = b + s = df.ix[idx, 0] + 
assert_series_equal(s, b.reindex(s.index)) + + dft = df_orig.T + dft.ix[0, idx] = b + s = dft.ix[0, idx] + assert_series_equal(s, b.reindex(s.index)) + + def test_ix_frame_align(self): + b = DataFrame(np.random.randn(3, 4)) + df_orig = DataFrame(randn(10, 4)) + df = df_orig.copy() + + df.ix[:3] = b + out = b.ix[:3] + assert_frame_equal(out, b) + + b.sort_index(inplace=True) + + df = df_orig.copy() + df.ix[[0, 1, 2]] = b + out = df.ix[[0, 1, 2]].reindex(b.index) + assert_frame_equal(out, b) + + df = df_orig.copy() + df.ix[:3] = b + out = df.ix[:3] + assert_frame_equal(out, b.reindex(out.index)) + + def test_getitem_setitem_non_ix_labels(self): + df = tm.makeTimeDataFrame() + + start, end = df.index[[5, 10]] + + result = df.ix[start:end] + result2 = df[start:end] + expected = df[5:11] + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + result = df.copy() + result.ix[start:end] = 0 + result2 = df.copy() + result2[start:end] = 0 + expected = df.copy() + expected[5:11] = 0 + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_ix_multi_take(self): + df = DataFrame(np.random.randn(3, 2)) + rs = df.ix[df.index == 0, :] + xp = df.reindex([0]) + assert_frame_equal(rs, xp) + + """ #1321 + df = DataFrame(np.random.randn(3, 2)) + rs = df.ix[df.index==0, df.columns==1] + xp = df.reindex([0], [1]) + assert_frame_equal(rs, xp) + """ + + def test_ix_multi_take_nonint_index(self): + df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], + columns=['a', 'b']) + rs = df.ix[[0], [0]] + xp = df.reindex(['x'], columns=['a']) + assert_frame_equal(rs, xp) + + def test_ix_multi_take_multiindex(self): + df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], + columns=[['a', 'b'], ['1', '2']]) + rs = df.ix[[0], [0]] + xp = df.reindex(['x'], columns=[('a', '1')]) + assert_frame_equal(rs, xp) + + def test_ix_dup(self): + idx = Index(['a', 'a', 'b', 'c', 'd', 'd']) + df = DataFrame(np.random.randn(len(idx), 3), idx) + + sub = df.ix[:'d'] + assert_frame_equal(sub, df) + + sub = df.ix['a':'c'] + assert_frame_equal(sub, df.ix[0:4]) + + sub = df.ix['b':'d'] + assert_frame_equal(sub, df.ix[2:]) + + def test_getitem_fancy_1d(self): + f = self.frame + ix = f.ix + + # return self if no slicing...for now + self.assertIs(ix[:, :], f) + + # low dimensional slice + xs1 = ix[2, ['C', 'B', 'A']] + xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A']) + assert_series_equal(xs1, xs2) + + ts1 = ix[5:10, 2] + ts2 = f[f.columns[2]][5:10] + assert_series_equal(ts1, ts2) + + # positional xs + xs1 = ix[0] + xs2 = f.xs(f.index[0]) + assert_series_equal(xs1, xs2) + + xs1 = ix[f.index[5]] + xs2 = f.xs(f.index[5]) + assert_series_equal(xs1, xs2) + + # single column + assert_series_equal(ix[:, 'A'], f['A']) + + # return view + exp = f.copy() + exp.values[5] = 4 + ix[5][:] = 4 + assert_frame_equal(exp, f) + + exp.values[:, 1] = 6 + ix[:, 1][:] = 6 + assert_frame_equal(exp, f) + + # slice of mixed-frame + xs = self.mixed_frame.ix[5] + exp = self.mixed_frame.xs(self.mixed_frame.index[5]) + assert_series_equal(xs, exp) + + def test_setitem_fancy_1d(self): + + # case 1: set cross-section for indices + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[2, ['C', 'B', 'A']] = [1., 2., 3.] + expected['C'][2] = 1. + expected['B'][2] = 2. + expected['A'][2] = 3. + assert_frame_equal(frame, expected) + + frame2 = self.frame.copy() + frame2.ix[2, [3, 2, 1]] = [1., 2., 3.] 
+ assert_frame_equal(frame, expected) + + # case 2, set a section of a column + frame = self.frame.copy() + expected = self.frame.copy() + + vals = randn(5) + expected.values[5:10, 2] = vals + frame.ix[5:10, 2] = vals + assert_frame_equal(frame, expected) + + frame2 = self.frame.copy() + frame2.ix[5:10, 'B'] = vals + assert_frame_equal(frame, expected) + + # case 3: full xs + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[4] = 5. + expected.values[4] = 5. + assert_frame_equal(frame, expected) + + frame.ix[frame.index[4]] = 6. + expected.values[4] = 6. + assert_frame_equal(frame, expected) + + # single column + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[:, 'A'] = 7. + expected['A'] = 7. + assert_frame_equal(frame, expected) + + def test_getitem_fancy_scalar(self): + f = self.frame + ix = f.ix + # individual value + for col in f.columns: + ts = f[col] + for idx in f.index[::5]: + assert_almost_equal(ix[idx, col], ts[idx]) + + def test_setitem_fancy_scalar(self): + f = self.frame + expected = self.frame.copy() + ix = f.ix + # individual value + for j, col in enumerate(f.columns): + ts = f[col] + for idx in f.index[::5]: + i = f.index.get_loc(idx) + val = randn() + expected.values[i, j] = val + ix[idx, col] = val + assert_frame_equal(f, expected) + + def test_getitem_fancy_boolean(self): + f = self.frame + ix = f.ix + + expected = f.reindex(columns=['B', 'D']) + result = ix[:, [False, True, False, True]] + assert_frame_equal(result, expected) + + expected = f.reindex(index=f.index[5:10], columns=['B', 'D']) + result = ix[5:10, [False, True, False, True]] + assert_frame_equal(result, expected) + + boolvec = f.index > f.index[7] + expected = f.reindex(index=f.index[boolvec]) + result = ix[boolvec] + assert_frame_equal(result, expected) + result = ix[boolvec, :] + assert_frame_equal(result, expected) + + result = ix[boolvec, 2:] + expected = f.reindex(index=f.index[boolvec], + columns=['C', 'D']) + assert_frame_equal(result, expected) + + def test_setitem_fancy_boolean(self): + # from 2d, set with booleans + frame = self.frame.copy() + expected = self.frame.copy() + + mask = frame['A'] > 0 + frame.ix[mask] = 0. + expected.values[mask.values] = 0. + assert_frame_equal(frame, expected) + + frame = self.frame.copy() + expected = self.frame.copy() + frame.ix[mask, ['A', 'B']] = 0. + expected.values[mask.values, :2] = 0. 
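+        # Only the masked rows of columns 'A' and 'B' are overwritten here; the
+        # remaining columns keep their values.  Because 'A' and 'B' are the
+        # first two columns of this homogeneous float frame, the same update is
+        # expressed positionally on the underlying array for the expected frame:
+        #   frame.ix[mask, ['A', 'B']] = 0.        # label-based, column subset
+        #   expected.values[mask.values, :2] = 0.  # positional equivalent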
+ assert_frame_equal(frame, expected) + + def test_getitem_fancy_ints(self): + result = self.frame.ix[[1, 4, 7]] + expected = self.frame.ix[self.frame.index[[1, 4, 7]]] + assert_frame_equal(result, expected) + + result = self.frame.ix[:, [2, 0, 1]] + expected = self.frame.ix[:, self.frame.columns[[2, 0, 1]]] + assert_frame_equal(result, expected) + + def test_getitem_setitem_fancy_exceptions(self): + ix = self.frame.ix + with assertRaisesRegexp(IndexingError, 'Too many indexers'): + ix[:, :, :] + with assertRaisesRegexp(IndexingError, 'only tuples of length <= 2 ' + 'supported'): + ix[:, :, :] = 1 + + def test_getitem_setitem_boolean_misaligned(self): + # boolean index misaligned labels + mask = self.frame['A'][::-1] > 1 + + result = self.frame.ix[mask] + expected = self.frame.ix[mask[::-1]] + assert_frame_equal(result, expected) + + cp = self.frame.copy() + expected = self.frame.copy() + cp.ix[mask] = 0 + expected.ix[mask] = 0 + assert_frame_equal(cp, expected) + + def test_getitem_setitem_boolean_multi(self): + df = DataFrame(np.random.randn(3, 2)) + + # get + k1 = np.array([True, False, True]) + k2 = np.array([False, True]) + result = df.ix[k1, k2] + expected = df.ix[[0, 2], [1]] + assert_frame_equal(result, expected) + + expected = df.copy() + df.ix[np.array([True, False, True]), + np.array([False, True])] = 5 + expected.ix[[0, 2], [1]] = 5 + assert_frame_equal(df, expected) + + def test_getitem_setitem_float_labels(self): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(np.random.randn(5, 5), index=index) + + result = df.ix[1.5:4] + expected = df.reindex([1.5, 2, 3, 4]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 4) + + result = df.ix[4:5] + expected = df.reindex([4, 5]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 2) + + # loc_float changes this to work properly + result = df.ix[1:2] + expected = df.iloc[0:2] + assert_frame_equal(result, expected) + + df.ix[1:2] = 0 + result = df[1:2] + self.assertTrue((result==0).all().all()) + + # #2727 + index = Index([1.0, 2.5, 3.5, 4.5, 5.0]) + df = DataFrame(np.random.randn(5, 5), index=index) + + # positional slicing only via iloc! 
+ with tm.assert_produces_warning(FutureWarning): + result = df.iloc[1.0:5] + + expected = df.reindex([2.5, 3.5, 4.5, 5.0]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 4) + + result = df.iloc[4:5] + expected = df.reindex([5.0]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 1) + + # GH 4892, float indexers in iloc are deprecated + import warnings + warnings.filterwarnings(action='error', category=FutureWarning) + + cp = df.copy() + def f(): + cp.iloc[1.0:5] = 0 + self.assertRaises(FutureWarning, f) + def f(): + result = cp.iloc[1.0:5] == 0 + self.assertRaises(FutureWarning, f) + self.assertTrue(result.values.all()) + self.assertTrue((cp.iloc[0:1] == df.iloc[0:1]).values.all()) + + warnings.filterwarnings(action='ignore', category=FutureWarning) + + cp = df.copy() + cp.iloc[4:5] = 0 + self.assertTrue((cp.iloc[4:5] == 0).values.all()) + self.assertTrue((cp.iloc[0:4] == df.iloc[0:4]).values.all()) + + # float slicing + result = df.ix[1.0:5] + expected = df + assert_frame_equal(result, expected) + self.assertEqual(len(result), 5) + + result = df.ix[1.1:5] + expected = df.reindex([2.5, 3.5, 4.5, 5.0]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 4) + + result = df.ix[4.51:5] + expected = df.reindex([5.0]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 1) + + result = df.ix[1.0:5.0] + expected = df.reindex([1.0, 2.5, 3.5, 4.5, 5.0]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 5) + + cp = df.copy() + cp.ix[1.0:5.0] = 0 + result = cp.ix[1.0:5.0] + self.assertTrue((result == 0).values.all()) + + def test_setitem_single_column_mixed(self): + df = DataFrame(randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], + columns=['foo', 'bar', 'baz']) + df['str'] = 'qux' + df.ix[::2, 'str'] = nan + expected = [nan, 'qux', nan, 'qux', nan] + assert_almost_equal(df['str'].values, expected) + + def test_setitem_single_column_mixed_datetime(self): + df = DataFrame(randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], + columns=['foo', 'bar', 'baz']) + + df['timestamp'] = Timestamp('20010102') + + # check our dtypes + result = df.get_dtype_counts() + expected = Series({'float64': 3, 'datetime64[ns]': 1}) + assert_series_equal(result, expected) + + # set an allowable datetime64 type + from pandas import tslib + df.ix['b', 'timestamp'] = tslib.iNaT + self.assertTrue(com.isnull(df.ix['b', 'timestamp'])) + + # allow this syntax + df.ix['c', 'timestamp'] = nan + self.assertTrue(com.isnull(df.ix['c', 'timestamp'])) + + # allow this syntax + df.ix['d', :] = nan + self.assertTrue(com.isnull(df.ix['c', :]).all() == False) + + # as of GH 3216 this will now work! 
+ # try to set with a list like item + #self.assertRaises( + # Exception, df.ix.__setitem__, ('d', 'timestamp'), [nan]) + + def test_setitem_frame(self): + piece = self.frame.ix[:2, ['A', 'B']] + self.frame.ix[-2:, ['A', 'B']] = piece.values + assert_almost_equal(self.frame.ix[-2:, ['A', 'B']].values, + piece.values) + + # GH 3216 + + # already aligned + f = self.mixed_frame.copy() + piece = DataFrame([[ 1, 2], [3, 4]], index=f.index[0:2],columns=['A', 'B']) + key = (slice(None,2), ['A', 'B']) + f.ix[key] = piece + assert_almost_equal(f.ix[0:2, ['A', 'B']].values, + piece.values) + + # rows unaligned + f = self.mixed_frame.copy() + piece = DataFrame([[ 1, 2 ], [3, 4], [5, 6], [7, 8]], index=list(f.index[0:2]) + ['foo','bar'],columns=['A', 'B']) + key = (slice(None,2), ['A', 'B']) + f.ix[key] = piece + assert_almost_equal(f.ix[0:2:, ['A', 'B']].values, + piece.values[0:2]) + + # key is unaligned with values + f = self.mixed_frame.copy() + piece = f.ix[:2, ['A']] + key = (slice(-2, None), ['A', 'B']) + f.ix[key] = piece + piece['B'] = np.nan + assert_almost_equal(f.ix[-2:, ['A', 'B']].values, + piece.values) + + # ndarray + f = self.mixed_frame.copy() + piece = self.mixed_frame.ix[:2, ['A', 'B']] + key = (slice(-2, None), ['A', 'B']) + f.ix[key] = piece.values + assert_almost_equal(f.ix[-2:, ['A', 'B']].values, + piece.values) + + + # needs upcasting + df = DataFrame([[1,2,'foo'],[3,4,'bar']],columns=['A','B','C']) + df2 = df.copy() + df2.ix[:,['A','B']] = df.ix[:,['A','B']]+0.5 + expected = df.reindex(columns=['A','B']) + expected += 0.5 + expected['C'] = df['C'] + assert_frame_equal(df2, expected) + + def test_setitem_frame_align(self): + piece = self.frame.ix[:2, ['A', 'B']] + piece.index = self.frame.index[-2:] + piece.columns = ['A', 'B'] + self.frame.ix[-2:, ['A', 'B']] = piece + assert_almost_equal(self.frame.ix[-2:, ['A', 'B']].values, + piece.values) + + def test_setitem_fancy_exceptions(self): + pass + + def test_getitem_boolean_missing(self): + pass + + def test_setitem_boolean_missing(self): + pass + + def test_getitem_setitem_ix_duplicates(self): + # #1201 + df = DataFrame(np.random.randn(5, 3), + index=['foo', 'foo', 'bar', 'baz', 'bar']) + + result = df.ix['foo'] + expected = df[:2] + assert_frame_equal(result, expected) + + result = df.ix['bar'] + expected = df.ix[[2, 4]] + assert_frame_equal(result, expected) + + result = df.ix['baz'] + expected = df.ix[3] + assert_series_equal(result, expected) + + def test_getitem_ix_boolean_duplicates_multiple(self): + # #1201 + df = DataFrame(np.random.randn(5, 3), + index=['foo', 'foo', 'bar', 'baz', 'bar']) + + result = df.ix[['bar']] + exp = df.ix[[2, 4]] + assert_frame_equal(result, exp) + + result = df.ix[df[1] > 0] + exp = df[df[1] > 0] + assert_frame_equal(result, exp) + + result = df.ix[df[0] > 0] + exp = df[df[0] > 0] + assert_frame_equal(result, exp) + + def test_getitem_setitem_ix_bool_keyerror(self): + # #2199 + df = DataFrame({'a': [1, 2, 3]}) + + self.assertRaises(KeyError, df.ix.__getitem__, False) + self.assertRaises(KeyError, df.ix.__getitem__, True) + + self.assertRaises(KeyError, df.ix.__setitem__, False, 0) + self.assertRaises(KeyError, df.ix.__setitem__, True, 0) + + def test_getitem_list_duplicates(self): + # #1943 + df = DataFrame(np.random.randn(4, 4), columns=list('AABC')) + df.columns.name = 'foo' + + result = df[['B', 'C']] + self.assertEqual(result.columns.name, 'foo') + + expected = df.ix[:, 2:] + assert_frame_equal(result, expected) + + def test_get_value(self): + for idx in self.frame.index: + for col in 
self.frame.columns: + result = self.frame.get_value(idx, col) + expected = self.frame[col][idx] + assert_almost_equal(result, expected) + + def test_iteritems(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) + for k, v in compat.iteritems(df): + self.assertEqual(type(v), Series) + + def test_lookup(self): + def alt(df, rows, cols): + result = [] + for r, c in zip(rows, cols): + result.append(df.get_value(r, c)) + return result + + def testit(df): + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + result = df.lookup(rows, cols) + expected = alt(df, rows, cols) + assert_almost_equal(result, expected) + + testit(self.mixed_frame) + testit(self.frame) + + df = DataFrame({'label': ['a', 'b', 'a', 'c'], + 'mask_a': [True, True, False, True], + 'mask_b': [True, False, False, False], + 'mask_c': [False, True, False, True]}) + df['mask'] = df.lookup(df.index, 'mask_' + df['label']) + exp_mask = alt(df, df.index, 'mask_' + df['label']) + assert_almost_equal(df['mask'], exp_mask) + self.assertEqual(df['mask'].dtype, np.bool_) + + with tm.assertRaises(KeyError): + self.frame.lookup(['xyz'], ['A']) + + with tm.assertRaises(KeyError): + self.frame.lookup([self.frame.index[0]], ['xyz']) + + with tm.assertRaisesRegexp(ValueError, 'same size'): + self.frame.lookup(['a', 'b', 'c'], ['a']) + + def test_set_value(self): + for idx in self.frame.index: + for col in self.frame.columns: + self.frame.set_value(idx, col, 1) + assert_almost_equal(self.frame[col][idx], 1) + + def test_set_value_resize(self): + + res = self.frame.set_value('foobar', 'B', 0) + self.assertIs(res, self.frame) + self.assertEqual(res.index[-1], 'foobar') + self.assertEqual(res.get_value('foobar', 'B'), 0) + + self.frame.loc['foobar','qux'] = 0 + self.assertEqual(self.frame.get_value('foobar', 'qux'), 0) + + res = self.frame.copy() + res3 = res.set_value('foobar', 'baz', 'sam') + self.assertEqual(res3['baz'].dtype, np.object_) + + res = self.frame.copy() + res3 = res.set_value('foobar', 'baz', True) + self.assertEqual(res3['baz'].dtype, np.object_) + + res = self.frame.copy() + res3 = res.set_value('foobar', 'baz', 5) + self.assertTrue(com.is_float_dtype(res3['baz'])) + self.assertTrue(isnull(res3['baz'].drop(['foobar'])).all()) + self.assertRaises(ValueError, res3.set_value, 'foobar', 'baz', 'sam') + + def test_set_value_with_index_dtype_change(self): + df_orig = DataFrame(randn(3, 3), index=lrange(3), columns=list('ABC')) + + # this is actually ambiguous as the 2 is interpreted as a positional + # so column is not created + df = df_orig.copy() + df.set_value('C', 2, 1.0) + self.assertEqual(list(df.index), list(df_orig.index) + ['C']) + #self.assertEqual(list(df.columns), list(df_orig.columns) + [2]) + + df = df_orig.copy() + df.loc['C', 2] = 1.0 + self.assertEqual(list(df.index), list(df_orig.index) + ['C']) + #self.assertEqual(list(df.columns), list(df_orig.columns) + [2]) + + # create both new + df = df_orig.copy() + df.set_value('C', 'D', 1.0) + self.assertEqual(list(df.index), list(df_orig.index) + ['C']) + self.assertEqual(list(df.columns), list(df_orig.columns) + ['D']) + + df = df_orig.copy() + df.loc['C', 'D'] = 1.0 + self.assertEqual(list(df.index), list(df_orig.index) + ['C']) + self.assertEqual(list(df.columns), list(df_orig.columns) + ['D']) + + def test_get_set_value_no_partial_indexing(self): + # partial w/ MultiIndex raise exception + index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)]) + df = DataFrame(index=index, columns=lrange(4)) + 
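+        # get_value needs a complete index key: passing only the first level
+        # (0) of this MultiIndex is a partial lookup and raises KeyError, as
+        # asserted below.  A full tuple key, e.g. df.get_value((0, 1), 0),
+        # would be the non-partial form (illustrative only, not exercised here).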
self.assertRaises(KeyError, df.get_value, 0, 1) + # self.assertRaises(KeyError, df.set_value, 0, 1, 0) + + def test_single_element_ix_dont_upcast(self): + self.frame['E'] = 1 + self.assertTrue(issubclass(self.frame['E'].dtype.type, + (int, np.integer))) + + result = self.frame.ix[self.frame.index[5], 'E'] + self.assertTrue(com.is_integer(result)) + + def test_irow(self): + df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2)) + + result = df.irow(1) + exp = df.ix[2] + assert_series_equal(result, exp) + + result = df.irow(2) + exp = df.ix[4] + assert_series_equal(result, exp) + + # slice + result = df.irow(slice(4, 8)) + expected = df.ix[8:14] + assert_frame_equal(result, expected) + + # verify slice is view + result[2] = 0. + exp_col = df[2].copy() + exp_col[4:8] = 0. + assert_series_equal(df[2], exp_col) + + # list of integers + result = df.irow([1, 2, 4, 6]) + expected = df.reindex(df.index[[1, 2, 4, 6]]) + assert_frame_equal(result, expected) + + def test_icol(self): + df = DataFrame(np.random.randn(4, 10), columns=lrange(0, 20, 2)) + + result = df.icol(1) + exp = df.ix[:, 2] + assert_series_equal(result, exp) + + result = df.icol(2) + exp = df.ix[:, 4] + assert_series_equal(result, exp) + + # slice + result = df.icol(slice(4, 8)) + expected = df.ix[:, 8:14] + assert_frame_equal(result, expected) + + # verify slice is view + result[8] = 0. + self.assertTrue((df[8] == 0).all()) + + # list of integers + result = df.icol([1, 2, 4, 6]) + expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) + assert_frame_equal(result, expected) + + def test_irow_icol_duplicates(self): + df = DataFrame(np.random.rand(3, 3), columns=list('ABC'), + index=list('aab')) + + result = df.irow(0) + result2 = df.ix[0] + tm.assert_isinstance(result, Series) + assert_almost_equal(result.values, df.values[0]) + assert_series_equal(result, result2) + + result = df.T.icol(0) + result2 = df.T.ix[:, 0] + tm.assert_isinstance(result, Series) + assert_almost_equal(result.values, df.values[0]) + assert_series_equal(result, result2) + + # multiindex + df = DataFrame(np.random.randn(3, 3), columns=[['i', 'i', 'j'], + ['A', 'A', 'B']], + index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + rs = df.irow(0) + xp = df.ix[0] + assert_series_equal(rs, xp) + + rs = df.icol(0) + xp = df.T.ix[0] + assert_series_equal(rs, xp) + + rs = df.icol([0]) + xp = df.ix[:, [0]] + assert_frame_equal(rs, xp) + + # #2259 + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1, 1, 2]) + result = df.icol([0]) + expected = df.take([0], axis=1) + assert_frame_equal(result, expected) + + def test_icol_sparse_propegate_fill_value(self): + from pandas.sparse.api import SparseDataFrame + df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999) + self.assertTrue(len(df['A'].sp_values) == len(df.icol(0).sp_values)) + + def test_iget_value(self): + for i, row in enumerate(self.frame.index): + for j, col in enumerate(self.frame.columns): + result = self.frame.iget_value(i, j) + expected = self.frame.get_value(row, col) + assert_almost_equal(result, expected) + + def test_nested_exception(self): + # Ignore the strange way of triggering the problem + # (which may get fixed), it's just a way to trigger + # the issue or reraising an outer exception without + # a named argument + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, + 9]}).set_index(["a", "b"]) + l = list(df.index) + l[0] = ["a", "b"] + df.index = l + + try: + repr(df) + except Exception as e: + self.assertNotEqual(type(e), UnboundLocalError) + + def test_reverse_reindex_ffill_raises(self): 
+ dr = pd.date_range('2013-08-01', periods=6, freq='B') + data = np.random.randn(6,1) + df = pd.DataFrame(data, index=dr, columns=list('A')) + df['A'][3] = np.nan + df_rev = pd.DataFrame(data, index=dr[::-1], columns=list('A')) + # Reverse index is not 'monotonic' + self.assertRaises(ValueError, df_rev.reindex, df.index, method='pad') + self.assertRaises(ValueError, df_rev.reindex, df.index, method='ffill') + self.assertRaises(ValueError, df_rev.reindex, df.index, method='bfill') + + def test_reversed_reindex_ffill_raises(self): + dr = pd.date_range('2013-08-01', periods=6, freq='B') + data = np.random.randn(6,1) + df = pd.DataFrame(data, index=dr, columns=list('A')) + df['A'][3] = np.nan + df = pd.DataFrame(data, index=dr, columns=list('A')) + # Reversed reindex is not 'monotonic' + self.assertRaises(ValueError, df.reindex, dr[::-1], method='pad') + self.assertRaises(ValueError, df.reindex, dr[::-1], method='ffill') + self.assertRaises(ValueError, df.reindex, dr[::-1], method='bfill') + + def test_getitem_ix_float_duplicates(self): + df = pd.DataFrame(np.random.randn(3, 3), + index=[0.1, 0.2, 0.2], columns=list('abc')) + expect = df.iloc[1:] + tm.assert_frame_equal(df.loc[0.2], expect) + tm.assert_frame_equal(df.ix[0.2], expect) + + expect = df.iloc[1:, 0] + tm.assert_series_equal(df.loc[0.2, 'a'], expect) + + df.index = [1, 0.2, 0.2] + expect = df.iloc[1:] + tm.assert_frame_equal(df.loc[0.2], expect) + tm.assert_frame_equal(df.ix[0.2], expect) + + expect = df.iloc[1:, 0] + tm.assert_series_equal(df.loc[0.2, 'a'], expect) + + df = pd.DataFrame(np.random.randn(4, 3), + index=[1, 0.2, 0.2, 1], columns=list('abc')) + expect = df.iloc[1:-1] + tm.assert_frame_equal(df.loc[0.2], expect) + tm.assert_frame_equal(df.ix[0.2], expect) + + expect = df.iloc[1:-1, 0] + tm.assert_series_equal(df.loc[0.2, 'a'], expect) + + df.index = [0.1, 0.2, 2, 0.2] + expect = df.iloc[[1, -1]] + tm.assert_frame_equal(df.loc[0.2], expect) + tm.assert_frame_equal(df.ix[0.2], expect) + + expect = df.iloc[[1, -1], 0] + tm.assert_series_equal(df.loc[0.2, 'a'], expect) + + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() + +_frame = DataFrame(_seriesd) +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) +_intframe = DataFrame(dict((k, v.astype(int)) + for k, v in compat.iteritems(_seriesd))) + +_tsframe = DataFrame(_tsd) + +_mixed_frame = _frame.copy() +_mixed_frame['foo'] = 'bar' + + +class SafeForSparse(object): + + _multiprocess_can_split_ = True + + def test_copy_index_name_checking(self): + # don't want to be able to modify the index stored elsewhere after + # making a copy + for attr in ('index', 'columns'): + ind = getattr(self.frame, attr) + ind.name = None + cp = self.frame.copy() + getattr(cp, attr).name = 'foo' + self.assertIsNone(getattr(self.frame, attr).name) + + def test_getitem_pop_assign_name(self): + s = self.frame['A'] + self.assertEqual(s.name, 'A') + + s = self.frame.pop('A') + self.assertEqual(s.name, 'A') + + s = self.frame.ix[:, 'B'] + self.assertEqual(s.name, 'B') + + s2 = s.ix[:] + self.assertEqual(s2.name, 'B') + + def test_get_value(self): + for idx in self.frame.index: + for col in self.frame.columns: + result = self.frame.get_value(idx, col) + expected = self.frame[col][idx] + assert_almost_equal(result, expected) + + def test_join_index(self): + # left / right + + f = self.frame.reindex(columns=['A', 'B'])[:10] + f2 = self.frame.reindex(columns=['C', 'D']) + + joined = f.join(f2) + self.assertTrue(f.index.equals(joined.index)) + self.assertEqual(len(joined.columns), 4) + 
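+        # DataFrame.join merges on the index and defaults to how='left', so the
+        # default call above should behave exactly like the explicit how='left'
+        # call below: the 10-row index of f is preserved and the 'C'/'D'
+        # columns of f2 are aligned onto it.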
+ joined = f.join(f2, how='left') + self.assertTrue(joined.index.equals(f.index)) + self.assertEqual(len(joined.columns), 4) + + joined = f.join(f2, how='right') + self.assertTrue(joined.index.equals(f2.index)) + self.assertEqual(len(joined.columns), 4) + + # inner + + f = self.frame.reindex(columns=['A', 'B'])[:10] + f2 = self.frame.reindex(columns=['C', 'D']) + + joined = f.join(f2, how='inner') + self.assertTrue(joined.index.equals(f.index.intersection(f2.index))) + self.assertEqual(len(joined.columns), 4) + + # outer + + f = self.frame.reindex(columns=['A', 'B'])[:10] + f2 = self.frame.reindex(columns=['C', 'D']) + + joined = f.join(f2, how='outer') + self.assertTrue(tm.equalContents(self.frame.index, joined.index)) + self.assertEqual(len(joined.columns), 4) + + assertRaisesRegexp(ValueError, 'join method', f.join, f2, how='foo') + + # corner case - overlapping columns + for how in ('outer', 'left', 'inner'): + with assertRaisesRegexp(ValueError, 'columns overlap but no suffix'): + self.frame.join(self.frame, how=how) + + def test_join_index_more(self): + af = self.frame.ix[:, ['A', 'B']] + bf = self.frame.ix[::2, ['C', 'D']] + + expected = af.copy() + expected['C'] = self.frame['C'][::2] + expected['D'] = self.frame['D'][::2] + + result = af.join(bf) + assert_frame_equal(result, expected) + + result = af.join(bf, how='right') + assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how='right') + assert_frame_equal(result, expected.ix[:, result.columns]) + + def test_join_index_series(self): + df = self.frame.copy() + s = df.pop(self.frame.columns[-1]) + joined = df.join(s) + + assert_frame_equal(joined, self.frame, check_names=False) # TODO should this check_names ? + + s.name = None + assertRaisesRegexp(ValueError, 'must have a name', df.join, s) + + def test_join_overlap(self): + df1 = self.frame.ix[:, ['A', 'B', 'C']] + df2 = self.frame.ix[:, ['B', 'C', 'D']] + + joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') + df1_suf = df1.ix[:, ['B', 'C']].add_suffix('_df1') + df2_suf = df2.ix[:, ['B', 'C']].add_suffix('_df2') + no_overlap = self.frame.ix[:, ['A', 'D']] + expected = df1_suf.join(df2_suf).join(no_overlap) + + # column order not necessarily sorted + assert_frame_equal(joined, expected.ix[:, joined.columns]) + + def test_add_prefix_suffix(self): + with_prefix = self.frame.add_prefix('foo#') + expected = ['foo#%s' % c for c in self.frame.columns] + self.assert_numpy_array_equal(with_prefix.columns, expected) + + with_suffix = self.frame.add_suffix('#foo') + expected = ['%s#foo' % c for c in self.frame.columns] + self.assert_numpy_array_equal(with_suffix.columns, expected) + + +class TestDataFrame(tm.TestCase, CheckIndexing, + SafeForSparse): + klass = DataFrame + + _multiprocess_can_split_ = True + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + + # force these all to int64 to avoid platform testing issues + self.intframe = DataFrame(dict([ (c,s) for c,s in compat.iteritems(_intframe) ]), dtype = np.int64) + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + self.mixed_float = DataFrame({ 'A': _frame['A'].copy().astype('float32'), + 'B': _frame['B'].copy().astype('float32'), + 'C': _frame['C'].copy().astype('float16'), + 'D': _frame['D'].copy().astype('float64') }) + self.mixed_float2 = DataFrame({ 'A': _frame2['A'].copy().astype('float32'), + 'B': _frame2['B'].copy().astype('float32'), + 'C': 
_frame2['C'].copy().astype('float16'), + 'D': _frame2['D'].copy().astype('float64') }) + self.mixed_int = DataFrame({ 'A': _intframe['A'].copy().astype('int32'), + 'B': np.ones(len(_intframe['B']),dtype='uint64'), + 'C': _intframe['C'].copy().astype('uint8'), + 'D': _intframe['D'].copy().astype('int64') }) + self.all_mixed = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float32' : np.array([1.]*10,dtype='float32'), + 'int32' : np.array([1]*10,dtype='int32'), + }, index=np.arange(10)) + + self.ts1 = tm.makeTimeSeries() + self.ts2 = tm.makeTimeSeries()[5:] + self.ts3 = tm.makeTimeSeries()[-5:] + self.ts4 = tm.makeTimeSeries()[1:-1] + + self.ts_dict = { + 'col1': self.ts1, + 'col2': self.ts2, + 'col3': self.ts3, + 'col4': self.ts4, + } + self.empty = DataFrame({}) + + arr = np.array([[1., 2., 3.], + [4., 5., 6.], + [7., 8., 9.]]) + + self.simple = DataFrame(arr, columns=['one', 'two', 'three'], + index=['a', 'b', 'c']) + + def test_get_axis(self): + f = self.frame + self.assertEqual(f._get_axis_number(0), 0) + self.assertEqual(f._get_axis_number(1), 1) + self.assertEqual(f._get_axis_number('index'), 0) + self.assertEqual(f._get_axis_number('rows'), 0) + self.assertEqual(f._get_axis_number('columns'), 1) + + self.assertEqual(f._get_axis_name(0), 'index') + self.assertEqual(f._get_axis_name(1), 'columns') + self.assertEqual(f._get_axis_name('index'), 'index') + self.assertEqual(f._get_axis_name('rows'), 'index') + self.assertEqual(f._get_axis_name('columns'), 'columns') + + self.assertIs(f._get_axis(0), f.index) + self.assertIs(f._get_axis(1), f.columns) + + assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, 2) + assertRaisesRegexp(ValueError, 'No axis.*foo', f._get_axis_name, 'foo') + assertRaisesRegexp(ValueError, 'No axis.*None', f._get_axis_name, None) + assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, None) + + def test_set_index(self): + idx = Index(np.arange(len(self.mixed_frame))) + + # cache it + _ = self.mixed_frame['foo'] + self.mixed_frame.index = idx + self.assertIs(self.mixed_frame['foo'].index, idx) + with assertRaisesRegexp(ValueError, 'Length mismatch'): + self.mixed_frame.index = idx[::2] + + def test_set_index_cast(self): + + # issue casting an index then set_index + df = DataFrame({'A' : [1.1,2.2,3.3], 'B' : [5.0,6.1,7.2]}, + index = [2010,2011,2012]) + expected = df.ix[2010] + new_index = df.index.astype(np.int32) + df.index = new_index + result = df.ix[2010] + assert_series_equal(result,expected) + + def test_set_index2(self): + df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'three', 'one', 'two'], + 'C': ['a', 'b', 'c', 'd', 'e'], + 'D': np.random.randn(5), + 'E': np.random.randn(5)}) + + # new object, single-column + result = df.set_index('C') + result_nodrop = df.set_index('C', drop=False) + + index = Index(df['C'], name='C') + + expected = df.ix[:, ['A', 'B', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.name, index.name) + + # inplace, single + df2 = df.copy() + + df2.set_index('C', inplace=True) + + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index('C', drop=False, inplace=True) + + assert_frame_equal(df3, expected_nodrop) + + # create new object, multi-column + result = df.set_index(['A', 'B']) + result_nodrop = df.set_index(['A', 'B'], drop=False) + + index = MultiIndex.from_arrays([df['A'], df['B']], 
names=['A', 'B']) + + expected = df.ix[:, ['C', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.names, index.names) + + # inplace + df2 = df.copy() + df2.set_index(['A', 'B'], inplace=True) + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index(['A', 'B'], drop=False, inplace=True) + assert_frame_equal(df3, expected_nodrop) + + # corner case + with assertRaisesRegexp(ValueError, 'Index has duplicate keys'): + df.set_index('A', verify_integrity=True) + + # append + result = df.set_index(['A', 'B'], append=True) + xp = df.reset_index().set_index(['index', 'A', 'B']) + xp.index.names = [None, 'A', 'B'] + assert_frame_equal(result, xp) + + # append to existing multiindex + rdf = df.set_index(['A'], append=True) + rdf = rdf.set_index(['B', 'C'], append=True) + expected = df.set_index(['A', 'B', 'C'], append=True) + assert_frame_equal(rdf, expected) + + # Series + result = df.set_index(df.C) + self.assertEqual(result.index.name, 'C') + + def test_set_index_nonuniq(self): + df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'three', 'one', 'two'], + 'C': ['a', 'b', 'c', 'd', 'e'], + 'D': np.random.randn(5), + 'E': np.random.randn(5)}) + with assertRaisesRegexp(ValueError, 'Index has duplicate keys'): + df.set_index('A', verify_integrity=True, inplace=True) + self.assertIn('A', df) + + def test_set_index_bug(self): + # GH1590 + df = DataFrame({'val': [0, 1, 2], 'key': ['a', 'b', 'c']}) + df2 = df.select(lambda indx: indx >= 1) + rs = df2.set_index('key') + xp = DataFrame({'val': [1, 2]}, + Index(['b', 'c'], name='key')) + assert_frame_equal(rs, xp) + + def test_set_index_pass_arrays(self): + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + # multiple columns + result = df.set_index(['A', df['B'].values], drop=False) + expected = df.set_index(['A', 'B'], drop=False) + assert_frame_equal(result, expected, check_names=False) # TODO should set_index check_names ? 
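+    # set_index accepts a mix of column labels and array-likes; a level built
+    # from a raw array carries no name, which is why the comparison above uses
+    # check_names=False.  Minimal sketch of the difference (hypothetical data):
+    #   df = DataFrame({'A': ['x', 'y'], 'B': [1, 2], 'C': [3., 4.]})
+    #   df.set_index(['A', df['B'].values]).index.names  ->  ['A', None]
+    #   df.set_index(['A', 'B']).index.names             ->  ['A', 'B']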
+ + def test_set_index_cast_datetimeindex(self): + df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) + for i in range(1000)], + 'B': np.random.randn(1000)}) + + idf = df.set_index('A') + tm.assert_isinstance(idf.index, DatetimeIndex) + + # don't cast a DatetimeIndex WITH a tz, leave as object + # GH 6032 + i = pd.DatetimeIndex(pd.tseries.tools.to_datetime(['2013-1-1 13:00','2013-1-2 14:00'], errors="raise")).tz_localize('US/Pacific') + df = DataFrame(np.random.randn(2,1),columns=['A']) + + expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), + pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')], dtype="object")) + + # convert index to series + result = Series(i) + assert_series_equal(result, expected) + + # assignt to frame + df['B'] = i + result = df['B'] + assert_series_equal(result, expected) + + # keep the timezone + result = i.to_series(keep_tz=True) + assert_series_equal(result.reset_index(drop=True), expected) + + # convert to utc + df['C'] = i.to_series().reset_index(drop=True) + result = df['C'] + comp = DatetimeIndex(expected.values).copy() + comp.tz = None + self.assert_numpy_array_equal(result.values, comp.values) + + # list of datetimes with a tz + df['D'] = i.to_pydatetime() + result = df['D'] + assert_series_equal(result, expected) + + # GH 6785 + # set the index manually + import pytz + df = DataFrame([{'ts':datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo':1}]) + expected = df.set_index('ts') + df.index = df['ts'] + df.pop('ts') + assert_frame_equal(df, expected) + + # GH 3950 + # reset_index with single level + for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']: + idx = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, name='idx') + df = pd.DataFrame({'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) + + expected = pd.DataFrame({'idx': [datetime(2011, 1, 1), datetime(2011, 1, 2), + datetime(2011, 1, 3), datetime(2011, 1, 4), + datetime(2011, 1, 5)], + 'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, + columns=['idx', 'a', 'b']) + expected['idx'] = expected['idx'].apply(lambda d: pd.Timestamp(d, tz=tz)) + assert_frame_equal(df.reset_index(), expected) + + def test_set_index_multiindexcolumns(self): + columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) + df = DataFrame(np.random.randn(3, 3), columns=columns) + rs = df.set_index(df.columns[0]) + xp = df.ix[:, 1:] + xp.index = df.ix[:, 0].values + xp.index.names = [df.columns[0]] + assert_frame_equal(rs, xp) + + def test_set_index_empty_column(self): + # #1971 + df = DataFrame([ + dict(a=1, p=0), + dict(a=2, m=10), + dict(a=3, m=11, p=20), + dict(a=4, m=12, p=21) + ], columns=('a', 'm', 'p', 'x')) + + # it works! 
+ result = df.set_index(['a', 'x']) + repr(result) + + def test_set_columns(self): + cols = Index(np.arange(len(self.mixed_frame.columns))) + self.mixed_frame.columns = cols + with assertRaisesRegexp(ValueError, 'Length mismatch'): + self.mixed_frame.columns = cols[::2] + + def test_keys(self): + getkeys = self.frame.keys + self.assertIs(getkeys(), self.frame.columns) + + def test_column_contains_typeerror(self): + try: + self.frame.columns in self.frame + except TypeError: + pass + + def test_constructor(self): + df = DataFrame() + self.assertEqual(len(df.index), 0) + + df = DataFrame(data={}) + self.assertEqual(len(df.index), 0) + + def test_constructor_mixed(self): + index, data = tm.getMixedTypeDict() + + indexed_frame = DataFrame(data, index=index) + unindexed_frame = DataFrame(data) + + self.assertEqual(self.mixed_frame['foo'].dtype, np.object_) + + def test_constructor_cast_failure(self): + foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64) + self.assertEqual(foo['a'].dtype, object) + + # GH 3010, constructing with odd arrays + df = DataFrame(np.ones((4,2))) + + # this is ok + df['foo'] = np.ones((4,2)).tolist() + + # this is not ok + self.assertRaises(ValueError, df.__setitem__, tuple(['test']), np.ones((4,2))) + + # this is ok + df['foo2'] = np.ones((4,2)).tolist() + + def test_constructor_dtype_nocast_view(self): + df = DataFrame([[1, 2]]) + should_be_view = DataFrame(df, dtype=df[0].dtype) + should_be_view[0][0] = 99 + self.assertEqual(df.values[0, 0], 99) + + should_be_view = DataFrame(df.values, dtype=df[0].dtype) + should_be_view[0][0] = 97 + self.assertEqual(df.values[0, 0], 97) + + def test_constructor_dtype_list_data(self): + df = DataFrame([[1, '2'], + [None, 'a']], dtype=object) + self.assertIsNone(df.ix[1, 0]) + self.assertEqual(df.ix[0, 1], '2') + + def test_constructor_list_frames(self): + + # GH 3243 + result = DataFrame([DataFrame([])]) + self.assertEqual(result.shape, (1,0)) + + result = DataFrame([DataFrame(dict(A = lrange(5)))]) + tm.assert_isinstance(result.iloc[0,0], DataFrame) + + def test_constructor_mixed_dtypes(self): + + def _make_mixed_dtypes_df(typ, ad = None): + + if typ == 'int': + dtypes = MIXED_INT_DTYPES + arrays = [ np.array(np.random.rand(10), dtype = d) for d in dtypes ] + elif typ == 'float': + dtypes = MIXED_FLOAT_DTYPES + arrays = [ np.array(np.random.randint(10, size=10), dtype = d) for d in dtypes ] + + zipper = lzip(dtypes,arrays) + for d,a in zipper: + assert(a.dtype == d) + if ad is None: + ad = dict() + ad.update(dict([ (d,a) for d,a in zipper ])) + return DataFrame(ad) + + def _check_mixed_dtypes(df, dtypes = None): + if dtypes is None: + dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES + for d in dtypes: + if d in df: + assert(df.dtypes[d] == d) + + # mixed floating and integer coexinst in the same frame + df = _make_mixed_dtypes_df('float') + _check_mixed_dtypes(df) + + # add lots of types + df = _make_mixed_dtypes_df('float', dict(A = 1, B = 'foo', C = 'bar')) + _check_mixed_dtypes(df) + + # GH 622 + df = _make_mixed_dtypes_df('int') + _check_mixed_dtypes(df) + + def test_constructor_rec(self): + rec = self.frame.to_records(index=False) + + # Assigning causes segfault in NumPy < 1.5.1 + # rec.dtype.names = list(rec.dtype.names)[::-1] + + index = self.frame.index + + df = DataFrame(rec) + self.assert_numpy_array_equal(df.columns, rec.dtype.names) + + df2 = DataFrame(rec, index=index) + self.assert_numpy_array_equal(df2.columns, rec.dtype.names) + self.assertTrue(df2.index.equals(index)) + + rng = np.arange(len(rec))[::-1] + df3 = 
DataFrame(rec, index=rng, columns=['C', 'B']) + expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B']) + assert_frame_equal(df3, expected) + + def test_constructor_bool(self): + df = DataFrame({0: np.ones(10, dtype=bool), + 1: np.zeros(10, dtype=bool)}) + self.assertEqual(df.values.dtype, np.bool_) + + def test_constructor_overflow_int64(self): + values = np.array([2 ** 64 - i for i in range(1, 10)], + dtype=np.uint64) + + result = DataFrame({'a': values}) + self.assertEqual(result['a'].dtype, object) + + # #2355 + data_scores = [(6311132704823138710, 273), (2685045978526272070, 23), + (8921811264899370420, 45), (long(17019687244989530680), 270), + (long(9930107427299601010), 273)] + dtype = [('uid', 'u8'), ('score', 'u8')] + data = np.zeros((len(data_scores),), dtype=dtype) + data[:] = data_scores + df_crawls = DataFrame(data) + self.assertEqual(df_crawls['uid'].dtype, object) + + def test_constructor_ordereddict(self): + import random + nitems = 100 + nums = lrange(nitems) + random.shuffle(nums) + expected = ['A%d' % i for i in nums] + df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems))) + self.assertEqual(expected, list(df.columns)) + + def test_constructor_dict(self): + frame = DataFrame({'col1': self.ts1, + 'col2': self.ts2}) + + tm.assert_dict_equal(self.ts1, frame['col1'], compare_keys=False) + tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False) + + frame = DataFrame({'col1': self.ts1, + 'col2': self.ts2}, + columns=['col2', 'col3', 'col4']) + + self.assertEqual(len(frame), len(self.ts2)) + self.assertNotIn('col1', frame) + self.assertTrue(isnull(frame['col3']).all()) + + # Corner cases + self.assertEqual(len(DataFrame({})), 0) + + # mix dict and array, wrong size - no spec for which error should raise + # first + with tm.assertRaises(ValueError): + DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']}) + + # Length-one dict micro-optimization + frame = DataFrame({'A': {'1': 1, '2': 2}}) + self.assert_numpy_array_equal(frame.index, ['1', '2']) + + # empty dict plus index + idx = Index([0, 1, 2]) + frame = DataFrame({}, index=idx) + self.assertIs(frame.index, idx) + + # empty with index and columns + idx = Index([0, 1, 2]) + frame = DataFrame({}, index=idx, columns=idx) + self.assertIs(frame.index, idx) + self.assertIs(frame.columns, idx) + self.assertEqual(len(frame._series), 3) + + # with dict of empty list and Series + frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) + self.assertTrue(frame.index.equals(Index([]))) + + def test_constructor_multi_index(self): + # GH 4078 + # construction error with mi and all-nan frame + tuples = [(2, 3), (3, 3), (3, 3)] + mi = MultiIndex.from_tuples(tuples) + df = DataFrame(index=mi,columns=mi) + self.assertTrue(pd.isnull(df).values.ravel().all()) + + tuples = [(3, 3), (2, 3), (3, 3)] + mi = MultiIndex.from_tuples(tuples) + df = DataFrame(index=mi,columns=mi) + self.assertTrue(pd.isnull(df).values.ravel().all()) + + def test_constructor_error_msgs(self): + msg = "Mixing dicts with non-Series may lead to ambiguous ordering." 
+ # mix dict and array, wrong size + with assertRaisesRegexp(ValueError, msg): + DataFrame({'A': {'a': 'a', 'b': 'b'}, + 'B': ['a', 'b', 'c']}) + + # wrong size ndarray, GH 3105 + msg = "Shape of passed values is \(3, 4\), indices imply \(3, 3\)" + with assertRaisesRegexp(ValueError, msg): + DataFrame(np.arange(12).reshape((4, 3)), + columns=['foo', 'bar', 'baz'], + index=date_range('2000-01-01', periods=3)) + + + # higher dim raise exception + with assertRaisesRegexp(ValueError, 'Must pass 2-d input'): + DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1]) + + # wrong size axis labels + with assertRaisesRegexp(ValueError, "Shape of passed values is \(3, 2\), indices imply \(3, 1\)"): + DataFrame(np.random.rand(2,3), columns=['A', 'B', 'C'], index=[1]) + + with assertRaisesRegexp(ValueError, "Shape of passed values is \(3, 2\), indices imply \(2, 2\)"): + DataFrame(np.random.rand(2,3), columns=['A', 'B'], index=[1, 2]) + + with assertRaisesRegexp(ValueError, 'If using all scalar values, you must must pass an index'): + DataFrame({'a': False, 'b': True}) + + def test_constructor_with_embedded_frames(self): + + # embedded data frames + df1 = DataFrame({'a':[1, 2, 3], 'b':[3, 4, 5]}) + df2 = DataFrame([df1, df1+10]) + + df2.dtypes + str(df2) + + result = df2.loc[0,0] + assert_frame_equal(result,df1) + + result = df2.loc[1,0] + assert_frame_equal(result,df1+10) + + def test_insert_error_msmgs(self): + + # GH 7432 + df = DataFrame({'foo':['a', 'b', 'c'], 'bar':[1,2,3], 'baz':['d','e','f']}).set_index('foo') + s = DataFrame({'foo':['a', 'b', 'c', 'a'], 'fiz':['g','h','i','j']}).set_index('foo') + msg = 'cannot reindex from a duplicate axis' + with assertRaisesRegexp(ValueError, msg): + df['newcol'] = s + + # GH 4107, more descriptive error message + df = DataFrame(np.random.randint(0,2,(4,4)), + columns=['a', 'b', 'c', 'd']) + + msg = 'incompatible index of inserted column with frame index' + with assertRaisesRegexp(TypeError, msg): + df['gr'] = df.groupby(['b', 'c']).count() + + def test_constructor_subclass_dict(self): + # Test for passing dict subclass to constructor + data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), + 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))} + df = DataFrame(data) + refdf = DataFrame(dict((col, dict(compat.iteritems(val))) + for col, val in compat.iteritems(data))) + assert_frame_equal(refdf, df) + + data = tm.TestSubDict(compat.iteritems(data)) + df = DataFrame(data) + assert_frame_equal(refdf, df) + + # try with defaultdict + from collections import defaultdict + data = {} + self.frame['B'][:10] = np.nan + for k, v in compat.iteritems(self.frame): + dct = defaultdict(dict) + dct.update(v.to_dict()) + data[k] = dct + frame = DataFrame(data) + assert_frame_equal(self.frame.sort_index(), frame) + + def test_constructor_dict_block(self): + expected = [[4., 3., 2., 1.]] + df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]}, + columns=['d', 'c', 'b', 'a']) + assert_almost_equal(df.values, expected) + + def test_constructor_dict_cast(self): + # cast float tests + test_data = { + 'A': {'1': 1, '2': 2}, + 'B': {'1': '1', '2': '2', '3': '3'}, + } + frame = DataFrame(test_data, dtype=float) + self.assertEqual(len(frame), 3) + self.assertEqual(frame['B'].dtype, np.float64) + self.assertEqual(frame['A'].dtype, np.float64) + + frame = DataFrame(test_data) + self.assertEqual(len(frame), 3) + self.assertEqual(frame['B'].dtype, np.object_) + self.assertEqual(frame['A'].dtype, np.float64) + + # can't cast to float + test_data = { + 
'A': dict(zip(range(20), tm.makeStringIndex(20))), + 'B': dict(zip(range(15), randn(15))) + } + frame = DataFrame(test_data, dtype=float) + self.assertEqual(len(frame), 20) + self.assertEqual(frame['A'].dtype, np.object_) + self.assertEqual(frame['B'].dtype, np.float64) + + def test_constructor_dict_dont_upcast(self): + d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}} + df = DataFrame(d) + tm.assert_isinstance(df['Col1']['Row2'], float) + + dm = DataFrame([[1, 2], ['a', 'b']], index=[1, 2], columns=[1, 2]) + tm.assert_isinstance(dm[1][1], int) + + def test_constructor_dict_of_tuples(self): + # GH #1491 + data = {'a': (1, 2, 3), 'b': (4, 5, 6)} + + result = DataFrame(data) + expected = DataFrame(dict((k, list(v)) for k, v in compat.iteritems(data))) + assert_frame_equal(result, expected, check_dtype=False) + + def test_constructor_dict_multiindex(self): + check = lambda result, expected: tm.assert_frame_equal( + result, expected, check_dtype=True, check_index_type=True, + check_column_type=True, check_names=True) + d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, + ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4}, + ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}} + _d = sorted(d.items()) + df = DataFrame(d) + expected = DataFrame( + [x[1] for x in _d], + index=MultiIndex.from_tuples([x[0] for x in _d])).T + expected.index = MultiIndex.from_tuples(expected.index) + check(df, expected) + + d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111} + _d.insert(0, ('z', d['z'])) + expected = DataFrame( + [x[1] for x in _d], + index=Index([x[0] for x in _d], tupleize_cols=False)).T + expected.index = Index(expected.index, tupleize_cols=False) + df = DataFrame(d) + df = df.reindex(columns=expected.columns, index=expected.index) + check(df, expected) + + def _check_basic_constructor(self, empty): + "mat: 2d matrix with shpae (3, 2) to input. 
empty - makes sized objects" + mat = empty((2, 3), dtype=float) + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + + # 1-D input + frame = DataFrame(empty((3,)), columns=['A'], index=[1, 2, 3]) + self.assertEqual(len(frame.index), 3) + self.assertEqual(len(frame.columns), 1) + + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=np.int64) + self.assertEqual(frame.values.dtype, np.int64) + + # wrong size axis labels + msg = r'Shape of passed values is \(3, 2\), indices imply \(3, 1\)' + with assertRaisesRegexp(ValueError, msg): + DataFrame(mat, columns=['A', 'B', 'C'], index=[1]) + msg = r'Shape of passed values is \(3, 2\), indices imply \(2, 2\)' + with assertRaisesRegexp(ValueError, msg): + DataFrame(mat, columns=['A', 'B'], index=[1, 2]) + + # higher dim raise exception + with assertRaisesRegexp(ValueError, 'Must pass 2-d input'): + DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'], + index=[1]) + + # automatic labeling + frame = DataFrame(mat) + self.assert_numpy_array_equal(frame.index, lrange(2)) + self.assert_numpy_array_equal(frame.columns, lrange(3)) + + frame = DataFrame(mat, index=[1, 2]) + self.assert_numpy_array_equal(frame.columns, lrange(3)) + + frame = DataFrame(mat, columns=['A', 'B', 'C']) + self.assert_numpy_array_equal(frame.index, lrange(2)) + + # 0-length axis + frame = DataFrame(empty((0, 3))) + self.assertEqual(len(frame.index), 0) + + frame = DataFrame(empty((3, 0))) + self.assertEqual(len(frame.columns), 0) + + def test_constructor_ndarray(self): + mat = np.zeros((2, 3), dtype=float) + self._check_basic_constructor(np.ones) + + frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) + self.assertEqual(len(frame), 2) + + def test_constructor_maskedarray(self): + self._check_basic_constructor(ma.masked_all) + + # Check non-masked values + mat = ma.masked_all((2, 3), dtype=float) + mat[0, 0] = 1.0 + mat[1, 2] = 2.0 + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1.0, frame['A'][1]) + self.assertEqual(2.0, frame['C'][2]) + + # what is this even checking?? 
+ mat = ma.masked_all((2, 3), dtype=float) + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + def test_constructor_maskedarray_nonfloat(self): + # masked int promoted to float + mat = ma.masked_all((2, 3), dtype=int) + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=np.float64) + self.assertEqual(frame.values.dtype, np.float64) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0, 0] = 1 + mat2[1, 2] = 2 + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1, frame['A'][1]) + self.assertEqual(2, frame['C'][2]) + + # masked np.datetime64 stays (use lib.NaT as null) + mat = ma.masked_all((2, 3), dtype='M8[ns]') + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(isnull(frame).values.all()) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=np.int64) + self.assertEqual(frame.values.dtype, np.int64) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0, 0] = 1 + mat2[1, 2] = 2 + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1, frame['A'].view('i8')[1]) + self.assertEqual(2, frame['C'].view('i8')[2]) + + # masked bool promoted to object + mat = ma.masked_all((2, 3), dtype=bool) + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=object) + self.assertEqual(frame.values.dtype, object) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0, 0] = True + mat2[1, 2] = False + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(True, frame['A'][1]) + self.assertEqual(False, frame['C'][2]) + + def test_constructor_mrecarray(self): + # Ensure mrecarray produces frame identical to dict of masked arrays + # from GH3479 + + assert_fr_equal = functools.partial(assert_frame_equal, + check_index_type=True, + check_column_type=True, + check_frame_type=True) + arrays = [ + ('float', np.array([1.5, 2.0])), + ('int', np.array([1, 2])), + ('str', np.array(['abc', 'def'])), + ] + for name, arr in arrays[:]: + arrays.append(('masked1_' + name, + np.ma.masked_array(arr, mask=[False, True]))) + arrays.append(('masked_all', np.ma.masked_all((2,)))) + arrays.append(('masked_none', + np.ma.masked_array([1.0, 2.5], mask=False))) + + # call assert_frame_equal for all selections of 3 arrays + for comb in itertools.combinations(arrays, 3): + names, data = zip(*comb) + mrecs = mrecords.fromarrays(data, names=names) + + # fill the comb + comb = dict([ (k, v.filled()) if hasattr(v,'filled') else (k, v) for k, v in comb ]) + + expected = DataFrame(comb,columns=names) + result = DataFrame(mrecs) + assert_fr_equal(result,expected) + + # specify columns + expected = DataFrame(comb,columns=names[::-1]) + result = DataFrame(mrecs, columns=names[::-1]) + assert_fr_equal(result,expected) + + # specify index + expected = DataFrame(comb,columns=names,index=[1,2]) + result = DataFrame(mrecs, 
index=[1,2]) + assert_fr_equal(result,expected) + + def test_constructor_corner(self): + df = DataFrame(index=[]) + self.assertEqual(df.values.shape, (0, 0)) + + # empty but with specified dtype + df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=object) + self.assertEqual(df.values.dtype, np.object_) + + # does not error but ends up float + df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=int) + self.assertEqual(df.values.dtype, np.object_) + + # #1783 empty dtype object + df = DataFrame({}, columns=['foo', 'bar']) + self.assertEqual(df.values.dtype, np.object_) + + df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'), + dtype=int) + self.assertEqual(df.values.dtype, np.object_) + + + def test_constructor_scalar_inference(self): + data = {'int': 1, 'bool': True, + 'float': 3., 'complex': 4j, 'object': 'foo'} + df = DataFrame(data, index=np.arange(10)) + + self.assertEqual(df['int'].dtype, np.int64) + self.assertEqual(df['bool'].dtype, np.bool_) + self.assertEqual(df['float'].dtype, np.float64) + self.assertEqual(df['complex'].dtype, np.complex128) + self.assertEqual(df['object'].dtype, np.object_) + + def test_constructor_arrays_and_scalars(self): + df = DataFrame({'a': randn(10), 'b': True}) + exp = DataFrame({'a': df['a'].values, 'b': [True] * 10}) + + assert_frame_equal(df, exp) + with tm.assertRaisesRegexp(ValueError, 'must pass an index'): + DataFrame({'a': False, 'b': True}) + + def test_constructor_DataFrame(self): + df = DataFrame(self.frame) + assert_frame_equal(df, self.frame) + + df_casted = DataFrame(self.frame, dtype=np.int64) + self.assertEqual(df_casted.values.dtype, np.int64) + + def test_constructor_more(self): + # used to be in test_matrix.py + arr = randn(10) + dm = DataFrame(arr, columns=['A'], index=np.arange(10)) + self.assertEqual(dm.values.ndim, 2) + + arr = randn(0) + dm = DataFrame(arr) + self.assertEqual(dm.values.ndim, 2) + self.assertEqual(dm.values.ndim, 2) + + # no data specified + dm = DataFrame(columns=['A', 'B'], index=np.arange(10)) + self.assertEqual(dm.values.shape, (10, 2)) + + dm = DataFrame(columns=['A', 'B']) + self.assertEqual(dm.values.shape, (0, 2)) + + dm = DataFrame(index=np.arange(10)) + self.assertEqual(dm.values.shape, (10, 0)) + + # corner, silly + # TODO: Fix this Exception to be better... 
+ with assertRaisesRegexp(PandasError, 'constructor not properly called'): + DataFrame((1, 2, 3)) + + # can't cast + mat = np.array(['foo', 'bar'], dtype=object).reshape(2, 1) + with assertRaisesRegexp(ValueError, 'cast'): + DataFrame(mat, index=[0, 1], columns=[0], dtype=float) + + dm = DataFrame(DataFrame(self.frame._series)) + tm.assert_frame_equal(dm, self.frame) + + # int cast + dm = DataFrame({'A': np.ones(10, dtype=int), + 'B': np.ones(10, dtype=np.float64)}, + index=np.arange(10)) + + self.assertEqual(len(dm.columns), 2) + self.assertEqual(dm.values.dtype, np.float64) + + def test_constructor_empty_list(self): + df = DataFrame([], index=[]) + expected = DataFrame(index=[]) + assert_frame_equal(df, expected) + + def test_constructor_list_of_lists(self): + # GH #484 + l = [[1, 'a'], [2, 'b']] + df = DataFrame(data=l, columns=["num", "str"]) + self.assertTrue(com.is_integer_dtype(df['num'])) + self.assertEqual(df['str'].dtype, np.object_) + + # GH 4851 + # list of 0-dim ndarrays + expected = DataFrame({ 0: range(10) }) + data = [np.array(x) for x in range(10)] + result = DataFrame(data) + assert_frame_equal(result, expected) + + def test_constructor_sequence_like(self): + # GH 3783 + # collections.Sequence like + import collections + + class DummyContainer(collections.Sequence): + def __init__(self, lst): + self._lst = lst + def __getitem__(self, n): + return self._lst.__getitem__(n) + def __len__(self): + return self._lst.__len__() + + l = [DummyContainer([1, 'a']), DummyContainer([2, 'b'])] + columns = ["num", "str"] + result = DataFrame(l, columns=columns) + expected = DataFrame([[1,'a'],[2,'b']],columns=columns) + assert_frame_equal(result, expected, check_dtype=False) + + # GH 4297 + # support Array + import array + result = DataFrame.from_items([('A', array.array('i', range(10)))]) + expected = DataFrame({ 'A' : list(range(10)) }) + assert_frame_equal(result, expected, check_dtype=False) + + expected = DataFrame([ list(range(10)), list(range(10)) ]) + result = DataFrame([ array.array('i', range(10)), array.array('i',range(10)) ]) + assert_frame_equal(result, expected, check_dtype=False) + + def test_constructor_iterator(self): + + expected = DataFrame([ list(range(10)), list(range(10)) ]) + result = DataFrame([ range(10), range(10) ]) + assert_frame_equal(result, expected) + + def test_constructor_generator(self): + #related #2305 + + gen1 = (i for i in range(10)) + gen2 = (i for i in range(10)) + + expected = DataFrame([ list(range(10)), list(range(10)) ]) + result = DataFrame([ gen1, gen2 ]) + assert_frame_equal(result, expected) + + gen = ([ i, 'a'] for i in range(10)) + result = DataFrame(gen) + expected = DataFrame({ 0 : range(10), 1 : 'a' }) + assert_frame_equal(result, expected, check_dtype=False) + + def test_constructor_list_of_dicts(self): + data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]), + OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]), + OrderedDict([['a', 1.5], ['d', 6]]), + OrderedDict(), + OrderedDict([['a', 1.5], ['b', 3], ['c', 4]]), + OrderedDict([['b', 3], ['c', 4], ['d', 6]])] + + result = DataFrame(data) + expected = DataFrame.from_dict(dict(zip(range(len(data)), data)), + orient='index') + assert_frame_equal(result, expected.reindex(result.index)) + + result = DataFrame([{}]) + expected = DataFrame(index=[0]) + assert_frame_equal(result, expected) + + def test_constructor_list_of_series(self): + data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]), + OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])] + sdict = OrderedDict(zip(['x', 
'y'], data)) + idx = Index(['a', 'b', 'c']) + + # all named + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx, name='y')] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected) + + # some unnamed + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx)] + result = DataFrame(data2) + + sdict = OrderedDict(zip(['x', 'Unnamed 0'], data)) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result.sort_index(), expected) + + # none named + data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]), + OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]), + OrderedDict([['a', 1.5], ['d', 6]]), + OrderedDict(), + OrderedDict([['a', 1.5], ['b', 3], ['c', 4]]), + OrderedDict([['b', 3], ['c', 4], ['d', 6]])] + data = [Series(d) for d in data] + + result = DataFrame(data) + sdict = OrderedDict(zip(range(len(data)), data)) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected.reindex(result.index)) + + result2 = DataFrame(data, index=np.arange(6)) + assert_frame_equal(result, result2) + + result = DataFrame([Series({})]) + expected = DataFrame(index=[0]) + assert_frame_equal(result, expected) + + data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]), + OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])] + sdict = OrderedDict(zip(range(len(data)), data)) + + idx = Index(['a', 'b', 'c']) + data2 = [Series([1.5, 3, 4], idx, dtype='O'), + Series([1.5, 3, 6], idx)] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected) + + def test_constructor_list_of_derived_dicts(self): + class CustomDict(dict): + pass + d = {'a': 1.5, 'b': 3} + + data_custom = [CustomDict(d)] + data = [d] + + result_custom = DataFrame(data_custom) + result = DataFrame(data) + assert_frame_equal(result, result_custom) + + def test_constructor_ragged(self): + data = {'A': randn(10), + 'B': randn(8)} + with assertRaisesRegexp(ValueError, 'arrays must all be same length'): + DataFrame(data) + + def test_constructor_scalar(self): + idx = Index(lrange(3)) + df = DataFrame({"a": 0}, index=idx) + expected = DataFrame({"a": [0, 0, 0]}, index=idx) + assert_frame_equal(df, expected, check_dtype=False) + + def test_constructor_Series_copy_bug(self): + df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A']) + df.copy() + + def test_constructor_mixed_dict_and_Series(self): + data = {} + data['A'] = {'foo': 1, 'bar': 2, 'baz': 3} + data['B'] = Series([4, 3, 2, 1], index=['bar', 'qux', 'baz', 'foo']) + + result = DataFrame(data) + self.assertTrue(result.index.is_monotonic) + + # ordering ambiguous, raise exception + with assertRaisesRegexp(ValueError, 'ambiguous ordering'): + DataFrame({'A': ['a', 'b'], 'B': {'a': 'a', 'b': 'b'}}) + + # this is OK though + result = DataFrame({'A': ['a', 'b'], + 'B': Series(['a', 'b'], index=['a', 'b'])}) + expected = DataFrame({'A': ['a', 'b'], 'B': ['a', 'b']}, + index=['a', 'b']) + assert_frame_equal(result, expected) + + def test_constructor_tuples(self): + result = DataFrame({'A': [(1, 2), (3, 4)]}) + expected = DataFrame({'A': Series([(1, 2), (3, 4)])}) + assert_frame_equal(result, expected) + + def test_constructor_orient(self): + data_dict = self.mixed_frame.T._series + recons = DataFrame.from_dict(data_dict, orient='index') + expected = self.mixed_frame.sort_index() + assert_frame_equal(recons, expected) + + # dict of sequence + a = 
{'hi': [32, 3, 3], + 'there': [3, 5, 3]} + rs = DataFrame.from_dict(a, orient='index') + xp = DataFrame.from_dict(a).T.reindex(list(a.keys())) + assert_frame_equal(rs, xp) + + def test_constructor_Series_named(self): + a = Series([1, 2, 3], index=['a', 'b', 'c'], name='x') + df = DataFrame(a) + self.assertEqual(df.columns[0], 'x') + self.assertTrue(df.index.equals(a.index)) + + # ndarray like + arr = np.random.randn(10) + s = Series(arr,name='x') + df = DataFrame(s) + expected = DataFrame(dict(x = s)) + assert_frame_equal(df,expected) + + s = Series(arr,index=range(3,13)) + df = DataFrame(s) + expected = DataFrame({ 0 : s }) + assert_frame_equal(df,expected) + + self.assertRaises(ValueError, DataFrame, s, columns=[1,2]) + + # #2234 + a = Series([], name='x') + df = DataFrame(a) + self.assertEqual(df.columns[0], 'x') + + # series with name and w/o + s1 = Series(arr,name='x') + df = DataFrame([s1, arr]).T + expected = DataFrame({ 'x' : s1, 'Unnamed 0' : arr },columns=['x','Unnamed 0']) + assert_frame_equal(df,expected) + + # this is a bit non-intuitive here; the series collapse down to arrays + df = DataFrame([arr, s1]).T + expected = DataFrame({ 1 : s1, 0 : arr },columns=[0,1]) + assert_frame_equal(df,expected) + + def test_constructor_Series_differently_indexed(self): + # name + s1 = Series([1, 2, 3], index=['a', 'b', 'c'], name='x') + + # no name + s2 = Series([1, 2, 3], index=['a', 'b', 'c']) + + other_index = Index(['a', 'b']) + + df1 = DataFrame(s1, index=other_index) + exp1 = DataFrame(s1.reindex(other_index)) + self.assertEqual(df1.columns[0], 'x') + assert_frame_equal(df1, exp1) + + df2 = DataFrame(s2, index=other_index) + exp2 = DataFrame(s2.reindex(other_index)) + self.assertEqual(df2.columns[0], 0) + self.assertTrue(df2.index.equals(other_index)) + assert_frame_equal(df2, exp2) + + def test_constructor_manager_resize(self): + index = list(self.frame.index[:5]) + columns = list(self.frame.columns[:3]) + + result = DataFrame(self.frame._data, index=index, + columns=columns) + self.assert_numpy_array_equal(result.index, index) + self.assert_numpy_array_equal(result.columns, columns) + + def test_constructor_from_items(self): + items = [(c, self.frame[c]) for c in self.frame.columns] + recons = DataFrame.from_items(items) + assert_frame_equal(recons, self.frame) + + # pass some columns + recons = DataFrame.from_items(items, columns=['C', 'B', 'A']) + assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']]) + + # orient='index' + + row_items = [(idx, self.mixed_frame.xs(idx)) + for idx in self.mixed_frame.index] + + recons = DataFrame.from_items(row_items, + columns=self.mixed_frame.columns, + orient='index') + assert_frame_equal(recons, self.mixed_frame) + self.assertEqual(recons['A'].dtype, np.float64) + + with tm.assertRaisesRegexp(TypeError, + "Must pass columns with orient='index'"): + DataFrame.from_items(row_items, orient='index') + + # orient='index', but thar be tuples + arr = lib.list_to_object_array( + [('bar', 'baz')] * len(self.mixed_frame)) + self.mixed_frame['foo'] = arr + row_items = [(idx, list(self.mixed_frame.xs(idx))) + for idx in self.mixed_frame.index] + recons = DataFrame.from_items(row_items, + columns=self.mixed_frame.columns, + orient='index') + assert_frame_equal(recons, self.mixed_frame) + tm.assert_isinstance(recons['foo'][0], tuple) + + rs = DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], + orient='index', columns=['one', 'two', 'three']) + xp = DataFrame([[1, 2, 3], [4, 5, 6]], index=['A', 'B'], + columns=['one', 'two', 'three']) + 
assert_frame_equal(rs, xp) + + def test_constructor_mix_series_nonseries(self): + df = DataFrame({'A': self.frame['A'], + 'B': list(self.frame['B'])}, columns=['A', 'B']) + assert_frame_equal(df, self.frame.ix[:, ['A', 'B']]) + + with tm.assertRaisesRegexp(ValueError, 'does not match index length'): + DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])[:-2]}) + + def test_constructor_miscast_na_int_dtype(self): + df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) + expected = DataFrame([[np.nan, 1], [1, 0]]) + assert_frame_equal(df, expected) + + def test_constructor_iterator_failure(self): + with assertRaisesRegexp(TypeError, 'iterator'): + df = DataFrame(iter([1, 2, 3])) + + def test_constructor_column_duplicates(self): + # it works! #2079 + df = DataFrame([[8, 5]], columns=['a', 'a']) + edf = DataFrame([[8, 5]]) + edf.columns = ['a', 'a'] + + assert_frame_equal(df, edf) + + idf = DataFrame.from_items( + [('a', [8]), ('a', [5])], columns=['a', 'a']) + assert_frame_equal(idf, edf) + + self.assertRaises(ValueError, DataFrame.from_items, + [('a', [8]), ('a', [5]), ('b', [6])], + columns=['b', 'a', 'a']) + + def test_column_dups_operations(self): + + def check(result, expected=None): + if expected is not None: + assert_frame_equal(result,expected) + result.dtypes + str(result) + + # assignment + # GH 3687 + arr = np.random.randn(3, 2) + idx = lrange(2) + df = DataFrame(arr, columns=['A', 'A']) + df.columns = idx + expected = DataFrame(arr,columns=idx) + check(df,expected) + + idx = date_range('20130101',periods=4,freq='Q-NOV') + df = DataFrame([[1,1,1,5],[1,1,2,5],[2,1,3,5]],columns=['a','a','a','a']) + df.columns = idx + expected = DataFrame([[1,1,1,5],[1,1,2,5],[2,1,3,5]],columns=idx) + check(df,expected) + + # insert + df = DataFrame([[1,1,1,5],[1,1,2,5],[2,1,3,5]],columns=['foo','bar','foo','hello']) + df['string'] = 'bah' + expected = DataFrame([[1,1,1,5,'bah'],[1,1,2,5,'bah'],[2,1,3,5,'bah']],columns=['foo','bar','foo','hello','string']) + check(df,expected) + with assertRaisesRegexp(ValueError, 'Length of value'): + df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) + + # insert same dtype + df['foo2'] = 3 + expected = DataFrame([[1,1,1,5,'bah',3],[1,1,2,5,'bah',3],[2,1,3,5,'bah',3]],columns=['foo','bar','foo','hello','string','foo2']) + check(df,expected) + + # set (non-dup) + df['foo2'] = 4 + expected = DataFrame([[1,1,1,5,'bah',4],[1,1,2,5,'bah',4],[2,1,3,5,'bah',4]],columns=['foo','bar','foo','hello','string','foo2']) + check(df,expected) + df['foo2'] = 3 + + # delete (non dup) + del df['bar'] + expected = DataFrame([[1,1,5,'bah',3],[1,2,5,'bah',3],[2,3,5,'bah',3]],columns=['foo','foo','hello','string','foo2']) + check(df,expected) + + # try to delete again (its not consolidated) + del df['hello'] + expected = DataFrame([[1,1,'bah',3],[1,2,'bah',3],[2,3,'bah',3]],columns=['foo','foo','string','foo2']) + check(df,expected) + + # consolidate + df = df.consolidate() + expected = DataFrame([[1,1,'bah',3],[1,2,'bah',3],[2,3,'bah',3]],columns=['foo','foo','string','foo2']) + check(df,expected) + + # insert + df.insert(2,'new_col',5.) + expected = DataFrame([[1,1,5.,'bah',3],[1,2,5.,'bah',3],[2,3,5.,'bah',3]],columns=['foo','foo','new_col','string','foo2']) + check(df,expected) + + # insert a dup + assertRaisesRegexp(ValueError, 'cannot insert', df.insert, 2, 'new_col', 4.) 
+ df.insert(2,'new_col',4.,allow_duplicates=True) + expected = DataFrame([[1,1,4.,5.,'bah',3],[1,2,4.,5.,'bah',3],[2,3,4.,5.,'bah',3]],columns=['foo','foo','new_col','new_col','string','foo2']) + check(df,expected) + + # delete (dup) + del df['foo'] + expected = DataFrame([[4.,5.,'bah',3],[4.,5.,'bah',3],[4.,5.,'bah',3]],columns=['new_col','new_col','string','foo2']) + assert_frame_equal(df,expected) + + # dup across dtypes + df = DataFrame([[1,1,1.,5],[1,1,2.,5],[2,1,3.,5]],columns=['foo','bar','foo','hello']) + check(df) + + df['foo2'] = 7. + expected = DataFrame([[1,1,1.,5,7.],[1,1,2.,5,7.],[2,1,3.,5,7.]],columns=['foo','bar','foo','hello','foo2']) + check(df,expected) + + result = df['foo'] + expected = DataFrame([[1,1.],[1,2.],[2,3.]],columns=['foo','foo']) + check(result,expected) + + # multiple replacements + df['foo'] = 'string' + expected = DataFrame([['string',1,'string',5,7.],['string',1,'string',5,7.],['string',1,'string',5,7.]],columns=['foo','bar','foo','hello','foo2']) + check(df,expected) + + del df['foo'] + expected = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','hello','foo2']) + check(df,expected) + + # values + df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x']) + result = df.values + expected = np.array([[1,2.5],[3,4.5]]) + self.assertTrue((result == expected).all().all()) + + # rename, GH 4403 + df4 = DataFrame({'TClose': [22.02], + 'RT': [0.0454], + 'TExg': [0.0422]}, + index=MultiIndex.from_tuples([(600809, 20130331)], names=['STK_ID', 'RPT_Date'])) + + df5 = DataFrame({'STK_ID': [600809] * 3, + 'RPT_Date': [20120930,20121231,20130331], + 'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')], + 'TClose': [38.05, 41.66, 30.01]}, + index=MultiIndex.from_tuples([(600809, 20120930), (600809, 20121231),(600809,20130331)], names=['STK_ID', 'RPT_Date'])) + + k = pd.merge(df4,df5,how='inner',left_index=True,right_index=True) + result = k.rename(columns={'TClose_x':'TClose', 'TClose_y':'QT_Close'}) + str(result) + result.dtypes + + expected = DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809, u('饡驦'), 30.01 ]], + columns=['RT','TClose','TExg','RPT_Date','STK_ID','STK_Name','QT_Close']).set_index(['STK_ID','RPT_Date'],drop=False) + assert_frame_equal(result,expected) + + # reindex is invalid! 
+ df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a']) + self.assertRaises(ValueError, df.reindex, columns=['bar']) + self.assertRaises(ValueError, df.reindex, columns=['bar','foo']) + + # drop + df = DataFrame([[1,5,7.],[1,5,7.],[1,5,7.]],columns=['bar','a','a']) + result = df.drop(['a'],axis=1) + expected = DataFrame([[1],[1],[1]],columns=['bar']) + check(result,expected) + result = df.drop('a',axis=1) + check(result,expected) + + # describe + df = DataFrame([[1,1,1],[2,2,2],[3,3,3]],columns=['bar','a','a'],dtype='float64') + result = df.describe() + s = df.iloc[:,0].describe() + expected = pd.concat([ s, s, s],keys=df.columns,axis=1) + check(result,expected) + + # check column dups with index equal and not equal to df's index + df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], + columns=['A', 'B', 'A']) + for index in [df.index, pd.Index(list('edcba'))]: + this_df = df.copy() + expected_ser = pd.Series(index.values, index=this_df.index) + expected_df = DataFrame.from_items([('A', expected_ser), + ('B', this_df['B']), + ('A', expected_ser)]) + this_df['A'] = index + check(this_df, expected_df) + + # operations + for op in ['__add__','__mul__','__sub__','__truediv__']: + df = DataFrame(dict(A = np.arange(10), B = np.random.rand(10))) + expected = getattr(df,op)(df) + expected.columns = ['A','A'] + df.columns = ['A','A'] + result = getattr(df,op)(df) + check(result,expected) + + # multiple assignments that change dtypes + # the location indexer is a slice + # GH 6120 + df = DataFrame(np.random.randn(5,2), columns=['that', 'that']) + expected = DataFrame(1.0, index=range(5), columns=['that', 'that']) + + df['that'] = 1.0 + check(df, expected) + + df = DataFrame(np.random.rand(5,2), columns=['that', 'that']) + expected = DataFrame(1, index=range(5), columns=['that', 'that']) + + df['that'] = 1 + check(df, expected) + + def test_column_dups2(self): + + # drop buggy GH 6240 + df = DataFrame({'A' : np.random.randn(5), + 'B' : np.random.randn(5), + 'C' : np.random.randn(5), + 'D' : ['a','b','c','d','e'] }) + + expected = df.take([0,1,1], axis=1) + df2 = df.take([2,0,1,2,1], axis=1) + result = df2.drop('C',axis=1) + assert_frame_equal(result, expected) + + # dropna + df = DataFrame({'A' : np.random.randn(5), + 'B' : np.random.randn(5), + 'C' : np.random.randn(5), + 'D' : ['a','b','c','d','e'] }) + df.iloc[2,[0,1,2]] = np.nan + df.iloc[0,0] = np.nan + df.iloc[1,1] = np.nan + df.iloc[:,3] = np.nan + expected = df.dropna(subset=['A','B','C'],how='all') + expected.columns = ['A','A','B','C'] + + df.columns = ['A','A','B','C'] + + result = df.dropna(subset=['A','C'],how='all') + assert_frame_equal(result, expected) + + def test_column_dups_indexing(self): + def check(result, expected=None): + if expected is not None: + assert_frame_equal(result,expected) + result.dtypes + str(result) + + # boolean indexing + # GH 4879 + dups = ['A', 'A', 'C', 'D'] + df = DataFrame(np.arange(12).reshape(3,4), columns=['A', 'B', 'C', 'D'],dtype='float64') + expected = df[df.C > 6] + expected.columns = dups + df = DataFrame(np.arange(12).reshape(3,4), columns=dups,dtype='float64') + result = df[df.C > 6] + check(result,expected) + + # where + df = DataFrame(np.arange(12).reshape(3,4), columns=['A', 'B', 'C', 'D'],dtype='float64') + expected = df[df > 6] + expected.columns = dups + df = DataFrame(np.arange(12).reshape(3,4), columns=dups,dtype='float64') + result = df[df > 6] + check(result,expected) + + # boolean with the duplicate raises + df = DataFrame(np.arange(12).reshape(3,4), 
columns=dups,dtype='float64') + self.assertRaises(ValueError, lambda : df[df.A > 6]) + + # dup aligining operations should work + # GH 5185 + df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3]) + df2 = DataFrame([1, 2, 3], index=[1, 2, 3]) + expected = DataFrame([0,2,0,2,2],index=[1,1,2,2,3]) + result = df1.sub(df2) + assert_frame_equal(result,expected) + + # equality + df1 = DataFrame([[1,2],[2,np.nan],[3,4],[4,4]],columns=['A','B']) + df2 = DataFrame([[0,1],[2,4],[2,np.nan],[4,5]],columns=['A','A']) + + # not-comparing like-labelled + self.assertRaises(ValueError, lambda : df1 == df2) + + df1r = df1.reindex_like(df2) + result = df1r == df2 + expected = DataFrame([[False,True],[True,False],[False,False],[True,False]],columns=['A','A']) + assert_frame_equal(result,expected) + + # mixed column selection + # GH 5639 + dfbool = DataFrame({'one' : Series([True, True, False], index=['a', 'b', 'c']), + 'two' : Series([False, False, True, False], index=['a', 'b', 'c', 'd']), + 'three': Series([False, True, True, True], index=['a', 'b', 'c', 'd'])}) + expected = pd.concat([dfbool['one'],dfbool['three'],dfbool['one']],axis=1) + result = dfbool[['one', 'three', 'one']] + check(result,expected) + + # multi-axis dups + # GH 6121 + df = DataFrame(np.arange(25.).reshape(5,5), + index=['a', 'b', 'c', 'd', 'e'], + columns=['A', 'B', 'C', 'D', 'E']) + z = df[['A', 'C', 'A']].copy() + expected = z.ix[['a', 'c', 'a']] + + df = DataFrame(np.arange(25.).reshape(5,5), + index=['a', 'b', 'c', 'd', 'e'], + columns=['A', 'B', 'C', 'D', 'E']) + z = df[['A', 'C', 'A']] + result = z.ix[['a', 'c', 'a']] + check(result,expected) + + def test_insert_benchmark(self): + # from the vb_suite/frame_methods/frame_insert_columns + N = 10 + K = 5 + df = DataFrame(index=lrange(N)) + new_col = np.random.randn(N) + for i in range(K): + df[i] = new_col + expected = DataFrame(np.repeat(new_col,K).reshape(N,K),index=lrange(N)) + assert_frame_equal(df,expected) + + def test_constructor_single_value(self): + + # expecting single value upcasting here + df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c']) + assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('float64'), df.index, + df.columns)) + + df = DataFrame(0, index=[1, 2, 3], columns=['a', 'b', 'c']) + assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'), df.index, + df.columns)) + + + df = DataFrame('a', index=[1, 2], columns=['a', 'c']) + assert_frame_equal(df, DataFrame(np.array([['a', 'a'], + ['a', 'a']], + dtype=object), + index=[1, 2], + columns=['a', 'c'])) + + self.assertRaises(com.PandasError, DataFrame, 'a', [1, 2]) + self.assertRaises(com.PandasError, DataFrame, 'a', columns=['a', 'c']) + with tm.assertRaisesRegexp(TypeError, 'incompatible data and dtype'): + DataFrame('a', [1, 2], ['a', 'c'], float) + + def test_constructor_with_datetimes(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + # single item + df = DataFrame({'A' : 1, 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime(2001,1,2,0,0) }, + index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 1, datetime64name: 2, objectname : 2}) + result.sort_index() + expected.sort_index() + assert_series_equal(result, expected) + + # check with ndarray construction ndim==0 (e.g. 
we are passing a ndim 0 ndarray with a dtype specified) + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array(1.,dtype=floatname), + intname : np.array(1,dtype=intname)}, index=np.arange(10)) + result = df.get_dtype_counts() + expected = { objectname : 1 } + if intname == 'int64': + expected['int64'] = 2 + else: + expected['int64'] = 1 + expected[intname] = 1 + if floatname == 'float64': + expected['float64'] = 2 + else: + expected['float64'] = 1 + expected[floatname] = 1 + + result.sort_index() + expected = Series(expected) + expected.sort_index() + assert_series_equal(result, expected) + + # check with ndarray construction ndim>0 + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array([1.]*10,dtype=floatname), + intname : np.array([1]*10,dtype=intname)}, index=np.arange(10)) + result = df.get_dtype_counts() + result.sort_index() + assert_series_equal(result, expected) + + # GH 2809 + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in ind] + datetime_s = Series(datetimes) + self.assertEqual(datetime_s.dtype, 'M8[ns]') + df = DataFrame({'datetime_s':datetime_s}) + result = df.get_dtype_counts() + expected = Series({ datetime64name : 1 }) + result.sort_index() + expected.sort_index() + assert_series_equal(result, expected) + + # GH 2810 + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in ind] + dates = [ts.date() for ts in ind] + df = DataFrame({'datetimes': datetimes, 'dates':dates}) + result = df.get_dtype_counts() + expected = Series({ datetime64name : 1, objectname : 1 }) + result.sort_index() + expected.sort_index() + assert_series_equal(result, expected) + + # GH 7594 + # don't coerce tz-aware + import pytz + tz = pytz.timezone('US/Eastern') + dt = tz.localize(datetime(2012, 1, 1)) + df = DataFrame({'End Date': dt}, index=[0]) + self.assertEqual(df.iat[0,0],dt) + assert_series_equal(df.dtypes,Series({'End Date' : np.dtype('object') })) + + df = DataFrame([{'End Date': dt}]) + self.assertEqual(df.iat[0,0],dt) + assert_series_equal(df.dtypes,Series({'End Date' : np.dtype('object') })) + + def test_constructor_for_list_with_dtypes(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + # test list of lists/ndarrays + df = DataFrame([np.arange(5) for x in range(5)]) + result = df.get_dtype_counts() + expected = Series({'int64' : 5}) + + df = DataFrame([np.array(np.arange(5),dtype='int32') for x in range(5)]) + result = df.get_dtype_counts() + expected = Series({'int32' : 5}) + + # overflow issue? 
(we always expecte int64 upcasting here) + df = DataFrame({'a' : [2**31,2**31+1]}) + result = df.get_dtype_counts() + expected = Series({'int64' : 1 }) + assert_series_equal(result, expected) + + # GH #2751 (construction with no index specified), make sure we cast to platform values + df = DataFrame([1, 2]) + result = df.get_dtype_counts() + expected = Series({'int64': 1 }) + assert_series_equal(result, expected) + + df = DataFrame([1.,2.]) + result = df.get_dtype_counts() + expected = Series({'float64' : 1 }) + assert_series_equal(result, expected) + + df = DataFrame({'a' : [1, 2]}) + result = df.get_dtype_counts() + expected = Series({'int64' : 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : [1., 2.]}) + result = df.get_dtype_counts() + expected = Series({'float64' : 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : 1 }, index=lrange(3)) + result = df.get_dtype_counts() + expected = Series({'int64': 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : 1. }, index=lrange(3)) + result = df.get_dtype_counts() + expected = Series({'float64': 1 }) + assert_series_equal(result, expected) + + # with object list + df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], + 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)], + 'e' : [1.,2,4.,7]}) + result = df.get_dtype_counts() + expected = Series({'int64': 1, 'float64' : 2, datetime64name: 1, objectname : 1}) + result.sort_index() + expected.sort_index() + assert_series_equal(result, expected) + + def test_not_hashable(self): + df = pd.DataFrame([1]) + self.assertRaises(TypeError, hash, df) + self.assertRaises(TypeError, hash, self.empty) + + def test_timedeltas(self): + + df = DataFrame(dict(A = Series(date_range('2012-1-1', periods=3, freq='D')), + B = Series([ timedelta(days=i) for i in range(3) ]))) + result = df.get_dtype_counts() + expected = Series({'datetime64[ns]': 1, 'timedelta64[ns]' : 1 }) + result.sort() + expected.sort() + assert_series_equal(result, expected) + + df['C'] = df['A'] + df['B'] + expected = Series({'datetime64[ns]': 2, 'timedelta64[ns]' : 1 }) + result = df.get_dtype_counts() + result.sort() + expected.sort() + assert_series_equal(result, expected) + + # mixed int types + df['D'] = 1 + expected = Series({'datetime64[ns]': 2, 'timedelta64[ns]' : 1, 'int64' : 1 }) + result = df.get_dtype_counts() + result.sort() + expected.sort() + assert_series_equal(result, expected) + + def test_operators_timedelta64(self): + + from datetime import datetime, timedelta + df = DataFrame(dict(A = date_range('2012-1-1', periods=3, freq='D'), + B = date_range('2012-1-2', periods=3, freq='D'), + C = Timestamp('20120101')-timedelta(minutes=5,seconds=5))) + + diffs = DataFrame(dict(A = df['A']-df['C'], + B = df['A']-df['B'])) + + + # min + result = diffs.min() + self.assertEqual(result[0], diffs.ix[0,'A']) + self.assertEqual(result[1], diffs.ix[0,'B']) + + result = diffs.min(axis=1) + self.assertTrue((result == diffs.ix[0,'B']).all() == True) + + # max + result = diffs.max() + self.assertEqual(result[0], diffs.ix[2,'A']) + self.assertEqual(result[1], diffs.ix[2,'B']) + + result = diffs.max(axis=1) + self.assertTrue((result == diffs['A']).all() == True) + + # abs + result = diffs.abs() + result2 = abs(diffs) + expected = DataFrame(dict(A = df['A']-df['C'], + B = df['B']-df['A'])) + assert_frame_equal(result,expected) + assert_frame_equal(result2, expected) + + # mixed frame + mixed = diffs.copy() + mixed['C'] = 'foo' + mixed['D'] = 1 + mixed['E'] = 1. 
+ mixed['F'] = Timestamp('20130101') + + # results in an object array + from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + result = mixed.min() + expected = Series([_coerce_scalar_to_timedelta_type(timedelta(seconds=5*60+5)), + _coerce_scalar_to_timedelta_type(timedelta(days=-1)), + 'foo', + 1, + 1.0, + Timestamp('20130101')], + index=mixed.columns) + assert_series_equal(result,expected) + + # excludes numeric + result = mixed.min(axis=1) + expected = Series([1, 1, 1.],index=[0, 1, 2]) + assert_series_equal(result,expected) + + # works when only those columns are selected + result = mixed[['A','B']].min(1) + expected = Series([ timedelta(days=-1) ] * 3) + assert_series_equal(result,expected) + + result = mixed[['A','B']].min() + expected = Series([ timedelta(seconds=5*60+5), timedelta(days=-1) ],index=['A','B']) + assert_series_equal(result,expected) + + # GH 3106 + df = DataFrame({'time' : date_range('20130102',periods=5), + 'time2' : date_range('20130105',periods=5) }) + df['off1'] = df['time2']-df['time'] + self.assertEqual(df['off1'].dtype, 'timedelta64[ns]') + + df['off2'] = df['time']-df['time2'] + df._consolidate_inplace() + self.assertTrue(df['off1'].dtype == 'timedelta64[ns]') + self.assertTrue(df['off2'].dtype == 'timedelta64[ns]') + + def test_datetimelike_setitem_with_inference(self): + tm._skip_if_not_numpy17_friendly() + + # GH 7592 + # assignment of timedeltas with NaT + + one_hour = timedelta(hours=1) + df = DataFrame(index=date_range('20130101',periods=4)) + df['A'] = np.array([1*one_hour]*4, dtype='m8[ns]') + df.loc[:,'B'] = np.array([2*one_hour]*4, dtype='m8[ns]') + df.loc[:3,'C'] = np.array([3*one_hour]*3, dtype='m8[ns]') + df.ix[:,'D'] = np.array([4*one_hour]*4, dtype='m8[ns]') + df.ix[:3,'E'] = np.array([5*one_hour]*3, dtype='m8[ns]') + df['F'] = np.timedelta64('NaT') + df.ix[:-1,'F'] = np.array([6*one_hour]*3, dtype='m8[ns]') + df.ix[-3:,'G'] = date_range('20130101',periods=3) + df['H'] = np.datetime64('NaT') + result = df.dtypes + expected = Series([np.dtype('timedelta64[ns]')]*6+[np.dtype('datetime64[ns]')]*2,index=list('ABCDEFGH')) + assert_series_equal(result,expected) + + def test_new_empty_index(self): + df1 = DataFrame(randn(0, 3)) + df2 = DataFrame(randn(0, 3)) + df1.index.name = 'foo' + self.assertIsNone(df2.index.name) + + def test_astype(self): + casted = self.frame.astype(int) + expected = DataFrame(self.frame.values.astype(int), + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(casted, expected) + + casted = self.frame.astype(np.int32) + expected = DataFrame(self.frame.values.astype(np.int32), + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(casted, expected) + + self.frame['foo'] = '5' + casted = self.frame.astype(int) + expected = DataFrame(self.frame.values.astype(int), + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(casted, expected) + + # mixed casting + def _check_cast(df, v): + self.assertEqual(list(set([ s.dtype.name for _, s in compat.iteritems(df) ]))[0], v) + + mn = self.all_mixed._get_numeric_data().copy() + mn['little_float'] = np.array(12345.,dtype='float16') + mn['big_float'] = np.array(123456789101112.,dtype='float64') + + casted = mn.astype('float64') + _check_cast(casted, 'float64') + + casted = mn.astype('int64') + _check_cast(casted, 'int64') + + casted = self.mixed_float.reindex(columns = ['A','B']).astype('float32') + _check_cast(casted, 'float32') + + casted = mn.reindex(columns = ['little_float']).astype('float16') + 
_check_cast(casted, 'float16') + + casted = self.mixed_float.reindex(columns = ['A','B']).astype('float16') + _check_cast(casted, 'float16') + + casted = mn.astype('float32') + _check_cast(casted, 'float32') + + casted = mn.astype('int32') + _check_cast(casted, 'int32') + + # to object + casted = mn.astype('O') + _check_cast(casted, 'object') + + def test_astype_with_exclude_string(self): + df = self.frame.copy() + expected = self.frame.astype(int) + df['string'] = 'foo' + casted = df.astype(int, raise_on_error = False) + + expected['string'] = 'foo' + assert_frame_equal(casted, expected) + + df = self.frame.copy() + expected = self.frame.astype(np.int32) + df['string'] = 'foo' + casted = df.astype(np.int32, raise_on_error = False) + + expected['string'] = 'foo' + assert_frame_equal(casted, expected) + + def test_astype_with_view(self): + + tf = self.mixed_float.reindex(columns = ['A','B','C']) + + casted = tf.astype(np.int64) + + casted = tf.astype(np.float32) + + # this is the only real reason to do it this way + tf = np.round(self.frame).astype(np.int32) + casted = tf.astype(np.float32, copy = False) + + tf = self.frame.astype(np.float64) + casted = tf.astype(np.int64, copy = False) + + def test_astype_cast_nan_int(self): + df = DataFrame(data={"Values": [1.0, 2.0, 3.0, np.nan]}) + self.assertRaises(ValueError, df.astype, np.int64) + + def test_array_interface(self): + result = np.sqrt(self.frame) + tm.assert_isinstance(result, type(self.frame)) + self.assertIs(result.index, self.frame.index) + self.assertIs(result.columns, self.frame.columns) + + assert_frame_equal(result, self.frame.apply(np.sqrt)) + + def test_pickle(self): + unpickled = pickle.loads(pickle.dumps(self.mixed_frame)) + assert_frame_equal(self.mixed_frame, unpickled) + + # buglet + self.mixed_frame._data.ndim + + # empty + unpickled = pickle.loads(pickle.dumps(self.empty)) + repr(unpickled) + + def test_to_dict(self): + test_data = { + 'A': {'1': 1, '2': 2}, + 'B': {'1': '1', '2': '2', '3': '3'}, + } + recons_data = DataFrame(test_data).to_dict() + + for k, v in compat.iteritems(test_data): + for k2, v2 in compat.iteritems(v): + self.assertEqual(v2, recons_data[k][k2]) + + recons_data = DataFrame(test_data).to_dict("l") + + for k, v in compat.iteritems(test_data): + for k2, v2 in compat.iteritems(v): + self.assertEqual(v2, recons_data[k][int(k2) - 1]) + + recons_data = DataFrame(test_data).to_dict("s") + + for k, v in compat.iteritems(test_data): + for k2, v2 in compat.iteritems(v): + self.assertEqual(v2, recons_data[k][k2]) + + recons_data = DataFrame(test_data).to_dict("r") + + expected_records = [{'A': 1.0, 'B': '1'}, + {'A': 2.0, 'B': '2'}, + {'A': nan, 'B': '3'}] + + tm.assert_almost_equal(recons_data, expected_records) + + def test_to_records_dt64(self): + df = DataFrame([["one", "two", "three"], + ["four", "five", "six"]], + index=date_range("2012-01-01", "2012-01-02")) + self.assertEqual(df.to_records()['index'][0], df.index[0]) + + rs = df.to_records(convert_datetime64=False) + self.assertEqual(rs['index'][0], df.index.values[0]) + + def test_to_records_with_multindex(self): + # GH3189 + index = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + data = np.zeros((8, 4)) + df = DataFrame(data, index=index) + r = df.to_records(index=True)['level_0'] + self.assertTrue('bar' in r) + self.assertTrue('one' not in r) + + def test_to_records_with_Mapping_type(self): + import email + from email.parser import Parser + import collections + + 
collections.Mapping.register(email.message.Message) + + headers = Parser().parsestr('From: \n' + 'To: \n' + 'Subject: Test message\n' + '\n' + 'Body would go here\n') + + frame = DataFrame.from_records([headers]) + all( x in frame for x in ['Type','Subject','From']) + + def test_from_records_to_records(self): + # from numpy documentation + arr = np.zeros((2,), dtype=('i4,f4,a10')) + arr[:] = [(1, 2., 'Hello'), (2, 3., "World")] + + frame = DataFrame.from_records(arr) + + index = np.arange(len(arr))[::-1] + indexed_frame = DataFrame.from_records(arr, index=index) + self.assert_numpy_array_equal(indexed_frame.index, index) + + # without names, it should go to last ditch + arr2 = np.zeros((2,3)) + tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) + + # wrong length + msg = r'Shape of passed values is \(3,\), indices imply \(3, 1\)' + with assertRaisesRegexp(ValueError, msg): + DataFrame.from_records(arr, index=index[:-1]) + + indexed_frame = DataFrame.from_records(arr, index='f1') + + # what to do? + records = indexed_frame.to_records() + self.assertEqual(len(records.dtype.names), 3) + + records = indexed_frame.to_records(index=False) + self.assertEqual(len(records.dtype.names), 2) + self.assertNotIn('index', records.dtype.names) + + def test_from_records_nones(self): + tuples = [(1, 2, None, 3), + (1, 2, None, 3), + (None, 2, 5, 3)] + + df = DataFrame.from_records(tuples, columns=['a', 'b', 'c', 'd']) + self.assertTrue(np.isnan(df['c'][0])) + + def test_from_records_iterator(self): + arr = np.array([(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5., 5., 6, 6), (7., 7., 8, 8)], + dtype=[('x', np.float64), ('u', np.float32), ('y', np.int64), ('z', np.int32) ]) + df = DataFrame.from_records(iter(arr), nrows=2) + xp = DataFrame({'x': np.array([1.0, 3.0], dtype=np.float64), + 'u': np.array([1.0, 3.0], dtype=np.float32), + 'y': np.array([2, 4], dtype=np.int64), + 'z': np.array([2, 4], dtype=np.int32)}) + assert_frame_equal(df.reindex_like(xp), xp) + + # no dtypes specified here, so just compare with the default + arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)] + df = DataFrame.from_records(iter(arr), columns=['x', 'y'], + nrows=2) + assert_frame_equal(df, xp.reindex(columns=['x','y']), check_dtype=False) + + def test_from_records_tuples_generator(self): + def tuple_generator(length): + for i in range(length): + letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + yield (i, letters[i % len(letters)], i/length) + + columns_names = ['Integer', 'String', 'Float'] + columns = [[i[j] for i in tuple_generator(10)] for j in range(len(columns_names))] + data = {'Integer': columns[0], 'String': columns[1], 'Float': columns[2]} + expected = DataFrame(data, columns=columns_names) + + generator = tuple_generator(10) + result = DataFrame.from_records(generator, columns=columns_names) + assert_frame_equal(result, expected) + + def test_from_records_lists_generator(self): + def list_generator(length): + for i in range(length): + letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + yield [i, letters[i % len(letters)], i/length] + + columns_names = ['Integer', 'String', 'Float'] + columns = [[i[j] for i in list_generator(10)] for j in range(len(columns_names))] + data = {'Integer': columns[0], 'String': columns[1], 'Float': columns[2]} + expected = DataFrame(data, columns=columns_names) + + generator = list_generator(10) + result = DataFrame.from_records(generator, columns=columns_names) + assert_frame_equal(result, expected) + + def test_from_records_columns_not_modified(self): + tuples = [(1, 2, 3), + (1, 2, 3), + (2, 5, 3)] + + 
columns = ['a', 'b', 'c'] + original_columns = list(columns) + df = DataFrame.from_records(tuples, columns=columns, index='a') + self.assertEqual(columns, original_columns) + + def test_from_records_decimal(self): + from decimal import Decimal + + tuples = [(Decimal('1.5'),), (Decimal('2.5'),), (None,)] + + df = DataFrame.from_records(tuples, columns=['a']) + self.assertEqual(df['a'].dtype, object) + + df = DataFrame.from_records(tuples, columns=['a'], coerce_float=True) + self.assertEqual(df['a'].dtype, np.float64) + self.assertTrue(np.isnan(df['a'].values[-1])) + + def test_from_records_duplicates(self): + result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], + columns=['a', 'b', 'a']) + + expected = DataFrame([(1, 2, 3), (4, 5, 6)], + columns=['a', 'b', 'a']) + + assert_frame_equal(result, expected) + + def test_from_records_set_index_name(self): + def create_dict(order_id): + return {'order_id': order_id, 'quantity': np.random.randint(1, 10), + 'price': np.random.randint(1, 10)} + documents = [create_dict(i) for i in range(10)] + # demo missing data + documents.append({'order_id': 10, 'quantity': 5}) + + result = DataFrame.from_records(documents, index='order_id') + self.assertEqual(result.index.name, 'order_id') + + # MultiIndex + result = DataFrame.from_records(documents, + index=['order_id', 'quantity']) + self.assertEqual(result.index.names, ('order_id', 'quantity')) + + def test_from_records_misc_brokenness(self): + # #2179 + + data = {1: ['foo'], 2: ['bar']} + + result = DataFrame.from_records(data, columns=['a', 'b']) + exp = DataFrame(data, columns=['a', 'b']) + assert_frame_equal(result, exp) + + # overlap in index/index_names + + data = {'a': [1, 2, 3], 'b': [4, 5, 6]} + + result = DataFrame.from_records(data, index=['a', 'b', 'c']) + exp = DataFrame(data, index=['a', 'b', 'c']) + assert_frame_equal(result, exp) + + + # GH 2623 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), 'hi']) # test col upconverts to obj + df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + results = df2_obj.get_dtype_counts() + expected = Series({ 'datetime64[ns]' : 1, 'object' : 1 }) + + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), 1]) + df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + results = df2_obj.get_dtype_counts() + expected = Series({ 'datetime64[ns]' : 1, 'int64' : 1 }) + + def test_from_records_empty(self): + # 3562 + result = DataFrame.from_records([], columns=['a','b','c']) + expected = DataFrame(columns=['a','b','c']) + assert_frame_equal(result, expected) + + result = DataFrame.from_records([], columns=['a','b','b']) + expected = DataFrame(columns=['a','b','b']) + assert_frame_equal(result, expected) + + def test_from_records_empty_with_nonempty_fields_gh3682(self): + a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)]) + df = DataFrame.from_records(a, index='id') + assert_array_equal(df.index, Index([1], name='id')) + self.assertEqual(df.index.name, 'id') + assert_array_equal(df.columns, Index(['value'])) + + b = np.array([], dtype=[('id', np.int64), ('value', np.int64)]) + df = DataFrame.from_records(b, index='id') + assert_array_equal(df.index, Index([], name='id')) + self.assertEqual(df.index.name, 'id') + + def test_from_records_with_datetimes(self): + if sys.version < LooseVersion('2.7'): + raise nose.SkipTest('rec arrays dont work properly with py2.6') + + # this may fail on certain platforms because of a numpy issue + # related GH6140 + if not 
is_little_endian(): + raise nose.SkipTest("known failure of test on non-little endian") + + # construction with a null in a recarray + # GH 6140 + expected = DataFrame({ 'EXPIRY' : [datetime(2005, 3, 1, 0, 0), None ]}) + + arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] + dtypes = [('EXPIRY', '= y) + self.assertRaises(TypeError, lambda : x > y) + self.assertRaises(TypeError, lambda : x < y) + self.assertRaises(TypeError, lambda : x <= y) + + # GH4968 + # invalid date/int comparisons + df = DataFrame(np.random.randint(10, size=(10, 1)), columns=['a']) + df['dates'] = date_range('20010101', periods=len(df)) + + df2 = df.copy() + df2['dates'] = df['a'] + check(df,df2) + + df = DataFrame(np.random.randint(10, size=(10, 2)), columns=['a', 'b']) + df2 = DataFrame({'a': date_range('20010101', periods=len(df)), 'b': date_range('20100101', periods=len(df))}) + check(df,df2) + + def test_timestamp_compare(self): + # make sure we can compare Timestamps on the right AND left hand side + # GH4982 + df = DataFrame({'dates1': date_range('20010101', periods=10), + 'dates2': date_range('20010102', periods=10), + 'intcol': np.random.randint(1000000000, size=10), + 'floatcol': np.random.randn(10), + 'stringcol': list(tm.rands(10))}) + df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT + ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq', + 'ne': 'ne'} + for left, right in ops.items(): + left_f = getattr(operator, left) + right_f = getattr(operator, right) + + # no nats + expected = left_f(df, Timestamp('20010109')) + result = right_f(Timestamp('20010109'), df) + tm.assert_frame_equal(result, expected) + + # nats + expected = left_f(df, Timestamp('nat')) + result = right_f(Timestamp('nat'), df) + tm.assert_frame_equal(result, expected) + + def test_modulo(self): + + # GH3590, modulo as ints + p = DataFrame({ 'first' : [3,4,5,8], 'second' : [0,0,0,3] }) + + ### this is technically wrong as the integer portion is coerced to float ### + expected = DataFrame({ 'first' : Series([0,0,0,0],dtype='float64'), 'second' : Series([np.nan,np.nan,np.nan,0]) }) + result = p % p + assert_frame_equal(result,expected) + + # numpy has a slightly different (wrong) treatement + result2 = DataFrame(p.values % p.values,index=p.index,columns=p.columns,dtype='float64') + result2.iloc[0:3,1] = np.nan + assert_frame_equal(result2,expected) + + result = p % 0 + expected = DataFrame(np.nan,index=p.index,columns=p.columns) + assert_frame_equal(result,expected) + + # numpy has a slightly different (wrong) treatement + result2 = DataFrame(p.values.astype('float64') % 0,index=p.index,columns=p.columns) + assert_frame_equal(result2,expected) + + # not commutative with series + p = DataFrame(np.random.randn(10, 5)) + s = p[0] + res = s % p + res2 = p % s + self.assertFalse(np.array_equal(res.fillna(0), res2.fillna(0))) + + def test_div(self): + + # integer div, but deal with the 0's + p = DataFrame({ 'first' : [3,4,5,8], 'second' : [0,0,0,3] }) + result = p / p + + ### this is technically wrong as the integer portion is coerced to float ### + expected = DataFrame({ 'first' : Series([1,1,1,1],dtype='float64'), 'second' : Series([np.inf,np.inf,np.inf,1]) }) + assert_frame_equal(result,expected) + + result2 = DataFrame(p.values.astype('float64')/p.values,index=p.index,columns=p.columns).fillna(np.inf) + assert_frame_equal(result2,expected) + + result = p / 0 + expected = DataFrame(np.inf,index=p.index,columns=p.columns) + assert_frame_equal(result,expected) + + # numpy has a slightly different (wrong) treatement + 
result2 = DataFrame(p.values.astype('float64')/0,index=p.index,columns=p.columns).fillna(np.inf) + assert_frame_equal(result2,expected) + + p = DataFrame(np.random.randn(10, 5)) + s = p[0] + res = s / p + res2 = p / s + self.assertFalse(np.array_equal(res.fillna(0), res2.fillna(0))) + + def test_logical_operators(self): + import operator + + def _check_bin_op(op): + result = op(df1, df2) + expected = DataFrame(op(df1.values, df2.values), index=df1.index, + columns=df1.columns) + self.assertEqual(result.values.dtype, np.bool_) + assert_frame_equal(result, expected) + + def _check_unary_op(op): + result = op(df1) + expected = DataFrame(op(df1.values), index=df1.index, + columns=df1.columns) + self.assertEqual(result.values.dtype, np.bool_) + assert_frame_equal(result, expected) + + df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, + 'b': {'a': False, 'b': True, 'c': False, + 'd': False, 'e': False}, + 'c': {'a': False, 'b': False, 'c': True, + 'd': False, 'e': False}, + 'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, + 'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}} + + df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, + 'b': {'a': False, 'b': True, 'c': False, + 'd': False, 'e': False}, + 'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, + 'd': {'a': False, 'b': False, 'c': False, + 'd': True, 'e': False}, + 'e': {'a': False, 'b': False, 'c': False, + 'd': False, 'e': True}} + + df1 = DataFrame(df1) + df2 = DataFrame(df2) + + _check_bin_op(operator.and_) + _check_bin_op(operator.or_) + _check_bin_op(operator.xor) + + # operator.neg is deprecated in numpy >= 1.9 + _check_unary_op(operator.inv) + + def test_logical_typeerror(self): + if not compat.PY3: + self.assertRaises(TypeError, self.frame.__eq__, 'foo') + self.assertRaises(TypeError, self.frame.__lt__, 'foo') + self.assertRaises(TypeError, self.frame.__gt__, 'foo') + self.assertRaises(TypeError, self.frame.__ne__, 'foo') + else: + raise nose.SkipTest('test_logical_typeerror not tested on PY3') + + def test_constructor_lists_to_object_dtype(self): + # from #1074 + d = DataFrame({'a': [np.nan, False]}) + self.assertEqual(d['a'].dtype, np.object_) + self.assertFalse(d['a'][1]) + + def test_constructor_with_nas(self): + # GH 5016 + # na's in indicies + + def check(df): + for i in range(len(df.columns)): + df.iloc[:,i] + + # allow single nans to succeed + indexer = np.arange(len(df.columns))[isnull(df.columns)] + + if len(indexer) == 1: + assert_series_equal(df.iloc[:,indexer[0]],df.loc[:,np.nan]) + + + # multiple nans should fail + else: + + def f(): + df.loc[:,np.nan] + self.assertRaises(ValueError, f) + + + df = DataFrame([[1,2,3],[4,5,6]], index=[1,np.nan]) + check(df) + + df = DataFrame([[1,2,3],[4,5,6]], columns=[1.1,2.2,np.nan]) + check(df) + + df = DataFrame([[0,1,2,3],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan]) + check(df) + + df = DataFrame([[0.0,1,2,3.0],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan]) + check(df) + + def test_logical_with_nas(self): + d = DataFrame({'a': [np.nan, False], 'b': [True, True]}) + + # GH4947 + # bool comparisons should return bool + result = d['a'] | d['b'] + expected = Series([False, True]) + assert_series_equal(result, expected) + + # GH4604, automatic casting here + result = d['a'].fillna(False) | d['b'] + expected = Series([True, True]) + assert_series_equal(result, expected) + + result = d['a'].fillna(False,downcast=False) | d['b'] + expected = Series([True, True]) + assert_series_equal(result, expected) + + def 
test_neg(self): + # what to do? + assert_frame_equal(-self.frame, -1 * self.frame) + + def test_invert(self): + assert_frame_equal(-(self.frame < 0), ~(self.frame < 0)) + + def test_first_last_valid(self): + N = len(self.frame.index) + mat = randn(N) + mat[:5] = nan + mat[-5:] = nan + + frame = DataFrame({'foo': mat}, index=self.frame.index) + index = frame.first_valid_index() + + self.assertEqual(index, frame.index[5]) + + index = frame.last_valid_index() + self.assertEqual(index, frame.index[-6]) + + def test_arith_flex_frame(self): + ops = ['add', 'sub', 'mul', 'div', 'truediv', 'pow', 'floordiv', 'mod'] + if not compat.PY3: + aliases = {} + else: + aliases = {'div': 'truediv'} + + for op in ops: + try: + alias = aliases.get(op, op) + f = getattr(operator, alias) + result = getattr(self.frame, op)(2 * self.frame) + exp = f(self.frame, 2 * self.frame) + assert_frame_equal(result, exp) + + # vs mix float + result = getattr(self.mixed_float, op)(2 * self.mixed_float) + exp = f(self.mixed_float, 2 * self.mixed_float) + assert_frame_equal(result, exp) + _check_mixed_float(result, dtype = dict(C = None)) + + # vs mix int + if op in ['add','sub','mul']: + result = getattr(self.mixed_int, op)(2 + self.mixed_int) + exp = f(self.mixed_int, 2 + self.mixed_int) + + # overflow in the uint + dtype = None + if op in ['sub']: + dtype = dict(B = 'object', C = None) + elif op in ['add','mul']: + dtype = dict(C = None) + assert_frame_equal(result, exp) + _check_mixed_int(result, dtype = dtype) + + # rops + r_f = lambda x, y: f(y, x) + result = getattr(self.frame, 'r' + op)(2 * self.frame) + exp = r_f(self.frame, 2 * self.frame) + assert_frame_equal(result, exp) + + # vs mix float + result = getattr(self.mixed_float, op)(2 * self.mixed_float) + exp = f(self.mixed_float, 2 * self.mixed_float) + assert_frame_equal(result, exp) + _check_mixed_float(result, dtype = dict(C = None)) + + result = getattr(self.intframe, op)(2 * self.intframe) + exp = f(self.intframe, 2 * self.intframe) + assert_frame_equal(result, exp) + + # vs mix int + if op in ['add','sub','mul']: + result = getattr(self.mixed_int, op)(2 + self.mixed_int) + exp = f(self.mixed_int, 2 + self.mixed_int) + + # overflow in the uint + dtype = None + if op in ['sub']: + dtype = dict(B = 'object', C = None) + elif op in ['add','mul']: + dtype = dict(C = None) + assert_frame_equal(result, exp) + _check_mixed_int(result, dtype = dtype) + except: + com.pprint_thing("Failing operation %r" % op) + raise + + # ndim >= 3 + ndim_5 = np.ones(self.frame.shape + (3, 4, 5)) + with assertRaisesRegexp(ValueError, 'shape'): + f(self.frame, ndim_5) + + with assertRaisesRegexp(ValueError, 'shape'): + getattr(self.frame, op)(ndim_5) + + + # res_add = self.frame.add(self.frame) + # res_sub = self.frame.sub(self.frame) + # res_mul = self.frame.mul(self.frame) + # res_div = self.frame.div(2 * self.frame) + + # assert_frame_equal(res_add, self.frame + self.frame) + # assert_frame_equal(res_sub, self.frame - self.frame) + # assert_frame_equal(res_mul, self.frame * self.frame) + # assert_frame_equal(res_div, self.frame / (2 * self.frame)) + + const_add = self.frame.add(1) + assert_frame_equal(const_add, self.frame + 1) + + # corner cases + result = self.frame.add(self.frame[:0]) + assert_frame_equal(result, self.frame * np.nan) + + result = self.frame[:0].add(self.frame) + assert_frame_equal(result, self.frame * np.nan) + with assertRaisesRegexp(NotImplementedError, 'fill_value'): + self.frame.add(self.frame.irow(0), fill_value=3) + with 
assertRaisesRegexp(NotImplementedError, 'fill_value'): + self.frame.add(self.frame.irow(0), axis='index', fill_value=3) + + def test_binary_ops_align(self): + + # test aligning binary ops + + # GH 6681 + index=MultiIndex.from_product([list('abc'), + ['one','two','three'], + [1,2,3]], + names=['first','second','third']) + + df = DataFrame(np.arange(27*3).reshape(27,3), + index=index, + columns=['value1','value2','value3']).sortlevel() + + idx = pd.IndexSlice + for op in ['add','sub','mul','div','truediv']: + opa = getattr(operator,op,None) + if opa is None: + continue + + x = Series([ 1.0, 10.0, 100.0], [1,2,3]) + result = getattr(df,op)(x,level='third',axis=0) + + expected = pd.concat([ opa(df.loc[idx[:,:,i],:],v) for i, v in x.iteritems() ]).sortlevel() + assert_frame_equal(result, expected) + + x = Series([ 1.0, 10.0], ['two','three']) + result = getattr(df,op)(x,level='second',axis=0) + + expected = pd.concat([ opa(df.loc[idx[:,i],:],v) for i, v in x.iteritems() ]).reindex_like(df).sortlevel() + assert_frame_equal(result, expected) + + def test_arith_mixed(self): + + left = DataFrame({'A': ['a', 'b', 'c'], + 'B': [1, 2, 3]}) + + result = left + left + expected = DataFrame({'A': ['aa', 'bb', 'cc'], + 'B': [2, 4, 6]}) + assert_frame_equal(result, expected) + + def test_arith_getitem_commute(self): + df = DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) + + self._test_op(df, operator.add) + self._test_op(df, operator.sub) + self._test_op(df, operator.mul) + self._test_op(df, operator.truediv) + self._test_op(df, operator.floordiv) + self._test_op(df, operator.pow) + + self._test_op(df, lambda x, y: y + x) + self._test_op(df, lambda x, y: y - x) + self._test_op(df, lambda x, y: y * x) + self._test_op(df, lambda x, y: y / x) + self._test_op(df, lambda x, y: y ** x) + + self._test_op(df, lambda x, y: x + y) + self._test_op(df, lambda x, y: x - y) + self._test_op(df, lambda x, y: x * y) + self._test_op(df, lambda x, y: x / y) + self._test_op(df, lambda x, y: x ** y) + + @staticmethod + def _test_op(df, op): + result = op(df, 1) + + if not df.columns.is_unique: + raise ValueError("Only unique columns supported by this test") + + for col in result.columns: + assert_series_equal(result[col], op(df[col], 1)) + + def test_bool_flex_frame(self): + data = np.random.randn(5, 3) + other_data = np.random.randn(5, 3) + df = DataFrame(data) + other = DataFrame(other_data) + ndim_5 = np.ones(df.shape + (1, 3)) + + # Unaligned + def _check_unaligned_frame(meth, op, df, other): + part_o = other.ix[3:, 1:].copy() + rs = meth(part_o) + xp = op(df, part_o.reindex(index=df.index, columns=df.columns)) + assert_frame_equal(rs, xp) + + # DataFrame + self.assertTrue(df.eq(df).values.all()) + self.assertFalse(df.ne(df).values.any()) + for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: + f = getattr(df, op) + o = getattr(operator, op) + # No NAs + assert_frame_equal(f(other), o(df, other)) + _check_unaligned_frame(f, o, df, other) + # ndarray + assert_frame_equal(f(other.values), o(df, other.values)) + # scalar + assert_frame_equal(f(0), o(df, 0)) + # NAs + assert_frame_equal(f(np.nan), o(df, np.nan)) + with assertRaisesRegexp(ValueError, 'shape'): + f(ndim_5) + + # Series + def _test_seq(df, idx_ser, col_ser): + idx_eq = df.eq(idx_ser, axis=0) + col_eq = df.eq(col_ser) + idx_ne = df.ne(idx_ser, axis=0) + col_ne = df.ne(col_ser) + assert_frame_equal(col_eq, df == Series(col_ser)) + assert_frame_equal(col_eq, -col_ne) + assert_frame_equal(idx_eq, -idx_ne) + assert_frame_equal(idx_eq, df.T.eq(idx_ser).T) + 
assert_frame_equal(col_eq, df.eq(list(col_ser))) + assert_frame_equal(idx_eq, df.eq(Series(idx_ser), axis=0)) + assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0)) + + idx_gt = df.gt(idx_ser, axis=0) + col_gt = df.gt(col_ser) + idx_le = df.le(idx_ser, axis=0) + col_le = df.le(col_ser) + + assert_frame_equal(col_gt, df > Series(col_ser)) + assert_frame_equal(col_gt, -col_le) + assert_frame_equal(idx_gt, -idx_le) + assert_frame_equal(idx_gt, df.T.gt(idx_ser).T) + + idx_ge = df.ge(idx_ser, axis=0) + col_ge = df.ge(col_ser) + idx_lt = df.lt(idx_ser, axis=0) + col_lt = df.lt(col_ser) + assert_frame_equal(col_ge, df >= Series(col_ser)) + assert_frame_equal(col_ge, -col_lt) + assert_frame_equal(idx_ge, -idx_lt) + assert_frame_equal(idx_ge, df.T.ge(idx_ser).T) + + idx_ser = Series(np.random.randn(5)) + col_ser = Series(np.random.randn(3)) + _test_seq(df, idx_ser, col_ser) + + + # list/tuple + _test_seq(df, idx_ser.values, col_ser.values) + + # NA + df.ix[0, 0] = np.nan + rs = df.eq(df) + self.assertFalse(rs.ix[0, 0]) + rs = df.ne(df) + self.assertTrue(rs.ix[0, 0]) + rs = df.gt(df) + self.assertFalse(rs.ix[0, 0]) + rs = df.lt(df) + self.assertFalse(rs.ix[0, 0]) + rs = df.ge(df) + self.assertFalse(rs.ix[0, 0]) + rs = df.le(df) + self.assertFalse(rs.ix[0, 0]) + + + + # complex + arr = np.array([np.nan, 1, 6, np.nan]) + arr2 = np.array([2j, np.nan, 7, None]) + df = DataFrame({'a': arr}) + df2 = DataFrame({'a': arr2}) + rs = df.gt(df2) + self.assertFalse(rs.values.any()) + rs = df.ne(df2) + self.assertTrue(rs.values.all()) + + arr3 = np.array([2j, np.nan, None]) + df3 = DataFrame({'a': arr3}) + rs = df3.gt(2j) + self.assertFalse(rs.values.any()) + + # corner, dtype=object + df1 = DataFrame({'col': ['foo', np.nan, 'bar']}) + df2 = DataFrame({'col': ['foo', datetime.now(), 'bar']}) + result = df1.ne(df2) + exp = DataFrame({'col': [False, True, False]}) + assert_frame_equal(result, exp) + + def test_arith_flex_series(self): + df = self.simple + + row = df.xs('a') + col = df['two'] + # after arithmetic refactor, add truediv here + ops = ['add', 'sub', 'mul', 'mod'] + for op in ops: + f = getattr(df, op) + op = getattr(operator, op) + assert_frame_equal(f(row), op(df, row)) + assert_frame_equal(f(col, axis=0), op(df.T, col).T) + + # special case for some reason + assert_frame_equal(df.add(row, axis=None), df + row) + + # cases which will be refactored after big arithmetic refactor + assert_frame_equal(df.div(row), df / row) + assert_frame_equal(df.div(col, axis=0), (df.T / col).T) + + # broadcasting issue in GH7325 + df = DataFrame(np.arange(3*2).reshape((3,2)),dtype='int64') + expected = DataFrame([[np.inf,np.inf],[1.0,1.5],[1.0,1.25]]) + result = df.div(df[0],axis='index') + assert_frame_equal(result,expected) + + df = DataFrame(np.arange(3*2).reshape((3,2)),dtype='float64') + expected = DataFrame([[np.nan,np.inf],[1.0,1.5],[1.0,1.25]]) + result = df.div(df[0],axis='index') + assert_frame_equal(result,expected) + + def test_arith_non_pandas_object(self): + df = self.simple + + val1 = df.xs('a').values + added = DataFrame(df.values + val1, index=df.index, columns=df.columns) + assert_frame_equal(df + val1, added) + + added = DataFrame((df.values.T + val1).T, + index=df.index, columns=df.columns) + assert_frame_equal(df.add(val1, axis=0), added) + + val2 = list(df['two']) + + added = DataFrame(df.values + val2, index=df.index, columns=df.columns) + assert_frame_equal(df + val2, added) + + added = DataFrame((df.values.T + val2).T, index=df.index, + columns=df.columns) + assert_frame_equal(df.add(val2, 
axis='index'), added) + + val3 = np.random.rand(*df.shape) + added = DataFrame(df.values + val3, index=df.index, columns=df.columns) + assert_frame_equal(df.add(val3), added) + + def test_combineFrame(self): + frame_copy = self.frame.reindex(self.frame.index[::2]) + + del frame_copy['D'] + frame_copy['C'][:5] = nan + + added = self.frame + frame_copy + tm.assert_dict_equal(added['A'].valid(), + self.frame['A'] * 2, + compare_keys=False) + + self.assertTrue(np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()) + + # assert(False) + + self.assertTrue(np.isnan(added['D']).all()) + + self_added = self.frame + self.frame + self.assertTrue(self_added.index.equals(self.frame.index)) + + added_rev = frame_copy + self.frame + self.assertTrue(np.isnan(added['D']).all()) + + # corner cases + + # empty + plus_empty = self.frame + self.empty + self.assertTrue(np.isnan(plus_empty.values).all()) + + empty_plus = self.empty + self.frame + self.assertTrue(np.isnan(empty_plus.values).all()) + + empty_empty = self.empty + self.empty + self.assertTrue(empty_empty.empty) + + # out of order + reverse = self.frame.reindex(columns=self.frame.columns[::-1]) + + assert_frame_equal(reverse + self.frame, self.frame * 2) + + # mix vs float64, upcast + added = self.frame + self.mixed_float + _check_mixed_float(added, dtype = 'float64') + added = self.mixed_float + self.frame + _check_mixed_float(added, dtype = 'float64') + + # mix vs mix + added = self.mixed_float + self.mixed_float2 + _check_mixed_float(added, dtype = dict(C = None)) + added = self.mixed_float2 + self.mixed_float + _check_mixed_float(added, dtype = dict(C = None)) + + # with int + added = self.frame + self.mixed_int + _check_mixed_float(added, dtype = 'float64') + + def test_combineSeries(self): + + # Series + series = self.frame.xs(self.frame.index[0]) + + added = self.frame + series + + for key, s in compat.iteritems(added): + assert_series_equal(s, self.frame[key] + series[key]) + + larger_series = series.to_dict() + larger_series['E'] = 1 + larger_series = Series(larger_series) + larger_added = self.frame + larger_series + + for key, s in compat.iteritems(self.frame): + assert_series_equal(larger_added[key], s + series[key]) + self.assertIn('E', larger_added) + self.assertTrue(np.isnan(larger_added['E']).all()) + + # vs mix (upcast) as needed + added = self.mixed_float + series + _check_mixed_float(added, dtype = 'float64') + added = self.mixed_float + series.astype('float32') + _check_mixed_float(added, dtype = dict(C = None)) + added = self.mixed_float + series.astype('float16') + _check_mixed_float(added, dtype = dict(C = None)) + + #### these raise with numexpr.....as we are adding an int64 to an uint64....weird + # vs int + #added = self.mixed_int + (100*series).astype('int64') + #_check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = 'int64', D = 'int64')) + #added = self.mixed_int + (100*series).astype('int32') + #_check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = 'int32', D = 'int64')) + + # TimeSeries + buf = StringIO() + tmp = sys.stderr + sys.stderr = buf + + try: + ts = self.tsframe['A'] + added = self.tsframe + ts + + for key, col in compat.iteritems(self.tsframe): + assert_series_equal(added[key], col + ts) + + smaller_frame = self.tsframe[:-5] + smaller_added = smaller_frame + ts + + self.assertTrue(smaller_added.index.equals(self.tsframe.index)) + + smaller_ts = ts[:-5] + smaller_added2 = self.tsframe + smaller_ts + assert_frame_equal(smaller_added, smaller_added2) + + # length 0 + result = 
self.tsframe + ts[:0] + + # Frame is length 0 + result = self.tsframe[:0] + ts + self.assertEqual(len(result), 0) + + # empty but with non-empty index + frame = self.tsframe[:1].reindex(columns=[]) + result = frame * ts + self.assertEqual(len(result), len(ts)) + finally: + sys.stderr = tmp + + def test_combineFunc(self): + result = self.frame * 2 + self.assert_numpy_array_equal(result.values, self.frame.values * 2) + + # vs mix + result = self.mixed_float * 2 + for c, s in compat.iteritems(result): + self.assert_numpy_array_equal(s.values, self.mixed_float[c].values * 2) + _check_mixed_float(result, dtype = dict(C = None)) + + result = self.empty * 2 + self.assertIs(result.index, self.empty.index) + self.assertEqual(len(result.columns), 0) + + def test_comparisons(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + + row = self.simple.xs('a') + ndim_5 = np.ones(df1.shape + (1, 1, 1)) + + def test_comp(func): + result = func(df1, df2) + self.assert_numpy_array_equal(result.values, + func(df1.values, df2.values)) + with assertRaisesRegexp(ValueError, 'Wrong number of dimensions'): + func(df1, ndim_5) + + result2 = func(self.simple, row) + self.assert_numpy_array_equal(result2.values, + func(self.simple.values, row.values)) + + result3 = func(self.frame, 0) + self.assert_numpy_array_equal(result3.values, + func(self.frame.values, 0)) + + + with assertRaisesRegexp(ValueError, 'Can only compare ' + 'identically-labeled DataFrame'): + func(self.simple, self.simple[:2]) + + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) + + def test_string_comparison(self): + df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) + mask_a = df.a > 1 + assert_frame_equal(df[mask_a], df.ix[1:1, :]) + assert_frame_equal(df[-mask_a], df.ix[0:0, :]) + + mask_b = df.b == "foo" + assert_frame_equal(df[mask_b], df.ix[0:0, :]) + assert_frame_equal(df[-mask_b], df.ix[1:1, :]) + + def test_float_none_comparison(self): + df = DataFrame(np.random.randn(8, 3), index=lrange(8), + columns=['A', 'B', 'C']) + + self.assertRaises(TypeError, df.__eq__, None) + + def test_boolean_comparison(self): + + # GH 4576 + # boolean comparisons with a tuple/list give unexpected results + df = DataFrame(np.arange(6).reshape((3,2))) + b = np.array([2, 2]) + b_r = np.atleast_2d([2,2]) + b_c = b_r.T + l = (2,2,2) + tup = tuple(l) + + # gt + expected = DataFrame([[False,False],[False,True],[True,True]]) + result = df>b + assert_frame_equal(result,expected) + + result = df.values>b + assert_array_equal(result,expected.values) + + result = df>l + assert_frame_equal(result,expected) + + result = df>tup + assert_frame_equal(result,expected) + + result = df>b_r + assert_frame_equal(result,expected) + + result = df.values>b_r + assert_array_equal(result,expected.values) + + self.assertRaises(ValueError, df.__gt__, b_c) + self.assertRaises(ValueError, df.values.__gt__, b_c) + + # == + expected = DataFrame([[False,False],[True,False],[False,False]]) + result = df == b + assert_frame_equal(result,expected) + + result = df==l + assert_frame_equal(result,expected) + + result = df==tup + assert_frame_equal(result,expected) + + result = df == b_r + assert_frame_equal(result,expected) + + result = df.values == b_r + assert_array_equal(result,expected.values) + + self.assertRaises(ValueError, lambda : df == b_c) + self.assertFalse((df.values == b_c)) + + # with alignment + df = 
DataFrame(np.arange(6).reshape((3,2)),columns=list('AB'),index=list('abc')) + expected.index=df.index + expected.columns=df.columns + + result = df==l + assert_frame_equal(result,expected) + + result = df==tup + assert_frame_equal(result,expected) + + # not shape compatible + self.assertRaises(ValueError, lambda : df == (2,2)) + self.assertRaises(ValueError, lambda : df == [2,2]) + + def test_to_csv_deprecated_options(self): + + pname = '__tmp_to_csv_deprecated_options__' + with ensure_clean(pname) as path: + + self.tsframe[1:3] = np.nan + self.tsframe.to_csv(path, nanRep='foo') + recons = read_csv(path,index_col=0,parse_dates=[0],na_values=['foo']) + assert_frame_equal(self.tsframe, recons) + + with tm.assert_produces_warning(FutureWarning): + self.frame.to_csv(path, cols=['A', 'B']) + + with tm.assert_produces_warning(False): + self.frame.to_csv(path, columns=['A', 'B']) + + + def test_to_csv_from_csv(self): + + pname = '__tmp_to_csv_from_csv__' + with ensure_clean(pname) as path: + + self.frame['A'][:5] = nan + + self.frame.to_csv(path) + self.frame.to_csv(path, columns=['A', 'B']) + self.frame.to_csv(path, header=False) + self.frame.to_csv(path, index=False) + + # test roundtrip + self.tsframe.to_csv(path) + recons = DataFrame.from_csv(path) + + assert_frame_equal(self.tsframe, recons) + + self.tsframe.to_csv(path, index_label='index') + recons = DataFrame.from_csv(path, index_col=None) + assert(len(recons.columns) == len(self.tsframe.columns) + 1) + + # no index + self.tsframe.to_csv(path, index=False) + recons = DataFrame.from_csv(path, index_col=None) + assert_almost_equal(self.tsframe.values, recons.values) + + # corner case + dm = DataFrame({'s1': Series(lrange(3), lrange(3)), + 's2': Series(lrange(2), lrange(2))}) + dm.to_csv(path) + recons = DataFrame.from_csv(path) + assert_frame_equal(dm, recons) + + with ensure_clean(pname) as path: + + # duplicate index + df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'], + columns=['x', 'y', 'z']) + df.to_csv(path) + result = DataFrame.from_csv(path) + assert_frame_equal(result, df) + + midx = MultiIndex.from_tuples([('A', 1, 2), ('A', 1, 2), ('B', 1, 2)]) + df = DataFrame(np.random.randn(3, 3), index=midx, + columns=['x', 'y', 'z']) + df.to_csv(path) + result = DataFrame.from_csv(path, index_col=[0, 1, 2], + parse_dates=False) + assert_frame_equal(result, df, check_names=False) # TODO from_csv names index ['Unnamed: 1', 'Unnamed: 2'] should it ? 
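# A minimal, self-contained sketch of the flexible arithmetic and comparison API
# exercised by test_arith_flex_frame / test_arith_flex_series / test_bool_flex_frame
# above: the named methods mirror the operators but accept an axis hint for
# alignment. Assumes only NumPy and pandas; data and names are illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6.0).reshape(3, 2), columns=['x', 'y'])
row = df.iloc[0]      # aligned on columns, like df + row
col = df['x']         # aligned on the index when axis=0 is given

assert df.add(row).equals(df + row)
assert df.sub(col, axis=0).equals((df.T - col).T)
assert df.gt(0).equals(df > 0)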
+ + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_csv(path, header=col_aliases) + rs = DataFrame.from_csv(path) + xp = self.frame2.copy() + xp.columns = col_aliases + + assert_frame_equal(xp, rs) + + self.assertRaises(ValueError, self.frame2.to_csv, path, + header=['AA', 'X']) + + with ensure_clean(pname) as path: + import pandas as pd + df1 = DataFrame(np.random.randn(3, 1)) + df2 = DataFrame(np.random.randn(3, 1)) + + df1.to_csv(path) + df2.to_csv(path,mode='a',header=False) + xp = pd.concat([df1,df2]) + rs = pd.read_csv(path,index_col=0) + rs.columns = lmap(int,rs.columns) + xp.columns = lmap(int,xp.columns) + assert_frame_equal(xp,rs) + + def test_to_csv_cols_reordering(self): + # GH3454 + import pandas as pd + + def _check_df(df,cols=None): + with ensure_clean() as path: + df.to_csv(path,columns = cols,engine='python') + rs_p = pd.read_csv(path,index_col=0) + df.to_csv(path,columns = cols,chunksize=chunksize) + rs_c = pd.read_csv(path,index_col=0) + + if cols: + df = df[cols] + assert (rs_c.columns==rs_p.columns).all() + assert_frame_equal(df,rs_c,check_names=False) + + chunksize=5 + N = int(chunksize*2.5) + + df= mkdf(N, 3) + cs = df.columns + cols = [cs[2],cs[0]] + _check_df(df,cols) + + def test_to_csv_legacy_raises_on_dupe_cols(self): + df= mkdf(10, 3) + df.columns = ['a','a','b'] + with ensure_clean() as path: + self.assertRaises(NotImplementedError,df.to_csv,path,engine='python') + + def test_to_csv_new_dupe_cols(self): + import pandas as pd + def _check_df(df,cols=None): + with ensure_clean() as path: + df.to_csv(path,columns = cols,chunksize=chunksize) + rs_c = pd.read_csv(path,index_col=0) + + # we wrote them in a different order + # so compare them in that order + if cols is not None: + + if df.columns.is_unique: + rs_c.columns = cols + else: + indexer, missing = df.columns.get_indexer_non_unique(cols) + rs_c.columns = df.columns.take(indexer) + + for c in cols: + obj_df = df[c] + obj_rs = rs_c[c] + if isinstance(obj_df,Series): + assert_series_equal(obj_df,obj_rs) + else: + assert_frame_equal(obj_df,obj_rs,check_names=False) + + # wrote in the same order + else: + rs_c.columns = df.columns + assert_frame_equal(df,rs_c,check_names=False) + + chunksize=5 + N = int(chunksize*2.5) + + # dupe cols + df= mkdf(N, 3) + df.columns = ['a','a','b'] + _check_df(df,None) + + # dupe cols with selection + cols = ['b','a'] + _check_df(df,cols) + + @slow + def test_to_csv_moar(self): + path = '__tmp_to_csv_moar__' + + def _do_test(df,path,r_dtype=None,c_dtype=None,rnlvl=None,cnlvl=None, + dupe_col=False): + + kwargs = dict(parse_dates=False) + if cnlvl: + if rnlvl is not None: + kwargs['index_col'] = lrange(rnlvl) + kwargs['header'] = lrange(cnlvl) + with ensure_clean(path) as path: + df.to_csv(path,encoding='utf8',chunksize=chunksize,tupleize_cols=False) + recons = DataFrame.from_csv(path,tupleize_cols=False,**kwargs) + else: + kwargs['header'] = 0 + with ensure_clean(path) as path: + df.to_csv(path,encoding='utf8',chunksize=chunksize) + recons = DataFrame.from_csv(path,**kwargs) + + def _to_uni(x): + if not isinstance(x, compat.text_type): + return x.decode('utf8') + return x + if dupe_col: + # read_Csv disambiguates the columns by + # labeling them dupe.1,dupe.2, etc'. 
monkey patch columns + recons.columns = df.columns + if rnlvl and not cnlvl: + delta_lvl = [recons.icol(i).values for i in range(rnlvl-1)] + ix=MultiIndex.from_arrays([list(recons.index)]+delta_lvl) + recons.index = ix + recons = recons.iloc[:,rnlvl-1:] + + type_map = dict(i='i',f='f',s='O',u='O',dt='O',p='O') + if r_dtype: + if r_dtype == 'u': # unicode + r_dtype='O' + recons.index = np.array(lmap(_to_uni,recons.index), + dtype=r_dtype) + df.index = np.array(lmap(_to_uni,df.index),dtype=r_dtype) + elif r_dtype == 'dt': # unicode + r_dtype='O' + recons.index = np.array(lmap(Timestamp,recons.index), + dtype=r_dtype) + df.index = np.array(lmap(Timestamp,df.index),dtype=r_dtype) + elif r_dtype == 'p': + r_dtype='O' + recons.index = np.array(list(map(Timestamp, + recons.index.to_datetime())), + dtype=r_dtype) + df.index = np.array(list(map(Timestamp, + df.index.to_datetime())), + dtype=r_dtype) + else: + r_dtype= type_map.get(r_dtype) + recons.index = np.array(recons.index,dtype=r_dtype ) + df.index = np.array(df.index,dtype=r_dtype ) + if c_dtype: + if c_dtype == 'u': + c_dtype='O' + recons.columns = np.array(lmap(_to_uni,recons.columns), + dtype=c_dtype) + df.columns = np.array(lmap(_to_uni,df.columns),dtype=c_dtype ) + elif c_dtype == 'dt': + c_dtype='O' + recons.columns = np.array(lmap(Timestamp,recons.columns), + dtype=c_dtype ) + df.columns = np.array(lmap(Timestamp,df.columns),dtype=c_dtype) + elif c_dtype == 'p': + c_dtype='O' + recons.columns = np.array(lmap(Timestamp,recons.columns.to_datetime()), + dtype=c_dtype) + df.columns = np.array(lmap(Timestamp,df.columns.to_datetime()),dtype=c_dtype ) + else: + c_dtype= type_map.get(c_dtype) + recons.columns = np.array(recons.columns,dtype=c_dtype ) + df.columns = np.array(df.columns,dtype=c_dtype ) + + assert_frame_equal(df,recons,check_names=False,check_less_precise=True) + + N = 100 + chunksize=1000 + + # GH3437 + from pandas import NaT + def make_dtnat_arr(n,nnat=None): + if nnat is None: + nnat= int(n*0.1) # 10% + s=list(date_range('2000',freq='5min',periods=n)) + if nnat: + for i in np.random.randint(0,len(s),nnat): + s[i] = NaT + i = np.random.randint(100) + s[-i] = NaT + s[i] = NaT + return s + + # N=35000 + s1=make_dtnat_arr(chunksize+5) + s2=make_dtnat_arr(chunksize+5,0) + path = '1.csv' + + # s3=make_dtnjat_arr(chunksize+5,0) + with ensure_clean('.csv') as pth: + df=DataFrame(dict(a=s1,b=s2)) + df.to_csv(pth,chunksize=chunksize) + recons = DataFrame.from_csv(pth).convert_objects('coerce') + assert_frame_equal(df, recons,check_names=False,check_less_precise=True) + + for ncols in [4]: + base = int((chunksize// ncols or 1) or 1) + for nrows in [2,10,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2, + base-1,base,base+1]: + _do_test(mkdf(nrows, ncols,r_idx_type='dt', + c_idx_type='s'),path, 'dt','s') + + + for ncols in [4]: + base = int((chunksize// ncols or 1) or 1) + for nrows in [2,10,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2, + base-1,base,base+1]: + _do_test(mkdf(nrows, ncols,r_idx_type='dt', + c_idx_type='s'),path, 'dt','s') + pass + + for r_idx_type,c_idx_type in [('i','i'),('s','s'),('u','dt'),('p','p')]: + for ncols in [1,2,3,4]: + base = int((chunksize// ncols or 1) or 1) + for nrows in [2,10,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2, + base-1,base,base+1]: + _do_test(mkdf(nrows, ncols,r_idx_type=r_idx_type, + c_idx_type=c_idx_type),path,r_idx_type,c_idx_type) + + for ncols in [1,2,3,4]: + base = int((chunksize// ncols or 1) or 1) + for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2, + base-1,base,base+1]: + 
_do_test(mkdf(nrows, ncols),path) + + for nrows in [10,N-2,N-1,N,N+1,N+2]: + df = mkdf(nrows, 3) + cols = list(df.columns) + cols[:2] = ["dupe","dupe"] + cols[-2:] = ["dupe","dupe"] + ix = list(df.index) + ix[:2] = ["rdupe","rdupe"] + ix[-2:] = ["rdupe","rdupe"] + df.index=ix + df.columns=cols + _do_test(df,path,dupe_col=True) + + + _do_test(DataFrame(index=lrange(10)),path) + _do_test(mkdf(chunksize//2+1, 2,r_idx_nlevels=2),path,rnlvl=2) + for ncols in [2,3,4]: + base = int(chunksize//ncols) + for nrows in [10,N-2,N-1,N,N+1,N+2,2*N-2,2*N-1,2*N,2*N+1,2*N+2, + base-1,base,base+1]: + _do_test(mkdf(nrows, ncols,r_idx_nlevels=2),path,rnlvl=2) + _do_test(mkdf(nrows, ncols,c_idx_nlevels=2),path,cnlvl=2) + _do_test(mkdf(nrows, ncols,r_idx_nlevels=2,c_idx_nlevels=2), + path,rnlvl=2,cnlvl=2) + + def test_to_csv_from_csv_w_some_infs(self): + + # test roundtrip with inf, -inf, nan, as full columns and mix + self.frame['G'] = np.nan + f = lambda x: [np.inf, np.nan][np.random.rand() < .5] + self.frame['H'] = self.frame.index.map(f) + + with ensure_clean() as path: + self.frame.to_csv(path) + recons = DataFrame.from_csv(path) + + assert_frame_equal(self.frame, recons, check_names=False) # TODO to_csv drops column name + assert_frame_equal(np.isinf(self.frame), np.isinf(recons), check_names=False) + + def test_to_csv_from_csv_w_all_infs(self): + + # test roundtrip with inf, -inf, nan, as full columns and mix + self.frame['E'] = np.inf + self.frame['F'] = -np.inf + + with ensure_clean() as path: + self.frame.to_csv(path) + recons = DataFrame.from_csv(path) + + assert_frame_equal(self.frame, recons, check_names=False) # TODO to_csv drops column name + assert_frame_equal(np.isinf(self.frame), np.isinf(recons), check_names=False) + + def test_to_csv_no_index(self): + # GH 3624, after appending columns, to_csv fails + pname = '__tmp_to_csv_no_index__' + with ensure_clean(pname) as path: + df = DataFrame({'c1':[1,2,3], 'c2':[4,5,6]}) + df.to_csv(path, index=False) + result = read_csv(path) + assert_frame_equal(df,result) + df['c3'] = Series([7,8,9],dtype='int64') + df.to_csv(path, index=False) + result = read_csv(path) + assert_frame_equal(df,result) + + def test_to_csv_headers(self): + # GH6186, the presence or absence of `index` incorrectly + # causes to_csv to have different header semantics. 
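# Sketch of the header-aliasing behaviour that test_to_csv_headers below pins down
# (GH6186): passing a list to `header` relabels the columns in the written CSV,
# whether or not the index is written. Uses an in-memory buffer for illustration.
import pandas as pd
from io import StringIO

src = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
buf = StringIO()
src.to_csv(buf, index=False, header=['X', 'Y'])   # column labels replaced on write
buf.seek(0)
assert list(pd.read_csv(buf).columns) == ['X', 'Y']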
+ pname = '__tmp_to_csv_headers__' + from_df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + to_df = DataFrame([[1, 2], [3, 4]], columns=['X', 'Y']) + with ensure_clean(pname) as path: + from_df.to_csv(path, header=['X', 'Y']) + recons = DataFrame.from_csv(path) + assert_frame_equal(to_df, recons) + + from_df.to_csv(path, index=False, header=['X', 'Y']) + recons = DataFrame.from_csv(path) + recons.reset_index(inplace=True) + assert_frame_equal(to_df, recons) + + def test_to_csv_multiindex(self): + + pname = '__tmp_to_csv_multiindex__' + frame = self.frame + old_index = frame.index + arrays = np.arange(len(old_index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) + frame.index = new_index + + with ensure_clean(pname) as path: + + frame.to_csv(path, header=False) + frame.to_csv(path, columns=['A', 'B']) + + # round trip + frame.to_csv(path) + df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False) + + assert_frame_equal(frame, df, check_names=False) # TODO to_csv drops column name + self.assertEqual(frame.index.names, df.index.names) + self.frame.index = old_index # needed if setUP becomes a classmethod + + # try multiindex with dates + tsframe = self.tsframe + old_index = tsframe.index + new_index = [old_index, np.arange(len(old_index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.to_csv(path, index_label=['time', 'foo']) + recons = DataFrame.from_csv(path, index_col=[0, 1]) + assert_frame_equal(tsframe, recons, check_names=False) # TODO to_csv drops column name + + # do not load index + tsframe.to_csv(path) + recons = DataFrame.from_csv(path, index_col=None) + np.testing.assert_equal(len(recons.columns), len(tsframe.columns) + 2) + + # no index + tsframe.to_csv(path, index=False) + recons = DataFrame.from_csv(path, index_col=None) + assert_almost_equal(recons.values, self.tsframe.values) + self.tsframe.index = old_index # needed if setUP becomes classmethod + + with ensure_clean(pname) as path: + # GH3571, GH1651, GH3141 + + def _make_frame(names=None): + if names is True: + names = ['first','second'] + return DataFrame(np.random.randint(0,10,size=(3,3)), + columns=MultiIndex.from_tuples([('bah', 'foo'), + ('bah', 'bar'), + ('ban', 'baz')], + names=names), + dtype='int64') + + # column & index are multi-index + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1],tupleize_cols=False) + assert_frame_equal(df,result) + + # column is mi + df = mkdf(5,3,r_idx_nlevels=1,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=0,tupleize_cols=False) + assert_frame_equal(df,result) + + # dup column names? 
+ df = mkdf(5,3,r_idx_nlevels=3,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1,2,3],index_col=[0,1,2],tupleize_cols=False) + assert_frame_equal(df,result) + + # writing with no index + df = _make_frame() + df.to_csv(path,tupleize_cols=False,index=False) + result = read_csv(path,header=[0,1],tupleize_cols=False) + assert_frame_equal(df,result) + + # we lose the names here + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False,index=False) + result = read_csv(path,header=[0,1],tupleize_cols=False) + self.assertTrue(all([ x is None for x in result.columns.names ])) + result.columns.names = df.columns.names + assert_frame_equal(df,result) + + # tupleize_cols=True and index=False + df = _make_frame(True) + df.to_csv(path,tupleize_cols=True,index=False) + result = read_csv(path,header=0,tupleize_cols=True,index_col=None) + result.columns = df.columns + assert_frame_equal(df,result) + + # whatsnew example + df = _make_frame() + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1],index_col=[0],tupleize_cols=False) + assert_frame_equal(df,result) + + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False) + result = read_csv(path,header=[0,1],index_col=[0],tupleize_cols=False) + assert_frame_equal(df,result) + + # column & index are multi-index (compatibility) + df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df.to_csv(path,tupleize_cols=True) + result = read_csv(path,header=0,index_col=[0,1],tupleize_cols=True) + result.columns = df.columns + assert_frame_equal(df,result) + + # invalid options + df = _make_frame(True) + df.to_csv(path,tupleize_cols=False) + + # catch invalid headers + with assertRaisesRegexp(CParserError, 'Passed header=\[0,1,2\] are too many rows for this multi_index of columns'): + read_csv(path,tupleize_cols=False,header=lrange(3),index_col=0) + + with assertRaisesRegexp(CParserError, 'Passed header=\[0,1,2,3,4,5,6\], len of 7, but only 6 lines in file'): + read_csv(path,tupleize_cols=False,header=lrange(7),index_col=0) + + for i in [4,5,6]: + with tm.assertRaises(CParserError): + read_csv(path, tupleize_cols=False, header=lrange(i), index_col=0) + + # write with cols + with assertRaisesRegexp(TypeError, 'cannot specify cols with a MultiIndex'): + df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar']) + + with ensure_clean(pname) as path: + # empty + tsframe[:0].to_csv(path) + recons = DataFrame.from_csv(path) + exp = tsframe[:0] + exp.index = [] + + self.assertTrue(recons.columns.equals(exp.columns)) + self.assertEqual(len(recons), 0) + + def test_to_csv_float32_nanrep(self): + df = DataFrame(np.random.randn(1, 4).astype(np.float32)) + df[1] = np.nan + + with ensure_clean('__tmp_to_csv_float32_nanrep__.csv') as path: + df.to_csv(path, na_rep=999) + + with open(path) as f: + lines = f.readlines() + self.assertEqual(lines[1].split(',')[2], '999') + + def test_to_csv_withcommas(self): + + # Commas inside fields should be correctly escaped when saving as CSV. 
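# Sketch of the delimiter escaping checked by test_to_csv_withcommas below: fields
# containing the comma delimiter are quoted on write, so a read_csv round trip is
# lossless. An in-memory buffer stands in for the temporary file used by the test.
import pandas as pd
from io import StringIO

df = pd.DataFrame({'A': [1, 2, 3], 'B': ['5,6', '7,8', '9,0']})
buf = StringIO()
df.to_csv(buf, index=False)
buf.seek(0)
assert pd.read_csv(buf).equals(df)   # '5,6' etc. come back as single values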
+ df = DataFrame({'A': [1, 2, 3], 'B': ['5,6', '7,8', '9,0']}) + + with ensure_clean('__tmp_to_csv_withcommas__.csv') as path: + df.to_csv(path) + df2 = DataFrame.from_csv(path) + assert_frame_equal(df2, df) + + def test_to_csv_mixed(self): + + def create_cols(name): + return [ "%s%03d" % (name,i) for i in range(5) ] + + df_float = DataFrame(np.random.randn(100, 5),dtype='float64',columns=create_cols('float')) + df_int = DataFrame(np.random.randn(100, 5),dtype='int64',columns=create_cols('int')) + df_bool = DataFrame(True,index=df_float.index,columns=create_cols('bool')) + df_object = DataFrame('foo',index=df_float.index,columns=create_cols('object')) + df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=create_cols('date')) + + # add in some nans + df_float.ix[30:50,1:3] = np.nan + + #### this is a bug in read_csv right now #### + #df_dt.ix[30:50,1:3] = np.nan + + df = pd.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + + # dtype + dtypes = dict() + for n,dtype in [('float',np.float64),('int',np.int64),('bool',np.bool),('object',np.object)]: + for c in create_cols(n): + dtypes[c] = dtype + + with ensure_clean() as filename: + df.to_csv(filename) + rs = read_csv(filename, index_col=0, dtype=dtypes, parse_dates=create_cols('date')) + assert_frame_equal(rs, df) + + def test_to_csv_dups_cols(self): + + df = DataFrame(np.random.randn(1000, 30),columns=lrange(15)+lrange(15),dtype='float64') + + with ensure_clean() as filename: + df.to_csv(filename) # single dtype, fine + result = read_csv(filename,index_col=0) + result.columns = df.columns + assert_frame_equal(result,df) + + df_float = DataFrame(np.random.randn(1000, 3),dtype='float64') + df_int = DataFrame(np.random.randn(1000, 3),dtype='int64') + df_bool = DataFrame(True,index=df_float.index,columns=lrange(3)) + df_object = DataFrame('foo',index=df_float.index,columns=lrange(3)) + df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=lrange(3)) + df = pd.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1, ignore_index=True) + + cols = [] + for i in range(5): + cols.extend([0,1,2]) + df.columns = cols + + from pandas import to_datetime + with ensure_clean() as filename: + df.to_csv(filename) + result = read_csv(filename,index_col=0) + + # date cols + for i in ['0.4','1.4','2.4']: + result[i] = to_datetime(result[i]) + + result.columns = df.columns + assert_frame_equal(result,df) + + # GH3457 + from pandas.util.testing import makeCustomDataframe as mkdf + + N=10 + df= mkdf(N, 3) + df.columns = ['a','a','b'] + + with ensure_clean() as filename: + df.to_csv(filename) + + # read_csv will rename the dups columns + result = read_csv(filename,index_col=0) + result = result.rename(columns={ 'a.1' : 'a' }) + assert_frame_equal(result,df) + + def test_to_csv_chunking(self): + + aa=DataFrame({'A':lrange(100000)}) + aa['B'] = aa.A + 1.0 + aa['C'] = aa.A + 2.0 + aa['D'] = aa.A + 3.0 + + for chunksize in [10000,50000,100000]: + with ensure_clean() as filename: + aa.to_csv(filename,chunksize=chunksize) + rs = read_csv(filename,index_col=0) + assert_frame_equal(rs, aa) + + def test_to_csv_bug(self): + f1 = StringIO('a,1.0\nb,2.0') + df = DataFrame.from_csv(f1, header=None) + newdf = DataFrame({'t': df[df.columns[0]]}) + + with ensure_clean() as path: + newdf.to_csv(path) + + recons = read_csv(path, index_col=0) + assert_frame_equal(recons, newdf, check_names=False) # don't check_names as t != 1 + + def test_to_csv_unicode(self): + + df = DataFrame({u('c/\u03c3'): [1, 2, 3]}) + with ensure_clean() as 
path: + + df.to_csv(path, encoding='UTF-8') + df2 = read_csv(path, index_col=0, encoding='UTF-8') + assert_frame_equal(df, df2) + + df.to_csv(path, encoding='UTF-8', index=False) + df2 = read_csv(path, index_col=None, encoding='UTF-8') + assert_frame_equal(df, df2) + + def test_to_csv_unicode_index_col(self): + buf = StringIO('') + df = DataFrame( + [[u("\u05d0"), "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], + columns=[u("\u05d0"), + u("\u05d1"), u("\u05d2"), u("\u05d3")], + index=[u("\u05d0"), u("\u05d1")]) + + df.to_csv(buf, encoding='UTF-8') + buf.seek(0) + + df2 = read_csv(buf, index_col=0, encoding='UTF-8') + assert_frame_equal(df, df2) + + def test_to_csv_stringio(self): + buf = StringIO() + self.frame.to_csv(buf) + buf.seek(0) + recons = read_csv(buf, index_col=0) + assert_frame_equal(recons, self.frame, check_names=False) # TODO to_csv drops column name + + def test_to_csv_float_format(self): + + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + + with ensure_clean() as filename: + + df.to_csv(filename, float_format='%.2f') + + rs = read_csv(filename, index_col=0) + xp = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + assert_frame_equal(rs, xp) + + def test_to_csv_quoting(self): + df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) + + buf = StringIO() + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC) + + result = buf.getvalue() + expected = ('"A","B"\n' + '1,"foo"\n' + '2,"bar"\n' + '3,"baz"\n') + + self.assertEqual(result, expected) + + # quoting windows line terminators, presents with encoding? + # #3503 + text = 'a,b,c\n1,"test \r\n",3\n' + df = pd.read_csv(StringIO(text)) + buf = StringIO() + df.to_csv(buf, encoding='utf-8', index=False) + self.assertEqual(buf.getvalue(), text) + + def test_to_csv_unicodewriter_quoting(self): + df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) + + buf = StringIO() + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, + encoding='utf-8') + + result = buf.getvalue() + expected = ('"A","B"\n' + '1,"foo"\n' + '2,"bar"\n' + '3,"baz"\n') + + self.assertEqual(result, expected) + + def test_to_csv_quote_none(self): + # GH4328 + df = DataFrame({'A': ['hello', '{"hello"}']}) + for encoding in (None, 'utf-8'): + buf = StringIO() + df.to_csv(buf, quoting=csv.QUOTE_NONE, + encoding=encoding, index=False) + result = buf.getvalue() + expected = 'A\nhello\n{"hello"}\n' + self.assertEqual(result, expected) + + def test_to_csv_index_no_leading_comma(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + index=['one', 'two', 'three']) + + buf = StringIO() + df.to_csv(buf, index_label=False) + expected = ('A,B\n' + 'one,1,4\n' + 'two,2,5\n' + 'three,3,6\n') + self.assertEqual(buf.getvalue(), expected) + + def test_to_csv_line_terminators(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + index=['one', 'two', 'three']) + + buf = StringIO() + df.to_csv(buf, line_terminator='\r\n') + expected = (',A,B\r\n' + 'one,1,4\r\n' + 'two,2,5\r\n' + 'three,3,6\r\n') + self.assertEqual(buf.getvalue(), expected) + + buf = StringIO() + df.to_csv(buf) # The default line terminator remains \n + expected = (',A,B\n' + 'one,1,4\n' + 'two,2,5\n' + 'three,3,6\n') + self.assertEqual(buf.getvalue(), expected) + + def test_info(self): + io = StringIO() + self.frame.info(buf=io) + self.tsframe.info(buf=io) + + frame = DataFrame(np.random.randn(5, 3)) + + import sys + sys.stdout = StringIO() + 
frame.info() + frame.info(verbose=False) + sys.stdout = sys.__stdout__ + + def test_info_wide(self): + from pandas import set_option, reset_option + io = StringIO() + df = DataFrame(np.random.randn(5, 101)) + df.info(buf=io) + + io = StringIO() + df.info(buf=io, max_cols=101) + rs = io.getvalue() + self.assertTrue(len(rs.splitlines()) > 100) + xp = rs + + set_option('display.max_info_columns', 101) + io = StringIO() + df.info(buf=io) + self.assertEqual(rs, xp) + reset_option('display.max_info_columns') + + def test_info_duplicate_columns(self): + io = StringIO() + + # it works! + frame = DataFrame(np.random.randn(1500, 4), + columns=['a', 'a', 'b', 'b']) + frame.info(buf=io) + + def test_info_shows_column_dtypes(self): + dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', + 'complex128', 'object', 'bool'] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + df.info(buf=buf) + res = buf.getvalue() + for i, dtype in enumerate(dtypes): + name = '%d %d non-null %s' % (i, n, dtype) + assert name in res + + def test_info_max_cols(self): + df = DataFrame(np.random.randn(10, 5)) + for len_, verbose in [(4, None), (4, False), (9, True)]: + # For verbose always ^ setting ^ summarize ^ full output + with pd.option_context('max_info_columns', 4): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + self.assertEqual(len(res.split('\n')), len_) + + for len_, verbose in [(9, None), (4, False), (9, True)]: + + # max_cols no exceeded + with pd.option_context('max_info_columns', 5): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + self.assertEqual(len(res.split('\n')), len_) + + for len_, max_cols in [(9, 5), (4, 4)]: + # setting truncates + with pd.option_context('max_info_columns', 4): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + self.assertEqual(len(res.split('\n')), len_) + + # setting wouldn't truncate + with pd.option_context('max_info_columns', 5): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + self.assertEqual(len(res.split('\n')), len_) + + + def test_dtypes(self): + self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 + result = self.mixed_frame.dtypes + expected = Series(dict((k, v.dtype) + for k, v in compat.iteritems(self.mixed_frame)), + index=result.index) + assert_series_equal(result, expected) + + def test_convert_objects(self): + + oops = self.mixed_frame.T.T + converted = oops.convert_objects() + assert_frame_equal(converted, self.mixed_frame) + self.assertEqual(converted['A'].dtype, np.float64) + + # force numeric conversion + self.mixed_frame['H'] = '1.' + self.mixed_frame['I'] = '1' + + # add in some items that will be nan + l = len(self.mixed_frame) + self.mixed_frame['J'] = '1.' 
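# Small sketch of the buffered-info pattern used by the test_info* cases above:
# DataFrame.info writes to sys.stdout unless an explicit buffer is supplied, and
# the summary lists per-column counts and dtypes. Data here is illustrative.
import numpy as np
import pandas as pd
from io import StringIO

df = pd.DataFrame(np.random.randn(5, 3), columns=list('abc'))
buf = StringIO()
df.info(buf=buf)                  # nothing is printed; the summary lands in buf
assert 'float64' in buf.getvalue()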
+ self.mixed_frame['K'] = '1' + self.mixed_frame.ix[0:5,['J','K']] = 'garbled' + converted = self.mixed_frame.convert_objects(convert_numeric=True) + self.assertEqual(converted['H'].dtype, 'float64') + self.assertEqual(converted['I'].dtype, 'int64') + self.assertEqual(converted['J'].dtype, 'float64') + self.assertEqual(converted['K'].dtype, 'float64') + self.assertEqual(len(converted['J'].dropna()), l-5) + self.assertEqual(len(converted['K'].dropna()), l-5) + + # via astype + converted = self.mixed_frame.copy() + converted['H'] = converted['H'].astype('float64') + converted['I'] = converted['I'].astype('int64') + self.assertEqual(converted['H'].dtype, 'float64') + self.assertEqual(converted['I'].dtype, 'int64') + + # via astype, but errors + converted = self.mixed_frame.copy() + with assertRaisesRegexp(ValueError, 'invalid literal'): + converted['H'].astype('int32') + + # mixed in a single column + df = DataFrame(dict(s = Series([1, 'na', 3 ,4]))) + result = df.convert_objects(convert_numeric=True) + expected = DataFrame(dict(s = Series([1, np.nan, 3 ,4]))) + assert_frame_equal(result, expected) + + def test_convert_objects_no_conversion(self): + mixed1 = DataFrame( + {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']}) + mixed2 = mixed1.convert_objects() + assert_frame_equal(mixed1, mixed2) + + def test_append_series_dict(self): + df = DataFrame(np.random.randn(5, 4), + columns=['foo', 'bar', 'baz', 'qux']) + + series = df.ix[4] + with assertRaisesRegexp(ValueError, 'Indexes have overlapping values'): + df.append(series, verify_integrity=True) + series.name = None + with assertRaisesRegexp(TypeError, 'Can only append a Series if ' + 'ignore_index=True'): + df.append(series, verify_integrity=True) + + result = df.append(series[::-1], ignore_index=True) + expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T, + ignore_index=True) + assert_frame_equal(result, expected) + + # dict + result = df.append(series.to_dict(), ignore_index=True) + assert_frame_equal(result, expected) + + result = df.append(series[::-1][:3], ignore_index=True) + expected = df.append(DataFrame({0: series[::-1][:3]}).T, + ignore_index=True) + assert_frame_equal(result, expected.ix[:, result.columns]) + + # can append when name set + row = df.ix[4] + row.name = 5 + result = df.append(row) + expected = df.append(df[-1:], ignore_index=True) + assert_frame_equal(result, expected) + + def test_append_list_of_series_dicts(self): + df = DataFrame(np.random.randn(5, 4), + columns=['foo', 'bar', 'baz', 'qux']) + + dicts = [x.to_dict() for idx, x in df.iterrows()] + + result = df.append(dicts, ignore_index=True) + expected = df.append(df, ignore_index=True) + assert_frame_equal(result, expected) + + # different columns + dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, + {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] + result = df.append(dicts, ignore_index=True) + expected = df.append(DataFrame(dicts), ignore_index=True) + assert_frame_equal(result, expected) + + def test_append_empty_dataframe(self): + + # Empty df append empty df + df1 = DataFrame([]) + df2 = DataFrame([]) + result = df1.append(df2) + expected = df1.copy() + assert_frame_equal(result, expected) + + # Non-empty df append empty df + df1 = DataFrame(np.random.randn(5, 2)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + assert_frame_equal(result, expected) + + # Empty df with columns append empty df + df1 = DataFrame(columns=['bar', 'foo']) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + 
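# Sketch of the row-append semantics exercised by the test_append_* cases above.
# DataFrame.append is the API of this era (a dict or Series needs ignore_index=True);
# later pandas removed it in favour of the pd.concat form shown alongside.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(3, 2), columns=['foo', 'bar'])
row = {'foo': 1.0, 'bar': 2.0}
appended = df.append(row, ignore_index=True)                     # 0.14-era spelling
via_concat = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
assert appended.shape == via_concat.shape == (4, 2)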
assert_frame_equal(result, expected) + + # Non-Empty df with columns append empty df + df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo']) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + assert_frame_equal(result, expected) + + def test_append_dtypes(self): + + # GH 5754 + # row appends of different dtypes (so need to do by-item) + # can sometimes infer the correct type + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(5)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : 'foo' }, index=lrange(1,2)) + result = df1.append(df2) + expected = DataFrame({ 'bar' : [ Timestamp('20130101'), 'foo' ]}) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : np.nan }, index=lrange(1,2)) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ Timestamp('20130101'), np.nan ],dtype='M8[ns]') }) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : np.nan }, index=lrange(1,2), dtype=object) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ Timestamp('20130101'), np.nan ],dtype='M8[ns]') }) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : np.nan }, index=lrange(1)) + df2 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1,2)) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ np.nan, Timestamp('20130101')] ,dtype='M8[ns]') }) + assert_frame_equal(result, expected) + + df1 = DataFrame({ 'bar' : Timestamp('20130101') }, index=lrange(1)) + df2 = DataFrame({ 'bar' : 1 }, index=lrange(1,2), dtype=object) + result = df1.append(df2) + expected = DataFrame({ 'bar' : Series([ Timestamp('20130101'), 1 ]) }) + assert_frame_equal(result, expected) + + def test_asfreq(self): + offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) + rule_monthly = self.tsframe.asfreq('BM') + + assert_almost_equal(offset_monthly['A'], rule_monthly['A']) + + filled = rule_monthly.asfreq('B', method='pad') + # TODO: actually check that this worked. + + # don't forget! 
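# Sketch of asfreq up-sampling with forward fill, as in test_asfreq above: values at
# business-month-end are re-indexed at business-day frequency and method='pad'
# carries the last observation forward so no gaps remain. Data is illustrative.
import numpy as np
import pandas as pd

idx = pd.date_range('2000-01-31', periods=4, freq='BM')   # business month ends
monthly = pd.DataFrame({'A': np.arange(4.0)}, index=idx)
daily = monthly.asfreq('B', method='pad')
assert daily['A'].isnull().sum() == 0                      # every gap forward-filled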
+ filled_dep = rule_monthly.asfreq('B', method='pad') + + # test does not blow up on length-0 DataFrame + zero_length = self.tsframe.reindex([]) + result = zero_length.asfreq('BM') + self.assertIsNot(result, zero_length) + + def test_asfreq_datetimeindex(self): + df = DataFrame({'A': [1, 2, 3]}, + index=[datetime(2011, 11, 1), datetime(2011, 11, 2), + datetime(2011, 11, 3)]) + df = df.asfreq('B') + tm.assert_isinstance(df.index, DatetimeIndex) + + ts = df['A'].asfreq('B') + tm.assert_isinstance(ts.index, DatetimeIndex) + + def test_at_time_between_time_datetimeindex(self): + index = date_range("2012-01-01", "2012-01-05", freq='30min') + df = DataFrame(randn(len(index), 5), index=index) + akey = time(12, 0, 0) + bkey = slice(time(13, 0, 0), time(14, 0, 0)) + ainds = [24, 72, 120, 168] + binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172] + + result = df.at_time(akey) + expected = df.ix[akey] + expected2 = df.ix[ainds] + assert_frame_equal(result, expected) + assert_frame_equal(result, expected2) + self.assertEqual(len(result), 4) + + result = df.between_time(bkey.start, bkey.stop) + expected = df.ix[bkey] + expected2 = df.ix[binds] + assert_frame_equal(result, expected) + assert_frame_equal(result, expected2) + self.assertEqual(len(result), 12) + + result = df.copy() + result.ix[akey] = 0 + result = result.ix[akey] + expected = df.ix[akey].copy() + expected.ix[:] = 0 + assert_frame_equal(result, expected) + + result = df.copy() + result.ix[akey] = 0 + result.ix[akey] = df.ix[ainds] + assert_frame_equal(result, df) + + result = df.copy() + result.ix[bkey] = 0 + result = result.ix[bkey] + expected = df.ix[bkey].copy() + expected.ix[:] = 0 + assert_frame_equal(result, expected) + + result = df.copy() + result.ix[bkey] = 0 + result.ix[bkey] = df.ix[binds] + assert_frame_equal(result, df) + + def test_as_matrix(self): + frame = self.frame + mat = frame.as_matrix() + + frameCols = frame.columns + for i, row in enumerate(mat): + for j, value in enumerate(row): + col = frameCols[j] + if np.isnan(value): + self.assertTrue(np.isnan(frame[col][i])) + else: + self.assertEqual(value, frame[col][i]) + + # mixed type + mat = self.mixed_frame.as_matrix(['foo', 'A']) + self.assertEqual(mat[0, 0], 'bar') + + df = DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) + mat = df.as_matrix() + self.assertEqual(mat[0, 0], 1j) + + # single block corner case + mat = self.frame.as_matrix(['A', 'B']) + expected = self.frame.reindex(columns=['A', 'B']).values + assert_almost_equal(mat, expected) + + def test_as_matrix_duplicates(self): + df = DataFrame([[1, 2, 'a', 'b'], + [1, 2, 'a', 'b']], + columns=['one', 'one', 'two', 'two']) + + result = df.values + expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], + dtype=object) + + self.assertTrue(np.array_equal(result, expected)) + + def test_ftypes(self): + frame = self.mixed_float + expected = Series(dict(A = 'float32:dense', B = 'float32:dense', C = 'float16:dense', D = 'float64:dense')) + expected.sort() + result = frame.ftypes + result.sort() + assert_series_equal(result,expected) + + def test_values(self): + self.frame.values[:, 0] = 5. 
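# Sketch of the intraday selection helpers covered by
# test_at_time_between_time_datetimeindex above: at_time keeps rows whose index has
# exactly the given wall-clock time, between_time keeps a (default inclusive) range.
import numpy as np
import pandas as pd
from datetime import time

idx = pd.date_range('2012-01-01', '2012-01-03', freq='30min')
df = pd.DataFrame(np.random.randn(len(idx), 2), index=idx, columns=['a', 'b'])
assert len(df.at_time(time(12, 0))) == 2                       # one hit per full day
assert len(df.between_time(time(13, 0), time(14, 0))) == 6     # 13:00, 13:30, 14:00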
+ self.assertTrue((self.frame.values[:, 0] == 5).all()) + + def test_deepcopy(self): + cp = deepcopy(self.frame) + series = cp['A'] + series[:] = 10 + for idx, value in compat.iteritems(series): + self.assertNotEqual(self.frame['A'][idx], value) + + def test_copy(self): + cop = self.frame.copy() + cop['E'] = cop['A'] + self.assertNotIn('E', self.frame) + + # copy objects + copy = self.mixed_frame.copy() + self.assertIsNot(copy._data, self.mixed_frame._data) + + def _check_method(self, method='pearson', check_minp=False): + if not check_minp: + correls = self.frame.corr(method=method) + exp = self.frame['A'].corr(self.frame['C'], method=method) + assert_almost_equal(correls['A']['C'], exp) + else: + result = self.frame.corr(min_periods=len(self.frame) - 8) + expected = self.frame.corr() + expected.ix['A', 'B'] = expected.ix['B', 'A'] = nan + + def test_corr_pearson(self): + tm._skip_if_no_scipy() + self.frame['A'][:5] = nan + self.frame['B'][5:10] = nan + + self._check_method('pearson') + + def test_corr_kendall(self): + tm._skip_if_no_scipy() + self.frame['A'][:5] = nan + self.frame['B'][5:10] = nan + + self._check_method('kendall') + + def test_corr_spearman(self): + tm._skip_if_no_scipy() + self.frame['A'][:5] = nan + self.frame['B'][5:10] = nan + + self._check_method('spearman') + + def test_corr_non_numeric(self): + tm._skip_if_no_scipy() + self.frame['A'][:5] = nan + self.frame['B'][5:10] = nan + + # exclude non-numeric types + result = self.mixed_frame.corr() + expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].corr() + assert_frame_equal(result, expected) + + def test_corr_nooverlap(self): + tm._skip_if_no_scipy() + + # nothing in common + for meth in ['pearson', 'kendall', 'spearman']: + df = DataFrame({'A': [1, 1.5, 1, np.nan, np.nan, np.nan], + 'B': [np.nan, np.nan, np.nan, 1, 1.5, 1]}) + rs = df.corr(meth) + self.assertTrue(isnull(rs.ix['A', 'B'])) + self.assertTrue(isnull(rs.ix['B', 'A'])) + self.assertEqual(rs.ix['A', 'A'], 1) + self.assertEqual(rs.ix['B', 'B'], 1) + + def test_corr_constant(self): + tm._skip_if_no_scipy() + + # constant --> all NA + + for meth in ['pearson', 'spearman']: + df = DataFrame({'A': [1, 1, 1, np.nan, np.nan, np.nan], + 'B': [np.nan, np.nan, np.nan, 1, 1, 1]}) + rs = df.corr(meth) + self.assertTrue(isnull(rs.values).all()) + + def test_corr_int(self): + # dtypes other than float64 #1761 + df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + + # it works! 
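# Sketch of pairwise correlation with a minimum-observation threshold, mirroring the
# min_periods branch of _check_method above: pairs with too few overlapping
# observations come back as NaN while the unrestricted corr still yields a number.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(20, 3), columns=list('ABC'))
df.loc[:10, 'A'] = np.nan                  # leaves only 9 valid A observations
loose = df.corr()                          # default pearson, all available pairs
strict = df.corr(min_periods=15)           # any pair involving A falls below 15
assert np.isnan(strict.loc['A', 'B']) and not np.isnan(loose.loc['A', 'B'])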
+ df3.cov() + df3.corr() + + def test_cov(self): + # min_periods no NAs (corner case) + expected = self.frame.cov() + result = self.frame.cov(min_periods=len(self.frame)) + + assert_frame_equal(expected, result) + + result = self.frame.cov(min_periods=len(self.frame) + 1) + self.assertTrue(isnull(result.values).all()) + + # with NAs + frame = self.frame.copy() + frame['A'][:5] = nan + frame['B'][5:10] = nan + result = self.frame.cov(min_periods=len(self.frame) - 8) + expected = self.frame.cov() + expected.ix['A', 'B'] = np.nan + expected.ix['B', 'A'] = np.nan + + # regular + self.frame['A'][:5] = nan + self.frame['B'][:10] = nan + cov = self.frame.cov() + + assert_almost_equal(cov['A']['C'], + self.frame['A'].cov(self.frame['C'])) + + # exclude non-numeric types + result = self.mixed_frame.cov() + expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].cov() + assert_frame_equal(result, expected) + + # Single column frame + df = DataFrame(np.linspace(0.0,1.0,10)) + result = df.cov() + expected = DataFrame(np.cov(df.values.T).reshape((1,1)), + index=df.columns,columns=df.columns) + assert_frame_equal(result, expected) + df.ix[0] = np.nan + result = df.cov() + expected = DataFrame(np.cov(df.values[1:].T).reshape((1,1)), + index=df.columns,columns=df.columns) + assert_frame_equal(result, expected) + + def test_corrwith(self): + a = self.tsframe + noise = Series(randn(len(a)), index=a.index) + + b = self.tsframe + noise + + # make sure order does not matter + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + del b['B'] + + colcorr = a.corrwith(b, axis=0) + assert_almost_equal(colcorr['A'], a['A'].corr(b['A'])) + + rowcorr = a.corrwith(b, axis=1) + assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + + dropped = a.corrwith(b, axis=0, drop=True) + assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) + self.assertNotIn('B', dropped) + + dropped = a.corrwith(b, axis=1, drop=True) + self.assertNotIn(a.index[-1], dropped.index) + + # non time-series data + index = ['a', 'b', 'c', 'd', 'e'] + columns = ['one', 'two', 'three', 'four'] + df1 = DataFrame(randn(5, 4), index=index, columns=columns) + df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns) + correls = df1.corrwith(df2, axis=1) + for row in index[:4]: + assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row])) + + def test_corrwith_with_objects(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + cols = ['A', 'B', 'C', 'D'] + + df1['obj'] = 'foo' + df2['obj'] = 'bar' + + result = df1.corrwith(df2) + expected = df1.ix[:, cols].corrwith(df2.ix[:, cols]) + assert_series_equal(result, expected) + + result = df1.corrwith(df2, axis=1) + expected = df1.ix[:, cols].corrwith(df2.ix[:, cols], axis=1) + assert_series_equal(result, expected) + + def test_corrwith_series(self): + result = self.tsframe.corrwith(self.tsframe['A']) + expected = self.tsframe.apply(self.tsframe['A'].corr) + + assert_series_equal(result, expected) + + def test_corrwith_matches_corrcoef(self): + df1 = DataFrame(np.arange(10000), columns=['a']) + df2 = DataFrame(np.arange(10000)**2, columns=['a']) + c1 = df1.corrwith(df2)['a'] + c2 = np.corrcoef(df1['a'],df2['a'])[0][1] + + assert_almost_equal(c1, c2) + self.assertTrue(c1 < 1) + + def test_drop_names(self): + df = DataFrame([[1, 2, 3],[3, 4, 5],[5, 6, 7]], index=['a', 'b', 'c'], + columns=['d', 'e', 'f']) + df.index.name, df.columns.name = 'first', 'second' + df_dropped_b = df.drop('b') + df_dropped_e = df.drop('e', axis=1) + df_inplace_b, df_inplace_e = df.copy(), 
df.copy() + df_inplace_b.drop('b', inplace=True) + df_inplace_e.drop('e', axis=1, inplace=True) + for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): + self.assertEqual(obj.index.name, 'first') + self.assertEqual(obj.columns.name, 'second') + self.assertEqual(list(df.columns), ['d', 'e', 'f']) + + def test_dropEmptyRows(self): + N = len(self.frame.index) + mat = randn(N) + mat[:5] = nan + + frame = DataFrame({'foo': mat}, index=self.frame.index) + original = Series(mat, index=self.frame.index) + expected = original.dropna() + inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() + + smaller_frame = frame.dropna(how='all') + # check that original was preserved + assert_series_equal(frame['foo'], original) + inplace_frame1.dropna(how='all', inplace=True) + assert_series_equal(smaller_frame['foo'], expected) + assert_series_equal(inplace_frame1['foo'], expected) + + smaller_frame = frame.dropna(how='all', subset=['foo']) + inplace_frame2.dropna(how='all', subset=['foo'], inplace=True) + assert_series_equal(smaller_frame['foo'], expected) + assert_series_equal(inplace_frame2['foo'], expected) + + def test_dropIncompleteRows(self): + N = len(self.frame.index) + mat = randn(N) + mat[:5] = nan + + frame = DataFrame({'foo': mat}, index=self.frame.index) + frame['bar'] = 5 + original = Series(mat, index=self.frame.index) + inp_frame1, inp_frame2 = frame.copy(), frame.copy() + + smaller_frame = frame.dropna() + assert_series_equal(frame['foo'], original) + inp_frame1.dropna(inplace=True) + self.assert_numpy_array_equal(smaller_frame['foo'], mat[5:]) + self.assert_numpy_array_equal(inp_frame1['foo'], mat[5:]) + + samesize_frame = frame.dropna(subset=['bar']) + assert_series_equal(frame['foo'], original) + self.assertTrue((frame['bar'] == 5).all()) + inp_frame2.dropna(subset=['bar'], inplace=True) + self.assertTrue(samesize_frame.index.equals(self.frame.index)) + self.assertTrue(inp_frame2.index.equals(self.frame.index)) + + def test_dropna(self): + df = DataFrame(np.random.randn(6, 4)) + df[2][:2] = nan + + dropped = df.dropna(axis=1) + expected = df.ix[:, [0, 1, 3]] + inp = df.copy() + inp.dropna(axis=1, inplace=True) + assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) + + dropped = df.dropna(axis=0) + expected = df.ix[lrange(2, 6)] + inp = df.copy() + inp.dropna(axis=0, inplace=True) + assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) + + # threshold + dropped = df.dropna(axis=1, thresh=5) + expected = df.ix[:, [0, 1, 3]] + inp = df.copy() + inp.dropna(axis=1, thresh=5, inplace=True) + assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) + + dropped = df.dropna(axis=0, thresh=4) + expected = df.ix[lrange(2, 6)] + inp = df.copy() + inp.dropna(axis=0, thresh=4, inplace=True) + assert_frame_equal(dropped, expected) + assert_frame_equal(inp, expected) + + dropped = df.dropna(axis=1, thresh=4) + assert_frame_equal(dropped, df) + + dropped = df.dropna(axis=1, thresh=3) + assert_frame_equal(dropped, df) + + # subset + dropped = df.dropna(axis=0, subset=[0, 1, 3]) + inp = df.copy() + inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) + assert_frame_equal(dropped, df) + assert_frame_equal(inp, df) + + # all + dropped = df.dropna(axis=1, how='all') + assert_frame_equal(dropped, df) + + df[2] = nan + dropped = df.dropna(axis=1, how='all') + expected = df.ix[:, [0, 1, 3]] + assert_frame_equal(dropped, expected) + + # bad input + self.assertRaises(ValueError, df.dropna, axis=3) + + + def 
test_drop_and_dropna_caching(self): + # tst that cacher updates + original = Series([1, 2, np.nan]) + expected = Series([1, 2], dtype=original.dtype) + df = pd.DataFrame({'A': original.values.copy()}) + df2 = df.copy() + df['A'].dropna() + assert_series_equal(df['A'], original) + df['A'].dropna(inplace=True) + assert_series_equal(df['A'], expected) + df2['A'].drop([1]) + assert_series_equal(df2['A'], original) + df2['A'].drop([1], inplace=True) + assert_series_equal(df2['A'], original.drop([1])) + + def test_dropna_corner(self): + # bad input + self.assertRaises(ValueError, self.frame.dropna, how='foo') + self.assertRaises(TypeError, self.frame.dropna, how=None) + + def test_dropna_multiple_axes(self): + df = DataFrame([[1, np.nan, 2, 3], + [4, np.nan, 5, 6], + [np.nan, np.nan, np.nan, np.nan], + [7, np.nan, 8, 9]]) + cp = df.copy() + result = df.dropna(how='all', axis=[0, 1]) + result2 = df.dropna(how='all', axis=(0, 1)) + expected = df.dropna(how='all').dropna(how='all', axis=1) + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(df, cp) + + inp = df.copy() + inp.dropna(how='all', axis=(0, 1), inplace=True) + assert_frame_equal(inp, expected) + + def test_drop_duplicates(self): + df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('AAA') + expected = df[:2] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('AAA', take_last=True) + expected = df.ix[[6, 7]] + assert_frame_equal(result, expected) + + # multi column + expected = df.ix[[0, 1, 2, 3]] + result = df.drop_duplicates(np.array(['AAA', 'B'])) + assert_frame_equal(result, expected) + result = df.drop_duplicates(['AAA', 'B']) + assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AAA', 'B'), take_last=True) + expected = df.ix[[0, 5, 6, 7]] + assert_frame_equal(result, expected) + + # consider everything + df2 = df.ix[:, ['AAA', 'B', 'C']] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(['AAA', 'B']) + assert_frame_equal(result, expected) + + result = df2.drop_duplicates(take_last=True) + expected = df2.drop_duplicates(['AAA', 'B'], take_last=True) + assert_frame_equal(result, expected) + + def test_drop_duplicates_deprecated_warning(self): + df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + expected = df[:2] + + # Raises warning + with tm.assert_produces_warning(False): + result = df.drop_duplicates(subset='AAA') + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + result = df.drop_duplicates(cols='AAA') + assert_frame_equal(result, expected) + + # Does not allow both subset and cols + self.assertRaises(TypeError, df.drop_duplicates, + kwargs={'cols': 'AAA', 'subset': 'B'}) + + # Does not allow unknown kwargs + self.assertRaises(TypeError, df.drop_duplicates, + kwargs={'subset': 'AAA', 'bad_arg': True}) + + def test_drop_duplicates_tuple(self): + df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates(('AA', 'AB')) + 
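# Sketch of subset-based de-duplication as exercised by the test_drop_duplicates_*
# cases above; take_last=True is the 0.14-era spelling used in these tests (later
# pandas renamed it keep='last'). Data below is illustrative.
import pandas as pd

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar'],
                   'B': ['one', 'one', 'two', 'two'],
                   'C': [1, 1, 2, 2]})
assert list(df.drop_duplicates('A').index) == [0, 1]     # first 'foo'/'bar' kept
assert len(df.drop_duplicates(['A', 'B'])) == 4          # all (A, B) pairs distinct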
expected = df[:2] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(('AA', 'AB'), take_last=True) + expected = df.ix[[6, 7]] + assert_frame_equal(result, expected) + + # multi column + expected = df.ix[[0, 1, 2, 3]] + result = df.drop_duplicates((('AA', 'AB'), 'B')) + assert_frame_equal(result, expected) + + def test_drop_duplicates_NA(self): + # none + df = DataFrame({'A': [None, None, 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('A') + expected = df.ix[[0, 2, 3]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', take_last=True) + expected = df.ix[[1, 6, 7]] + assert_frame_equal(result, expected) + + # multi column + result = df.drop_duplicates(['A', 'B']) + expected = df.ix[[0, 2, 3, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], take_last=True) + expected = df.ix[[1, 5, 6, 7]] + assert_frame_equal(result, expected) + + # nan + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D': lrange(8)}) + + # single column + result = df.drop_duplicates('C') + expected = df[:2] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', take_last=True) + expected = df.ix[[3, 7]] + assert_frame_equal(result, expected) + + # multi column + result = df.drop_duplicates(['C', 'B']) + expected = df.ix[[0, 1, 2, 4]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], take_last=True) + expected = df.ix[[1, 3, 6, 7]] + assert_frame_equal(result, expected) + + def test_drop_duplicates_inplace(self): + orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # single column + df = orig.copy() + df.drop_duplicates('A', inplace=True) + expected = orig[:2] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', take_last=True, inplace=True) + expected = orig.ix[[6, 7]] + result = df + assert_frame_equal(result, expected) + + # multi column + df = orig.copy() + df.drop_duplicates(['A', 'B'], inplace=True) + expected = orig.ix[[0, 1, 2, 3]] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], take_last=True, inplace=True) + expected = orig.ix[[0, 5, 6, 7]] + result = df + assert_frame_equal(result, expected) + + # consider everything + orig2 = orig.ix[:, ['A', 'B', 'C']].copy() + + df2 = orig2.copy() + df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(['A', 'B']) + result = df2 + assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(take_last=True, inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], take_last=True) + result = df2 + assert_frame_equal(result, expected) + + def test_duplicated_deprecated_warning(self): + df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': [1, 1, 2, 2, 2, 2, 1, 2], + 'D': lrange(8)}) + + # Raises warning + with tm.assert_produces_warning(False): + result = df.duplicated(subset='AAA') + + with 
tm.assert_produces_warning(FutureWarning): + result = df.duplicated(cols='AAA') + + # Does not allow both subset and cols + self.assertRaises(TypeError, df.duplicated, + kwargs={'cols': 'AAA', 'subset': 'B'}) + + # Does not allow unknown kwargs + self.assertRaises(TypeError, df.duplicated, + kwargs={'subset': 'AAA', 'bad_arg': True}) + + def test_drop_col_still_multiindex(self): + arrays = [['a', 'b', 'c', 'top'], + ['', '', '', 'OD'], + ['', '', '', 'wx']] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + + df = DataFrame(randn(3, 4), columns=index) + del df[('a', '', '')] + assert(isinstance(df.columns, MultiIndex)) + + def test_drop(self): + simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) + assert_frame_equal(simple.drop("A", axis=1), simple[['B']]) + assert_frame_equal(simple.drop(["A", "B"], axis='columns'), + simple[[]]) + assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.ix[[2], :]) + assert_frame_equal(simple.drop([0, 3], axis='index'), simple.ix[[1, 2], :]) + + #non-unique - wheee! + nu_df = DataFrame(lzip(range(3), range(-3, 1), list('abc')), + columns=['a', 'a', 'b']) + assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']]) + assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a']) + + nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) + nu_df.columns = list('abc') + assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.ix[["Y"], :]) + assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.ix[[], :]) + + # inplace cache issue + # GH 5628 + df = pd.DataFrame(np.random.randn(10,3), columns=list('abc')) + expected = df[~(df.b>0)] + df.drop(labels=df[df.b>0].index, inplace=True) + assert_frame_equal(df,expected) + + def test_fillna(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + zero_filled = self.tsframe.fillna(0) + self.assertTrue((zero_filled['A'][:5] == 0).all()) + + padded = self.tsframe.fillna(method='pad') + self.assertTrue(np.isnan(padded['A'][:5]).all()) + self.assertTrue((padded['A'][-5:] == padded['A'][-5]).all()) + + # mixed type + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + result = self.mixed_frame.fillna(value=0) + result = self.mixed_frame.fillna(method='pad') + + self.assertRaises(ValueError, self.tsframe.fillna) + self.assertRaises(ValueError, self.tsframe.fillna, 5, method='ffill') + + # mixed numeric (but no float16) + mf = self.mixed_float.reindex(columns=['A','B','D']) + mf['A'][-10:] = nan + result = mf.fillna(value=0) + _check_mixed_float(result, dtype = dict(C = None)) + + result = mf.fillna(method='pad') + _check_mixed_float(result, dtype = dict(C = None)) + + # empty frame (GH #2778) + df = DataFrame(columns=['x']) + for m in ['pad','backfill']: + df.x.fillna(method=m,inplace=1) + df.x.fillna(method=m) + + # with different dtype (GH3386) + df = DataFrame([['a','a',np.nan,'a'],['b','b',np.nan,'b'],['c','c',np.nan,'c']]) + + result = df.fillna({ 2: 'foo' }) + expected = DataFrame([['a','a','foo','a'],['b','b','foo','b'],['c','c','foo','c']]) + assert_frame_equal(result, expected) + + df.fillna({ 2: 'foo' }, inplace=True) + assert_frame_equal(df, expected) + + # limit and value + df = DataFrame(np.random.randn(10,3)) + df.iloc[2:7,0] = np.nan + df.iloc[3:5,2] = np.nan + + expected = df.copy() + expected.iloc[2,0] = 999 + expected.iloc[3,2] = 999 + result = df.fillna(999,limit=1) + assert_frame_equal(result, expected) + + # with datelike + # GH 6344 + df = DataFrame({ + 'Date':[pd.NaT, Timestamp("2014-1-1")], + 'Date2':[ Timestamp("2013-1-1"), 
pd.NaT] + }) + + expected = df.copy() + expected['Date'] = expected['Date'].fillna(df.ix[0,'Date2']) + result = df.fillna(value={'Date':df['Date2']}) + assert_frame_equal(result, expected) + + def test_fillna_dtype_conversion(self): + # make sure that fillna on an empty frame works + df = DataFrame(index=["A","B","C"], columns = [1,2,3,4,5]) + result = df.get_dtype_counts().order() + expected = Series({ 'object' : 5 }) + assert_series_equal(result, expected) + + result = df.fillna(1) + expected = DataFrame(1, index=["A","B","C"], columns = [1,2,3,4,5]) + result = result.get_dtype_counts().order() + expected = Series({ 'int64' : 5 }) + assert_series_equal(result, expected) + + # empty block + df = DataFrame(index=lrange(3),columns=['A','B'],dtype='float64') + result = df.fillna('nan') + expected = DataFrame('nan',index=lrange(3),columns=['A','B']) + assert_frame_equal(result, expected) + + # equiv of replace + df = DataFrame(dict(A = [1,np.nan], B = [1.,2.])) + for v in ['',1,np.nan,1.0]: + expected = df.replace(np.nan,v) + result = df.fillna(v) + assert_frame_equal(result, expected) + + def test_ffill(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + assert_frame_equal(self.tsframe.ffill(), + self.tsframe.fillna(method='ffill')) + + def test_bfill(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + assert_frame_equal(self.tsframe.bfill(), + self.tsframe.fillna(method='bfill')) + + def test_fillna_skip_certain_blocks(self): + # don't try to fill boolean, int blocks + + df = DataFrame(np.random.randn(10, 4).astype(int)) + + # it works! + df.fillna(np.nan) + + def test_fillna_inplace(self): + df = DataFrame(np.random.randn(10, 4)) + df[1][:4] = np.nan + df[3][-4:] = np.nan + + expected = df.fillna(value=0) + self.assertIsNot(expected, df) + + df.fillna(value=0, inplace=True) + assert_frame_equal(df, expected) + + df[1][:4] = np.nan + df[3][-4:] = np.nan + expected = df.fillna(method='ffill') + self.assertIsNot(expected, df) + + df.fillna(method='ffill', inplace=True) + assert_frame_equal(df, expected) + + def test_fillna_dict_series(self): + df = DataFrame({'a': [nan, 1, 2, nan, nan], + 'b': [1, 2, 3, nan, nan], + 'c': [nan, 1, 2, 3, 4]}) + + result = df.fillna({'a': 0, 'b': 5}) + + expected = df.copy() + expected['a'] = expected['a'].fillna(0) + expected['b'] = expected['b'].fillna(5) + assert_frame_equal(result, expected) + + # it works + result = df.fillna({'a': 0, 'b': 5, 'd': 7}) + + # Series treated same as dict + result = df.fillna(df.max()) + expected = df.fillna(df.max().to_dict()) + assert_frame_equal(result, expected) + + # disable this for now + with assertRaisesRegexp(NotImplementedError, 'column by column'): + df.fillna(df.max(1), axis=1) + + def test_fillna_columns(self): + df = DataFrame(np.random.randn(10, 10)) + df.values[:, ::2] = np.nan + + result = df.fillna(method='ffill', axis=1) + expected = df.T.fillna(method='pad').T + assert_frame_equal(result, expected) + + df.insert(6, 'foo', 5) + result = df.fillna(method='ffill', axis=1) + expected = df.astype(float).fillna(method='ffill', axis=1) + assert_frame_equal(result, expected) + + def test_fillna_invalid_method(self): + with assertRaisesRegexp(ValueError, 'ffil'): + self.frame.fillna(method='ffil') + + def test_fillna_invalid_value(self): + # list + self.assertRaises(TypeError, self.frame.fillna, [1, 2]) + # tuple + self.assertRaises(TypeError, self.frame.fillna, (1, 2)) + + def test_replace_inplace(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + tsframe 
= self.tsframe.copy() + tsframe.replace(nan, 0, inplace=True) + assert_frame_equal(tsframe, self.tsframe.fillna(0)) + + self.assertRaises(TypeError, self.tsframe.replace, nan, inplace=True) + self.assertRaises(TypeError, self.tsframe.replace, nan) + + # mixed type + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + + result = self.mixed_frame.replace(np.nan, 0) + expected = self.mixed_frame.fillna(value=0) + assert_frame_equal(result, expected) + + tsframe = self.tsframe.copy() + tsframe.replace([nan], [0], inplace=True) + assert_frame_equal(tsframe, self.tsframe.fillna(0)) + + def test_regex_replace_scalar(self): + obj = {'a': list('ab..'), 'b': list('efgh')} + dfobj = DataFrame(obj) + mix = {'a': lrange(4), 'b': list('ab..')} + dfmix = DataFrame(mix) + + ### simplest cases + ## regex -> value + # obj frame + res = dfobj.replace(r'\s*\.\s*', nan, regex=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.replace(r'\s*\.\s*', nan, regex=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.replace(re.compile(r'\s*\.\s*'), nan, regex=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.replace(re.compile(r'\s*\.\s*'), nan, regex=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + res = dfmix.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1') + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + res = dfmix.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1') + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + def test_regex_replace_scalar_inplace(self): + obj = {'a': list('ab..'), 'b': list('efgh')} + dfobj = DataFrame(obj) + mix = {'a': lrange(4), 'b': list('ab..')} + dfmix = DataFrame(mix) + + ### simplest cases + ## regex -> value + # obj frame + res = dfobj.copy() + res.replace(r'\s*\.\s*', nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(r'\s*\.\s*', nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.copy() + 
res.replace(re.compile(r'\s*\.\s*'), nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(re.compile(r'\s*\.\s*'), nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, + inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, + inplace=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + res = dfobj.copy() + res.replace(regex=r'\s*\.\s*', value=nan, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(regex=r'\s*\.\s*', value=nan, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + # everything with compiled regexs as well + res = dfobj.copy() + res.replace(regex=re.compile(r'\s*\.\s*'), value=nan, inplace=True) + assert_frame_equal(dfobj, res.fillna('.')) + + # mixed + res = dfmix.copy() + res.replace(regex=re.compile(r'\s*\.\s*'), value=nan, inplace=True) + assert_frame_equal(dfmix, res.fillna('.')) + + ## regex -> regex + # obj frame + res = dfobj.copy() + res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', + inplace=True) + objc = obj.copy() + objc['a'] = ['a', 'b', '...', '...'] + expec = DataFrame(objc) + assert_frame_equal(res, expec) + + # with mixed + res = dfmix.copy() + res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', + inplace=True) + mixc = mix.copy() + mixc['b'] = ['a', 'b', '...', '...'] + expec = DataFrame(mixc) + assert_frame_equal(res, expec) + + def test_regex_replace_list_obj(self): + obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + dfobj = DataFrame(obj) + + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'e|f|g'] + values = [nan, 'crap'] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': ['a', 'b', nan, nan], 'b': ['crap'] * 3 + + ['h'], 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] + values = [r'\1\1', r'\1_crap'] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e_crap', + 'f_crap', + 'g_crap', 'h'], + 'c': ['h', 'e_crap', 'l', 'o']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', 
r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.replace(value=values, regex=to_replace_res) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + def test_regex_replace_list_obj_inplace(self): + ### same as above with inplace=True + ## lists of regexes and values + obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + dfobj = DataFrame(obj) + + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'e|f|g'] + values = [nan, 'crap'] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': ['a', 'b', nan, nan], 'b': ['crap'] * 3 + + ['h'], 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] + values = [r'\1\1', r'\1_crap'] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e_crap', + 'f_crap', + 'g_crap', 'h'], + 'c': ['h', 'e_crap', 'l', 'o']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', r'e'] + values = [r'\1\1', r'crap'] + res = dfobj.copy() + res.replace(value=values, regex=to_replace_res, inplace=True) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['crap', 'f', 'g', + 'h'], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + def test_regex_replace_list_mixed(self): + ## mixed frame to make sure this doesn't break things + mix = {'a': lrange(4), 'b': list('ab..')} + dfmix = DataFrame(mix) + + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'a'] + values = [nan, 'crap'] + mix2 = {'a': lrange(4), 'b': list('ab..'), 'c': list('halo')} + dfmix2 = DataFrame(mix2) + res = dfmix2.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': mix2['a'], 'b': ['crap', 'b', nan, nan], + 'c': ['h', 'crap', 'l', 'o']}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] + values = [r'\1\1', r'\1_crap'] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a_crap', 'b_crap', '..', + '..']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.replace(to_replace_res, values, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.replace(regex=to_replace_res, value=values) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + def test_regex_replace_list_mixed_inplace(self): + mix = {'a': lrange(4), 'b': list('ab..')} + dfmix = 
DataFrame(mix) + # the same inplace + ## lists of regexes and values + # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] + to_replace_res = [r'\s*\.\s*', r'a'] + values = [nan, 'crap'] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b', nan, nan]}) + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] + to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] + values = [r'\1\1', r'\1_crap'] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a_crap', 'b_crap', '..', + '..']}) + + assert_frame_equal(res, expec) + + # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN + # or vN)] + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.copy() + res.replace(to_replace_res, values, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] + values = [r'\1\1', r'crap', r'\1_crap'] + res = dfmix.copy() + res.replace(regex=to_replace_res, value=values, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['crap', 'b_crap', '..', '..']}) + assert_frame_equal(res, expec) + + def test_regex_replace_dict_mixed(self): + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + dfmix = DataFrame(mix) + + ## dicts + # single dict {re1: v1}, search the whole frame + # need test for this... + + # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole + # frame + res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': nan}, regex=True) + res2 = dfmix.copy() + res2.replace({'b': r'\s*\.\s*'}, {'b': nan}, inplace=True, regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the + # whole frame + res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + res2 = dfmix.copy() + res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True, + regex=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}) + res2 = dfmix.copy() + res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}, + inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', '.ty', '.ty'], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + # scalar -> dict + # to_replace regex, {value: value} + expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': + mix['c']}) + res = dfmix.replace('a', {'b': nan}, regex=True) + res2 = dfmix.copy() + res2.replace('a', {'b': nan}, regex=True, inplace=True) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + res = dfmix.replace('a', {'b': nan}, regex=True) + res2 = dfmix.copy() + res2.replace(regex='a', value={'b': nan}, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + def test_regex_replace_dict_nested(self): + # nested dicts will not work until this is implemented for Series + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + dfmix = DataFrame(mix) + 
res = dfmix.replace({'b': {r'\s*\.\s*': nan}}, regex=True) + res2 = dfmix.copy() + res4 = dfmix.copy() + res2.replace({'b': {r'\s*\.\s*': nan}}, inplace=True, regex=True) + res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': nan}}) + res4.replace(regex={'b': {r'\s*\.\s*': nan}}, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + assert_frame_equal(res4, expec) + + def test_regex_replace_dict_nested_gh4115(self): + df = pd.DataFrame({'Type':['Q','T','Q','Q','T'], 'tmp':2}) + expected = DataFrame({'Type': [0,1,0,0,1], 'tmp': 2}) + assert_frame_equal(df.replace({'Type': {'Q':0,'T':1}}), expected) + + def test_regex_replace_list_to_scalar(self): + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace([r'\s*\.\s*', 'a|b'], nan, regex=True) + res2 = df.copy() + res3 = df.copy() + res2.replace([r'\s*\.\s*', 'a|b'], nan, regex=True, inplace=True) + res3.replace(regex=[r'\s*\.\s*', 'a|b'], value=nan, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': np.array([nan] * 4), + 'c': [nan, nan, nan, 'd']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_str_to_numeric(self): + # what happens when you try to replace a numeric value with a regex? + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace(r'\s*\.\s*', 0, regex=True) + res2 = df.copy() + res2.replace(r'\s*\.\s*', 0, inplace=True, regex=True) + res3 = df.copy() + res3.replace(regex=r'\s*\.\s*', value=0, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', 0, 0], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_regex_list_to_numeric(self): + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace([r'\s*\.\s*', 'b'], 0, regex=True) + res2 = df.copy() + res2.replace([r'\s*\.\s*', 'b'], 0, regex=True, inplace=True) + res3 = df.copy() + res3.replace(regex=[r'\s*\.\s*', 'b'], value=0, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 0, 0, 0], 'c': ['a', 0, + nan, + 'd']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_series_of_regexes(self): + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + s1 = Series({'b': r'\s*\.\s*'}) + s2 = Series({'b': nan}) + res = df.replace(s1, s2, regex=True) + res2 = df.copy() + res2.replace(s1, s2, inplace=True, regex=True) + res3 = df.copy() + res3.replace(regex=s1, value=s2, inplace=True) + expec = DataFrame({'a': mix['a'], 'b': ['a', 'b', nan, nan], 'c': + mix['c']}) + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + assert_frame_equal(res3, expec) + + def test_regex_replace_numeric_to_object_conversion(self): + mix = {'a': lrange(4), 'b': list('ab..'), 'c': ['a', 'b', nan, 'd']} + df = DataFrame(mix) + res = df.replace(0, 'a') + expec = DataFrame({'a': ['a', 1, 2, 3], 'b': mix['b'], 'c': mix['c']}) + assert_frame_equal(res, expec) + self.assertEqual(res.a.dtype, np.object_) + + def test_replace_regex_metachar(self): + metachars = '[]', '()', '\d', '\w', '\s' + + for metachar in metachars: + df = DataFrame({'a': [metachar, 'else']}) + result = df.replace({'a': {metachar: 
'paren'}}) + expected = DataFrame({'a': ['paren', 'else']}) + tm.assert_frame_equal(result, expected) + + def test_replace(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + zero_filled = self.tsframe.replace(nan, -1e8) + assert_frame_equal(zero_filled, self.tsframe.fillna(-1e8)) + assert_frame_equal(zero_filled.replace(-1e8, nan), self.tsframe) + + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + self.tsframe['B'][:5] = -1e8 + + # empty + df = DataFrame(index=['a', 'b']) + assert_frame_equal(df, df.replace(5, 7)) + + def test_replace_list(self): + obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + dfobj = DataFrame(obj) + + ## lists of regexes and values + # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN] + to_replace_res = [r'.', r'e'] + values = [nan, 'crap'] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame({'a': ['a', 'b', nan, nan], + 'b': ['crap', 'f', 'g', 'h'], 'c': ['h', 'crap', + 'l', 'o']}) + assert_frame_equal(res, expec) + + # list of [v1, v2, ..., vN] -> [v1, v2, .., vN] + to_replace_res = [r'.', r'f'] + values = [r'..', r'crap'] + res = dfobj.replace(to_replace_res, values) + expec = DataFrame({'a': ['a', 'b', '..', '..'], 'b': ['e', 'crap', 'g', + 'h'], + 'c': ['h', 'e', 'l', 'o']}) + + assert_frame_equal(res, expec) + + def test_replace_series_dict(self): + # from GH 3064 + df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}}) + result = df.replace(0, {'zero': 0.5, 'one': 1.0}) + expected = DataFrame({'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}}) + assert_frame_equal(result, expected) + + result = df.replace(0, df.mean()) + assert_frame_equal(result, expected) + + # series to series/dict + df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}}) + s = Series({'zero': 0.0, 'one': 2.0}) + result = df.replace(s, {'zero': 0.5, 'one': 1.0}) + expected = DataFrame({'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}}) + assert_frame_equal(result, expected) + + result = df.replace(s, df.mean()) + assert_frame_equal(result, expected) + + def test_replace_convert(self): + # gh 3907 + df = DataFrame([['foo', 'bar', 'bah'], ['bar', 'foo', 'bah']]) + m = {'foo': 1, 'bar': 2, 'bah': 3} + rep = df.replace(m) + expec = Series([ np.int64] * 3) + res = rep.dtypes + assert_series_equal(expec, res) + + def test_replace_mixed(self): + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + + result = self.mixed_frame.replace(np.nan, -18) + expected = self.mixed_frame.fillna(value=-18) + assert_frame_equal(result, expected) + assert_frame_equal(result.replace(-18, nan), self.mixed_frame) + + result = self.mixed_frame.replace(np.nan, -1e8) + expected = self.mixed_frame.fillna(value=-1e8) + assert_frame_equal(result, expected) + assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame) + + # int block upcasting + df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') }) + expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64') }) + result = df.replace(0, 0.5) + assert_frame_equal(result,expected) + + df.replace(0, 0.5, inplace=True) + assert_frame_equal(df,expected) + + # int block splitting + df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64'), 'C' : Series([1,2],dtype='int64') }) + expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64'), 'C' : Series([1,2],dtype='int64') }) + result = 
df.replace(0, 0.5) + assert_frame_equal(result,expected) + + # to object block upcasting + df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') }) + expected = DataFrame({ 'A' : Series([1,'foo'],dtype='object'), 'B' : Series([0,1],dtype='int64') }) + result = df.replace(2, 'foo') + assert_frame_equal(result,expected) + + expected = DataFrame({ 'A' : Series(['foo','bar'],dtype='object'), 'B' : Series([0,'foo'],dtype='object') }) + result = df.replace([1,2], ['foo','bar']) + assert_frame_equal(result,expected) + + # test case from + from pandas.util.testing import makeCustomDataframe as mkdf + df = DataFrame({'A' : Series([3,0],dtype='int64'), 'B' : Series([0,3],dtype='int64') }) + result = df.replace(3, df.mean().to_dict()) + expected = df.copy().astype('float64') + m = df.mean() + expected.iloc[0,0] = m[0] + expected.iloc[1,1] = m[1] + assert_frame_equal(result,expected) + + def test_replace_simple_nested_dict(self): + df = DataFrame({'col': range(1, 5)}) + expected = DataFrame({'col': ['a', 2, 3, 'b']}) + + result = df.replace({'col': {1: 'a', 4: 'b'}}) + tm.assert_frame_equal(expected, result) + + # in this case, should be the same as the not nested version + result = df.replace({1: 'a', 4: 'b'}) + tm.assert_frame_equal(expected, result) + + def test_replace_simple_nested_dict_with_nonexistent_value(self): + df = DataFrame({'col': range(1, 5)}) + expected = DataFrame({'col': ['a', 2, 3, 'b']}) + + result = df.replace({-1: '-', 1: 'a', 4: 'b'}) + tm.assert_frame_equal(expected, result) + + result = df.replace({'col': {-1: '-', 1: 'a', 4: 'b'}}) + tm.assert_frame_equal(expected, result) + + def test_interpolate(self): + pass + + def test_replace_value_is_none(self): + self.assertRaises(TypeError, self.tsframe.replace, nan) + orig_value = self.tsframe.iloc[0, 0] + orig2 = self.tsframe.iloc[1, 0] + + self.tsframe.iloc[0, 0] = nan + self.tsframe.iloc[1, 0] = 1 + + result = self.tsframe.replace(to_replace={nan: 0}) + expected = self.tsframe.T.replace(to_replace={nan: 0}).T + assert_frame_equal(result, expected) + + result = self.tsframe.replace(to_replace={nan: 0, 1: -1e8}) + tsframe = self.tsframe.copy() + tsframe.iloc[0, 0] = 0 + tsframe.iloc[1, 0] = -1e8 + expected = tsframe + assert_frame_equal(expected, result) + self.tsframe.iloc[0, 0] = orig_value + self.tsframe.iloc[1, 0] = orig2 + + def test_replace_for_new_dtypes(self): + + # dtypes + tsframe = self.tsframe.copy().astype(np.float32) + tsframe['A'][:5] = nan + tsframe['A'][-5:] = nan + + zero_filled = tsframe.replace(nan, -1e8) + assert_frame_equal(zero_filled, tsframe.fillna(-1e8)) + assert_frame_equal(zero_filled.replace(-1e8, nan), tsframe) + + tsframe['A'][:5] = nan + tsframe['A'][-5:] = nan + tsframe['B'][:5] = -1e8 + + b = tsframe['B'] + b[b == -1e8] = nan + tsframe['B'] = b + result = tsframe.fillna(method='bfill') + assert_frame_equal(result, tsframe.fillna(method='bfill')) + + def test_replace_dtypes(self): + # int + df = DataFrame({'ints': [1, 2, 3]}) + result = df.replace(1, 0) + expected = DataFrame({'ints': [0, 2, 3]}) + assert_frame_equal(result, expected) + + df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int32) + result = df.replace(1, 0) + expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int32) + assert_frame_equal(result, expected) + + df = DataFrame({'ints': [1, 2, 3]}, dtype=np.int16) + result = df.replace(1, 0) + expected = DataFrame({'ints': [0, 2, 3]}, dtype=np.int16) + assert_frame_equal(result, expected) + + # bools + df = DataFrame({'bools': [True, False, True]}) + 
result = df.replace(False, True) + self.assertTrue(result.values.all()) + + # complex blocks + df = DataFrame({'complex': [1j, 2j, 3j]}) + result = df.replace(1j, 0j) + expected = DataFrame({'complex': [0j, 2j, 3j]}) + assert_frame_equal(result, expected) + + # datetime blocks + prev = datetime.today() + now = datetime.today() + df = DataFrame({'datetime64': Index([prev, now, prev])}) + result = df.replace(prev, now) + expected = DataFrame({'datetime64': Index([now] * 3)}) + assert_frame_equal(result, expected) + + def test_replace_input_formats(self): + # both dicts + to_rep = {'A': np.nan, 'B': 0, 'C': ''} + values = {'A': 0, 'B': -1, 'C': 'missing'} + df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], + 'C': ['', 'asdf', 'fd']}) + filled = df.replace(to_rep, values) + expected = {} + for k, v in compat.iteritems(df): + expected[k] = v.replace(to_rep[k], values[k]) + assert_frame_equal(filled, DataFrame(expected)) + + result = df.replace([0, 2, 5], [5, 2, 0]) + expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0], + 'C': ['', 'asdf', 'fd']}) + assert_frame_equal(result, expected) + + # dict to scalar + filled = df.replace(to_rep, 0) + expected = {} + for k, v in compat.iteritems(df): + expected[k] = v.replace(to_rep[k], 0) + assert_frame_equal(filled, DataFrame(expected)) + + self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, '']) + + # scalar to dict + values = {'A': 0, 'B': -1, 'C': 'missing'} + df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5], + 'C': ['', 'asdf', 'fd']}) + filled = df.replace(np.nan, values) + expected = {} + for k, v in compat.iteritems(df): + expected[k] = v.replace(np.nan, values[k]) + assert_frame_equal(filled, DataFrame(expected)) + + # list to list + to_rep = [np.nan, 0, ''] + values = [-2, -1, 'missing'] + result = df.replace(to_rep, values) + expected = df.copy() + for i in range(len(to_rep)): + expected.replace(to_rep[i], values[i], inplace=True) + assert_frame_equal(result, expected) + + self.assertRaises(ValueError, df.replace, to_rep, values[1:]) + + # list to scalar + to_rep = [np.nan, 0, ''] + result = df.replace(to_rep, -1) + expected = df.copy() + for i in range(len(to_rep)): + expected.replace(to_rep[i], -1, inplace=True) + assert_frame_equal(result, expected) + + def test_replace_limit(self): + pass + + def test_replace_dict_no_regex(self): + answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: + 'Disagree', 4: 'Strongly Disagree'}) + weights = {'Agree': 4, 'Disagree': 2, 'Neutral': 3, 'Strongly Agree': + 5, 'Strongly Disagree': 1} + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_series_no_regex(self): + answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: + 'Disagree', 4: 'Strongly Disagree'}) + weights = Series({'Agree': 4, 'Disagree': 2, 'Neutral': 3, + 'Strongly Agree': 5, 'Strongly Disagree': 1}) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_dict_tuple_list_ordering_remains_the_same(self): + df = DataFrame(dict(A=[nan, 1])) + res1 = df.replace(to_replace={nan: 0, 1: -1e8}) + res2 = df.replace(to_replace=(1, nan), value=[-1e8, 0]) + res3 = df.replace(to_replace=[1, nan], value=[-1e8, 0]) + + expected = DataFrame({'A': [0, -1e8]}) + tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res2, res3) + tm.assert_frame_equal(res3, expected) + + def 
test_replace_doesnt_replace_without_regex(self): + from pandas.compat import StringIO + raw = """fol T_opp T_Dir T_Enh + 0 1 0 0 vo + 1 2 vr 0 0 + 2 2 0 0 0 + 3 3 0 bt 0""" + df = read_csv(StringIO(raw), sep=r'\s+') + res = df.replace({'\D': 1}) + tm.assert_frame_equal(df, res) + + def test_replace_bool_with_string(self): + df = DataFrame({'a': [True, False], 'b': list('ab')}) + result = df.replace(True, 'a') + expected = DataFrame({'a': ['a', False], 'b': df.b}) + tm.assert_frame_equal(result, expected) + + def test_replace_pure_bool_with_string_no_op(self): + df = DataFrame(np.random.rand(2, 2) > 0.5) + result = df.replace('asdf', 'fdsa') + tm.assert_frame_equal(df, result) + + def test_replace_bool_with_bool(self): + df = DataFrame(np.random.rand(2, 2) > 0.5) + result = df.replace(False, True) + expected = DataFrame(np.ones((2, 2), dtype=bool)) + tm.assert_frame_equal(result, expected) + + def test_replace_with_dict_with_bool_keys(self): + df = DataFrame({0: [True, False], 1: [False, True]}) + with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): + df.replace({'asdf': 'asdb', True: 'yes'}) + + def test_replace_truthy(self): + df = DataFrame({'a': [True, True]}) + r = df.replace([np.inf, -np.inf], np.nan) + e = df + tm.assert_frame_equal(r, e) + + def test_replace_int_to_int_chain(self): + df = DataFrame({'a': lrange(1, 5)}) + with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"): + df.replace({'a': dict(zip(range(1, 5), range(2, 6)))}) + + def test_replace_str_to_str_chain(self): + a = np.arange(1, 5) + astr = a.astype(str) + bstr = np.arange(2, 6).astype(str) + df = DataFrame({'a': astr}) + with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"): + df.replace({'a': dict(zip(astr, bstr))}) + + def test_replace_swapping_bug(self): + df = pd.DataFrame({'a': [True, False, True]}) + res = df.replace({'a': {True: 'Y', False: 'N'}}) + expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + tm.assert_frame_equal(res, expect) + + df = pd.DataFrame({'a': [0, 1, 0]}) + res = df.replace({'a': {0: 'Y', 1: 'N'}}) + expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + tm.assert_frame_equal(res, expect) + + def test_replace_period(self): + d = {'fname': + {'out_augmented_AUG_2011.json': pd.Period(year=2011, month=8, freq='M'), + 'out_augmented_JAN_2011.json': pd.Period(year=2011, month=1, freq='M'), + 'out_augmented_MAY_2012.json': pd.Period(year=2012, month=5, freq='M'), + 'out_augmented_SUBSIDY_WEEK.json': pd.Period(year=2011, month=4, freq='M'), + 'out_augmented_AUG_2012.json': pd.Period(year=2012, month=8, freq='M'), + 'out_augmented_MAY_2011.json': pd.Period(year=2011, month=5, freq='M'), + 'out_augmented_SEP_2013.json': pd.Period(year=2013, month=9, freq='M')}} + + df = pd.DataFrame(['out_augmented_AUG_2012.json', + 'out_augmented_SEP_2013.json', + 'out_augmented_SUBSIDY_WEEK.json', + 'out_augmented_MAY_2012.json', + 'out_augmented_MAY_2011.json', + 'out_augmented_AUG_2011.json', + 'out_augmented_JAN_2011.json'], columns=['fname']) + tm.assert_equal(set(df.fname.values), set(d['fname'].keys())) + expected = DataFrame({'fname': [d['fname'][k] + for k in df.fname.values]}) + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + def test_replace_datetime(self): + d = {'fname': + {'out_augmented_AUG_2011.json': pd.Timestamp('2011/08'), + 'out_augmented_JAN_2011.json': pd.Timestamp('2011/01'), + 'out_augmented_MAY_2012.json': pd.Timestamp('2012/05'), + 'out_augmented_SUBSIDY_WEEK.json': pd.Timestamp('2011/04'), + 'out_augmented_AUG_2012.json': 
pd.Timestamp('2012/08'), + 'out_augmented_MAY_2011.json': pd.Timestamp('2011/05'), + 'out_augmented_SEP_2013.json': pd.Timestamp('2013/09')}} + + df = pd.DataFrame(['out_augmented_AUG_2012.json', + 'out_augmented_SEP_2013.json', + 'out_augmented_SUBSIDY_WEEK.json', + 'out_augmented_MAY_2012.json', + 'out_augmented_MAY_2011.json', + 'out_augmented_AUG_2011.json', + 'out_augmented_JAN_2011.json'], columns=['fname']) + tm.assert_equal(set(df.fname.values), set(d['fname'].keys())) + expected = DataFrame({'fname': [d['fname'][k] + for k in df.fname.values]}) + result = df.replace(d) + tm.assert_frame_equal(result, expected) + + def test_combine_multiple_frames_dtypes(self): + + # GH 2759 + A = DataFrame(data=np.ones((10, 2)), columns=['foo', 'bar'], dtype=np.float64) + B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) + results = pd.concat((A, B), axis=1).get_dtype_counts() + expected = Series(dict( float64 = 2, float32 = 2 )) + assert_series_equal(results,expected) + + def test_ops(self): + + # tst ops and reversed ops in evaluation + # GH7198 + + # smaller hits python, larger hits numexpr + for n in [ 4, 4000 ]: + + df = DataFrame(1,index=range(n),columns=list('abcd')) + df.iloc[0] = 2 + m = df.mean() + + for op_str, op, rop in [('+','__add__','__radd__'), + ('-','__sub__','__rsub__'), + ('*','__mul__','__rmul__'), + ('/','__truediv__','__rtruediv__')]: + + base = DataFrame(np.tile(m.values,n).reshape(n,-1),columns=list('abcd')) + expected = eval("base{op}df".format(op=op_str)) + + # ops as strings + result = eval("m{op}df".format(op=op_str)) + assert_frame_equal(result,expected) + + # these are commutative + if op in ['+','*']: + result = getattr(df,op)(m) + assert_frame_equal(result,expected) + + # these are not + elif op in ['-','/']: + result = getattr(df,rop)(m) + assert_frame_equal(result,expected) + + # GH7192 + df = DataFrame(dict(A=np.random.randn(25000))) + df.iloc[0:5] = np.nan + expected = (1-np.isnan(df.iloc[0:25])) + result = (1-np.isnan(df)).iloc[0:25] + assert_frame_equal(result,expected) + + def test_truncate(self): + offset = datetools.bday + + ts = self.tsframe[::3] + + start, end = self.tsframe.index[3], self.tsframe.index[6] + + start_missing = self.tsframe.index[2] + end_missing = self.tsframe.index[7] + + # neither specified + truncated = ts.truncate() + assert_frame_equal(truncated, ts) + + # both specified + expected = ts[1:3] + + truncated = ts.truncate(start, end) + assert_frame_equal(truncated, expected) + + truncated = ts.truncate(start_missing, end_missing) + assert_frame_equal(truncated, expected) + + # start specified + expected = ts[1:] + + truncated = ts.truncate(before=start) + assert_frame_equal(truncated, expected) + + truncated = ts.truncate(before=start_missing) + assert_frame_equal(truncated, expected) + + # end specified + expected = ts[:3] + + truncated = ts.truncate(after=end) + assert_frame_equal(truncated, expected) + + truncated = ts.truncate(after=end_missing) + assert_frame_equal(truncated, expected) + + self.assertRaises(ValueError, ts.truncate, + before=ts.index[-1] - 1, + after=ts.index[0] +1) + + def test_truncate_copy(self): + index = self.tsframe.index + truncated = self.tsframe.truncate(index[5], index[10]) + truncated.values[:] = 5. 
+        self.assertFalse((self.tsframe.values[5:11] == 5).any())
+
+    def test_xs(self):
+        idx = self.frame.index[5]
+        xs = self.frame.xs(idx)
+        for item, value in compat.iteritems(xs):
+            if np.isnan(value):
+                self.assertTrue(np.isnan(self.frame[item][idx]))
+            else:
+                self.assertEqual(value, self.frame[item][idx])
+
+        # mixed-type xs
+        test_data = {
+            'A': {'1': 1, '2': 2},
+            'B': {'1': '1', '2': '2', '3': '3'},
+        }
+        frame = DataFrame(test_data)
+        xs = frame.xs('1')
+        self.assertEqual(xs.dtype, np.object_)
+        self.assertEqual(xs['A'], 1)
+        self.assertEqual(xs['B'], '1')
+
+        with tm.assertRaises(KeyError):
+            self.tsframe.xs(self.tsframe.index[0] - datetools.bday)
+
+        # xs get column
+        series = self.frame.xs('A', axis=1)
+        expected = self.frame['A']
+        assert_series_equal(series, expected)
+
+        # view is returned if possible
+        series = self.frame.xs('A', axis=1)
+        series[:] = 5
+        self.assertTrue((expected == 5).all())
+
+    def test_xs_corner(self):
+        # pathological mixed-type reordering case
+        df = DataFrame(index=[0])
+        df['A'] = 1.
+        df['B'] = 'foo'
+        df['C'] = 2.
+        df['D'] = 'bar'
+        df['E'] = 3.
+
+        xs = df.xs(0)
+        assert_almost_equal(xs, [1., 'foo', 2., 'bar', 3.])
+
+        # no columns but index
+        df = DataFrame(index=['a', 'b', 'c'])
+        result = df.xs('a')
+        expected = Series([])
+        assert_series_equal(result, expected)
+
+    def test_xs_duplicates(self):
+        df = DataFrame(randn(5, 2), index=['b', 'b', 'c', 'b', 'a'])
+
+        cross = df.xs('c')
+        exp = df.irow(2)
+        assert_series_equal(cross, exp)
+
+    def test_xs_keep_level(self):
+        df = DataFrame({'day': {0: 'sat', 1: 'sun'},
+                        'flavour': {0: 'strawberry', 1: 'strawberry'},
+                        'sales': {0: 10, 1: 12},
+                        'year': {0: 2008, 1: 2008}}).set_index(['year','flavour','day'])
+        result = df.xs('sat', level='day', drop_level=False)
+        expected = df[:1]
+        assert_frame_equal(result, expected)
+
+        result = df.xs([2008, 'sat'], level=['year', 'day'], drop_level=False)
+        assert_frame_equal(result, expected)
+
+    def test_pivot(self):
+        data = {
+            'index': ['A', 'B', 'C', 'C', 'B', 'A'],
+            'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
+            'values': [1., 2., 3., 3., 2., 1.]
+ } + + frame = DataFrame(data) + pivoted = frame.pivot( + index='index', columns='columns', values='values') + + expected = DataFrame({ + 'One': {'A': 1., 'B': 2., 'C': 3.}, + 'Two': {'A': 1., 'B': 2., 'C': 3.} + }) + expected.index.name, expected.columns.name = 'index', 'columns' + + assert_frame_equal(pivoted, expected) + + # name tracking + self.assertEqual(pivoted.index.name, 'index') + self.assertEqual(pivoted.columns.name, 'columns') + + # don't specify values + pivoted = frame.pivot(index='index', columns='columns') + self.assertEqual(pivoted.index.name, 'index') + self.assertEqual(pivoted.columns.names, (None, 'columns')) + + # pivot multiple columns + wp = tm.makePanel() + lp = wp.to_frame() + df = lp.reset_index() + assert_frame_equal(df.pivot('major', 'minor'), lp.unstack()) + + def test_pivot_duplicates(self): + data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'], + 'b': ['one', 'two', 'one', 'one', 'two'], + 'c': [1., 2., 3., 3., 4.]}) + with assertRaisesRegexp(ValueError, 'duplicate entries'): + data.pivot('a', 'b', 'c') + + def test_pivot_empty(self): + df = DataFrame({}, columns=['a', 'b', 'c']) + result = df.pivot('a', 'b', 'c') + expected = DataFrame({}) + assert_frame_equal(result, expected, check_names=False) + + def test_pivot_integer_bug(self): + df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")]) + + result = df.pivot(index=1, columns=0, values=2) + repr(result) + self.assert_numpy_array_equal(result.columns, ['A', 'B']) + + def test_reindex(self): + newFrame = self.frame.reindex(self.ts1.index) + + for col in newFrame.columns: + for idx, val in compat.iteritems(newFrame[col]): + if idx in self.frame.index: + if np.isnan(val): + self.assertTrue(np.isnan(self.frame[col][idx])) + else: + self.assertEqual(val, self.frame[col][idx]) + else: + self.assertTrue(np.isnan(val)) + + for col, series in compat.iteritems(newFrame): + self.assertTrue(tm.equalContents(series.index, newFrame.index)) + emptyFrame = self.frame.reindex(Index([])) + self.assertEqual(len(emptyFrame.index), 0) + + # Cython code should be unit-tested directly + nonContigFrame = self.frame.reindex(self.ts1.index[::2]) + + for col in nonContigFrame.columns: + for idx, val in compat.iteritems(nonContigFrame[col]): + if idx in self.frame.index: + if np.isnan(val): + self.assertTrue(np.isnan(self.frame[col][idx])) + else: + self.assertEqual(val, self.frame[col][idx]) + else: + self.assertTrue(np.isnan(val)) + + for col, series in compat.iteritems(nonContigFrame): + self.assertTrue(tm.equalContents(series.index, + nonContigFrame.index)) + + # corner cases + + # Same index, copies values but not index if copy=False + newFrame = self.frame.reindex(self.frame.index, copy=False) + self.assertIs(newFrame.index, self.frame.index) + + # length zero + newFrame = self.frame.reindex([]) + self.assertTrue(newFrame.empty) + self.assertEqual(len(newFrame.columns), len(self.frame.columns)) + + # length zero with columns reindexed with non-empty index + newFrame = self.frame.reindex([]) + newFrame = newFrame.reindex(self.frame.index) + self.assertEqual(len(newFrame.index), len(self.frame.index)) + self.assertEqual(len(newFrame.columns), len(self.frame.columns)) + + # pass non-Index + newFrame = self.frame.reindex(list(self.ts1.index)) + self.assertTrue(newFrame.index.equals(self.ts1.index)) + + # copy with no axes + result = self.frame.reindex() + assert_frame_equal(result,self.frame) + self.assertFalse(result is self.frame) + + def test_reindex_name_remains(self): + s = Series(random.rand(10)) + df = 
DataFrame(s, index=np.arange(len(s)))
+        i = Series(np.arange(10), name='iname')
+
+        df = df.reindex(i)
+        self.assertEqual(df.index.name, 'iname')
+
+        df = df.reindex(Index(np.arange(10), name='tmpname'))
+        self.assertEqual(df.index.name, 'tmpname')
+
+        s = Series(random.rand(10))
+        df = DataFrame(s.T, index=np.arange(len(s)))
+        i = Series(np.arange(10), name='iname')
+        df = df.reindex(columns=i)
+        self.assertEqual(df.columns.name, 'iname')
+
+    def test_reindex_int(self):
+        smaller = self.intframe.reindex(self.intframe.index[::2])
+
+        self.assertEqual(smaller['A'].dtype, np.int64)
+
+        bigger = smaller.reindex(self.intframe.index)
+        self.assertEqual(bigger['A'].dtype, np.float64)
+
+        smaller = self.intframe.reindex(columns=['A', 'B'])
+        self.assertEqual(smaller['A'].dtype, np.int64)
+
+    def test_reindex_like(self):
+        other = self.frame.reindex(index=self.frame.index[:10],
+                                   columns=['C', 'B'])
+
+        assert_frame_equal(other, self.frame.reindex_like(other))
+
+    def test_reindex_columns(self):
+        newFrame = self.frame.reindex(columns=['A', 'B', 'E'])
+
+        assert_series_equal(newFrame['B'], self.frame['B'])
+        self.assertTrue(np.isnan(newFrame['E']).all())
+        self.assertNotIn('C', newFrame)
+
+        # length zero
+        newFrame = self.frame.reindex(columns=[])
+        self.assertTrue(newFrame.empty)
+
+    def test_reindex_axes(self):
+
+        # GH 3317, reindexing by both axes loses freq of the index
+        from datetime import datetime
+        df = DataFrame(np.ones((3, 3)), index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], columns=['a', 'b', 'c'])
+        time_freq = date_range('2012-01-01', '2012-01-03', freq='d')
+        some_cols = ['a', 'b']
+
+        index_freq = df.reindex(index=time_freq).index.freq
+        both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq
+        seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq
+        self.assertEqual(index_freq, both_freq)
+        self.assertEqual(index_freq, seq_freq)
+
+    def test_reindex_fill_value(self):
+        df = DataFrame(np.random.randn(10, 4))
+
+        # axis=0
+        result = df.reindex(lrange(15))
+        self.assertTrue(np.isnan(result.values[-5:]).all())
+
+        result = df.reindex(lrange(15), fill_value=0)
+        expected = df.reindex(lrange(15)).fillna(0)
+        assert_frame_equal(result, expected)
+
+        # axis=1
+        result = df.reindex(columns=lrange(5), fill_value=0.)
+        expected = df.copy()
+        expected[4] = 0.
+ assert_frame_equal(result, expected) + + result = df.reindex(columns=lrange(5), fill_value=0) + expected = df.copy() + expected[4] = 0 + assert_frame_equal(result, expected) + + result = df.reindex(columns=lrange(5), fill_value='foo') + expected = df.copy() + expected[4] = 'foo' + assert_frame_equal(result, expected) + + # reindex_axis + result = df.reindex_axis(lrange(15), fill_value=0., axis=0) + expected = df.reindex(lrange(15)).fillna(0) + assert_frame_equal(result, expected) + + result = df.reindex_axis(lrange(5), fill_value=0., axis=1) + expected = df.reindex(columns=lrange(5)).fillna(0) + assert_frame_equal(result, expected) + + # other dtypes + df['foo'] = 'foo' + result = df.reindex(lrange(15), fill_value=0) + expected = df.reindex(lrange(15)).fillna(0) + assert_frame_equal(result, expected) + + def test_reindex_dups(self): + + # GH4746, reindex on duplicate index error messages + arr = np.random.randn(10) + df = DataFrame(arr,index=[1,2,3,4,5,1,2,3,4,5]) + + # set index is ok + result = df.copy() + result.index = list(range(len(df))) + expected = DataFrame(arr,index=list(range(len(df)))) + assert_frame_equal(result,expected) + + # reindex fails + self.assertRaises(ValueError, df.reindex, index=list(range(len(df)))) + + def test_align(self): + af, bf = self.frame.align(self.frame) + self.assertIsNot(af._data, self.frame._data) + + af, bf = self.frame.align(self.frame, copy=False) + self.assertIs(af._data, self.frame._data) + + # axis = 0 + other = self.frame.ix[:-5, :3] + af, bf = self.frame.align(other, axis=0, fill_value=-1) + self.assertTrue(bf.columns.equals(other.columns)) + # test fill value + join_idx = self.frame.index.join(other.index) + diff_a = self.frame.index.diff(join_idx) + diff_b = other.index.diff(join_idx) + diff_a_vals = af.reindex(diff_a).values + diff_b_vals = bf.reindex(diff_b).values + self.assertTrue((diff_a_vals == -1).all()) + + af, bf = self.frame.align(other, join='right', axis=0) + self.assertTrue(bf.columns.equals(other.columns)) + self.assertTrue(bf.index.equals(other.index)) + self.assertTrue(af.index.equals(other.index)) + + # axis = 1 + other = self.frame.ix[:-5, :3].copy() + af, bf = self.frame.align(other, axis=1) + self.assertTrue(bf.columns.equals(self.frame.columns)) + self.assertTrue(bf.index.equals(other.index)) + + # test fill value + join_idx = self.frame.index.join(other.index) + diff_a = self.frame.index.diff(join_idx) + diff_b = other.index.diff(join_idx) + diff_a_vals = af.reindex(diff_a).values + diff_b_vals = bf.reindex(diff_b).values + self.assertTrue((diff_a_vals == -1).all()) + + af, bf = self.frame.align(other, join='inner', axis=1) + self.assertTrue(bf.columns.equals(other.columns)) + + af, bf = self.frame.align(other, join='inner', axis=1, method='pad') + self.assertTrue(bf.columns.equals(other.columns)) + + # test other non-float types + af, bf = self.intframe.align(other, join='inner', axis=1, method='pad') + self.assertTrue(bf.columns.equals(other.columns)) + + af, bf = self.mixed_frame.align(self.mixed_frame, + join='inner', axis=1, method='pad') + self.assertTrue(bf.columns.equals(self.mixed_frame.columns)) + + af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1, + method=None, fill_value=None) + self.assertTrue(bf.index.equals(Index([]))) + + af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1, + method=None, fill_value=0) + self.assertTrue(bf.index.equals(Index([]))) + + # mixed floats/ints + af, bf = self.mixed_float.align(other.ix[:, 0], join='inner', axis=1, + method=None, fill_value=0) + 
self.assertTrue(bf.index.equals(Index([]))) + + af, bf = self.mixed_int.align(other.ix[:, 0], join='inner', axis=1, + method=None, fill_value=0) + self.assertTrue(bf.index.equals(Index([]))) + + # try to align dataframe to series along bad axis + self.assertRaises(ValueError, self.frame.align, af.ix[0, :3], + join='inner', axis=2) + + def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): + aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit, + fill_axis=fill_axis) + + join_index, join_columns = None, None + + ea, eb = a, b + if axis is None or axis == 0: + join_index = a.index.join(b.index, how=how) + ea = ea.reindex(index=join_index) + eb = eb.reindex(index=join_index) + + if axis is None or axis == 1: + join_columns = a.columns.join(b.columns, how=how) + ea = ea.reindex(columns=join_columns) + eb = eb.reindex(columns=join_columns) + + ea = ea.fillna(axis=fill_axis, method=method, limit=limit) + eb = eb.fillna(axis=fill_axis, method=method, limit=limit) + + assert_frame_equal(aa, ea) + assert_frame_equal(ab, eb) + + def test_align_fill_method_inner(self): + for meth in ['pad', 'bfill']: + for ax in [0, 1, None]: + for fax in [0, 1]: + self._check_align_fill('inner', meth, ax, fax) + + def test_align_fill_method_outer(self): + for meth in ['pad', 'bfill']: + for ax in [0, 1, None]: + for fax in [0, 1]: + self._check_align_fill('outer', meth, ax, fax) + + def test_align_fill_method_left(self): + for meth in ['pad', 'bfill']: + for ax in [0, 1, None]: + for fax in [0, 1]: + self._check_align_fill('left', meth, ax, fax) + + def test_align_fill_method_right(self): + for meth in ['pad', 'bfill']: + for ax in [0, 1, None]: + for fax in [0, 1]: + self._check_align_fill('right', meth, ax, fax) + + def _check_align_fill(self, kind, meth, ax, fax): + left = self.frame.ix[0:4, :10] + right = self.frame.ix[2:, 6:] + empty = self.frame.ix[:0, :0] + + self._check_align(left, right, axis=ax, fill_axis=fax, + how=kind, method=meth) + self._check_align(left, right, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + # empty left + self._check_align(empty, right, axis=ax, fill_axis=fax, + how=kind, method=meth) + self._check_align(empty, right, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + # empty right + self._check_align(left, empty, axis=ax, fill_axis=fax, + how=kind, method=meth) + self._check_align(left, empty, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + # both empty + self._check_align(empty, empty, axis=ax, fill_axis=fax, + how=kind, method=meth) + self._check_align(empty, empty, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + def test_align_int_fill_bug(self): + # GH #910 + X = np.arange(10*10, dtype='float64').reshape(10, 10) + Y = np.ones((10, 1), dtype=int) + + df1 = DataFrame(X) + df1['0.X'] = Y.squeeze() + + df2 = df1.astype(float) + + result = df1 - df1.mean() + expected = df2 - df2.mean() + assert_frame_equal(result, expected) + + def test_where(self): + default_frame = DataFrame(np.random.randn(5, 3),columns=['A','B','C']) + + def _safe_add(df): + # only add to the numeric items + def is_ok(s): + return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8' + return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in compat.iteritems(df) ])) + + def _check_get(df, cond, check_dtypes = True): + other1 = _safe_add(df) + rs = df.where(cond, other1) + rs2 = df.where(cond.values, other1) + for k, v in rs.iteritems(): + assert_series_equal(v, Series(np.where(cond[k], df[k], 
other1[k]),index=v.index)) + assert_frame_equal(rs, rs2) + + # dtypes + if check_dtypes: + self.assertTrue((rs.dtypes == df.dtypes).all() == True) + + # check getting + for df in [ default_frame, self.mixed_frame, self.mixed_float, self.mixed_int ]: + cond = df > 0 + _check_get(df, cond) + + + # upcasting case (GH # 2794) + df = DataFrame(dict([ (c,Series([1]*3,dtype=c)) for c in ['int64','int32','float32','float64'] ])) + df.ix[1,:] = 0 + result = df.where(df>=0).get_dtype_counts() + + #### when we don't preserve boolean casts #### + #expected = Series({ 'float32' : 1, 'float64' : 3 }) + + expected = Series({ 'float32' : 1, 'float64' : 1, 'int32' : 1, 'int64' : 1 }) + assert_series_equal(result, expected) + + # aligning + def _check_align(df, cond, other, check_dtypes = True): + rs = df.where(cond, other) + for i, k in enumerate(rs.columns): + result = rs[k] + d = df[k].values + c = cond[k].reindex(df[k].index).fillna(False).values + + if np.isscalar(other): + o = other + else: + if isinstance(other,np.ndarray): + o = Series(other[:,i],index=result.index).values + else: + o = other[k].values + + new_values = d if c.all() else np.where(c, d, o) + expected = Series(new_values,index=result.index) + + # since we can't always have the correct numpy dtype + # as numpy doesn't know how to downcast, don't check + assert_series_equal(result, expected, check_dtype=False) + + # dtypes + # can't check dtype when other is an ndarray + + if check_dtypes and not isinstance(other,np.ndarray): + self.assertTrue((rs.dtypes == df.dtypes).all() == True) + + for df in [ self.mixed_frame, self.mixed_float, self.mixed_int ]: + + # other is a frame + cond = (df > 0)[1:] + _check_align(df, cond, _safe_add(df)) + + # check other is ndarray + cond = df > 0 + _check_align(df, cond, (_safe_add(df).values)) + + # integers are upcast, so don't check the dtypes + cond = df > 0 + check_dtypes = all([ not issubclass(s.type,np.integer) for s in df.dtypes ]) + _check_align(df, cond, np.nan, check_dtypes = check_dtypes) + + # invalid conditions + df = default_frame + err1 = (df + 1).values[0:2, :] + self.assertRaises(ValueError, df.where, cond, err1) + + err2 = cond.ix[:2, :].values + other1 = _safe_add(df) + self.assertRaises(ValueError, df.where, err2, other1) + + self.assertRaises(ValueError, df.mask, True) + self.assertRaises(ValueError, df.mask, 0) + + # where inplace + def _check_set(df, cond, check_dtypes = True): + dfi = df.copy() + econd = cond.reindex_like(df).fillna(True) + expected = dfi.mask(~econd) + + dfi.where(cond, np.nan, inplace=True) + assert_frame_equal(dfi, expected) + + # dtypes (and confirm upcasts)x + if check_dtypes: + for k, v in compat.iteritems(df.dtypes): + if issubclass(v.type,np.integer) and not cond[k].all(): + v = np.dtype('float64') + self.assertEqual(dfi[k].dtype, v) + + for df in [ default_frame, self.mixed_frame, self.mixed_float, self.mixed_int ]: + + cond = df > 0 + _check_set(df, cond) + + cond = df >= 0 + _check_set(df, cond) + + # aligining + cond = (df >= 0)[1:] + _check_set(df, cond) + + def test_where_bug(self): + + # GH 2793 + + df = DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [4.0, 3.0, 2.0, 1.0]}, dtype = 'float64') + expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [4.0, 3.0, np.nan, np.nan]}, dtype = 'float64') + result = df.where(df > 2, np.nan) + assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + assert_frame_equal(result, expected) + + # mixed + for dtype in ['int16','int8','int32','int64']: + df = 
DataFrame({'a': np.array([1, 2, 3, 4],dtype=dtype), 'b': np.array([4.0, 3.0, 2.0, 1.0], dtype = 'float64') }) + expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [4.0, 3.0, np.nan, np.nan]}, dtype = 'float64') + result = df.where(df > 2, np.nan) + assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + assert_frame_equal(result, expected) + + # transpositional issue + # GH7506 + a = DataFrame({ 0 : [1,2], 1 : [3,4], 2 : [5,6]}) + b = DataFrame({ 0 : [np.nan,8], 1:[9,np.nan], 2:[np.nan,np.nan]}) + do_not_replace = b.isnull() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace,b) + assert_frame_equal(result,expected) + + a = DataFrame({ 0 : [4,6], 1 : [1,0]}) + b = DataFrame({ 0 : [np.nan,3],1:[3,np.nan]}) + do_not_replace = b.isnull() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace,b) + assert_frame_equal(result,expected) + + def test_where_datetime(self): + + # GH 3311 + df = DataFrame(dict(A = date_range('20130102',periods=5), + B = date_range('20130104',periods=5), + C = np.random.randn(5))) + + stamp = datetime(2013,1,3) + result = df[df>stamp] + expected = df.copy() + expected.loc[[0,1],'A'] = np.nan + assert_frame_equal(result,expected) + + def test_where_none(self): + # GH 4667 + # setting with None changes dtype + df = DataFrame({'series': Series(range(10))}).astype(float) + df[df > 7] = None + expected = DataFrame({'series': Series([0,1,2,3,4,5,6,7,np.nan,np.nan]) }) + assert_frame_equal(df, expected) + + # GH 7656 + df = DataFrame([{'A': 1, 'B': np.nan, 'C': 'Test'}, {'A': np.nan, 'B': 'Test', 'C': np.nan}]) + expected = df.where(~isnull(df), None) + with tm.assertRaisesRegexp(TypeError, 'boolean setting on mixed-type'): + df.where(~isnull(df), None, inplace=True) + + def test_where_align(self): + + def create(): + df = DataFrame(np.random.randn(10,3)) + df.iloc[3:5,0] = np.nan + df.iloc[4:6,1] = np.nan + df.iloc[5:8,2] = np.nan + return df + + # series + df = create() + expected = df.fillna(df.mean()) + result = df.where(pd.notnull(df),df.mean(),axis='columns') + assert_frame_equal(result, expected) + + df.where(pd.notnull(df),df.mean(),inplace=True,axis='columns') + assert_frame_equal(df, expected) + + df = create().fillna(0) + expected = df.apply(lambda x, y: x.where(x>0,y), y=df[0]) + result = df.where(df>0,df[0],axis='index') + assert_frame_equal(result, expected) + result = df.where(df>0,df[0],axis='rows') + assert_frame_equal(result, expected) + + # frame + df = create() + expected = df.fillna(1) + result = df.where(pd.notnull(df),DataFrame(1,index=df.index,columns=df.columns)) + assert_frame_equal(result, expected) + + def test_where_complex(self): + # GH 6345 + expected = DataFrame([[1+1j, 2], [np.nan, 4+1j]], columns=['a', 'b']) + df = DataFrame([[1+1j, 2], [5+1j, 4+1j]], columns=['a', 'b']) + df[df.abs() >= 5] = np.nan + assert_frame_equal(df,expected) + + def test_mask(self): + df = DataFrame(np.random.randn(5, 3)) + cond = df > 0 + + rs = df.where(cond, np.nan) + assert_frame_equal(rs, df.mask(df <= 0)) + assert_frame_equal(rs, df.mask(~cond)) + + def test_mask_edge_case_1xN_frame(self): + # GH4071 + df = DataFrame([[1, 2]]) + res = df.mask(DataFrame([[True, False]])) + expec = DataFrame([[nan, 2]]) + assert_frame_equal(res, expec) + + #---------------------------------------------------------------------- + # Transposing + + def test_transpose(self): + frame = self.frame + dft = frame.T + for idx, 
series in compat.iteritems(dft): + for col, value in compat.iteritems(series): + if np.isnan(value): + self.assertTrue(np.isnan(frame[col][idx])) + else: + self.assertEqual(value, frame[col][idx]) + + # mixed type + index, data = tm.getMixedTypeDict() + mixed = DataFrame(data, index=index) + + mixed_T = mixed.T + for col, s in compat.iteritems(mixed_T): + self.assertEqual(s.dtype, np.object_) + + def test_transpose_get_view(self): + dft = self.frame.T + dft.values[:, 5:10] = 5 + + self.assertTrue((self.frame.values[5:10] == 5).all()) + + #---------------------------------------------------------------------- + # Renaming + + def test_rename(self): + mapping = { + 'A': 'a', + 'B': 'b', + 'C': 'c', + 'D': 'd' + } + + renamed = self.frame.rename(columns=mapping) + renamed2 = self.frame.rename(columns=str.lower) + + assert_frame_equal(renamed, renamed2) + assert_frame_equal(renamed2.rename(columns=str.upper), + self.frame, check_names=False) + + # index + data = { + 'A': {'foo': 0, 'bar': 1} + } + + # gets sorted alphabetical + df = DataFrame(data) + renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'}) + self.assert_numpy_array_equal(renamed.index, ['foo', 'bar']) + + renamed = df.rename(index=str.upper) + self.assert_numpy_array_equal(renamed.index, ['BAR', 'FOO']) + + # have to pass something + self.assertRaises(TypeError, self.frame.rename) + + # partial columns + renamed = self.frame.rename(columns={'C': 'foo', 'D': 'bar'}) + self.assert_numpy_array_equal(renamed.columns, ['A', 'B', 'foo', 'bar']) + + # other axis + renamed = self.frame.T.rename(index={'C': 'foo', 'D': 'bar'}) + self.assert_numpy_array_equal(renamed.index, ['A', 'B', 'foo', 'bar']) + + # index with name + index = Index(['foo', 'bar'], name='name') + renamer = DataFrame(data, index=index) + renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) + self.assert_numpy_array_equal(renamed.index, ['bar', 'foo']) + self.assertEqual(renamed.index.name, renamer.index.name) + + # MultiIndex + tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] + tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] + index = MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) + columns = MultiIndex.from_tuples(tuples_columns, names=['fizz', 'buzz']) + renamer = DataFrame([(0,0),(1,1)], index=index, columns=columns) + renamed = renamer.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, + columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) + new_index = MultiIndex.from_tuples([('foo3', 'bar1'), ('foo2', 'bar3')]) + new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), ('fizz2', 'buzz3')]) + self.assert_numpy_array_equal(renamed.index, new_index) + self.assert_numpy_array_equal(renamed.columns, new_columns) + self.assertEqual(renamed.index.names, renamer.index.names) + self.assertEqual(renamed.columns.names, renamer.columns.names) + + def test_rename_nocopy(self): + renamed = self.frame.rename(columns={'C': 'foo'}, copy=False) + renamed['foo'] = 1. 
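test_rename_nocopy, continued just below, depends on rename(..., copy=False) sharing the underlying data, so assignment through the renamed frame is visible in the original. A short illustrative sketch of that behaviour in this pandas version:

from pandas import DataFrame

df = DataFrame({'C': [0., 0., 0.]})
renamed = df.rename(columns={'C': 'foo'}, copy=False)

# copy=False means the renamed frame shares data with df,
# so writing through it also updates df['C']
renamed['foo'] = 1.
assert (df['C'] == 1.).all()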
+ self.assertTrue((self.frame['C'] == 1.).all()) + + def test_rename_inplace(self): + self.frame.rename(columns={'C': 'foo'}) + self.assertIn('C', self.frame) + self.assertNotIn('foo', self.frame) + + c_id = id(self.frame['C']) + frame = self.frame.copy() + frame.rename(columns={'C': 'foo'}, inplace=True) + + self.assertNotIn('C', frame) + self.assertIn('foo', frame) + self.assertNotEqual(id(frame['foo']), c_id) + + def test_rename_bug(self): + # GH 5344 + # rename set ref_locs, and set_index was not resetting + df = DataFrame({ 0 : ['foo','bar'], 1 : ['bah','bas'], 2 : [1,2]}) + df = df.rename(columns={0 : 'a'}) + df = df.rename(columns={1 : 'b'}) + df = df.set_index(['a','b']) + df.columns = ['2001-01-01'] + expected = DataFrame([[1],[2]],index=MultiIndex.from_tuples([('foo','bah'),('bar','bas')], + names=['a','b']), + columns=['2001-01-01']) + assert_frame_equal(df,expected) + + #---------------------------------------------------------------------- + # Time series related + def test_diff(self): + the_diff = self.tsframe.diff(1) + + assert_series_equal(the_diff['A'], + self.tsframe['A'] - self.tsframe['A'].shift(1)) + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = DataFrame({'s': s}).diff() + self.assertEqual(rs.s[1], 1) + + # mixed numeric + tf = self.tsframe.astype('float32') + the_diff = tf.diff(1) + assert_series_equal(the_diff['A'], + tf['A'] - tf['A'].shift(1)) + + def test_diff_mixed_dtype(self): + df = DataFrame(np.random.randn(5, 3)) + df['A'] = np.array([1, 2, 3, 4, 5], dtype=object) + + result = df.diff() + self.assertEqual(result[0].dtype, np.float64) + + def test_diff_neg_n(self): + rs = self.tsframe.diff(-1) + xp = self.tsframe - self.tsframe.shift(-1) + assert_frame_equal(rs, xp) + + def test_diff_float_n(self): + rs = self.tsframe.diff(1.) 
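DataFrame.diff subtracts the previous row (or the previous column along axis=1); the int64 case in test_diff above checks that very large integers diff exactly to 1 rather than losing precision through a float round trip. A small standalone sketch:

from pandas import DataFrame, Series

a = 10000000000000000
s = Series([a, a + 1])

# the first row has no predecessor, so it becomes NaN;
# the second difference is exactly 1, with no precision loss
diffed = DataFrame({'s': s}).diff()
assert diffed['s'][1] == 1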
+ xp = self.tsframe.diff(1) + assert_frame_equal(rs, xp) + + def test_pct_change(self): + rs = self.tsframe.pct_change(fill_method=None) + assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1) + + rs = self.tsframe.pct_change(2) + filled = self.tsframe.fillna(method='pad') + assert_frame_equal(rs, filled / filled.shift(2) - 1) + + rs = self.tsframe.pct_change(fill_method='bfill', limit=1) + filled = self.tsframe.fillna(method='bfill', limit=1) + assert_frame_equal(rs, filled / filled.shift(1) - 1) + + rs = self.tsframe.pct_change(freq='5D') + filled = self.tsframe.fillna(method='pad') + assert_frame_equal(rs, filled / filled.shift(freq='5D') - 1) + + def test_pct_change_shift_over_nas(self): + s = Series([1., 1.5, np.nan, 2.5, 3.]) + + df = DataFrame({'a': s, 'b': s}) + + chg = df.pct_change() + expected = Series([np.nan, 0.5, np.nan, 2.5 / 1.5 - 1, .2]) + edf = DataFrame({'a': expected, 'b': expected}) + assert_frame_equal(chg, edf) + + def test_shift(self): + # naive shift + shiftedFrame = self.tsframe.shift(5) + self.assertTrue(shiftedFrame.index.equals(self.tsframe.index)) + + shiftedSeries = self.tsframe['A'].shift(5) + assert_series_equal(shiftedFrame['A'], shiftedSeries) + + shiftedFrame = self.tsframe.shift(-5) + self.assertTrue(shiftedFrame.index.equals(self.tsframe.index)) + + shiftedSeries = self.tsframe['A'].shift(-5) + assert_series_equal(shiftedFrame['A'], shiftedSeries) + + # shift by 0 + unshifted = self.tsframe.shift(0) + assert_frame_equal(unshifted, self.tsframe) + + # shift by DateOffset + shiftedFrame = self.tsframe.shift(5, freq=datetools.BDay()) + self.assertEqual(len(shiftedFrame), len(self.tsframe)) + + shiftedFrame2 = self.tsframe.shift(5, freq='B') + assert_frame_equal(shiftedFrame, shiftedFrame2) + + d = self.tsframe.index[0] + shifted_d = d + datetools.BDay(5) + assert_series_equal(self.tsframe.xs(d), + shiftedFrame.xs(shifted_d)) + + # shift int frame + int_shifted = self.intframe.shift(1) + + # Shifting with PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + self.assertTrue(shifted.index.equals(ps.index)) + + tm.assert_dict_equal(unshifted.ix[:, 0].valid(), ps.ix[:, 0], + compare_keys=False) + + shifted2 = ps.shift(1, 'B') + shifted3 = ps.shift(1, datetools.bday) + assert_frame_equal(shifted2, shifted3) + assert_frame_equal(ps, shifted2.shift(-1, 'B')) + + assertRaisesRegexp(ValueError, 'does not match PeriodIndex freq', + ps.shift, freq='D') + + + # shift other axis + # GH 6371 + df = DataFrame(np.random.rand(10,5)) + expected = pd.concat([DataFrame(np.nan,index=df.index,columns=[0]),df.iloc[:,0:-1]],ignore_index=True,axis=1) + result = df.shift(1,axis=1) + assert_frame_equal(result,expected) + + # shift named axis + df = DataFrame(np.random.rand(10,5)) + expected = pd.concat([DataFrame(np.nan,index=df.index,columns=[0]),df.iloc[:,0:-1]],ignore_index=True,axis=1) + result = df.shift(1,axis='columns') + assert_frame_equal(result,expected) + + def test_shift_bool(self): + df = DataFrame({'high': [True, False], + 'low': [False, False]}) + rs = df.shift(1) + xp = DataFrame(np.array([[np.nan, np.nan], + [True, False]], dtype=object), + columns=['high', 'low']) + assert_frame_equal(rs, xp) + + def test_tshift(self): + # PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + assert_frame_equal(unshifted, ps) + + shifted2 = ps.tshift(freq='B') + assert_frame_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=datetools.bday) + assert_frame_equal(shifted, 
shifted3) + + assertRaisesRegexp(ValueError, 'does not match', ps.tshift, freq='M') + + # DatetimeIndex + shifted = self.tsframe.tshift(1) + unshifted = shifted.tshift(-1) + + assert_frame_equal(self.tsframe, unshifted) + + shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq) + assert_frame_equal(shifted, shifted2) + + inferred_ts = DataFrame(self.tsframe.values, + Index(np.asarray(self.tsframe.index)), + columns=self.tsframe.columns) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + assert_frame_equal(shifted, self.tsframe.tshift(1)) + assert_frame_equal(unshifted, inferred_ts) + + no_freq = self.tsframe.ix[[0, 5, 7], :] + self.assertRaises(ValueError, no_freq.tshift) + + def test_apply(self): + # ufunc + applied = self.frame.apply(np.sqrt) + assert_series_equal(np.sqrt(self.frame['A']), applied['A']) + + # aggregator + applied = self.frame.apply(np.mean) + self.assertEqual(applied['A'], np.mean(self.frame['A'])) + + d = self.frame.index[0] + applied = self.frame.apply(np.mean, axis=1) + self.assertEqual(applied[d], np.mean(self.frame.xs(d))) + self.assertIs(applied.index, self.frame.index) # want this + + # invalid axis + df = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) + self.assertRaises(ValueError, df.apply, lambda x: x, 2) + + def test_apply_empty(self): + # empty + applied = self.empty.apply(np.sqrt) + self.assertTrue(applied.empty) + + applied = self.empty.apply(np.mean) + self.assertTrue(applied.empty) + + no_rows = self.frame[:0] + result = no_rows.apply(lambda x: x.mean()) + expected = Series(np.nan, index=self.frame.columns) + assert_series_equal(result, expected) + + no_cols = self.frame.ix[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1) + expected = Series(np.nan, index=self.frame.index) + assert_series_equal(result, expected) + + # 2476 + xp = DataFrame(index=['a']) + rs = xp.apply(lambda x: x['a'], axis=1) + assert_frame_equal(xp, rs) + + # reduce with an empty DataFrame + x = [] + result = self.empty.apply(x.append, axis=1, reduce=False) + assert_frame_equal(result, self.empty) + result = self.empty.apply(x.append, axis=1, reduce=True) + assert_series_equal(result, Series([])) + + empty_with_cols = DataFrame(columns=['a', 'b', 'c']) + result = empty_with_cols.apply(x.append, axis=1, reduce=False) + assert_frame_equal(result, empty_with_cols) + result = empty_with_cols.apply(x.append, axis=1, reduce=True) + assert_series_equal(result, Series([])) + + # Ensure that x.append hasn't been called + self.assertEqual(x, []) + + def test_apply_standard_nonunique(self): + df = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) + rs = df.apply(lambda s: s[0], axis=1) + xp = Series([1, 4, 7], ['a', 'a', 'c']) + assert_series_equal(rs, xp) + + rs = df.T.apply(lambda s: s[0], axis=0) + assert_series_equal(rs, xp) + + def test_apply_broadcast(self): + broadcasted = self.frame.apply(np.mean, broadcast=True) + agged = self.frame.apply(np.mean) + + for col, ts in compat.iteritems(broadcasted): + self.assertTrue((ts == agged[col]).all()) + + broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) + agged = self.frame.apply(np.mean, axis=1) + for idx in broadcasted.index: + self.assertTrue((broadcasted.xs(idx) == agged[idx]).all()) + + def test_apply_raw(self): + result0 = self.frame.apply(np.mean, raw=True) + result1 = self.frame.apply(np.mean, axis=1, raw=True) + + expected0 = self.frame.apply(lambda x: x.values.mean()) + expected1 = self.frame.apply(lambda x: x.values.mean(), axis=1) + + 
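In test_apply_raw, raw=True makes apply pass each column (or row) to the function as a plain ndarray rather than a Series, which is why the expected values above are built from x.values.mean(). A brief standalone sketch:

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(5, 3), columns=list('ABC'))

# raw=True hands numpy arrays to the function (no index, usually faster)
raw_means = df.apply(np.mean, raw=True)
series_means = df.apply(lambda col: col.values.mean())
# both reductions produce the same per-column means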
assert_series_equal(result0, expected0) + assert_series_equal(result1, expected1) + + # no reduction + result = self.frame.apply(lambda x: x * 2, raw=True) + expected = self.frame * 2 + assert_frame_equal(result, expected) + + def test_apply_axis1(self): + d = self.frame.index[0] + tapplied = self.frame.apply(np.mean, axis=1) + self.assertEqual(tapplied[d], np.mean(self.frame.xs(d))) + + def test_apply_ignore_failures(self): + result = self.mixed_frame._apply_standard(np.mean, 0, + ignore_failures=True) + expected = self.mixed_frame._get_numeric_data().apply(np.mean) + assert_series_equal(result, expected) + + def test_apply_mixed_dtype_corner(self): + df = DataFrame({'A': ['foo'], + 'B': [1.]}) + result = df[:0].apply(np.mean, axis=1) + # the result here is actually kind of ambiguous, should it be a Series + # or a DataFrame? + expected = Series(np.nan, index=[]) + assert_series_equal(result, expected) + + df = DataFrame({'A': ['foo'], + 'B': [1.]}) + result = df.apply(lambda x: x['A'], axis=1) + expected = Series(['foo'],index=[0]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: x['B'], axis=1) + expected = Series([1.],index=[0]) + assert_series_equal(result, expected) + + def test_apply_empty_infer_type(self): + no_cols = DataFrame(index=['a', 'b', 'c']) + no_index = DataFrame(columns=['a', 'b', 'c']) + + def _check(df, f): + test_res = f(np.array([], dtype='f8')) + is_reduction = not isinstance(test_res, np.ndarray) + + def _checkit(axis=0, raw=False): + res = df.apply(f, axis=axis, raw=raw) + if is_reduction: + agg_axis = df._get_agg_axis(axis) + tm.assert_isinstance(res, Series) + self.assertIs(res.index, agg_axis) + else: + tm.assert_isinstance(res, DataFrame) + + _checkit() + _checkit(axis=1) + _checkit(raw=True) + _checkit(axis=0, raw=True) + + _check(no_cols, lambda x: x) + _check(no_cols, lambda x: x.mean()) + _check(no_index, lambda x: x) + _check(no_index, lambda x: x.mean()) + + result = no_cols.apply(lambda x: x.mean(), broadcast=True) + tm.assert_isinstance(result, DataFrame) + + def test_apply_with_args_kwds(self): + def add_some(x, howmuch=0): + return x + howmuch + + def agg_and_add(x, howmuch=0): + return x.mean() + howmuch + + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide + + result = self.frame.apply(add_some, howmuch=2) + exp = self.frame.apply(lambda x: x + 2) + assert_frame_equal(result, exp) + + result = self.frame.apply(agg_and_add, howmuch=2) + exp = self.frame.apply(lambda x: x.mean() + 2) + assert_series_equal(result, exp) + + res = self.frame.apply(subtract_and_divide, args=(2,), divide=2) + exp = self.frame.apply(lambda x: (x - 2.) / 2.) 
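test_apply_with_args_kwds relies on apply forwarding positional arguments via args= and extra keyword arguments directly to the function. A minimal sketch mirroring the subtract_and_divide helper defined above:

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(4, 3), columns=list('ABC'))

def subtract_and_divide(x, sub, divide=1):
    return (x - sub) / divide

# args supplies positional arguments after the column; divide is passed by keyword
result = df.apply(subtract_and_divide, args=(2,), divide=2)
expected = df.apply(lambda x: (x - 2.) / 2.)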
+ assert_frame_equal(res, exp) + + def test_apply_yield_list(self): + result = self.frame.apply(list) + assert_frame_equal(result, self.frame) + + def test_apply_reduce_Series(self): + self.frame.ix[::2, 'A'] = np.nan + expected = self.frame.mean(1) + result = self.frame.apply(np.mean, axis=1) + assert_series_equal(result, expected) + + def test_apply_differently_indexed(self): + df = DataFrame(np.random.randn(20, 10)) + + result0 = df.apply(Series.describe, axis=0) + expected0 = DataFrame(dict((i, v.describe()) + for i, v in compat.iteritems(df)), + columns=df.columns) + assert_frame_equal(result0, expected0) + + result1 = df.apply(Series.describe, axis=1) + expected1 = DataFrame(dict((i, v.describe()) + for i, v in compat.iteritems(df.T)), + columns=df.index).T + assert_frame_equal(result1, expected1) + + def test_apply_modify_traceback(self): + data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + data['C'][4] = np.nan + + def transform(row): + if row['C'].startswith('shin') and row['A'] == 'foo': + row['D'] = 7 + return row + + def transform2(row): + if (notnull(row['C']) and row['C'].startswith('shin') + and row['A'] == 'foo'): + row['D'] = 7 + return row + + try: + transformed = data.apply(transform, axis=1) + except AttributeError as e: + self.assertEqual(len(e.args), 2) + self.assertEqual(e.args[1], 'occurred at index 4') + self.assertEqual(e.args[0], "'float' object has no attribute 'startswith'") + + def test_apply_bug(self): + + # GH 6125 + import datetime + positions = pd.DataFrame([[1, 'ABC0', 50], [1, 'YUM0', 20], + [1, 'DEF0', 20], [2, 'ABC1', 50], + [2, 'YUM1', 20], [2, 'DEF1', 20]], + columns=['a', 'market', 'position']) + def f(r): + return r['market'] + expected = positions.apply(f, axis=1) + + positions = DataFrame([[datetime.datetime(2013, 1, 1), 'ABC0', 50], + [datetime.datetime(2013, 1, 2), 'YUM0', 20], + [datetime.datetime(2013, 1, 3), 'DEF0', 20], + [datetime.datetime(2013, 1, 4), 'ABC1', 50], + [datetime.datetime(2013, 1, 5), 'YUM1', 20], + [datetime.datetime(2013, 1, 6), 'DEF1', 20]], + columns=['a', 'market', 'position']) + result = positions.apply(f, axis=1) + assert_series_equal(result,expected) + + def test_swapaxes(self): + df = DataFrame(np.random.randn(10, 5)) + assert_frame_equal(df.T, df.swapaxes(0, 1)) + assert_frame_equal(df.T, df.swapaxes(1, 0)) + assert_frame_equal(df, df.swapaxes(0, 0)) + self.assertRaises(ValueError, df.swapaxes, 2, 5) + + def test_apply_convert_objects(self): + data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + result = data.apply(lambda x: x, axis=1) + assert_frame_equal(result.convert_objects(), data) + + def test_apply_attach_name(self): + result = self.frame.apply(lambda x: x.name) + expected = Series(self.frame.columns, index=self.frame.columns) + assert_series_equal(result, expected) + + result = self.frame.apply(lambda x: x.name, axis=1) + expected = 
Series(self.frame.index, index=self.frame.index) + assert_series_equal(result, expected) + + # non-reductions + result = self.frame.apply(lambda x: np.repeat(x.name, len(x))) + expected = DataFrame(np.tile(self.frame.columns, + (len(self.frame.index), 1)), + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(result, expected) + + result = self.frame.apply(lambda x: np.repeat(x.name, len(x)), + axis=1) + expected = DataFrame(np.tile(self.frame.index, + (len(self.frame.columns), 1)).T, + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(result, expected) + + def test_apply_multi_index(self): + s = DataFrame([[1,2], [3,4], [5,6]]) + s.index = MultiIndex.from_arrays([['a','a','b'], ['c','d','d']]) + s.columns = ['col1','col2'] + res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1) + tm.assert_isinstance(res.index, MultiIndex) + + def test_applymap(self): + applied = self.frame.applymap(lambda x: x * 2) + assert_frame_equal(applied, self.frame * 2) + result = self.frame.applymap(type) + + # GH #465, function returning tuples + result = self.frame.applymap(lambda x: (x, x)) + tm.assert_isinstance(result['A'][0], tuple) + + # GH 2909, object conversion to float in constructor? + df = DataFrame(data=[1,'a']) + result = df.applymap(lambda x: x) + self.assertEqual(result.dtypes[0], object) + + df = DataFrame(data=[1.,'a']) + result = df.applymap(lambda x: x) + self.assertEqual(result.dtypes[0], object) + + # GH2786 + df = DataFrame(np.random.random((3,4))) + df2 = df.copy() + cols = ['a','a','a','a'] + df.columns = cols + + expected = df2.applymap(str) + expected.columns = cols + result = df.applymap(str) + assert_frame_equal(result,expected) + + def test_filter(self): + # items + filtered = self.frame.filter(['A', 'B', 'E']) + self.assertEqual(len(filtered.columns), 2) + self.assertNotIn('E', filtered) + + filtered = self.frame.filter(['A', 'B', 'E'], axis='columns') + self.assertEqual(len(filtered.columns), 2) + self.assertNotIn('E', filtered) + + # other axis + idx = self.frame.index[0:4] + filtered = self.frame.filter(idx, axis='index') + expected = self.frame.reindex(index=idx) + assert_frame_equal(filtered,expected) + + # like + fcopy = self.frame.copy() + fcopy['AA'] = 1 + + filtered = fcopy.filter(like='A') + self.assertEqual(len(filtered.columns), 2) + self.assertIn('AA', filtered) + + # like with ints in column names + df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B']) + filtered = df.filter(like='_') + self.assertEqual(len(filtered.columns), 2) + + # pass in None + with assertRaisesRegexp(TypeError, 'Must pass'): + self.frame.filter(items=None) + + # objects + filtered = self.mixed_frame.filter(like='foo') + self.assertIn('foo', filtered) + + # unicode columns, won't ascii-encode + df = self.frame.rename(columns={'B': u('\u2202')}) + filtered = df.filter(like='C') + self.assertTrue('C' in filtered) + + def test_filter_regex_search(self): + fcopy = self.frame.copy() + fcopy['AA'] = 1 + + # regex + filtered = fcopy.filter(regex='[A]+') + self.assertEqual(len(filtered.columns), 2) + self.assertIn('AA', filtered) + + # doesn't have to be at beginning + df = DataFrame({'aBBa': [1, 2], + 'BBaBB': [1, 2], + 'aCCa': [1, 2], + 'aCCaBB': [1, 2]}) + + result = df.filter(regex='BB') + exp = df[[x for x in df.columns if 'BB' in x]] + assert_frame_equal(result, exp) + + def test_filter_corner(self): + empty = DataFrame() + + result = empty.filter([]) + assert_frame_equal(result, empty) + + result = empty.filter(like='foo') + 
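The filter tests cover three selection modes: an explicit items list (labels that do not exist are silently dropped), substring matching via like=, and regular expressions via regex=. A compact standalone sketch:

from pandas import DataFrame

df = DataFrame({'AA': [1, 2], 'AB': [3, 4], 'B': [5, 6]})

by_items = df.filter(['AA', 'B', 'E'])   # 'E' does not exist and is dropped
by_like = df.filter(like='A')            # columns whose label contains 'A'
by_regex = df.filter(regex='^A')         # columns whose label matches the pattern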
assert_frame_equal(result, empty) + + def test_select(self): + f = lambda x: x.weekday() == 2 + result = self.tsframe.select(f, axis=0) + expected = self.tsframe.reindex( + index=self.tsframe.index[[f(x) for x in self.tsframe.index]]) + assert_frame_equal(result, expected) + + result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) + expected = self.frame.reindex(columns=['B', 'D']) + + assert_frame_equal(result, expected, check_names=False) # TODO should reindex check_names? + + def test_reorder_levels(self): + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]], + names=['L0', 'L1', 'L2']) + df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index) + + # no change, position + result = df.reorder_levels([0, 1, 2]) + assert_frame_equal(df, result) + + # no change, labels + result = df.reorder_levels(['L0', 'L1', 'L2']) + assert_frame_equal(df, result) + + # rotate, position + result = df.reorder_levels([1, 2, 0]) + e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], + labels=[[0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0]], + names=['L1', 'L2', 'L0']) + expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, + index=e_idx) + assert_frame_equal(result, expected) + + result = df.reorder_levels([0, 0, 0]) + e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], + labels=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + names=['L0', 'L0', 'L0']) + expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, + index=e_idx) + assert_frame_equal(result, expected) + + result = df.reorder_levels(['L0', 'L0', 'L0']) + assert_frame_equal(result, expected) + + def test_sort_index(self): + frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + # axis=0 + unordered = frame.ix[[3, 2, 4, 1]] + sorted_df = unordered.sort_index() + expected = frame + assert_frame_equal(sorted_df, expected) + + sorted_df = unordered.sort_index(ascending=False) + expected = frame[::-1] + assert_frame_equal(sorted_df, expected) + + # axis=1 + unordered = frame.ix[:, ['D', 'B', 'C', 'A']] + sorted_df = unordered.sort_index(axis=1) + expected = frame + assert_frame_equal(sorted_df, expected) + + sorted_df = unordered.sort_index(axis=1, ascending=False) + expected = frame.ix[:, ::-1] + assert_frame_equal(sorted_df, expected) + + # by column + sorted_df = frame.sort_index(by='A') + indexer = frame['A'].argsort().values + expected = frame.ix[frame.index[indexer]] + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_index(by='A', ascending=False) + indexer = indexer[::-1] + expected = frame.ix[frame.index[indexer]] + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort(columns='A', ascending=False) + assert_frame_equal(sorted_df, expected) + + # GH4839 + sorted_df = frame.sort(columns=['A'], ascending=[False]) + assert_frame_equal(sorted_df, expected) + + # check for now + sorted_df = frame.sort(columns='A') + assert_frame_equal(sorted_df, expected[::-1]) + expected = frame.sort_index(by='A') + assert_frame_equal(sorted_df, expected) + + + sorted_df = frame.sort(columns=['A', 'B'], ascending=False) + expected = frame.sort_index(by=['A', 'B'], ascending=False) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort(columns=['A', 'B']) + assert_frame_equal(sorted_df, expected[::-1]) + + self.assertRaises(ValueError, frame.sort_index, axis=2, inplace=True) + + msg = 'When sorting by 
column, axis must be 0' + with assertRaisesRegexp(ValueError, msg): + frame.sort_index(by='A', axis=1) + + msg = r'Length of ascending \(5\) != length of by \(2\)' + with assertRaisesRegexp(ValueError, msg): + frame.sort_index(by=['A', 'B'], axis=0, ascending=[True] * 5) + + def test_sort_nan(self): + # GH3917 + nan = np.nan + df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}) + + # sort one column only + expected = DataFrame( + {'A': [nan, 1, 1, 2, 4, 6, 8], + 'B': [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5]) + sorted_df = df.sort(['A'], na_position='first') + assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {'A': [nan, 8, 6, 4, 2, 1, 1], + 'B': [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3]) + sorted_df = df.sort(['A'], na_position='first', ascending=False) + assert_frame_equal(sorted_df, expected) + + # na_position='last', order + expected = DataFrame( + {'A': [1, 1, 2, 4, 6, 8, nan], + 'B': [2, 9, nan, 5, 5, 4, 5]}, + index=[3, 0, 1, 6, 4, 5, 2]) + sorted_df = df.sort(['A','B']) + assert_frame_equal(sorted_df, expected) + + # na_position='first', order + expected = DataFrame( + {'A': [nan, 1, 1, 2, 4, 6, 8], + 'B': [5, 2, 9, nan, 5, 5, 4]}, + index=[2, 3, 0, 1, 6, 4, 5]) + sorted_df = df.sort(['A','B'], na_position='first') + assert_frame_equal(sorted_df, expected) + + # na_position='first', not order + expected = DataFrame( + {'A': [nan, 1, 1, 2, 4, 6, 8], + 'B': [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5]) + sorted_df = df.sort(['A','B'], ascending=[1,0], na_position='first') + assert_frame_equal(sorted_df, expected) + + # na_position='last', not order + expected = DataFrame( + {'A': [8, 6, 4, 2, 1, 1, nan], + 'B': [4, 5, 5, nan, 2, 9, 5]}, + index=[5, 4, 6, 1, 3, 0, 2]) + sorted_df = df.sort(['A','B'], ascending=[0,1], na_position='last') + assert_frame_equal(sorted_df, expected) + + # Test DataFrame with nan label + df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}, + index = [1, 2, 3, 4, 5, 6, nan]) + + # NaN label, ascending=True, na_position='last' + sorted_df = df.sort(kind='quicksort', ascending=True, na_position='last') + expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}, + index = [1, 2, 3, 4, 5, 6, nan]) + assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=True, na_position='first' + sorted_df = df.sort(na_position='first') + expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8], + 'B': [5, 9, nan, 5, 2, 5, 4]}, + index = [nan, 1, 2, 3, 4, 5, 6]) + assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='last' + sorted_df = df.sort(kind='quicksort', ascending=False) + expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4], + 'B': [4, 5, 2, 5, nan, 9, 5]}, + index = [6, 5, 4, 3, 2, 1, nan]) + assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='first' + sorted_df = df.sort(kind='quicksort', ascending=False, na_position='first') + expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1], + 'B': [5, 4, 5, 2, 5, nan, 9]}, + index = [nan, 6, 5, 4, 3, 2, 1]) + assert_frame_equal(sorted_df, expected) + + def test_stable_descending_sort(self): + # GH #6399 + df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], + columns=['sort_col', 'order']) + sorted_df = df.sort_index(by='sort_col', kind='mergesort', + ascending=False) + assert_frame_equal(df, sorted_df) + + def test_stable_descending_multicolumn_sort(self): + nan = np.nan + df = DataFrame({'A': [1, 2, nan, 1, 
6, 8, 4], + 'B': [9, nan, 5, 2, 5, 4, 5]}) + # test stable mergesort + expected = DataFrame( + {'A': [nan, 8, 6, 4, 2, 1, 1], + 'B': [5, 4, 5, 5, nan, 2, 9]}, + index=[2, 5, 4, 6, 1, 3, 0]) + sorted_df = df.sort(['A','B'], ascending=[0,1], na_position='first', + kind='mergesort') + assert_frame_equal(sorted_df, expected) + + expected = DataFrame( + {'A': [nan, 8, 6, 4, 2, 1, 1], + 'B': [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3]) + sorted_df = df.sort(['A','B'], ascending=[0,0], na_position='first', + kind='mergesort') + assert_frame_equal(sorted_df, expected) + + def test_sort_index_multicolumn(self): + import random + A = np.arange(5).repeat(20) + B = np.tile(np.arange(5), 20) + random.shuffle(A) + random.shuffle(B) + frame = DataFrame({'A': A, 'B': B, + 'C': np.random.randn(100)}) + + result = frame.sort_index(by=['A', 'B']) + indexer = np.lexsort((frame['B'], frame['A'])) + expected = frame.take(indexer) + assert_frame_equal(result, expected) + + result = frame.sort_index(by=['A', 'B'], ascending=False) + indexer = np.lexsort((frame['B'].rank(ascending=False), + frame['A'].rank(ascending=False))) + expected = frame.take(indexer) + assert_frame_equal(result, expected) + + result = frame.sort_index(by=['B', 'A']) + indexer = np.lexsort((frame['A'], frame['B'])) + expected = frame.take(indexer) + assert_frame_equal(result, expected) + + def test_sort_index_inplace(self): + frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + # axis=0 + unordered = frame.ix[[3, 2, 4, 1]] + a_id = id(unordered['A']) + df = unordered.copy() + df.sort_index(inplace=True) + expected = frame + assert_frame_equal(df, expected) + self.assertNotEqual(a_id, id(df['A'])) + + df = unordered.copy() + df.sort_index(ascending=False, inplace=True) + expected = frame[::-1] + assert_frame_equal(df, expected) + + # axis=1 + unordered = frame.ix[:, ['D', 'B', 'C', 'A']] + df = unordered.copy() + df.sort_index(axis=1, inplace=True) + expected = frame + assert_frame_equal(df, expected) + + df = unordered.copy() + df.sort_index(axis=1, ascending=False, inplace=True) + expected = frame.ix[:, ::-1] + assert_frame_equal(df, expected) + + def test_sort_index_different_sortorder(self): + import random + A = np.arange(20).repeat(5) + B = np.tile(np.arange(5), 20) + + indexer = np.random.permutation(100) + A = A.take(indexer) + B = B.take(indexer) + + df = DataFrame({'A': A, 'B': B, + 'C': np.random.randn(100)}) + + result = df.sort_index(by=['A', 'B'], ascending=[1, 0]) + + ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) + expected = df.take(ex_indexer) + assert_frame_equal(result, expected) + + # test with multiindex, too + idf = df.set_index(['A', 'B']) + + result = idf.sort_index(ascending=[1, 0]) + expected = idf.take(ex_indexer) + assert_frame_equal(result, expected) + + # also, Series! 
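In this pandas version, sorting by column values is spelled DataFrame.sort(columns=...) or sort_index(by=...), and a list of booleans gives each key its own direction; the tests verify the result against a hand-built np.lexsort ordering. A sketch under the same assumptions, with made-up data:

import numpy as np
from pandas import DataFrame

df = DataFrame({'A': [2, 1, 2, 1], 'B': [1, 2, 3, 4]})

# ascending on A, descending on B (0.14-era spelling)
result = df.sort_index(by=['A', 'B'], ascending=[True, False])

# the same ordering via lexsort: the last key passed is the primary key
indexer = np.lexsort((-df['B'].values, df['A'].values))
expected = df.take(indexer)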
+ result = idf['C'].sort_index(ascending=[1, 0]) + assert_series_equal(result, expected['C']) + + def test_sort_inplace(self): + frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + sorted_df = frame.copy() + sorted_df.sort(columns='A', inplace=True) + expected = frame.sort_index(by='A') + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort(columns='A', ascending=False, inplace=True) + expected = frame.sort_index(by='A', ascending=False) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort(columns=['A', 'B'], ascending=False, inplace=True) + expected = frame.sort_index(by=['A', 'B'], ascending=False) + assert_frame_equal(sorted_df, expected) + + def test_sort_index_duplicates(self): + df = DataFrame([lrange(5,9), lrange(4)], + columns=['a', 'a', 'b', 'b']) + + with assertRaisesRegexp(ValueError, 'duplicate'): + df.sort_index(by='a') + with assertRaisesRegexp(ValueError, 'duplicate'): + df.sort_index(by=['a']) + with assertRaisesRegexp(ValueError, 'duplicate'): + # multi-column 'by' is separate codepath + df.sort_index(by=['a', 'b']) + + # with multi-index + # GH4370 + df = DataFrame(np.random.randn(4,2),columns=MultiIndex.from_tuples([('a',0),('a',1)])) + with assertRaisesRegexp(ValueError, 'levels'): + df.sort_index(by='a') + + # convert tuples to a list of tuples + expected = df.sort_index(by=[('a',1)]) + result = df.sort_index(by=('a',1)) + assert_frame_equal(result, expected) + + def test_sortlevel(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + df = DataFrame([[1, 2], [3, 4]], mi) + res = df.sortlevel('A', sort_remaining=False) + assert_frame_equal(df, res) + + res = df.sortlevel(['A', 'B'], sort_remaining=False) + assert_frame_equal(df, res) + + def test_sort_datetimes(self): + + # GH 3461, argsort / lexsort differences for a datetime column + df = DataFrame(['a','a','a','b','c','d','e','f','g'], + columns=['A'], + index=date_range('20130101',periods=9)) + dts = [Timestamp(x) + for x in ['2004-02-11','2004-01-21','2004-01-26', + '2005-09-20','2010-10-04','2009-05-12', + '2008-11-12','2010-09-28','2010-09-28']] + df['B'] = dts[::2] + dts[1::2] + df['C'] = 2. + df['A1'] = 3. + + df1 = df.sort(columns='A') + df2 = df.sort(columns=['A']) + assert_frame_equal(df1,df2) + + df1 = df.sort(columns='B') + df2 = df.sort(columns=['B']) + assert_frame_equal(df1,df2) + + def test_frame_column_inplace_sort_exception(self): + s = self.frame['A'] + with assertRaisesRegexp(ValueError, "This Series is a view"): + s.sort() + + cp = s.copy() + cp.sort() # it works! 
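combine_first, tested next, patches missing values in the calling frame with values from the other frame after aligning on both index and columns; existing values always win. A minimal sketch mirroring the doc example embedded in the test:

import numpy as np
from pandas import DataFrame

df1 = DataFrame({'A': [1., np.nan, 3.], 'B': [np.nan, 2., 3.]})
df2 = DataFrame({'A': [5., 2., 4.], 'B': [np.nan, np.nan, 4.]})

# NaNs in df1 are filled from df2; non-missing df1 values are kept
result = df1.combine_first(df2)
# result['A'] == [1., 2., 3.]; result['B'] == [nan, 2., 3.]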
+ + def test_combine_first(self): + # disjoint + head, tail = self.frame[:5], self.frame[5:] + + combined = head.combine_first(tail) + reordered_frame = self.frame.reindex(combined.index) + assert_frame_equal(combined, reordered_frame) + self.assertTrue(tm.equalContents(combined.columns, self.frame.columns)) + assert_series_equal(combined['A'], reordered_frame['A']) + + # same index + fcopy = self.frame.copy() + fcopy['A'] = 1 + del fcopy['C'] + + fcopy2 = self.frame.copy() + fcopy2['B'] = 0 + del fcopy2['D'] + + combined = fcopy.combine_first(fcopy2) + + self.assertTrue((combined['A'] == 1).all()) + assert_series_equal(combined['B'], fcopy['B']) + assert_series_equal(combined['C'], fcopy2['C']) + assert_series_equal(combined['D'], fcopy['D']) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head['A'] = 1 + + combined = head.combine_first(tail) + self.assertTrue((combined['A'][:10] == 1).all()) + + # reverse overlap + tail['A'][:10] = 0 + combined = tail.combine_first(head) + self.assertTrue((combined['A'][:10] == 0).all()) + + # no overlap + f = self.frame[:10] + g = self.frame[10:] + combined = f.combine_first(g) + assert_series_equal(combined['A'].reindex(f.index), f['A']) + assert_series_equal(combined['A'].reindex(g.index), g['A']) + + # corner cases + comb = self.frame.combine_first(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combine_first(self.frame) + assert_frame_equal(comb, self.frame) + + comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) + self.assertTrue("faz" in comb.index) + + # #2525 + df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame({}, columns=['b']) + result = df.combine_first(df2) + self.assertTrue('b' in result) + + def test_combine_first_mixed_bug(self): + idx = Index(['a', 'b', 'c', 'e']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'e'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame1 = DataFrame({"col0": ser1, + "col2": ser2, + "col3": ser3}) + + idx = Index(['a', 'b', 'c', 'f']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'f'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame2 = DataFrame({"col1": ser1, + "col2": ser2, + "col5": ser3}) + + combined = frame1.combine_first(frame2) + self.assertEqual(len(combined.columns), 5) + + # gh 3016 (same as in update) + df = DataFrame([[1.,2.,False, True],[4.,5.,True,False]], + columns=['A','B','bool1','bool2']) + + other = DataFrame([[45,45]],index=[0],columns=['A','B']) + result = df.combine_first(other) + assert_frame_equal(result, df) + + df.ix[0,'A'] = np.nan + result = df.combine_first(other) + df.ix[0,'A'] = 45 + assert_frame_equal(result, df) + + # doc example + df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + + df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + + result = df1.combine_first(df2) + expected = DataFrame({ 'A' : [1,2,3,5,3,7.], 'B' : [np.nan,2,3,4,6,8] }) + assert_frame_equal(result,expected) + + # GH3552, return object dtype with bools + df1 = DataFrame([[np.nan, 3.,True], [-4.6, np.nan, True], [np.nan, 7., False]]) + df2 = DataFrame([[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) + + result = df1.combine_first(df2)[2] + expected = Series([True,True,False]) + assert_series_equal(result,expected) + + # GH 3593, converting datetime64[ns] incorrecly + df0 = DataFrame({"a":[datetime(2000, 1, 1), 
datetime(2000, 1, 2), datetime(2000, 1, 3)]}) + df1 = DataFrame({"a":[None, None, None]}) + df2 = df1.combine_first(df0) + assert_frame_equal(df2,df0) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2,df0) + + df0 = DataFrame({"a":[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}) + df1 = DataFrame({"a":[datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0,:] = df1.iloc[0,:] + assert_frame_equal(df2,result) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2,df0) + + def test_update(self): + df = DataFrame([[1.5, nan, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other) + + expected = DataFrame([[1.5, nan, 3], + [3.6, 2, 3], + [1.5, nan, 3], + [1.5, nan, 7.]]) + assert_frame_equal(df, expected) + + def test_update_dtypes(self): + + # gh 3016 + df = DataFrame([[1.,2.,False, True],[4.,5.,True,False]], + columns=['A','B','bool1','bool2']) + + other = DataFrame([[45,45]],index=[0],columns=['A','B']) + df.update(other) + + expected = DataFrame([[45.,45.,False, True],[4.,5.,True,False]], + columns=['A','B','bool1','bool2']) + assert_frame_equal(df, expected) + + def test_update_nooverwrite(self): + df = DataFrame([[1.5, nan, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, overwrite=False) + + expected = DataFrame([[1.5, nan, 3], + [1.5, 2, 3], + [1.5, nan, 3], + [1.5, nan, 3.]]) + assert_frame_equal(df, expected) + + def test_update_filtered(self): + df = DataFrame([[1.5, nan, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, filter_func=lambda x: x > 2) + + expected = DataFrame([[1.5, nan, 3], + [1.5, nan, 3], + [1.5, nan, 3], + [1.5, nan, 7.]]) + assert_frame_equal(df, expected) + + def test_update_raise(self): + df = DataFrame([[1.5, 1, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[2., nan], + [nan, 7]], index=[1, 3], columns=[1, 2]) + with assertRaisesRegexp(ValueError, "Data overlaps"): + df.update(other, raise_conflict=True) + + def test_update_from_non_df(self): + d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])} + df = DataFrame(d) + + d['a'] = Series([5, 6, 7, 8]) + df.update(d) + + expected = DataFrame(d) + + assert_frame_equal(df, expected) + + d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]} + df = DataFrame(d) + + d['a'] = [5, 6, 7, 8] + df.update(d) + + expected = DataFrame(d) + + assert_frame_equal(df, expected) + + def test_combineAdd(self): + # trivial + comb = self.frame.combineAdd(self.frame) + assert_frame_equal(comb, self.frame * 2) + + # more rigorous + a = DataFrame([[1., nan, nan, 2., nan]], + columns=np.arange(5)) + b = DataFrame([[2., 3., nan, 2., 6., nan]], + columns=np.arange(6)) + expected = DataFrame([[3., 3., nan, 4., 6., nan]], + columns=np.arange(6)) + + result = a.combineAdd(b) + assert_frame_equal(result, expected) + result2 = a.T.combineAdd(b.T) + assert_frame_equal(result2, expected.T) + + expected2 = a.combine(b, operator.add, fill_value=0.) 
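combineAdd, which exists in this pandas version, adds two frames over the union of their labels; the assertion that follows states its equivalence to combine with operator.add and fill_value=0. A sketch of that equivalence with illustrative frames:

import operator
import numpy as np
from pandas import DataFrame

a = DataFrame([[1., np.nan, 2.]], columns=[0, 1, 2])
b = DataFrame([[2., 3., np.nan, 6.]], columns=[0, 1, 2, 3])

# values present on only one side are added to 0, while positions
# missing on both sides stay NaN
summed = a.combineAdd(b)                          # [[3., 3., 2., 6.]]
same = a.combine(b, operator.add, fill_value=0.)  # equivalent spelling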
+ assert_frame_equal(expected, expected2) + + # corner cases + comb = self.frame.combineAdd(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combineAdd(self.frame) + assert_frame_equal(comb, self.frame) + + # integer corner case + df1 = DataFrame({'x': [5]}) + df2 = DataFrame({'x': [1]}) + df3 = DataFrame({'x': [6]}) + comb = df1.combineAdd(df2) + assert_frame_equal(comb, df3) + + # mixed type GH2191 + df1 = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = DataFrame({'A': [1, 2], 'C': [5, 6]}) + rs = df1.combineAdd(df2) + xp = DataFrame({'A': [2, 4], 'B': [3, 4.], 'C': [5, 6.]}) + assert_frame_equal(xp, rs) + + # TODO: test integer fill corner? + + def test_combineMult(self): + # trivial + comb = self.frame.combineMult(self.frame) + + assert_frame_equal(comb, self.frame ** 2) + + # corner cases + comb = self.frame.combineMult(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combineMult(self.frame) + assert_frame_equal(comb, self.frame) + + def test_combine_generic(self): + df1 = self.frame + df2 = self.frame.ix[:-5, ['A', 'B', 'C']] + + combined = df1.combine(df2, np.add) + combined2 = df2.combine(df1, np.add) + self.assertTrue(combined['D'].isnull().all()) + self.assertTrue(combined2['D'].isnull().all()) + + chunk = combined.ix[:-5, ['A', 'B', 'C']] + chunk2 = combined2.ix[:-5, ['A', 'B', 'C']] + + exp = self.frame.ix[:-5, ['A', 'B', 'C']].reindex_like(chunk) * 2 + assert_frame_equal(chunk, exp) + assert_frame_equal(chunk2, exp) + + def test_clip(self): + median = self.frame.median().median() + + capped = self.frame.clip_upper(median) + self.assertFalse((capped.values > median).any()) + + floored = self.frame.clip_lower(median) + self.assertFalse((floored.values < median).any()) + + double = self.frame.clip(upper=median, lower=median) + self.assertFalse((double.values != median).any()) + + def test_dataframe_clip(self): + + # GH #2747 + df = DataFrame(np.random.randn(1000,2)) + + for lb, ub in [(-1,1),(1,-1)]: + clipped_df = df.clip(lb, ub) + + lb, ub = min(lb,ub), max(ub,lb) + lb_mask = df.values <= lb + ub_mask = df.values >= ub + mask = ~lb_mask & ~ub_mask + self.assertTrue((clipped_df.values[lb_mask] == lb).all() == True) + self.assertTrue((clipped_df.values[ub_mask] == ub).all() == True) + self.assertTrue((clipped_df.values[mask] == df.values[mask]).all() == True) + + def test_get_X_columns(self): + # numeric and object columns + + df = DataFrame({'a': [1, 2, 3], + 'b' : [True, False, True], + 'c': ['foo', 'bar', 'baz'], + 'd': [None, None, None], + 'e': [3.14, 0.577, 2.773]}) + + self.assert_numpy_array_equal(df._get_numeric_data().columns, + ['a', 'b', 'e']) + + def test_is_mixed_type(self): + self.assertFalse(self.frame._is_mixed_type) + self.assertTrue(self.mixed_frame._is_mixed_type) + + def test_get_numeric_data(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'f' : Timestamp('20010102')}, + index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 1, 'float64' : 1, datetime64name: 1, objectname : 1}) + result.sort_index() + expected.sort_index() + assert_series_equal(result, expected) + + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', + 'd' : np.array([1.]*10,dtype='float32'), + 'e' : np.array([1]*10,dtype='int32'), + 'f' : np.array([1]*10,dtype='int16'), + 'g' : Timestamp('20010102')}, + index=np.arange(10)) + + result = df._get_numeric_data() + 
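The clip tests above check that clip_upper/clip_lower bound one side and clip(lower, upper) bounds both, leaving in-range values untouched. A short standalone sketch:

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(1000, 2))

clipped = df.clip(-1, 1)          # everything outside [-1, 1] is set to the bound
assert (clipped.values >= -1).all() and (clipped.values <= 1).all()

capped = df.clip_upper(0)         # one-sided: no value above 0 survives
assert not (capped.values > 0).any()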
expected = df.ix[:, ['a', 'b','d','e','f']] + assert_frame_equal(result, expected) + + only_obj = df.ix[:, ['c','g']] + result = only_obj._get_numeric_data() + expected = df.ix[:, []] + assert_frame_equal(result, expected) + + df = DataFrame.from_dict({'a':[1,2], 'b':['foo','bar'],'c':[np.pi,np.e]}) + result = df._get_numeric_data() + expected = DataFrame.from_dict({'a':[1,2], 'c':[np.pi,np.e]}) + assert_frame_equal(result, expected) + + df = result.copy() + result = df._get_numeric_data() + expected = df + assert_frame_equal(result, expected) + + def test_bool_describe_in_mixed_frame(self): + df = DataFrame({ + 'string_data': ['a', 'b', 'c', 'd', 'e'], + 'bool_data': [True, True, False, False, False], + 'int_data': [10, 20, 30, 40, 50], + }) + + # Boolean data and integer data is included in .describe() output, string data isn't + self.assert_numpy_array_equal(df.describe().columns, ['bool_data', 'int_data']) + + bool_describe = df.describe()['bool_data'] + + # Both the min and the max values should stay booleans + self.assertEqual(bool_describe['min'].dtype, np.bool_) + self.assertEqual(bool_describe['max'].dtype, np.bool_) + + self.assertFalse(bool_describe['min']) + self.assertTrue(bool_describe['max']) + + # For numeric operations, like mean or median, the values True/False are cast to + # the integer values 1 and 0 + assert_almost_equal(bool_describe['mean'], 0.4) + assert_almost_equal(bool_describe['50%'], 0) + + def test_reduce_mixed_frame(self): + # GH 6806 + df = DataFrame({ + 'bool_data': [True, True, False, False, False], + 'int_data': [10, 20, 30, 40, 50], + 'string_data': ['a', 'b', 'c', 'd', 'e'], + }) + df.reindex(columns=['bool_data', 'int_data', 'string_data']) + test = df.sum(axis=0) + assert_almost_equal(test.values, [2, 150, 'abcde']) + assert_series_equal(test, df.T.sum(axis=1)) + + def test_count(self): + f = lambda s: notnull(s).sum() + self._check_stat_op('count', f, + has_skipna=False, + has_numeric_only=True, + check_dtype=False, + check_dates=True) + + # corner case + frame = DataFrame() + ct1 = frame.count(1) + tm.assert_isinstance(ct1, Series) + + ct2 = frame.count(0) + tm.assert_isinstance(ct2, Series) + + # GH #423 + df = DataFrame(index=lrange(10)) + result = df.count(1) + expected = Series(0, index=df.index) + assert_series_equal(result, expected) + + df = DataFrame(columns=lrange(10)) + result = df.count(0) + expected = Series(0, index=df.columns) + assert_series_equal(result, expected) + + df = DataFrame() + result = df.count() + expected = Series(0, index=[]) + assert_series_equal(result, expected) + + def test_sum(self): + self._check_stat_op('sum', np.sum, has_numeric_only=True) + + # mixed types (with upcasting happening) + self._check_stat_op('sum', np.sum, frame=self.mixed_float.astype('float32'), + has_numeric_only=True, check_dtype=False, check_less_precise=True) + + def test_stat_operators_attempt_obj_array(self): + data = { + 'a': [-0.00049987540199591344, -0.0016467257772919831, + 0.00067695870775883013], + 'b': [-0, -0, 0.0], + 'c': [0.00031111847529610595, 0.0014902627951905339, + -0.00094099200035979691] + } + df1 = DataFrame(data, index=['foo', 'bar', 'baz'], + dtype='O') + methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max'] + + # GH #676 + df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], + 2: [np.nan, 4]}, dtype=object) + + for df in [df1, df2]: + for meth in methods: + self.assertEqual(df.values.dtype, np.object_) + result = getattr(df, meth)(1) + expected = getattr(df.astype('f8'), meth)(1) + 
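test_stat_operators_attempt_obj_array, begun above, asserts that reductions over an object-dtype frame of numbers agree with the same reduction after an explicit cast to float64. A hedged illustration with made-up data:

import numpy as np
from pandas import DataFrame

df = DataFrame({'a': [1.0, 2.0, 3.0]}, dtype=object)
assert df.values.dtype == np.object_

# the reduction falls back to numeric handling of the object values,
# matching an explicit astype('f8') followed by the same reduction
assert df.sum()['a'] == df.astype('f8').sum()['a']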
assert_series_equal(result, expected) + + def test_mean(self): + self._check_stat_op('mean', np.mean, check_dates=True) + + def test_product(self): + self._check_stat_op('product', np.prod) + + def test_median(self): + def wrapper(x): + if isnull(x).any(): + return np.nan + return np.median(x) + + self._check_stat_op('median', wrapper, check_dates=True) + + def test_min(self): + self._check_stat_op('min', np.min, check_dates=True) + self._check_stat_op('min', np.min, frame=self.intframe) + + def test_cummin(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cummin = self.tsframe.cummin() + expected = self.tsframe.apply(Series.cummin) + assert_frame_equal(cummin, expected) + + # axis = 1 + cummin = self.tsframe.cummin(axis=1) + expected = self.tsframe.apply(Series.cummin, axis=1) + assert_frame_equal(cummin, expected) + + # works + df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + result = df.cummin() + + # fix issue + cummin_xs = self.tsframe.cummin(axis=1) + self.assertEqual(np.shape(cummin_xs), np.shape(self.tsframe)) + + def test_cummax(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cummax = self.tsframe.cummax() + expected = self.tsframe.apply(Series.cummax) + assert_frame_equal(cummax, expected) + + # axis = 1 + cummax = self.tsframe.cummax(axis=1) + expected = self.tsframe.apply(Series.cummax, axis=1) + assert_frame_equal(cummax, expected) + + # works + df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + result = df.cummax() + + # fix issue + cummax_xs = self.tsframe.cummax(axis=1) + self.assertEqual(np.shape(cummax_xs), np.shape(self.tsframe)) + + def test_max(self): + self._check_stat_op('max', np.max, check_dates=True) + self._check_stat_op('max', np.max, frame=self.intframe) + + def test_mad(self): + f = lambda x: np.abs(x - x.mean()).mean() + self._check_stat_op('mad', f) + + def test_var_std(self): + alt = lambda x: np.var(x, ddof=1) + self._check_stat_op('var', alt) + + alt = lambda x: np.std(x, ddof=1) + self._check_stat_op('std', alt) + + result = self.tsframe.std(ddof=4) + expected = self.tsframe.apply(lambda x: x.std(ddof=4)) + assert_almost_equal(result, expected) + + result = self.tsframe.var(ddof=4) + expected = self.tsframe.apply(lambda x: x.var(ddof=4)) + assert_almost_equal(result, expected) + + arr = np.repeat(np.random.random((1, 1000)), 1000, 0) + result = nanops.nanvar(arr, axis=0) + self.assertFalse((result < 0).any()) + if nanops._USE_BOTTLENECK: + nanops._USE_BOTTLENECK = False + result = nanops.nanvar(arr, axis=0) + self.assertFalse((result < 0).any()) + nanops._USE_BOTTLENECK = True + + def test_sem(self): + alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + + result = self.tsframe.sem(ddof=4) + expected = self.tsframe.apply(lambda x: x.std(ddof=4)/np.sqrt(len(x))) + assert_almost_equal(result, expected) + + arr = np.repeat(np.random.random((1, 1000)), 1000, 0) + result = nanops.nansem(arr, axis=0) + self.assertFalse((result < 0).any()) + if nanops._USE_BOTTLENECK: + nanops._USE_BOTTLENECK = False + result = nanops.nansem(arr, axis=0) + self.assertFalse((result < 0).any()) + nanops._USE_BOTTLENECK = True + + def test_skew(self): + tm._skip_if_no_scipy() + from scipy.stats import skew + + def alt(x): + if len(x) < 3: + return np.nan + return skew(x, bias=False) + + self._check_stat_op('skew', alt) + + def test_kurt(self): + tm._skip_if_no_scipy() + + from 
scipy.stats import kurtosis + + def alt(x): + if len(x) < 4: + return np.nan + return kurtosis(x, bias=False) + + self._check_stat_op('kurt', alt) + + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + df = DataFrame(np.random.randn(6, 3), index=index) + assert_series_equal(df.kurt(), df.kurt(level=0).xs('bar')) + + def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, + has_numeric_only=False, check_dtype=True, check_dates=False, + check_less_precise=False): + if frame is None: + frame = self.frame + # set some NAs + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + + f = getattr(frame, name) + + if check_dates: + df = DataFrame({'b': date_range('1/1/2001', periods=2)}) + _f = getattr(df, name) + result = _f() + self.assertIsInstance(result, Series) + + df['a'] = lrange(len(df)) + result = getattr(df, name)() + self.assertIsInstance(result, Series) + self.assertTrue(len(result)) + + if has_skipna: + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + + def wrapper(x): + return alternative(x.values) + + result0 = f(axis=0, skipna=False) + result1 = f(axis=1, skipna=False) + assert_series_equal(result0, frame.apply(wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise) + assert_series_equal(result1, frame.apply(wrapper, axis=1), + check_dtype=False, + check_less_precise=check_less_precise) # HACK: win32 + else: + skipna_wrapper = alternative + wrapper = alternative + + result0 = f(axis=0) + result1 = f(axis=1) + assert_series_equal(result0, frame.apply(skipna_wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise) + assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), + check_dtype=False, + check_less_precise=check_less_precise) + + # check dtypes + if check_dtype: + lcd_dtype = frame.values.dtype + self.assertEqual(lcd_dtype, result0.dtype) + self.assertEqual(lcd_dtype, result1.dtype) + + # result = f(axis=1) + # comp = frame.apply(alternative, axis=1).reindex(result.index) + # assert_series_equal(result, comp) + + # bad axis + assertRaisesRegexp(ValueError, 'No axis named 2', f, axis=2) + # make sure works on mixed-type frame + getattr(self.mixed_frame, name)(axis=0) + getattr(self.mixed_frame, name)(axis=1) + + if has_numeric_only: + getattr(self.mixed_frame, name)(axis=0, numeric_only=True) + getattr(self.mixed_frame, name)(axis=1, numeric_only=True) + getattr(self.frame, name)(axis=0, numeric_only=False) + getattr(self.frame, name)(axis=1, numeric_only=False) + + # all NA case + if has_skipna: + all_na = self.frame * np.NaN + r0 = getattr(all_na, name)(axis=0) + r1 = getattr(all_na, name)(axis=1) + self.assertTrue(np.isnan(r0).all()) + self.assertTrue(np.isnan(r1).all()) + + def test_mode(self): + df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11], + "B": [10, 10, 10, np.nan, 3, 4], + "C": [8, 8, 8, 9, 9, 9], + "D": range(6), + "E": [8, 8, 1, 1, 3, 3]}) + assert_frame_equal(df[["A"]].mode(), + pd.DataFrame({"A": [12]})) + assert_frame_equal(df[["D"]].mode(), + pd.DataFrame(pd.Series([], dtype="int64"), + columns=["D"])) + assert_frame_equal(df[["E"]].mode(), + pd.DataFrame(pd.Series([1, 3, 8], dtype="int64"), + columns=["E"])) + assert_frame_equal(df[["A", "B"]].mode(), + pd.DataFrame({"A": [12], "B": [10.]})) + assert_frame_equal(df.mode(), + pd.DataFrame({"A": [12, np.nan, np.nan], + "B": [10, np.nan, np.nan], + "C": [8, 9, np.nan], + "D": [np.nan, 
np.nan, np.nan], + "E": [1, 3, 8]})) + + # outputs in sorted order + df["C"] = list(reversed(df["C"])) + com.pprint_thing(df["C"]) + com.pprint_thing(df["C"].mode()) + a, b = (df[["A", "B", "C"]].mode(), + pd.DataFrame({"A": [12, np.nan], + "B": [10, np.nan], + "C": [8, 9]})) + com.pprint_thing(a) + com.pprint_thing(b) + assert_frame_equal(a, b) + # should work with heterogeneous types + df = pd.DataFrame({"A": range(6), + "B": pd.date_range('2011', periods=6), + "C": list('abcdef')}) + exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype), + "B": pd.Series([], dtype=df["B"].dtype), + "C": pd.Series([], dtype=df["C"].dtype)}) + assert_frame_equal(df.mode(), exp) + + # and also when not empty + df.loc[1, "A"] = 0 + df.loc[4, "B"] = df.loc[3, "B"] + df.loc[5, "C"] = 'e' + exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype), + "B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype), + "C": pd.Series(['e'], dtype=df["C"].dtype)}) + + assert_frame_equal(df.mode(), exp) + + def test_sum_corner(self): + axis0 = self.empty.sum(0) + axis1 = self.empty.sum(1) + tm.assert_isinstance(axis0, Series) + tm.assert_isinstance(axis1, Series) + self.assertEqual(len(axis0), 0) + self.assertEqual(len(axis1), 0) + + def test_sum_object(self): + values = self.frame.values.astype(int) + frame = DataFrame(values, index=self.frame.index, + columns=self.frame.columns) + deltas = frame * timedelta(1) + deltas.sum() + + def test_sum_bool(self): + # ensure this works, bug report + bools = np.isnan(self.frame) + bools.sum(1) + bools.sum(0) + + def test_mean_corner(self): + # unit test when have object data + the_mean = self.mixed_frame.mean(axis=0) + the_sum = self.mixed_frame.sum(axis=0, numeric_only=True) + self.assertTrue(the_sum.index.equals(the_mean.index)) + self.assertTrue(len(the_mean.index) < len(self.mixed_frame.columns)) + + # xs sum mixed type, just want to know it works... 
+ the_mean = self.mixed_frame.mean(axis=1) + the_sum = self.mixed_frame.sum(axis=1, numeric_only=True) + self.assertTrue(the_sum.index.equals(the_mean.index)) + + # take mean of boolean column + self.frame['bool'] = self.frame['A'] > 0 + means = self.frame.mean(0) + self.assertEqual(means['bool'], self.frame['bool'].values.mean()) + + def test_stats_mixed_type(self): + # don't blow up + self.mixed_frame.std(1) + self.mixed_frame.var(1) + self.mixed_frame.mean(1) + self.mixed_frame.skew(1) + + def test_median_corner(self): + def wrapper(x): + if isnull(x).any(): + return np.nan + return np.median(x) + + self._check_stat_op('median', wrapper, frame=self.intframe, + check_dtype=False, check_dates=True) + + def test_quantile(self): + from numpy import percentile + + q = self.tsframe.quantile(0.1, axis=0) + self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + q = self.tsframe.quantile(0.9, axis=1) + q = self.intframe.quantile(0.1) + self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + + # test degenerate case + q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) + assert(np.isnan(q['x']) and np.isnan(q['y'])) + + # non-numeric exclusion + df = DataFrame({'col1':['A','A','B','B'], 'col2':[1,2,3,4]}) + rs = df.quantile(0.5) + xp = df.median() + assert_series_equal(rs, xp) + + # axis + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(.5, axis=1) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) + assert_series_equal(result, expected) + + result = df.quantile([.5, .75], axis=1) + expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75], + 3: [3.5, 3.75]}, index=["0.5", "0.75"]) + assert_frame_equal(result, expected) + + # We may want to break API in the future to change this + # so that we exclude non-numeric along the same axis + # See GH #7312 + df = DataFrame([[1, 2, 3], + ['a', 'b', 4]]) + result = df.quantile(.5, axis=1) + expected = Series([3., 4.], index=[0, 1]) + assert_series_equal(result, expected) + + def test_quantile_multi(self): + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=['a', 'b', 'c']) + result = df.quantile([.25, .5]) + expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], + index=[.25, .5], columns=['a', 'b', 'c']) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.quantile([.25, .5], axis=1) + expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], + index=[.25, .5], columns=[0, 1, 2]) + + # empty + result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0) + expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]}, + index=[.1, .9]) + assert_frame_equal(result, expected) + + def test_quantile_datetime(self): + df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]}) + + # exclude datetime + result = df.quantile(.5) + expected = Series([2.5], index=['b']) + + # datetime + result = df.quantile(.5, numeric_only=False) + expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], + index=['a', 'b']) + assert_series_equal(result, expected) + + # datetime w/ multi + result = df.quantile([.5], numeric_only=False) + expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]], + index=[.5], columns=['a', 'b']) + assert_frame_equal(result, expected) + + # axis = 1 + df['c'] = pd.to_datetime(['2011', '2012']) + result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) + expected = Series([Timestamp('2010-07-02 12:00:00'), + Timestamp('2011-07-02 12:00:00')], + index=[0, 1]) + assert_series_equal(result, expected) + + result = df[['a', 'c']].quantile([.5], 
axis=1, numeric_only=False) + expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), + Timestamp('2011-07-02 12:00:00')]], + index=[0.5], columns=[0, 1]) + assert_frame_equal(result, expected) + + def test_cumsum(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cumsum = self.tsframe.cumsum() + expected = self.tsframe.apply(Series.cumsum) + assert_frame_equal(cumsum, expected) + + # axis = 1 + cumsum = self.tsframe.cumsum(axis=1) + expected = self.tsframe.apply(Series.cumsum, axis=1) + assert_frame_equal(cumsum, expected) + + # works + df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + result = df.cumsum() + + # fix issue + cumsum_xs = self.tsframe.cumsum(axis=1) + self.assertEqual(np.shape(cumsum_xs), np.shape(self.tsframe)) + + def test_cumprod(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cumprod = self.tsframe.cumprod() + expected = self.tsframe.apply(Series.cumprod) + assert_frame_equal(cumprod, expected) + + # axis = 1 + cumprod = self.tsframe.cumprod(axis=1) + expected = self.tsframe.apply(Series.cumprod, axis=1) + assert_frame_equal(cumprod, expected) + + # fix issue + cumprod_xs = self.tsframe.cumprod(axis=1) + self.assertEqual(np.shape(cumprod_xs), np.shape(self.tsframe)) + + # ints + df = self.tsframe.fillna(0).astype(int) + df.cumprod(0) + df.cumprod(1) + + # ints32 + df = self.tsframe.fillna(0).astype(np.int32) + df.cumprod(0) + df.cumprod(1) + + def test_rank(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + self.frame['A'][::2] = np.nan + self.frame['B'][::3] = np.nan + self.frame['C'][::4] = np.nan + self.frame['D'][::5] = np.nan + + ranks0 = self.frame.rank() + ranks1 = self.frame.rank(1) + mask = np.isnan(self.frame.values) + + fvals = self.frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fvals) + exp0[mask] = np.nan + + exp1 = np.apply_along_axis(rankdata, 1, fvals) + exp1[mask] = np.nan + + assert_almost_equal(ranks0.values, exp0) + assert_almost_equal(ranks1.values, exp1) + + # integers + df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4))) + + result = df.rank() + exp = df.astype(float).rank() + assert_frame_equal(result, exp) + + result = df.rank(1) + exp = df.astype(float).rank(1) + assert_frame_equal(result, exp) + + def test_rank2(self): + from datetime import datetime + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 + result = df.rank(1, pct=True) + assert_frame_equal(result, expected) + + df = DataFrame([[1, 3, 2], [1, 2, 3]]) + expected = df.rank(0) / 2.0 + result = df.rank(0, pct=True) + assert_frame_equal(result, expected) + + + + df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) + expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) + result = df.rank(1, numeric_only=False) + assert_frame_equal(result, expected) + + + expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) + result = df.rank(0, numeric_only=False) + assert_frame_equal(result, expected) + + df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) + expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]) + result = df.rank(1, numeric_only=False) + assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]) + result = df.rank(0, numeric_only=False) + assert_frame_equal(result, expected) + + # f7u12, this does not work without extensive workaround + data = [[datetime(2001, 1, 5), nan, datetime(2001, 
1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 1)]] + df = DataFrame(data) + + # check the rank + expected = DataFrame([[2., nan, 1.], + [2., 3., 1.]]) + result = df.rank(1, numeric_only=False) + assert_frame_equal(result, expected) + + # mixed-type frames + self.mixed_frame['datetime'] = datetime.now() + self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1) + + result = self.mixed_frame.rank(1) + expected = self.mixed_frame.rank(1, numeric_only=True) + assert_frame_equal(result, expected) + + df = DataFrame({"a":[1e-20, -5, 1e-20+1e-40, 10, 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a":[ 3.5, 1. , 3.5, 5. , 6. , 7. , 2. ]}) + assert_frame_equal(df.rank(), exp) + + def test_rank_na_option(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + self.frame['A'][::2] = np.nan + self.frame['B'][::3] = np.nan + self.frame['C'][::4] = np.nan + self.frame['D'][::5] = np.nan + + # bottom + ranks0 = self.frame.rank(na_option='bottom') + ranks1 = self.frame.rank(1, na_option='bottom') + + fvals = self.frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fvals) + exp1 = np.apply_along_axis(rankdata, 1, fvals) + + assert_almost_equal(ranks0.values, exp0) + assert_almost_equal(ranks1.values, exp1) + + # top + ranks0 = self.frame.rank(na_option='top') + ranks1 = self.frame.rank(1, na_option='top') + + fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values + fval1 = self.frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fval0) + exp1 = np.apply_along_axis(rankdata, 1, fval1) + + assert_almost_equal(ranks0.values, exp0) + assert_almost_equal(ranks1.values, exp1) + + # descending + + # bottom + ranks0 = self.frame.rank(na_option='top', ascending=False) + ranks1 = self.frame.rank(1, na_option='top', ascending=False) + + fvals = self.frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, -fvals) + exp1 = np.apply_along_axis(rankdata, 1, -fvals) + + assert_almost_equal(ranks0.values, exp0) + assert_almost_equal(ranks1.values, exp1) + + # descending + + # top + ranks0 = self.frame.rank(na_option='bottom', ascending=False) + ranks1 = self.frame.rank(1, na_option='bottom', ascending=False) + + fval0 = self.frame.fillna((self.frame.min() - 1).to_dict()).values + fval1 = self.frame.T + fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T + fval1 = fval1.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, -fval0) + exp1 = np.apply_along_axis(rankdata, 1, -fval1) + + assert_almost_equal(ranks0.values, exp0) + assert_almost_equal(ranks1.values, exp1) + + def test_axis_aliases(self): + + f = self.frame + + # reg name + expected = f.sum(axis=0) + result = f.sum(axis='index') + assert_series_equal(result, expected) + + expected = f.sum(axis=1) + result = f.sum(axis='columns') + assert_series_equal(result, expected) + + def test_combine_first_mixed(self): + a = Series(['a', 'b'], index=lrange(2)) + b = Series(lrange(2), index=lrange(2)) + f = DataFrame({'A': a, 'B': b}) + + a = Series(['a', 'b'], index=lrange(5, 7)) + b = Series(lrange(2), index=lrange(5, 7)) + g = DataFrame({'A': a, 'B': b}) + + combined = f.combine_first(g) + + def test_more_asMatrix(self): + values = self.mixed_frame.as_matrix() + self.assertEqual(values.shape[1], len(self.mixed_frame.columns)) + + def test_reindex_boolean(self): + frame = DataFrame(np.ones((10, 2), dtype=bool), + index=np.arange(0, 20, 2), + columns=[0, 2]) + + reindexed = 
frame.reindex(np.arange(10)) + self.assertEqual(reindexed.values.dtype, np.object_) + self.assertTrue(isnull(reindexed[0][1])) + + reindexed = frame.reindex(columns=lrange(3)) + self.assertEqual(reindexed.values.dtype, np.object_) + self.assertTrue(isnull(reindexed[1]).all()) + + def test_reindex_objects(self): + reindexed = self.mixed_frame.reindex(columns=['foo', 'A', 'B']) + self.assertIn('foo', reindexed) + + reindexed = self.mixed_frame.reindex(columns=['A', 'B']) + self.assertNotIn('foo', reindexed) + + def test_reindex_corner(self): + index = Index(['a', 'b', 'c']) + dm = self.empty.reindex(index=[1, 2, 3]) + reindexed = dm.reindex(columns=index) + self.assertTrue(reindexed.columns.equals(index)) + + # ints are weird + + smaller = self.intframe.reindex(columns=['A', 'B', 'E']) + self.assertEqual(smaller['E'].dtype, np.float64) + + def test_reindex_axis(self): + cols = ['A', 'B', 'E'] + reindexed1 = self.intframe.reindex_axis(cols, axis=1) + reindexed2 = self.intframe.reindex(columns=cols) + assert_frame_equal(reindexed1, reindexed2) + + rows = self.intframe.index[0:5] + reindexed1 = self.intframe.reindex_axis(rows, axis=0) + reindexed2 = self.intframe.reindex(index=rows) + assert_frame_equal(reindexed1, reindexed2) + + self.assertRaises(ValueError, self.intframe.reindex_axis, rows, axis=2) + + # no-op case + cols = self.frame.columns.copy() + newFrame = self.frame.reindex_axis(cols, axis=1) + assert_frame_equal(newFrame, self.frame) + + def test_reindex_with_nans(self): + df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], + columns=['a', 'b'], + index=[100.0, 101.0, np.nan, 102.0, 103.0]) + + result = df.reindex(index=[101.0, 102.0, 103.0]) + expected = df.iloc[[1, 3, 4]] + assert_frame_equal(result, expected) + + result = df.reindex(index=[103.0]) + expected = df.iloc[[4]] + assert_frame_equal(result, expected) + + result = df.reindex(index=[101.0]) + expected = df.iloc[[1]] + assert_frame_equal(result, expected) + + def test_reindex_multi(self): + df = DataFrame(np.random.randn(3, 3)) + + result = df.reindex(lrange(4), lrange(4)) + expected = df.reindex(lrange(4)).reindex(columns=lrange(4)) + + assert_frame_equal(result, expected) + + df = DataFrame(np.random.randint(0, 10, (3, 3))) + + result = df.reindex(lrange(4), lrange(4)) + expected = df.reindex(lrange(4)).reindex(columns=lrange(4)) + + assert_frame_equal(result, expected) + + df = DataFrame(np.random.randint(0, 10, (3, 3))) + + result = df.reindex(lrange(2), lrange(2)) + expected = df.reindex(lrange(2)).reindex(columns=lrange(2)) + + assert_frame_equal(result, expected) + + df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c']) + + result = df.reindex(index=[0, 1], columns=['a', 'b']) + expected = df.reindex([0, 1]).reindex(columns=['a', 'b']) + + assert_frame_equal(result, expected) + + def test_rename_objects(self): + renamed = self.mixed_frame.rename(columns=str.upper) + self.assertIn('FOO', renamed) + self.assertNotIn('foo', renamed) + + def test_fill_corner(self): + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + + filled = self.mixed_frame.fillna(value=0) + self.assertTrue((filled['foo'][5:20] == 0).all()) + del self.mixed_frame['foo'] + + empty_float = self.frame.reindex(columns=[]) + result = empty_float.fillna(value=0) + + def test_count_objects(self): + dm = DataFrame(self.mixed_frame._series) + df = DataFrame(self.mixed_frame._series) + + tm.assert_series_equal(dm.count(), df.count()) + tm.assert_series_equal(dm.count(1), df.count(1)) + + def 
test_cumsum_corner(self): + dm = DataFrame(np.arange(20).reshape(4, 5), + index=lrange(4), columns=lrange(5)) + result = dm.cumsum() + + #---------------------------------------------------------------------- + # Stacking / unstacking + + def test_stack_unstack(self): + stacked = self.frame.stack() + stacked_df = DataFrame({'foo': stacked, 'bar': stacked}) + + unstacked = stacked.unstack() + unstacked_df = stacked_df.unstack() + + assert_frame_equal(unstacked, self.frame) + assert_frame_equal(unstacked_df['bar'], self.frame) + + unstacked_cols = stacked.unstack(0) + unstacked_cols_df = stacked_df.unstack(0) + assert_frame_equal(unstacked_cols.T, self.frame) + assert_frame_equal(unstacked_cols_df['bar'].T, self.frame) + + def test_unstack_bool(self): + df = DataFrame([False, False], + index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]), + columns=['col']) + rs = df.unstack() + xp = DataFrame(np.array([[False, np.nan], [np.nan, False]], + dtype=object), + index=['a', 'b'], + columns=MultiIndex.from_arrays([['col', 'col'], + ['c', 'l']])) + assert_frame_equal(rs, xp) + + def test_unstack_to_series(self): + # check reversibility + data = self.frame.unstack() + + self.assertTrue(isinstance(data, Series)) + undo = data.unstack().T + assert_frame_equal(undo, self.frame) + + # check NA handling + data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]}) + data.index = Index(['a', 'b', 'c']) + result = data.unstack() + + midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']], + labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) + + assert_series_equal(result, expected) + + # check composability of unstack + old_data = data.copy() + for _ in range(4): + data = data.unstack() + assert_frame_equal(old_data, data) + + def test_unstack_dtypes(self): + + # GH 2929 + rows = [[1, 1, 3, 4], + [1, 2, 3, 4], + [2, 1, 3, 4], + [2, 2, 3, 4]] + + df = DataFrame(rows, columns=list('ABCD')) + result = df.get_dtype_counts() + expected = Series({'int64' : 4}) + assert_series_equal(result, expected) + + # single dtype + df2 = df.set_index(['A','B']) + df3 = df2.unstack('B') + result = df3.get_dtype_counts() + expected = Series({'int64' : 4}) + assert_series_equal(result, expected) + + # mixed + df2 = df.set_index(['A','B']) + df2['C'] = 3. 
+        df3 = df2.unstack('B')
+        result = df3.get_dtype_counts()
+        expected = Series({'int64' : 2, 'float64' : 2})
+        assert_series_equal(result, expected)
+
+        df2['D'] = 'foo'
+        df3 = df2.unstack('B')
+        result = df3.get_dtype_counts()
+        expected = Series({'float64' : 2, 'object' : 2})
+        assert_series_equal(result, expected)
+
+    def test_unstack_non_unique_index_names(self):
+        idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
+                                     names=['c1', 'c1'])
+        df = DataFrame([1, 2], index=idx)
+        with tm.assertRaises(ValueError):
+            df.unstack('c1')
+
+        with tm.assertRaises(ValueError):
+            df.T.stack('c1')
+
+    def test_repr_with_mi_nat(self):
+        df = DataFrame({'X': [1, 2]},
+                       index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])
+        res = repr(df)
+        exp = ' X\nNaT a 1\n2013-01-01 b 2'
+        nose.tools.assert_equal(res, exp)
+
+    def test_reset_index(self):
+        stacked = self.frame.stack()[::2]
+        stacked = DataFrame({'foo': stacked, 'bar': stacked})
+
+        names = ['first', 'second']
+        stacked.index.names = names
+        deleveled = stacked.reset_index()
+        for i, (lev, lab) in enumerate(zip(stacked.index.levels,
+                                           stacked.index.labels)):
+            values = lev.take(lab)
+            name = names[i]
+            assert_almost_equal(values, deleveled[name])
+
+        stacked.index.names = [None, None]
+        deleveled2 = stacked.reset_index()
+        self.assert_numpy_array_equal(deleveled['first'],
+                                      deleveled2['level_0'])
+        self.assert_numpy_array_equal(deleveled['second'],
+                                      deleveled2['level_1'])
+
+        # default name assigned
+        rdf = self.frame.reset_index()
+        self.assert_numpy_array_equal(rdf['index'], self.frame.index.values)
+
+        # default name assigned, corner case
+        df = self.frame.copy()
+        df['index'] = 'foo'
+        rdf = df.reset_index()
+        self.assert_numpy_array_equal(rdf['level_0'], self.frame.index.values)
+
+        # but this is ok
+        self.frame.index.name = 'index'
+        deleveled = self.frame.reset_index()
+        self.assert_numpy_array_equal(deleveled['index'],
+                                      self.frame.index.values)
+        self.assert_numpy_array_equal(deleveled.index,
+                                      np.arange(len(deleveled)))
+
+        # preserve column names
+        self.frame.columns.name = 'columns'
+        resetted = self.frame.reset_index()
+        self.assertEqual(resetted.columns.name, 'columns')
+
+        # only remove certain columns
+        frame = self.frame.reset_index().set_index(['index', 'A', 'B'])
+        rs = frame.reset_index(['A', 'B'])
+
+        assert_frame_equal(rs, self.frame, check_names=False)  # TODO should reset_index check_names ?
+ + rs = frame.reset_index(['index', 'A', 'B']) + assert_frame_equal(rs, self.frame.reset_index(), check_names=False) + + rs = frame.reset_index(['index', 'A', 'B']) + assert_frame_equal(rs, self.frame.reset_index(), check_names=False) + + rs = frame.reset_index('A') + xp = self.frame.reset_index().set_index(['index', 'B']) + assert_frame_equal(rs, xp, check_names=False) + + # test resetting in place + df = self.frame.copy() + resetted = self.frame.reset_index() + df.reset_index(inplace=True) + assert_frame_equal(df, resetted, check_names=False) + + frame = self.frame.reset_index().set_index(['index', 'A', 'B']) + rs = frame.reset_index('A', drop=True) + xp = self.frame.copy() + del xp['A'] + xp = xp.set_index(['B'], append=True) + assert_frame_equal(rs, xp, check_names=False) + + def test_reset_index_right_dtype(self): + time = np.arange(0.0, 10, np.sqrt(2) / 2) + s1 = Series((9.81 * time ** 2) / 2, + index=Index(time, name='time'), + name='speed') + df = DataFrame(s1) + + resetted = s1.reset_index() + self.assertEqual(resetted['time'].dtype, np.float64) + + resetted = df.reset_index() + self.assertEqual(resetted['time'].dtype, np.float64) + + def test_reset_index_multiindex_col(self): + vals = np.random.randn(3, 3).astype(object) + idx = ['x', 'y', 'z'] + full = np.hstack(([[x] for x in idx], vals)) + df = DataFrame(vals, Index(idx, name='a'), + columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']]) + rs = df.reset_index() + xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'], + ['', 'mean', 'median', 'mean']]) + assert_frame_equal(rs, xp) + + rs = df.reset_index(col_fill=None) + xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'], + ['a', 'mean', 'median', 'mean']]) + assert_frame_equal(rs, xp) + + rs = df.reset_index(col_level=1, col_fill='blah') + xp = DataFrame(full, columns=[['blah', 'b', 'b', 'c'], + ['a', 'mean', 'median', 'mean']]) + assert_frame_equal(rs, xp) + + df = DataFrame(vals, + MultiIndex.from_arrays([[0, 1, 2], ['x', 'y', 'z']], + names=['d', 'a']), + columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']]) + rs = df.reset_index('a', ) + xp = DataFrame(full, Index([0, 1, 2], name='d'), + columns=[['a', 'b', 'b', 'c'], + ['', 'mean', 'median', 'mean']]) + assert_frame_equal(rs, xp) + + rs = df.reset_index('a', col_fill=None) + xp = DataFrame(full, Index(lrange(3), name='d'), + columns=[['a', 'b', 'b', 'c'], + ['a', 'mean', 'median', 'mean']]) + assert_frame_equal(rs, xp) + + rs = df.reset_index('a', col_fill='blah', col_level=1) + xp = DataFrame(full, Index(lrange(3), name='d'), + columns=[['blah', 'b', 'b', 'c'], + ['a', 'mean', 'median', 'mean']]) + assert_frame_equal(rs, xp) + + def test_reset_index_with_datetimeindex_cols(self): + # GH5818 + # + df = pd.DataFrame([[1, 2], [3, 4]], + columns=pd.date_range('1/1/2013', '1/2/2013'), + index=['A', 'B']) + + result = df.reset_index() + expected = pd.DataFrame([['A', 1, 2], ['B', 3, 4]], + columns=['index', datetime(2013, 1, 1), + datetime(2013, 1, 2)]) + assert_frame_equal(result, expected) + + #---------------------------------------------------------------------- + # Tests to cope with refactored internals + def test_as_matrix_numeric_cols(self): + self.frame['foo'] = 'bar' + + values = self.frame.as_matrix(['A', 'B', 'C', 'D']) + self.assertEqual(values.dtype, np.float64) + + def test_as_matrix_lcd(self): + + # mixed lcd + values = self.mixed_float.as_matrix(['A', 'B', 'C', 'D']) + self.assertEqual(values.dtype, np.float64) + + values = self.mixed_float.as_matrix(['A', 'B', 'C' ]) + self.assertEqual(values.dtype, 
np.float32) + + values = self.mixed_float.as_matrix(['C']) + self.assertEqual(values.dtype, np.float16) + + values = self.mixed_int.as_matrix(['A','B','C','D']) + self.assertEqual(values.dtype, np.int64) + + values = self.mixed_int.as_matrix(['A','D']) + self.assertEqual(values.dtype, np.int64) + + # guess all ints are cast to uints.... + values = self.mixed_int.as_matrix(['A','B','C']) + self.assertEqual(values.dtype, np.int64) + + values = self.mixed_int.as_matrix(['A','C']) + self.assertEqual(values.dtype, np.int32) + + values = self.mixed_int.as_matrix(['C','D']) + self.assertEqual(values.dtype, np.int64) + + values = self.mixed_int.as_matrix(['A']) + self.assertEqual(values.dtype, np.int32) + + values = self.mixed_int.as_matrix(['C']) + self.assertEqual(values.dtype, np.uint8) + + def test_constructor_with_convert(self): + # this is actually mostly a test of lib.maybe_convert_objects + # #2845 + df = DataFrame({'A' : [2**63-1] }) + result = df['A'] + expected = Series(np.asarray([2**63-1], np.int64)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [2**63] }) + result = df['A'] + expected = Series(np.asarray([2**63], np.object_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [datetime(2005, 1, 1), True] }) + result = df['A'] + expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [None, 1] }) + result = df['A'] + expected = Series(np.asarray([np.nan, 1], np.float_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [1.0, 2] }) + result = df['A'] + expected = Series(np.asarray([1.0, 2], np.float_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [1.0+2.0j, 3] }) + result = df['A'] + expected = Series(np.asarray([1.0+2.0j, 3], np.complex_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [1.0+2.0j, 3.0] }) + result = df['A'] + expected = Series(np.asarray([1.0+2.0j, 3.0], np.complex_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [1.0+2.0j, True] }) + result = df['A'] + expected = Series(np.asarray([1.0+2.0j, True], np.object_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [1.0, None] }) + result = df['A'] + expected = Series(np.asarray([1.0, np.nan], np.float_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [1.0+2.0j, None] }) + result = df['A'] + expected = Series(np.asarray([1.0+2.0j, np.nan], np.complex_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [2.0, 1, True, None] }) + result = df['A'] + expected = Series(np.asarray([2.0, 1, True, None], np.object_)) + assert_series_equal(result, expected) + + df = DataFrame({'A' : [2.0, 1, datetime(2006, 1, 1), None] }) + result = df['A'] + expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1), + None], np.object_)) + assert_series_equal(result, expected) + + def test_construction_with_mixed(self): + # test construction edge cases with mixed types + + # f7u12, this does not work without extensive workaround + data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 1)]] + df = DataFrame(data) + + # check dtypes + result = df.get_dtype_counts().order() + expected = Series({ 'datetime64[ns]' : 3 }) + + # mixed-type frames + self.mixed_frame['datetime'] = datetime.now() + self.mixed_frame['timedelta'] = timedelta(days=1,seconds=1) + self.assertEqual(self.mixed_frame['datetime'].dtype, 'M8[ns]') + 
self.assertEqual(self.mixed_frame['timedelta'].dtype, 'm8[ns]') + result = self.mixed_frame.get_dtype_counts().order() + expected = Series({ 'float64' : 4, + 'object' : 1, + 'datetime64[ns]' : 1, + 'timedelta64[ns]' : 1}).order() + assert_series_equal(result,expected) + + def test_constructor_frame_copy(self): + cop = DataFrame(self.frame, copy=True) + cop['A'] = 5 + self.assertTrue((cop['A'] == 5).all()) + self.assertFalse((self.frame['A'] == 5).all()) + + def test_constructor_ndarray_copy(self): + df = DataFrame(self.frame.values) + + self.frame.values[5] = 5 + self.assertTrue((df.values[5] == 5).all()) + + df = DataFrame(self.frame.values, copy=True) + self.frame.values[6] = 6 + self.assertFalse((df.values[6] == 6).all()) + + def test_constructor_series_copy(self): + series = self.frame._series + + df = DataFrame({'A': series['A']}) + df['A'][:] = 5 + + self.assertFalse((series['A'] == 5).all()) + + def test_constructor_compound_dtypes(self): + # GH 5191 + # compound dtypes should raise not-implementederror + + def f(dtype): + return DataFrame(data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9)), + columns=["A", "B", "C"], dtype=dtype) + + self.assertRaises(NotImplementedError, f, [("A","datetime64[h]"), ("B","str"), ("C","int32")]) + + # these work (though results may be unexpected) + f('int64') + f('float64') + f('M8[ns]') + + def test_assign_columns(self): + self.frame['hi'] = 'there' + + frame = self.frame.copy() + frame.columns = ['foo', 'bar', 'baz', 'quux', 'foo2'] + assert_series_equal(self.frame['C'], frame['baz']) + assert_series_equal(self.frame['hi'], frame['foo2']) + + def test_columns_with_dups(self): + + # GH 3468 related + + # basic + df = DataFrame([[1,2]], columns=['a','a']) + df.columns = ['a','a.1'] + str(df) + expected = DataFrame([[1,2]], columns=['a','a.1']) + assert_frame_equal(df, expected) + + df = DataFrame([[1,2,3]], columns=['b','a','a']) + df.columns = ['b','a','a.1'] + str(df) + expected = DataFrame([[1,2,3]], columns=['b','a','a.1']) + assert_frame_equal(df, expected) + + # with a dup index + df = DataFrame([[1,2]], columns=['a','a']) + df.columns = ['b','b'] + str(df) + expected = DataFrame([[1,2]], columns=['b','b']) + assert_frame_equal(df, expected) + + # multi-dtype + df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=['a','a','b','b','d','c','c']) + df.columns = list('ABCDEFG') + str(df) + expected = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('ABCDEFG')) + assert_frame_equal(df, expected) + + # this is an error because we cannot disambiguate the dup columns + self.assertRaises(Exception, lambda x: DataFrame([[1,2,'foo','bar']], columns=['a','a','a','a'])) + + # dups across blocks + df_float = DataFrame(np.random.randn(10, 3),dtype='float64') + df_int = DataFrame(np.random.randn(10, 3),dtype='int64') + df_bool = DataFrame(True,index=df_float.index,columns=df_float.columns) + df_object = DataFrame('foo',index=df_float.index,columns=df_float.columns) + df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) + df = pd.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) + + self.assertEqual(len(df._data._blknos), len(df.columns)) + self.assertEqual(len(df._data._blklocs), len(df.columns)) + + # testing iget + for i in range(len(df.columns)): + df.iloc[:,i] + + # dup columns across dtype GH 2079/2194 + vals = [[1, -1, 2.], [2, -2, 3.]] + rs = DataFrame(vals, columns=['A', 'A', 'B']) + xp = DataFrame(vals) + xp.columns = ['A', 'A', 'B'] + assert_frame_equal(rs, xp) + + def 
test_insert_column_bug_4032(self): + + # GH4032, inserting a column and renaming causing errors + df = DataFrame({'b': [1.1, 2.2]}) + df = df.rename(columns={}) + df.insert(0, 'a', [1, 2]) + + result = df.rename(columns={}) + str(result) + expected = DataFrame([[1,1.1],[2, 2.2]],columns=['a','b']) + assert_frame_equal(result,expected) + df.insert(0, 'c', [1.3, 2.3]) + + result = df.rename(columns={}) + str(result) + + expected = DataFrame([[1.3,1,1.1],[2.3,2, 2.2]],columns=['c','a','b']) + assert_frame_equal(result,expected) + + def test_cast_internals(self): + casted = DataFrame(self.frame._data, dtype=int) + expected = DataFrame(self.frame._series, dtype=int) + assert_frame_equal(casted, expected) + + casted = DataFrame(self.frame._data, dtype=np.int32) + expected = DataFrame(self.frame._series, dtype=np.int32) + assert_frame_equal(casted, expected) + + def test_consolidate(self): + self.frame['E'] = 7. + consolidated = self.frame.consolidate() + self.assertEqual(len(consolidated._data.blocks), 1) + + # Ensure copy, do I want this? + recons = consolidated.consolidate() + self.assertIsNot(recons, consolidated) + assert_frame_equal(recons, consolidated) + + self.frame['F'] = 8. + self.assertEqual(len(self.frame._data.blocks), 3) + self.frame.consolidate(inplace=True) + self.assertEqual(len(self.frame._data.blocks), 1) + + def test_consolidate_inplace(self): + frame = self.frame.copy() + + # triggers in-place consolidation + for letter in range(ord('A'), ord('Z')): + self.frame[chr(letter)] = chr(letter) + + def test_as_matrix_consolidate(self): + self.frame['E'] = 7. + self.assertFalse(self.frame._data.is_consolidated()) + _ = self.frame.as_matrix() + self.assertTrue(self.frame._data.is_consolidated()) + + def test_modify_values(self): + self.frame.values[5] = 5 + self.assertTrue((self.frame.values[5] == 5).all()) + + # unconsolidated + self.frame['E'] = 7. + self.frame.values[6] = 6 + self.assertTrue((self.frame.values[6] == 6).all()) + + def test_boolean_set_uncons(self): + self.frame['E'] = 7. 
+ + expected = self.frame.values.copy() + expected[expected > 1] = 2 + + self.frame[self.frame > 1] = 2 + assert_almost_equal(expected, self.frame.values) + + def test_xs_view(self): + """ + in 0.14 this will return a view if possible + a copy otherwise, but this is numpy dependent + """ + + dm = DataFrame(np.arange(20.).reshape(4, 5), + index=lrange(4), columns=lrange(5)) + + dm.xs(2)[:] = 10 + self.assertTrue((dm.xs(2) == 10).all()) + + def test_boolean_indexing(self): + idx = lrange(3) + cols = ['A','B','C'] + df1 = DataFrame(index=idx, columns=cols, + data=np.array([[0.0, 0.5, 1.0], + [1.5, 2.0, 2.5], + [3.0, 3.5, 4.0]], + dtype=float)) + df2 = DataFrame(index=idx, columns=cols, + data=np.ones((len(idx), len(cols)))) + + expected = DataFrame(index=idx, columns=cols, + data=np.array([[0.0, 0.5, 1.0], + [1.5, 2.0, -1], + [-1, -1, -1]], dtype=float)) + + df1[df1 > 2.0 * df2] = -1 + assert_frame_equal(df1, expected) + with assertRaisesRegexp(ValueError, 'Item wrong length'): + df1[df1.index[:-1] > 2] = -1 + + def test_boolean_indexing_mixed(self): + df = DataFrame( + {long(0): {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + long(1): {35: np.nan, + 40: 0.32632316859446198, + 43: np.nan, + 49: 0.32632316859446198, + 50: 0.39114724480578139}, + long(2): {35: np.nan, 40: np.nan, 43: 0.29012581014105987, 49: np.nan, 50: np.nan}, + long(3): {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + long(4): {35: 0.34215328467153283, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + 'y': {35: 0, 40: 0, 43: 0, 49: 0, 50: 1}}) + + # mixed int/float ok + df2 = df.copy() + df2[df2>0.3] = 1 + expected = df.copy() + expected.loc[40,1] = 1 + expected.loc[49,1] = 1 + expected.loc[50,1] = 1 + expected.loc[35,4] = 1 + assert_frame_equal(df2,expected) + + df['foo'] = 'test' + with tm.assertRaisesRegexp(TypeError, 'boolean setting on mixed-type'): + df[df > 0.3] = 1 + + def test_sum_bools(self): + df = DataFrame(index=lrange(1), columns=lrange(10)) + bools = isnull(df) + self.assertEqual(bools.sum(axis=1)[0], 10) + + def test_fillna_col_reordering(self): + idx = lrange(20) + cols = ["COL." 
+ str(i) for i in range(5, 0, -1)] + data = np.random.rand(20, 5) + df = DataFrame(index=lrange(20), columns=cols, data=data) + filled = df.fillna(method='ffill') + self.assertEqual(df.columns.tolist(), filled.columns.tolist()) + + def test_take(self): + + # homogeneous + #---------------------------------------- + order = [3, 1, 2, 0] + for df in [self.frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.ix[:, ['D', 'B', 'C', 'A']] + assert_frame_equal(result, expected, check_names=False) + + # neg indicies + order = [2,1,-1] + for df in [self.frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.ix[:, ['C', 'B', 'D']] + assert_frame_equal(result, expected, check_names=False) + + # illegal indices + self.assertRaises(IndexError, df.take, [3,1,2,30], axis=0) + self.assertRaises(IndexError, df.take, [3,1,2,-31], axis=0) + self.assertRaises(IndexError, df.take, [3,1,2,5], axis=1) + self.assertRaises(IndexError, df.take, [3,1,2,-5], axis=1) + + # mixed-dtype + #---------------------------------------- + order = [4, 1, 2, 0, 3] + for df in [self.mixed_frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.ix[:, ['foo', 'B', 'C', 'A', 'D']] + assert_frame_equal(result, expected) + + # neg indicies + order = [4,1,-2] + for df in [self.mixed_frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.ix[:, ['foo', 'B', 'D']] + assert_frame_equal(result, expected) + + # by dtype + order = [1, 2, 0, 3] + for df in [self.mixed_float,self.mixed_int]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.ix[:, ['B', 'C', 'A', 'D']] + assert_frame_equal(result, expected) + + def test_iterkv_deprecation(self): + with tm.assert_produces_warning(DeprecationWarning): + self.mixed_float.iterkv() + + def test_iterkv_names(self): + for k, v in compat.iteritems(self.mixed_frame): + self.assertEqual(v.name, k) + + def test_series_put_names(self): + series = self.mixed_frame._series + for k, v in compat.iteritems(series): + self.assertEqual(v.name, k) + + def test_dot(self): + a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], + columns=['p', 'q', 'r', 's']) + b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'], + columns=['one', 'two']) + + result = a.dot(b) + expected = DataFrame(np.dot(a.values, b.values), + index=['a', 'b', 'c'], + columns=['one', 'two']) + # Check alignment + b1 = b.reindex(index=reversed(b.index)) + result = a.dot(b) + assert_frame_equal(result, expected) + + # Check series argument + result = a.dot(b['one']) + assert_series_equal(result, expected['one']) + result = a.dot(b1['one']) + assert_series_equal(result, expected['one']) + + # can pass correct-length arrays + row = a.ix[0].values + + result = a.dot(row) + exp = a.dot(a.ix[0]) + assert_series_equal(result, exp) + + with assertRaisesRegexp(ValueError, 'Dot product shape mismatch'): + a.dot(row[:-1]) + + a = np.random.rand(1, 5) + b = 
np.random.rand(5, 1) + A = DataFrame(a) + B = DataFrame(b) + + # it works + result = A.dot(b) + + # unaligned + df = DataFrame(randn(3, 4), index=[1, 2, 3], columns=lrange(4)) + df2 = DataFrame(randn(5, 3), index=lrange(5), columns=[1, 2, 3]) + + assertRaisesRegexp(ValueError, 'aligned', df.dot, df2) + + def test_idxmin(self): + frame = self.frame + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + for skipna in [True, False]: + for axis in [0, 1]: + for df in [frame, self.intframe]: + result = df.idxmin(axis=axis, skipna=skipna) + expected = df.apply( + Series.idxmin, axis=axis, skipna=skipna) + assert_series_equal(result, expected) + + self.assertRaises(ValueError, frame.idxmin, axis=2) + + def test_idxmax(self): + frame = self.frame + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + for skipna in [True, False]: + for axis in [0, 1]: + for df in [frame, self.intframe]: + result = df.idxmax(axis=axis, skipna=skipna) + expected = df.apply( + Series.idxmax, axis=axis, skipna=skipna) + assert_series_equal(result, expected) + + self.assertRaises(ValueError, frame.idxmax, axis=2) + + def test_stale_cached_series_bug_473(self): + Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'), + columns=('e', 'f', 'g', 'h')) + repr(Y) + Y['e'] = Y['e'].astype('object') + Y['g']['c'] = np.NaN + repr(Y) + result = Y.sum() + exp = Y['g'].sum() + self.assertTrue(isnull(Y['g']['c'])) + + def test_index_namedtuple(self): + from collections import namedtuple + IndexType = namedtuple("IndexType", ["a", "b"]) + idx1 = IndexType("foo", "bar") + idx2 = IndexType("baz", "bof") + index = Index([idx1, idx2], + name="composite_index", tupleize_cols=False) + df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) + self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1) + + def test_empty_nonzero(self): + df = DataFrame([1, 2, 3]) + self.assertFalse(df.empty) + df = DataFrame(index=['a', 'b'], columns=['c', 'd']).dropna() + self.assertTrue(df.empty) + self.assertTrue(df.T.empty) + + def test_any_all(self): + self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) + self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) + + df = DataFrame(randn(10, 4)) > 0 + df.any(1) + df.all(1) + df.any(1, bool_only=True) + df.all(1, bool_only=True) + + # skip pathological failure cases + # class CantNonzero(object): + + # def __nonzero__(self): + # raise ValueError + + # df[4] = CantNonzero() + + # it works! 
+ # df.any(1) + # df.all(1) + # df.any(1, bool_only=True) + # df.all(1, bool_only=True) + + # df[4][4] = np.nan + # df.any(1) + # df.all(1) + # df.any(1, bool_only=True) + # df.all(1, bool_only=True) + + def test_consolidate_datetime64(self): + # numpy vstack bug + + data = """\ +starting,ending,measure +2012-06-21 00:00,2012-06-23 07:00,77 +2012-06-23 07:00,2012-06-23 16:30,65 +2012-06-23 16:30,2012-06-25 08:00,77 +2012-06-25 08:00,2012-06-26 12:00,0 +2012-06-26 12:00,2012-06-27 08:00,77 +""" + df = read_csv(StringIO(data), parse_dates=[0, 1]) + + ser_starting = df.starting + ser_starting.index = ser_starting.values + ser_starting = ser_starting.tz_localize('US/Eastern') + ser_starting = ser_starting.tz_convert('UTC') + + ser_ending = df.ending + ser_ending.index = ser_ending.values + ser_ending = ser_ending.tz_localize('US/Eastern') + ser_ending = ser_ending.tz_convert('UTC') + + df.starting = ser_starting.index + df.ending = ser_ending.index + + assert_array_equal(df.starting.values, ser_starting.index.values) + assert_array_equal(df.ending.values, ser_ending.index.values) + + def test_tslib_tz_convert_trans_pos_plus_1__bug(self): + # Regression test for tslib.tz_convert(vals, tz1, tz2). + # See https://github.com/pydata/pandas/issues/4496 for details. + idx = pd.date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq='1min') + idx = idx.tz_localize('UTC') + idx = idx.tz_convert('Europe/Moscow') + + test_vector = pd.Series([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 5], dtype=int) + + hours = idx.hour + + np.testing.assert_equal(hours, test_vector.values) + + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, + has_bool_only=False): + if frame is None: + frame = self.frame > 0 + # set some NAs + frame = DataFrame(frame.values.astype(object), frame.index, + frame.columns) + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + + f = getattr(frame, name) + + if has_skipna: + def skipna_wrapper(x): + nona = x.dropna().values + return alternative(nona) + + def wrapper(x): + return alternative(x.values) + + result0 = f(axis=0, skipna=False) + result1 = f(axis=1, skipna=False) + assert_series_equal(result0, frame.apply(wrapper)) + assert_series_equal(result1, frame.apply(wrapper, axis=1), + check_dtype=False) # HACK: win32 + else: + skipna_wrapper = alternative + wrapper = alternative + + result0 = f(axis=0) + result1 = f(axis=1) + assert_series_equal(result0, frame.apply(skipna_wrapper)) + assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), + check_dtype=False) + + # result = f(axis=1) + # comp = frame.apply(alternative, axis=1).reindex(result.index) + # assert_series_equal(result, comp) + + # bad axis + self.assertRaises(ValueError, f, axis=2) + + # make sure works on mixed-type frame + mixed = self.mixed_frame + mixed['_bool_'] = np.random.randn(len(mixed)) > 0 + getattr(mixed, name)(axis=0) + getattr(mixed, name)(axis=1) + + class NonzeroFail: + + def __nonzero__(self): + raise ValueError + + mixed['_nonzero_fail_'] = NonzeroFail() + + if has_bool_only: + getattr(mixed, name)(axis=0, bool_only=True) + getattr(mixed, name)(axis=1, bool_only=True) + getattr(frame, name)(axis=0, bool_only=False) + getattr(frame, 
name)(axis=1, bool_only=False) + + # all NA case + if has_skipna: + all_na = frame * np.NaN + r0 = getattr(all_na, name)(axis=0) + r1 = getattr(all_na, name)(axis=1) + if name == 'any': + self.assertFalse(r0.any()) + self.assertFalse(r1.any()) + else: + self.assertTrue(r0.all()) + self.assertTrue(r1.all()) + + def test_strange_column_corruption_issue(self): + df = DataFrame(index=[0, 1]) + df[0] = nan + wasCol = {} + # uncommenting these makes the results match + # for col in xrange(100, 200): + # wasCol[col] = 1 + # df[col] = nan + + for i, dt in enumerate(df.index): + for col in range(100, 200): + if not col in wasCol: + wasCol[col] = 1 + df[col] = nan + df[col][dt] = i + + myid = 100 + + first = len(df.ix[isnull(df[myid]), [myid]]) + second = len(df.ix[isnull(df[myid]), [myid]]) + self.assertTrue(first == second == 0) + + def test_inplace_return_self(self): + # re #1893 + + data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'], + 'b': [0, 0, 1, 1], + 'c': [1, 2, 3, 4]}) + + def _check_f(base, f): + result = f(base) + self.assertTrue(result is None) + + # -----DataFrame----- + + # set_index + f = lambda x: x.set_index('a', inplace=True) + _check_f(data.copy(), f) + + # reset_index + f = lambda x: x.reset_index(inplace=True) + _check_f(data.set_index('a'), f) + + # drop_duplicates + f = lambda x: x.drop_duplicates(inplace=True) + _check_f(data.copy(), f) + + # sort + f = lambda x: x.sort('b', inplace=True) + _check_f(data.copy(), f) + + # sort_index + f = lambda x: x.sort_index(inplace=True) + _check_f(data.copy(), f) + + # sortlevel + f = lambda x: x.sortlevel(0, inplace=True) + _check_f(data.set_index(['a', 'b']), f) + + # fillna + f = lambda x: x.fillna(0, inplace=True) + _check_f(data.copy(), f) + + # replace + f = lambda x: x.replace(1, 0, inplace=True) + _check_f(data.copy(), f) + + # rename + f = lambda x: x.rename({1: 'foo'}, inplace=True) + _check_f(data.copy(), f) + + # -----Series----- + d = data.copy()['c'] + + # reset_index + f = lambda x: x.reset_index(inplace=True, drop=True) + _check_f(data.set_index('a')['c'], f) + + # fillna + f = lambda x: x.fillna(0, inplace=True) + _check_f(d.copy(), f) + + # replace + f = lambda x: x.replace(1, 0, inplace=True) + _check_f(d.copy(), f) + + # rename + f = lambda x: x.rename({1: 'foo'}, inplace=True) + _check_f(d.copy(), f) + + def test_isin(self): + # GH #4211 + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}, + index=['foo', 'bar', 'baz', 'qux']) + other = ['a', 'b', 'c'] + + result = df.isin(other) + expected = DataFrame([df.loc[s].isin(other) for s in df.index]) + assert_frame_equal(result, expected) + + def test_isin_empty(self): + df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + result = df.isin([]) + expected = pd.DataFrame(False, df.index, df.columns) + assert_frame_equal(result, expected) + + def test_isin_dict(self): + df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + d = {'A': ['a']} + + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, 'A'] = True + + result = df.isin(d) + assert_frame_equal(result, expected) + + # non unique columns + df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + df.columns = ['A', 'A'] + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, 'A'] = True + result = df.isin(d) + assert_frame_equal(result, expected) + + def test_isin_with_string_scalar(self): + #GH4763 + df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], + 'ids2': ['a', 'n', 'c', 'n']}, + index=['foo', 
'bar', 'baz', 'qux']) + with tm.assertRaises(TypeError): + df.isin('a') + + with tm.assertRaises(TypeError): + df.isin('aaa') + + def test_isin_df(self): + df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) + df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]}) + expected = DataFrame(False, df1.index, df1.columns) + result = df1.isin(df2) + expected['A'].loc[[1, 3]] = True + expected['B'].loc[[0, 2]] = True + assert_frame_equal(result, expected) + + # partial overlapping columns + df2.columns = ['A', 'C'] + result = df1.isin(df2) + expected['B'] = False + assert_frame_equal(result, expected) + + def test_isin_df_dupe_values(self): + df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) + # just cols duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=['B', 'B']) + with tm.assertRaises(ValueError): + df1.isin(df2) + + # just index duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=['A', 'B'], index=[0, 0, 1, 1]) + with tm.assertRaises(ValueError): + df1.isin(df2) + + # cols and index: + df2.columns = ['B', 'B'] + with tm.assertRaises(ValueError): + df1.isin(df2) + + def test_isin_dupe_self(self): + other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A','A']) + result = df.isin(other) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected.loc[0] = True + expected.iloc[1, 1] = True + assert_frame_equal(result, expected) + + def test_isin_against_series(self): + df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, + index=['a', 'b', 'c', 'd']) + s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd']) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected['A'].loc['a'] = True + expected.loc['d'] = True + result = df.isin(s) + assert_frame_equal(result, expected) + + def test_isin_multiIndex(self): + idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'), + (0, 'b', 'bar'), (0, 'b', 'baz'), + (2, 'a', 'foo'), (2, 'a', 'bar'), + (2, 'c', 'bar'), (2, 'c', 'baz'), + (1, 'b', 'foo'), (1, 'b', 'bar'), + (1, 'c', 'bar'), (1, 'c', 'baz')]) + df1 = DataFrame({'A': np.ones(12), + 'B': np.zeros(12)}, index=idx) + df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + 'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]}) + # against regular index + expected = DataFrame(False, index=df1.index, columns=df1.columns) + result = df1.isin(df2) + assert_frame_equal(result, expected) + + df2.index = idx + expected = df2.values.astype(np.bool) + expected[:, 1] = ~expected[:, 1] + expected = DataFrame(expected, columns=['A', 'B'], index=idx) + + result = df1.isin(df2) + assert_frame_equal(result, expected) + + def test_to_csv_date_format(self): + from pandas import to_datetime + pname = '__tmp_to_csv_date_format__' + with ensure_clean(pname) as path: + for engine in [None, 'python']: + dt_index = self.tsframe.index + datetime_frame = DataFrame({'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) + + datetime_frame.to_csv(path, date_format='%Y%m%d', engine=engine) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + + datetime_frame_int = datetime_frame.applymap(lambda x: int(x.strftime('%Y%m%d'))) + datetime_frame_int.index = datetime_frame_int.index.map(lambda x: int(x.strftime('%Y%m%d'))) + + assert_frame_equal(test, datetime_frame_int) + + datetime_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) + # Check that the data was put in the specified format + test = 
read_csv(path, index_col=0) + datetime_frame_str = datetime_frame.applymap(lambda x: x.strftime('%Y-%m-%d')) + datetime_frame_str.index = datetime_frame_str.index.map(lambda x: x.strftime('%Y-%m-%d')) + + assert_frame_equal(test, datetime_frame_str) + + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + + datetime_frame_columns.to_csv(path, date_format='%Y%m%d', engine=engine) + + test = read_csv(path, index_col=0) + + datetime_frame_columns = datetime_frame_columns.applymap(lambda x: int(x.strftime('%Y%m%d'))) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = datetime_frame_columns.columns.map(lambda x: x.strftime('%Y%m%d')) + + assert_frame_equal(test, datetime_frame_columns) + + # test NaTs + nat_index = to_datetime(['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) + nat_frame = DataFrame({'A': nat_index}, index=nat_index) + + nat_frame.to_csv(path, date_format='%Y-%m-%d', engine=engine) + + test = read_csv(path, parse_dates=[0, 1], index_col=0) + + assert_frame_equal(test, nat_frame) + + def test_concat_empty_dataframe_dtypes(self): + df = DataFrame(columns=list("abc")) + df['a'] = df['a'].astype(np.bool_) + df['b'] = df['b'].astype(np.int32) + df['c'] = df['c'].astype(np.float64) + + result = pd.concat([df, df]) + self.assertEqual(result['a'].dtype, np.bool_) + self.assertEqual(result['b'].dtype, np.int32) + self.assertEqual(result['c'].dtype, np.float64) + + result = pd.concat([df, df.astype(np.float64)]) + self.assertEqual(result['a'].dtype, np.object_) + self.assertEqual(result['b'].dtype, np.float64) + self.assertEqual(result['c'].dtype, np.float64) + + def test_empty_frame_dtypes_ftypes(self): + empty_df = pd.DataFrame() + assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object)) + assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object)) + + nocols_df = pd.DataFrame(index=[1,2,3]) + assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object)) + assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object)) + + norows_df = pd.DataFrame(columns=list("abc")) + assert_series_equal(norows_df.dtypes, pd.Series(np.object, index=list("abc"))) + assert_series_equal(norows_df.ftypes, pd.Series('object:dense', index=list("abc"))) + + norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32) + assert_series_equal(norows_int_df.dtypes, pd.Series(np.dtype('int32'), index=list("abc"))) + assert_series_equal(norows_int_df.ftypes, pd.Series('int32:dense', index=list("abc"))) + + odict = OrderedDict + df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]), index=[1, 2, 3]) + assert_series_equal(df.dtypes, pd.Series(odict([('a', np.int64), + ('b', np.bool), + ('c', np.float64)]))) + assert_series_equal(df.ftypes, pd.Series(odict([('a', 'int64:dense'), + ('b', 'bool:dense'), + ('c', 'float64:dense')]))) + + # same but for empty slice of df + assert_series_equal(df[:0].dtypes, pd.Series(odict([('a', np.int64), + ('b', np.bool), + ('c', np.float64)]))) + assert_series_equal(df[:0].ftypes, pd.Series(odict([('a', 'int64:dense'), + ('b', 'bool:dense'), + ('c', 'float64:dense')]))) + + def test_dtypes_are_correct_after_column_slice(self): + # GH6525 + df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) + odict = OrderedDict + assert_series_equal(df.dtypes, + pd.Series(odict([('a', np.float_), ('b', np.float_), + ('c', np.float_),]))) + assert_series_equal(df.iloc[:,2:].dtypes, + pd.Series(odict([('c', np.float_)]))) + assert_series_equal(df.dtypes, + pd.Series(odict([('a', 
np.float_), ('b', np.float_), + ('c', np.float_),]))) + + def test_set_index_names(self): + df = pd.util.testing.makeDataFrame() + df.index.name = 'name' + + self.assertEqual(df.set_index(df.index).index.names, ['name']) + + mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) + mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, + names=['A', 'B', 'A', 'B']) + + df = df.set_index(['A', 'B']) + + self.assertEqual(df.set_index(df.index).index.names, ['A', 'B']) + + # Check that set_index isn't converting a MultiIndex into an Index + self.assertTrue(isinstance(df.set_index(df.index).index, MultiIndex)) + + # Check actual equality + tm.assert_index_equal(df.set_index(df.index).index, mi) + + # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather + # than a pair of tuples + self.assertTrue(isinstance(df.set_index([df.index, df.index]).index, MultiIndex)) + + # Check equality + tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2) + + def test_select_dtypes_include(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True]}) + ri = df.select_dtypes(include=[np.number]) + ei = df[['b', 'c', 'd']] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_exclude(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True]}) + re = df.select_dtypes(exclude=[np.number]) + ee = df[['a', 'e']] + tm.assert_frame_equal(re, ee) + + def test_select_dtypes_exclude_include(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + exclude = np.datetime64, + include = np.bool_, 'integer' + r = df.select_dtypes(include=include, exclude=exclude) + e = df[['b', 'c', 'e']] + tm.assert_frame_equal(r, e) + + exclude = 'datetime', + include = 'bool', 'int64', 'int32' + r = df.select_dtypes(include=include, exclude=exclude) + e = df[['b', 'e']] + tm.assert_frame_equal(r, e) + + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): + tm._skip_if_not_numpy17_friendly() + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + df['g'] = df.f.diff() + assert not hasattr(np, 'u8') + r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta']) + e = df[['a', 'b']] + tm.assert_frame_equal(r, e) + + r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]']) + e = df[['a', 'b', 'g']] + tm.assert_frame_equal(r, e) + + def test_select_dtypes_empty(self): + df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) + with tm.assertRaisesRegexp(ValueError, 'at least one of include or ' + 'exclude must be nonempty'): + df.select_dtypes() + + def test_select_dtypes_raises_on_string(self): + df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) + with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'): + df.select_dtypes(include='object') + with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'): + df.select_dtypes(exclude='object') + with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'): + df.select_dtypes(include=int, exclude='object') + + def 
test_select_dtypes_bad_datetime64(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + with tm.assertRaisesRegexp(ValueError, '.+ is too specific'): + df.select_dtypes(include=['datetime64[D]']) + + with tm.assertRaisesRegexp(ValueError, '.+ is too specific'): + df.select_dtypes(exclude=['datetime64[as]']) + + def test_select_dtypes_str_raises(self): + df = DataFrame({'a': list('abc'), + 'g': list(u('abc')), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + string_dtypes = set((str, 'str', np.string_, 'S1', + 'unicode', np.unicode_, 'U1')) + try: + string_dtypes.add(unicode) + except NameError: + pass + for dt in string_dtypes: + with tm.assertRaisesRegexp(TypeError, + 'string dtypes are not allowed'): + df.select_dtypes(include=[dt]) + with tm.assertRaisesRegexp(TypeError, + 'string dtypes are not allowed'): + df.select_dtypes(exclude=[dt]) + + def test_select_dtypes_bad_arg_raises(self): + df = DataFrame({'a': list('abc'), + 'g': list(u('abc')), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'): + df.select_dtypes(['blargy, blarg, blarg']) + + +def skip_if_no_ne(engine='numexpr'): + if engine == 'numexpr': + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not " + "installed") + + +def skip_if_no_pandas_parser(parser): + if parser != 'pandas': + raise nose.SkipTest("cannot evaluate with parser {0!r}".format(parser)) + + +class TestDataFrameQueryWithMultiIndex(object): + def check_query_with_named_multiindex(self, parser, engine): + tm.skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = tm.choice(['eggs', 'ham'], size=10) + index = MultiIndex.from_arrays([a, b], names=['color', 'food']) + df = DataFrame(randn(10, 2), index=index) + ind = Series(df.index.get_level_values('color').values, index=index, + name='color') + + # equality + res1 = df.query('color == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == color', parser=parser, engine=engine) + exp = df[ind == 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('color != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != color', parser=parser, engine=engine) + exp = df[ind != 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('color == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == color', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('color != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != color', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in color', parser=parser, engine=engine) + res2 = df.query('"red" in color', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + 
assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in color', parser=parser, engine=engine) + res2 = df.query('"red" not in color', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + def test_query_with_named_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_named_multiindex, parser, engine + + def check_query_with_unnamed_multiindex(self, parser, engine): + tm.skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = tm.choice(['eggs', 'ham'], size=10) + index = MultiIndex.from_arrays([a, b]) + df = DataFrame(randn(10, 2), index=index) + ind = Series(df.index.get_level_values(0).values, index=index) + + res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) + exp = df[ind == 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine) + exp = df[ind != 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + #### LEVEL 1 #### + ind = Series(df.index.get_level_values(1).values, index=index) + res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine) + res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine) + exp = df[ind == 'eggs'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine) + res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine) + exp = df[ind != 'eggs'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine) + exp = df[~ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(['eggs'])] + 
assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine) + exp = df[~ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + def test_query_with_unnamed_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_unnamed_multiindex, parser, engine + + def check_query_with_partially_named_multiindex(self, parser, engine): + tm.skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = np.arange(10) + index = MultiIndex.from_arrays([a, b]) + index.names = [None, 'rating'] + df = DataFrame(randn(10, 2), index=index) + res = df.query('rating == 1', parser=parser, engine=engine) + ind = Series(df.index.get_level_values('rating').values, index=index, + name='rating') + exp = df[ind == 1] + assert_frame_equal(res, exp) + + res = df.query('rating != 1', parser=parser, engine=engine) + ind = Series(df.index.get_level_values('rating').values, index=index, + name='rating') + exp = df[ind != 1] + assert_frame_equal(res, exp) + + res = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind == "red"] + assert_frame_equal(res, exp) + + res = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind != "red"] + assert_frame_equal(res, exp) + + def test_query_with_partially_named_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_partially_named_multiindex, parser, engine + + def test_query_multiindex_get_index_resolvers(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_multiindex_get_index_resolvers, parser, engine + + def check_query_multiindex_get_index_resolvers(self, parser, engine): + df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs']) + resolvers = df._get_index_resolvers() + + def to_series(mi, level): + level_values = mi.get_level_values(level) + s = level_values.to_series() + s.index = mi + return s + + col_series = df.columns.to_series() + expected = {'index': df.index, + 'columns': col_series, + 'spam': to_series(df.index, 'spam'), + 'eggs': to_series(df.index, 'eggs'), + 'C0': col_series} + for k, v in resolvers.items(): + if isinstance(v, Index): + assert v.is_(expected[k]) + elif isinstance(v, Series): + tm.assert_series_equal(v, expected[k]) + else: + raise AssertionError("object must be a Series or Index") + + def test_raise_on_panel_with_multiindex(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_raise_on_panel_with_multiindex, parser, engine + + def check_raise_on_panel_with_multiindex(self, parser, engine): + tm.skip_if_no_ne() + p = tm.makePanel(7) + p.items = tm.makeCustomIndex(len(p.items), nlevels=2) + with tm.assertRaises(NotImplementedError): + pd.eval('p + 1', parser=parser, engine=engine) + + def test_raise_on_panel4d_with_multiindex(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_raise_on_panel4d_with_multiindex, parser, engine + + def check_raise_on_panel4d_with_multiindex(self, parser, engine): + tm.skip_if_no_ne() + p4d = tm.makePanel4D(7) + p4d.items = tm.makeCustomIndex(len(p4d.items), nlevels=2) + with tm.assertRaises(NotImplementedError): + pd.eval('p4d + 1', parser=parser, engine=engine) + + +class 
TestDataFrameQueryNumExprPandas(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestDataFrameQueryNumExprPandas, cls).setUpClass() + cls.engine = 'numexpr' + cls.parser = 'pandas' + tm.skip_if_no_ne(cls.engine) + + @classmethod + def tearDownClass(cls): + super(TestDataFrameQueryNumExprPandas, cls).tearDownClass() + del cls.engine, cls.parser + + def test_date_query_with_attribute_access(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('@df.dates1 < 20130101 < @df.dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_query_no_attribute_access(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + tm.assert_frame_equal(res, expec) + + def test_date_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(randn(n, 3)) + df['dates1'] = date_range('1/1/2012', periods=n) + df['dates2'] = date_range('1/1/2013', periods=n) + df['dates3'] = date_range('1/1/2014', periods=n) + df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT + df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_index_query(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(randn(n, 3)) + df['dates1'] = date_range('1/1/2012', periods=n) + df['dates3'] = date_range('1/1/2014', periods=n) + df.set_index('dates1', inplace=True, drop=True) + res = df.query('index < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(randn(n, 3)) + df['dates1'] = date_range('1/1/2012', periods=n) + df['dates3'] = date_range('1/1/2014', periods=n) + df.iloc[0, 0] = pd.NaT + df.set_index('dates1', inplace=True, drop=True) + res = df.query('index < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT_duplicates(self): + engine, parser = self.engine, self.parser + n = 10 + d = {} + d['dates1'] = date_range('1/1/2012', periods=n) + d['dates3'] = date_range('1/1/2014', periods=n) + df = DataFrame(d) + df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT + df.set_index('dates1', inplace=True, drop=True) + res = df.query('index < 20130101 < dates3', engine=engine, parser=parser) + expec = df[(df.index.to_series() < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_query_with_non_date(self): + engine, parser = self.engine, self.parser + + n = 10 + df = DataFrame({'dates': date_range('1/1/2012', periods=n), + 'nondate': np.arange(n)}) + + ops = '==', '!=', '<', 
'>', '<=', '>=' + + for op in ops: + with tm.assertRaises(TypeError): + df.query('dates %s nondate' % op, parser=parser, engine=engine) + + def test_query_syntax_error(self): + engine, parser = self.engine, self.parser + df = DataFrame({"i": lrange(10), "+": lrange(3, 13), + "r": lrange(4, 14)}) + with tm.assertRaises(SyntaxError): + df.query('i - +', engine=engine, parser=parser) + + def test_query_scope(self): + from pandas.computation.ops import UndefinedVariableError + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(20, 2), columns=list('ab')) + + a, b = 1, 2 + res = df.query('a > b', engine=engine, parser=parser) + expected = df[df.a > df.b] + tm.assert_frame_equal(res, expected) + + res = df.query('@a > b', engine=engine, parser=parser) + expected = df[a > df.b] + tm.assert_frame_equal(res, expected) + + # no local variable c + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > @c', engine=engine, parser=parser) + + # no column named 'c' + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > c', engine=engine, parser=parser) + + def test_query_doesnt_pickup_local(self): + from pandas.computation.ops import UndefinedVariableError + + engine, parser = self.engine, self.parser + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + from numpy import sin + + # we don't pick up the local 'sin' + with tm.assertRaises(UndefinedVariableError): + df.query('sin > 5', engine=engine, parser=parser) + + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + with tm.assertRaisesRegexp(NumExprClobberingError, + 'Variables in expression.+'): + df.query('sin > 5', engine=engine, parser=parser) + + def test_query(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + + assert_frame_equal(df.query('a < b', engine=engine, parser=parser), + df[df.a < df.b]) + assert_frame_equal(df.query('a + b > b * c', engine=engine, + parser=parser), + df[df.a + df.b > df.b * df.c]) + + def test_query_index_with_name(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) + expec = df[(df.index < 5) & (df.a < df.b)] + assert_frame_equal(res, expec) + + res = df.query('blob < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + + assert_frame_equal(res, expec) + + def test_query_index_without_name(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=range(10), columns=['a', 'b', 'c']) + + # "index" should refer to the index + res = df.query('index < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + assert_frame_equal(res, expec) + + # test against a scalar + res = df.query('index < 5', engine=engine, parser=parser) + expec = df[df.index < 5] + assert_frame_equal(res, expec) + + def test_nested_scope(self): + engine = self.engine + parser = self.parser + + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df > 0) & (df2 > 0)] + + result = df.query('(@df > 0) & (@df2 > 0)', 
engine=engine, parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, + parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]', + engine=engine, parser=parser) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + assert_frame_equal(result, expected) + + result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) + expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + def test_nested_raises_on_local_self_reference(self): + from pandas.computation.ops import UndefinedVariableError + + df = DataFrame(np.random.randn(5, 3)) + + # can't reference ourself b/c we're a local so @ is necessary + with tm.assertRaises(UndefinedVariableError): + df.query('df > 0', engine=self.engine, parser=self.parser) + + def test_local_syntax(self): + skip_if_no_pandas_parser(self.parser) + + engine, parser = self.engine, self.parser + df = DataFrame(randn(100, 10), columns=list('abcdefghij')) + b = 1 + expect = df[df.a < b] + result = df.query('a < @b', engine=engine, parser=parser) + assert_frame_equal(result, expect) + + expect = df[df.a < df.b] + result = df.query('a < b', engine=engine, parser=parser) + assert_frame_equal(result, expect) + + def test_chained_cmp_and_in(self): + skip_if_no_pandas_parser(self.parser) + engine, parser = self.engine, self.parser + cols = list('abc') + df = DataFrame(randn(100, len(cols)), columns=cols) + res = df.query('a < b < c and a not in b not in c', engine=engine, + parser=parser) + ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) + expec = df[ind] + assert_frame_equal(res, expec) + + def test_local_variable_with_in(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + a = Series(np.random.randint(3, size=15), name='a') + b = Series(np.random.randint(10, size=15), name='b') + df = DataFrame({'a': a, 'b': b}) + + expected = df.loc[(df.b - 1).isin(a)] + result = df.query('b - 1 in a', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + b = Series(np.random.randint(10, size=15), name='b') + expected = df.loc[(b - 1).isin(a)] + result = df.query('@b - 1 in a', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + def test_at_inside_string(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + c = 1 + df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']}) + result = df.query('a == "@c"', engine=engine, parser=parser) + expected = df[df.a == "@c"] + tm.assert_frame_equal(result, expected) + + def test_query_undefined_local(self): + from pandas.computation.ops import UndefinedVariableError + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + df = DataFrame(np.random.rand(10, 2), columns=list('ab')) + with tm.assertRaisesRegexp(UndefinedVariableError, + "local variable 'c' is not defined"): + df.query('a == @c', engine=engine, parser=parser) + + def test_index_resolvers_come_after_columns_with_the_same_name(self): + n = 1 + a = np.r_[20:101:20] + + df = DataFrame({'index': a, 'b': np.random.randn(a.size)}) + df.index.name = 'index' + result = df.query('index > 5', engine=self.engine, parser=self.parser) + expected = df[df['index'] > 5] + tm.assert_frame_equal(result, expected) + + df = DataFrame({'index': a, 'b': np.random.randn(a.size)}) + result = df.query('ilevel_0 > 5', engine=self.engine, parser=self.parser) + 
expected = df.loc[df.index[df.index > 5]] + tm.assert_frame_equal(result, expected) + + df = DataFrame({'a': a, 'b': np.random.randn(a.size)}) + df.index.name = 'a' + result = df.query('a > 5', engine=self.engine, parser=self.parser) + expected = df[df.a > 5] + tm.assert_frame_equal(result, expected) + + result = df.query('index > 5', engine=self.engine, parser=self.parser) + expected = df.loc[df.index[df.index > 5]] + tm.assert_frame_equal(result, expected) + + def test_inf(self): + n = 10 + df = DataFrame({'a': np.random.rand(n), 'b': np.random.rand(n)}) + df.loc[::2, 0] = np.inf + ops = '==', '!=' + d = dict(zip(ops, (operator.eq, operator.ne))) + for op, f in d.items(): + q = 'a %s inf' % op + expected = df[f(df.a, np.inf)] + result = df.query(q, engine=self.engine, parser=self.parser) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): + + @classmethod + def setUpClass(cls): + super(TestDataFrameQueryNumExprPython, cls).setUpClass() + cls.engine = 'numexpr' + cls.parser = 'python' + tm.skip_if_no_ne(cls.engine) + cls.frame = _frame.copy() + + def test_date_query_no_attribute_access(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('(dates1 < 20130101) & (20130101 < dates3)', + engine=engine, parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + tm.assert_frame_equal(res, expec) + def test_date_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(randn(n, 3)) + df['dates1'] = date_range('1/1/2012', periods=n) + df['dates2'] = date_range('1/1/2013', periods=n) + df['dates3'] = date_range('1/1/2014', periods=n) + df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT + df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT + res = df.query('(dates1 < 20130101) & (20130101 < dates3)', + engine=engine, parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_index_query(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(randn(n, 3)) + df['dates1'] = date_range('1/1/2012', periods=n) + df['dates3'] = date_range('1/1/2014', periods=n) + df.set_index('dates1', inplace=True, drop=True) + res = df.query('(index < 20130101) & (20130101 < dates3)', + engine=engine, parser=parser) + expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(randn(n, 3)) + df['dates1'] = date_range('1/1/2012', periods=n) + df['dates3'] = date_range('1/1/2014', periods=n) + df.iloc[0, 0] = pd.NaT + df.set_index('dates1', inplace=True, drop=True) + res = df.query('(index < 20130101) & (20130101 < dates3)', + engine=engine, parser=parser) + expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_date_index_query_with_NaT_duplicates(self): + engine, parser = self.engine, self.parser + n = 10 + df = DataFrame(randn(n, 3)) + df['dates1'] = date_range('1/1/2012', periods=n) + df['dates3'] = date_range('1/1/2014', periods=n) + df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT + df.set_index('dates1', inplace=True, drop=True) + with tm.assertRaises(NotImplementedError): + df.query('index < 20130101 < 
dates3', engine=engine, parser=parser) + + def test_nested_scope(self): + from pandas.computation.ops import UndefinedVariableError + engine = self.engine + parser = self.parser + # smoke test + x = 1 + result = pd.eval('x + 1', engine=engine, parser=parser) + self.assertEqual(result, 2) + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + + # don't have the pandas parser + with tm.assertRaises(SyntaxError): + df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) + + with tm.assertRaises(UndefinedVariableError): + df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + + expected = df[(df > 0) & (df2 > 0)] + result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, + parser=parser) + tm.assert_frame_equal(expected, result) + + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', + engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + +class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): + + @classmethod + def setUpClass(cls): + super(TestDataFrameQueryPythonPandas, cls).setUpClass() + cls.engine = 'python' + cls.parser = 'pandas' + cls.frame = _frame.copy() + + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + +class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): + + @classmethod + def setUpClass(cls): + super(TestDataFrameQueryPythonPython, cls).setUpClass() + cls.engine = cls.parser = 'python' + cls.frame = _frame.copy() + + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + +PARSERS = 'python', 'pandas' +ENGINES = 'python', 'numexpr' + + +class TestDataFrameQueryStrings(object): + def check_str_query_method(self, parser, engine): + tm.skip_if_no_ne(engine) + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings == 'a'] + + if parser != 'pandas': + col = 'strings' + lst = '"a"' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = '==', '!=' + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + assertRaises(NotImplementedError, df.query, ex, engine=engine, + parser=parser, local_dict={'strings': df.strings}) + else: + res = df.query('"a" == strings', engine=engine, parser=parser) + assert_frame_equal(res, expect) + + res = df.query('strings == "a"', engine=engine, parser=parser) + assert_frame_equal(res, expect) + assert_frame_equal(res, df[df.strings.isin(['a'])]) + + expect = df[df.strings != 'a'] + res = df.query('strings != "a"', engine=engine, parser=parser) + assert_frame_equal(res, expect) + + res = df.query('"a" != strings', engine=engine, parser=parser) + assert_frame_equal(res, expect) + assert_frame_equal(res, df[~df.strings.isin(['a'])]) + + def test_str_query_method(self): + for parser, engine in 
product(PARSERS, ENGINES): + yield self.check_str_query_method, parser, engine + + def test_str_list_query_method(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_str_list_query_method, parser, engine + + def check_str_list_query_method(self, parser, engine): + tm.skip_if_no_ne(engine) + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings.isin(['a', 'b'])] + + if parser != 'pandas': + col = 'strings' + lst = '["a", "b"]' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = '==', '!=' + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + with tm.assertRaises(NotImplementedError): + df.query(ex, engine=engine, parser=parser) + else: + res = df.query('strings == ["a", "b"]', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + res = df.query('["a", "b"] == strings', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + expect = df[~df.strings.isin(['a', 'b'])] + + res = df.query('strings != ["a", "b"]', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + res = df.query('["a", "b"] != strings', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + def check_query_with_string_columns(self, parser, engine): + tm.skip_if_no_ne(engine) + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + if parser == 'pandas': + res = df.query('a in b', parser=parser, engine=engine) + expec = df[df.a.isin(df.b)] + assert_frame_equal(res, expec) + + res = df.query('a in b and c < d', parser=parser, engine=engine) + expec = df[df.a.isin(df.b) & (df.c < df.d)] + assert_frame_equal(res, expec) + else: + with assertRaises(NotImplementedError): + df.query('a in b', parser=parser, engine=engine) + + with assertRaises(NotImplementedError): + df.query('a in b and c < d', parser=parser, engine=engine) + + def test_query_with_string_columns(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_with_string_columns, parser, engine + + def check_object_array_eq_ne(self, parser, engine): + tm.skip_if_no_ne(engine) + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + res = df.query('a == b', parser=parser, engine=engine) + exp = df[df.a == df.b] + assert_frame_equal(res, exp) + + res = df.query('a != b', parser=parser, engine=engine) + exp = df[df.a != df.b] + assert_frame_equal(res, exp) + + def test_object_array_eq_ne(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_object_array_eq_ne, parser, engine + + def check_query_with_nested_strings(self, parser, engine): + tm.skip_if_no_ne(engine) + skip_if_no_pandas_parser(parser) + from pandas.compat import StringIO + raw = """id event timestamp + 1 "page 1 load" 1/1/2014 0:00:01 + 1 "page 1 exit" 1/1/2014 0:00:31 + 2 "page 2 load" 1/1/2014 0:01:01 + 2 "page 2 exit" 1/1/2014 0:01:31 + 3 "page 3 load" 1/1/2014 0:02:01 + 3 "page 3 exit" 1/1/2014 0:02:31 + 4 "page 1 load" 2/1/2014 1:00:01 + 4 "page 1 exit" 2/1/2014 1:00:31 + 5 "page 2 load" 2/1/2014 1:01:01 + 5 "page 2 exit" 2/1/2014 1:01:31 + 6 "page 3 load" 2/1/2014 1:02:01 + 6 "page 3 exit" 2/1/2014 1:02:31 + """ + df = pd.read_csv(StringIO(raw), sep=r'\s{2,}', engine='python', + parse_dates=['timestamp']) + expected = df[df.event == '"page 1 
load"'] + res = df.query("""'"page 1 load"' in event""", parser=parser, + engine=engine) + tm.assert_frame_equal(expected, res) + + def test_query_with_nested_string(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_with_nested_strings, parser, engine + + def check_query_with_nested_special_character(self, parser, engine): + skip_if_no_pandas_parser(parser) + tm.skip_if_no_ne(engine) + df = DataFrame({'a': ['a', 'b', 'test & test'], + 'b': [1, 2, 3]}) + res = df.query('a == "test & test"', parser=parser, engine=engine) + expec = df[df.a == 'test & test'] + tm.assert_frame_equal(res, expec) + + def test_query_with_nested_special_character(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_with_nested_special_character, parser, engine + + def check_query_lex_compare_strings(self, parser, engine): + tm.skip_if_no_ne(engine=engine) + import operator as opr + + a = Series(tm.choice(list('abcde'), 20)) + b = Series(np.arange(a.size)) + df = DataFrame({'X': a, 'Y': b}) + + ops = {'<': opr.lt, '>': opr.gt, '<=': opr.le, '>=': opr.ge} + + for op, func in ops.items(): + res = df.query('X %s "d"' % op, engine=engine, parser=parser) + expected = df[func(df.X, 'd')] + assert_frame_equal(res, expected) + + def test_query_lex_compare_strings(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_lex_compare_strings, parser, engine + + def check_query_single_element_booleans(self, parser, engine): + tm.skip_if_no_ne(engine) + columns = 'bid', 'bidsize', 'ask', 'asksize' + data = np.random.randint(2, size=(1, len(columns))).astype(bool) + df = DataFrame(data, columns=columns) + res = df.query('bid & ask', engine=engine, parser=parser) + expected = df[df.bid & df.ask] + assert_frame_equal(res, expected) + + def test_query_single_element_booleans(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_single_element_booleans, parser, engine + + def check_query_string_scalar_variable(self, parser, engine): + tm.skip_if_no_ne(engine) + df = pd.DataFrame({'Symbol': ['BUD US', 'BUD US', 'IBM US', 'IBM US'], + 'Price': [109.70, 109.72, 183.30, 183.35]}) + e = df[df.Symbol == 'BUD US'] + symb = 'BUD US' + r = df.query('Symbol == @symb', parser=parser, engine=engine) + tm.assert_frame_equal(e, r) + + def test_query_string_scalar_variable(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_string_scalar_variable, parser, engine + + +class TestDataFrameEvalNumExprPandas(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestDataFrameEvalNumExprPandas, cls).setUpClass() + cls.engine = 'numexpr' + cls.parser = 'pandas' + tm.skip_if_no_ne() + + def setUp(self): + self.frame = DataFrame(randn(10, 3), columns=list('abc')) + + def tearDown(self): + del self.frame + + def test_simple_expr(self): + res = self.frame.eval('a + b', engine=self.engine, parser=self.parser) + expect = self.frame.a + self.frame.b + assert_series_equal(res, expect) + + def test_bool_arith_expr(self): + res = self.frame.eval('a[a < 1] + b', engine=self.engine, + parser=self.parser) + expect = self.frame.a[self.frame.a < 1] + self.frame.b + assert_series_equal(res, expect) + + def test_invalid_type_for_operator_raises(self): + df = DataFrame({'a': [1, 2], 'b': ['c', 'd']}) + ops = '+', '-', '*', '/' + for op in ops: + with tm.assertRaisesRegexp(TypeError, + "unsupported operand type\(s\) for " + ".+: '.+' and '.+'"): + df.eval('a {0} b'.format(op), engine=self.engine, + parser=self.parser) 
+ + +class TestDataFrameEvalNumExprPython(TestDataFrameEvalNumExprPandas): + + @classmethod + def setUpClass(cls): + super(TestDataFrameEvalNumExprPython, cls).setUpClass() + cls.engine = 'numexpr' + cls.parser = 'python' + tm.skip_if_no_ne(cls.engine) + + +class TestDataFrameEvalPythonPandas(TestDataFrameEvalNumExprPandas): + + @classmethod + def setUpClass(cls): + super(TestDataFrameEvalPythonPandas, cls).setUpClass() + cls.engine = 'python' + cls.parser = 'pandas' + + +class TestDataFrameEvalPythonPython(TestDataFrameEvalNumExprPython): + + @classmethod + def setUpClass(cls): + super(TestDataFrameEvalPythonPython, cls).setUpClass() + cls.engine = cls.parser = 'python' + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py new file mode 100644 index 00000000..044d4054 --- /dev/null +++ b/pandas/tests/test_generic.py @@ -0,0 +1,1219 @@ +# pylint: disable-msg=E1101,W0612 + +from datetime import datetime, timedelta +import nose +import numpy as np +from numpy import nan +import pandas as pd + +from pandas import (Index, Series, DataFrame, Panel, + isnull, notnull,date_range) +from pandas.core.index import Index, MultiIndex + +import pandas.core.common as com + +from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long +from pandas import compat +from pandas.util.testing import (assert_series_equal, + assert_frame_equal, + assert_panel_equal, + assert_almost_equal, + ensure_clean) +import pandas.util.testing as tm + + +def _skip_if_no_pchip(): + try: + from scipy.interpolate import pchip_interpolate + except ImportError: + raise nose.SkipTest('scipy.interpolate.pchip missing') + +#------------------------------------------------------------------------------ +# Generic types test cases + + +class Generic(object): + + _multiprocess_can_split_ = True + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + @property + def _ndim(self): + return self._typ._AXIS_LEN + + def _axes(self): + """ return the axes for my object typ """ + return self._typ._AXIS_ORDERS + + def _construct(self, shape, value=None, dtype=None, **kwargs): + """ construct an object for the given shape + if value is specified use that if its a scalar + if value is an array, repeat it as needed """ + + if isinstance(shape,int): + shape = tuple([shape] * self._ndim) + if value is not None: + if np.isscalar(value): + if value == 'empty': + arr = None + + # remove the info axis + kwargs.pop(self._typ._info_axis_name,None) + else: + arr = np.empty(shape,dtype=dtype) + arr.fill(value) + else: + fshape = np.prod(shape) + arr = value.ravel() + new_shape = fshape/arr.shape[0] + if fshape % arr.shape[0] != 0: + raise Exception("invalid value passed in _construct") + + arr = np.repeat(arr,new_shape).reshape(shape) + else: + arr = np.random.randn(*shape) + return self._typ(arr,dtype=dtype,**kwargs) + + def _compare(self, result, expected): + self._comparator(result,expected) + + def test_rename(self): + + # single axis + for axis in self._axes(): + kwargs = { axis : list('ABCD') } + obj = self._construct(4,**kwargs) + + # no values passed + #self.assertRaises(Exception, o.rename(str.lower)) + + # rename a single axis + result = obj.rename(**{ axis : str.lower }) + expected = obj.copy() + setattr(expected,axis,list('abcd')) + self._compare(result, expected) + + # multiple axes at once + + def test_get_numeric_data(self): + + n = 4 
+ kwargs = { } + for i in range(self._ndim): + kwargs[self._typ._AXIS_NAMES[i]] = list(range(n)) + + # get the numeric data + o = self._construct(n,**kwargs) + result = o._get_numeric_data() + self._compare(result, o) + + # non-inclusion + result = o._get_bool_data() + expected = self._construct(n,value='empty',**kwargs) + self._compare(result,expected) + + # get the bool data + arr = np.array([True,True,False,True]) + o = self._construct(n,value=arr,**kwargs) + result = o._get_numeric_data() + self._compare(result, o) + + # _get_numeric_data is includes _get_bool_data, so can't test for non-inclusion + + def test_nonzero(self): + + # GH 4633 + # look at the boolean/nonzero behavior for objects + obj = self._construct(shape=4) + self.assertRaises(ValueError, lambda : bool(obj == 0)) + self.assertRaises(ValueError, lambda : bool(obj == 1)) + self.assertRaises(ValueError, lambda : bool(obj)) + + obj = self._construct(shape=4,value=1) + self.assertRaises(ValueError, lambda : bool(obj == 0)) + self.assertRaises(ValueError, lambda : bool(obj == 1)) + self.assertRaises(ValueError, lambda : bool(obj)) + + obj = self._construct(shape=4,value=np.nan) + self.assertRaises(ValueError, lambda : bool(obj == 0)) + self.assertRaises(ValueError, lambda : bool(obj == 1)) + self.assertRaises(ValueError, lambda : bool(obj)) + + # empty + obj = self._construct(shape=0) + self.assertRaises(ValueError, lambda : bool(obj)) + + # invalid behaviors + + obj1 = self._construct(shape=4,value=1) + obj2 = self._construct(shape=4,value=1) + + def f(): + if obj1: + com.pprint_thing("this works and shouldn't") + self.assertRaises(ValueError, f) + self.assertRaises(ValueError, lambda : obj1 and obj2) + self.assertRaises(ValueError, lambda : obj1 or obj2) + self.assertRaises(ValueError, lambda : not obj1) + + def test_numpy_1_7_compat_numeric_methods(self): + tm._skip_if_not_numpy17_friendly() + + # GH 4435 + # numpy in 1.7 tries to pass addtional arguments to pandas functions + + o = self._construct(shape=4) + for op in ['min','max','max','var','std','prod','sum','cumsum','cumprod', + 'median','skew','kurt','compound','cummax','cummin','all','any']: + f = getattr(np,op,None) + if f is not None: + f(o) + + def test_downcast(self): + # test close downcasting + + o = self._construct(shape=4, value=9, dtype=np.int64) + result = o.copy() + result._data = o._data.downcast(dtypes='infer') + self._compare(result, o) + + o = self._construct(shape=4, value=9.) 
+ expected = o.astype(np.int64) + result = o.copy() + result._data = o._data.downcast(dtypes='infer') + self._compare(result, expected) + + o = self._construct(shape=4, value=9.5) + result = o.copy() + result._data = o._data.downcast(dtypes='infer') + self._compare(result, o) + + # are close + o = self._construct(shape=4, value=9.000000000005) + result = o.copy() + result._data = o._data.downcast(dtypes='infer') + expected = o.astype(np.int64) + self._compare(result, expected) + + def test_constructor_compound_dtypes(self): + # GH 5191 + # compound dtypes should raise not-implementederror + + def f(dtype): + return self._construct(shape=3, dtype=dtype) + + self.assertRaises(NotImplementedError, f, [("A","datetime64[h]"), ("B","str"), ("C","int32")]) + + # these work (though results may be unexpected) + f('int64') + f('float64') + f('M8[ns]') + + def check_metadata(self, x, y=None): + for m in x._metadata: + v = getattr(x,m,None) + if y is None: + self.assertIsNone(v) + else: + self.assertEqual(v, getattr(y,m,None)) + + def test_metadata_propagation(self): + # check that the metadata matches up on the resulting ops + + o = self._construct(shape=3) + o.name = 'foo' + o2 = self._construct(shape=3) + o2.name = 'bar' + + # TODO + # Once panel can do non-trivial combine operations + # (currently there is an a raise in the Panel arith_ops to prevent + # this, though it actually does work) + # can remove all of these try: except: blocks on the actual operations + + + # ---------- + # preserving + # ---------- + + # simple ops with scalars + for op in [ '__add__','__sub__','__truediv__','__mul__' ]: + result = getattr(o,op)(1) + self.check_metadata(o,result) + + # ops with like + for op in [ '__add__','__sub__','__truediv__','__mul__' ]: + try: + result = getattr(o,op)(o) + self.check_metadata(o,result) + except (ValueError, AttributeError): + pass + + # simple boolean + for op in [ '__eq__','__le__', '__ge__' ]: + v1 = getattr(o,op)(o) + self.check_metadata(o,v1) + + try: + self.check_metadata(o, v1 & v1) + except (ValueError): + pass + + try: + self.check_metadata(o, v1 | v1) + except (ValueError): + pass + + # combine_first + try: + result = o.combine_first(o2) + self.check_metadata(o,result) + except (AttributeError): + pass + + # --------------------------- + # non-preserving (by default) + # --------------------------- + + # add non-like + try: + result = o + o2 + self.check_metadata(result) + except (ValueError, AttributeError): + pass + + # simple boolean + for op in [ '__eq__','__le__', '__ge__' ]: + + # this is a name matching op + v1 = getattr(o,op)(o) + + v2 = getattr(o,op)(o2) + self.check_metadata(v2) + + try: + self.check_metadata(v1 & v2) + except (ValueError): + pass + + try: + self.check_metadata(v1 | v2) + except (ValueError): + pass + + def test_head_tail(self): + # GH5370 + + o = self._construct(shape=10) + + # check all index types + for index in [ tm.makeFloatIndex, tm.makeIntIndex, + tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + axis = o._get_axis_name(0) + setattr(o,axis,index(len(getattr(o,axis)))) + + # Panel + dims + try: + o.head() + except (NotImplementedError): + raise nose.SkipTest('not implemented on {0}'.format(o.__class__.__name__)) + + self._compare(o.head(), o.iloc[:5]) + self._compare(o.tail(), o.iloc[-5:]) + + # 0-len + self._compare(o.head(0), o.iloc[:]) + self._compare(o.tail(0), o.iloc[0:]) + + # bounded + self._compare(o.head(len(o)+1), o) + self._compare(o.tail(len(o)+1), o) + + # neg index + 
self._compare(o.head(-3), o.head(7)) + self._compare(o.tail(-3), o.tail(7)) + +class TestSeries(tm.TestCase, Generic): + _typ = Series + _comparator = lambda self, x, y: assert_series_equal(x,y) + + def setUp(self): + self.ts = tm.makeTimeSeries() # Was at top level in test_series + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + def test_rename_mi(self): + s = Series([11,21,31], + index=MultiIndex.from_tuples([("A",x) for x in ["a","B","c"]])) + result = s.rename(str.lower) + + def test_get_numeric_data_preserve_dtype(self): + + # get the numeric data + o = Series([1,2,3]) + result = o._get_numeric_data() + self._compare(result, o) + + o = Series([1,'2',3.]) + result = o._get_numeric_data() + expected = Series([],dtype=object) + self._compare(result, expected) + + o = Series([True,False,True]) + result = o._get_numeric_data() + self._compare(result, o) + + o = Series([True,False,True]) + result = o._get_bool_data() + self._compare(result, o) + + o = Series(date_range('20130101',periods=3)) + result = o._get_numeric_data() + expected = Series([],dtype='M8[ns]') + self._compare(result, expected) + + def test_nonzero_single_element(self): + + # allow single item via bool method + s = Series([True]) + self.assertTrue(s.bool()) + + s = Series([False]) + self.assertFalse(s.bool()) + + # single item nan to raise + for s in [ Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False]) ]: + self.assertRaises(ValueError, lambda : bool(s)) + + for s in [ Series([np.nan]), Series([pd.NaT])]: + self.assertRaises(ValueError, lambda : s.bool()) + + # multiple bool are still an error + for s in [Series([True,True]), Series([False, False])]: + self.assertRaises(ValueError, lambda : bool(s)) + self.assertRaises(ValueError, lambda : s.bool()) + + # single non-bool are an error + for s in [Series([1]), Series([0]), + Series(['a']), Series([0.0])]: + self.assertRaises(ValueError, lambda : bool(s)) + self.assertRaises(ValueError, lambda : s.bool()) + + def test_metadata_propagation_indiv(self): + # check that the metadata matches up on the resulting ops + + o = Series(range(3),range(3)) + o.name = 'foo' + o2 = Series(range(3),range(3)) + o2.name = 'bar' + + result = o.T + self.check_metadata(o,result) + + # resample + ts = Series(np.random.rand(1000), + index=date_range('20130101',periods=1000,freq='s'), + name='foo') + result = ts.resample('1T') + self.check_metadata(ts,result) + + result = ts.resample('1T',how='min') + self.check_metadata(ts,result) + + result = ts.resample('1T',how=lambda x: x.sum()) + self.check_metadata(ts,result) + + _metadata = Series._metadata + _finalize = Series.__finalize__ + Series._metadata = ['name','filename'] + o.filename = 'foo' + o2.filename = 'bar' + + def finalize(self, other, method=None, **kwargs): + for name in self._metadata: + if method == 'concat' and name == 'filename': + value = '+'.join([ getattr(o,name) for o in other.objs if getattr(o,name,None) ]) + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, None)) + + return self + + Series.__finalize__ = finalize + + result = pd.concat([o, o2]) + self.assertEqual(result.filename,'foo+bar') + self.assertIsNone(result.name) + + # reset + Series._metadata = _metadata + Series.__finalize__ = _finalize + + def test_interpolate(self): + ts = Series(np.arange(len(self.ts), dtype=float), self.ts.index) + + ts_copy = ts.copy() + ts_copy[5:10] = np.NaN + + linear_interp = ts_copy.interpolate(method='linear') + 
self.assert_numpy_array_equal(linear_interp, ts) + + ord_ts = Series([d.toordinal() for d in self.ts.index], + index=self.ts.index).astype(float) + + ord_ts_copy = ord_ts.copy() + ord_ts_copy[5:10] = np.NaN + + time_interp = ord_ts_copy.interpolate(method='time') + self.assert_numpy_array_equal(time_interp, ord_ts) + + # try time interpolation on a non-TimeSeries + # Only raises ValueError if there are NaNs. + non_ts = self.series.copy() + non_ts[0] = np.NaN + self.assertRaises(ValueError, non_ts.interpolate, method='time') + + def test_interp_regression(self): + tm._skip_if_no_scipy() + _skip_if_no_pchip() + + ser = Series(np.sort(np.random.uniform(size=100))) + + # interpolate at new_index + new_index = ser.index + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + interp_s = ser.reindex(new_index).interpolate(method='pchip') + # does not blow up, GH5977 + interp_s[49:51] + + def test_interpolate_corners(self): + s = Series([np.nan, np.nan]) + assert_series_equal(s.interpolate(), s) + + s = Series([]).interpolate() + assert_series_equal(s.interpolate(), s) + + tm._skip_if_no_scipy() + s = Series([np.nan, np.nan]) + assert_series_equal(s.interpolate(method='polynomial', order=1), s) + + s = Series([]).interpolate() + assert_series_equal(s.interpolate(method='polynomial', order=1), s) + + def test_interpolate_index_values(self): + s = Series(np.nan, index=np.sort(np.random.rand(30))) + s[::3] = np.random.randn(10) + + vals = s.index.values.astype(float) + + result = s.interpolate(method='index') + + expected = s.copy() + bad = isnull(expected.values) + good = ~bad + expected = Series( + np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad]) + + assert_series_equal(result[bad], expected) + + # 'values' is synonymous with 'index' for the method kwarg + other_result = s.interpolate(method='values') + + assert_series_equal(other_result, result) + assert_series_equal(other_result[bad], expected) + + def test_interpolate_non_ts(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + with tm.assertRaises(ValueError): + s.interpolate(method='time') + + # New interpolation tests + def test_nan_interpolate(self): + s = Series([0, 1, np.nan, 3]) + result = s.interpolate() + expected = Series([0., 1., 2., 3.]) + assert_series_equal(result, expected) + + tm._skip_if_no_scipy() + result = s.interpolate(method='polynomial', order=1) + assert_series_equal(result, expected) + + def test_nan_irregular_index(self): + s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) + result = s.interpolate() + expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9]) + assert_series_equal(result, expected) + + def test_nan_str_index(self): + s = Series([0, 1, 2, np.nan], index=list('abcd')) + result = s.interpolate() + expected = Series([0., 1., 2., 2.], index=list('abcd')) + assert_series_equal(result, expected) + + def test_interp_quad(self): + tm._skip_if_no_scipy() + sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) + result = sq.interpolate(method='quadratic') + expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4]) + assert_series_equal(result, expected) + + def test_interp_scipy_basic(self): + tm._skip_if_no_scipy() + s = Series([1, 3, np.nan, 12, np.nan, 25]) + # slinear + expected = Series([1., 3., 7.5, 12., 18.5, 25.]) + result = s.interpolate(method='slinear') + assert_series_equal(result, expected) + + result = s.interpolate(method='slinear', downcast='infer') + assert_series_equal(result, expected) + # nearest + expected = Series([1, 3, 3, 12, 12, 25]) + result = 
s.interpolate(method='nearest') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='nearest', downcast='infer') + assert_series_equal(result, expected) + # zero + expected = Series([1, 3, 3, 12, 12, 25]) + result = s.interpolate(method='zero') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='zero', downcast='infer') + assert_series_equal(result, expected) + # quadratic + expected = Series([1, 3., 6.769231, 12., 18.230769, 25.]) + result = s.interpolate(method='quadratic') + assert_series_equal(result, expected) + + result = s.interpolate(method='quadratic', downcast='infer') + assert_series_equal(result, expected) + # cubic + expected = Series([1., 3., 6.8, 12., 18.2, 25.]) + result = s.interpolate(method='cubic') + assert_series_equal(result, expected) + + def test_interp_limit(self): + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + expected = Series([1., 3., 5., 7., np.nan, 11.]) + result = s.interpolate(method='linear', limit=2) + assert_series_equal(result, expected) + + def test_interp_all_good(self): + # scipy + tm._skip_if_no_scipy() + s = Series([1, 2, 3]) + result = s.interpolate(method='polynomial', order=1) + assert_series_equal(result, s) + + # non-scipy + result = s.interpolate() + assert_series_equal(result, s) + + def test_interp_multiIndex(self): + idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + s = Series([1, 2, np.nan], index=idx) + + expected = s.copy() + expected.loc[2] = 2 + result = s.interpolate() + assert_series_equal(result, expected) + + tm._skip_if_no_scipy() + with tm.assertRaises(ValueError): + s.interpolate(method='polynomial', order=1) + + def test_interp_nonmono_raise(self): + tm._skip_if_no_scipy() + s = Series([1, np.nan, 3], index=[0, 2, 1]) + with tm.assertRaises(ValueError): + s.interpolate(method='krogh') + + def test_interp_datetime64(self): + tm._skip_if_no_scipy() + df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3)) + result = df.interpolate(method='nearest') + expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3)) + assert_series_equal(result, expected) + + def test_describe(self): + _ = self.series.describe() + _ = self.ts.describe() + + def test_describe_percentiles(self): + with tm.assert_produces_warning(FutureWarning): + desc = self.series.describe(percentile_width=50) + assert '75%' in desc.index + assert '25%' in desc.index + + with tm.assert_produces_warning(FutureWarning): + desc = self.series.describe(percentile_width=95) + assert '97.5%' in desc.index + assert '2.5%' in desc.index + + def test_describe_objects(self): + s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a']) + result = s.describe() + expected = Series({'count': 7, 'unique': 4, + 'top': 'a', 'freq': 3}, index=result.index) + assert_series_equal(result, expected) + + dt = list(self.ts.index) + dt.append(dt[0]) + ser = Series(dt) + rs = ser.describe() + min_date = min(dt) + max_date = max(dt) + xp = Series({'count': len(dt), + 'unique': len(self.ts.index), + 'first': min_date, 'last': max_date, 'freq': 2, + 'top': min_date}, index=rs.index) + assert_series_equal(rs, xp) + + def test_describe_empty(self): + result = pd.Series().describe() + + self.assertEqual(result['count'], 0) + self.assertTrue(result.drop('count').isnull().all()) + + nanSeries = Series([np.nan]) + nanSeries.name = 'NaN' + result = nanSeries.describe() + self.assertEqual(result['count'], 0) + self.assertTrue(result.drop('count').isnull().all()) + + def 
test_describe_none(self): + noneSeries = Series([None]) + noneSeries.name = 'None' + assert_series_equal(noneSeries.describe(), + Series([0, 0], index=['count', 'unique'])) + + +class TestDataFrame(tm.TestCase, Generic): + _typ = DataFrame + _comparator = lambda self, x, y: assert_frame_equal(x,y) + + def test_rename_mi(self): + df = DataFrame([11,21,31], + index=MultiIndex.from_tuples([("A",x) for x in ["a","B","c"]])) + result = df.rename(str.lower) + + def test_nonzero_single_element(self): + + # allow single item via bool method + df = DataFrame([[True]]) + self.assertTrue(df.bool()) + + df = DataFrame([[False]]) + self.assertFalse(df.bool()) + + df = DataFrame([[False, False]]) + self.assertRaises(ValueError, lambda : df.bool()) + self.assertRaises(ValueError, lambda : bool(df)) + + def test_get_numeric_data_preserve_dtype(self): + + # get the numeric data + o = DataFrame({'A': [1, '2', 3.]}) + result = o._get_numeric_data() + expected = DataFrame(index=[0, 1, 2], dtype=object) + self._compare(result, expected) + + def test_interp_basic(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + result = df.interpolate() + assert_frame_equal(result, expected) + + result = df.set_index('C').interpolate() + expected = df.set_index('C') + expected.A.loc[3] = 3 + expected.B.loc[5] = 9 + assert_frame_equal(result, expected) + + def test_interp_bad_method(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + with tm.assertRaises(ValueError): + df.interpolate(method='not_a_method') + + def test_interp_combo(self): + df = DataFrame({'A': [1., 2., np.nan, 4.], 'B': [1, 4, 9, np.nan], + 'C': [1, 2, 3, 5], 'D': list('abcd')}) + + result = df['A'].interpolate() + expected = Series([1., 2., 3., 4.]) + assert_series_equal(result, expected) + + result = df['A'].interpolate(downcast='infer') + expected = Series([1, 2, 3, 4]) + assert_series_equal(result, expected) + + def test_interp_nan_idx(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) + df = df.set_index('A') + with tm.assertRaises(NotImplementedError): + df.interpolate(method='values') + + def test_interp_various(self): + tm._skip_if_no_scipy() + df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], + 'C': [1, 2, 3, 5, 8, 13, 21]}) + df = df.set_index('C') + expected = df.copy() + result = df.interpolate(method='polynomial', order=1) + + expected.A.loc[3] = 2.66666667 + expected.A.loc[13] = 5.76923076 + assert_frame_equal(result, expected) + + result = df.interpolate(method='cubic') + expected.A.loc[3] = 2.81621174 + expected.A.loc[13] = 5.64146581 + assert_frame_equal(result, expected) + + result = df.interpolate(method='nearest') + expected.A.loc[3] = 2 + expected.A.loc[13] = 5 + assert_frame_equal(result, expected, check_dtype=False) + + result = df.interpolate(method='quadratic') + expected.A.loc[3] = 2.82533638 + expected.A.loc[13] = 6.02817974 + assert_frame_equal(result, expected) + + result = df.interpolate(method='slinear') + expected.A.loc[3] = 2.66666667 + expected.A.loc[13] = 5.76923077 + assert_frame_equal(result, expected) + + result = df.interpolate(method='zero') + expected.A.loc[3] = 2. 
+ expected.A.loc[13] = 5 + assert_frame_equal(result, expected, check_dtype=False) + + result = df.interpolate(method='quadratic') + expected.A.loc[3] = 2.82533638 + expected.A.loc[13] = 6.02817974 + assert_frame_equal(result, expected) + + def test_interp_alt_scipy(self): + tm._skip_if_no_scipy() + df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], + 'C': [1, 2, 3, 5, 8, 13, 21]}) + result = df.interpolate(method='barycentric') + expected = df.copy() + expected['A'].iloc[2] = 3 + expected['A'].iloc[5] = 6 + assert_frame_equal(result, expected) + + result = df.interpolate(method='barycentric', downcast='infer') + assert_frame_equal(result, expected.astype(np.int64)) + + result = df.interpolate(method='krogh') + expectedk = df.copy() + # expectedk['A'].iloc[2] = 3 + # expectedk['A'].iloc[5] = 6 + expectedk['A'] = expected['A'] + assert_frame_equal(result, expectedk) + + _skip_if_no_pchip() + result = df.interpolate(method='pchip') + expected['A'].iloc[2] = 3 + expected['A'].iloc[5] = 6.125 + assert_frame_equal(result, expected) + + def test_interp_rowwise(self): + df = DataFrame({0: [1, 2, np.nan, 4], + 1: [2, 3, 4, np.nan], + 2: [np.nan, 4, 5, 6], + 3: [4, np.nan, 6, 7], + 4: [1, 2, 3, 4]}) + result = df.interpolate(axis=1) + expected = df.copy() + expected[1].loc[3] = 5 + expected[2].loc[0] = 3 + expected[3].loc[1] = 3 + expected[4] = expected[4].astype(np.float64) + assert_frame_equal(result, expected) + + # scipy route + tm._skip_if_no_scipy() + result = df.interpolate(axis=1, method='values') + assert_frame_equal(result, expected) + + result = df.interpolate(axis=0) + expected = df.interpolate() + assert_frame_equal(result, expected) + + def test_rowwise_alt(self): + df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64], + 1: [1, 2, 3, 4, 3, 2, 1, 0, -1]}) + df.interpolate(axis=0) + + def test_interp_leading_nans(self): + df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0], + "B": [np.nan, -3, -3.5, np.nan, -4]}) + result = df.interpolate() + expected = df.copy() + expected['B'].loc[3] = -3.75 + assert_frame_equal(result, expected) + + tm._skip_if_no_scipy() + result = df.interpolate(method='polynomial', order=1) + assert_frame_equal(result, expected) + + def test_interp_raise_on_only_mixed(self): + df = DataFrame({'A': [1, 2, np.nan, 4], 'B': ['a', 'b', 'c', 'd'], + 'C': [np.nan, 2, 5, 7], 'D': [np.nan, np.nan, 9, 9], + 'E': [1, 2, 3, 4]}) + with tm.assertRaises(TypeError): + df.interpolate(axis=1) + + def test_interp_inplace(self): + df = DataFrame({'a': [1., 2., np.nan, 4.]}) + expected = DataFrame({'a': [1., 2., 3., 4.]}) + result = df.copy() + result['a'].interpolate(inplace=True) + assert_frame_equal(result, expected) + + result = df.copy() + result['a'].interpolate(inplace=True, downcast='infer') + assert_frame_equal(result, expected.astype('int64')) + + def test_interp_ignore_all_good(self): + # GH + df = DataFrame({'A': [1, 2, np.nan, 4], + 'B': [1, 2, 3, 4], + 'C': [1., 2., np.nan, 4.], + 'D': [1., 2., 3., 4.]}) + expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float64'), + 'B': np.array([1, 2, 3, 4], dtype='int64'), + 'C': np.array([1., 2., 3, 4.], dtype='float64'), + 'D': np.array([1., 2., 3., 4.], dtype='float64')}) + + result = df.interpolate(downcast=None) + assert_frame_equal(result, expected) + + # all good + result = df[['B', 'D']].interpolate(downcast=None) + assert_frame_equal(result, df[['B', 'D']]) + + def test_describe(self): + desc = tm.makeDataFrame().describe() + desc = tm.makeMixedDataFrame().describe() + desc = 
tm.makeTimeDataFrame().describe()
+
+    def test_describe_percentiles(self):
+        with tm.assert_produces_warning(FutureWarning):
+            desc = tm.makeDataFrame().describe(percentile_width=50)
+        assert '75%' in desc.index
+        assert '25%' in desc.index
+
+        with tm.assert_produces_warning(FutureWarning):
+            desc = tm.makeDataFrame().describe(percentile_width=95)
+        assert '97.5%' in desc.index
+        assert '2.5%' in desc.index
+
+    def test_describe_quantiles_both(self):
+        with tm.assertRaises(ValueError):
+            tm.makeDataFrame().describe(percentile_width=50,
+                                        percentiles=[25, 75])
+
+    def test_describe_percentiles_percent_or_raw(self):
+        df = tm.makeDataFrame()
+        with tm.assertRaises(ValueError):
+            df.describe(percentiles=[10, 50, 100])
+
+    def test_describe_percentiles_equivalence(self):
+        df = tm.makeDataFrame()
+        d1 = df.describe()
+        d2 = df.describe(percentiles=[.25, .75])
+        assert_frame_equal(d1, d2)
+
+    def test_describe_percentiles_insert_median(self):
+        df = tm.makeDataFrame()
+        d1 = df.describe(percentiles=[.25, .75])
+        d2 = df.describe(percentiles=[.25, .5, .75])
+        assert_frame_equal(d1, d2)
+
+        # none above
+        d1 = df.describe(percentiles=[.25, .45])
+        d2 = df.describe(percentiles=[.25, .45, .5])
+        assert_frame_equal(d1, d2)
+
+        # none below
+        d1 = df.describe(percentiles=[.75, 1])
+        d2 = df.describe(percentiles=[.5, .75, 1])
+        assert_frame_equal(d1, d2)
+
+    def test_describe_no_numeric(self):
+        df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
+                        'B': ['a', 'b', 'c', 'd'] * 6})
+        desc = df.describe()
+        expected = DataFrame(dict((k, v.describe())
+                                  for k, v in compat.iteritems(df)),
+                             columns=df.columns)
+        assert_frame_equal(desc, expected)
+
+        ts = tm.makeTimeSeries()
+        df = DataFrame({'time': ts.index})
+        desc = df.describe()
+        self.assertEqual(desc.time['first'], min(ts.index))
+
+    def test_describe_empty_int_columns(self):
+        df = DataFrame([[0, 1], [1, 2]])
+        desc = df[df[0] < 0].describe()  # works
+        assert_series_equal(desc.xs('count'),
+                            Series([0, 0], dtype=float, name='count'))
+        self.assertTrue(isnull(desc.ix[1:]).all().all())
+
+    def test_describe_objects(self):
+        df = DataFrame({"C1": ['a', 'a', 'c'], "C2": ['d', 'd', 'f']})
+        result = df.describe()
+        expected = DataFrame({"C1": [3, 2, 'a', 2], "C2": [3, 2, 'd', 2]},
+                             index=['count', 'unique', 'top', 'freq'])
+        assert_frame_equal(result, expected)
+
+        df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')})
+        df.loc[4] = pd.Timestamp('2010-01-04')
+        result = df.describe()
+        expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-01'),
+                                     pd.Timestamp('2010-01-04'),
+                                     pd.Timestamp('2010-01-04'), 2]},
+                             index=['count', 'unique', 'first', 'last', 'top',
+                                    'freq'])
+        assert_frame_equal(result, expected)
+
+        # mix time and str
+        df['C2'] = ['a', 'a', 'b', 'c', 'a']
+        result = df.describe()
+        # when there is a mix of datetime / obj the index gets reordered.
+ expected['C2'] = [5, 3, np.nan, np.nan, 'a', 3] + assert_frame_equal(result, expected) + + # just str + expected = DataFrame({'C2': [5, 3, 'a', 4]}, + index=['count', 'unique', 'top', 'freq']) + result = df[['C2']].describe() + + # mix of time, str, numeric + df['C3'] = [2, 4, 6, 8, 2] + result = df.describe() + expected = DataFrame({"C3": [5., 4.4, 2.607681, 2., 2., 4., 6., 8.]}, + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + assert_frame_equal(result, expected) + assert_frame_equal(df.describe(), df[['C3']].describe()) + + assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe()) + assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe()) + + def test_no_order(self): + tm._skip_if_no_scipy() + s = Series([0, 1, np.nan, 3]) + with tm.assertRaises(ValueError): + s.interpolate(method='polynomial') + with tm.assertRaises(ValueError): + s.interpolate(method='spline') + + def test_spline(self): + tm._skip_if_no_scipy() + s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) + result = s.interpolate(method='spline', order=1) + expected = Series([1., 2., 3., 4., 5., 6., 7.]) + assert_series_equal(result, expected) + + def test_metadata_propagation_indiv(self): + + # groupby + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + result = df.groupby('A').sum() + self.check_metadata(df,result) + + # resample + df = DataFrame(np.random.randn(1000,2), + index=date_range('20130101',periods=1000,freq='s')) + result = df.resample('1T') + self.check_metadata(df,result) + + # merging with override + # GH 6923 + _metadata = DataFrame._metadata + _finalize = DataFrame.__finalize__ + + np.random.seed(10) + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b']) + df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd']) + DataFrame._metadata = ['filename'] + df1.filename = 'fname1.csv' + df2.filename = 'fname2.csv' + + def finalize(self, other, method=None, **kwargs): + + for name in self._metadata: + if method == 'merge': + left, right = other.left, other.right + value = getattr(left, name, '') + '|' + getattr(right, name, '') + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, '')) + + return self + + DataFrame.__finalize__ = finalize + result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner') + self.assertEqual(result.filename,'fname1.csv|fname2.csv') + + # concat + # GH 6927 + DataFrame._metadata = ['filename'] + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list('ab')) + df1.filename = 'foo' + + def finalize(self, other, method=None, **kwargs): + for name in self._metadata: + if method == 'concat': + value = '+'.join([ getattr(o,name) for o in other.objs if getattr(o,name,None) ]) + object.__setattr__(self, name, value) + else: + object.__setattr__(self, name, getattr(other, name, None)) + + return self + + DataFrame.__finalize__ = finalize + + result = pd.concat([df1, df1]) + self.assertEqual(result.filename,'foo+foo') + + # reset + DataFrame._metadata = _metadata + DataFrame.__finalize__ = _finalize + +class TestPanel(tm.TestCase, Generic): + _typ = Panel + _comparator = lambda self, x, y: assert_panel_equal(x, y) + + +class TestNDFrame(tm.TestCase): + # tests that don't fit elsewhere + + def test_squeeze(self): + # noop + for s in [ tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries() ]: + 
tm.assert_series_equal(s.squeeze(),s) + for df in [ tm.makeTimeDataFrame() ]: + tm.assert_frame_equal(df.squeeze(),df) + for p in [ tm.makePanel() ]: + tm.assert_panel_equal(p.squeeze(),p) + for p4d in [ tm.makePanel4D() ]: + tm.assert_panel4d_equal(p4d.squeeze(),p4d) + + # squeezing + df = tm.makeTimeDataFrame().reindex(columns=['A']) + tm.assert_series_equal(df.squeeze(),df['A']) + + p = tm.makePanel().reindex(items=['ItemA']) + tm.assert_frame_equal(p.squeeze(),p['ItemA']) + + p = tm.makePanel().reindex(items=['ItemA'],minor_axis=['A']) + tm.assert_series_equal(p.squeeze(),p.ix['ItemA',:,'A']) + + p4d = tm.makePanel4D().reindex(labels=['label1']) + tm.assert_panel_equal(p4d.squeeze(),p4d['label1']) + + p4d = tm.makePanel4D().reindex(labels=['label1'],items=['ItemA']) + tm.assert_frame_equal(p4d.squeeze(),p4d.ix['label1','ItemA']) + + def test_equals(self): + s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) + s2 = s1.copy() + self.assertTrue(s1.equals(s2)) + + s1[1] = 99 + self.assertFalse(s1.equals(s2)) + + # NaNs compare as equal + s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3]) + s2 = s1.copy() + self.assertTrue(s1.equals(s2)) + + s2[0] = 9.9 + self.assertFalse(s1.equals(s2)) + + idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + s1 = Series([1, 2, np.nan], index=idx) + s2 = s1.copy() + self.assertTrue(s1.equals(s2)) + + # Add object dtype column with nans + index = np.random.random(10) + df1 = DataFrame(np.random.random(10,), index=index, columns=['floats']) + df1['text'] = 'the sky is so blue. we could use more chocolate.'.split() + df1['start'] = date_range('2000-1-1', periods=10, freq='T') + df1['end'] = date_range('2000-1-1', periods=10, freq='D') + df1['diff'] = df1['end'] - df1['start'] + df1['bool'] = (np.arange(10) % 3 == 0) + df1.ix[::2] = nan + df2 = df1.copy() + self.assertTrue(df1['text'].equals(df2['text'])) + self.assertTrue(df1['start'].equals(df2['start'])) + self.assertTrue(df1['end'].equals(df2['end'])) + self.assertTrue(df1['diff'].equals(df2['diff'])) + self.assertTrue(df1['bool'].equals(df2['bool'])) + self.assertTrue(df1.equals(df2)) + self.assertFalse(df1.equals(object)) + + # different dtype + different = df1.copy() + different['floats'] = different['floats'].astype('float32') + self.assertFalse(df1.equals(different)) + + # different index + different_index = -index + different = df2.set_index(different_index) + self.assertFalse(df1.equals(different)) + + # different columns + different = df2.copy() + different.columns = df2.columns[::-1] + self.assertFalse(df1.equals(different)) + + # DatetimeIndex + index = pd.date_range('2000-1-1', periods=10, freq='T') + df1 = df1.set_index(index) + df2 = df1.copy() + self.assertTrue(df1.equals(df2)) + + # MultiIndex + df3 = df1.set_index(['text'], append=True) + df2 = df1.set_index(['text'], append=True) + self.assertTrue(df3.equals(df2)) + + df2 = df1.set_index(['floats'], append=True) + self.assertFalse(df3.equals(df2)) + + # NaN in index + df3 = df1.set_index(['floats'], append=True) + df2 = df1.set_index(['floats'], append=True) + self.assertTrue(df3.equals(df2)) + + def test_describe_raises(self): + with tm.assertRaises(NotImplementedError): + tm.makePanel().describe() + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py new file mode 100644 index 00000000..00045e88 --- /dev/null +++ b/pandas/tests/test_graphics.py @@ -0,0 +1,2578 @@ +#!/usr/bin/env python +# 
coding: utf-8 + +import nose +import itertools +import os +import string +from distutils.version import LooseVersion + +from datetime import datetime, date + +from pandas import Series, DataFrame, MultiIndex, PeriodIndex, date_range +from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip, + iteritems, OrderedDict) +from pandas.util.decorators import cache_readonly +import pandas.core.common as com +import pandas.util.testing as tm +from pandas.util.testing import ensure_clean +from pandas.core.config import set_option + + +import numpy as np +from numpy import random +from numpy.random import rand, randn + +from numpy.testing import assert_array_equal, assert_allclose +from numpy.testing.decorators import slow +import pandas.tools.plotting as plotting + + +def _skip_if_no_scipy_gaussian_kde(): + try: + import scipy + from scipy.stats import gaussian_kde + except ImportError: + raise nose.SkipTest("scipy version doesn't support gaussian_kde") + +def _ok_for_gaussian_kde(kind): + if kind in ['kde','density']: + try: + import scipy + from scipy.stats import gaussian_kde + except ImportError: + return False + return True + +@tm.mplskip +class TestPlotBase(tm.TestCase): + + def setUp(self): + + import matplotlib as mpl + mpl.rcdefaults() + + n = 100 + with tm.RNGContext(42): + gender = tm.choice(['Male', 'Female'], size=n) + classroom = tm.choice(['A', 'B', 'C'], size=n) + + self.hist_df = DataFrame({'gender': gender, + 'classroom': classroom, + 'height': random.normal(66, 4, size=n), + 'weight': random.normal(161, 32, size=n), + 'category': random.randint(4, size=n)}) + + def tearDown(self): + tm.close() + + @cache_readonly + def plt(self): + import matplotlib.pyplot as plt + return plt + + @cache_readonly + def colorconverter(self): + import matplotlib.colors as colors + return colors.colorConverter + + def _check_legend_labels(self, axes, labels=None, visible=True): + """ + Check each axes has expected legend labels + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + labels : list-like + expected legend labels + visible : bool + expected legend visibility. 
labels are checked only when visible is True
+        """
+
+        if visible and (labels is None):
+            raise ValueError('labels must be specified when visible is True')
+        axes = self._flatten_visible(axes)
+        for ax in axes:
+            if visible:
+                self.assertTrue(ax.get_legend() is not None)
+                self._check_text_labels(ax.get_legend().get_texts(), labels)
+            else:
+                self.assertTrue(ax.get_legend() is None)
+
+    def _check_data(self, xp, rs):
+        """
+        Check each axes has identical lines
+
+        Parameters
+        ----------
+        xp : matplotlib Axes object
+        rs : matplotlib Axes object
+        """
+        xp_lines = xp.get_lines()
+        rs_lines = rs.get_lines()
+
+        def check_line(xpl, rsl):
+            xpdata = xpl.get_xydata()
+            rsdata = rsl.get_xydata()
+            assert_allclose(xpdata, rsdata)
+
+        self.assertEqual(len(xp_lines), len(rs_lines))
+        [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)]
+        tm.close()
+
+    def _check_visible(self, collections, visible=True):
+        """
+        Check each artist is visible or not
+
+        Parameters
+        ----------
+        collections : list-like
+            list or collection of target artist
+        visible : bool
+            expected visibility
+        """
+
+        for patch in collections:
+            self.assertEqual(patch.get_visible(), visible)
+
+    def _get_colors_mapped(self, series, colors):
+        unique = series.unique()
+        # the number of unique values and the number of colors can differ
+        # depending on the slice value
+        mapped = dict(zip(unique, colors))
+        return [mapped[v] for v in series.values]
+
+    def _check_colors(self, collections, linecolors=None, facecolors=None,
+                      mapping=None):
+        """
+        Check each artist has expected line colors and face colors
+
+        Parameters
+        ----------
+        collections : list-like
+            list or collection of target artist
+        linecolors : list-like which has the same length as collections
+            list of expected line colors
+        facecolors : list-like which has the same length as collections
+            list of expected face colors
+        mapping : Series
+            Series used for color grouping key
+            used for andrews_curves, parallel_coordinates, radviz tests
+        """
+
+        from matplotlib.lines import Line2D
+        from matplotlib.collections import Collection
+        conv = self.colorconverter
+        if linecolors is not None:
+
+            if mapping is not None:
+                linecolors = self._get_colors_mapped(mapping, linecolors)
+                linecolors = linecolors[:len(collections)]
+
+            self.assertEqual(len(collections), len(linecolors))
+            for patch, color in zip(collections, linecolors):
+                if isinstance(patch, Line2D):
+                    result = patch.get_color()
+                    # Line2D may contain a string color expression
+                    result = conv.to_rgba(result)
+                else:
+                    result = patch.get_edgecolor()
+
+                expected = conv.to_rgba(color)
+                self.assertEqual(result, expected)
+
+        if facecolors is not None:
+
+            if mapping is not None:
+                facecolors = self._get_colors_mapped(mapping, facecolors)
+                facecolors = facecolors[:len(collections)]
+
+            self.assertEqual(len(collections), len(facecolors))
+            for patch, color in zip(collections, facecolors):
+                if isinstance(patch, Collection):
+                    # returned as list of np.array
+                    result = patch.get_facecolor()[0]
+                else:
+                    result = patch.get_facecolor()
+
+                if isinstance(result, np.ndarray):
+                    result = tuple(result)
+
+                expected = conv.to_rgba(color)
+                self.assertEqual(result, expected)
+
+    def _check_text_labels(self, texts, expected):
+        """
+        Check each text has expected labels
+
+        Parameters
+        ----------
+        texts : matplotlib Text object, or its list-like
+            target text, or its list
+        expected : str or list-like which has the same length as texts
+            expected text label, or its list
+        """
+        if not com.is_list_like(texts):
+            self.assertEqual(texts.get_text(),
expected) + else: + labels = [t.get_text() for t in texts] + self.assertEqual(len(labels), len(expected)) + for l, e in zip(labels, expected): + self.assertEqual(l, e) + + def _check_ticks_props(self, axes, xlabelsize=None, xrot=None, + ylabelsize=None, yrot=None): + """ + Check each axes has expected tick properties + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xlabelsize : number + expected xticks font size + xrot : number + expected xticks rotation + ylabelsize : number + expected yticks font size + yrot : number + expected yticks rotation + """ + axes = self._flatten_visible(axes) + for ax in axes: + if xlabelsize or xrot: + xtick = ax.get_xticklabels()[0] + if xlabelsize is not None: + self.assertAlmostEqual(xtick.get_fontsize(), xlabelsize) + if xrot is not None: + self.assertAlmostEqual(xtick.get_rotation(), xrot) + + if ylabelsize or yrot: + ytick = ax.get_yticklabels()[0] + if ylabelsize is not None: + self.assertAlmostEqual(ytick.get_fontsize(), ylabelsize) + if yrot is not None: + self.assertAlmostEqual(ytick.get_rotation(), yrot) + + def _check_ax_scales(self, axes, xaxis='linear', yaxis='linear'): + """ + Check each axes has expected scales + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xaxis : {'linear', 'log'} + expected xaxis scale + yaxis : {'linear', 'log'} + expected yaxis scale + """ + axes = self._flatten_visible(axes) + for ax in axes: + self.assertEqual(ax.xaxis.get_scale(), xaxis) + self.assertEqual(ax.yaxis.get_scale(), yaxis) + + def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=(8.0, 6.0)): + """ + Check expected number of axes is drawn in expected layout + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + axes_num : number + expected number of axes. Unnecessary axes should be set to invisible. + layout : tuple + expected layout, (expected number of rows , columns) + figsize : tuple + expected figsize. 
default is matplotlib default + """ + visible_axes = self._flatten_visible(axes) + + if axes_num is not None: + self.assertEqual(len(visible_axes), axes_num) + for ax in visible_axes: + # check something drawn on visible axes + self.assertTrue(len(ax.get_children()) > 0) + + if layout is not None: + result = self._get_axes_layout(plotting._flatten(axes)) + self.assertEqual(result, layout) + + self.assert_numpy_array_equal(np.round(visible_axes[0].figure.get_size_inches()), + np.array(figsize)) + + def _get_axes_layout(self, axes): + x_set = set() + y_set = set() + for ax in axes: + # check axes coordinates to estimate layout + points = ax.get_position().get_points() + x_set.add(points[0][0]) + y_set.add(points[0][1]) + return (len(y_set), len(x_set)) + + def _flatten_visible(self, axes): + """ + Flatten axes, and filter only visible + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + + """ + axes = plotting._flatten(axes) + axes = [ax for ax in axes if ax.get_visible()] + return axes + + def _check_has_errorbars(self, axes, xerr=0, yerr=0): + """ + Check axes has expected number of errorbars + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xerr : number + expected number of x errorbar + yerr : number + expected number of y errorbar + """ + + axes = self._flatten_visible(axes) + for ax in axes: + containers = ax.containers + xerr_count = 0 + yerr_count = 0 + for c in containers: + has_xerr = getattr(c, 'has_xerr', False) + has_yerr = getattr(c, 'has_yerr', False) + if has_xerr: + xerr_count += 1 + if has_yerr: + yerr_count += 1 + self.assertEqual(xerr, xerr_count) + self.assertEqual(yerr, yerr_count) + + def _check_box_return_type(self, returned, return_type, expected_keys=None): + """ + Check box returned type is correct + + Parameters + ---------- + returned : object to be tested, returned from boxplot + return_type : str + return_type passed to boxplot + expected_keys : list-like, optional + group labels in subplot case. 
If not passed, + the function checks assuming boxplot uses single ax + """ + from matplotlib.axes import Axes + types = {'dict': dict, 'axes': Axes, 'both': tuple} + if expected_keys is None: + # should be fixed when the returning default is changed + if return_type is None: + return_type = 'dict' + + self.assertTrue(isinstance(returned, types[return_type])) + if return_type == 'both': + self.assertIsInstance(returned.ax, Axes) + self.assertIsInstance(returned.lines, dict) + else: + # should be fixed when the returning default is changed + if return_type is None: + for r in self._flatten_visible(returned): + self.assertIsInstance(r, Axes) + return + + self.assertTrue(isinstance(returned, OrderedDict)) + self.assertEqual(sorted(returned.keys()), sorted(expected_keys)) + for key, value in iteritems(returned): + self.assertTrue(isinstance(value, types[return_type])) + # check returned dict has correct mapping + if return_type == 'axes': + self.assertEqual(value.get_title(), key) + elif return_type == 'both': + self.assertEqual(value.ax.get_title(), key) + self.assertIsInstance(value.ax, Axes) + self.assertIsInstance(value.lines, dict) + elif return_type == 'dict': + line = value['medians'][0] + self.assertEqual(line.get_axes().get_title(), key) + else: + raise AssertionError + + +@tm.mplskip +class TestSeriesPlots(TestPlotBase): + + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.mpl_le_1_2_1 = str(mpl.__version__) <= LooseVersion('1.2.1') + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.iseries = tm.makePeriodSeries() + self.iseries.name = 'iseries' + + @slow + def test_plot(self): + _check_plot_works(self.ts.plot, label='foo') + _check_plot_works(self.ts.plot, use_index=False) + axes = _check_plot_works(self.ts.plot, rot=0) + self._check_ticks_props(axes, xrot=0) + + ax = _check_plot_works(self.ts.plot, style='.', logy=True) + self._check_ax_scales(ax, yaxis='log') + + ax = _check_plot_works(self.ts.plot, style='.', logx=True) + self._check_ax_scales(ax, xaxis='log') + + ax = _check_plot_works(self.ts.plot, style='.', loglog=True) + self._check_ax_scales(ax, xaxis='log', yaxis='log') + + _check_plot_works(self.ts[:10].plot, kind='bar') + _check_plot_works(self.ts.plot, kind='area', stacked=False) + _check_plot_works(self.iseries.plot) + + for kind in ['line', 'bar', 'barh', 'kde']: + if not _ok_for_gaussian_kde(kind): + continue + _check_plot_works(self.series[:5].plot, kind=kind) + + _check_plot_works(self.series[:10].plot, kind='barh') + ax = _check_plot_works(Series(randn(10)).plot, kind='bar', color='black') + self._check_colors([ax.patches[0]], facecolors=['black']) + + # GH 6951 + ax = _check_plot_works(self.ts.plot, subplots=True) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + + @slow + def test_plot_figsize_and_title(self): + # figsize and title + ax = self.series.plot(title='Test', figsize=(16, 8)) + self._check_text_labels(ax.title, 'Test') + self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) + + def test_ts_line_lim(self): + ax = self.ts.plot() + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) + self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1]) + tm.close() + + ax = self.ts.plot(secondary_y=True) + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) + self.assertEqual(xmax, 
lines[0].get_data(orig=False)[0][-1]) + + def test_ts_area_lim(self): + ax = self.ts.plot(kind='area', stacked=False) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + tm.close() + + # GH 7471 + ax = self.ts.plot(kind='area', stacked=False, x_compat=True) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + tm.close() + + tz_ts = self.ts.copy() + tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') + ax = tz_ts.plot(kind='area', stacked=False, x_compat=True) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + tm.close() + + ax = tz_ts.plot(kind='area', stacked=False, secondary_y=True) + xmin, xmax = ax.get_xlim() + line = ax.get_lines()[0].get_data(orig=False)[0] + self.assertEqual(xmin, line[0]) + self.assertEqual(xmax, line[-1]) + + def test_line_area_nan_series(self): + values = [1, 2, np.nan, 3] + s = Series(values) + ts = Series(values, index=tm.makeDateIndex(k=4)) + + for d in [s, ts]: + ax = _check_plot_works(d.plot) + masked = ax.lines[0].get_ydata() + # remove nan for comparison purpose + self.assert_numpy_array_equal(np.delete(masked.data, 2), np.array([1, 2, 3])) + self.assert_numpy_array_equal(masked.mask, np.array([False, False, True, False])) + + expected = np.array([1, 2, 0, 3]) + ax = _check_plot_works(d.plot, stacked=True) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + ax = _check_plot_works(d.plot, kind='area') + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + ax = _check_plot_works(d.plot, kind='area', stacked=False) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + + @slow + def test_bar_log(self): + expected = np.array([1., 10., 100., 1000.]) + + if not self.mpl_le_1_2_1: + expected = np.hstack((.1, expected, 1e4)) + + ax = Series([200, 500]).plot(log=True, kind='bar') + assert_array_equal(ax.yaxis.get_ticklocs(), expected) + + @slow + def test_bar_ignore_index(self): + df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + ax = df.plot(kind='bar', use_index=False) + self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) + + def test_rotation(self): + df = DataFrame(randn(5, 5)) + axes = df.plot(rot=30) + self._check_ticks_props(axes, xrot=30) + + def test_irregular_datetime(self): + rng = date_range('1/1/2000', '3/1/2000') + rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] + ser = Series(randn(len(rng)), rng) + ax = ser.plot() + xp = datetime(1999, 1, 1).toordinal() + ax.set_xlim('1/1/1999', '1/1/2001') + self.assertEqual(xp, ax.get_xlim()[0]) + + @slow + def test_pie_series(self): + # if sum of values is less than 1.0, pie handle them as rate and draw semicircle. 
+ series = Series(np.random.randint(1, 5), + index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') + ax = _check_plot_works(series.plot, kind='pie') + self._check_text_labels(ax.texts, series.index) + self.assertEqual(ax.get_ylabel(), 'YLABEL') + + # without wedge labels + ax = _check_plot_works(series.plot, kind='pie', labels=None) + self._check_text_labels(ax.texts, [''] * 5) + + # with less colors than elements + color_args = ['r', 'g', 'b'] + ax = _check_plot_works(series.plot, kind='pie', colors=color_args) + + color_expected = ['r', 'g', 'b', 'r', 'g'] + self._check_colors(ax.patches, facecolors=color_expected) + + # with labels and colors + labels = ['A', 'B', 'C', 'D', 'E'] + color_args = ['r', 'g', 'b', 'c', 'm'] + ax = _check_plot_works(series.plot, kind='pie', labels=labels, colors=color_args) + self._check_text_labels(ax.texts, labels) + self._check_colors(ax.patches, facecolors=color_args) + + # with autopct and fontsize + ax = _check_plot_works(series.plot, kind='pie', colors=color_args, + autopct='%.2f', fontsize=7) + pcts = ['{0:.2f}'.format(s * 100) for s in series.values / float(series.sum())] + iters = [iter(series.index), iter(pcts)] + expected_texts = list(next(it) for it in itertools.cycle(iters)) + self._check_text_labels(ax.texts, expected_texts) + for t in ax.texts: + self.assertEqual(t.get_fontsize(), 7) + + # includes negative value + with tm.assertRaises(ValueError): + series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) + series.plot(kind='pie') + + # includes nan + series = Series([1, 2, np.nan, 4], + index=['a', 'b', 'c', 'd'], name='YLABEL') + ax = _check_plot_works(series.plot, kind='pie') + self._check_text_labels(ax.texts, series.index) + + @slow + def test_hist(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + _check_plot_works(self.ts.hist, figsize=(8, 10)) + _check_plot_works(self.ts.hist, by=self.ts.index.month) + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) + + fig, ax = self.plt.subplots(1, 1) + _check_plot_works(self.ts.hist, ax=ax) + _check_plot_works(self.ts.hist, ax=ax, figure=fig) + _check_plot_works(self.ts.hist, figure=fig) + tm.close() + + fig, (ax1, ax2) = self.plt.subplots(1, 2) + _check_plot_works(self.ts.hist, figure=fig, ax=ax1) + _check_plot_works(self.ts.hist, figure=fig, ax=ax2) + + with tm.assertRaises(ValueError): + self.ts.hist(by=self.ts.index, figure=fig) + + @slow + def test_hist_bins(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.hist(bins=2)[0][0] + self.assertEqual(len(ax.patches), 2) + + @slow + def test_hist_layout(self): + df = self.hist_df + with tm.assertRaises(ValueError): + df.height.hist(layout=(1, 1)) + + with tm.assertRaises(ValueError): + df.height.hist(layout=[1, 1]) + + @slow + def test_hist_layout_with_by(self): + df = self.hist_df + + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 2), figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + + @slow + def test_hist_no_overlap(self): + from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) + y = Series(randn(2)) + 
subplot(121) + x.hist() + subplot(122) + y.hist() + fig = gcf() + axes = fig.get_axes() + self.assertEqual(len(axes), 2) + + @slow + def test_plot_fails_with_dupe_color_and_style(self): + x = Series(randn(2)) + with tm.assertRaises(ValueError): + x.plot(style='k--', color='k') + + @slow + def test_hist_by_no_extra_plots(self): + df = self.hist_df + axes = df.height.hist(by=df.gender) + self.assertEqual(len(self.plt.get_fignums()), 1) + + def test_plot_fails_when_ax_differs_from_figure(self): + from pylab import figure + fig1 = figure() + fig2 = figure() + ax1 = fig1.add_subplot(111) + with tm.assertRaises(AssertionError): + self.ts.hist(ax=ax1, figure=fig2) + + @slow + def test_kde(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + _check_plot_works(self.ts.plot, kind='kde') + _check_plot_works(self.ts.plot, kind='density') + ax = self.ts.plot(kind='kde', logy=True) + self._check_ax_scales(ax, yaxis='log') + + @slow + def test_kde_kwargs(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + from numpy import linspace + _check_plot_works(self.ts.plot, kind='kde', bw_method=.5, ind=linspace(-100,100,20)) + _check_plot_works(self.ts.plot, kind='density', bw_method=.5, ind=linspace(-100,100,20)) + ax = self.ts.plot(kind='kde', logy=True, bw_method=.5, ind=linspace(-100,100,20)) + self._check_ax_scales(ax, yaxis='log') + + @slow + def test_kde_color(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + ax = self.ts.plot(kind='kde', logy=True, color='r') + self._check_ax_scales(ax, yaxis='log') + lines = ax.get_lines() + self.assertEqual(len(lines), 1) + self._check_colors(lines, ['r']) + + @slow + def test_autocorrelation_plot(self): + from pandas.tools.plotting import autocorrelation_plot + _check_plot_works(autocorrelation_plot, self.ts) + _check_plot_works(autocorrelation_plot, self.ts.values) + + ax = autocorrelation_plot(self.ts, label='Test') + self._check_legend_labels(ax, labels=['Test']) + + @slow + def test_lag_plot(self): + from pandas.tools.plotting import lag_plot + _check_plot_works(lag_plot, self.ts) + _check_plot_works(lag_plot, self.ts, lag=5) + + @slow + def test_bootstrap_plot(self): + from pandas.tools.plotting import bootstrap_plot + _check_plot_works(bootstrap_plot, self.ts, size=10) + + def test_invalid_plot_data(self): + s = Series(list('abcd')) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + s.plot(kind=kind) + + @slow + def test_valid_object_plot(self): + s = Series(lrange(10), dtype=object) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + _check_plot_works(s.plot, kind=kind) + + def test_partially_invalid_plot_data(self): + s = Series(['a', 'b', 1.0, 2]) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + s.plot(kind=kind) + + def test_invalid_kind(self): + s = Series([1, 2]) + with tm.assertRaises(ValueError): + s.plot(kind='aasdf') + + @slow + def test_dup_datetime_index_plot(self): + dr1 = date_range('1/1/2009', periods=4) + dr2 = date_range('1/2/2009', periods=4) + index = dr1.append(dr2) + values = randn(index.size) + s = Series(values, index=index) + _check_plot_works(s.plot) + + @slow + def test_errorbar_plot(self): + + s = Series(np.arange(10), name='x') + s_err = np.random.randn(10) + d_err = DataFrame(randn(10, 2), index=s.index, columns=['x', 'y']) + # test line and bar plots + kinds = ['line', 'bar'] + for kind in kinds: + ax = 
_check_plot_works(s.plot, yerr=Series(s_err), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err.tolist(), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, xerr=0.2, yerr=0.2, kind=kind) + self._check_has_errorbars(ax, xerr=1, yerr=1) + + ax = _check_plot_works(s.plot, xerr=s_err) + self._check_has_errorbars(ax, xerr=1, yerr=0) + + # test time series plotting + ix = date_range('1/1/2000', '1/1/2001', freq='M') + ts = Series(np.arange(12), index=ix, name='x') + ts_err = Series(np.random.randn(12), index=ix) + td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y']) + + ax = _check_plot_works(ts.plot, yerr=ts_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(ts.plot, yerr=td_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + # check incorrect lengths and types + with tm.assertRaises(ValueError): + s.plot(yerr=np.arange(11)) + + s_err = ['zzz']*10 + with tm.assertRaises(TypeError): + s.plot(yerr=s_err) + + def test_table(self): + _check_plot_works(self.series.plot, table=True) + _check_plot_works(self.series.plot, table=self.series) + + +@tm.mplskip +class TestDataFramePlots(TestPlotBase): + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.mpl_le_1_2_1 = str(mpl.__version__) <= LooseVersion('1.2.1') + + self.tdf = tm.makeTimeDataFrame() + self.hexbin_df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20)}) + + from pandas import read_csv + path = os.path.join(curpath(), 'data', 'iris.csv') + self.iris = read_csv(path) + + @slow + def test_plot(self): + df = self.tdf + _check_plot_works(df.plot, grid=False) + axes = _check_plot_works(df.plot, subplots=True) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + _check_plot_works(df.plot, subplots=True, use_index=False) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + df = DataFrame({'x': [1, 2], 'y': [3, 4]}) + with tm.assertRaises(TypeError): + df.plot(kind='line', blarg=True) + + df = DataFrame(np.random.rand(10, 3), + index=list(string.ascii_letters[:10])) + _check_plot_works(df.plot, use_index=True) + _check_plot_works(df.plot, sort_columns=False) + _check_plot_works(df.plot, yticks=[1, 5, 10]) + _check_plot_works(df.plot, xticks=[1, 5, 10]) + _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) + + axes = _check_plot_works(df.plot, subplots=True, title='blah') + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + for ax in axes[:2]: + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible([ax.xaxis.get_label()], visible=False) + for ax in [axes[2]]: + self._check_visible(ax.get_xticklabels()) + self._check_visible([ax.xaxis.get_label()]) + + _check_plot_works(df.plot, title='blah') + + tuples = lzip(string.ascii_letters[:10], range(10)) + df = DataFrame(np.random.rand(10, 3), + index=MultiIndex.from_tuples(tuples)) + _check_plot_works(df.plot, use_index=True) + + # unicode + index = MultiIndex.from_tuples([(u('\u03b1'), 0), + (u('\u03b1'), 1), + (u('\u03b2'), 2), + (u('\u03b2'), 3), + (u('\u03b3'), 4), + (u('\u03b3'), 5), + (u('\u03b4'), 6), + (u('\u03b4'), 7)], names=['i0', 'i1']) + columns = 
MultiIndex.from_tuples([('bar', u('\u0394')), + ('bar', u('\u0395'))], names=['c0', + 'c1']) + df = DataFrame(np.random.randint(0, 10, (8, 2)), + columns=columns, + index=index) + _check_plot_works(df.plot, title=u('\u03A3')) + + # GH 6951 + # Test with single column + df = DataFrame({'x': np.random.rand(10)}) + axes = _check_plot_works(df.plot, kind='bar', subplots=True) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + # When ax is supplied and required number of axes is 1, + # passed ax should be used: + fig, ax = self.plt.subplots() + axes = df.plot(kind='bar', subplots=True, ax=ax) + self.assertEqual(len(axes), 1) + self.assertIs(ax.get_axes(), axes[0]) + + def test_nonnumeric_exclude(self): + df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}) + ax = df.plot() + self.assertEqual(len(ax.get_lines()), 1) # B was plotted + + @slow + def test_implicit_label(self): + df = DataFrame(randn(10, 3), columns=['a', 'b', 'c']) + ax = df.plot(x='a', y='b') + self._check_text_labels(ax.xaxis.get_label(), 'a') + + @slow + def test_explicit_label(self): + df = DataFrame(randn(10, 3), columns=['a', 'b', 'c']) + ax = df.plot(x='a', y='b', label='LABEL') + self._check_text_labels(ax.xaxis.get_label(), 'LABEL') + + @slow + def test_plot_xy(self): + # columns.inferred_type == 'string' + df = self.tdf + self._check_data(df.plot(x=0, y=1), + df.set_index('A')['B'].plot()) + self._check_data(df.plot(x=0), df.set_index('A').plot()) + self._check_data(df.plot(y=0), df.B.plot()) + self._check_data(df.plot(x='A', y='B'), + df.set_index('A').B.plot()) + self._check_data(df.plot(x='A'), df.set_index('A').plot()) + self._check_data(df.plot(y='B'), df.B.plot()) + + # columns.inferred_type == 'integer' + df.columns = lrange(1, len(df.columns) + 1) + self._check_data(df.plot(x=1, y=2), + df.set_index(1)[2].plot()) + self._check_data(df.plot(x=1), df.set_index(1).plot()) + self._check_data(df.plot(y=1), df[1].plot()) + + # figsize and title + ax = df.plot(x=1, y=2, title='Test', figsize=(16, 8)) + self._check_text_labels(ax.title, 'Test') + self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16., 8.)) + + # columns.inferred_type == 'mixed' + # TODO add MultiIndex test + + @slow + def test_logscales(self): + df = DataFrame({'a': np.arange(100)}, + index=np.arange(100)) + ax = df.plot(logy=True) + self._check_ax_scales(ax, yaxis='log') + + ax = df.plot(logx=True) + self._check_ax_scales(ax, xaxis='log') + + ax = df.plot(loglog=True) + self._check_ax_scales(ax, xaxis='log', yaxis='log') + + @slow + def test_xcompat(self): + import pandas as pd + + df = self.tdf + ax = df.plot(x_compat=True) + lines = ax.get_lines() + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) + + tm.close() + pd.plot_params['xaxis.compat'] = True + ax = df.plot() + lines = ax.get_lines() + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) + + tm.close() + pd.plot_params['x_compat'] = False + ax = df.plot() + lines = ax.get_lines() + tm.assert_isinstance(lines[0].get_xdata(), PeriodIndex) + + tm.close() + # useful if you're plotting a bunch together + with pd.plot_params.use('x_compat', True): + ax = df.plot() + lines = ax.get_lines() + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) + + tm.close() + ax = df.plot() + lines = ax.get_lines() + tm.assert_isinstance(lines[0].get_xdata(), PeriodIndex) + + def test_unsorted_index(self): + df = DataFrame({'y': np.arange(100)}, + index=np.arange(99, -1, -1), dtype=np.int64) + ax = df.plot() + l = ax.get_lines()[0] + rs = l.get_xydata() + rs = 
Series(rs[:, 1], rs[:, 0], dtype=np.int64) + tm.assert_series_equal(rs, df.y) + + @slow + def test_subplots(self): + df = DataFrame(np.random.rand(10, 3), + index=list(string.ascii_letters[:10])) + + for kind in ['bar', 'barh', 'line', 'area']: + axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) + self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) + + for ax, column in zip(axes, df.columns): + self._check_legend_labels(ax, labels=[com.pprint_thing(column)]) + + for ax in axes[:-2]: + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible(ax.get_yticklabels()) + + self._check_visible(axes[-1].get_xticklabels()) + self._check_visible(axes[-1].get_yticklabels()) + + axes = df.plot(kind=kind, subplots=True, sharex=False) + for ax in axes: + self._check_visible(ax.get_xticklabels()) + self._check_visible(ax.get_yticklabels()) + + axes = df.plot(kind=kind, subplots=True, legend=False) + for ax in axes: + self.assertTrue(ax.get_legend() is None) + + def test_negative_log(self): + df = - DataFrame(rand(6, 4), + index=list(string.ascii_letters[:6]), + columns=['x', 'y', 'z', 'four']) + + with tm.assertRaises(ValueError): + df.plot(kind='area', logy=True) + with tm.assertRaises(ValueError): + df.plot(kind='area', loglog=True) + + def _compare_stacked_y_cood(self, normal_lines, stacked_lines): + base = np.zeros(len(normal_lines[0].get_data()[1])) + for nl, sl in zip(normal_lines, stacked_lines): + base += nl.get_data()[1] # get y coodinates + sy = sl.get_data()[1] + self.assert_numpy_array_equal(base, sy) + + def test_line_area_stacked(self): + with tm.RNGContext(42): + df = DataFrame(rand(6, 4), + columns=['w', 'x', 'y', 'z']) + neg_df = - df + # each column has either positive or negative value + sep_df = DataFrame({'w': rand(6), 'x': rand(6), + 'y': - rand(6), 'z': - rand(6)}) + # each column has positive-negative mixed value + mixed_df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), + columns=['w', 'x', 'y', 'z']) + + for kind in ['line', 'area']: + ax1 = _check_plot_works(df.plot, kind=kind, stacked=False) + ax2 = _check_plot_works(df.plot, kind=kind, stacked=True) + self._compare_stacked_y_cood(ax1.lines, ax2.lines) + + ax1 = _check_plot_works(neg_df.plot, kind=kind, stacked=False) + ax2 = _check_plot_works(neg_df.plot, kind=kind, stacked=True) + self._compare_stacked_y_cood(ax1.lines, ax2.lines) + + ax1 = _check_plot_works(sep_df.plot, kind=kind, stacked=False) + ax2 = _check_plot_works(sep_df.plot, kind=kind, stacked=True) + self._compare_stacked_y_cood(ax1.lines[:2], ax2.lines[:2]) + self._compare_stacked_y_cood(ax1.lines[2:], ax2.lines[2:]) + + _check_plot_works(mixed_df.plot, stacked=False) + with tm.assertRaises(ValueError): + mixed_df.plot(stacked=True) + + _check_plot_works(df.plot, kind=kind, logx=True, stacked=True) + + def test_line_area_nan_df(self): + values1 = [1, 2, np.nan, 3] + values2 = [3, np.nan, 2, 1] + df = DataFrame({'a': values1, 'b': values2}) + tdf = DataFrame({'a': values1, 'b': values2}, index=tm.makeDateIndex(k=4)) + + for d in [df, tdf]: + ax = _check_plot_works(d.plot) + masked1 = ax.lines[0].get_ydata() + masked2 = ax.lines[1].get_ydata() + # remove nan for comparison purpose + self.assert_numpy_array_equal(np.delete(masked1.data, 2), np.array([1, 2, 3])) + self.assert_numpy_array_equal(np.delete(masked2.data, 1), np.array([3, 2, 1])) + self.assert_numpy_array_equal(masked1.mask, np.array([False, False, True, False])) + self.assert_numpy_array_equal(masked2.mask, np.array([False, True, False, False])) + 
+ expected1 = np.array([1, 2, 0, 3]) + expected2 = np.array([3, 0, 2, 1]) + + ax = _check_plot_works(d.plot, stacked=True) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + self.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) + + ax = _check_plot_works(d.plot, kind='area') + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + self.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) + + ax = _check_plot_works(d.plot, kind='area', stacked=False) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + self.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2) + + def test_line_lim(self): + df = DataFrame(rand(6, 3), columns=['x', 'y', 'z']) + ax = df.plot() + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data()[0][0]) + self.assertEqual(xmax, lines[0].get_data()[0][-1]) + + ax = df.plot(secondary_y=True) + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data()[0][0]) + self.assertEqual(xmax, lines[0].get_data()[0][-1]) + + axes = df.plot(secondary_y=True, subplots=True) + for ax in axes: + xmin, xmax = ax.get_xlim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data()[0][0]) + self.assertEqual(xmax, lines[0].get_data()[0][-1]) + + def test_area_lim(self): + df = DataFrame(rand(6, 4), + columns=['x', 'y', 'z', 'four']) + + neg_df = - df + for stacked in [True, False]: + ax = _check_plot_works(df.plot, kind='area', stacked=stacked) + xmin, xmax = ax.get_xlim() + ymin, ymax = ax.get_ylim() + lines = ax.get_lines() + self.assertEqual(xmin, lines[0].get_data()[0][0]) + self.assertEqual(xmax, lines[0].get_data()[0][-1]) + self.assertEqual(ymin, 0) + + ax = _check_plot_works(neg_df.plot, kind='area', stacked=stacked) + ymin, ymax = ax.get_ylim() + self.assertEqual(ymax, 0) + + @slow + def test_bar_colors(self): + import matplotlib.pyplot as plt + + default_colors = plt.rcParams.get('axes.color_cycle') + + + df = DataFrame(randn(5, 5)) + ax = df.plot(kind='bar') + self._check_colors(ax.patches[::5], facecolors=default_colors[:5]) + tm.close() + + custom_colors = 'rgcby' + ax = df.plot(kind='bar', color=custom_colors) + self._check_colors(ax.patches[::5], facecolors=custom_colors) + tm.close() + + from matplotlib import cm + # Test str -> colormap functionality + ax = df.plot(kind='bar', colormap='jet') + rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5)) + self._check_colors(ax.patches[::5], facecolors=rgba_colors) + tm.close() + + # Test colormap functionality + ax = df.plot(kind='bar', colormap=cm.jet) + rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5)) + self._check_colors(ax.patches[::5], facecolors=rgba_colors) + tm.close() + + ax = df.ix[:, [0]].plot(kind='bar', color='DodgerBlue') + self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) + + @slow + def test_bar_linewidth(self): + df = DataFrame(randn(5, 5)) + + # regular + ax = df.plot(kind='bar', linewidth=2) + for r in ax.patches: + self.assertEqual(r.get_linewidth(), 2) + + # stacked + ax = df.plot(kind='bar', stacked=True, linewidth=2) + for r in ax.patches: + self.assertEqual(r.get_linewidth(), 2) + + # subplots + axes = df.plot(kind='bar', linewidth=2, subplots=True) + self._check_axes_shape(axes, axes_num=5, layout=(5, 1)) + for ax in axes: + for r in ax.patches: + self.assertEqual(r.get_linewidth(), 2) + + @slow + def test_bar_barwidth(self): + df = DataFrame(randn(5, 5)) + + width = 0.9 + + # regular + ax = df.plot(kind='bar', 
width=width) + for r in ax.patches: + self.assertEqual(r.get_width(), width / len(df.columns)) + + # stacked + ax = df.plot(kind='bar', stacked=True, width=width) + for r in ax.patches: + self.assertEqual(r.get_width(), width) + + # horizontal regular + ax = df.plot(kind='barh', width=width) + for r in ax.patches: + self.assertEqual(r.get_height(), width / len(df.columns)) + + # horizontal stacked + ax = df.plot(kind='barh', stacked=True, width=width) + for r in ax.patches: + self.assertEqual(r.get_height(), width) + + # subplots + axes = df.plot(kind='bar', width=width, subplots=True) + for ax in axes: + for r in ax.patches: + self.assertEqual(r.get_width(), width) + + # horizontal subplots + axes = df.plot(kind='barh', width=width, subplots=True) + for ax in axes: + for r in ax.patches: + self.assertEqual(r.get_height(), width) + + @slow + def test_bar_barwidth_position(self): + df = DataFrame(randn(5, 5)) + self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9, position=0.2) + self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9, position=0.2) + self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9, position=0.2) + self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9, position=0.2) + self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9, position=0.2) + self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9, position=0.2) + + @slow + def test_bar_bottom_left(self): + df = DataFrame(rand(5, 5)) + ax = df.plot(kind='bar', stacked=False, bottom=1) + result = [p.get_y() for p in ax.patches] + self.assertEqual(result, [1] * 25) + + ax = df.plot(kind='bar', stacked=True, bottom=[-1, -2, -3, -4, -5]) + result = [p.get_y() for p in ax.patches[:5]] + self.assertEqual(result, [-1, -2, -3, -4, -5]) + + ax = df.plot(kind='barh', stacked=False, left=np.array([1, 1, 1, 1, 1])) + result = [p.get_x() for p in ax.patches] + self.assertEqual(result, [1] * 25) + + ax = df.plot(kind='barh', stacked=True, left=[1, 2, 3, 4, 5]) + result = [p.get_x() for p in ax.patches[:5]] + self.assertEqual(result, [1, 2, 3, 4, 5]) + + axes = df.plot(kind='bar', subplots=True, bottom=-1) + for ax in axes: + result = [p.get_y() for p in ax.patches] + self.assertEqual(result, [-1] * 5) + + axes = df.plot(kind='barh', subplots=True, left=np.array([1, 1, 1, 1, 1])) + for ax in axes: + result = [p.get_x() for p in ax.patches] + self.assertEqual(result, [1] * 5) + + @slow + def test_plot_scatter(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['x', 'y', 'z', 'four']) + + _check_plot_works(df.plot, x='x', y='y', kind='scatter') + _check_plot_works(df.plot, x=1, y=2, kind='scatter') + + with tm.assertRaises(ValueError): + df.plot(x='x', kind='scatter') + with tm.assertRaises(ValueError): + df.plot(y='y', kind='scatter') + + # GH 6951 + axes = df.plot(x='x', y='y', kind='scatter', subplots=True) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @slow + def test_plot_bar(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + + _check_plot_works(df.plot, kind='bar') + _check_plot_works(df.plot, kind='bar', legend=False) + _check_plot_works(df.plot, kind='bar', subplots=True) + _check_plot_works(df.plot, kind='bar', stacked=True) + + df = DataFrame(randn(10, 15), + index=list(string.ascii_letters[:10]), + columns=lrange(15)) + _check_plot_works(df.plot, kind='bar') + + df = DataFrame({'a': [0, 1], 'b': [1, 0]}) + _check_plot_works(df.plot, 
kind='bar') + + def _check_bar_alignment(self, df, kind='bar', stacked=False, + subplots=False, align='center', + width=0.5, position=0.5): + + axes = df.plot(kind=kind, stacked=stacked, subplots=subplots, + align=align, width=width, position=position, + grid=True) + + axes = self._flatten_visible(axes) + + for ax in axes: + if kind == 'bar': + axis = ax.xaxis + ax_min, ax_max = ax.get_xlim() + min_edge = min([p.get_x() for p in ax.patches]) + max_edge = max([p.get_x() + p.get_width() for p in ax.patches]) + elif kind == 'barh': + axis = ax.yaxis + ax_min, ax_max = ax.get_ylim() + min_edge = min([p.get_y() for p in ax.patches]) + max_edge = max([p.get_y() + p.get_height() for p in ax.patches]) + else: + raise ValueError + + # GH 7498 + # compare margins between lim and bar edges + self.assertAlmostEqual(ax_min, min_edge - 0.25) + self.assertAlmostEqual(ax_max, max_edge + 0.25) + + p = ax.patches[0] + if kind == 'bar' and (stacked is True or subplots is True): + edge = p.get_x() + center = edge + p.get_width() * position + elif kind == 'bar' and stacked is False: + center = p.get_x() + p.get_width() * len(df.columns) * position + edge = p.get_x() + elif kind == 'barh' and (stacked is True or subplots is True): + center = p.get_y() + p.get_height() * position + edge = p.get_y() + elif kind == 'barh' and stacked is False: + center = p.get_y() + p.get_height() * len(df.columns) * position + edge = p.get_y() + else: + raise ValueError + + # Check the ticks locates on integer + self.assertTrue((axis.get_ticklocs() == np.arange(len(df))).all()) + + if align == 'center': + # Check whether the bar locates on center + self.assertAlmostEqual(axis.get_ticklocs()[0], center) + elif align == 'edge': + # Check whether the bar's edge starts from the tick + self.assertAlmostEqual(axis.get_ticklocs()[0], edge) + else: + raise ValueError + + return axes + + @slow + def test_bar_stacked_center(self): + # GH2157 + df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) + self._check_bar_alignment(df, kind='bar', stacked=True) + self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9) + self._check_bar_alignment(df, kind='barh', stacked=True) + self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9) + + @slow + def test_bar_center(self): + df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) + self._check_bar_alignment(df, kind='bar', stacked=False) + self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9) + self._check_bar_alignment(df, kind='barh', stacked=False) + self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9) + + @slow + def test_bar_subplots_center(self): + df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) + self._check_bar_alignment(df, kind='bar', subplots=True) + self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9) + self._check_bar_alignment(df, kind='barh', subplots=True) + self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9) + + @slow + def test_bar_align_single_column(self): + df = DataFrame(randn(5)) + self._check_bar_alignment(df, kind='bar', stacked=False) + self._check_bar_alignment(df, kind='bar', stacked=True) + self._check_bar_alignment(df, kind='barh', stacked=False) + self._check_bar_alignment(df, kind='barh', stacked=True) + self._check_bar_alignment(df, kind='bar', subplots=True) + self._check_bar_alignment(df, kind='barh', subplots=True) + + @slow + def test_bar_edge(self): + df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) + + 
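+        # align='edge' should put the first tick on the bar's left/bottom edge rather than its center (see the edge branch of _check_bar_alignment above)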
self._check_bar_alignment(df, kind='bar', stacked=True, align='edge') + self._check_bar_alignment(df, kind='bar', stacked=True, + width=0.9, align='edge') + self._check_bar_alignment(df, kind='barh', stacked=True, align='edge') + self._check_bar_alignment(df, kind='barh', stacked=True, + width=0.9, align='edge') + + self._check_bar_alignment(df, kind='bar', stacked=False, align='edge') + self._check_bar_alignment(df, kind='bar', stacked=False, + width=0.9, align='edge') + self._check_bar_alignment(df, kind='barh', stacked=False, align='edge') + self._check_bar_alignment(df, kind='barh', stacked=False, + width=0.9, align='edge') + + self._check_bar_alignment(df, kind='bar', subplots=True, align='edge') + self._check_bar_alignment(df, kind='bar', subplots=True, + width=0.9, align='edge') + self._check_bar_alignment(df, kind='barh', subplots=True, align='edge') + self._check_bar_alignment(df, kind='barh', subplots=True, + width=0.9, align='edge') + + @slow + def test_bar_log_no_subplots(self): + # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 + # regressions in 1.2.1 + expected = np.array([1., 10.]) + + if not self.mpl_le_1_2_1: + expected = np.hstack((.1, expected, 100)) + + # no subplots + df = DataFrame({'A': [3] * 5, 'B': lrange(1, 6)}, index=lrange(5)) + ax = df.plot(kind='bar', grid=True, log=True) + assert_array_equal(ax.yaxis.get_ticklocs(), expected) + + @slow + def test_bar_log_subplots(self): + expected = np.array([1., 10., 100., 1000.]) + if not self.mpl_le_1_2_1: + expected = np.hstack((.1, expected, 1e4)) + + ax = DataFrame([Series([200, 300]), + Series([300, 500])]).plot(log=True, kind='bar', + subplots=True) + + assert_array_equal(ax[0].yaxis.get_ticklocs(), expected) + assert_array_equal(ax[1].yaxis.get_ticklocs(), expected) + + @slow + def test_boxplot(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + df['indic'] = ['foo', 'bar'] * 3 + df['indic2'] = ['foo', 'bar', 'foo'] * 2 + + _check_plot_works(df.boxplot, return_type='dict') + _check_plot_works(df.boxplot, column=['one', 'two'], return_type='dict') + _check_plot_works(df.boxplot, column=['one', 'two'], by='indic') + _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) + _check_plot_works(df.boxplot, by='indic') + _check_plot_works(df.boxplot, by=['indic', 'indic2']) + _check_plot_works(plotting.boxplot, df['one'], return_type='dict') + _check_plot_works(df.boxplot, notch=1, return_type='dict') + _check_plot_works(df.boxplot, by='indic', notch=1) + + df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) + df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df['Y'] = Series(['A'] * 10) + _check_plot_works(df.boxplot, by='X') + + # When ax is supplied and required number of axes is 1, + # passed ax should be used: + fig, ax = self.plt.subplots() + axes = df.boxplot('Col1', by='X', ax=ax) + self.assertIs(ax.get_axes(), axes) + + fig, ax = self.plt.subplots() + axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') + self.assertIs(ax.get_axes(), axes['A']) + + # Multiple columns with an ax argument should use same figure + fig, ax = self.plt.subplots() + axes = df.boxplot(column=['Col1', 'Col2'], by='X', ax=ax, return_type='axes') + self.assertIs(axes['Col1'].get_figure(), fig) + + # When by is None, check that all relevant lines are present in the dict + fig, ax = self.plt.subplots() + d = df.boxplot(ax=ax, return_type='dict') + lines = list(itertools.chain.from_iterable(d.values())) + 
self.assertEqual(len(ax.get_lines()), len(lines)) + + @slow + def test_boxplot_return_type(self): + # API change in https://github.com/pydata/pandas/pull/7096 + import matplotlib as mpl + + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + with tm.assertRaises(ValueError): + df.boxplot(return_type='NOTATYPE') + + with tm.assert_produces_warning(FutureWarning): + result = df.boxplot() + # change to Axes in future + self._check_box_return_type(result, 'dict') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='dict') + self._check_box_return_type(result, 'dict') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='axes') + self._check_box_return_type(result, 'axes') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='both') + self._check_box_return_type(result, 'both') + + @slow + def test_kde(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + df = DataFrame(randn(100, 4)) + ax = _check_plot_works(df.plot, kind='kde') + expected = [com.pprint_thing(c) for c in df.columns] + self._check_legend_labels(ax, labels=expected) + + axes = _check_plot_works(df.plot, kind='kde', subplots=True) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.plot(kind='kde', logy=True, subplots=True) + self._check_ax_scales(axes, yaxis='log') + + @slow + def test_hist(self): + _check_plot_works(self.hist_df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 3)) + axes = _check_plot_works(df.hist, grid=False) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + self.assertFalse(axes[1, 1].get_visible()) + + df = DataFrame(randn(100, 1)) + _check_plot_works(df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 6)) + axes = _check_plot_works(df.hist, layout=(4, 2)) + self._check_axes_shape(axes, axes_num=6, layout=(4, 2)) + + # make sure sharex, sharey is handled + _check_plot_works(df.hist, sharex=True, sharey=True) + + # handle figsize arg + _check_plot_works(df.hist, figsize=(8, 10)) + + # check bins argument + _check_plot_works(df.hist, bins=5) + + # make sure xlabelsize and xrot are handled + ser = df[0] + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + tm.close() + # make sure kwargs to hist are handled + ax = ser.hist(normed=True, cumulative=True, bins=4) + # height of last bin (index 5) must be 1.0 + self.assertAlmostEqual(ax.get_children()[5].get_height(), 1.0) + + tm.close() + ax = ser.hist(log=True) + # scale of y must be 'log' + self._check_ax_scales(ax, yaxis='log') + + tm.close() + + # propagate attr exception from matplotlib.Axes.hist + with tm.assertRaises(AttributeError): + ser.hist(foo='bar') + + @slow + def test_hist_layout(self): + df = DataFrame(randn(100, 3)) + + layout_to_expected_size = ( + {'layout': None, 'expected_size': (2, 2)}, # default is 2x2 + {'layout': (2, 2), 'expected_size': (2, 2)}, + {'layout': (4, 1), 'expected_size': (4, 1)}, + {'layout': (1, 4), 'expected_size': (1, 4)}, + {'layout': (3, 3), 'expected_size': (3, 3)}, + ) + + for layout_test in layout_to_expected_size: + axes = df.hist(layout=layout_test['layout']) + 
expected = layout_test['expected_size'] + self._check_axes_shape(axes, axes_num=3, layout=expected) + + # layout too small for all 4 plots + with tm.assertRaises(ValueError): + df.hist(layout=(1, 1)) + + # invalid format for layout + with tm.assertRaises(ValueError): + df.hist(layout=(1,)) + + @slow + def test_scatter(self): + tm._skip_if_no_scipy() + + df = DataFrame(randn(100, 2)) + import pandas.tools.plotting as plt + + def scat(**kwds): + return plt.scatter_matrix(df, **kwds) + + _check_plot_works(scat) + _check_plot_works(scat, marker='+') + _check_plot_works(scat, vmin=0) + if _ok_for_gaussian_kde('kde'): + _check_plot_works(scat, diagonal='kde') + if _ok_for_gaussian_kde('density'): + _check_plot_works(scat, diagonal='density') + _check_plot_works(scat, diagonal='hist') + _check_plot_works(scat, range_padding=.1) + + def scat2(x, y, by=None, ax=None, figsize=None): + return plt.scatter_plot(df, x, y, by, ax, figsize=None) + + _check_plot_works(scat2, 0, 1) + grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) + _check_plot_works(scat2, 0, 1, by=grouper) + + @slow + def test_andrews_curves(self): + from pandas.tools.plotting import andrews_curves + from matplotlib import cm + + df = self.iris + + _check_plot_works(andrews_curves, df, 'Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(andrews_curves, df, 'Name', color=rgba) + self._check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + ax = _check_plot_works(andrews_curves, df, 'Name', color=cnames) + self._check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(andrews_curves, df, 'Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + self._check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = andrews_curves(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + with tm.assert_produces_warning(FutureWarning): + andrews_curves(data=df, class_column='Name') + + @slow + def test_parallel_coordinates(self): + from pandas.tools.plotting import parallel_coordinates + from matplotlib import cm + + df = self.iris + + _check_plot_works(parallel_coordinates, df, 'Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(parallel_coordinates, df, 'Name', color=rgba) + self._check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + ax = _check_plot_works(parallel_coordinates, df, 'Name', color=cnames) + self._check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(parallel_coordinates, df, 'Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + self._check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = parallel_coordinates(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + with tm.assert_produces_warning(FutureWarning): + parallel_coordinates(data=df, class_column='Name') + with tm.assert_produces_warning(FutureWarning): + 
parallel_coordinates(df, 'Name', colors=colors) + + @slow + def test_radviz(self): + from pandas.tools.plotting import radviz + from matplotlib import cm + + df = self.iris + _check_plot_works(radviz, df, 'Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(radviz, df, 'Name', color=rgba) + # skip Circle drawn as ticks + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors(patches[:10], facecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + _check_plot_works(radviz, df, 'Name', color=cnames) + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10]) + + _check_plot_works(radviz, df, 'Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10]) + + colors = [[0., 0., 1., 1.], + [0., 0.5, 1., 1.], + [1., 0., 0., 1.]] + df = DataFrame({"A": [1, 2, 3], + "B": [2, 1, 3], + "C": [3, 2, 1], + "Name": ['b', 'g', 'r']}) + ax = radviz(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=colors) + + @slow + def test_plot_int_columns(self): + df = DataFrame(randn(100, 4)).cumsum() + _check_plot_works(df.plot, legend=True) + + @slow + def test_df_legend_labels(self): + kinds = 'line', 'bar', 'barh', 'kde', 'area' + df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) + df2 = DataFrame(rand(3, 3), columns=['d', 'e', 'f']) + df3 = DataFrame(rand(3, 3), columns=['g', 'h', 'i']) + df4 = DataFrame(rand(3, 3), columns=['j', 'k', 'l']) + + for kind in kinds: + if not _ok_for_gaussian_kde(kind): + continue + + ax = df.plot(kind=kind, legend=True) + self._check_legend_labels(ax, labels=df.columns) + + ax = df2.plot(kind=kind, legend=False, ax=ax) + self._check_legend_labels(ax, labels=df.columns) + + ax = df3.plot(kind=kind, legend=True, ax=ax) + self._check_legend_labels(ax, labels=df.columns + df3.columns) + + ax = df4.plot(kind=kind, legend='reverse', ax=ax) + expected = list(df.columns + df3.columns) + list(reversed(df4.columns)) + self._check_legend_labels(ax, labels=expected) + + # Secondary Y + ax = df.plot(legend=True, secondary_y='b') + self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ax = df2.plot(legend=False, ax=ax) + self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ax = df3.plot(kind='bar', legend=True, secondary_y='h', ax=ax) + self._check_legend_labels(ax, labels=['a', 'b (right)', 'c', 'g', 'h (right)', 'i']) + + # Time Series + ind = date_range('1/1/2014', periods=3) + df = DataFrame(randn(3, 3), columns=['a', 'b', 'c'], index=ind) + df2 = DataFrame(randn(3, 3), columns=['d', 'e', 'f'], index=ind) + df3 = DataFrame(randn(3, 3), columns=['g', 'h', 'i'], index=ind) + ax = df.plot(legend=True, secondary_y='b') + self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ax = df2.plot(legend=False, ax=ax) + self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ax = df3.plot(legend=True, ax=ax) + self._check_legend_labels(ax, labels=['a', 'b (right)', 'c', 'g', 'h', 'i']) + + # scatter + ax = df.plot(kind='scatter', x='a', y='b', label='data1') + self._check_legend_labels(ax, labels=['data1']) + ax = df2.plot(kind='scatter', x='d', y='e', legend=False, + label='data2', ax=ax) + self._check_legend_labels(ax, labels=['data1']) + ax = df3.plot(kind='scatter', 
x='g', y='h', label='data3', ax=ax) + self._check_legend_labels(ax, labels=['data1', 'data3']) + + def test_legend_name(self): + multi = DataFrame(randn(4, 4), + columns=[np.array(['a', 'a', 'b', 'b']), + np.array(['x', 'y', 'x', 'y'])]) + multi.columns.names = ['group', 'individual'] + + ax = multi.plot() + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, 'group,individual') + + df = DataFrame(randn(5, 5)) + ax = df.plot(legend=True, ax=ax) + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, 'group,individual') + + df.columns.name = 'new' + ax = df.plot(legend=False, ax=ax) + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, 'group,individual') + + ax = df.plot(legend=True, ax=ax) + leg_title = ax.legend_.get_title() + self._check_text_labels(leg_title, 'new') + + @slow + def test_no_legend(self): + kinds = 'line', 'bar', 'barh', 'kde', 'area' + df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) + + for kind in kinds: + if not _ok_for_gaussian_kde(kind): + continue + + ax = df.plot(kind=kind, legend=False) + self._check_legend_labels(ax, visible=False) + + @slow + def test_style_by_column(self): + import matplotlib.pyplot as plt + fig = plt.gcf() + + df = DataFrame(randn(100, 3)) + for markers in [{0: '^', 1: '+', 2: 'o'}, + {0: '^', 1: '+'}, + ['^', '+', 'o'], + ['^', '+']]: + fig.clf() + fig.add_subplot(111) + ax = df.plot(style=markers) + for i, l in enumerate(ax.get_lines()[:len(markers)]): + self.assertEqual(l.get_marker(), markers[i]) + + @slow + def test_line_colors(self): + import sys + from matplotlib import cm + + custom_colors = 'rgcby' + df = DataFrame(randn(5, 5)) + + ax = df.plot(color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + + tmp = sys.stderr + sys.stderr = StringIO() + try: + tm.close() + ax2 = df.plot(colors=custom_colors) + lines2 = ax2.get_lines() + for l1, l2 in zip(ax.get_lines(), lines2): + self.assertEqual(l1.get_color(), l2.get_color()) + finally: + sys.stderr = tmp + + tm.close() + + ax = df.plot(colormap='jet') + rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + ax = df.plot(colormap=cm.jet) + rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') + ax = df.ix[:, [0]].plot(color='DodgerBlue') + self._check_colors(ax.lines, linecolors=['DodgerBlue']) + + @slow + def test_area_colors(self): + from matplotlib import cm + from matplotlib.collections import PolyCollection + + custom_colors = 'rgcby' + df = DataFrame(rand(5, 5)) + + ax = df.plot(kind='area', color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + self._check_colors(poly, facecolors=custom_colors) + tm.close() + + ax = df.plot(kind='area', colormap='jet') + rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + self._check_colors(poly, facecolors=rgba_colors) + tm.close() + + ax = df.plot(kind='area', colormap=cm.jet) + rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] + 
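+        # area fills are PolyCollection children of the axes; their facecolors should follow the same colormap as the boundary lines checked above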
self._check_colors(poly, facecolors=rgba_colors) + + def test_default_color_cycle(self): + import matplotlib.pyplot as plt + plt.rcParams['axes.color_cycle'] = list('rgbk') + + df = DataFrame(randn(5, 3)) + ax = df.plot() + + expected = plt.rcParams['axes.color_cycle'][:3] + self._check_colors(ax.get_lines(), linecolors=expected) + + def test_unordered_ts(self): + df = DataFrame(np.array([3.0, 2.0, 1.0]), + index=[date(2012, 10, 1), + date(2012, 9, 1), + date(2012, 8, 1)], + columns=['test']) + ax = df.plot() + xticks = ax.lines[0].get_xdata() + self.assertTrue(xticks[0] < xticks[1]) + ydata = ax.lines[0].get_ydata() + assert_array_equal(ydata, np.array([1.0, 2.0, 3.0])) + + def test_all_invalid_plot_data(self): + df = DataFrame(list('abcd')) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + df.plot(kind=kind) + + @slow + def test_partially_invalid_plot_data(self): + with tm.RNGContext(42): + df = DataFrame(randn(10, 2), dtype=object) + df[np.random.rand(df.shape[0]) > 0.5] = 'a' + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + df.plot(kind=kind) + + with tm.RNGContext(42): + # area plot doesn't support positive/negative mixed data + kinds = ['area'] + df = DataFrame(rand(10, 2), dtype=object) + df[np.random.rand(df.shape[0]) > 0.5] = 'a' + for kind in kinds: + with tm.assertRaises(TypeError): + df.plot(kind=kind) + + def test_invalid_kind(self): + df = DataFrame(randn(10, 2)) + with tm.assertRaises(ValueError): + df.plot(kind='aasdf') + + @slow + def test_hexbin_basic(self): + df = self.hexbin_df + + ax = df.plot(kind='hexbin', x='A', y='B', gridsize=10) + # TODO: need better way to test. This just does existence. 
+ self.assertEqual(len(ax.collections), 1) + + # GH 6951 + axes = df.plot(x='A', y='B', kind='hexbin', subplots=True) + # hexbin should have 2 axes in the figure, 1 for plotting and another is colorbar + self.assertEqual(len(axes[0].figure.axes), 2) + # return value is single axes + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @slow + def test_hexbin_with_c(self): + df = self.hexbin_df + + ax = df.plot(kind='hexbin', x='A', y='B', C='C') + self.assertEqual(len(ax.collections), 1) + + ax = df.plot(kind='hexbin', x='A', y='B', C='C', + reduce_C_function=np.std) + self.assertEqual(len(ax.collections), 1) + + @slow + def test_hexbin_cmap(self): + df = self.hexbin_df + + # Default to BuGn + ax = df.plot(kind='hexbin', x='A', y='B') + self.assertEqual(ax.collections[0].cmap.name, 'BuGn') + + cm = 'cubehelix' + ax = df.plot(kind='hexbin', x='A', y='B', colormap=cm) + self.assertEqual(ax.collections[0].cmap.name, cm) + + @slow + def test_no_color_bar(self): + df = self.hexbin_df + + ax = df.plot(kind='hexbin', x='A', y='B', colorbar=None) + self.assertIs(ax.collections[0].colorbar, None) + + @slow + def test_allow_cmap(self): + df = self.hexbin_df + + ax = df.plot(kind='hexbin', x='A', y='B', cmap='YlGn') + self.assertEqual(ax.collections[0].cmap.name, 'YlGn') + + with tm.assertRaises(TypeError): + df.plot(kind='hexbin', x='A', y='B', cmap='YlGn', + colormap='BuGn') + + @slow + def test_pie_df(self): + df = DataFrame(np.random.rand(5, 3), columns=['X', 'Y', 'Z'], + index=['a', 'b', 'c', 'd', 'e']) + with tm.assertRaises(ValueError): + df.plot(kind='pie') + + ax = _check_plot_works(df.plot, kind='pie', y='Y') + self._check_text_labels(ax.texts, df.index) + + axes = _check_plot_works(df.plot, kind='pie', subplots=True) + self.assertEqual(len(axes), len(df.columns)) + for ax in axes: + self._check_text_labels(ax.texts, df.index) + for ax, ylabel in zip(axes, df.columns): + self.assertEqual(ax.get_ylabel(), ylabel) + + labels = ['A', 'B', 'C', 'D', 'E'] + color_args = ['r', 'g', 'b', 'c', 'm'] + axes = _check_plot_works(df.plot, kind='pie', subplots=True, + labels=labels, colors=color_args) + self.assertEqual(len(axes), len(df.columns)) + + for ax in axes: + self._check_text_labels(ax.texts, labels) + self._check_colors(ax.patches, facecolors=color_args) + + def test_errorbar_plot(self): + d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + df = DataFrame(d) + d_err = {'x': np.ones(12)*0.2, 'y': np.ones(12)*0.4} + df_err = DataFrame(d_err) + + # check line plots + ax = _check_plot_works(df.plot, yerr=df_err, logy=True) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, logx=True, logy=True) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, loglog=True) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + kinds = ['line', 'bar', 'barh'] + for kind in kinds: + ax = _check_plot_works(df.plot, yerr=df_err['x'], kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err['x'], xerr=df_err['x'], kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + axes = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, subplots=True, 
kind=kind) + self._check_has_errorbars(axes, xerr=1, yerr=1) + + ax = _check_plot_works((df+1).plot, yerr=df_err, xerr=df_err, kind='bar', log=True) + self._check_has_errorbars(ax, xerr=2, yerr=2) + + # yerr is raw error values + ax = _check_plot_works(df['y'].plot, yerr=np.ones(12)*0.4) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(df.plot, yerr=np.ones((2, 12))*0.4) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + # yerr is iterator + import itertools + ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + # yerr is column name + for yerr in ['yerr', u('誤差')]: + s_df = df.copy() + s_df[yerr] = np.ones(12)*0.2 + ax = _check_plot_works(s_df.plot, yerr=yerr) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(s_df.plot, y='y', x='x', yerr=yerr) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + with tm.assertRaises(ValueError): + df.plot(yerr=np.random.randn(11)) + + df_err = DataFrame({'x': ['zzz']*12, 'y': ['zzz']*12}) + with tm.assertRaises(TypeError): + df.plot(yerr=df_err) + + @slow + def test_errorbar_with_integer_column_names(self): + # test with integer column names + df = DataFrame(np.random.randn(10, 2)) + df_err = DataFrame(np.random.randn(10, 2)) + ax = _check_plot_works(df.plot, yerr=df_err) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, y=0, yerr=1) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + @slow + def test_errorbar_with_partial_columns(self): + df = DataFrame(np.random.randn(10, 3)) + df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) + kinds = ['line', 'bar'] + for kind in kinds: + ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + ix = date_range('1/1/2000', periods=10, freq='M') + df.set_index(ix, inplace=True) + df_err.set_index(ix, inplace=True) + ax = _check_plot_works(df.plot, yerr=df_err, kind='line') + self._check_has_errorbars(ax, xerr=0, yerr=2) + + d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + df = DataFrame(d) + d_err = {'x': np.ones(12)*0.2, 'z': np.ones(12)*0.4} + df_err = DataFrame(d_err) + for err in [d_err, df_err]: + ax = _check_plot_works(df.plot, yerr=err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + @slow + def test_errorbar_timeseries(self): + + d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + d_err = {'x': np.ones(12)*0.2, 'y': np.ones(12)*0.4} + + # check time-series plots + ix = date_range('1/1/2000', '1/1/2001', freq='M') + tdf = DataFrame(d, index=ix) + tdf_err = DataFrame(d_err, index=ix) + + kinds = ['line', 'bar', 'barh'] + for kind in kinds: + ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(tdf.plot, y='y', yerr=tdf_err['x'], kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(tdf.plot, y='y', yerr='x', kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + axes = _check_plot_works(tdf.plot, kind=kind, yerr=tdf_err, subplots=True) + self._check_has_errorbars(axes, xerr=0, yerr=1) + + def test_errorbar_asymmetrical(self): + + np.random.seed(0) + err = np.random.rand(3, 2, 5) + + data = np.random.randn(5, 3) + df = DataFrame(data) + + ax = df.plot(yerr=err, xerr=err/2) + + 
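+        # err has shape (n_columns, 2, n_points); the checks below read column 1's lower/upper errors at the first point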
self.assertEqual(ax.lines[7].get_ydata()[0], data[0,1]-err[1,0,0]) + self.assertEqual(ax.lines[8].get_ydata()[0], data[0,1]+err[1,1,0]) + + self.assertEqual(ax.lines[5].get_xdata()[0], -err[1,0,0]/2) + self.assertEqual(ax.lines[6].get_xdata()[0], err[1,1,0]/2) + + with tm.assertRaises(ValueError): + df.plot(yerr=err.T) + + tm.close() + + def test_table(self): + df = DataFrame(np.random.rand(10, 3), + index=list(string.ascii_letters[:10])) + _check_plot_works(df.plot, table=True) + _check_plot_works(df.plot, table=df) + + ax = df.plot() + self.assertTrue(len(ax.tables) == 0) + plotting.table(ax, df.T) + self.assertTrue(len(ax.tables) == 1) + + def test_errorbar_scatter(self): + df = DataFrame(np.random.randn(5, 2), index=range(5), columns=['x', 'y']) + df_err = DataFrame(np.random.randn(5, 2) / 5, + index=range(5), columns=['x', 'y']) + + ax = _check_plot_works(df.plot, kind='scatter', x='x', y='y') + self._check_has_errorbars(ax, xerr=0, yerr=0) + ax = _check_plot_works(df.plot, kind='scatter', x='x', y='y', xerr=df_err) + self._check_has_errorbars(ax, xerr=1, yerr=0) + ax = _check_plot_works(df.plot, kind='scatter', x='x', y='y', yerr=df_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(df.plot, kind='scatter', x='x', y='y', + xerr=df_err, yerr=df_err) + self._check_has_errorbars(ax, xerr=1, yerr=1) + + +@tm.mplskip +class TestDataFrameGroupByPlots(TestPlotBase): + + @slow + def test_boxplot(self): + grouped = self.hist_df.groupby(by='gender') + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(axes.values(), axes_num=2, layout=(1, 2)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + tuples = lzip(string.ascii_letters[:10], range(10)) + df = DataFrame(np.random.rand(10, 3), + index=MultiIndex.from_tuples(tuples)) + + grouped = df.groupby(level=1) + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(axes.values(), axes_num=10, layout=(4, 3)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + grouped = df.unstack(level=1).groupby(level=0, axis=1) + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(axes.values(), axes_num=3, layout=(2, 2)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + def test_series_plot_color_kwargs(self): + # GH1890 + ax = Series(np.arange(12) + 1).plot(color='green') + self._check_colors(ax.get_lines(), linecolors=['green']) + + def test_time_series_plot_color_kwargs(self): + # #1890 + ax = Series(np.arange(12) + 1, index=date_range( + '1/1/2000', periods=12)).plot(color='green') + self._check_colors(ax.get_lines(), linecolors=['green']) + + def test_time_series_plot_color_with_empty_kwargs(self): + import matplotlib as mpl + + def_colors = mpl.rcParams['axes.color_cycle'] + index = date_range('1/1/2000', periods=12) + s = Series(np.arange(1, 13), index=index) + + ncolors = 3 + + for i in range(ncolors): + ax = s.plot() + self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) + + @slow + def test_grouped_hist(self): + df = DataFrame(randn(500, 2), columns=['A', 'B']) + df['C'] = np.random.randint(0, 4, 500) + df['D'] = ['X'] * 500 + + axes = plotting.grouped_hist(df.A, by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + 
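+        # df.C holds four distinct group labels, so grouping A by C should yield four histogram axes laid out 2x2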
+ tm.close() + axes = df.hist(by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + tm.close() + # group by a key with single value + axes = df.hist(by='D', rot=30) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + self._check_ticks_props(axes, xrot=30) + + tm.close() + # make sure kwargs to hist are handled + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = plotting.grouped_hist(df.A, by=df.C, normed=True, + cumulative=True, bins=4, + xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + # height of last bin (index 5) must be 1.0 + for ax in axes.ravel(): + height = ax.get_children()[5].get_height() + self.assertAlmostEqual(height, 1.0) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + tm.close() + axes = plotting.grouped_hist(df.A, by=df.C, log=True) + # scale of y must be 'log' + self._check_ax_scales(axes, yaxis='log') + + tm.close() + # propagate attr exception from matplotlib.Axes.hist + with tm.assertRaises(AttributeError): + plotting.grouped_hist(df.A, by=df.C, foo='bar') + + with tm.assert_produces_warning(FutureWarning): + df.hist(by='C', figsize='default') + + @slow + def test_grouped_box_return_type(self): + df = self.hist_df + + # old style: return_type=None + result = df.boxplot(by='gender') + self.assertIsInstance(result, np.ndarray) + self._check_box_return_type(result, None, + expected_keys=['height', 'weight', 'category']) + + # now for groupby + with tm.assert_produces_warning(FutureWarning): + result = df.groupby('gender').boxplot() + self._check_box_return_type(result, 'dict', expected_keys=['Male', 'Female']) + + columns2 = 'X B C D A G Y N Q O'.split() + df2 = DataFrame(random.randn(50, 10), columns=columns2) + categories2 = 'A B C D E F G H I J'.split() + df2['category'] = categories2 * 5 + + for t in ['dict', 'axes', 'both']: + returned = df.groupby('classroom').boxplot(return_type=t) + self._check_box_return_type(returned, t, expected_keys=['A', 'B', 'C']) + + returned = df.boxplot(by='classroom', return_type=t) + self._check_box_return_type(returned, t, + expected_keys=['height', 'weight', 'category']) + + returned = df2.groupby('category').boxplot(return_type=t) + self._check_box_return_type(returned, t, expected_keys=categories2) + + returned = df2.boxplot(by='category', return_type=t) + self._check_box_return_type(returned, t, expected_keys=columns2) + + @slow + def test_grouped_box_layout(self): + df = self.hist_df + + self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], + by=df.gender, layout=(1, 1)) + self.assertRaises(ValueError, df.boxplot, column=['height', 'weight', 'category'], + layout=(2, 1), return_type='dict') + + box = _check_plot_works(df.groupby('gender').boxplot, column='height', + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) + + box = _check_plot_works(df.groupby('category').boxplot, column='height', + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) + + # GH 6769 + box = _check_plot_works(df.groupby('classroom').boxplot, + column='height', return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + # GH 5897 + axes = df.boxplot(column=['height', 'weight', 'category'], by='gender', + return_type='axes') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + for ax in [axes['height']]: + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible([ax.xaxis.get_label()], visible=False) + for ax in 
[axes['weight'], axes['category']]: + self._check_visible(ax.get_xticklabels()) + self._check_visible([ax.xaxis.get_label()]) + + box = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + box = _check_plot_works(df.groupby('category').boxplot, column='height', + layout=(3, 2), return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + + box = df.boxplot(column=['height', 'weight', 'category'], by='gender', layout=(4, 1)) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) + + box = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], layout=(1, 4), + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) + + @slow + def test_grouped_hist_layout(self): + + df = self.hist_df + self.assertRaises(ValueError, df.hist, column='weight', by=df.gender, + layout=(1, 1)) + self.assertRaises(ValueError, df.hist, column='height', by=df.category, + layout=(1, 3)) + + axes = _check_plot_works(df.hist, column='height', by=df.gender, layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + axes = _check_plot_works(df.hist, column='height', by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = _check_plot_works(df.hist, column='height', by=df.category, + layout=(4, 2), figsize=(12, 8)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) + + # GH 6769 + axes = _check_plot_works(df.hist, column='height', by='classroom', layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + # without column + axes = _check_plot_works(df.hist, by='classroom') + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = _check_plot_works(df.hist, by='gender', layout=(3, 5)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) + + axes = _check_plot_works(df.hist, column=['height', 'weight', 'category']) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + @slow + def test_axis_share_x(self): + df = self.hist_df + # GH4089 + ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True) + + # share x + self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) + + # don't share y + self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2)) + + @slow + def test_axis_share_y(self): + df = self.hist_df + ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) + + # share y + self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) + + # don't share x + self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2)) + + @slow + def test_axis_share_xy(self): + df = self.hist_df + ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, + sharey=True) + + # share both x and y + self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) + + self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) + + def test_option_mpl_style(self): + set_option('display.mpl_style', 'default') + set_option('display.mpl_style', None) + set_option('display.mpl_style', False) + + with tm.assertRaises(ValueError): + set_option('display.mpl_style', 'default2') + + def 
test_invalid_colormap(self): + df = DataFrame(randn(3, 2), columns=['A', 'B']) + + with tm.assertRaises(ValueError): + df.plot(colormap='invalid_colormap') + + +def assert_is_valid_plot_return_object(objs): + import matplotlib.pyplot as plt + if isinstance(objs, np.ndarray): + for el in objs.flat: + assert isinstance(el, plt.Axes), ('one of \'objs\' is not a ' + 'matplotlib Axes instance, ' + 'type encountered {0!r}' + ''.format(el.__class__.__name__)) + else: + assert isinstance(objs, (plt.Artist, tuple, dict)), \ + ('objs is neither an ndarray of Artist instances nor a ' + 'single Artist instance, tuple, or dict, "objs" is a {0!r} ' + ''.format(objs.__class__.__name__)) + + +def _check_plot_works(f, *args, **kwargs): + import matplotlib.pyplot as plt + ret = None + + try: + try: + fig = kwargs['figure'] + except KeyError: + fig = plt.gcf() + + plt.clf() + + ax = kwargs.get('ax', fig.add_subplot(211)) + ret = f(*args, **kwargs) + + assert_is_valid_plot_return_object(ret) + + try: + kwargs['ax'] = fig.add_subplot(212) + ret = f(*args, **kwargs) + except Exception: + pass + else: + assert_is_valid_plot_return_object(ret) + + with ensure_clean(return_filelike=True) as path: + plt.savefig(path) + finally: + tm.close(fig) + + return ret + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py new file mode 100644 index 00000000..434591a8 --- /dev/null +++ b/pandas/tests/test_groupby.py @@ -0,0 +1,4473 @@ +from __future__ import print_function +import nose + +from numpy.testing.decorators import slow + +from datetime import datetime +from numpy import nan + +from pandas import date_range,bdate_range, Timestamp +from pandas.core.index import Index, MultiIndex, Int64Index +from pandas.core.common import rands +from pandas.core.api import Categorical, DataFrame +from pandas.core.groupby import (SpecificationError, DataError, + _nargsort, _lexsort_indexer) +from pandas.core.series import Series +from pandas.util.testing import (assert_panel_equal, assert_frame_equal, + assert_series_equal, assert_almost_equal, + assert_index_equal, assertRaisesRegexp) +from pandas.compat import( + range, long, lrange, StringIO, lmap, lzip, map, + zip, builtins, OrderedDict +) +from pandas import compat +from pandas.core.panel import Panel +from pandas.tools.merge import concat +from collections import defaultdict +import pandas.core.common as com +import numpy as np + +import pandas.core.nanops as nanops + +import pandas.util.testing as tm +import pandas as pd +from numpy.testing import assert_equal + +def _skip_if_mpl_not_installed(): + try: + import matplotlib.pyplot as plt + except ImportError: + raise nose.SkipTest("matplotlib not installed") + +def commonSetUp(self): + self.dateRange = bdate_range('1/1/2005', periods=250) + self.stringIndex = Index([rands(8).upper() for x in range(250)]) + + self.groupId = Series([x[0] for x in self.stringIndex], + index=self.stringIndex) + self.groupDict = dict((k, v) for k, v in compat.iteritems(self.groupId)) + + self.columnIndex = Index(['A', 'B', 'C', 'D', 'E']) + + randMat = np.random.randn(250, 5) + self.stringMatrix = DataFrame(randMat, columns=self.columnIndex, + index=self.stringIndex) + + self.timeMatrix = DataFrame(randMat, columns=self.columnIndex, + index=self.dateRange) + + +class TestGroupBy(tm.TestCase): + + _multiprocess_can_split_ = True + + 
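+    # shared fixtures for the groupby tests: a time series, small mixed-dtype frames, a MultiIndexed frame and a three-key frame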
def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array(np.random.randn(8), + dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_basic(self): + + def checkit(dtype): + data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + for k, v in grouped: + self.assertEqual(len(v), 3) + + agged = grouped.aggregate(np.mean) + self.assertEqual(agged[1], 1) + + assert_series_equal(agged, grouped.agg(np.mean)) # shorthand + assert_series_equal(agged, grouped.mean()) + assert_series_equal(grouped.agg(np.sum), grouped.sum()) + + expected = grouped.apply(lambda x: x * x.sum()) + transformed = grouped.transform(lambda x: x * x.sum()) + self.assertEqual(transformed[7], 12) + assert_series_equal(transformed, expected) + + value_grouped = data.groupby(data) + assert_series_equal(value_grouped.aggregate(np.mean), agged) + + # complex agg + agged = grouped.aggregate([np.mean, np.std]) + agged = grouped.aggregate({'one': np.mean, + 'two': np.std}) + + group_constants = { + 0: 10, + 1: 20, + 2: 30 + } + agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) + self.assertEqual(agged[1], 21) + + # corner cases + self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2) + + for dtype in ['int64', 'int32', 'float64', 'float32']: + checkit(dtype) + + def test_select_bad_cols(self): + df = DataFrame([[1, 2]], columns=['A', 'B']) + g = df.groupby('A') + self.assertRaises(KeyError, g.__getitem__, ['C']) # g[['C']] + + self.assertRaises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']] + with assertRaisesRegexp(KeyError, '^[^A]+$'): + # A should not be referenced as a bad column... + # will have to rethink regex if you change message! 
+ g[['A', 'C']] + + def test_first_last_nth(self): + # tests for first / last / nth + grouped = self.df.groupby('A') + first = grouped.first() + expected = self.df.ix[[1, 0], ['B','C','D']] + expected.index = Index(['bar', 'foo'],name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + nth = grouped.nth(0) + assert_frame_equal(nth, expected) + + last = grouped.last() + expected = self.df.ix[[5, 7], ['B','C','D']] + expected.index = Index(['bar', 'foo'],name='A') + assert_frame_equal(last, expected) + + nth = grouped.nth(-1) + assert_frame_equal(nth, expected) + + nth = grouped.nth(1) + expected = self.df.ix[[2, 3],['B','C','D']].copy() + expected.index = Index(['foo', 'bar'],name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # it works! + grouped['B'].first() + grouped['B'].last() + grouped['B'].nth(0) + + self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan + self.assertTrue(com.isnull(grouped['B'].first()['foo'])) + self.assertTrue(com.isnull(grouped['B'].last()['foo'])) + self.assertTrue(com.isnull(grouped['B'].nth(0)[0])) # not sure what this is testing + + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + result = g.first() + expected = df.iloc[[1,2]].set_index('A') + assert_frame_equal(result, expected) + + expected = df.iloc[[1,2]].set_index('A') + result = g.nth(0,dropna='any') + assert_frame_equal(result, expected) + + def test_first_last_nth_dtypes(self): + + df = self.df_mixed_floats.copy() + df['E'] = True + df['F'] = 1 + + # tests for first / last / nth + grouped = df.groupby('A') + first = grouped.first() + expected = df.ix[[1, 0], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + last = grouped.last() + expected = df.ix[[5, 7], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(last, expected) + + nth = grouped.nth(1) + expected = df.ix[[3, 2],['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # GH 2763, first/last shifting dtypes + idx = lrange(10) + idx.append(9) + s = Series(data=lrange(11), index=idx, name='IntCol') + self.assertEqual(s.dtype, 'int64') + f = s.groupby(level=0).first() + self.assertEqual(f.dtype, 'int64') + + def test_nth(self): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) + assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) + assert_frame_equal(g.nth(2), df.loc[[],['B']]) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) + assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) + assert_frame_equal(g.nth(-3), df.loc[[],['B']]) + assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) + assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['A', 'B']].set_index('A')) + + exp = df.set_index('A') + assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) + + exp['B'] = np.nan + assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + + # out of bounds, regression from 0.13.1 + # GH 6621 + df = DataFrame({'color': {0: 'green', 1: 'green', 2: 'red', 
3: 'red', 4: 'red'}, + 'food': {0: 'ham', 1: 'eggs', 2: 'eggs', 3: 'ham', 4: 'pork'}, + 'two': {0: 1.5456590000000001, 1: -0.070345000000000005, 2: -2.4004539999999999, 3: 0.46206000000000003, 4: 0.52350799999999997}, + 'one': {0: 0.56573799999999996, 1: -0.9742360000000001, 2: 1.033801, 3: -0.78543499999999999, 4: 0.70422799999999997}}).set_index(['color', 'food']) + + result = df.groupby(level=0).nth(2) + expected = df.iloc[[-1]] + assert_frame_equal(result,expected) + + result = df.groupby(level=0).nth(3) + expected = df.loc[[]] + assert_frame_equal(result,expected) + + # GH 7559 + # from the vbench + df = DataFrame(np.random.randint(1, 10, (100, 2)),dtype='int64') + s = df[1] + g = df[0] + expected = s.groupby(g).first() + expected2 = s.groupby(g).apply(lambda x: x.iloc[0]) + assert_series_equal(expected2,expected) + + # validate first + v = s[g==1].iloc[0] + self.assertEqual(expected.iloc[0],v) + self.assertEqual(expected2.iloc[0],v) + + # this is NOT the same as .first (as sorted is default!) + # as it keeps the order in the series (and not the group order) + # related GH 7287 + expected = s.groupby(g,sort=False).first() + expected.index = range(1,10) + result = s.groupby(g).nth(0,dropna='all') + assert_series_equal(result,expected) + + # doc example + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + result = g.B.nth(0, dropna=True) + expected = g.B.first() + assert_series_equal(result,expected) + + def test_grouper_index_types(self): + # related GH5375 + # groupby misbehaving when using a Floatlike index + df = DataFrame(np.arange(10).reshape(5,2),columns=list('AB')) + for index in [ tm.makeFloatIndex, tm.makeStringIndex, + tm.makeUnicodeIndex, tm.makeIntIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + + df.index = index(len(df)) + df.groupby(list('abcde')).apply(lambda x: x) + + df.index = list(reversed(df.index.tolist())) + df.groupby(list('abcde')).apply(lambda x: x) + + def test_grouper_iter(self): + self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo']) + + def test_empty_groups(self): + # GH # 1048 + self.assertRaises(ValueError, self.df.groupby, []) + + def test_groupby_grouper(self): + grouped = self.df.groupby('A') + + result = self.df.groupby(grouped.grouper).mean() + expected = grouped.mean() + assert_frame_equal(result, expected) + + def test_groupby_dict_mapping(self): + # GH #679 + from pandas import Series + s = Series({'T1': 5}) + result = s.groupby({'T1': 'T2'}).agg(sum) + expected = s.groupby(['T2']).agg(sum) + assert_series_equal(result, expected) + + s = Series([1., 2., 3., 4.], index=list('abcd')) + mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1} + + result = s.groupby(mapping).mean() + result2 = s.groupby(mapping).agg(np.mean) + expected = s.groupby([0, 0, 1, 1]).mean() + expected2 = s.groupby([0, 0, 1, 1]).mean() + assert_series_equal(result, expected) + assert_series_equal(result, result2) + assert_series_equal(result, expected2) + + def test_groupby_bounds_check(self): + import pandas as pd + # groupby_X is code-generated, so if one variant + # does, the rest probably do to + a = np.array([1,2],dtype='object') + b = np.array([1,2,3],dtype='object') + self.assertRaises(AssertionError, pd.algos.groupby_object,a, b) + + def test_groupby_grouper_f_sanity_checked(self): + import pandas as pd + dates = date_range('01-Jan-2013', periods=12, freq='MS') + ts = pd.TimeSeries(np.random.randn(12), index=dates) + + # GH3035 + # index.map is used to apply grouper to the index + # if it fails on the elements, map tries 
it on the entire index as + # a sequence. That can yield invalid results that cause trouble + # down the line. + # the surprise comes from using key[0:6] rather than str(key)[0:6] + # when the elements are Timestamp. + # the result is Index[0:6], very confusing. + + self.assertRaises(AssertionError, ts.groupby,lambda key: key[0:6]) + + def test_groupby_nonobject_dtype(self): + key = self.mframe.index.labels[0] + grouped = self.mframe.groupby(key) + result = grouped.sum() + + expected = self.mframe.groupby(key.astype('O')).sum() + assert_frame_equal(result, expected) + + # GH 3911, mixed frame non-conversion + df = self.df_mixed_floats.copy() + df['value'] = lrange(len(df)) + + def max_value(group): + return group.ix[group['value'].idxmax()] + + applied = df.groupby('A').apply(max_value) + result = applied.get_dtype_counts() + result.sort() + expected = Series({ 'object' : 2, 'float64' : 2, 'int64' : 1 }) + expected.sort() + assert_series_equal(result,expected) + + def test_groupby_return_type(self): + + # GH2893, return a reduced type + df1 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19}, + {"val1":2, "val2": 27}, {"val1":2, "val2": 12}]) + + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + result = df1.groupby("val1", squeeze=True).apply(func) + tm.assert_isinstance(result,Series) + + df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19}, + {"val1":1, "val2": 27}, {"val1":1, "val2": 12}]) + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + result = df2.groupby("val1", squeeze=True).apply(func) + tm.assert_isinstance(result,Series) + + # GH3596, return a consistent type (regression in 0.11 from 0.10.1) + df = DataFrame([[1,1],[1,1]],columns=['X','Y']) + result = df.groupby('X',squeeze=False).count() + tm.assert_isinstance(result,DataFrame) + + # GH5592 + # inconsistent return type + df = DataFrame(dict(A = [ 'Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', 'Pony', 'Pony' ], + B = Series(np.arange(7),dtype='int64'), + C = date_range('20130101',periods=7))) + + def f(grp): + return grp.iloc[0] + expected = df.groupby('A').first()[['B']] + result = df.groupby('A').apply(f)[['B']] + assert_frame_equal(result,expected) + + def f(grp): + if grp.name == 'Tiger': + return None + return grp.iloc[0] + result = df.groupby('A').apply(f)[['B']] + e = expected.copy() + e.loc['Tiger'] = np.nan + assert_frame_equal(result,e) + + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0] + result = df.groupby('A').apply(f)[['B']] + e = expected.copy() + e.loc['Pony'] = np.nan + assert_frame_equal(result,e) + + # 5592 revisited, with datetimes + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0] + result = df.groupby('A').apply(f)[['C']] + e = df.groupby('A').first()[['C']] + e.loc['Pony'] = np.nan + assert_frame_equal(result,e) + + # scalar outputs + def f(grp): + if grp.name == 'Pony': + return None + return grp.iloc[0].loc['C'] + result = df.groupby('A').apply(f) + e = df.groupby('A').first()['C'].copy() + e.loc['Pony'] = np.nan + e.name = None + assert_series_equal(result,e) + + def test_agg_api(self): + + # GH 6337 + # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # different api for agg when passed custom function with mixed frame + + df = DataFrame({'data1':np.random.randn(5), + 'data2':np.random.randn(5), + 'key1':['a','a','b','b','a'], + 'key2':['one','two','one','two','one']}) + grouped = df.groupby('key1') + + def peak_to_peak(arr): + return arr.max() - arr.min() 
+ + expected = grouped.agg([peak_to_peak]) + expected.columns=['data1','data2'] + result = grouped.agg(peak_to_peak) + assert_frame_equal(result,expected) + + def test_agg_regression1(self): + grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + def test_agg_datetimes_mixed(self): + data = [[1, '2012-01-01', 1.0], + [2, '2012-01-02', 2.0], + [3, None, 3.0]] + + df1 = DataFrame({'key': [x[0] for x in data], + 'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() + if row[1] else None, row[2]] for row in data] + + df2 = DataFrame({'key': [x[0] for x in data], + 'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + df1['weights'] = df1['value'] / df1['value'].sum() + gb1 = df1.groupby('date').aggregate(np.sum) + + df2['weights'] = df1['value'] / df1['value'].sum() + gb2 = df2.groupby('date').aggregate(np.sum) + + assert(len(gb1) == len(gb2)) + + def test_agg_period_index(self): + from pandas import period_range, PeriodIndex + prng = period_range('2012-1-1', freq='M', periods=3) + df = DataFrame(np.random.randn(3, 2), index=prng) + rs = df.groupby(level=0).sum() + tm.assert_isinstance(rs.index, PeriodIndex) + + # GH 3579 + index = period_range(start='1999-01', periods=5, freq='M') + s1 = Series(np.random.rand(len(index)), index=index) + s2 = Series(np.random.rand(len(index)), index=index) + series = [('s1', s1), ('s2',s2)] + df = DataFrame.from_items(series) + grouped = df.groupby(df.index.month) + list(grouped) + + def test_agg_must_agg(self): + grouped = self.df.groupby('A')['C'] + self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) + self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) + + def test_agg_ser_multi_key(self): + ser = self.df.C + f = lambda x: x.sum() + results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) + expected = self.df.groupby(['A', 'B']).sum()['C'] + assert_series_equal(results, expected) + + def test_get_group(self): + wp = tm.makePanel() + grouped = wp.groupby(lambda x: x.month, axis='major') + + gp = grouped.get_group(1) + expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1]) + assert_panel_equal(gp, expected) + + + # GH 5267 + # be datelike friendly + df = DataFrame({'DATE' : pd.to_datetime(['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', + '11-Oct-2013', '11-Oct-2013', '11-Oct-2013']), + 'label' : ['foo','foo','bar','foo','foo','bar'], + 'VAL' : [1,2,3,4,5,6]}) + + g = df.groupby('DATE') + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group(Timestamp(key).to_datetime()) + result3 = g.get_group(str(Timestamp(key))) + assert_frame_equal(result1,result2) + assert_frame_equal(result1,result3) + + g = df.groupby(['DATE','label']) + + key = list(g.groups)[0] + result1 = g.get_group(key) + result2 = g.get_group((Timestamp(key[0]).to_datetime(),key[1])) + result3 = g.get_group((str(Timestamp(key[0])),key[1])) + assert_frame_equal(result1,result2) + assert_frame_equal(result1,result3) + + # must pass a same-length tuple with multiple keys + self.assertRaises(ValueError, lambda : g.get_group('foo')) + self.assertRaises(ValueError, lambda : g.get_group(('foo'))) + self.assertRaises(ValueError, lambda : g.get_group(('foo','bar','baz'))) + + def test_agg_apply_corner(self): + # nothing to group, all NA + grouped = self.ts.groupby(self.ts * np.nan) + + assert_series_equal(grouped.sum(), 
Series([])) + assert_series_equal(grouped.agg(np.sum), Series([])) + assert_series_equal(grouped.apply(np.sum), Series([])) + + # DataFrame + grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) + exp_df = DataFrame(columns=self.tsframe.columns, dtype=float) + assert_frame_equal(grouped.sum(), exp_df, check_names=False) + assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) + assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float)) + + def test_agg_grouping_is_list_tuple(self): + from pandas.core.groupby import Grouping + + df = tm.makeTimeDataFrame() + + grouped = df.groupby(lambda x: x.year) + grouper = grouped.grouper.groupings[0].grouper + grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_agg_python_multiindex(self): + grouped = self.mframe.groupby(['A', 'B']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_apply_describe_bug(self): + grouped = self.mframe.groupby(level='first') + result = grouped.describe() # it works! + + def test_apply_issues(self): + # GH 5788 + + s="""2011.05.16,00:00,1.40893 +2011.05.16,01:00,1.40760 +2011.05.16,02:00,1.40750 +2011.05.16,03:00,1.40649 +2011.05.17,02:00,1.40893 +2011.05.17,03:00,1.40760 +2011.05.17,04:00,1.40750 +2011.05.17,05:00,1.40649 +2011.05.18,02:00,1.40893 +2011.05.18,03:00,1.40760 +2011.05.18,04:00,1.40750 +2011.05.18,05:00,1.40649""" + + df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], parse_dates=[['date', 'time']]) + df = df.set_index('date_time') + + expected = df.groupby(df.index.date).idxmax() + result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) + assert_frame_equal(result,expected) + + # GH 5789 + # don't auto coerce dates + df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value']) + expected = Series(['00:00','02:00','02:00'],index=['2011.05.16','2011.05.17','2011.05.18']) + result = df.groupby('date').apply(lambda x: x['time'][x['value'].idxmax()]) + assert_series_equal(result,expected) + + def test_len(self): + df = tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, + lambda x: x.month, + lambda x: x.day]) + self.assertEqual(len(grouped), len(df)) + + grouped = df.groupby([lambda x: x.year, + lambda x: x.month]) + expected = len(set([(x.year, x.month) for x in df.index])) + self.assertEqual(len(grouped), expected) + + def test_groups(self): + grouped = self.df.groupby(['A']) + groups = grouped.groups + self.assertIs(groups, grouped.groups) # caching works + + for k, v in compat.iteritems(grouped.groups): + self.assertTrue((self.df.ix[v]['A'] == k).all()) + + grouped = self.df.groupby(['A', 'B']) + groups = grouped.groups + self.assertIs(groups, grouped.groups) # caching works + for k, v in compat.iteritems(grouped.groups): + self.assertTrue((self.df.ix[v]['A'] == k[0]).all()) + self.assertTrue((self.df.ix[v]['B'] == k[1]).all()) + + def test_aggregate_str_func(self): + + def _check_results(grouped): + # single series + result = grouped['A'].agg('std') + expected = grouped['A'].std() + assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate('var') + expected = grouped.var() + assert_frame_equal(result, 
expected) + + # group frame by function dict + result = grouped.agg(OrderedDict([['A', 'var'], + ['B', 'std'], + ['C', 'mean'], + ['D', 'sem']])) + expected = DataFrame(OrderedDict([['A', grouped['A'].var()], + ['B', grouped['B'].std()], + ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) + assert_frame_equal(result, expected) + + by_weekday = self.tsframe.groupby(lambda x: x.weekday()) + _check_results(by_weekday) + + by_mwkday = self.tsframe.groupby([lambda x: x.month, + lambda x: x.weekday()]) + _check_results(by_mwkday) + + def test_aggregate_item_by_item(self): + + df = self.df.copy() + df['E'] = ['a'] * len(self.df) + grouped = self.df.groupby('A') + + # API change in 0.11 + # def aggfun(ser): + # return len(ser + 'a') + # result = grouped.agg(aggfun) + # self.assertEqual(len(result.columns), 1) + + aggfun = lambda ser: ser.size + result = grouped.agg(aggfun) + foo = (self.df.A == 'foo').sum() + bar = (self.df.A == 'bar').sum() + K = len(result.columns) + + # GH5782 + # odd comparisons can result here, so cast to make easy + assert_almost_equal(result.xs('foo'), np.array([foo] * K).astype('float64')) + assert_almost_equal(result.xs('bar'), np.array([bar] * K).astype('float64')) + + def aggfun(ser): + return ser.size + result = DataFrame().groupby(self.df.A).agg(aggfun) + tm.assert_isinstance(result, DataFrame) + self.assertEqual(len(result), 0) + + def test_agg_item_by_item_raise_typeerror(self): + from numpy.random import randint + + df = DataFrame(randint(10, size=(20, 10))) + + def raiseException(df): + com.pprint_thing('----------------------------------------') + com.pprint_thing(df.to_string()) + raise TypeError + + self.assertRaises(TypeError, df.groupby(0).agg, + raiseException) + + def test_basic_regression(self): + # regression + T = [1.0 * x for x in lrange(1, 10) * 10][:1095] + result = Series(T, lrange(0, len(T))) + + groupings = np.random.random((1100,)) + groupings = Series(groupings, lrange(0, len(groupings))) * 10. 
+ + grouped = result.groupby(groupings) + grouped.mean() + + def test_transform(self): + data = Series(np.arange(9) // 3, index=np.arange(9)) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + transformed = grouped.transform(lambda x: x * x.sum()) + self.assertEqual(transformed[7], 12) + + def test_transform_broadcast(self): + grouped = self.ts.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + + self.assertTrue(result.index.equals(self.ts.index)) + for _, gp in grouped: + assert_fp_equal(result.reindex(gp.index), gp.mean()) + + grouped = self.tsframe.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + self.assertTrue(result.index.equals(self.tsframe.index)) + for _, gp in grouped: + agged = gp.mean() + res = result.reindex(gp.index) + for col in self.tsframe: + assert_fp_equal(res[col], agged[col]) + + # group columns + grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, + axis=1) + result = grouped.transform(np.mean) + self.assertTrue(result.index.equals(self.tsframe.index)) + self.assertTrue(result.columns.equals(self.tsframe.columns)) + for _, gp in grouped: + agged = gp.mean(1) + res = result.reindex(columns=gp.columns) + for idx in gp.index: + assert_fp_equal(res.xs(idx), agged[idx]) + + def test_transform_bug(self): + # GH 5712 + # transforming on a datetime column + df = DataFrame(dict(A = Timestamp('20130101'), B = np.arange(5))) + result = df.groupby('A')['B'].transform(lambda x: x.rank(ascending=False)) + expected = Series(np.arange(5,0,step=-1),name='B') + assert_series_equal(result,expected) + + def test_transform_multiple(self): + grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) + + transformed = grouped.transform(lambda x: x * 2) + broadcasted = grouped.transform(np.mean) + + def test_dispatch_transform(self): + df = self.tsframe[::5].reindex(self.tsframe.index) + + grouped = df.groupby(lambda x: x.month) + + filled = grouped.fillna(method='pad') + fillit = lambda x: x.fillna(method='pad') + expected = df.groupby(lambda x: x.month).transform(fillit) + assert_frame_equal(filled, expected) + + def test_transform_select_columns(self): + f = lambda x: x.mean() + result = self.df.groupby('A')['C', 'D'].transform(f) + + selection = self.df[['C', 'D']] + expected = selection.groupby(self.df['A']).transform(f) + + assert_frame_equal(result, expected) + + def test_transform_exclude_nuisance(self): + expected = {} + grouped = self.df.groupby('A') + expected['C'] = grouped['C'].transform(np.mean) + expected['D'] = grouped['D'].transform(np.mean) + expected = DataFrame(expected) + + result = self.df.groupby('A').transform(np.mean) + + assert_frame_equal(result, expected) + + def test_transform_function_aliases(self): + result = self.df.groupby('A').transform('mean') + expected = self.df.groupby('A').transform(np.mean) + assert_frame_equal(result, expected) + + result = self.df.groupby('A')['C'].transform('mean') + expected = self.df.groupby('A')['C'].transform(np.mean) + assert_series_equal(result, expected) + + def test_with_na(self): + index = Index(np.arange(10)) + + for dtype in ['float64','float32','int64','int32','int16','int8']: + values = Series(np.ones(10), index, dtype=dtype) + labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar', + 'bar', nan, 'foo'], index=index) + + + # this SHOULD be an int + grouped = values.groupby(labels) + agged = grouped.agg(len) + expected = Series([4, 2], index=['bar', 'foo']) + + assert_series_equal(agged, 
expected, check_dtype=False) + #self.assertTrue(issubclass(agged.dtype.type, np.integer)) + + # explicitly return a float from my function + def f(x): + return float(len(x)) + + agged = grouped.agg(f) + expected = Series([4, 2], index=['bar', 'foo']) + + assert_series_equal(agged, expected, check_dtype=False) + self.assertTrue(issubclass(agged.dtype.type, np.dtype(dtype).type)) + + def test_groupby_transform_with_int(self): + + # GH 3740, make sure that we might upcast on item-by-item transform + + # floats + df = DataFrame(dict(A = [1,1,1,2,2,2], B = Series(1,dtype='float64'), C = Series([1,2,3,1,2,3],dtype='float64'), D = 'foo')) + result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std()) + expected = DataFrame(dict(B = np.nan, C = Series([-1,0,1,-1,0,1],dtype='float64'))) + assert_frame_equal(result,expected) + + # int case + df = DataFrame(dict(A = [1,1,1,2,2,2], B = 1, C = [1,2,3,1,2,3], D = 'foo')) + result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std()) + expected = DataFrame(dict(B = np.nan, C = [-1,0,1,-1,0,1])) + assert_frame_equal(result,expected) + + # int that needs float conversion + s = Series([2,3,4,10,5,-1]) + df = DataFrame(dict(A = [1,1,1,2,2,2], B = 1, C = s, D = 'foo')) + result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std()) + + s1 = s.iloc[0:3] + s1 = (s1-s1.mean())/s1.std() + s2 = s.iloc[3:6] + s2 = (s2-s2.mean())/s2.std() + expected = DataFrame(dict(B = np.nan, C = concat([s1,s2]))) + assert_frame_equal(result,expected) + + # int downcasting + result = df.groupby('A').transform(lambda x: x*2/2) + expected = DataFrame(dict(B = 1, C = [2,3,4,10,5,-1])) + assert_frame_equal(result,expected) + + def test_indices_concatenation_order(self): + + # GH 2808 + + def f1(x): + y = x[(x.b % 2) == 1]**2 + if y.empty: + multiindex = MultiIndex( + levels = [[]]*2, + labels = [[]]*2, + names = ['b', 'c'] + ) + res = DataFrame(None, + columns=['a'], + index=multiindex) + return res + else: + y = y.set_index(['b','c']) + return y + + def f2(x): + y = x[(x.b % 2) == 1]**2 + if y.empty: + return DataFrame() + else: + y = y.set_index(['b','c']) + return y + + def f3(x): + y = x[(x.b % 2) == 1]**2 + if y.empty: + multiindex = MultiIndex( + levels = [[]]*2, + labels = [[]]*2, + names = ['foo', 'bar'] + ) + res = DataFrame(None, + columns=['a','b'], + index=multiindex) + return res + else: + return y + + df = DataFrame({'a':[1,2,2,2], + 'b':lrange(4), + 'c':lrange(5,9)}) + + df2 = DataFrame({'a':[3,2,2,2], + 'b':lrange(4), + 'c':lrange(5,9)}) + + + # correct result + result1 = df.groupby('a').apply(f1) + result2 = df2.groupby('a').apply(f1) + assert_frame_equal(result1, result2) + + # should fail (not the same number of levels) + self.assertRaises(AssertionError, df.groupby('a').apply, f2) + self.assertRaises(AssertionError, df2.groupby('a').apply, f2) + + # should fail (incorrect shape) + self.assertRaises(AssertionError, df.groupby('a').apply, f3) + self.assertRaises(AssertionError, df2.groupby('a').apply, f3) + + def test_attr_wrapper(self): + grouped = self.ts.groupby(lambda x: x.weekday()) + + result = grouped.std() + expected = grouped.agg(lambda x: np.std(x, ddof=1)) + assert_series_equal(result, expected) + + # this is pretty cool + result = grouped.describe() + expected = {} + for name, gp in grouped: + expected[name] = gp.describe() + expected = DataFrame(expected).T + assert_frame_equal(result.unstack(), expected) + + # get attribute + result = grouped.dtype + expected = grouped.agg(lambda x: x.dtype) + + # make sure raises error + 
self.assertRaises(AttributeError, getattr, grouped, 'foo') + + def test_series_describe_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe().unstack() + assert_series_equal(result['mean'], grouped.mean()) + assert_series_equal(result['std'], grouped.std()) + assert_series_equal(result['min'], grouped.min()) + + def test_series_describe_single(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe() + assert_series_equal(result, expected) + + def test_series_agg_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_series_equal(result, expected) + + def test_series_agg_multi_pure_python(self): + data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def bad(x): + assert(len(x.base) > 0) + return 'foo' + + result = data.groupby(['A', 'B']).agg(bad) + expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') + assert_frame_equal(result, expected) + + def test_series_index_name(self): + grouped = self.df.ix[:, ['C']].groupby(self.df['A']) + result = grouped.agg(lambda x: x.mean()) + self.assertEqual(result.index.name, 'A') + + def test_frame_describe_multikey(self): + grouped = self.tsframe.groupby([lambda x: x.year, + lambda x: x.month]) + result = grouped.describe() + + for col in self.tsframe: + expected = grouped[col].describe() + assert_series_equal(result[col], expected) + + groupedT = self.tsframe.groupby({'A': 0, 'B': 0, + 'C': 1, 'D': 1}, axis=1) + result = groupedT.describe() + + for name, group in groupedT: + assert_frame_equal(result[name], group.describe()) + + def test_frame_groupby(self): + grouped = self.tsframe.groupby(lambda x: x.weekday()) + + # aggregate + aggregated = grouped.aggregate(np.mean) + self.assertEqual(len(aggregated), 5) + self.assertEqual(len(aggregated.columns), 4) + + # by string + tscopy = self.tsframe.copy() + tscopy['weekday'] = [x.weekday() for x in tscopy.index] + stragged = tscopy.groupby('weekday').aggregate(np.mean) + assert_frame_equal(stragged, aggregated, check_names=False) + + # transform + grouped = self.tsframe.head(30).groupby(lambda x: x.weekday()) + transformed = grouped.transform(lambda x: x - x.mean()) + self.assertEqual(len(transformed), 30) + self.assertEqual(len(transformed.columns), 4) + + # transform propagate + transformed = grouped.transform(lambda x: x.mean()) + for name, group in grouped: + mean = group.mean() + for idx in group.index: + assert_almost_equal(transformed.xs(idx), mean) + + # iterate + for weekday, group in grouped: + self.assertEqual(group.index[0].weekday(), weekday) + + # groups / group_indices + groups = grouped.groups + indices = grouped.indices + + for k, v in compat.iteritems(groups): + samething = self.tsframe.index.take(indices[k]) + self.assertTrue((samething == v).all()) + + def test_grouping_is_iterable(self): + # this code path isn't used anywhere else + # not sure it's useful + grouped = self.tsframe.groupby([lambda x: x.weekday(), + lambda x: x.year]) + + # test it works + for 
g in grouped.grouper.groupings[0]: + pass + + def test_frame_groupby_columns(self): + mapping = { + 'A': 0, 'B': 0, 'C': 1, 'D': 1 + } + grouped = self.tsframe.groupby(mapping, axis=1) + + # aggregate + aggregated = grouped.aggregate(np.mean) + self.assertEqual(len(aggregated), len(self.tsframe)) + self.assertEqual(len(aggregated.columns), 2) + + # transform + tf = lambda x: x - x.mean() + groupedT = self.tsframe.T.groupby(mapping, axis=0) + assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) + + # iterate + for k, v in grouped: + self.assertEqual(len(v.columns), 2) + + def test_frame_set_name_single(self): + grouped = self.df.groupby('A') + + result = grouped.mean() + self.assertEqual(result.index.name, 'A') + + result = self.df.groupby('A', as_index=False).mean() + self.assertNotEqual(result.index.name, 'A') + + result = grouped.agg(np.mean) + self.assertEqual(result.index.name, 'A') + + result = grouped.agg({'C': np.mean, 'D': np.std}) + self.assertEqual(result.index.name, 'A') + + result = grouped['C'].mean() + self.assertEqual(result.index.name, 'A') + result = grouped['C'].agg(np.mean) + self.assertEqual(result.index.name, 'A') + result = grouped['C'].agg([np.mean, np.std]) + self.assertEqual(result.index.name, 'A') + + result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) + self.assertEqual(result.index.name, 'A') + + def test_multi_iter(self): + s = Series(np.arange(6)) + k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + + grouped = s.groupby([k1, k2]) + + iterated = list(grouped) + expected = [('a', '1', s[[0, 2]]), + ('a', '2', s[[1]]), + ('b', '1', s[[4]]), + ('b', '2', s[[3, 5]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + self.assertEqual(e1, one) + self.assertEqual(e2, two) + assert_series_equal(three, e3) + + def test_multi_iter_frame(self): + k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + df = DataFrame({'v1': np.random.randn(6), + 'v2': np.random.randn(6), + 'k1': k1, 'k2': k2}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + + grouped = df.groupby(['k1', 'k2']) + + # things get sorted! 
+ iterated = list(grouped) + idx = df.index + expected = [('a', '1', df.ix[idx[[4]]]), + ('a', '2', df.ix[idx[[3, 5]]]), + ('b', '1', df.ix[idx[[0, 2]]]), + ('b', '2', df.ix[idx[[1]]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + self.assertEqual(e1, one) + self.assertEqual(e2, two) + assert_frame_equal(three, e3) + + # don't iterate through groups with no data + df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) + grouped = df.groupby(['k1', 'k2']) + groups = {} + for key, gp in grouped: + groups[key] = gp + self.assertEqual(len(groups), 2) + + # axis = 1 + three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() + grouped = three_levels.T.groupby(axis=1, level=(1, 2)) + for key, group in grouped: + pass + + def test_multi_iter_panel(self): + wp = tm.makePanel() + grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()], + axis=1) + + for (month, wd), group in grouped: + exp_axis = [x for x in wp.major_axis + if x.month == month and x.weekday() == wd] + expected = wp.reindex(major=exp_axis) + assert_panel_equal(group, expected) + + def test_multi_func(self): + col1 = self.df['A'] + col2 = self.df['B'] + + grouped = self.df.groupby([col1.get, col2.get]) + agged = grouped.mean() + expected = self.df.groupby(['A', 'B']).mean() + assert_frame_equal(agged.ix[:, ['C', 'D']], + expected.ix[:, ['C', 'D']], + check_names=False) # TODO groupby get drops names + + # some "groups" with no data + df = DataFrame({'v1': np.random.randn(6), + 'v2': np.random.randn(6), + 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + # only verify that it works for now + grouped = df.groupby(['k1', 'k2']) + grouped.agg(np.sum) + + def test_multi_key_multiple_functions(self): + grouped = self.df.groupby(['A', 'B'])['C'] + + agged = grouped.agg([np.mean, np.std]) + expected = DataFrame({'mean': grouped.agg(np.mean), + 'std': grouped.agg(np.std)}) + assert_frame_equal(agged, expected) + + def test_frame_multi_key_function_list(self): + data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + grouped = data.groupby(['A', 'B']) + funcs = [np.mean, np.std] + agged = grouped.agg(funcs) + expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), + grouped['F'].agg(funcs)], + keys=['D', 'E', 'F'], axis=1) + assert(isinstance(agged.index, MultiIndex)) + assert(isinstance(expected.index, MultiIndex)) + assert_frame_equal(agged, expected) + + def test_groupby_multiple_columns(self): + data = self.df + grouped = data.groupby(['A', 'B']) + + def _check_op(op): + + result1 = op(grouped) + + expected = defaultdict(dict) + for n1, gp1 in data.groupby('A'): + for n2, gp2 in gp1.groupby('B'): + expected[n1][n2] = op(gp2.ix[:, ['C', 'D']]) + expected = dict((k, DataFrame(v)) for k, v in compat.iteritems(expected)) + expected = Panel.fromDict(expected).swapaxes(0, 1) + expected.major_axis.name, expected.minor_axis.name = 'A', 'B' + + # a little bit crude + for col in ['C', 'D']: + result_col = op(grouped[col]) + exp = expected[col] + pivoted = result1[col].unstack() + pivoted2 = 
result_col.unstack() + assert_frame_equal(pivoted.reindex_like(exp), exp) + assert_frame_equal(pivoted2.reindex_like(exp), exp) + + _check_op(lambda x: x.sum()) + _check_op(lambda x: x.mean()) + + # test single series works the same + result = data['C'].groupby([data['A'], data['B']]).mean() + expected = data.groupby(['A', 'B']).mean()['C'] + + assert_series_equal(result, expected) + + def test_groupby_as_index_agg(self): + grouped = self.df.groupby('A', as_index=False) + + # single-key + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + grouped = self.df.groupby('A', as_index=True) + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + result3 = grouped['C'].agg({'Q': np.sum}) + assert_frame_equal(result3, expected3) + + # multi-key + + grouped = self.df.groupby(['A', 'B'], as_index=False) + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + result3 = grouped['C'].agg({'Q': np.sum}) + assert_frame_equal(result3, expected3) + + def test_multifunc_select_col_integer_cols(self): + df = self.df + df.columns = np.arange(len(df.columns)) + + # it works! + result = df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) + + def test_as_index_series_return_frame(self): + grouped = self.df.groupby('A', as_index=False) + grouped2 = self.df.groupby(['A', 'B'], as_index=False) + + result = grouped['C'].agg(np.sum) + expected = grouped.agg(np.sum).ix[:, ['A', 'C']] + tm.assert_isinstance(result, DataFrame) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].agg(np.sum) + expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']] + tm.assert_isinstance(result2, DataFrame) + assert_frame_equal(result2, expected2) + + result = grouped['C'].sum() + expected = grouped.sum().ix[:, ['A', 'C']] + tm.assert_isinstance(result, DataFrame) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].sum() + expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']] + tm.assert_isinstance(result2, DataFrame) + assert_frame_equal(result2, expected2) + + # corner case + self.assertRaises(Exception, grouped['C'].__getitem__, + 'D') + + def test_groupby_as_index_cython(self): + data = self.df + + # single-key + grouped = data.groupby('A', as_index=False) + result = grouped.mean() + expected = data.groupby(['A']).mean() + expected.insert(0, 'A', expected.index) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + # multi-key + grouped = data.groupby(['A', 'B'], as_index=False) + result = grouped.mean() + expected = data.groupby(['A', 'B']).mean() + + arrays = lzip(*expected.index._tuple_index) + expected.insert(0, 'A', arrays[0]) + expected.insert(1, 'B', arrays[1]) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + def test_groupby_as_index_series_scalar(self): + grouped = self.df.groupby(['A', 'B'], as_index=False) + + # GH #421 + + result = grouped['C'].agg(len) + expected = grouped.agg(len).ix[:, ['A', 'B', 'C']] + assert_frame_equal(result, 
expected) + + def test_groupby_as_index_corner(self): + self.assertRaises(TypeError, self.ts.groupby, + lambda x: x.weekday(), as_index=False) + + self.assertRaises(ValueError, self.df.groupby, + lambda x: x.lower(), as_index=False, axis=1) + + def test_groupby_as_index_apply(self): + # GH #4648 and #3417 + df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'], + 'user_id': [1,2,1,1,3,1], + 'time': range(6)}) + + g_as = df.groupby('user_id', as_index=True) + g_not_as = df.groupby('user_id', as_index=False) + + res_as = g_as.head(2).index + res_not_as = g_not_as.head(2).index + exp = Index([0, 1, 2, 4]) + assert_index_equal(res_as, exp) + assert_index_equal(res_not_as, exp) + + res_as_apply = g_as.apply(lambda x: x.head(2)).index + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + + # apply doesn't maintain the original ordering + # changed in GH5610 as the as_index=False returns a MI here + exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) + exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)]) + + assert_index_equal(res_as_apply, exp_as_apply) + assert_index_equal(res_not_as_apply, exp_not_as_apply) + + ind = Index(list('abcde')) + df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) + res = df.groupby(0, as_index=False).apply(lambda x: x).index + assert_index_equal(res, ind) + + def test_groupby_head_tail(self): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g_as = df.groupby('A', as_index=True) + g_not_as = df.groupby('A', as_index=False) + + # as_index= False, much easier + assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) + assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) + + empty_not_as = DataFrame(columns=df.columns) + assert_frame_equal(empty_not_as, g_not_as.head(0)) + assert_frame_equal(empty_not_as, g_not_as.tail(0)) + assert_frame_equal(empty_not_as, g_not_as.head(-1)) + assert_frame_equal(empty_not_as, g_not_as.tail(-1)) + + assert_frame_equal(df, g_not_as.head(7)) # contains all + assert_frame_equal(df, g_not_as.tail(7)) + + # as_index=True, (used to be different) + df_as = df + + assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) + assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) + + empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + assert_frame_equal(empty_as, g_as.head(0)) + assert_frame_equal(empty_as, g_as.tail(0)) + assert_frame_equal(empty_as, g_as.head(-1)) + assert_frame_equal(empty_as, g_as.tail(-1)) + + assert_frame_equal(df_as, g_as.head(7)) # contains all + assert_frame_equal(df_as, g_as.tail(7)) + + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + def test_groupby_multiple_key(self): + df = tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, + lambda x: x.month, + lambda x: x.day]) + agged = grouped.sum() + assert_almost_equal(df.values, agged.values) + + grouped = df.T.groupby([lambda x: x.year, + lambda x: x.month, + lambda x: x.day], axis=1) + + agged = grouped.agg(lambda x: x.sum(1)) + 
self.assertTrue(agged.index.equals(df.columns)) + assert_almost_equal(df.T.values, agged.values) + + agged = grouped.agg(lambda x: x.sum(1)) + assert_almost_equal(df.T.values, agged.values) + + def test_groupby_multi_corner(self): + # test that having an all-NA column doesn't mess you up + df = self.df.copy() + df['bad'] = np.nan + agged = df.groupby(['A', 'B']).mean() + + expected = self.df.groupby(['A', 'B']).mean() + expected['bad'] = np.nan + + assert_frame_equal(agged, expected) + + def test_omit_nuisance(self): + grouped = self.df.groupby('A') + + result = grouped.mean() + expected = self.df.ix[:, ['A', 'C', 'D']].groupby('A').mean() + assert_frame_equal(result, expected) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + df = self.df.ix[:, ['A', 'C', 'D']] + df['E'] = datetime.now() + grouped = df.groupby('A') + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_frame_equal(result, expected) + + # won't work with axis = 1 + grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) + result = self.assertRaises(TypeError, grouped.agg, + lambda x: x.sum(1, numeric_only=False)) + + def test_omit_nuisance_python_multiple(self): + grouped = self.three_group.groupby(['A', 'B']) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + def test_empty_groups_corner(self): + # handle empty groups + df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2': np.array(['1', '1', '1', '2', '2', '2']), + 'k3': ['foo', 'bar'] * 3, + 'v1': np.random.randn(6), + 'v2': np.random.randn(6)}) + + grouped = df.groupby(['k1', 'k2']) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + grouped = self.mframe[3:5].groupby(level=0) + agged = grouped.apply(lambda x: x.mean()) + agged_A = grouped['A'].apply(np.mean) + assert_series_equal(agged['A'], agged_A) + self.assertEqual(agged.index.name, 'first') + + def test_apply_concat_preserve_names(self): + grouped = self.three_group.groupby(['A', 'B']) + + def desc(group): + result = group.describe() + result.index.name = 'stat' + return result + + def desc2(group): + result = group.describe() + result.index.name = 'stat' + result = result[:len(group)] + # weirdo + return result + + def desc3(group): + result = group.describe() + + # names are different + result.index.name = 'stat_%d' % len(group) + + result = result[:len(group)] + # weirdo + return result + + result = grouped.apply(desc) + self.assertEqual(result.index.names, ('A', 'B', 'stat')) + + result2 = grouped.apply(desc2) + self.assertEqual(result2.index.names, ('A', 'B', 'stat')) + + result3 = grouped.apply(desc3) + self.assertEqual(result3.index.names, ('A', 'B', None)) + + def test_nonsense_func(self): + df = DataFrame([0]) + self.assertRaises(Exception, df.groupby, lambda x: x + 'foo') + + def test_cythonized_aggers(self): + data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], + 'B': ['A', 'B'] * 6, + 'C': np.random.randn(12)} + df = DataFrame(data) + df['C'][2:10:2] = nan + + def _testit(op): + # single column + grouped = df.drop(['B'], axis=1).groupby('A') + exp = {} + for cat, group in grouped: + exp[cat] = op(group['C']) + exp = DataFrame({'C': exp}) + exp.index.name = 'A' + result = op(grouped) + assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(['A', 'B']) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group['C']) + exp = DataFrame(expd).T.stack(dropna=False) + result = 
op(grouped)['C'] + assert_series_equal(result, exp) + + _testit(lambda x: x.count()) + _testit(lambda x: x.sum()) + _testit(lambda x: x.std()) + _testit(lambda x: x.var()) + _testit(lambda x: x.sem()) + _testit(lambda x: x.mean()) + _testit(lambda x: x.median()) + _testit(lambda x: x.prod()) + _testit(lambda x: x.min()) + _testit(lambda x: x.max()) + + def test_max_min_non_numeric(self): + # #2700 + aa = DataFrame({'nn':[11,11,22,22],'ii':[1,2,3,4],'ss':4*['mama']}) + + result = aa.groupby('nn').max() + self.assertTrue('ss' in result) + + result = aa.groupby('nn').min() + self.assertTrue('ss' in result) + + def test_cython_agg_boolean(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': np.random.randint(0, 2, 50).astype('bool')}) + result = frame.groupby('a')['b'].mean() + expected = frame.groupby('a')['b'].agg(np.mean) + + assert_series_equal(result, expected) + + def test_cython_agg_nothing_to_agg(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + self.assertRaises(DataError, frame.groupby('a')['b'].mean) + + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + self.assertRaises(DataError, frame[['b']].groupby(frame['a']).mean) + + def test_cython_agg_nothing_to_agg_with_dates(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25, + 'dates': pd.date_range('now', periods=50, + freq='T')}) + with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"): + frame.groupby('b').dates.mean() + + def test_groupby_timedelta_cython_count(self): + df = DataFrame({'g': list('ab' * 2), + 'delt': np.arange(4).astype('timedelta64[ns]')}) + expected = Series([2, 2], index=['a', 'b'], name='delt') + result = df.groupby('g').delt.count() + tm.assert_series_equal(expected, result) + + def test_cython_agg_frame_columns(self): + # #2113 + df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + + result = df.groupby(level=0, axis='columns').mean() + result = df.groupby(level=0, axis='columns').mean() + result = df.groupby(level=0, axis='columns').mean() + _ = df.groupby(level=0, axis='columns').mean() + + def test_wrap_aggregated_output_multindex(self): + df = self.mframe.T + df['baz', 'two'] = 'peekaboo' + + keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] + agged = df.groupby(keys).agg(np.mean) + tm.assert_isinstance(agged.columns, MultiIndex) + + def aggfun(ser): + if ser.name == ('foo', 'one'): + raise TypeError + else: + return ser.sum() + agged2 = df.groupby(keys).aggregate(aggfun) + self.assertEqual(len(agged2.columns) + 1, len(df.columns)) + + def test_groupby_level(self): + frame = self.mframe + deleveled = frame.reset_index() + + result0 = frame.groupby(level=0).sum() + result1 = frame.groupby(level=1).sum() + + expected0 = frame.groupby(deleveled['first'].values).sum() + expected1 = frame.groupby(deleveled['second'].values).sum() + + expected0 = expected0.reindex(frame.index.levels[0]) + expected1 = expected1.reindex(frame.index.levels[1]) + + self.assertEqual(result0.index.name, 'first') + self.assertEqual(result1.index.name, 'second') + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + self.assertEqual(result0.index.name, frame.index.names[0]) + self.assertEqual(result1.index.name, frame.index.names[1]) + + # groupby level name + result0 = frame.groupby(level='first').sum() + result1 = frame.groupby(level='second').sum() + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + # axis=1 + + result0 = 
frame.T.groupby(level=0, axis=1).sum() + result1 = frame.T.groupby(level=1, axis=1).sum() + assert_frame_equal(result0, expected0.T) + assert_frame_equal(result1, expected1.T) + + # raise exception for non-MultiIndex + self.assertRaises(ValueError, self.df.groupby, level=1) + + def test_groupby_level_index_names(self): + ## GH4014 this used to raise ValueError since 'exp'>1 (in py2) + df = DataFrame({'exp' : ['A']*3 + ['B']*3, 'var1' : lrange(6),}).set_index('exp') + df.groupby(level='exp') + self.assertRaises(ValueError, df.groupby, level='foo') + + def test_groupby_level_with_nas(self): + index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], + labels=[[1, 1, 1, 1, 0, 0, 0, 0], + [0, 1, 2, 3, 0, 1, 2, 3]]) + + # factorizing doesn't confuse things + s = Series(np.arange(8.), index=index) + result = s.groupby(level=0).sum() + expected = Series([22., 6.], index=[1, 0]) + assert_series_equal(result, expected) + + index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], + labels=[[1, 1, 1, 1, -1, 0, 0, 0], + [0, 1, 2, 3, 0, 1, 2, 3]]) + + # factorizing doesn't confuse things + s = Series(np.arange(8.), index=index) + result = s.groupby(level=0).sum() + expected = Series([18., 6.], index=[1, 0]) + assert_series_equal(result, expected) + + def test_groupby_level_apply(self): + frame = self.mframe + + result = frame.groupby(level=0).count() + self.assertEqual(result.index.name, 'first') + result = frame.groupby(level=1).count() + self.assertEqual(result.index.name, 'second') + + result = frame['A'].groupby(level=0).count() + self.assertEqual(result.index.name, 'first') + + def test_groupby_level_mapper(self): + frame = self.mframe + deleveled = frame.reset_index() + + mapper0 = {'foo': 0, 'bar': 0, + 'baz': 1, 'qux': 1} + mapper1 = {'one': 0, 'two': 0, 'three': 1} + + result0 = frame.groupby(mapper0, level=0).sum() + result1 = frame.groupby(mapper1, level=1).sum() + + mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) + expected0 = frame.groupby(mapped_level0).sum() + expected1 = frame.groupby(mapped_level1).sum() + expected0.index.name, expected1.index.name = 'first', 'second' + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + def test_groupby_level_0_nonmulti(self): + # #1313 + a = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, + 4, 5, 2, 6], name='foo')) + + result = a.groupby(level=0).sum() + self.assertEqual(result.index.name, a.index.name) + + def test_level_preserve_order(self): + grouped = self.mframe.groupby(level=0) + exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3]) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + def test_grouping_labels(self): + grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3]) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + def test_cython_fail_agg(self): + dr = bdate_range('1/1/2000', periods=50) + ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + expected = grouped.agg(np.sum) + assert_series_equal(summed, expected) + + def test_apply_series_to_frame(self): + def f(piece): + return DataFrame({'value': piece, + 'demeaned': piece - piece.mean(), + 'logged': np.log(piece)}) + + dr = bdate_range('1/1/2000', periods=100) + ts = Series(np.random.randn(100), index=dr) + + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(f) + + 
tm.assert_isinstance(result, DataFrame) + self.assertTrue(result.index.equals(ts.index)) + + def test_apply_series_yield_constant(self): + result = self.df.groupby(['A', 'B'])['C'].apply(len) + self.assertEqual(result.index.names[:2], ('A', 'B')) + + def test_apply_frame_to_series(self): + grouped = self.df.groupby(['A', 'B']) + result = grouped.apply(len) + expected = grouped.count()['C'] + self.assertTrue(result.index.equals(expected.index)) + self.assert_numpy_array_equal(result.values, expected.values) + + def test_apply_frame_concat_series(self): + def trans(group): + return group.groupby('B')['C'].sum().order()[:2] + + def trans2(group): + grouped = group.groupby(df.reindex(group.index)['B']) + return grouped.sum().order()[:2] + + df = DataFrame({'A': np.random.randint(0, 5, 1000), + 'B': np.random.randint(0, 5, 1000), + 'C': np.random.randn(1000)}) + + result = df.groupby('A').apply(trans) + exp = df.groupby('A')['C'].apply(trans2) + assert_series_equal(result, exp) + + def test_apply_transform(self): + grouped = self.ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x * 2) + expected = grouped.transform(lambda x: x * 2) + assert_series_equal(result, expected) + + def test_apply_multikey_corner(self): + grouped = self.tsframe.groupby([lambda x: x.year, + lambda x: x.month]) + + def f(group): + return group.sort('A')[-5:] + + result = grouped.apply(f) + for key, group in grouped: + assert_frame_equal(result.ix[key], f(group)) + + def test_mutate_groups(self): + + # GH3380 + + mydf = DataFrame({ + 'cat1' : ['a'] * 8 + ['b'] * 6, + 'cat2' : ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + ['d'] * 2 + ['e'] * 2, + 'cat3' : lmap(lambda x: 'g%s' % x, lrange(1,15)), + 'val' : np.random.randint(100, size=14), + }) + + def f_copy(x): + x = x.copy() + x['rank'] = x.val.rank(method='min') + return x.groupby('cat2')['rank'].min() + + def f_no_copy(x): + x['rank'] = x.val.rank(method='min') + return x.groupby('cat2')['rank'].min() + + grpby_copy = mydf.groupby('cat1').apply(f_copy) + grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy) + assert_series_equal(grpby_copy,grpby_no_copy) + + def test_apply_chunk_view(self): + # Low level tinkering could be unsafe, make sure not + df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'value': lrange(9)}) + + # return view + f = lambda x: x[:2] + + result = df.groupby('key', group_keys=False).apply(f) + expected = df.take([0, 1, 3, 4, 6, 7]) + assert_frame_equal(result, expected) + + def test_apply_no_name_column_conflict(self): + df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], + 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], + 'value': lrange(10)[::-1]}) + + # it works! 
#2605 + grouped = df.groupby(['name', 'name2']) + grouped.apply(lambda x: x.sort('value')) + + def test_groupby_series_indexed_differently(self): + s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], + index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) + s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], + index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) + + grouped = s1.groupby(s2) + agged = grouped.mean() + exp = s1.groupby(s2.reindex(s1.index).get).mean() + assert_series_equal(agged, exp) + + def test_groupby_with_hier_columns(self): + tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']])) + index = MultiIndex.from_tuples(tuples) + columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), + ('B', 'cat'), ('A', 'dog')]) + df = DataFrame(np.random.randn(8, 4), index=index, + columns=columns) + + result = df.groupby(level=0).mean() + self.assertTrue(result.columns.equals(columns)) + + result = df.groupby(level=0, axis=1).mean() + self.assertTrue(result.index.equals(df.index)) + + result = df.groupby(level=0).agg(np.mean) + self.assertTrue(result.columns.equals(columns)) + + result = df.groupby(level=0).apply(lambda x: x.mean()) + self.assertTrue(result.columns.equals(columns)) + + result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) + self.assertTrue(result.columns.equals(Index(['A', 'B']))) + self.assertTrue(result.index.equals(df.index)) + + # add a nuisance column + sorted_columns, _ = columns.sortlevel(0) + df['A', 'foo'] = 'bar' + result = df.groupby(level=0).mean() + self.assertTrue(result.columns.equals(df.columns[:-1])) + + def test_pass_args_kwargs(self): + from numpy import percentile + + def f(x, q=None, axis=0): + return percentile(x, q, axis=axis) + g = lambda x: percentile(x, 80, axis=0) + + # Series + ts_grouped = self.ts.groupby(lambda x: x.month) + agg_result = ts_grouped.agg(percentile, 80, axis=0) + apply_result = ts_grouped.apply(percentile, 80, axis=0) + trans_result = ts_grouped.transform(percentile, 80, axis=0) + + agg_expected = ts_grouped.quantile(.8) + trans_expected = ts_grouped.transform(g) + + assert_series_equal(apply_result, agg_expected) + assert_series_equal(agg_result, agg_expected) + assert_series_equal(trans_result, trans_expected) + + agg_result = ts_grouped.agg(f, q=80) + apply_result = ts_grouped.apply(f, q=80) + trans_result = ts_grouped.transform(f, q=80) + assert_series_equal(agg_result, agg_expected) + assert_series_equal(apply_result, agg_expected) + assert_series_equal(trans_result, trans_expected) + + # DataFrame + df_grouped = self.tsframe.groupby(lambda x: x.month) + agg_result = df_grouped.agg(percentile, 80, axis=0) + apply_result = df_grouped.apply(DataFrame.quantile, .8) + expected = df_grouped.quantile(.8) + assert_frame_equal(apply_result, expected) + assert_frame_equal(agg_result, expected) + + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=.8) + assert_frame_equal(agg_result, expected) + assert_frame_equal(apply_result, expected) + + # def test_cython_na_bug(self): + # values = np.random.randn(10) + # shape = (5, 5) + # label_list = [np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype=np.int32), + # np.array([1, 2, 3, 4, 0, 1, 2, 3, 3, 4], dtype=np.int32)] + + # lib.group_aggregate(values, label_list, shape) + + def test_size(self): + grouped = self.df.groupby(['A', 'B']) + result = grouped.size() + for key, group in grouped: + self.assertEqual(result[key], len(group)) + + grouped = self.df.groupby('A') + 
result = grouped.size() + for key, group in grouped: + self.assertEqual(result[key], len(group)) + + grouped = self.df.groupby('B') + result = grouped.size() + for key, group in grouped: + self.assertEqual(result[key], len(group)) + + def test_count(self): + + # GH5610 + # count counts non-nulls + df = pd.DataFrame([[1, 2, 'foo'], [1, nan, 'bar'], [3, nan, nan]], + columns=['A', 'B', 'C']) + + count_as = df.groupby('A').count() + count_not_as = df.groupby('A', as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], index=[1,3]) + expected.index.name='A' + assert_frame_equal(count_not_as, expected.reset_index()) + assert_frame_equal(count_as, expected) + + count_B = df.groupby('A')['B'].count() + assert_series_equal(count_B, expected['B']) + + def test_count_object(self): + df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, + 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([3, 3], index=[2, 3], name='a') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, + 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([1, 3], index=[2, 3], name='a') + tm.assert_series_equal(result, expected) + + def test_non_cython_api(self): + + # GH5610 + # non-cython calls should not include the grouper + + df = DataFrame([[1, 2, 'foo'], [1, nan, 'bar',], [3, nan, 'baz']], columns=['A', 'B','C']) + g = df.groupby('A') + gni = df.groupby('A',as_index=False) + + # mad + expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3]) + expected.index.name = 'A' + result = g.mad() + assert_frame_equal(result,expected) + + expected = DataFrame([[0.,0.],[0,nan]],columns=['A','B'],index=[0,1]) + result = gni.mad() + assert_frame_equal(result,expected) + + # describe + expected = DataFrame(dict(B = concat([df.loc[[0,1],'B'].describe(),df.loc[[2],'B'].describe()],keys=[1,3]))) + expected.index.names = ['A',None] + result = g.describe() + assert_frame_equal(result,expected) + + expected = concat([df.loc[[0,1],['A','B']].describe(),df.loc[[2],['A','B']].describe()],keys=[0,1]) + result = gni.describe() + assert_frame_equal(result,expected) + + # any + expected = DataFrame([[True, True],[False, True]],columns=['B','C'],index=[1,3]) + expected.index.name = 'A' + result = g.any() + assert_frame_equal(result,expected) + + # idxmax + expected = DataFrame([[0],[nan]],columns=['B'],index=[1,3]) + expected.index.name = 'A' + result = g.idxmax() + assert_frame_equal(result,expected) + + # cumsum (GH5614) + df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=['A', 'B', 'C']) + expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) + result = df.groupby('A').cumsum() + assert_frame_equal(result,expected) + + expected = DataFrame([[1, 2, np.nan], [2, np.nan, 9], [3, 4, 9]], columns=['A', 'B', 'C']).astype('float64') + result = df.groupby('A', as_index=False).cumsum() + assert_frame_equal(result,expected) + + def test_grouping_ndarray(self): + grouped = self.df.groupby(self.df['A'].values) + + result = grouped.sum() + expected = self.df.groupby('A').sum() + assert_frame_equal(result, expected, check_names=False) # Note: no names when grouping by value + + def test_agg_consistency(self): + # agg with ([]) and () not consistent + # GH 6715 + + def P1(a): + try: + return np.percentile(a.dropna(), q=1) + except: + return np.nan + + import datetime as dt + df = DataFrame({'col1':[1,2,3,4], + 'col2':[10,25,26,31], + 
'date':[dt.date(2013,2,10),dt.date(2013,2,10),dt.date(2013,2,11),dt.date(2013,2,11)]}) + + g = df.groupby('date') + + expected = g.agg([P1]) + expected.columns = expected.columns.levels[0] + + result = g.agg(P1) + assert_frame_equal(result, expected) + + def test_apply_typecast_fail(self): + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile(['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) + + def test_apply_multiindex_fail(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile(['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}, index=index) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) + + def test_apply_corner(self): + result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + expected = self.tsframe * 2 + assert_frame_equal(result, expected) + + def test_apply_without_copy(self): + # GH 5545 + # returning a non-copy in an applied function fails + + data = DataFrame({'id_field' : [100, 100, 200, 300], 'category' : ['a','b','c','c'], 'value' : [1,2,3,4]}) + + def filt1(x): + if x.shape[0] == 1: + return x.copy() + else: + return x[x.category == 'c'] + + def filt2(x): + if x.shape[0] == 1: + return x + else: + return x[x.category == 'c'] + + expected = data.groupby('id_field').apply(filt1) + result = data.groupby('id_field').apply(filt2) + assert_frame_equal(result,expected) + + def test_apply_use_categorical_name(self): + from pandas import qcut + cats = qcut(self.df.C, 4) + + def get_stats(group): + return {'min': group.min(), 'max': group.max(), + 'count': group.count(), 'mean': group.mean()} + + result = self.df.groupby(cats).D.apply(get_stats) + self.assertEqual(result.index.names[0], 'C') + + def test_apply_corner_cases(self): + # #535, can't use sliding iterator + + N = 1000 + labels = np.random.randint(0, 100, size=N) + df = DataFrame({'key': labels, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + + grouped = df.groupby('key') + + def f(g): + g['value3'] = g['value1'] * 2 + return g + + result = grouped.apply(f) + self.assertTrue('value3' in result) + + def test_transform_mixed_type(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile(['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}, index=index) + + def f(group): + group['g'] = group['d'] * 2 + return group[:1] + + grouped = df.groupby('c') + result = grouped.apply(f) + + self.assertEqual(result['d'].dtype, np.float64) + + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.ix[key]) + + def test_groupby_wrong_multi_labels(self): + from pandas import read_csv + data = """index,foo,bar,baz,spam,data +0,foo1,bar1,baz1,spam2,20 +1,foo1,bar2,baz1,spam3,30 +2,foo2,bar2,baz1,spam2,40 +3,foo1,bar1,baz2,spam1,50 +4,foo3,bar1,baz2,spam1,60""" + data = read_csv(StringIO(data), index_col=0) + + grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + 
assert_frame_equal(result, expected) + + def test_groupby_series_with_name(self): + result = self.df.groupby(self.df['A']).mean() + result2 = self.df.groupby(self.df['A'], as_index=False).mean() + self.assertEqual(result.index.name, 'A') + self.assertIn('A', result2) + + result = self.df.groupby([self.df['A'], self.df['B']]).mean() + result2 = self.df.groupby([self.df['A'], self.df['B']], + as_index=False).mean() + self.assertEqual(result.index.names, ('A', 'B')) + self.assertIn('A', result2) + self.assertIn('B', result2) + + def test_seriesgroupby_name_attr(self): + # GH 6265 + result = self.df.groupby('A')['C'] + self.assertEqual(result.count().name, 'C') + self.assertEqual(result.mean().name, 'C') + + testFunc = lambda x: np.sum(x)*2 + self.assertEqual(result.agg(testFunc).name, 'C') + + def test_groupby_name_propagation(self): + # GH 6124 + def summarize(df, name=None): + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=name) + + def summarize_random_name(df): + # Provide a different name for each Series. In this case, groupby + # should not attempt to propagate the Series name since they are + # inconsistent. + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=df.iloc[0]['A']) + + metrics = self.df.groupby('A').apply(summarize) + self.assertEqual(metrics.columns.name, None) + metrics = self.df.groupby('A').apply(summarize, 'metrics') + self.assertEqual(metrics.columns.name, 'metrics') + metrics = self.df.groupby('A').apply(summarize_random_name) + self.assertEqual(metrics.columns.name, None) + + def test_groupby_nonstring_columns(self): + df = DataFrame([np.arange(10) for x in range(10)]) + grouped = df.groupby(0) + result = grouped.mean() + expected = df.groupby(df[0]).mean() + del expected[0] + assert_frame_equal(result, expected) + + def test_cython_grouper_series_bug_noncontig(self): + arr = np.empty((100, 100)) + arr.fill(np.nan) + obj = Series(arr[:, 0], index=lrange(100)) + inds = np.tile(lrange(10), 10) + + result = obj.groupby(inds).agg(Series.median) + self.assertTrue(result.isnull().all()) + + def test_series_grouper_noncontig_index(self): + index = Index([tm.rands(10) for _ in range(100)]) + + values = Series(np.random.randn(50), index=index[::2]) + labels = np.random.randint(0, 5, 50) + + # it works! 
+ grouped = values.groupby(labels) + + # accessing the index elements causes segfault + f = lambda x: len(set(map(id, x.index))) + grouped.agg(f) + + def test_convert_objects_leave_decimal_alone(self): + + from decimal import Decimal + + s = Series(lrange(5)) + labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') + + def convert_fast(x): + return Decimal(str(x.mean())) + + def convert_force_pure(x): + # base will be length 0 + assert(len(x.base) > 0) + return Decimal(str(x.mean())) + + grouped = s.groupby(labels) + + result = grouped.agg(convert_fast) + self.assertEqual(result.dtype, np.object_) + tm.assert_isinstance(result[0], Decimal) + + result = grouped.agg(convert_force_pure) + self.assertEqual(result.dtype, np.object_) + tm.assert_isinstance(result[0], Decimal) + + def test_apply_with_mixed_dtype(self): + # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 + df = DataFrame({'foo1' : ['one', 'two', 'two', 'three', 'one', 'two'], + 'foo2' : np.random.randn(6)}) + result = df.apply(lambda x: x, axis=1) + assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts()) + + + # GH 3610 incorrect dtype conversion with as_index=False + df = DataFrame({"c1" : [1,2,6,6,8]}) + df["c2"] = df.c1/2.0 + result1 = df.groupby("c2").mean().reset_index().c2 + result2 = df.groupby("c2", as_index=False).mean().c2 + assert_series_equal(result1,result2) + + def test_groupby_aggregation_mixed_dtype(self): + + # GH 6212 + expected = DataFrame({ + 'v1': [5,5,7,np.nan,3,3,4,1], + 'v2': [55,55,77,np.nan,33,33,44,11]}, + index=MultiIndex.from_tuples([(1,95),(1,99),(2,95),(2,99),('big','damp'), + ('blue','dry'),('red','red'),('red','wet')], + names=['by1','by2'])) + + df = DataFrame({ + 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9], + 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99], + 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, + np.nan] + }) + + g = df.groupby(['by1','by2']) + result = g[['v1','v2']].mean() + assert_frame_equal(result,expected) + + def test_groupby_dtype_inference_empty(self): + # GH 6733 + df = DataFrame({'x': [], 'range': np.arange(0,dtype='int64')}) + result = df.groupby('x').first() + expected = DataFrame({'range' : Series([],index=Index([],name='x'),dtype='int64') }) + assert_frame_equal(result,expected,by_blocks=True) + + def test_groupby_list_infer_array_like(self): + result = self.df.groupby(list(self.df['A'])).mean() + expected = self.df.groupby(self.df['A']).mean() + assert_frame_equal(result, expected, check_names=False) + + self.assertRaises(Exception, self.df.groupby, list(self.df['A'][:-1])) + + # pathological case of ambiguity + df = DataFrame({'foo': [0, 1], 'bar': [3, 4], + 'val': np.random.randn(2)}) + + result = df.groupby(['foo', 'bar']).mean() + expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + + def test_dictify(self): + dict(iter(self.df.groupby('A'))) + dict(iter(self.df.groupby(['A', 'B']))) + dict(iter(self.df['C'].groupby(self.df['A']))) + dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) + dict(iter(self.df.groupby('A')['C'])) + dict(iter(self.df.groupby(['A', 'B'])['C'])) + + def test_sparse_friendly(self): + sdf = self.df[['C', 'D']].to_sparse() + panel = tm.makePanel() + tm.add_nans(panel) + + def _check_work(gp): + gp.mean() + gp.agg(np.mean) + dict(iter(gp)) + + # it works! 
+ _check_work(sdf.groupby(lambda x: x // 2)) + _check_work(sdf['C'].groupby(lambda x: x // 2)) + _check_work(sdf.groupby(self.df['A'])) + + # do this someday + # _check_work(panel.groupby(lambda x: x.month, axis=1)) + + def test_panel_groupby(self): + self.panel = tm.makePanel() + tm.add_nans(self.panel) + grouped = self.panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1}, + axis='items') + agged = grouped.mean() + agged2 = grouped.agg(lambda x: x.mean('items')) + + tm.assert_panel_equal(agged, agged2) + + self.assert_numpy_array_equal(agged.items, [0, 1]) + + grouped = self.panel.groupby(lambda x: x.month, axis='major') + agged = grouped.mean() + + self.assert_numpy_array_equal(agged.major_axis, sorted(list(set(self.panel.major_axis.month)))) + + grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, + axis='minor') + agged = grouped.mean() + self.assert_numpy_array_equal(agged.minor_axis, [0, 1]) + + def test_numpy_groupby(self): + from pandas.core.groupby import numpy_groupby + + data = np.random.randn(100, 100) + labels = np.random.randint(0, 10, size=100) + + df = DataFrame(data) + + result = df.groupby(labels).sum().values + expected = numpy_groupby(data, labels) + assert_almost_equal(result, expected) + + result = df.groupby(labels, axis=1).sum().values + expected = numpy_groupby(data, labels, axis=1) + assert_almost_equal(result, expected) + + def test_groupby_2d_malformed(self): + d = DataFrame(index=lrange(2)) + d['group'] = ['g1', 'g2'] + d['zeros'] = [0, 0] + d['ones'] = [1, 1] + d['label'] = ['l1', 'l2'] + tmp = d.groupby(['group']).mean() + res_values = np.array([[0., 1.], [0., 1.]]) + self.assert_numpy_array_equal(tmp.columns, ['zeros', 'ones']) + self.assert_numpy_array_equal(tmp.values, res_values) + + def test_int32_overflow(self): + B = np.concatenate((np.arange(10000), np.arange(10000), + np.arange(5000))) + A = np.arange(25000) + df = DataFrame({'A': A, 'B': B, + 'C': A, 'D': B, + 'E': np.random.randn(25000)}) + + left = df.groupby(['A', 'B', 'C', 'D']).sum() + right = df.groupby(['D', 'C', 'B', 'A']).sum() + self.assertEqual(len(left), len(right)) + + def test_int64_overflow(self): + B = np.concatenate((np.arange(1000), np.arange(1000), + np.arange(500))) + A = np.arange(2500) + df = DataFrame({'A': A, 'B': B, + 'C': A, 'D': B, + 'E': A, 'F': B, + 'G': A, 'H': B, + 'values': np.random.randn(2500)}) + + lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) + rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) + + left = lg.sum()['values'] + right = rg.sum()['values'] + + exp_index, _ = left.index.sortlevel(0) + self.assertTrue(left.index.equals(exp_index)) + + exp_index, _ = right.index.sortlevel(0) + self.assertTrue(right.index.equals(exp_index)) + + tups = list(map(tuple, df[['A', 'B', 'C', 'D', + 'E', 'F', 'G', 'H']].values)) + tups = com._asarray_tuplesafe(tups) + expected = df.groupby(tups).sum()['values'] + + for k, v in compat.iteritems(expected): + self.assertEqual(left[k], right[k[::-1]]) + self.assertEqual(left[k], v) + self.assertEqual(len(left), len(right)) + + def test_groupby_sort_multi(self): + df = DataFrame({'a': ['foo', 'bar', 'baz'], + 'b': [3, 2, 1], + 'c': [0, 1, 2], + 'd': np.random.randn(3)}) + + tups = lmap(tuple, df[['a', 'b', 'c']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['a', 'b', 'c'], sort=True).sum() + self.assert_numpy_array_equal(result.index.values, + tups[[1, 2, 0]]) + + tups = lmap(tuple, df[['c', 'a', 'b']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['c', 'a', 
'b'], sort=True).sum() + self.assert_numpy_array_equal(result.index.values, tups) + + tups = lmap(tuple, df[['b', 'c', 'a']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['b', 'c', 'a'], sort=True).sum() + self.assert_numpy_array_equal(result.index.values, + tups[[2, 1, 0]]) + + df = DataFrame({'a': [0, 1, 2, 0, 1, 2], + 'b': [0, 0, 0, 1, 1, 1], + 'd': np.random.randn(6)}) + grouped = df.groupby(['a', 'b'])['d'] + result = grouped.sum() + _check_groupby(df, result, ['a', 'b'], 'd') + + def test_intercept_builtin_sum(self): + s = Series([1., 2., np.nan, 3.]) + grouped = s.groupby([0, 1, 2, 2]) + + result = grouped.agg(builtins.sum) + result2 = grouped.apply(builtins.sum) + expected = grouped.sum() + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + def test_column_select_via_attr(self): + result = self.df.groupby('A').C.sum() + expected = self.df.groupby('A')['C'].sum() + assert_series_equal(result, expected) + + self.df['mean'] = 1.5 + result = self.df.groupby('A').mean() + expected = self.df.groupby('A').agg(np.mean) + assert_frame_equal(result, expected) + + def test_rank_apply(self): + lev1 = np.array([rands(10) for _ in range(100)], dtype=object) + lev2 = np.array([rands(10) for _ in range(130)], dtype=object) + lab1 = np.random.randint(0, 100, size=500) + lab2 = np.random.randint(0, 130, size=500) + + df = DataFrame({'value': np.random.randn(500), + 'key1': lev1.take(lab1), + 'key2': lev2.take(lab2)}) + + result = df.groupby(['key1', 'key2']).value.rank() + + expected = [] + for key, piece in df.groupby(['key1', 'key2']): + expected.append(piece.value.rank()) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + assert_series_equal(result, expected) + + result = df.groupby(['key1', 'key2']).value.rank(pct=True) + + expected = [] + for key, piece in df.groupby(['key1', 'key2']): + expected.append(piece.value.rank(pct=True)) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + assert_series_equal(result, expected) + + def test_dont_clobber_name_column(self): + df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], + 'name': ['foo', 'bar', 'baz'] * 2}) + + result = df.groupby('key').apply(lambda x: x) + assert_frame_equal(result, df) + + def test_skip_group_keys(self): + from pandas import concat + + tsf = tm.makeTimeDataFrame() + + grouped = tsf.groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_index(by='A')[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.sort_index(by='A')[:3]) + + expected = concat(pieces) + assert_frame_equal(result, expected) + + grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.order()[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.order()[:3]) + + expected = concat(pieces) + assert_series_equal(result, expected) + + def test_no_nonsense_name(self): + # GH #995 + s = self.frame['C'].copy() + s.name = None + + result = s.groupby(self.frame['A']).agg(np.sum) + self.assertIsNone(result.name) + + def test_wrap_agg_out(self): + grouped = self.three_group.groupby(['A', 'B']) + + def func(ser): + if ser.dtype == np.object: + raise TypeError + else: + return ser.sum() + result = grouped.aggregate(func) + exp_grouped = self.three_group.ix[:, self.three_group.columns != 'C'] + expected = exp_grouped.groupby(['A', 'B']).aggregate(func) + assert_frame_equal(result, expected) + + def test_multifunc_sum_bug(self): + # GH #1065 + 
x = DataFrame(np.arange(9).reshape(3, 3)) + x['test'] = 0 + x['fl'] = [1.3, 1.5, 1.6] + + grouped = x.groupby('test') + result = grouped.agg({'fl': 'sum', 2: 'size'}) + self.assertEqual(result['fl'].dtype, np.float64) + + def test_handle_dict_return_value(self): + def f(group): + return {'min': group.min(), 'max': group.max()} + + def g(group): + return Series({'min': group.min(), 'max': group.max()}) + + result = self.df.groupby('A')['C'].apply(f) + expected = self.df.groupby('A')['C'].apply(g) + + tm.assert_isinstance(result, Series) + assert_series_equal(result, expected) + + def test_getitem_list_of_columns(self): + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': np.random.randn(8)}) + + result = df.groupby('A')[['C', 'D']].mean() + result2 = df.groupby('A')['C', 'D'].mean() + result3 = df.groupby('A')[df.columns[2:4]].mean() + + expected = df.ix[:, ['A', 'C', 'D']].groupby('A').mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + + def test_agg_multiple_functions_maintain_order(self): + # GH #610 + funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] + result = self.df.groupby('A')['C'].agg(funcs) + exp_cols = ['mean', 'max', 'min'] + + self.assert_numpy_array_equal(result.columns, exp_cols) + + def test_multiple_functions_tuples_and_non_tuples(self): + # #1359 + + funcs = [('foo', 'mean'), 'std'] + ex_funcs = [('foo', 'mean'), ('std', 'std')] + + result = self.df.groupby('A')['C'].agg(funcs) + expected = self.df.groupby('A')['C'].agg(ex_funcs) + assert_frame_equal(result, expected) + + result = self.df.groupby('A').agg(funcs) + expected = self.df.groupby('A').agg(ex_funcs) + assert_frame_equal(result, expected) + + def test_agg_multiple_functions_too_many_lambdas(self): + grouped = self.df.groupby('A') + funcs = ['mean', lambda x: x.mean(), lambda x: x.std()] + + self.assertRaises(SpecificationError, grouped.agg, funcs) + + def test_more_flexible_frame_multi_function(self): + from pandas import concat + + grouped = self.df.groupby('A') + + exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) + exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) + + expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) + expected = expected.swaplevel(0, 1, axis=1).sortlevel(0, axis=1) + + d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) + result = grouped.aggregate(d) + + assert_frame_equal(result, expected) + + # be careful + result = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + expected = grouped.aggregate(OrderedDict([['C', np.mean], + ['D', [np.mean, np.std]]])) + assert_frame_equal(result, expected) + + def foo(x): + return np.mean(x) + + def bar(x): + return np.std(x, ddof=1) + d = OrderedDict([['C', np.mean], + ['D', OrderedDict([['foo', np.mean], + ['bar', np.std]])]]) + result = grouped.aggregate(d) + + d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) + expected = grouped.aggregate(d) + + assert_frame_equal(result, expected) + + def test_multi_function_flexible_mix(self): + # GH #1268 + grouped = self.df.groupby('A') + + d = OrderedDict([['C', OrderedDict([['foo', 'mean'], + [ + 'bar', 'std']])], + ['D', 'sum']]) + result = grouped.aggregate(d) + d2 = OrderedDict([['C', OrderedDict([['foo', 'mean'], + [ + 'bar', 'std']])], + ['D', ['sum']]]) + 
result2 = grouped.aggregate(d2) + + d3 = OrderedDict([['C', OrderedDict([['foo', 'mean'], + [ + 'bar', 'std']])], + ['D', {'sum': 'sum'}]]) + expected = grouped.aggregate(d3) + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_set_group_name(self): + def f(group): + assert group.name is not None + return group + + def freduce(group): + assert group.name is not None + return group.sum() + + def foo(x): + return freduce(x) + + def _check_all(grouped): + # make sure all these work + grouped.apply(f) + grouped.aggregate(freduce) + grouped.aggregate({'C': freduce, 'D': freduce}) + grouped.transform(f) + + grouped['C'].apply(f) + grouped['C'].aggregate(freduce) + grouped['C'].aggregate([freduce, foo]) + grouped['C'].transform(f) + + _check_all(self.df.groupby('A')) + _check_all(self.df.groupby(['A', 'B'])) + + def test_no_dummy_key_names(self): + # GH #1291 + + result = self.df.groupby(self.df['A'].values).sum() + self.assertIsNone(result.index.name) + + result = self.df.groupby([self.df['A'].values, + self.df['B'].values]).sum() + self.assertEqual(result.index.names, (None, None)) + + def test_groupby_categorical(self): + levels = ['foo', 'bar', 'baz', 'qux'] + labels = np.random.randint(0, 4, size=100) + + cats = Categorical(labels, levels, name='myfactor') + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + expected = expected.reindex(levels) + expected.index.name = 'myfactor' + + assert_frame_equal(result, expected) + self.assertEqual(result.index.name, cats.name) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.labels.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels, sort=False).describe() + expected.index.names = ['myfactor', None] + assert_frame_equal(desc_result, expected) + + def test_groupby_groups_datetimeindex(self): + # #1430 + from pandas.tseries.api import DatetimeIndex + periods = 1000 + ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) + df = DataFrame({'high': np.arange(periods), + 'low': np.arange(periods)}, index=ind) + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + + # it works! 
+ groups = grouped.groups + tm.assert_isinstance(list(groups.keys())[0], datetime) + + def test_groupby_groups_datetimeindex_tz(self): + # GH 3950 + dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', + '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] + df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], + 'datetime': dates, + 'value1': np.arange(6,dtype='int64'), + 'value2': [1, 2] * 3}) + df['datetime'] = df['datetime'].apply(lambda d: Timestamp(d, tz='US/Pacific')) + + exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 09:00:00'], + tz='US/Pacific', name='datetime') + exp_idx2 = Index(['a', 'b'] * 3, name='label') + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], 'value2': [1, 2, 2, 1, 1, 2]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(['datetime', 'label']).sum() + assert_frame_equal(result, expected) + + # by level + didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo') + df = DataFrame({'value1': np.arange(6,dtype='int64'), + 'value2': [1, 2, 3, 1, 2, 3]}, + index=didx) + + exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], tz='Asia/Tokyo') + expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(level=0).sum() + assert_frame_equal(result, expected) + + def test_groupby_reindex_inside_function(self): + from pandas.tseries.api import DatetimeIndex + + periods = 1000 + ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) + df = DataFrame({'high': np.arange( + periods), 'low': np.arange(periods)}, index=ind) + + def agg_before(hour, func, fix=False): + """ + Run an aggregate func on the subset of data. 
+ """ + def _func(data): + d = data.select(lambda x: x.hour < 11).dropna() + if fix: + data[data.index[0]] + if len(d) == 0: + return None + return func(d) + return _func + + def afunc(data): + d = data.select(lambda x: x.hour < 11).dropna() + return np.max(d) + + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + closure_bad = grouped.agg({'high': agg_before(11, np.max)}) + closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) + + assert_frame_equal(closure_bad, closure_good) + + def test_multiindex_columns_empty_level(self): + l = [['count', 'values'], ['to filter', '']] + midx = MultiIndex.from_tuples(l) + + df = DataFrame([[long(1), 'A']], columns=midx) + + grouped = df.groupby('to filter').groups + self.assert_numpy_array_equal(grouped['A'], [0]) + + grouped = df.groupby([('to filter', '')]).groups + self.assert_numpy_array_equal(grouped['A'], [0]) + + df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx) + + expected = df.groupby('to filter').groups + result = df.groupby([('to filter', '')]).groups + self.assertEqual(result, expected) + + df = DataFrame([[long(1), 'A'], [long(2), 'A']], columns=midx) + + expected = df.groupby('to filter').groups + result = df.groupby([('to filter', '')]).groups + self.assertEqual(result, expected) + + def test_cython_median(self): + df = DataFrame(np.random.randn(1000)) + df.values[::2] = np.nan + + labels = np.random.randint(0, 50, size=1000).astype(float) + labels[::17] = np.nan + + result = df.groupby(labels).median() + exp = df.groupby(labels).agg(nanops.nanmedian) + assert_frame_equal(result, exp) + + df = DataFrame(np.random.randn(1000, 5)) + rs = df.groupby(labels).agg(np.median) + xp = df.groupby(labels).median() + assert_frame_equal(rs, xp) + + def test_groupby_categorical_no_compress(self): + data = Series(np.random.randn(9)) + + labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + cats = Categorical(labels, [0, 1, 2]) + + result = data.groupby(cats).mean() + exp = data.groupby(labels).mean() + assert_series_equal(result, exp) + + labels = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) + cats = Categorical(labels, [0, 1, 2, 3]) + + result = data.groupby(cats).mean() + exp = data.groupby(labels).mean().reindex(cats.levels) + assert_series_equal(result, exp) + + def test_groupby_first_datetime64(self): + df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) + df[1] = df[1].view('M8[ns]') + + self.assertTrue(issubclass(df[1].dtype.type, np.datetime64)) + + result = df.groupby(level=0).first() + got_dt = result[1].dtype + self.assertTrue(issubclass(got_dt.type, np.datetime64)) + + result = df[1].groupby(level=0).first() + got_dt = result.dtype + self.assertTrue(issubclass(got_dt.type, np.datetime64)) + + def test_groupby_max_datetime64(self): + # GH 5869 + # datetimelike dtype conversion from int + df = DataFrame(dict(A = Timestamp('20130101'), B = np.arange(5))) + expected = df.groupby('A')['A'].apply(lambda x: x.max()) + result = df.groupby('A')['A'].max() + assert_series_equal(result,expected) + + def test_groupby_datetime64_32_bit(self): + # GH 6410 / numpy 4328 + # 32-bit under 1.9-dev indexing issue + + df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')]*2}) + result = df.groupby("A")["B"].transform(min) + expected = Series([pd.Timestamp('2000-01-1')]*2) + assert_series_equal(result,expected) + + def test_groupby_categorical_unequal_len(self): + import pandas as pd + #GH3011 + series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) + bins = pd.cut(series.dropna(), 4) + + # len(bins) != 
len(series) here + self.assertRaises(AssertionError,lambda : series.groupby(bins).mean()) + + def test_gb_apply_list_of_unequal_len_arrays(self): + + # GH1738 + df = DataFrame({'group1': ['a','a','a','b','b','b','a','a','a','b','b','b'], + 'group2': ['c','c','d','d','d','e','c','c','d','d','d','e'], + 'weight': [1.1,2,3,4,5,6,2,4,6,8,1,2], + 'value': [7.1,8,9,10,11,12,8,7,6,5,4,3] + }) + df = df.set_index(['group1', 'group2']) + df_grouped = df.groupby(level=['group1','group2'], sort=True) + + def noddy(value, weight): + out = np.array( value * weight ).repeat(3) + return out + + # the kernel function returns arrays of unequal length + # pandas sniffs the first one, sees it's an array and not + # a list, and assumed the rest are of equal length + # and so tries a vstack + + # don't die + no_toes = df_grouped.apply(lambda x: noddy(x.value, x.weight )) + + def test_groupby_with_empty(self): + import pandas as pd + index = pd.DatetimeIndex(()) + data = () + series = pd.Series(data, index) + grouper = pd.tseries.resample.TimeGrouper('D') + grouped = series.groupby(grouper) + assert next(iter(grouped), None) is None + + def test_groupby_with_timegrouper(self): + # GH 4161 + # TimeGrouper requires a sorted index + # also verifies that the resultant index has the correct name + import datetime as DT + df_original = DataFrame({ + 'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(), + 'Quantity': [18,3,5,1,9,3], + 'Date' : [ + DT.datetime(2013,9,1,13,0), + DT.datetime(2013,9,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,3,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,9,2,14,0), + ]}) + + # GH 6908 change target column's order + df_reordered = df_original.sort(columns='Quantity') + + for df in [df_original, df_reordered]: + df = df.set_index(['Date']) + + expected = DataFrame({ 'Quantity' : np.nan }, + index=date_range('20130901 13:00:00','20131205 13:00:00', + freq='5D',name='Date',closed='left')) + expected.iloc[[0,6,18],0] = np.array([24.,6.,9.],dtype='float64') + + result1 = df.resample('5D',how=sum) + assert_frame_equal(result1, expected) + + df_sorted = df.sort_index() + result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum() + assert_frame_equal(result2, expected) + + result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum() + assert_frame_equal(result3, expected) + + def test_groupby_with_timegrouper_methods(self): + # GH 3881 + # make sure API of timegrouper conforms + + import datetime as DT + df_original = pd.DataFrame({ + 'Branch' : 'A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(), + 'Quantity': [1,3,5,8,9,3], + 'Date' : [ + DT.datetime(2013,1,1,13,0), + DT.datetime(2013,1,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,12,2,14,0), + ]}) + + df_sorted = df_original.sort(columns='Quantity', ascending=False) + + for df in [df_original, df_sorted]: + df = df.set_index('Date', drop=False) + g = df.groupby(pd.TimeGrouper('6M')) + self.assertTrue(g.group_keys) + self.assertTrue(isinstance(g.grouper,pd.core.groupby.BinGrouper)) + groups = g.groups + self.assertTrue(isinstance(groups,dict)) + self.assertTrue(len(groups) == 3) + + def test_timegrouper_with_reg_groups(self): + + # GH 3794 + # allow combination of timegrouper/reg groups + + import datetime as DT + + df_original = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [ + DT.datetime(2013,1,1,13,0), +
DT.datetime(2013,1,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,12,2,12,0), + DT.datetime(2013,12,2,14,0), + ]}).set_index('Date') + + df_sorted = df_original.sort(columns='Quantity', ascending=False) + + for df in [df_original, df_sorted]: + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,12,31,0,0), + DT.datetime(2013,12,31,0,0), + DT.datetime(2013,12,31,0,0), + ]}).set_index(['Date','Buyer']) + + result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum() + assert_frame_equal(result,expected) + + expected = DataFrame({ + 'Buyer': 'Carl Mark Carl Joe'.split(), + 'Quantity': [1,3,9,18], + 'Date' : [ + DT.datetime(2013,1,1,0,0), + DT.datetime(2013,1,1,0,0), + DT.datetime(2013,7,1,0,0), + DT.datetime(2013,7,1,0,0), + ]}).set_index(['Date','Buyer']) + result = df.groupby([pd.Grouper(freq='6MS'),'Buyer']).sum() + assert_frame_equal(result,expected) + + df_original = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [ + DT.datetime(2013,10,1,13,0), + DT.datetime(2013,10,1,13,5), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,1,20,0), + DT.datetime(2013,10,2,10,0), + DT.datetime(2013,10,2,12,0), + DT.datetime(2013,10,2,14,0), + ]}).set_index('Date') + + df_sorted = df_original.sort(columns='Quantity', ascending=False) + for df in [df_original, df_sorted]: + + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark Carl Joe'.split(), + 'Quantity': [6,8,3,4,10], + 'Date' : [ + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,1,0,0), + DT.datetime(2013,10,2,0,0), + DT.datetime(2013,10,2,0,0), + ]}).set_index(['Date','Buyer']) + + result = df.groupby([pd.Grouper(freq='1D'),'Buyer']).sum() + assert_frame_equal(result,expected) + + result = df.groupby([pd.Grouper(freq='1M'),'Buyer']).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,10,31,0,0), + DT.datetime(2013,10,31,0,0), + DT.datetime(2013,10,31,0,0), + ]}).set_index(['Date','Buyer']) + assert_frame_equal(result,expected) + + # passing the name + df = df.reset_index() + result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + assert_frame_equal(result,expected) + + self.assertRaises(KeyError, lambda : df.groupby([pd.Grouper(freq='1M',key='foo'),'Buyer']).sum()) + + # passing the level + df = df.set_index('Date') + result = df.groupby([pd.Grouper(freq='1M',level='Date'),'Buyer']).sum() + assert_frame_equal(result,expected) + result = df.groupby([pd.Grouper(freq='1M',level=0),'Buyer']).sum() + assert_frame_equal(result,expected) + + self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',level='foo'),'Buyer']).sum()) + + # multi names + df = df.copy() + df['Date'] = df.index + pd.offsets.MonthEnd(2) + result = df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum() + expected = DataFrame({ + 'Buyer': 'Carl Joe Mark'.split(), + 'Quantity': [10,18,3], + 'Date' : [ + DT.datetime(2013,11,30,0,0), + DT.datetime(2013,11,30,0,0), + DT.datetime(2013,11,30,0,0), + ]}).set_index(['Date','Buyer']) + assert_frame_equal(result,expected) + + # error as we have both a level and a name! 
+ self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum()) + + + # single groupers + expected = DataFrame({ 'Quantity' : [31], + 'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M')).sum() + assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq='1M')]).sum() + assert_frame_equal(result, expected) + + expected = DataFrame({ 'Quantity' : [31], + 'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date') + result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum() + assert_frame_equal(result, expected) + + result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum() + assert_frame_equal(result, expected) + + # GH 6764 multiple grouping with/without sort + df = DataFrame({ + 'date' : pd.to_datetime([ + '20121002','20121007','20130130','20130202','20130305','20121002', + '20121207','20130130','20130202','20130305','20130202','20130305']), + 'user_id' : [1,1,1,1,1,3,3,3,5,5,5,5], + 'whole_cost' : [1790,364,280,259,201,623,90,312,359,301,359,801], + 'cost1' : [12,15,10,24,39,1,0,90,45,34,1,12] }).set_index('date') + + for freq in ['D', 'M', 'A', 'Q-APR']: + expected = df.groupby('user_id')['whole_cost'].resample( + freq, how='sum').dropna().reorder_levels( + ['date','user_id']).sortlevel().astype('int64') + expected.name = 'whole_cost' + + result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum() + assert_series_equal(result1, expected) + + result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum() + assert_series_equal(result2, expected) + + def test_timegrouper_get_group(self): + # GH 6914 + + df_original = DataFrame({ + 'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(), + 'Quantity': [18,3,5,1,9,3], + 'Date' : [datetime(2013,9,1,13,0), datetime(2013,9,1,13,5), + datetime(2013,10,1,20,0), datetime(2013,10,3,10,0), + datetime(2013,12,2,12,0), datetime(2013,9,2,14,0),]}) + df_reordered = df_original.sort(columns='Quantity') + + # single grouping + expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], + df_original.iloc[[4]]] + dt_list = ['2013-09-30', '2013-10-31', '2013-12-31'] + + for df in [df_original, df_reordered]: + grouped = df.groupby(pd.Grouper(freq='M', key='Date')) + for t, expected in zip(dt_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group(dt) + assert_frame_equal(result, expected) + + # multiple grouping + expected_list = [df_original.iloc[[1]], df_original.iloc[[3]], + df_original.iloc[[4]]] + g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'), ('Joe', '2013-12-31')] + + for df in [df_original, df_reordered]: + grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')]) + for (b, t), expected in zip(g_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group((b, dt)) + assert_frame_equal(result, expected) + + # with index + df_original = df_original.set_index('Date') + df_reordered = df_original.sort(columns='Quantity') + + expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], + df_original.iloc[[4]]] + + for df in [df_original, df_reordered]: + grouped = df.groupby(pd.Grouper(freq='M')) + for t, expected in zip(dt_list, expected_list): + dt = pd.Timestamp(t) + result = grouped.get_group(dt) + assert_frame_equal(result, expected) + + def test_cumcount(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3]) + + 
assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + e = Series(dtype='int64') # edge case, as this is usually considered float + + assert_series_equal(e, ge.cumcount()) + assert_series_equal(e, se.cumcount()) + + def test_cumcount_dupe_index(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=mi) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=mi) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_groupby_not_col(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_filter_series(self): + import pandas as pd + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + assert_series_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(s.index)) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(s.index)) + + def test_filter_single_column_df(self): + import pandas as pd + df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + grouper = df[0].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False.
+ assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(df.index)) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(df.index)) + + def test_filter_multi_column_df(self): + import pandas as pd + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), expected) + + def test_filter_mixed_df(self): + import pandas as pd + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, + index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 10), expected) + + def test_filter_out_all_groups(self): + import pandas as pd + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 1000), s[[]]) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]]) + + def test_filter_out_no_groups(self): + import pandas as pd + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + filtered = grouped.filter(lambda x: x.mean() > 0) + assert_series_equal(filtered, s) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + filtered = grouped.filter(lambda x: x['A'].mean() > 0) + assert_frame_equal(filtered, df) + + def test_filter_condition_raises(self): + import pandas as pd + def raise_if_sum_is_zero(x): + if x.sum() == 0: + raise ValueError + else: + return x.sum() > 0 + s = pd.Series([-1,0,1,2]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + self.assertRaises(TypeError, + lambda: grouped.filter(raise_if_sum_is_zero)) + + def test_filter_bad_shapes(self): + df = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc'), 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby('B') + g_s = s.groupby(s) + + f = lambda x: x + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: x == 1 + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: np.outer(x, x) + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + def test_filter_nan_is_false(self): + df = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc'), 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby(df['B']) + g_s = s.groupby(s) + + f = lambda x: np.nan + assert_frame_equal(g_df.filter(f), df.loc[[]]) + assert_series_equal(g_s.filter(f), s[[]]) + + def test_filter_against_workaround(self): + np.random.seed(0) + # Series of ints + s = Series(np.random.randint(0,100,1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.order(), old_way.order()) + + # Series of floats + s = 
100*Series(np.random.random(1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.order(), old_way.order()) + + # Set up DataFrame of ints, floats, strings. + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 1000 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), + 'floats': N/10*Series(np.random.random(N)), + 'letters': Series(random_letters)}) + + # Group by ints; filter on floats. + grouped = df.groupby('ints') + old_way = df[grouped.floats.\ + transform(lambda x: x.mean() > N/20).astype('bool')] + new_way = grouped.filter(lambda x: x['floats'].mean() > N/20) + assert_frame_equal(new_way, old_way) + + # Group by floats (rounded); filter on strings. + grouper = df.floats.apply(lambda x: np.round(x, -1)) + grouped = df.groupby(grouper) + old_way = df[grouped.letters.\ + transform(lambda x: len(x) < N/10).astype('bool')] + new_way = grouped.filter( + lambda x: len(x.letters) < N/10) + assert_frame_equal(new_way, old_way) + + # Group by strings; filter on ints. + grouped = df.groupby('letters') + old_way = df[grouped.ints.\ + transform(lambda x: x.mean() > N/20).astype('bool')] + new_way = grouped.filter(lambda x: x['ints'].mean() > N/20) + assert_frame_equal(new_way, old_way) + + def test_filter_using_len(self): + # BUG GH4447 + df = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc'), 'C': np.arange(8)}) + grouped = df.groupby('B') + actual = grouped.filter(lambda x: len(x) > 2) + expected = DataFrame({'A': np.arange(2, 6), 'B': list('bbbb'), 'C': np.arange(2, 6)}, index=np.arange(2, 6)) + assert_frame_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = df.ix[[]] + assert_frame_equal(actual, expected) + + # Series have always worked properly, but we'll test anyway. + s = df['B'] + grouped = s.groupby(s) + actual = grouped.filter(lambda x: len(x) > 2) + expected = Series(4*['b'], index=np.arange(2, 6)) + assert_series_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = s[[]] + assert_series_equal(actual, expected) + + def test_filter_maintains_ordering(self): + # Simple case: index is sequential. #4621 + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Now index is sequentially decreasing. + df.index = np.arange(len(df) - 1, -1, -1) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Index is shuffled. 
+ SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] + df.index = df.index[SHUFFLED] + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_int_index(self): + # GH4620 + index = [1, 1, 1, 2, 1, 1, 0, 1] + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index) + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_multiple_non_unique_int_index(self): + # GH4620 + index = [1, 1, 1, 2, 0, 0, 0, 1] + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid') + # ^ made manually because this can get confusing! 
+ assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index) + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_float_index(self): + # GH4620 + index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index) + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_multiple_non_unique_float_index(self): + # GH4620 + index = np.array([1, 1, 1, 2, 0, 0, 0, 1], dtype=float) + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid') + # ^ made manually because this can get confusing!
+ assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index) + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_timestamp_index(self): + # GH4620 + t0 = Timestamp('2013-09-30 00:05:00') + t1 = Timestamp('2013-10-30 00:05:00') + t2 = Timestamp('2013-11-30 00:05:00') + index = [t1, t1, t1, t2, t1, t1, t0, t1] + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index) + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_string_index(self): + # GH4620 + index = list('bbbcbbab') + df = DataFrame({'pid' : [1,1,1,2,2,3,3,3], + 'tag' : [23,45,62,24,45,34,25,62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index) + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_has_access_to_grouped_cols(self): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # previously didn't have access to col A #???? 
+ filt = g.filter(lambda x: x['A'].sum() == 2) + assert_frame_equal(filt, df.iloc[[0, 1]]) + + def test_index_label_overlaps_location(self): + # checking we don't have any label/location confusion in the + # the wake of GH5375 + df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1]) + g = df.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = df.iloc[[1, 3, 4]] + assert_frame_equal(actual, expected) + + ser = df[0] + g = ser.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = ser.take([1, 3, 4]) + assert_series_equal(actual, expected) + + # ... and again, with a generic Index of floats + df.index = df.index.astype(float) + g = df.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = df.iloc[[1, 3, 4]] + assert_frame_equal(actual, expected) + + ser = df[0] + g = ser.groupby(list('ababb')) + actual = g.filter(lambda x: len(x) > 2) + expected = ser.take([1, 3, 4]) + assert_series_equal(actual, expected) + + def test_groupby_selection_with_methods(self): + # some methods which require DatetimeIndex + rng = pd.date_range('2014', periods=len(self.df)) + self.df.index = rng + + g = self.df.groupby(['A'])[['C']] + g_exp = self.df[['C']].groupby(self.df['A']) + # TODO check groupby with > 1 col ? + + # methods which are called as .foo() + methods = ['count', + 'corr', + 'cummax', 'cummin', 'cumprod', + 'describe', 'rank', + 'quantile', + 'diff', 'shift', + 'all', 'any', + 'idxmin', 'idxmax', + 'ffill', 'bfill', + 'pct_change', + 'tshift', + #'ohlc' + ] + + for m in methods: + res = getattr(g, m)() + exp = getattr(g_exp, m)() + assert_frame_equal(res, exp) # should always be frames! + + # methods which aren't just .foo() + assert_frame_equal(g.fillna(0), g_exp.fillna(0)) + assert_frame_equal(g.dtypes, g_exp.dtypes) + assert_frame_equal(g.apply(lambda x: x.sum()), + g_exp.apply(lambda x: x.sum())) + + assert_frame_equal(g.resample('D'), g_exp.resample('D')) + assert_frame_equal(g.resample('D', how='ohlc'), + g_exp.resample('D', how='ohlc')) + + assert_frame_equal(g.filter(lambda x: len(x) == 3), + g_exp.filter(lambda x: len(x) == 3)) + + def test_groupby_whitelist(self): + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 10 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + s = df.floats + + df_whitelist = frozenset([ + 'last', 'first', + 'mean', 'sum', 'min', 'max', + 'head', 'tail', + 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', + 'resample', + 'describe', + 'rank', 'quantile', 'count', + 'fillna', + 'mad', + 'any', 'all', + 'irow', 'take', + 'idxmax', 'idxmin', + 'shift', 'tshift', + 'ffill', 'bfill', + 'pct_change', 'skew', + 'plot', 'boxplot', 'hist', + 'median', 'dtypes', + 'corrwith', 'corr', 'cov', + 'diff', + ]) + s_whitelist = frozenset([ + 'last', 'first', + 'mean', 'sum', 'min', 'max', + 'head', 'tail', + 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', + 'resample', + 'describe', + 'rank', 'quantile', 'count', + 'fillna', + 'mad', + 'any', 'all', + 'irow', 'take', + 'idxmax', 'idxmin', + 'shift', 'tshift', + 'ffill', 'bfill', + 'pct_change', 'skew', + 'plot', 'hist', + 'median', 'dtype', + 'corr', 'cov', + 'value_counts', + 'diff', + 'unique', 'nunique', + 'nlargest', 'nsmallest', + ]) + + for obj, whitelist in zip((df, s), + (df_whitelist, s_whitelist)): + gb = obj.groupby(df.letters) + self.assertEqual(whitelist, gb._apply_whitelist) + for m in 
whitelist: + getattr(gb, m) + + def test_groupby_blacklist(self): + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 10 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + s = df.floats + + blacklist = [ + 'eval', 'query', 'abs', 'where', + 'mask', 'align', 'groupby', 'clip', 'astype', + 'at', 'combine', 'consolidate', 'convert_objects', + ] + to_methods = [method for method in dir(df) if method.startswith('to_')] + + blacklist.extend(to_methods) + + # e.g., to_csv + defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the " + "'apply' method$)") + + # e.g., query, eval + not_defined = "(?:^{1!r} object has no attribute {0!r}$)" + fmt = defined_but_not_allowed + '|' + not_defined + for bl in blacklist: + for obj in (df, s): + gb = obj.groupby(df.letters) + msg = fmt.format(bl, type(gb).__name__) + with tm.assertRaisesRegexp(AttributeError, msg): + getattr(gb, bl) + + def test_series_groupby_plotting_nominally_works(self): + _skip_if_mpl_not_installed() + + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = tm.choice(['male', 'female'], size=n) + + weight.groupby(gender).plot() + tm.close() + height.groupby(gender).hist() + tm.close() + + def test_plotting_with_float_index_works(self): + _skip_if_mpl_not_installed() + + # GH 7025 + df = DataFrame({'def': [1,1,1,2,2,2,3,3,3], + 'val': np.random.randn(9)}, + index=[1.0,2.0,3.0,1.0,2.0,3.0,1.0,2.0,3.0]) + + df.groupby('def')['val'].plot() + tm.close() + df.groupby('def')['val'].apply(lambda x: x.plot()) + tm.close() + + @slow + def test_frame_groupby_plot_boxplot(self): + _skip_if_mpl_not_installed() + + import matplotlib.pyplot as plt + import matplotlib as mpl + mpl.use('Agg') + tm.close() + + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = tm.choice(['male', 'female'], size=n) + df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) + gb = df.groupby('gender') + + res = gb.plot() + self.assertEqual(len(plt.get_fignums()), 2) + self.assertEqual(len(res), 2) + tm.close() + + res = gb.boxplot() + self.assertEqual(len(plt.get_fignums()), 1) + self.assertEqual(len(res), 2) + tm.close() + + # now works with GH 5610 as gender is excluded + res = df.groupby('gender').hist() + tm.close() + + @slow + def test_frame_groupby_hist(self): + _skip_if_mpl_not_installed() + import matplotlib.pyplot as plt + import matplotlib as mpl + mpl.use('Agg') + tm.close() + + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender_int = tm.choice([0, 1], size=n) + df_int = DataFrame({'height': height, 'weight': weight, + 'gender': gender_int}) + gb = df_int.groupby('gender') + axes = gb.hist() + self.assertEqual(len(axes), 2) + self.assertEqual(len(plt.get_fignums()), 2) + tm.close() + + def test_tab_completion(self): + grp = self.mframe.groupby(level='second') + results = set([v for v in dir(grp) if not v.startswith('_')]) + expected = set(['A','B','C', + 'agg','aggregate','apply','boxplot','filter','first','get_group', + 'groups','hist','indices','last','max','mean','median', + 'min','name','ngroups','nth','ohlc','plot', 'prod', + 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'head', + 
'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', + 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount', + 'all', 'shift', 'skew', 'bfill', 'irow', 'ffill', + 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'diff', 'idxmax', 'idxmin' + ]) + self.assertEqual(results, expected) + + def test_lexsort_indexer(self): + keys = [[nan]*5 + list(range(100)) + [nan]*5] + # orders=True, na_position='last' + result = _lexsort_indexer(keys, orders=True, na_position='last') + expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # orders=True, na_position='first' + result = _lexsort_indexer(keys, orders=True, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + assert_equal(result, expected) + + # orders=False, na_position='last' + result = _lexsort_indexer(keys, orders=False, na_position='last') + expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # orders=False, na_position='first' + result = _lexsort_indexer(keys, orders=False, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + assert_equal(result, expected) + + def test_nargsort(self): + # np.argsort(items) places NaNs last + items = [nan]*5 + list(range(100)) + [nan]*5 + # np.argsort(items2) may not place NaNs first + items2 = np.array(items, dtype='O') + + try: + # GH 2785; due to a regression in NumPy1.6.2 + np.argsort(np.array([[1, 2], [1, 3], [1, 2]], dtype='i')) + np.argsort(items2, kind='mergesort') + except TypeError as err: + raise nose.SkipTest('requested sort not available for type') + + # mergesort is the most difficult to get right because we want it to be stable. 
+ + # According to numpy/core/tests/test_multiarray, """The number + # of sorted items must be greater than ~50 to check the actual algorithm + # because quick and merge sort fall over to insertion sort for small + # arrays.""" + + + # mergesort, ascending=True, na_position='last' + result = _nargsort( + items, kind='mergesort', ascending=True, na_position='last') + expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=True, na_position='first' + result = _nargsort( + items, kind='mergesort', ascending=True, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='last' + result = _nargsort( + items, kind='mergesort', ascending=False, na_position='last') + expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='first' + result = _nargsort( + items, kind='mergesort', ascending=False, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + assert_equal(result, expected) + + # mergesort, ascending=True, na_position='last' + result = _nargsort( + items2, kind='mergesort', ascending=True, na_position='last') + expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=True, na_position='first' + result = _nargsort( + items2, kind='mergesort', ascending=True, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='last' + result = _nargsort( + items2, kind='mergesort', ascending=False, na_position='last') + expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + assert_equal(result, expected) + + # mergesort, ascending=False, na_position='first' + result = _nargsort( + items2, kind='mergesort', ascending=False, na_position='first') + expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + assert_equal(result, expected) + + def test_datetime_count(self): + df = DataFrame({'a': [1,2,3] * 2, + 'dates': pd.date_range('now', periods=6, freq='T')}) + result = df.groupby('a').dates.count() + expected = Series([2, 2, 2], index=Index([1, 2, 3], name='a'), + name='dates') + tm.assert_series_equal(result, expected) + + def test_lower_int_prec_count(self): + df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8), + 'b': np.array([1, 2, 3, 6], np.uint32), + 'c': np.array([4, 5, 6, 8], np.int16), + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2], + 'b': [2, 2], + 'c': [2, 2]}, index=pd.Index(list('ab'), + name='grp')) + tm.assert_frame_equal(result, expected) + + def test_count_uses_size_on_exception(self): + class RaisingObjectException(Exception): + pass + + class RaisingObject(object): + def __init__(self, msg='I will raise inside Cython'): + super(RaisingObject, self).__init__() + self.msg = msg + + def __eq__(self, other): + # gets called in Cython to check that raising calls the method + raise RaisingObjectException(self.msg) + + df = DataFrame({'a': [RaisingObject() for _ in range(4)], + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2]}, index=pd.Index(list('ab'), + name='grp')) + tm.assert_frame_equal(result, expected) + + def 
test__cython_agg_general(self): + ops = [('mean', np.mean), + ('median', np.median), + ('var', np.var), + ('add', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + ] + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = df.groupby(labels)._cython_agg_general(op) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + + def test_ops_general(self): + ops = [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), + ] + try: + from scipy.stats import sem + except ImportError: + pass + else: + ops.append(('sem', sem)) + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + + def test_max_nan_bug(self): + raw = """,Date,app,File +2013-04-23,2013-04-23 00:00:00,,log080001.log +2013-05-06,2013-05-06 00:00:00,,log.log +2013-05-07,2013-05-07 00:00:00,OE,xlsx""" + df = pd.read_csv(StringIO(raw), parse_dates=[0]) + gb = df.groupby('Date') + r = gb[['File']].max() + e = gb['File'].max().to_frame() + tm.assert_frame_equal(r, e) + self.assertFalse(r['File'].isnull().any()) + + def test_nlargest(self): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list('a' * 5 + 'b' * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series([7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list('aaabbb'), + [3, 2, 1, 9, 5, 8]])) + tm.assert_series_equal(r, e) + + def test_nsmallest(self): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list('a' * 5 + 'b' * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series([1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list('aaabbb'), + [0, 4, 1, 6, 7, 8]])) + tm.assert_series_equal(r, e) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert(result[k] == v) + + +def test_decons(): + from pandas.core.groupby import decons_group_index, get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert(np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), + np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', + '-s'], exit=False) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py new file mode 100644 index 
00000000..6fb88eb5 --- /dev/null +++ b/pandas/tests/test_index.py @@ -0,0 +1,2832 @@ +# pylint: disable=E1101,E1103,W0232 + +from datetime import datetime, timedelta +from pandas.compat import range, lrange, lzip, u, zip +import operator +import pickle +import re +import nose +import warnings +import os + +import numpy as np +from numpy.testing import assert_array_equal + +from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex, + InvalidIndexError) +from pandas.tseries.index import DatetimeIndex +from pandas.core.series import Series +from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, + assert_copy) +from pandas import compat +from pandas.compat import long + +import pandas.util.testing as tm +import pandas.core.config as cf + +from pandas.tseries.index import _to_m8 +import pandas.tseries.offsets as offsets + +import pandas as pd +from pandas.lib import Timestamp + +from pandas import _np_version_under1p7 + +class TestIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict( + unicodeIndex = tm.makeUnicodeIndex(100), + strIndex = tm.makeStringIndex(100), + dateIndex = tm.makeDateIndex(100), + intIndex = tm.makeIntIndex(100), + floatIndex = tm.makeFloatIndex(100), + empty = Index([]), + tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], + [1, 2, 3])) + ) + for name, ind in self.indices.items(): + setattr(self, name, ind) + + def test_wrong_number_names(self): + def testit(ind): + ind.names = ["apple", "banana", "carrot"] + + for ind in self.indices.values(): + assertRaisesRegexp(ValueError, "^Length", testit, ind) + + def test_set_name_methods(self): + new_name = "This is the new name for this index" + indices = (self.dateIndex, self.intIndex, self.unicodeIndex, + self.empty) + for ind in indices: + original_name = ind.name + new_ind = ind.set_names([new_name]) + self.assertEqual(new_ind.name, new_name) + self.assertEqual(ind.name, original_name) + res = ind.rename(new_name, inplace=True) + # should return None + self.assertIsNone(res) + self.assertEqual(ind.name, new_name) + self.assertEqual(ind.names, [new_name]) + with assertRaisesRegexp(TypeError, "list-like"): + # should still fail even if it would be the right length + ind.set_names("a") + # rename in place just leaves tuples and other containers alone + name = ('A', 'B') + ind = self.intIndex + ind.rename(name, inplace=True) + self.assertEqual(ind.name, name) + self.assertEqual(ind.names, [name]) + + def test_hash_error(self): + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(self.strIndex).__name__): + hash(self.strIndex) + + def test_new_axis(self): + new_index = self.dateIndex[None, :] + self.assertEqual(new_index.ndim, 2) + tm.assert_isinstance(new_index, np.ndarray) + + def test_copy_and_deepcopy(self): + from copy import copy, deepcopy + + for func in (copy, deepcopy): + idx_copy = func(self.strIndex) + self.assertIsNot(idx_copy, self.strIndex) + self.assertTrue(idx_copy.equals(self.strIndex)) + + new_copy = self.strIndex.copy(deep=True, name="banana") + self.assertEqual(new_copy.name, "banana") + new_copy2 = self.intIndex.copy(dtype=int) + self.assertEqual(new_copy2.dtype.kind, 'i') + + def test_duplicates(self): + idx = Index([0, 0, 0]) + self.assertFalse(idx.is_unique) + + def test_sort(self): + self.assertRaises(TypeError, self.strIndex.sort) + + def test_mutability(self): + self.assertRaises(TypeError, self.strIndex.__setitem__, 0, 'foo') + + def test_constructor(self): + # regular instance creation + 
tm.assert_contains_all(self.strIndex, self.strIndex) + tm.assert_contains_all(self.dateIndex, self.dateIndex) + + # casting + arr = np.array(self.strIndex) + index = arr.view(Index) + tm.assert_contains_all(arr, index) + self.assert_numpy_array_equal(self.strIndex, index) + + # copy + arr = np.array(self.strIndex) + index = Index(arr, copy=True, name='name') + tm.assert_isinstance(index, Index) + self.assertEqual(index.name, 'name') + assert_array_equal(arr, index) + arr[0] = "SOMEBIGLONGSTRING" + self.assertNotEqual(index[0], "SOMEBIGLONGSTRING") + + # what to do here? + # arr = np.array(5.) + # self.assertRaises(Exception, arr.view, Index) + + def test_constructor_corner(self): + # corner case + self.assertRaises(TypeError, Index, 0) + + def test_constructor_from_series(self): + + expected = DatetimeIndex([Timestamp('20110101'),Timestamp('20120101'),Timestamp('20130101')]) + s = Series([Timestamp('20110101'),Timestamp('20120101'),Timestamp('20130101')]) + result = Index(s) + self.assertTrue(result.equals(expected)) + result = DatetimeIndex(s) + self.assertTrue(result.equals(expected)) + + # GH 6273 + # create from a series, passing a freq + s = Series(pd.to_datetime(['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'])) + result = DatetimeIndex(s, freq='MS') + expected = DatetimeIndex(['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'],freq='MS') + self.assertTrue(result.equals(expected)) + + df = pd.DataFrame(np.random.rand(5,3)) + df['date'] = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'] + result = DatetimeIndex(df['date'], freq='MS') + + # GH 6274 + # infer freq of same + result = pd.infer_freq(df['date']) + self.assertEqual(result,'MS') + + def test_constructor_ndarray_like(self): + # GH 5460#issuecomment-44474502 + # it should be possible to convert any object that satisfies the numpy + # ndarray interface directly into an Index + class ArrayLike(object): + def __init__(self, array): + self.array = array + def __array__(self, dtype=None): + return self.array + + for array in [np.arange(5), + np.array(['a', 'b', 'c']), + pd.date_range('2000-01-01', periods=3).values]: + expected = pd.Index(array) + result = pd.Index(ArrayLike(array)) + self.assertTrue(result.equals(expected)) + + def test_index_ctor_infer_periodindex(self): + from pandas import period_range, PeriodIndex + xp = period_range('2012-1-1', freq='M', periods=3) + rs = Index(xp) + assert_array_equal(rs, xp) + tm.assert_isinstance(rs, PeriodIndex) + + def test_constructor_simple_new(self): + idx = Index([1, 2, 3, 4, 5], name='int') + result = idx._simple_new(idx, 'int') + self.assertTrue(result.equals(idx)) + + idx = Index([1.1, np.nan, 2.2, 3.0], name='float') + result = idx._simple_new(idx, 'float') + self.assertTrue(result.equals(idx)) + + idx = Index(['A', 'B', 'C', np.nan], name='obj') + result = idx._simple_new(idx, 'obj') + self.assertTrue(result.equals(idx)) + + def test_copy(self): + i = Index([], name='Foo') + i_copy = i.copy() + self.assertEqual(i_copy.name, 'Foo') + + def test_view(self): + i = Index([], name='Foo') + i_view = i.view() + self.assertEqual(i_view.name, 'Foo') + + def test_astype(self): + casted = self.intIndex.astype('i8') + + # it works! 
+ casted.get_loc(5) + + # pass on name + self.intIndex.name = 'foobar' + casted = self.intIndex.astype('i8') + self.assertEqual(casted.name, 'foobar') + + def test_compat(self): + self.strIndex.tolist() + + def test_equals(self): + # same + self.assertTrue(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) + + # different length + self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b']))) + + # same length, different values + self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'd']))) + + # Must also be an Index + self.assertFalse(Index(['a', 'b', 'c']).equals(['a', 'b', 'c'])) + + def test_insert(self): + + # GH 7256 + # validate neg/pos inserts + result = Index(['b', 'c', 'd']) + + #test 0th element + self.assertTrue(Index(['a', 'b', 'c', 'd']).equals( + result.insert(0, 'a'))) + + #test Nth element that follows Python list behavior + self.assertTrue(Index(['b', 'c', 'e', 'd']).equals( + result.insert(-1, 'e'))) + + #test loc +/- neq (0, -1) + self.assertTrue(result.insert(1, 'z').equals( + result.insert(-2, 'z'))) + + #test empty + null_index = Index([]) + self.assertTrue(Index(['a']).equals( + null_index.insert(0, 'a'))) + + def test_delete(self): + idx = Index(['a', 'b', 'c', 'd'], name='idx') + + expected = Index(['b', 'c', 'd'], name='idx') + result = idx.delete(0) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + + expected = Index(['a', 'b', 'c'], name='idx') + result = idx.delete(-1) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + + with tm.assertRaises((IndexError, ValueError)): + # either depeidnig on numpy version + result = idx.delete(5) + + def test_identical(self): + + # index + i1 = Index(['a', 'b', 'c']) + i2 = Index(['a', 'b', 'c']) + + self.assertTrue(i1.identical(i2)) + + i1 = i1.rename('foo') + self.assertTrue(i1.equals(i2)) + self.assertFalse(i1.identical(i2)) + + i2 = i2.rename('foo') + self.assertTrue(i1.identical(i2)) + + i3 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')]) + i4 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')], tupleize_cols=False) + self.assertFalse(i3.identical(i4)) + + def test_is_(self): + ind = Index(range(10)) + self.assertTrue(ind.is_(ind)) + self.assertTrue(ind.is_(ind.view().view().view().view())) + self.assertFalse(ind.is_(Index(range(10)))) + self.assertFalse(ind.is_(ind.copy())) + self.assertFalse(ind.is_(ind.copy(deep=False))) + self.assertFalse(ind.is_(ind[:])) + self.assertFalse(ind.is_(ind.view(np.ndarray).view(Index))) + self.assertFalse(ind.is_(np.array(range(10)))) + # quasi-implementation dependent + self.assertTrue(ind.is_(ind.view().base)) + ind2 = ind.view() + ind2.name = 'bob' + self.assertTrue(ind.is_(ind2)) + self.assertTrue(ind2.is_(ind)) + # doesn't matter if Indices are *actually* views of underlying data, + self.assertFalse(ind.is_(Index(ind.values))) + arr = np.array(range(1, 11)) + ind1 = Index(arr, copy=False) + ind2 = Index(arr, copy=False) + self.assertFalse(ind1.is_(ind2)) + + def test_asof(self): + d = self.dateIndex[0] + self.assertIs(self.dateIndex.asof(d), d) + self.assertTrue(np.isnan(self.dateIndex.asof(d - timedelta(1)))) + + d = self.dateIndex[-1] + self.assertEqual(self.dateIndex.asof(d + timedelta(1)), d) + + d = self.dateIndex[0].to_datetime() + tm.assert_isinstance(self.dateIndex.asof(d), Timestamp) + + def test_nanosecond_index_access(self): + tm._skip_if_not_numpy17_friendly() + + s = Series([Timestamp('20130101')]).values.view('i8')[0] + r = DatetimeIndex([s + 50 + i for i in range(100)]) + x = 
Series(np.random.randn(100), index=r) + + first_value = x.asof(x.index[0]) + + # this does not yet work, as parsing strings is done via dateutil + #self.assertEqual(first_value, x['2013-01-01 00:00:00.000000050+0000']) + + self.assertEqual(first_value, x[Timestamp(np.datetime64('2013-01-01 00:00:00.000000050+0000', 'ns'))]) + + def test_argsort(self): + result = self.strIndex.argsort() + expected = np.array(self.strIndex).argsort() + self.assert_numpy_array_equal(result, expected) + + def test_comparators(self): + index = self.dateIndex + element = index[len(index) // 2] + element = _to_m8(element) + + arr = np.array(index) + + def _check(op): + arr_result = op(arr, element) + index_result = op(index, element) + + tm.assert_isinstance(index_result, np.ndarray) + self.assertNotIsInstance(index_result, Index) + self.assert_numpy_array_equal(arr_result, index_result) + + _check(operator.eq) + _check(operator.ne) + _check(operator.gt) + _check(operator.lt) + _check(operator.ge) + _check(operator.le) + + def test_booleanindex(self): + boolIdx = np.repeat(True, len(self.strIndex)).astype(bool) + boolIdx[5:30:2] = False + + subIndex = self.strIndex[boolIdx] + + for i, val in enumerate(subIndex): + self.assertEqual(subIndex.get_loc(val), i) + + subIndex = self.strIndex[list(boolIdx)] + for i, val in enumerate(subIndex): + self.assertEqual(subIndex.get_loc(val), i) + + def test_fancy(self): + sl = self.strIndex[[1, 2, 3]] + for i in sl: + self.assertEqual(i, sl[sl.get_loc(i)]) + + def test_empty_fancy(self): + empty_farr = np.array([], dtype=np.float_) + empty_iarr = np.array([], dtype=np.int_) + empty_barr = np.array([], dtype=np.bool_) + + # pd.DatetimeIndex is excluded, because it overrides getitem and should + # be tested separately. + for idx in [self.strIndex, self.intIndex, self.floatIndex]: + empty_idx = idx.__class__([]) + values = idx.values + + self.assertTrue(idx[[]].identical(empty_idx)) + self.assertTrue(idx[empty_iarr].identical(empty_idx)) + self.assertTrue(idx[empty_barr].identical(empty_idx)) + + # np.ndarray only accepts ndarray of int & bool dtypes, so should + # Index. 
+ self.assertRaises(IndexError, idx.__getitem__, empty_farr) + + def test_getitem(self): + arr = np.array(self.dateIndex) + exp = self.dateIndex[5] + exp = _to_m8(exp) + + self.assertEqual(exp, arr[5]) + + def test_shift(self): + shifted = self.dateIndex.shift(0, timedelta(1)) + self.assertIs(shifted, self.dateIndex) + + shifted = self.dateIndex.shift(5, timedelta(1)) + self.assert_numpy_array_equal(shifted, self.dateIndex + timedelta(5)) + + shifted = self.dateIndex.shift(1, 'B') + self.assert_numpy_array_equal(shifted, self.dateIndex + offsets.BDay()) + + shifted.name = 'shifted' + self.assertEqual(shifted.name, shifted.shift(1, 'D').name) + + def test_intersection(self): + first = self.strIndex[:20] + second = self.strIndex[:10] + intersect = first.intersection(second) + + self.assertTrue(tm.equalContents(intersect, second)) + + # Corner cases + inter = first.intersection(first) + self.assertIs(inter, first) + + # non-iterable input + assertRaisesRegexp(TypeError, "iterable", first.intersection, 0.5) + + idx1 = Index([1, 2, 3, 4, 5], name='idx') + # if target has the same name, it is preserved + idx2 = Index([3, 4, 5, 6, 7], name='idx') + expected2 = Index([3, 4, 5], name='idx') + result2 = idx1.intersection(idx2) + self.assertTrue(result2.equals(expected2)) + self.assertEqual(result2.name, expected2.name) + + # if target name is different, it will be reset + idx3 = Index([3, 4, 5, 6, 7], name='other') + expected3 = Index([3, 4, 5], name=None) + result3 = idx1.intersection(idx3) + self.assertTrue(result3.equals(expected3)) + self.assertEqual(result3.name, expected3.name) + + # non monotonic + idx1 = Index([5, 3, 2, 4, 1], name='idx') + idx2 = Index([4, 7, 6, 5, 3], name='idx') + result2 = idx1.intersection(idx2) + self.assertTrue(tm.equalContents(result2, expected2)) + self.assertEqual(result2.name, expected2.name) + + idx3 = Index([4, 7, 6, 5, 3], name='other') + result3 = idx1.intersection(idx3) + self.assertTrue(tm.equalContents(result3, expected3)) + self.assertEqual(result3.name, expected3.name) + + def test_union(self): + first = self.strIndex[5:20] + second = self.strIndex[:10] + everything = self.strIndex[:20] + union = first.union(second) + self.assertTrue(tm.equalContents(union, everything)) + + # Corner cases + union = first.union(first) + self.assertIs(union, first) + + union = first.union([]) + self.assertIs(union, first) + + union = Index([]).union(first) + self.assertIs(union, first) + + # non-iterable input + assertRaisesRegexp(TypeError, "iterable", first.union, 0.5) + + # preserve names + first.name = 'A' + second.name = 'A' + union = first.union(second) + self.assertEqual(union.name, 'A') + + second.name = 'B' + union = first.union(second) + self.assertIsNone(union.name) + + def test_add(self): + firstCat = self.strIndex + self.dateIndex + secondCat = self.strIndex + self.strIndex + + if self.dateIndex.dtype == np.object_: + appended = np.append(self.strIndex, self.dateIndex) + else: + appended = np.append(self.strIndex, self.dateIndex.astype('O')) + + self.assertTrue(tm.equalContents(firstCat, appended)) + self.assertTrue(tm.equalContents(secondCat, self.strIndex)) + tm.assert_contains_all(self.strIndex, firstCat) + tm.assert_contains_all(self.strIndex, secondCat) + tm.assert_contains_all(self.dateIndex, firstCat) + + def test_append_multiple(self): + index = Index(['a', 'b', 'c', 'd', 'e', 'f']) + + foos = [index[:2], index[2:4], index[4:]] + result = foos[0].append(foos[1:]) + self.assertTrue(result.equals(index)) + + # empty + result = index.append([]) + 
self.assertTrue(result.equals(index)) + + def test_append_empty_preserve_name(self): + left = Index([], name='foo') + right = Index([1, 2, 3], name='foo') + + result = left.append(right) + self.assertEqual(result.name, 'foo') + + left = Index([], name='foo') + right = Index([1, 2, 3], name='bar') + + result = left.append(right) + self.assertIsNone(result.name) + + def test_add_string(self): + # from bug report + index = Index(['a', 'b', 'c']) + index2 = index + 'foo' + + self.assertNotIn('a', index2) + self.assertIn('afoo', index2) + + def test_iadd_string(self): + index = pd.Index(['a', 'b', 'c']) + # doesn't fail test unless there is a check before `+=` + self.assertIn('a', index) + + index += '_x' + self.assertIn('a_x', index) + + def test_diff(self): + first = self.strIndex[5:20] + second = self.strIndex[:10] + answer = self.strIndex[10:20] + first.name = 'name' + # different names + result = first - second + + self.assertTrue(tm.equalContents(result, answer)) + self.assertEqual(result.name, None) + + # same names + second.name = 'name' + result = first - second + self.assertEqual(result.name, 'name') + + # with empty + result = first.diff([]) + self.assertTrue(tm.equalContents(result, first)) + self.assertEqual(result.name, first.name) + + # with everythin + result = first.diff(first) + self.assertEqual(len(result), 0) + self.assertEqual(result.name, first.name) + + # non-iterable input + assertRaisesRegexp(TypeError, "iterable", first.diff, 0.5) + + def test_symmetric_diff(self): + # smoke + idx1 = Index([1, 2, 3, 4], name='idx1') + idx2 = Index([2, 3, 4, 5]) + result = idx1.sym_diff(idx2) + expected = Index([1, 5]) + self.assertTrue(tm.equalContents(result, expected)) + self.assertIsNone(result.name) + + # __xor__ syntax + expected = idx1 ^ idx2 + self.assertTrue(tm.equalContents(result, expected)) + self.assertIsNone(result.name) + + # multiIndex + idx1 = MultiIndex.from_tuples(self.tuples) + idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)]) + result = idx1.sym_diff(idx2) + expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) + self.assertTrue(tm.equalContents(result, expected)) + + # nans: + # GH #6444, sorting of nans. Make sure the number of nans is right + # and the correct non-nan values are there. punt on sorting. 
+ idx1 = Index([1, 2, 3, np.nan]) + idx2 = Index([0, 1, np.nan]) + result = idx1.sym_diff(idx2) + # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + nans = pd.isnull(result) + self.assertEqual(nans.sum(), 2) + self.assertEqual((~nans).sum(), 3) + [self.assertIn(x, result) for x in [0.0, 2.0, 3.0]] + + # other not an Index: + idx1 = Index([1, 2, 3, 4], name='idx1') + idx2 = np.array([2, 3, 4, 5]) + expected = Index([1, 5]) + result = idx1.sym_diff(idx2) + self.assertTrue(tm.equalContents(result, expected)) + self.assertEqual(result.name, 'idx1') + + result = idx1.sym_diff(idx2, result_name='new_name') + self.assertTrue(tm.equalContents(result, expected)) + self.assertEqual(result.name, 'new_name') + + # other isn't iterable + with tm.assertRaises(TypeError): + idx1 - 1 + + def test_pickle(self): + def testit(index): + pickled = pickle.dumps(index) + unpickled = pickle.loads(pickled) + + tm.assert_isinstance(unpickled, Index) + self.assert_numpy_array_equal(unpickled, index) + self.assertEqual(unpickled.name, index.name) + + # tm.assert_dict_equal(unpickled.indexMap, index.indexMap) + + testit(self.strIndex) + self.strIndex.name = 'foo' + testit(self.strIndex) + + testit(self.dateIndex) + + def test_is_numeric(self): + self.assertFalse(self.dateIndex.is_numeric()) + self.assertFalse(self.strIndex.is_numeric()) + self.assertTrue(self.intIndex.is_numeric()) + self.assertTrue(self.floatIndex.is_numeric()) + + def test_is_all_dates(self): + self.assertTrue(self.dateIndex.is_all_dates) + self.assertFalse(self.strIndex.is_all_dates) + self.assertFalse(self.intIndex.is_all_dates) + + def test_summary(self): + self._check_method_works(Index.summary) + # GH3869 + ind = Index(['{other}%s', "~:{range}:0"], name='A') + result = ind.summary() + # shouldn't be formatted accidentally. + self.assertIn('~:{range}:0', result) + self.assertIn('{other}%s', result) + + def test_format(self): + self._check_method_works(Index.format) + + index = Index([datetime.now()]) + formatted = index.format() + expected = [str(index[0])] + self.assertEqual(formatted, expected) + + # 2845 + index = Index([1, 2.0+3.0j, np.nan]) + formatted = index.format() + expected = [str(index[0]), str(index[1]), u('NaN')] + self.assertEqual(formatted, expected) + + # is this really allowed? 
+ index = Index([1, 2.0+3.0j, None]) + formatted = index.format() + expected = [str(index[0]), str(index[1]), u('NaN')] + self.assertEqual(formatted, expected) + + self.strIndex[:0].format() + + def test_format_with_name_time_info(self): + # bug I fixed 12/20/2011 + inc = timedelta(hours=4) + dates = Index([dt + inc for dt in self.dateIndex], name='something') + + formatted = dates.format(name=True) + self.assertEqual(formatted[0], 'something') + + def test_format_datetime_with_time(self): + t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) + + result = t.format() + expected = ['2012-02-07 00:00:00', '2012-02-07 23:00:00'] + self.assertEqual(len(result), 2) + self.assertEqual(result, expected) + + def test_format_none(self): + values = ['a', 'b', 'c', None] + + idx = Index(values) + idx.format() + self.assertIsNone(idx[3]) + + def test_take(self): + indexer = [4, 3, 0, 2] + result = self.dateIndex.take(indexer) + expected = self.dateIndex[indexer] + self.assertTrue(result.equals(expected)) + + def _check_method_works(self, method): + method(self.empty) + method(self.dateIndex) + method(self.unicodeIndex) + method(self.strIndex) + method(self.intIndex) + method(self.tuples) + + def test_get_indexer(self): + idx1 = Index([1, 2, 3, 4, 5]) + idx2 = Index([2, 4, 6]) + + r1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, [1, 3, -1]) + + r1 = idx2.get_indexer(idx1, method='pad') + assert_almost_equal(r1, [-1, 0, 0, 1, 1]) + + rffill1 = idx2.get_indexer(idx1, method='ffill') + assert_almost_equal(r1, rffill1) + + r1 = idx2.get_indexer(idx1, method='backfill') + assert_almost_equal(r1, [0, 0, 1, 1, 2]) + + rbfill1 = idx2.get_indexer(idx1, method='bfill') + assert_almost_equal(r1, rbfill1) + + def test_slice_locs(self): + idx = Index([0, 1, 2, 5, 6, 7, 9, 10]) + n = len(idx) + + self.assertEqual(idx.slice_locs(start=2), (2, n)) + self.assertEqual(idx.slice_locs(start=3), (3, n)) + self.assertEqual(idx.slice_locs(3, 8), (3, 6)) + self.assertEqual(idx.slice_locs(5, 10), (3, n)) + self.assertEqual(idx.slice_locs(end=8), (0, 6)) + self.assertEqual(idx.slice_locs(end=9), (0, 7)) + + idx2 = idx[::-1] + self.assertRaises(KeyError, idx2.slice_locs, 8, 2) + self.assertRaises(KeyError, idx2.slice_locs, 7, 3) + + def test_slice_locs_dup(self): + idx = Index(['a', 'a', 'b', 'c', 'd', 'd']) + rs = idx.slice_locs('a', 'd') + self.assertEqual(rs, (0, 6)) + + rs = idx.slice_locs(end='d') + self.assertEqual(rs, (0, 6)) + + rs = idx.slice_locs('a', 'c') + self.assertEqual(rs, (0, 4)) + + rs = idx.slice_locs('b', 'd') + self.assertEqual(rs, (2, 6)) + + def test_drop(self): + n = len(self.strIndex) + + dropped = self.strIndex.drop(self.strIndex[lrange(5, 10)]) + expected = self.strIndex[lrange(5) + lrange(10, n)] + self.assertTrue(dropped.equals(expected)) + + self.assertRaises(ValueError, self.strIndex.drop, ['foo', 'bar']) + + dropped = self.strIndex.drop(self.strIndex[0]) + expected = self.strIndex[1:] + self.assertTrue(dropped.equals(expected)) + + ser = Index([1, 2, 3]) + dropped = ser.drop(1) + expected = Index([2, 3]) + self.assertTrue(dropped.equals(expected)) + + def test_tuple_union_bug(self): + import pandas + import numpy as np + + aidx1 = np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], + dtype=[('num', int), ('let', 'a1')]) + aidx2 = np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B'), (1, 'C'), (2, + 'C')], dtype=[('num', int), ('let', 'a1')]) + + idx1 = pandas.Index(aidx1) + idx2 = pandas.Index(aidx2) + + # intersection broken? 
+ int_idx = idx1.intersection(idx2) + # needs to be 1d like idx1 and idx2 + expected = idx1[:4] # pandas.Index(sorted(set(idx1) & set(idx2))) + self.assertEqual(int_idx.ndim, 1) + self.assertTrue(int_idx.equals(expected)) + + # union broken + union_idx = idx1.union(idx2) + expected = idx2 + self.assertEqual(union_idx.ndim, 1) + self.assertTrue(union_idx.equals(expected)) + + def test_is_monotonic_incomparable(self): + index = Index([5, datetime.now(), 7]) + self.assertFalse(index.is_monotonic) + + def test_get_set_value(self): + values = np.random.randn(100) + date = self.dateIndex[67] + + assert_almost_equal(self.dateIndex.get_value(values, date), + values[67]) + + self.dateIndex.set_value(values, date, 10) + self.assertEqual(values[67], 10) + + def test_isin(self): + values = ['foo', 'bar'] + + idx = Index(['qux', 'baz', 'foo', 'bar']) + result = idx.isin(values) + expected = np.array([False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + # empty, return dtype bool + idx = Index([]) + result = idx.isin(values) + self.assertEqual(len(result), 0) + self.assertEqual(result.dtype, np.bool_) + + def test_boolean_cmp(self): + values = [1, 2, 3, 4] + + idx = Index(values) + res = (idx == values) + + self.assertTrue(res.all()) + self.assertEqual(res.dtype, 'bool') + self.assertNotIsInstance(res, Index) + + def test_get_level_values(self): + result = self.strIndex.get_level_values(0) + self.assertTrue(result.equals(self.strIndex)) + + def test_slice_keep_name(self): + idx = Index(['a', 'b'], name='asdf') + self.assertEqual(idx.name, idx[1:].name) + + def test_join_self(self): + # instance attributes of the form self.Index + indices = 'unicode', 'str', 'date', 'int', 'float' + kinds = 'outer', 'inner', 'left', 'right' + for index_kind in indices: + res = getattr(self, '{0}Index'.format(index_kind)) + + for kind in kinds: + joined = res.join(res, how=kind) + self.assertIs(res, joined) + + def test_indexing_doesnt_change_class(self): + idx = Index([1, 2, 3, 'a', 'b', 'c']) + + self.assertTrue(idx[1:3].identical( + pd.Index([2, 3], dtype=np.object_))) + self.assertTrue(idx[[0,1]].identical( + pd.Index([1, 2], dtype=np.object_))) + + def test_outer_join_sort(self): + left_idx = Index(np.random.permutation(15)) + right_idx = tm.makeDateIndex(10) + + with tm.assert_produces_warning(RuntimeWarning): + joined = left_idx.join(right_idx, how='outer') + # right_idx in this case because DatetimeIndex has join precedence over + # Int64Index + expected = right_idx.astype(object).union(left_idx.astype(object)) + tm.assert_index_equal(joined, expected) + + def test_nan_first_take_datetime(self): + idx = Index([pd.NaT, Timestamp('20130101'), Timestamp('20130102')]) + res = idx.take([-1, 0, 1]) + exp = Index([idx[-1], idx[0], idx[1]]) + tm.assert_index_equal(res, exp) + + +class TestFloat64Index(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.mixed = Float64Index([1.5, 2, 3, 4, 5]) + self.float = Float64Index(np.arange(5) * 2.5) + + def test_hash_error(self): + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(self.float).__name__): + hash(self.float) + + def test_repr_roundtrip(self): + for ind in (self.mixed, self.float): + tm.assert_index_equal(eval(repr(ind)), ind) + + def check_is_index(self, i): + self.assertIsInstance(i, Index) + self.assertNotIsInstance(i, Float64Index) + + def check_coerce(self, a, b, is_float_index=True): + self.assertTrue(a.equals(b)) + if is_float_index: + self.assertIsInstance(b, Float64Index) + else: + 
self.check_is_index(b) + + def test_constructor(self): + + # explicit construction + index = Float64Index([1,2,3,4,5]) + self.assertIsInstance(index, Float64Index) + self.assertTrue((index.values == np.array([1,2,3,4,5],dtype='float64')).all()) + index = Float64Index(np.array([1,2,3,4,5])) + self.assertIsInstance(index, Float64Index) + index = Float64Index([1.,2,3,4,5]) + self.assertIsInstance(index, Float64Index) + index = Float64Index(np.array([1.,2,3,4,5])) + self.assertIsInstance(index, Float64Index) + self.assertEqual(index.dtype, float) + + index = Float64Index(np.array([1.,2,3,4,5]),dtype=np.float32) + self.assertIsInstance(index, Float64Index) + self.assertEqual(index.dtype, np.float64) + + index = Float64Index(np.array([1,2,3,4,5]),dtype=np.float32) + self.assertIsInstance(index, Float64Index) + self.assertEqual(index.dtype, np.float64) + + # nan handling + result = Float64Index([np.nan, np.nan]) + self.assertTrue(pd.isnull(result.values).all()) + result = Float64Index(np.array([np.nan])) + self.assertTrue(pd.isnull(result.values).all()) + result = Index(np.array([np.nan])) + self.assertTrue(pd.isnull(result.values).all()) + + def test_constructor_invalid(self): + + # invalid + self.assertRaises(TypeError, Float64Index, 0.) + self.assertRaises(TypeError, Float64Index, ['a','b',0.]) + self.assertRaises(TypeError, Float64Index, [Timestamp('20130101')]) + + def test_constructor_coerce(self): + + self.check_coerce(self.mixed,Index([1.5, 2, 3, 4, 5])) + self.check_coerce(self.float,Index(np.arange(5) * 2.5)) + self.check_coerce(self.float,Index(np.array(np.arange(5) * 2.5, dtype=object))) + + def test_constructor_explicit(self): + + # these don't auto convert + self.check_coerce(self.float,Index((np.arange(5) * 2.5), dtype=object), + is_float_index=False) + self.check_coerce(self.mixed,Index([1.5, 2, 3, 4, 5],dtype=object), + is_float_index=False) + + def test_astype(self): + + result = self.float.astype(object) + self.assertTrue(result.equals(self.float)) + self.assertTrue(self.float.equals(result)) + self.check_is_index(result) + + i = self.mixed.copy() + i.name = 'foo' + result = i.astype(object) + self.assertTrue(result.equals(i)) + self.assertTrue(i.equals(result)) + self.check_is_index(result) + + def test_equals(self): + + i = Float64Index([1.0,2.0]) + self.assertTrue(i.equals(i)) + self.assertTrue(i.identical(i)) + + i2 = Float64Index([1.0,2.0]) + self.assertTrue(i.equals(i2)) + + i = Float64Index([1.0,np.nan]) + self.assertTrue(i.equals(i)) + self.assertTrue(i.identical(i)) + + i2 = Float64Index([1.0,np.nan]) + self.assertTrue(i.equals(i2)) + + def test_contains_nans(self): + i = Float64Index([1.0, 2.0, np.nan]) + self.assertTrue(np.nan in i) + + def test_contains_not_nans(self): + i = Float64Index([1.0, 2.0, np.nan]) + self.assertTrue(1.0 in i) + + def test_doesnt_contain_all_the_things(self): + i = Float64Index([np.nan]) + self.assertFalse(i.isin([0]).item()) + self.assertFalse(i.isin([1]).item()) + self.assertTrue(i.isin([np.nan]).item()) + + def test_nan_multiple_containment(self): + i = Float64Index([1.0, np.nan]) + np.testing.assert_array_equal(i.isin([1.0]), np.array([True, False])) + np.testing.assert_array_equal(i.isin([2.0, np.pi]), + np.array([False, False])) + np.testing.assert_array_equal(i.isin([np.nan]), + np.array([False, True])) + np.testing.assert_array_equal(i.isin([1.0, np.nan]), + np.array([True, True])) + i = Float64Index([1.0, 2.0]) + np.testing.assert_array_equal(i.isin([np.nan]), + np.array([False, False])) + + def test_astype_from_object(self): + 
index = Index([1.0, np.nan, 0.2], dtype='object') + result = index.astype(float) + expected = Float64Index([1.0, np.nan, 0.2]) + tm.assert_equal(result.dtype, expected.dtype) + tm.assert_index_equal(result, expected) + + +class TestInt64Index(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.index = Int64Index(np.arange(0, 20, 2)) + + def test_too_many_names(self): + def testit(): + self.index.names = ["roger", "harold"] + assertRaisesRegexp(ValueError, "^Length", testit) + + def test_constructor(self): + # pass list, coerce fine + index = Int64Index([-5, 0, 1, 2]) + expected = np.array([-5, 0, 1, 2], dtype=np.int64) + self.assert_numpy_array_equal(index, expected) + + # from iterable + index = Int64Index(iter([-5, 0, 1, 2])) + self.assert_numpy_array_equal(index, expected) + + # scalar raise Exception + self.assertRaises(TypeError, Int64Index, 5) + + # copy + arr = self.index.values + new_index = Int64Index(arr, copy=True) + self.assert_numpy_array_equal(new_index, self.index) + val = arr[0] + 3000 + # this should not change index + arr[0] = val + self.assertNotEqual(new_index[0], val) + + def test_constructor_corner(self): + arr = np.array([1, 2, 3, 4], dtype=object) + index = Int64Index(arr) + self.assertEqual(index.values.dtype, np.int64) + self.assertTrue(index.equals(arr)) + + # preventing casting + arr = np.array([1, '2', 3, '4'], dtype=object) + with tm.assertRaisesRegexp(TypeError, 'casting'): + Int64Index(arr) + + arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1] + with tm.assertRaisesRegexp(TypeError, 'casting'): + Int64Index(arr_with_floats) + + def test_hash_error(self): + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(self.index).__name__): + hash(self.index) + + def test_copy(self): + i = Int64Index([], name='Foo') + i_copy = i.copy() + self.assertEqual(i_copy.name, 'Foo') + + def test_view(self): + i = Int64Index([], name='Foo') + i_view = i.view() + self.assertEqual(i_view.name, 'Foo') + + def test_coerce_list(self): + # coerce things + arr = Index([1, 2, 3, 4]) + tm.assert_isinstance(arr, Int64Index) + + # but not if explicit dtype passed + arr = Index([1, 2, 3, 4], dtype=object) + tm.assert_isinstance(arr, Index) + + def test_dtype(self): + self.assertEqual(self.index.dtype, np.int64) + + def test_is_monotonic(self): + self.assertTrue(self.index.is_monotonic) + + index = Int64Index([4, 3, 2, 1]) + self.assertFalse(index.is_monotonic) + + def test_equals(self): + same_values = Index(self.index, dtype=object) + self.assertTrue(self.index.equals(same_values)) + self.assertTrue(same_values.equals(self.index)) + + def test_identical(self): + i = Index(self.index.copy()) + self.assertTrue(i.identical(self.index)) + + same_values_different_type = Index(i, dtype=object) + self.assertFalse(i.identical(same_values_different_type)) + + i = self.index.copy(dtype=object) + i = i.rename('foo') + same_values = Index(i, dtype=object) + self.assertTrue(same_values.identical(self.index.copy(dtype=object))) + + self.assertFalse(i.identical(self.index)) + self.assertTrue(Index(same_values, name='foo', dtype=object + ).identical(i)) + + self.assertFalse( + self.index.copy(dtype=object) + .identical(self.index.copy(dtype='int64'))) + + def test_get_indexer(self): + target = Int64Index(np.arange(10)) + indexer = self.index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + self.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_pad(self): + target = Int64Index(np.arange(10)) + indexer = 
self.index.get_indexer(target, method='pad') + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) + self.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_backfill(self): + target = Int64Index(np.arange(10)) + indexer = self.index.get_indexer(target, method='backfill') + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + self.assert_numpy_array_equal(indexer, expected) + + def test_join_outer(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + # guarantee of sortedness + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assertTrue(res.equals(noidx_res)) + + eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], + dtype=np.int64) + eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], + dtype=np.int64) + + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='outer', + return_indexers=True) + noidx_res = self.index.join(other_mono, how='outer') + self.assertTrue(res.equals(noidx_res)) + + eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], + dtype=np.int64) + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_inner(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = self.index.join(other, how='inner', + return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = Int64Index([2, 12]) + elidx = np.array([1, 6]) + eridx = np.array([4, 1]) + + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='inner', + return_indexers=True) + + res2 = self.index.intersection(other_mono) + self.assertTrue(res.equals(res2)) + + eridx = np.array([1, 4]) + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_left(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = self.index.join(other, how='left', + return_indexers=True) + eres = self.index + eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], + dtype=np.int64) + + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, eridx) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='left', + return_indexers=True) + eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], + dtype=np.int64) + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, eridx) + + # non-unique + """ + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,7,9]) + res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) + 
eres = idx2 + eridx = np.array([0, 2, 3, -1, -1]) + elidx = np.array([0, 1, 2, 3, 4]) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + """ + + def test_join_right(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = self.index.join(other, how='right', + return_indexers=True) + eres = other + elidx = np.array([-1, 6, -1, -1, 1, -1], + dtype=np.int64) + + tm.assert_isinstance(other, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assertIsNone(ridx) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='right', + return_indexers=True) + eres = other_mono + elidx = np.array([-1, 1, -1, -1, 6, -1], + dtype=np.int64) + tm.assert_isinstance(other, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assertIsNone(ridx) + + # non-unique + """ + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,7,9]) + res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) + eres = idx2 + elidx = np.array([0, 2, 3, -1, -1]) + eridx = np.array([0, 1, 2, 3, 4]) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,9,7]) + res = idx.join(idx2, how='right', return_indexers=False) + eres = idx2 + self.assert(res.equals(eres)) + """ + + def test_join_non_int_index(self): + other = Index([3, 6, 7, 8, 10], dtype=object) + + outer = self.index.join(other, how='outer') + outer2 = other.join(self.index, how='outer') + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, + 16, 18], dtype=object) + self.assertTrue(outer.equals(outer2)) + self.assertTrue(outer.equals(expected)) + + inner = self.index.join(other, how='inner') + inner2 = other.join(self.index, how='inner') + expected = Index([6, 8, 10], dtype=object) + self.assertTrue(inner.equals(inner2)) + self.assertTrue(inner.equals(expected)) + + left = self.index.join(other, how='left') + self.assertTrue(left.equals(self.index)) + + left2 = other.join(self.index, how='left') + self.assertTrue(left2.equals(other)) + + right = self.index.join(other, how='right') + self.assertTrue(right.equals(other)) + + right2 = other.join(self.index, how='right') + self.assertTrue(right2.equals(self.index)) + + def test_join_non_unique(self): + left = Index([4, 4, 3, 3]) + + joined, lidx, ridx = left.join(left, return_indexers=True) + + exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) + self.assertTrue(joined.equals(exp_joined)) + + exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.int64) + self.assert_numpy_array_equal(lidx, exp_lidx) + + exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.int64) + self.assert_numpy_array_equal(ridx, exp_ridx) + + def test_join_self(self): + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = self.index.join(self.index, how=kind) + self.assertIs(self.index, joined) + + def test_intersection(self): + other = Index([1, 2, 3, 4, 5]) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, other.values)) + self.assert_numpy_array_equal(result, expected) + + result = other.intersection(self.index) + expected = np.sort(np.asarray(np.intersect1d(self.index.values, + other.values))) + self.assert_numpy_array_equal(result, expected) + + def test_intersect_str_dates(self): + dt_dates = [datetime(2012, 2, 
9), datetime(2012, 2, 22)] + + i1 = Index(dt_dates, dtype=object) + i2 = Index(['aa'], dtype=object) + res = i2.intersection(i1) + + self.assertEqual(len(res), 0) + + def test_union_noncomparable(self): + from datetime import datetime, timedelta + # corner case, non-Int64Index + now = datetime.now() + other = Index([now + timedelta(i) for i in range(4)], dtype=object) + result = self.index.union(other) + expected = np.concatenate((self.index, other)) + self.assert_numpy_array_equal(result, expected) + + result = other.union(self.index) + expected = np.concatenate((other, self.index)) + self.assert_numpy_array_equal(result, expected) + + def test_cant_or_shouldnt_cast(self): + # can't + data = ['foo', 'bar', 'baz'] + self.assertRaises(TypeError, Int64Index, data) + + # shouldn't + data = ['0', '1', '2'] + self.assertRaises(TypeError, Int64Index, data) + + def test_view_Index(self): + self.index.view(Index) + + def test_prevent_casting(self): + result = self.index.astype('O') + self.assertEqual(result.dtype, np.object_) + + def test_take_preserve_name(self): + index = Int64Index([1, 2, 3, 4], name='foo') + taken = index.take([3, 0, 1]) + self.assertEqual(index.name, taken.name) + + def test_int_name_format(self): + from pandas import Series, DataFrame + index = Index(['a', 'b', 'c'], name=0) + s = Series(lrange(3), index) + df = DataFrame(lrange(3), index=index) + repr(s) + repr(df) + + def test_print_unicode_columns(self): + df = pd.DataFrame( + {u("\u05d0"): [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) + repr(df.columns) # should not raise UnicodeDecodeError + + def test_repr_summary(self): + with cf.option_context('display.max_seq_items', 10): + r = repr(pd.Index(np.arange(1000))) + self.assertTrue(len(r) < 100) + self.assertTrue("..." in r) + + def test_repr_roundtrip(self): + tm.assert_index_equal(eval(repr(self.index)), self.index) + + def test_unicode_string_with_unicode(self): + idx = Index(lrange(1000)) + + if compat.PY3: + str(idx) + else: + compat.text_type(idx) + + def test_bytestring_with_unicode(self): + idx = Index(lrange(1000)) + if compat.PY3: + bytes(idx) + else: + str(idx) + + def test_slice_keep_name(self): + idx = Int64Index([1, 2], name='asdf') + self.assertEqual(idx.name, idx[1:].name) + + +class TestMultiIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + major_axis = Index(['foo', 'bar', 'baz', 'qux']) + minor_axis = Index(['one', 'two']) + + major_labels = np.array([0, 0, 1, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 0, 1]) + self.index_names = ['first', 'second'] + self.index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=self.index_names, verify_integrity=False) + + def test_hash_error(self): + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(self.index).__name__): + hash(self.index) + + def test_set_names_and_rename(self): + # so long as these are synonyms, we don't need to test set_names + self.assertEqual(self.index.rename, self.index.set_names) + new_names = [name + "SUFFIX" for name in self.index_names] + ind = self.index.set_names(new_names) + self.assertEqual(self.index.names, self.index_names) + self.assertEqual(ind.names, new_names) + with assertRaisesRegexp(ValueError, "^Length"): + ind.set_names(new_names + new_names) + new_names2 = [name + "SUFFIX2" for name in new_names] + res = ind.set_names(new_names2, inplace=True) + self.assertIsNone(res) + self.assertEqual(ind.names, new_names2) + + def test_set_levels_and_set_labels(self): + # side note - you 
probably wouldn't want to use levels and labels + # directly like this - but it is possible. + levels, labels = self.index.levels, self.index.labels + new_levels = [[lev + 'a' for lev in level] for level in levels] + major_labels, minor_labels = labels + major_labels = [(x + 1) % 3 for x in major_labels] + minor_labels = [(x + 1) % 1 for x in minor_labels] + new_labels = [major_labels, minor_labels] + + def assert_matching(actual, expected): + # avoid specifying internal representation + # as much as possible + self.assertEqual(len(actual), len(expected)) + for act, exp in zip(actual, expected): + act = np.asarray(act) + exp = np.asarray(exp) + assert_almost_equal(act, exp) + + # level changing [w/o mutation] + ind2 = self.index.set_levels(new_levels) + assert_matching(ind2.levels, new_levels) + assert_matching(self.index.levels, levels) + + # level changing [w/ mutation] + ind2 = self.index.copy() + inplace_return = ind2.set_levels(new_levels, inplace=True) + self.assertIsNone(inplace_return) + assert_matching(ind2.levels, new_levels) + + # label changing [w/o mutation] + ind2 = self.index.set_labels(new_labels) + assert_matching(ind2.labels, new_labels) + assert_matching(self.index.labels, labels) + + # label changing [w/ mutation] + ind2 = self.index.copy() + inplace_return = ind2.set_labels(new_labels, inplace=True) + self.assertIsNone(inplace_return) + assert_matching(ind2.labels, new_labels) + + def test_set_levels_labels_names_bad_input(self): + levels, labels = self.index.levels, self.index.labels + names = self.index.names + + with tm.assertRaisesRegexp(ValueError, 'Length of levels'): + self.index.set_levels([levels[0]]) + + with tm.assertRaisesRegexp(ValueError, 'Length of labels'): + self.index.set_labels([labels[0]]) + + with tm.assertRaisesRegexp(ValueError, 'Length of names'): + self.index.set_names([names[0]]) + + # shouldn't scalar data error, instead should demand list-like + with tm.assertRaisesRegexp(TypeError, 'list of lists-like'): + self.index.set_levels(levels[0]) + + # shouldn't scalar data error, instead should demand list-like + with tm.assertRaisesRegexp(TypeError, 'list of lists-like'): + self.index.set_labels(labels[0]) + + # shouldn't scalar data error, instead should demand list-like + with tm.assertRaisesRegexp(TypeError, 'list-like'): + self.index.set_names(names[0]) + + def test_metadata_immutable(self): + levels, labels = self.index.levels, self.index.labels + # shouldn't be able to set at either the top level or base level + mutable_regex = re.compile('does not support mutable operations') + with assertRaisesRegexp(TypeError, mutable_regex): + levels[0] = levels[0] + with assertRaisesRegexp(TypeError, mutable_regex): + levels[0][0] = levels[0][0] + # ditto for labels + with assertRaisesRegexp(TypeError, mutable_regex): + labels[0] = labels[0] + with assertRaisesRegexp(TypeError, mutable_regex): + labels[0][0] = labels[0][0] + # and for names + names = self.index.names + with assertRaisesRegexp(TypeError, mutable_regex): + names[0] = names[0] + + def test_inplace_mutation_resets_values(self): + levels = [['a', 'b', 'c'], [4]] + levels2 = [[1, 2, 3], ['a']] + labels = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] + mi1 = MultiIndex(levels=levels, labels=labels) + mi2 = MultiIndex(levels=levels2, labels=labels) + vals = mi1.values.copy() + vals2 = mi2.values.copy() + self.assertIsNotNone(mi1._tuples) + + # make sure level setting works + new_vals = mi1.set_levels(levels2).values + assert_almost_equal(vals2, new_vals) + # non-inplace doesn't kill _tuples 
[implementation detail] + assert_almost_equal(mi1._tuples, vals) + # and values is still same too + assert_almost_equal(mi1.values, vals) + + # inplace should kill _tuples + mi1.set_levels(levels2, inplace=True) + assert_almost_equal(mi1.values, vals2) + + # make sure label setting works too + labels2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] + exp_values = np.array([(long(1), 'a')] * 6, dtype=object) + new_values = mi2.set_labels(labels2).values + # not inplace shouldn't change + assert_almost_equal(mi2._tuples, vals2) + # should have correct values + assert_almost_equal(exp_values, new_values) + + # and again setting inplace should kill _tuples, etc + mi2.set_labels(labels2, inplace=True) + assert_almost_equal(mi2.values, new_values) + + def test_copy_in_constructor(self): + levels = np.array(["a", "b", "c"]) + labels = np.array([1, 1, 2, 0, 0, 1, 1]) + val = labels[0] + mi = MultiIndex(levels=[levels, levels], labels=[labels, labels], + copy=True) + self.assertEqual(mi.labels[0][0], val) + labels[0] = 15 + self.assertEqual(mi.labels[0][0], val) + val = levels[0] + levels[0] = "PANDA" + self.assertEqual(mi.levels[0][0], val) + + def test_set_value_keeps_names(self): + # motivating example from #3742 + lev1 = ['hans', 'hans', 'hans', 'grethe', 'grethe', 'grethe'] + lev2 = ['1', '2', '3'] * 2 + idx = pd.MultiIndex.from_arrays( + [lev1, lev2], + names=['Name', 'Number']) + df = pd.DataFrame( + np.random.randn(6, 4), + columns=['one', 'two', 'three', 'four'], + index=idx) + df = df.sortlevel() + self.assertIsNone(df.is_copy) + self.assertEqual(df.index.names, ('Name', 'Number')) + df = df.set_value(('grethe', '4'), 'one', 99.34) + self.assertIsNone(df.is_copy) + self.assertEqual(df.index.names, ('Name', 'Number')) + + def test_names(self): + + # names are assigned in __init__ + names = self.index_names + level_names = [level.name for level in self.index.levels] + self.assertEqual(names, level_names) + + # setting bad names on existing + index = self.index + assertRaisesRegexp(ValueError, "^Length of names", setattr, index, + "names", list(index.names) + ["third"]) + assertRaisesRegexp(ValueError, "^Length of names", setattr, index, + "names", []) + + # initializing with bad names (should always be equivalent) + major_axis, minor_axis = self.index.levels + major_labels, minor_labels = self.index.labels + assertRaisesRegexp(ValueError, "^Length of names", MultiIndex, + levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=['first']) + assertRaisesRegexp(ValueError, "^Length of names", MultiIndex, + levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=['first', 'second', 'third']) + + # names are assigned + index.names = ["a", "b"] + ind_names = list(index.names) + level_names = [level.name for level in index.levels] + self.assertEqual(ind_names, level_names) + + def test_reference_duplicate_name(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], names=['x', 'x']) + self.assertTrue(idx._reference_duplicate_name('x')) + + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], names=['x', 'y']) + self.assertFalse(idx._reference_duplicate_name('x')) + + def test_astype(self): + expected = self.index.copy() + actual = self.index.astype('O') + assert_copy(actual.levels, expected.levels) + assert_copy(actual.labels, expected.labels) + self.check_level_names(actual, expected.names) + + with assertRaisesRegexp(TypeError, "^Setting.*dtype.*object"): + self.index.astype(np.dtype(int)) + + def test_constructor_single_level(self): + 
single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], + names=['first']) + tm.assert_isinstance(single_level, Index) + self.assertNotIsInstance(single_level, MultiIndex) + self.assertEqual(single_level.name, 'first') + + single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]]) + self.assertIsNone(single_level.name) + + def test_constructor_no_levels(self): + assertRaisesRegexp(ValueError, "non-zero number of levels/labels", + MultiIndex, levels=[], labels=[]) + both_re = re.compile('Must pass both levels and labels') + with tm.assertRaisesRegexp(TypeError, both_re): + MultiIndex(levels=[]) + with tm.assertRaisesRegexp(TypeError, both_re): + MultiIndex(labels=[]) + + def test_constructor_mismatched_label_levels(self): + labels = [np.array([1]), np.array([2]), np.array([3])] + levels = ["a"] + assertRaisesRegexp(ValueError, "Length of levels and labels must be" + " the same", MultiIndex, levels=levels, + labels=labels) + length_error = re.compile('>= length of level') + label_error = re.compile(r'Unequal label lengths: \[4, 2\]') + + # important to check that it's looking at the right thing. + with tm.assertRaisesRegexp(ValueError, length_error): + MultiIndex(levels=[['a'], ['b']], labels=[[0, 1, 2, 3], [0, 3, 4, 1]]) + + with tm.assertRaisesRegexp(ValueError, label_error): + MultiIndex(levels=[['a'], ['b']], labels=[[0, 0, 0, 0], [0, 0]]) + + # external API + with tm.assertRaisesRegexp(ValueError, length_error): + self.index.copy().set_levels([['a'], ['b']]) + + with tm.assertRaisesRegexp(ValueError, label_error): + self.index.copy().set_labels([[0, 0, 0, 0], [0, 0]]) + + # deprecated properties + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + + with tm.assertRaisesRegexp(ValueError, length_error): + self.index.copy().levels = [['a'], ['b']] + + with tm.assertRaisesRegexp(ValueError, label_error): + self.index.copy().labels = [[0, 0, 0, 0], [0, 0]] + + + def assert_multiindex_copied(self, copy, original): + # levels shoudl be (at least, shallow copied) + assert_copy(copy.levels, original.levels) + + assert_almost_equal(copy.labels, original.labels) + + # labels doesn't matter which way copied + assert_almost_equal(copy.labels, original.labels) + self.assertIsNot(copy.labels, original.labels) + + # names doesn't matter which way copied + self.assertEqual(copy.names, original.names) + self.assertIsNot(copy.names, original.names) + + # sort order should be copied + self.assertEqual(copy.sortorder, original.sortorder) + + def test_copy(self): + i_copy = self.index.copy() + + self.assert_multiindex_copied(i_copy, self.index) + + def test_shallow_copy(self): + i_copy = self.index._shallow_copy() + + self.assert_multiindex_copied(i_copy, self.index) + + def test_view(self): + i_view = self.index.view() + + self.assert_multiindex_copied(i_view, self.index) + + def check_level_names(self, index, names): + self.assertEqual([level.name for level in index.levels], list(names)) + + def test_changing_names(self): + # names should be applied to levels + level_names = [level.name for level in self.index.levels] + self.check_level_names(self.index, self.index.names) + + view = self.index.view() + copy = self.index.copy() + shallow_copy = self.index._shallow_copy() + + # changing names should change level names on object + new_names = [name + "a" for name in self.index.names] + self.index.names = new_names + self.check_level_names(self.index, new_names) + + # but not on copies + self.check_level_names(view, level_names) 
+ self.check_level_names(copy, level_names) + self.check_level_names(shallow_copy, level_names) + + # and copies shouldn't change original + shallow_copy.names = [name + "c" for name in shallow_copy.names] + self.check_level_names(self.index, new_names) + + def test_duplicate_names(self): + self.index.names = ['foo', 'foo'] + assertRaisesRegexp(KeyError, 'Level foo not found', + self.index._get_level_number, 'foo') + + def test_get_level_number_integer(self): + self.index.names = [1, 0] + self.assertEqual(self.index._get_level_number(1), 0) + self.assertEqual(self.index._get_level_number(0), 1) + self.assertRaises(IndexError, self.index._get_level_number, 2) + assertRaisesRegexp(KeyError, 'Level fourth not found', + self.index._get_level_number, 'fourth') + + def test_from_arrays(self): + arrays = [] + for lev, lab in zip(self.index.levels, self.index.labels): + arrays.append(np.asarray(lev).take(lab)) + + result = MultiIndex.from_arrays(arrays) + self.assertEqual(list(result), list(self.index)) + + def test_from_product(self): + first = ['foo', 'bar', 'buz'] + second = ['a', 'b', 'c'] + names = ['first', 'second'] + result = MultiIndex.from_product([first, second], names=names) + + tuples = [('foo', 'a'), ('foo', 'b'), ('foo', 'c'), + ('bar', 'a'), ('bar', 'b'), ('bar', 'c'), + ('buz', 'a'), ('buz', 'b'), ('buz', 'c')] + expected = MultiIndex.from_tuples(tuples, names=names) + + assert_array_equal(result, expected) + self.assertEqual(result.names, names) + + def test_from_product_datetimeindex(self): + dt_index = pd.date_range('2000-01-01', periods=2) + mi = pd.MultiIndex.from_product([[1, 2], dt_index]) + etalon = pd.lib.list_to_object_array([(1, pd.Timestamp('2000-01-01')), + (1, pd.Timestamp('2000-01-02')), + (2, pd.Timestamp('2000-01-01')), + (2, pd.Timestamp('2000-01-02'))]) + assert_array_equal(mi.values, etalon) + + def test_append(self): + result = self.index[:3].append(self.index[3:]) + self.assertTrue(result.equals(self.index)) + + foos = [self.index[:1], self.index[1:3], self.index[3:]] + result = foos[0].append(foos[1:]) + self.assertTrue(result.equals(self.index)) + + # empty + result = self.index.append([]) + self.assertTrue(result.equals(self.index)) + + def test_get_level_values(self): + result = self.index.get_level_values(0) + expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux'] + self.assert_numpy_array_equal(result, expected) + + self.assertEqual(result.name, 'first') + + result = self.index.get_level_values('first') + expected = self.index.get_level_values(0) + self.assert_numpy_array_equal(result, expected) + + def test_get_level_values_na(self): + arrays = [['a', 'b', 'b'], [1, np.nan, 2]] + index = pd.MultiIndex.from_arrays(arrays) + values = index.get_level_values(1) + expected = [1, np.nan, 2] + assert_array_equal(values.values.astype(float), expected) + + arrays = [['a', 'b', 'b'], [np.nan, np.nan, 2]] + index = pd.MultiIndex.from_arrays(arrays) + values = index.get_level_values(1) + expected = [np.nan, np.nan, 2] + assert_array_equal(values.values.astype(float), expected) + + arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]] + index = pd.MultiIndex.from_arrays(arrays) + values = index.get_level_values(0) + expected = [np.nan, np.nan, np.nan] + assert_array_equal(values.values.astype(float), expected) + values = index.get_level_values(1) + expected = ['a', np.nan, 1] + assert_array_equal(values.values, expected) + + if not _np_version_under1p7: + arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])] + index = pd.MultiIndex.from_arrays(arrays) + 
values = index.get_level_values(1) + expected = pd.DatetimeIndex([0, 1, pd.NaT]) + assert_array_equal(values.values, expected.values) + + arrays = [[], []] + index = pd.MultiIndex.from_arrays(arrays) + values = index.get_level_values(0) + self.assertEqual(values.shape, (0,)) + + def test_reorder_levels(self): + # this blows up + assertRaisesRegexp(IndexError, '^Too many levels', + self.index.reorder_levels, [2, 1, 0]) + + def test_nlevels(self): + self.assertEqual(self.index.nlevels, 2) + + def test_iter(self): + result = list(self.index) + expected = [('foo', 'one'), ('foo', 'two'), ('bar', 'one'), + ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] + self.assertEqual(result, expected) + + def test_pickle(self): + pickled = pickle.dumps(self.index) + unpickled = pickle.loads(pickled) + self.assertTrue(self.index.equals(unpickled)) + + def test_legacy_pickle(self): + if compat.PY3: + raise nose.SkipTest("doesn't work on Python 3") + + def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + + ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle') + obj = pickle.load(open(ppath, 'r')) + + self.assertTrue(obj._is_v1) + + obj2 = MultiIndex.from_tuples(obj.values) + self.assertTrue(obj.equals(obj2)) + + res = obj.get_indexer(obj) + exp = np.arange(len(obj)) + assert_almost_equal(res, exp) + + res = obj.get_indexer(obj2[::-1]) + exp = obj.get_indexer(obj[::-1]) + exp2 = obj2.get_indexer(obj2[::-1]) + assert_almost_equal(res, exp) + assert_almost_equal(exp, exp2) + + def test_legacy_v2_unpickle(self): + # 0.7.3 -> 0.8.0 format manage + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'mindex_073.pickle') + + obj = pd.read_pickle(filepath) + + obj2 = MultiIndex.from_tuples(obj.values) + self.assertTrue(obj.equals(obj2)) + + res = obj.get_indexer(obj) + exp = np.arange(len(obj)) + assert_almost_equal(res, exp) + + res = obj.get_indexer(obj2[::-1]) + exp = obj.get_indexer(obj[::-1]) + exp2 = obj2.get_indexer(obj2[::-1]) + assert_almost_equal(res, exp) + assert_almost_equal(exp, exp2) + + def test_from_tuples_index_values(self): + result = MultiIndex.from_tuples(self.index) + self.assertTrue((result.values == self.index.values).all()) + + def test_contains(self): + self.assertIn(('foo', 'two'), self.index) + self.assertNotIn(('bar', 'two'), self.index) + self.assertNotIn(None, self.index) + + def test_is_all_dates(self): + self.assertFalse(self.index.is_all_dates) + + def test_is_numeric(self): + # MultiIndex is never numeric + self.assertFalse(self.index.is_numeric()) + + def test_getitem(self): + # scalar + self.assertEqual(self.index[2], ('bar', 'one')) + + # slice + result = self.index[2:5] + expected = self.index[[2, 3, 4]] + self.assertTrue(result.equals(expected)) + + # boolean + result = self.index[[True, False, True, False, True, True]] + result2 = self.index[np.array([True, False, True, False, True, True])] + expected = self.index[[0, 2, 4, 5]] + self.assertTrue(result.equals(expected)) + self.assertTrue(result2.equals(expected)) + + def test_getitem_group_select(self): + sorted_idx, _ = self.index.sortlevel(0) + self.assertEqual(sorted_idx.get_loc('baz'), slice(3, 4)) + self.assertEqual(sorted_idx.get_loc('foo'), slice(0, 2)) + + def test_get_loc(self): + self.assertEqual(self.index.get_loc(('foo', 'two')), 1) + self.assertEqual(self.index.get_loc(('baz', 'two')), 3) + self.assertRaises(KeyError, self.index.get_loc, ('bar', 'two')) + self.assertRaises(KeyError, self.index.get_loc, 'quux') + + # 3 levels + index = 
MultiIndex(levels=[Index(lrange(4)), + Index(lrange(4)), + Index(lrange(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + self.assertRaises(KeyError, index.get_loc, (1, 1)) + self.assertEqual(index.get_loc((2, 0)), slice(3, 5)) + + def test_get_loc_duplicates(self): + index = Index([2, 2, 2, 2]) + result = index.get_loc(2) + expected = slice(0, 4) + self.assertEqual(result, expected) + # self.assertRaises(Exception, index.get_loc, 2) + + index = Index(['c', 'a', 'a', 'b', 'b']) + rs = index.get_loc('c') + xp = 0 + assert(rs == xp) + + def test_get_loc_level(self): + index = MultiIndex(levels=[Index(lrange(4)), + Index(lrange(4)), + Index(lrange(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + loc, new_index = index.get_loc_level((0, 1)) + expected = slice(1, 2) + exp_index = index[expected].droplevel(0).droplevel(0) + self.assertEqual(loc, expected) + self.assertTrue(new_index.equals(exp_index)) + + loc, new_index = index.get_loc_level((0, 1, 0)) + expected = 1 + self.assertEqual(loc, expected) + self.assertIsNone(new_index) + + self.assertRaises(KeyError, index.get_loc_level, (2, 2)) + + index = MultiIndex(levels=[[2000], lrange(4)], + labels=[np.array([0, 0, 0, 0]), + np.array([0, 1, 2, 3])]) + result, new_index = index.get_loc_level((2000, slice(None, None))) + expected = slice(None, None) + self.assertEqual(result, expected) + self.assertTrue(new_index.equals(index.droplevel(0))) + + def test_slice_locs(self): + df = tm.makeTimeDataFrame() + stacked = df.stack() + idx = stacked.index + + slob = slice(*idx.slice_locs(df.index[5], df.index[15])) + sliced = stacked[slob] + expected = df[5:16].stack() + tm.assert_almost_equal(sliced.values, expected.values) + + slob = slice(*idx.slice_locs(df.index[5] + timedelta(seconds=30), + df.index[15] - timedelta(seconds=30))) + sliced = stacked[slob] + expected = df[6:15].stack() + tm.assert_almost_equal(sliced.values, expected.values) + + def test_slice_locs_with_type_mismatch(self): + df = tm.makeTimeDataFrame() + stacked = df.stack() + idx = stacked.index + assertRaisesRegexp(TypeError, '^Level type mismatch', idx.slice_locs, + (1, 3)) + assertRaisesRegexp(TypeError, '^Level type mismatch', idx.slice_locs, + df.index[5] + timedelta(seconds=30), (5, 2)) + df = tm.makeCustomDataframe(5, 5) + stacked = df.stack() + idx = stacked.index + with assertRaisesRegexp(TypeError, '^Level type mismatch'): + idx.slice_locs(timedelta(seconds=30)) + # TODO: Try creating a UnicodeDecodeError in exception message + with assertRaisesRegexp(TypeError, '^Level type mismatch'): + idx.slice_locs(df.index[1], (16, "a")) + + def test_slice_locs_not_sorted(self): + index = MultiIndex(levels=[Index(lrange(4)), + Index(lrange(4)), + Index(lrange(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + assertRaisesRegexp(KeyError, "[Kk]ey length.*greater than MultiIndex" + " lexsort depth", index.slice_locs, (1, 0, 1), + (2, 1, 0)) + + # works + sorted_index, _ = index.sortlevel(0) + # should there be a test case here??? 
+ sorted_index.slice_locs((1, 0, 1), (2, 1, 0)) + + def test_slice_locs_partial(self): + sorted_idx, _ = self.index.sortlevel(0) + + result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one')) + self.assertEqual(result, (1, 5)) + + result = sorted_idx.slice_locs(None, ('qux', 'one')) + self.assertEqual(result, (0, 5)) + + result = sorted_idx.slice_locs(('foo', 'two'), None) + self.assertEqual(result, (1, len(sorted_idx))) + + result = sorted_idx.slice_locs('bar', 'baz') + self.assertEqual(result, (2, 4)) + + def test_slice_locs_not_contained(self): + # some searchsorted action + + index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]], + labels=[[0, 0, 0, 1, 1, 2, 3, 3, 3], + [0, 1, 2, 1, 2, 2, 0, 1, 2]], + sortorder=0) + + result = index.slice_locs((1, 0), (5, 2)) + self.assertEqual(result, (3, 6)) + + result = index.slice_locs(1, 5) + self.assertEqual(result, (3, 6)) + + result = index.slice_locs((2, 2), (5, 2)) + self.assertEqual(result, (3, 6)) + + result = index.slice_locs(2, 5) + self.assertEqual(result, (3, 6)) + + result = index.slice_locs((1, 0), (6, 3)) + self.assertEqual(result, (3, 8)) + + result = index.slice_locs(-1, 10) + self.assertEqual(result, (0, len(index))) + + def test_consistency(self): + # need to construct an overflow + major_axis = lrange(70000) + minor_axis = lrange(10) + + major_labels = np.arange(70000) + minor_labels = np.repeat(lrange(10), 7000) + + # the fact that is works means it's consistent + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + + # inconsistent + major_labels = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + + self.assertFalse(index.is_unique) + + def test_truncate(self): + major_axis = Index(lrange(4)) + minor_axis = Index(lrange(2)) + + major_labels = np.array([0, 0, 1, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + + result = index.truncate(before=1) + self.assertNotIn('foo', result.levels[0]) + self.assertIn(1, result.levels[0]) + + result = index.truncate(after=1) + self.assertNotIn(2, result.levels[0]) + self.assertIn(1, result.levels[0]) + + result = index.truncate(before=1, after=2) + self.assertEqual(len(result.levels[0]), 2) + + # after < before + self.assertRaises(ValueError, index.truncate, 3, 1) + + def test_get_indexer(self): + major_axis = Index(lrange(4)) + minor_axis = Index(lrange(2)) + + major_labels = np.array([0, 0, 1, 2, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 0, 1, 0, 1]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + idx1 = index[:5] + idx2 = index[[1, 3, 5]] + + r1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, [1, 3, -1]) + + r1 = idx2.get_indexer(idx1, method='pad') + assert_almost_equal(r1, [-1, 0, 0, 1, 1]) + + rffill1 = idx2.get_indexer(idx1, method='ffill') + assert_almost_equal(r1, rffill1) + + r1 = idx2.get_indexer(idx1, method='backfill') + assert_almost_equal(r1, [0, 0, 1, 1, 2]) + + rbfill1 = idx2.get_indexer(idx1, method='bfill') + assert_almost_equal(r1, rbfill1) + + # pass non-MultiIndex + r1 = idx1.get_indexer(idx2._tuple_index) + rexp1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, rexp1) + + r1 = idx1.get_indexer([1, 2, 3]) + self.assertTrue((r1 == [-1, -1, -1]).all()) + + # create index with duplicates + idx1 = Index(lrange(10) + lrange(10)) + idx2 = 
Index(lrange(20)) + assertRaisesRegexp(InvalidIndexError, "Reindexing only valid with" + " uniquely valued Index objects", + idx1.get_indexer, idx2) + + def test_format(self): + self.index.format() + self.index[:0].format() + + def test_format_integer_names(self): + index = MultiIndex(levels=[[0, 1], [0, 1]], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=[0, 1]) + index.format(names=True) + + def test_format_sparse_display(self): + index = MultiIndex(levels=[[0, 1], [0, 1], [0, 1], [0]], + labels=[[0, 0, 0, 1, 1, 1], + [0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0]]) + + result = index.format() + self.assertEqual(result[3], '1 0 0 0') + + def test_format_sparse_config(self): + import warnings + warn_filters = warnings.filters + warnings.filterwarnings('ignore', + category=FutureWarning, + module=".*format") + # GH1538 + pd.set_option('display.multi_sparse', False) + + result = self.index.format() + self.assertEqual(result[1], 'foo two') + + self.reset_display_options() + + warnings.filters = warn_filters + + def test_to_hierarchical(self): + index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), + (2, 'one'), (2, 'two')]) + result = index.to_hierarchical(3) + expected = MultiIndex(levels=[[1, 2], ['one', 'two']], + labels=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + tm.assert_index_equal(result, expected) + self.assertEqual(result.names, index.names) + + # K > 1 + result = index.to_hierarchical(3, 2) + expected = MultiIndex(levels=[[1, 2], ['one', 'two']], + labels=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) + tm.assert_index_equal(result, expected) + self.assertEqual(result.names, index.names) + + # non-sorted + index = MultiIndex.from_tuples([(2, 'c'), (1, 'b'), + (2, 'a'), (2, 'b')], + names=['N1', 'N2']) + + result = index.to_hierarchical(2) + expected = MultiIndex.from_tuples([(2, 'c'), (2, 'c'), (1, 'b'), (1, 'b'), + (2, 'a'), (2, 'a'), (2, 'b'), (2, 'b')], + names=['N1', 'N2']) + tm.assert_index_equal(result, expected) + self.assertEqual(result.names, index.names) + + def test_bounds(self): + self.index._bounds + + def test_equals(self): + self.assertTrue(self.index.equals(self.index)) + self.assertTrue(self.index.equal_levels(self.index)) + + self.assertFalse(self.index.equals(self.index[:-1])) + + self.assertTrue(self.index.equals(self.index._tuple_index)) + + # different number of levels + index = MultiIndex(levels=[Index(lrange(4)), + Index(lrange(4)), + Index(lrange(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + index2 = MultiIndex(levels=index.levels[:-1], + labels=index.labels[:-1]) + self.assertFalse(index.equals(index2)) + self.assertFalse(index.equal_levels(index2)) + + # levels are different + major_axis = Index(lrange(4)) + minor_axis = Index(lrange(2)) + + major_labels = np.array([0, 0, 1, 2, 2, 3]) + minor_labels = np.array([0, 1, 0, 0, 1, 0]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + self.assertFalse(self.index.equals(index)) + self.assertFalse(self.index.equal_levels(index)) + + # some of the labels are different + major_axis = Index(['foo', 'bar', 'baz', 'qux']) + minor_axis = Index(['one', 'two']) + + major_labels = np.array([0, 0, 2, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + self.assertFalse(self.index.equals(index)) + + def 
test_identical(self): + mi = self.index.copy() + mi2 = self.index.copy() + self.assertTrue(mi.identical(mi2)) + + mi = mi.set_names(['new1', 'new2']) + self.assertTrue(mi.equals(mi2)) + self.assertFalse(mi.identical(mi2)) + + mi2 = mi2.set_names(['new1', 'new2']) + self.assertTrue(mi.identical(mi2)) + + mi3 = Index(mi.tolist(), names=mi.names) + mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False) + self.assertTrue(mi.identical(mi3)) + self.assertFalse(mi.identical(mi4)) + self.assertTrue(mi.equals(mi4)) + + def test_is_(self): + mi = MultiIndex.from_tuples(lzip(range(10), range(10))) + self.assertTrue(mi.is_(mi)) + self.assertTrue(mi.is_(mi.view())) + self.assertTrue(mi.is_(mi.view().view().view().view())) + mi2 = mi.view() + # names are metadata, they don't change id + mi2.names = ["A", "B"] + self.assertTrue(mi2.is_(mi)) + self.assertTrue(mi.is_(mi2)) + self.assertTrue(mi.is_(mi.set_names(["C", "D"]))) + mi2 = mi.view() + mi2.set_names(["E", "F"], inplace=True) + self.assertTrue(mi.is_(mi2)) + # levels are inherent properties, they change identity + mi3 = mi2.set_levels([lrange(10), lrange(10)]) + self.assertFalse(mi3.is_(mi2)) + # shouldn't change + self.assertTrue(mi2.is_(mi)) + mi4 = mi3.view() + mi4.set_levels([[1 for _ in range(10)], lrange(10)], inplace=True) + self.assertFalse(mi4.is_(mi3)) + mi5 = mi.view() + mi5.set_levels(mi5.levels, inplace=True) + self.assertFalse(mi5.is_(mi)) + + def test_union(self): + piece1 = self.index[:5][::-1] + piece2 = self.index[3:] + + the_union = piece1 | piece2 + + tups = sorted(self.index._tuple_index) + expected = MultiIndex.from_tuples(tups) + + self.assertTrue(the_union.equals(expected)) + + # corner case, pass self or empty thing: + the_union = self.index.union(self.index) + self.assertIs(the_union, self.index) + + the_union = self.index.union(self.index[:0]) + self.assertIs(the_union, self.index) + + # won't work in python 3 + # tuples = self.index._tuple_index + # result = self.index[:4] | tuples[4:] + # self.assertTrue(result.equals(tuples)) + + # not valid for python 3 + # def test_union_with_regular_index(self): + # other = Index(['A', 'B', 'C']) + + # result = other.union(self.index) + # self.assertIn(('foo', 'one'), result) + # self.assertIn('B', result) + + # result2 = self.index.union(other) + # self.assertTrue(result.equals(result2)) + + def test_intersection(self): + piece1 = self.index[:5][::-1] + piece2 = self.index[3:] + + the_int = piece1 & piece2 + tups = sorted(self.index[3:5]._tuple_index) + expected = MultiIndex.from_tuples(tups) + self.assertTrue(the_int.equals(expected)) + + # corner case, pass self + the_int = self.index.intersection(self.index) + self.assertIs(the_int, self.index) + + # empty intersection: disjoint + empty = self.index[:2] & self.index[2:] + expected = self.index[:0] + self.assertTrue(empty.equals(expected)) + + # can't do in python 3 + # tuples = self.index._tuple_index + # result = self.index & tuples + # self.assertTrue(result.equals(tuples)) + + def test_diff(self): + first = self.index + result = first - self.index[-3:] + expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), + sortorder=0, + names=self.index.names) + + tm.assert_isinstance(result, MultiIndex) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: reflexive + result = self.index - self.index + expected = self.index[:0] + self.assertTrue(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: superset + 
result = self.index[-3:] - self.index + expected = self.index[:0] + self.assertTrue(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: degenerate + result = self.index[:0] - self.index + expected = self.index[:0] + self.assertTrue(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # names not the same + chunklet = self.index[-3:] + chunklet.names = ['foo', 'baz'] + result = first - chunklet + self.assertEqual(result.names, (None, None)) + + # empty, but non-equal + result = self.index - self.index.sortlevel(1)[0] + self.assertEqual(len(result), 0) + + # raise Exception called with non-MultiIndex + result = first.diff(first._tuple_index) + self.assertTrue(result.equals(first[:0])) + + # name from empty array + result = first.diff([]) + self.assertTrue(first.equals(result)) + self.assertEqual(first.names, result.names) + + # name from non-empty array + result = first.diff([('foo', 'one')]) + expected = pd.MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), + ('foo', 'two'), ('qux', 'one'), + ('qux', 'two')]) + expected.names = first.names + self.assertEqual(first.names, result.names) + assertRaisesRegexp(TypeError, "other must be a MultiIndex or a list" + " of tuples", first.diff, [1, 2, 3, 4, 5]) + + def test_from_tuples(self): + assertRaisesRegexp(TypeError, 'Cannot infer number of levels from' + ' empty list', MultiIndex.from_tuples, []) + + idx = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) + self.assertEqual(len(idx), 2) + + def test_argsort(self): + result = self.index.argsort() + expected = self.index._tuple_index.argsort() + self.assert_numpy_array_equal(result, expected) + + def test_sortlevel(self): + import random + + tuples = list(self.index) + random.shuffle(tuples) + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + self.assertTrue(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + self.assertTrue(sorted_idx.equals(expected[::-1])) + + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + self.assertTrue(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + self.assertTrue(sorted_idx.equals(expected[::-1])) + + def test_sortlevel_not_sort_remaining(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + sorted_idx, _ = mi.sortlevel('A', sort_remaining=False) + self.assertTrue(sorted_idx.equals(mi)) + + def test_sortlevel_deterministic(self): + tuples = [('bar', 'one'), ('foo', 'two'), ('qux', 'two'), + ('foo', 'one'), ('baz', 'two'), ('qux', 'one')] + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + self.assertTrue(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + self.assertTrue(sorted_idx.equals(expected[::-1])) + + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + self.assertTrue(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + self.assertTrue(sorted_idx.equals(expected[::-1])) + + def test_dims(self): + pass + + def test_drop(self): + dropped = self.index.drop([('foo', 'two'), ('qux', 'one')]) + + index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')]) + dropped2 = self.index.drop(index) 
+ + expected = self.index[[0, 2, 3, 5]] + self.assertTrue(dropped.equals(expected)) + self.assertTrue(dropped2.equals(expected)) + + dropped = self.index.drop(['bar']) + expected = self.index[[0, 1, 3, 4, 5]] + self.assertTrue(dropped.equals(expected)) + + index = MultiIndex.from_tuples([('bar', 'two')]) + self.assertRaises(KeyError, self.index.drop, [('bar', 'two')]) + self.assertRaises(KeyError, self.index.drop, index) + + # mixed partial / full drop + dropped = self.index.drop(['foo', ('qux', 'one')]) + expected = self.index[[2, 3, 5]] + self.assertTrue(dropped.equals(expected)) + + def test_droplevel_with_names(self): + index = self.index[self.index.get_loc('foo')] + dropped = index.droplevel(0) + self.assertEqual(dropped.name, 'second') + + index = MultiIndex(levels=[Index(lrange(4)), + Index(lrange(4)), + Index(lrange(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])], + names=['one', 'two', 'three']) + dropped = index.droplevel(0) + self.assertEqual(dropped.names, ('two', 'three')) + + dropped = index.droplevel('two') + expected = index.droplevel(1) + self.assertTrue(dropped.equals(expected)) + + def test_droplevel_multiple(self): + index = MultiIndex(levels=[Index(lrange(4)), + Index(lrange(4)), + Index(lrange(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])], + names=['one', 'two', 'three']) + + dropped = index[:2].droplevel(['three', 'one']) + expected = index[:2].droplevel(2).droplevel(0) + self.assertTrue(dropped.equals(expected)) + + def test_insert(self): + # key contained in all levels + new_index = self.index.insert(0, ('bar', 'two')) + self.assertTrue(new_index.equal_levels(self.index)) + self.assertEqual(new_index[0], ('bar', 'two')) + + # key not contained in all levels + new_index = self.index.insert(0, ('abc', 'three')) + self.assert_numpy_array_equal(new_index.levels[0], + list(self.index.levels[0]) + ['abc']) + self.assert_numpy_array_equal(new_index.levels[1], + list(self.index.levels[1]) + ['three']) + self.assertEqual(new_index[0], ('abc', 'three')) + + # key wrong length + assertRaisesRegexp(ValueError, "Item must have length equal to number" + " of levels", self.index.insert, 0, ('foo2',)) + + def test_take_preserve_name(self): + taken = self.index.take([3, 0, 1]) + self.assertEqual(taken.names, self.index.names) + + def test_join_level(self): + def _check_how(other, how): + join_index, lidx, ridx = other.join(self.index, how=how, + level='second', + return_indexers=True) + + exp_level = other.join(self.index.levels[1], how=how) + self.assertTrue(join_index.levels[0].equals(self.index.levels[0])) + self.assertTrue(join_index.levels[1].equals(exp_level)) + + # pare down levels + mask = np.array( + [x[1] in exp_level for x in self.index], dtype=bool) + exp_values = self.index.values[mask] + self.assert_numpy_array_equal(join_index.values, exp_values) + + if how in ('outer', 'inner'): + join_index2, ridx2, lidx2 = \ + self.index.join(other, how=how, level='second', + return_indexers=True) + + self.assertTrue(join_index.equals(join_index2)) + self.assert_numpy_array_equal(lidx, lidx2) + self.assert_numpy_array_equal(ridx, ridx2) + self.assert_numpy_array_equal(join_index2.values, exp_values) + + def _check_all(other): + _check_how(other, 'outer') + _check_how(other, 'inner') + _check_how(other, 'left') + _check_how(other, 'right') + + _check_all(Index(['three', 'one', 'two'])) + _check_all(Index(['one'])) + 
_check_all(Index(['one', 'three'])) + + # some corner cases + idx = Index(['three', 'one', 'two']) + result = idx.join(self.index, level='second') + tm.assert_isinstance(result, MultiIndex) + + assertRaisesRegexp(TypeError, "Join.*MultiIndex.*ambiguous", + self.index.join, self.index, level=1) + + def test_join_self(self): + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + res = self.index + joined = res.join(res, how=kind) + self.assertIs(res, joined) + + def test_reindex(self): + result, indexer = self.index.reindex(list(self.index[:4])) + tm.assert_isinstance(result, MultiIndex) + self.check_level_names(result, self.index[:4].names) + + result, indexer = self.index.reindex(list(self.index)) + tm.assert_isinstance(result, MultiIndex) + self.assertIsNone(indexer) + self.check_level_names(result, self.index.names) + + def test_reindex_level(self): + idx = Index(['one']) + + target, indexer = self.index.reindex(idx, level='second') + target2, indexer2 = idx.reindex(self.index, level='second') + + exp_index = self.index.join(idx, level='second', how='right') + exp_index2 = self.index.join(idx, level='second', how='left') + + self.assertTrue(target.equals(exp_index)) + exp_indexer = np.array([0, 2, 4]) + self.assert_numpy_array_equal(indexer, exp_indexer) + + self.assertTrue(target2.equals(exp_index2)) + exp_indexer2 = np.array([0, -1, 0, -1, 0, -1]) + self.assert_numpy_array_equal(indexer2, exp_indexer2) + + assertRaisesRegexp(TypeError, "Fill method not supported", + self.index.reindex, self.index, method='pad', + level='second') + + assertRaisesRegexp(TypeError, "Fill method not supported", + idx.reindex, idx, method='bfill', level='first') + + def test_has_duplicates(self): + self.assertFalse(self.index.has_duplicates) + self.assertTrue(self.index.append(self.index).has_duplicates) + + index = MultiIndex(levels=[[0, 1], [0, 1, 2]], + labels=[[0, 0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 0, 1, 2]]) + self.assertTrue(index.has_duplicates) + + def test_tolist(self): + result = self.index.tolist() + exp = list(self.index.values) + self.assertEqual(result, exp) + + def test_repr_with_unicode_data(self): + with pd.core.config.option_context("display.encoding",'UTF-8'): + d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + index = pd.DataFrame(d).set_index(["a", "b"]).index + self.assertFalse("\\u" in repr(index)) # we don't want unicode-escaped + + def test_repr_roundtrip(self): + tm.assert_index_equal(eval(repr(self.index)), self.index) + + def test_unicode_string_with_unicode(self): + d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + idx = pd.DataFrame(d).set_index(["a", "b"]).index + + if compat.PY3: + str(idx) + else: + compat.text_type(idx) + + def test_bytestring_with_unicode(self): + d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + idx = pd.DataFrame(d).set_index(["a", "b"]).index + + if compat.PY3: + bytes(idx) + else: + str(idx) + + def test_slice_keep_name(self): + x = MultiIndex.from_tuples([('a', 'b'), (1, 2), ('c', 'd')], + names=['x', 'y']) + self.assertEqual(x[1:].names, x.names) + + def test_isnull_behavior(self): + # should not segfault GH5123 + # NOTE: if MI representation changes, may make sense to allow + # isnull(MI) + with tm.assertRaises(NotImplementedError): + pd.isnull(self.index) + + def test_level_setting_resets_attributes(self): + ind = MultiIndex.from_arrays([ + ['A', 'A', 'B', 'B', 'B'], + [1, 2, 1, 2, 3]]) + assert ind.is_monotonic + ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], + inplace=True) + 
# if this fails, probably didn't reset the cache correctly. + assert not ind.is_monotonic + + +def test_get_combined_index(): + from pandas.core.index import _get_combined_index + result = _get_combined_index([]) + assert(result.equals(Index([]))) + + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py new file mode 100644 index 00000000..0e962800 --- /dev/null +++ b/pandas/tests/test_indexing.py @@ -0,0 +1,3725 @@ +# pylint: disable-msg=W0612,E1101 +import nose +import itertools +import warnings + +from pandas.compat import range, lrange, lzip, StringIO, lmap, map +from numpy import nan +from numpy.random import randn +import numpy as np + +import pandas as pd +import pandas.core.common as com +from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, + MultiIndex, Float64Index, Timestamp) +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assert_panel_equal) +from pandas import concat + +import pandas.util.testing as tm +from pandas import date_range + +_verbose = False + +#------------------------------------------------------------------------------- +# Indexing test cases + + +def _generate_indices(f, values=False): + """ generate the indicies + if values is True , use the axis values + is False, use the range + """ + + axes = f.axes + if values: + axes = [ lrange(len(a)) for a in axes ] + + return itertools.product(*axes) + +def _get_value(f, i, values=False): + """ return the value for the location i """ + + # check agains values + if values: + return f.values[i] + + # this is equiv of f[col][row]..... + #v = f + #for a in reversed(i): + # v = v.__getitem__(a) + #return v + return f.ix[i] + +def _get_result(obj, method, key, axis): + """ return the result for this obj with this key and this axis """ + + if isinstance(key, dict): + key = key[axis] + + # use an artifical conversion to map the key as integers to the labels + # so ix can work for comparisions + if method == 'indexer': + method = 'ix' + key = obj._get_axis(axis)[key] + + # in case we actually want 0 index slicing + try: + xp = getattr(obj, method).__getitem__(_axify(obj,key,axis)) + except: + xp = getattr(obj, method).__getitem__(key) + + return xp + +def _axify(obj, key, axis): + # create a tuple accessor + if axis is not None: + axes = [ slice(None) ] * obj.ndim + axes[axis] = key + return tuple(axes) + return k + + +def _mklbl(prefix,n): + return ["%s%s" % (prefix,i) for i in range(n)] + +class TestIndexing(tm.TestCase): + + _multiprocess_can_split_ = True + + _objs = set(['series','frame','panel']) + _typs = set(['ints','labels','mixed','ts','floats','empty']) + + def setUp(self): + + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + self.series_ints = Series(np.random.rand(4), index=lrange(0,8,2)) + self.frame_ints = DataFrame(np.random.randn(4, 4), index=lrange(0, 8, 2), columns=lrange(0,12,3)) + self.panel_ints = Panel(np.random.rand(4,4,4), items=lrange(0,8,2),major_axis=lrange(0,12,3),minor_axis=lrange(0,16,4)) + + self.series_labels = Series(np.random.randn(4), index=list('abcd')) + self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) + self.panel_labels = Panel(np.random.randn(4,4,4), items=list('abcd'), major_axis=list('ABCD'), minor_axis=list('ZYXW')) + + self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) + 
self.frame_mixed = DataFrame(np.random.randn(4, 4), index=[2, 4, 'null', 8]) + self.panel_mixed = Panel(np.random.randn(4,4,4), items=[2,4,'null',8]) + + self.series_ts = Series(np.random.randn(4), index=date_range('20130101', periods=4)) + self.frame_ts = DataFrame(np.random.randn(4, 4), index=date_range('20130101', periods=4)) + self.panel_ts = Panel(np.random.randn(4, 4, 4), items=date_range('20130101', periods=4)) + + #self.series_floats = Series(np.random.randn(4), index=[1.00, 2.00, 3.00, 4.00]) + #self.frame_floats = DataFrame(np.random.randn(4, 4), columns=[1.00, 2.00, 3.00, 4.00]) + #self.panel_floats = Panel(np.random.rand(4,4,4), items = [1.00,2.00,3.00,4.00]) + + self.frame_empty = DataFrame({}) + self.series_empty = Series({}) + self.panel_empty = Panel({}) + + # form agglomerates + for o in self._objs: + + d = dict() + for t in self._typs: + d[t] = getattr(self,'%s_%s' % (o,t),None) + + setattr(self,o,d) + + def check_values(self, f, func, values = False): + + if f is None: return + axes = f.axes + indicies = itertools.product(*axes) + + for i in indicies: + result = getattr(f,func)[i] + + # check agains values + if values: + expected = f.values[i] + else: + expected = f + for a in reversed(i): + expected = expected.__getitem__(a) + + assert_almost_equal(result, expected) + + + def check_result(self, name, method1, key1, method2, key2, typs = None, objs = None, axes = None, fails = None): + + + def _eq(t, o, a, obj, k1, k2): + """ compare equal for these 2 keys """ + + if a is not None and a > obj.ndim-1: + return + + def _print(result, error = None): + if error is not None: + error = str(error) + v = "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s,key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % (name,result,t,o,method1,method2,a,error or '') + if _verbose: + com.pprint_thing(v) + + try: + + ### good debug location ### + #if name == 'bool' and t == 'empty' and o == 'series' and method1 == 'loc': + # import pdb; pdb.set_trace() + + rs = getattr(obj, method1).__getitem__(_axify(obj,k1,a)) + + try: + xp = _get_result(obj,method2,k2,a) + except: + result = 'no comp' + _print(result) + return + + try: + if np.isscalar(rs) and np.isscalar(xp): + self.assertEqual(rs, xp) + elif xp.ndim == 1: + assert_series_equal(rs,xp) + elif xp.ndim == 2: + assert_frame_equal(rs,xp) + elif xp.ndim == 3: + assert_panel_equal(rs,xp) + result = 'ok' + except (AssertionError): + result = 'fail' + + # reverse the checks + if fails is True: + if result == 'fail': + result = 'ok (fail)' + + if not result.startswith('ok'): + raise AssertionError(_print(result)) + + _print(result) + + except AssertionError: + raise + except TypeError: + raise AssertionError(_print('type error')) + except Exception as detail: + + # if we are in fails, the ok, otherwise raise it + if fails is not None: + if isinstance(detail, fails): + result = 'ok (%s)' % type(detail).__name__ + _print(result) + return + + result = type(detail).__name__ + raise AssertionError(_print(result, error = detail)) + + if typs is None: + typs = self._typs + + if objs is None: + objs = self._objs + + if axes is not None: + if not isinstance(axes,(tuple,list)): + axes = [ axes ] + else: + axes = list(axes) + else: + axes = [ 0, 1, 2] + + # check + for o in objs: + if o not in self._objs: + continue + + d = getattr(self,o) + for a in axes: + for t in typs: + if t not in self._typs: + continue + + obj = d[t] + if obj is not None: + obj = obj.copy() + + k2 = key2 + _eq(t, o, a, obj, key1, k2) + + def test_indexer_caching(self): + # GH5727 + # make sure 
that indexers are in the _internal_names_set + n = 1000001 + arrays = [lrange(n), lrange(n)] + index = MultiIndex.from_tuples(lzip(*arrays)) + s = Series(np.zeros(n), index=index) + str(s) + + # setitem + expected = Series(np.ones(n), index=index) + s = Series(np.zeros(n), index=index) + s[s==0] = 1 + assert_series_equal(s,expected) + + def test_at_and_iat_get(self): + + def _check(f, func, values = False): + + if f is not None: + indicies = _generate_indices(f, values) + for i in indicies: + result = getattr(f,func)[i] + expected = _get_value(f,i,values) + assert_almost_equal(result, expected) + + for o in self._objs: + + d = getattr(self,o) + + # iat + _check(d['ints'],'iat', values=True) + for f in [d['labels'],d['ts'],d['floats']]: + if f is not None: + self.assertRaises(ValueError, self.check_values, f, 'iat') + + # at + _check(d['ints'], 'at') + _check(d['labels'],'at') + _check(d['ts'], 'at') + _check(d['floats'],'at') + + def test_at_and_iat_set(self): + + def _check(f, func, values = False): + + if f is not None: + indicies = _generate_indices(f, values) + for i in indicies: + getattr(f,func)[i] = 1 + expected = _get_value(f,i,values) + assert_almost_equal(expected, 1) + + for t in self._objs: + + d = getattr(self,t) + + _check(d['ints'],'iat',values=True) + for f in [d['labels'],d['ts'],d['floats']]: + if f is not None: + self.assertRaises(ValueError, _check, f, 'iat') + + # at + _check(d['ints'], 'at') + _check(d['labels'],'at') + _check(d['ts'], 'at') + _check(d['floats'],'at') + + def test_at_timestamp(self): + + # as timestamp is not a tuple! + dates = date_range('1/1/2000', periods=8) + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + s = df['A'] + + result = s.at[dates[5]] + xp = s.values[5] + self.assertEqual(result, xp) + + def test_iat_invalid_args(self): + pass + + def test_imethods_with_dups(self): + + # GH6493 + # iat/iloc with dups + + s = Series(range(5), index=[1,1,2,2,3], dtype='int64') + result = s.iloc[2] + self.assertEqual(result,2) + result = s.iat[2] + self.assertEqual(result,2) + + self.assertRaises(IndexError, lambda : s.iat[10]) + self.assertRaises(IndexError, lambda : s.iat[-10]) + + result = s.iloc[[2,3]] + expected = Series([2,3],[2,2],dtype='int64') + assert_series_equal(result,expected) + + df = s.to_frame() + result = df.iloc[2] + expected = Series(2,index=[0]) + assert_series_equal(result,expected) + + result = df.iat[2,0] + expected = 2 + self.assertEqual(result,2) + + def test_repeated_getitem_dups(self): + # GH 5678 + # repeated gettitems on a dup index returing a ndarray + df = DataFrame(np.random.random_sample((20,5)), index=['ABCDE'[x%5] for x in range(20)]) + expected = df.loc['A',0] + result = df.loc[:,0].loc['A'] + assert_series_equal(result,expected) + + def test_iloc_exceeds_bounds(self): + + # GH6296 + # iloc should allow indexers that exceed the bounds + df = DataFrame(np.random.random_sample((20,5)), columns=list('ABCDE')) + expected = df + + # lists of positions should raise IndexErrror! 
+ with tm.assertRaisesRegexp(IndexError, 'positional indexers are out-of-bounds'): + df.iloc[:,[0,1,2,3,4,5]] + self.assertRaises(IndexError, lambda : df.iloc[[1,30]]) + self.assertRaises(IndexError, lambda : df.iloc[[1,-30]]) + self.assertRaises(IndexError, lambda : df.iloc[[100]]) + + s = df['A'] + self.assertRaises(IndexError, lambda : s.iloc[[100]]) + self.assertRaises(IndexError, lambda : s.iloc[[-100]]) + + # still raise on a single indexer + with tm.assertRaisesRegexp(IndexError, 'single positional indexer is out-of-bounds'): + df.iloc[30] + self.assertRaises(IndexError, lambda : df.iloc[-30]) + + # slices are ok + result = df.iloc[:,4:10] # 0 < start < len < stop + expected = df.iloc[:,4:] + assert_frame_equal(result,expected) + + result = df.iloc[:,-4:-10] # stop < 0 < start < len + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + result = df.iloc[:,10:4:-1] # 0 < stop < len < start (down) + expected = df.iloc[:,:4:-1] + assert_frame_equal(result,expected) + + result = df.iloc[:,4:-10:-1] # stop < 0 < start < len (down) + expected = df.iloc[:,4::-1] + assert_frame_equal(result,expected) + + result = df.iloc[:,-10:4] # start < 0 < stop < len + expected = df.iloc[:,:4] + assert_frame_equal(result,expected) + + result = df.iloc[:,10:4] # 0 < stop < len < start + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + result = df.iloc[:,-10:-11:-1] # stop < start < 0 < len (down) + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + result = df.iloc[:,10:11] # 0 < len < start < stop + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + # slice bounds exceeding is ok + result = s.iloc[18:30] + expected = s.iloc[18:] + assert_series_equal(result,expected) + + result = s.iloc[30:] + expected = s.iloc[:0] + assert_series_equal(result,expected) + + result = s.iloc[30::-1] + expected = s.iloc[::-1] + assert_series_equal(result,expected) + + # doc example + def check(result,expected): + str(result) + result.dtypes + assert_frame_equal(result,expected) + + dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + check(dfl.iloc[:,2:3],DataFrame(index=dfl.index)) + check(dfl.iloc[:,1:3],dfl.iloc[:,[1]]) + check(dfl.iloc[4:6],dfl.iloc[[4]]) + + self.assertRaises(IndexError, lambda : dfl.iloc[[4,5,6]]) + self.assertRaises(IndexError, lambda : dfl.iloc[:,4]) + + + def test_iloc_getitem_int(self): + + # integer + self.check_result('integer', 'iloc', 2, 'ix', { 0 : 4, 1: 6, 2: 8 }, typs = ['ints']) + self.check_result('integer', 'iloc', 2, 'indexer', 2, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_neg_int(self): + + # neg integer + self.check_result('neg int', 'iloc', -1, 'ix', { 0 : 6, 1: 9, 2: 12 }, typs = ['ints']) + self.check_result('neg int', 'iloc', -1, 'indexer', -1, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_list_int(self): + + # list of ints + self.check_result('list int', 'iloc', [0,1,2], 'ix', { 0 : [0,2,4], 1 : [0,3,6], 2: [0,4,8] }, typs = ['ints']) + self.check_result('list int', 'iloc', [2], 'ix', { 0 : [4], 1 : [6], 2: [8] }, typs = ['ints']) + self.check_result('list int', 'iloc', [0,1,2], 'indexer', [0,1,2], typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + # array of ints + # (GH5006), make sure that a single indexer is returning the correct type + self.check_result('array int', 'iloc', np.array([0,1,2]), 'ix', { 0 : [0,2,4], 1 : [0,3,6], 2: [0,4,8] }, typs = ['ints']) + self.check_result('array 
int', 'iloc', np.array([2]), 'ix', { 0 : [4], 1 : [6], 2: [8] }, typs = ['ints']) + self.check_result('array int', 'iloc', np.array([0,1,2]), 'indexer', [0,1,2], typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_dups(self): + + # no dups in panel (bug?) + self.check_result('list int (dups)', 'iloc', [0,1,1,3], 'ix', { 0 : [0,2,2,6], 1 : [0,3,3,9] }, objs = ['series','frame'], typs = ['ints']) + + # GH 6766 + df1 = DataFrame([{'A':None, 'B':1},{'A':2, 'B':2}]) + df2 = DataFrame([{'A':3, 'B':3},{'A':4, 'B':4}]) + df = concat([df1, df2], axis=1) + + # cross-sectional indexing + result = df.iloc[0,0] + self.assertTrue(isnull(result)) + + result = df.iloc[0,:] + expected = Series([np.nan,1,3,3],index=['A','B','A','B']) + assert_series_equal(result,expected) + + def test_iloc_getitem_array(self): + + # array like + s = Series(index=lrange(1,4)) + self.check_result('array like', 'iloc', s.index, 'ix', { 0 : [2,4,6], 1 : [3,6,9], 2: [4,8,12] }, typs = ['ints']) + + def test_iloc_getitem_bool(self): + + # boolean indexers + b = [True,False,True,False,] + self.check_result('bool', 'iloc', b, 'ix', b, typs = ['ints']) + self.check_result('bool', 'iloc', b, 'ix', b, typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_slice(self): + + # slices + self.check_result('slice', 'iloc', slice(1,3), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints']) + self.check_result('slice', 'iloc', slice(1,3), 'indexer', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails = IndexError) + + def test_iloc_getitem_slice_dups(self): + + df1 = DataFrame(np.random.randn(10,4),columns=['A','A','B','B']) + df2 = DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C']) + + # axis=1 + df = concat([df1,df2],axis=1) + assert_frame_equal(df.iloc[:,:4],df1) + assert_frame_equal(df.iloc[:,4:],df2) + + df = concat([df2,df1],axis=1) + assert_frame_equal(df.iloc[:,:2],df2) + assert_frame_equal(df.iloc[:,2:],df1) + + assert_frame_equal(df.iloc[:,0:3],concat([df2,df1.iloc[:,[0]]],axis=1)) + + # axis=0 + df = concat([df,df],axis=0) + assert_frame_equal(df.iloc[0:10,:2],df2) + assert_frame_equal(df.iloc[0:10,2:],df1) + assert_frame_equal(df.iloc[10:,:2],df2) + assert_frame_equal(df.iloc[10:,2:],df1) + + def test_iloc_getitem_multiindex(self): + + df = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + rs = df.iloc[2] + xp = df.irow(2) + assert_series_equal(rs, xp) + + rs = df.iloc[:,2] + xp = df.icol(2) + assert_series_equal(rs, xp) + + rs = df.iloc[2,2] + xp = df.values[2,2] + self.assertEqual(rs, xp) + + # for multiple items + # GH 5528 + rs = df.iloc[[0,1]] + xp = df.xs(4,drop_level=False) + assert_frame_equal(rs,xp) + + tup = zip(*[['a','a','b','b'],['x','y','x','y']]) + index = MultiIndex.from_tuples(tup) + df = DataFrame(np.random.randn(4, 4), index=index) + rs = df.iloc[[2, 3]] + xp = df.xs('b',drop_level=False) + assert_frame_equal(rs,xp) + + def test_iloc_setitem(self): + df = self.frame_ints + + df.iloc[1,1] = 1 + result = df.iloc[1,1] + self.assertEqual(result, 1) + + df.iloc[:,2:3] = 0 + expected = df.iloc[:,2:3] + result = df.iloc[:,2:3] + assert_frame_equal(result, expected) + + # GH5771 + s = Series(0,index=[4,5,6]) + s.iloc[1:2] += 1 + expected = Series([0,1,0],index=[4,5,6]) + assert_series_equal(s, expected) + + def test_loc_setitem(self): + # GH 5771 + # loc with slice and series + s = Series(0,index=[4,5,6]) + s.loc[4:5] += 1 + expected = 
Series([1,1,0],index=[4,5,6]) + assert_series_equal(s, expected) + + # GH 5928 + # chained indexing assignment + df = DataFrame({'a' : [0,1,2] }) + expected = df.copy() + expected.ix[[0,1,2],'a'] = -expected.ix[[0,1,2],'a'] + + df['a'].ix[[0,1,2]] = -df['a'].ix[[0,1,2]] + assert_frame_equal(df,expected) + + df = DataFrame({'a' : [0,1,2], 'b' :[0,1,2] }) + df['a'].ix[[0,1,2]] = -df['a'].ix[[0,1,2]].astype('float64') + 0.5 + expected = DataFrame({'a' : [0.5,-0.5,-1.5], 'b' : [0,1,2] }) + assert_frame_equal(df,expected) + + def test_loc_setitem_multiindex(self): + + # GH7190 + index = pd.MultiIndex.from_product([np.arange(0,100), np.arange(0, 80)], names=['time', 'firm']) + t, n = 0, 2 + + df = DataFrame(np.nan,columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], index=index) + df.loc[(t,n),'X'] = 0 + result = df.loc[(t,n),'X'] + self.assertEqual(result, 0) + + df = DataFrame(-999,columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], index=index) + df.loc[(t,n),'X'] = 1 + result = df.loc[(t,n),'X'] + self.assertEqual(result, 1) + + df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], index=index) + df.loc[(t,n),'X'] = 2 + result = df.loc[(t,n),'X'] + self.assertEqual(result, 2) + + # GH 7218, assinging with 0-dim arrays + df = DataFrame(-999,columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], index=index) + df.loc[(t,n), 'X'] = np.array(3) + result = df.loc[(t,n),'X'] + self.assertEqual(result,3) + + def test_loc_setitem_dups(self): + + # GH 6541 + df_orig = DataFrame({'me' : list('rttti'), + 'foo': list('aaade'), + 'bar': np.arange(5,dtype='float64')*1.34+2, + 'bar2': np.arange(5,dtype='float64')*-.34+2}).set_index('me') + + indexer = tuple(['r',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_series_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + + indexer = tuple(['r','bar']) + df = df_orig.copy() + df.loc[indexer]*=2.0 + self.assertEqual(df.loc[indexer],2.0*df_orig.loc[indexer]) + + indexer = tuple(['t',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_frame_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + + def test_iloc_setitem_dups(self): + + # GH 6766 + # iloc with a mask aligning from another iloc + df1 = DataFrame([{'A':None, 'B':1},{'A':2, 'B':2}]) + df2 = DataFrame([{'A':3, 'B':3},{'A':4, 'B':4}]) + df = concat([df1, df2], axis=1) + + expected = df.fillna(3) + expected['A'] = expected['A'].astype('float64') + inds = np.isnan(df.iloc[:, 0]) + mask = inds[inds].index + df.iloc[mask,0] = df.iloc[mask,2] + assert_frame_equal(df, expected) + + # del a dup column across blocks + expected = DataFrame({ 0 : [1,2], 1 : [3,4] }) + expected.columns=['B','B'] + del df['A'] + assert_frame_equal(df, expected) + + # assign back to self + df.iloc[[0,1],[0,1]] = df.iloc[[0,1],[0,1]] + assert_frame_equal(df, expected) + + # reversed x 2 + df.iloc[[1,0],[0,1]] = df.iloc[[1,0],[0,1]].reset_index(drop=True) + df.iloc[[1,0],[0,1]] = df.iloc[[1,0],[0,1]].reset_index(drop=True) + assert_frame_equal(df, expected) + + def test_chained_getitem_with_lists(self): + + # GH6394 + # Regression in chained getitem indexing with embedded list-like from 0.12 + def check(result, expected): + self.assert_numpy_array_equal(result,expected) + tm.assert_isinstance(result, np.ndarray) + + + df = DataFrame({'A': 5*[np.zeros(3)], 'B':5*[np.ones(3)]}) + expected = df['A'].iloc[2] + result = df.loc[2,'A'] + check(result, expected) + result2 = df.iloc[2]['A'] + check(result2, expected) + result3 = df['A'].loc[2] + check(result3, expected) + result4 = 
df['A'].iloc[2] + check(result4, expected) + + def test_loc_getitem_int(self): + + # int label + self.check_result('int label', 'loc', 2, 'ix', 2, typs = ['ints'], axes = 0) + self.check_result('int label', 'loc', 3, 'ix', 3, typs = ['ints'], axes = 1) + self.check_result('int label', 'loc', 4, 'ix', 4, typs = ['ints'], axes = 2) + self.check_result('int label', 'loc', 2, 'ix', 2, typs = ['label'], fails = KeyError) + + def test_loc_getitem_label(self): + + # label + self.check_result('label', 'loc', 'c', 'ix', 'c', typs = ['labels'], axes=0) + self.check_result('label', 'loc', 'null', 'ix', 'null', typs = ['mixed'] , axes=0) + self.check_result('label', 'loc', 8, 'ix', 8, typs = ['mixed'] , axes=0) + self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1, typs = ['ts'], axes=0) + self.check_result('label', 'loc', 'c', 'ix', 'c', typs = ['empty'], fails = KeyError) + + def test_loc_getitem_label_out_of_range(self): + + # out of range label + self.check_result('label range', 'loc', 'f', 'ix', 'f', typs = ['ints','labels','mixed','ts','floats'], fails=KeyError) + + def test_loc_getitem_label_list(self): + + # list of labels + self.check_result('list lbl', 'loc', [0,2,4], 'ix', [0,2,4], typs = ['ints'], axes=0) + self.check_result('list lbl', 'loc', [3,6,9], 'ix', [3,6,9], typs = ['ints'], axes=1) + self.check_result('list lbl', 'loc', [4,8,12], 'ix', [4,8,12], typs = ['ints'], axes=2) + self.check_result('list lbl', 'loc', ['a','b','d'], 'ix', ['a','b','d'], typs = ['labels'], axes=0) + self.check_result('list lbl', 'loc', ['A','B','C'], 'ix', ['A','B','C'], typs = ['labels'], axes=1) + self.check_result('list lbl', 'loc', ['Z','Y','W'], 'ix', ['Z','Y','W'], typs = ['labels'], axes=2) + self.check_result('list lbl', 'loc', [2,8,'null'], 'ix', [2,8,'null'], typs = ['mixed'], axes=0) + self.check_result('list lbl', 'loc', [Timestamp('20130102'),Timestamp('20130103')], 'ix', + [Timestamp('20130102'),Timestamp('20130103')], typs = ['ts'], axes=0) + + # fails + self.check_result('list lbl', 'loc', [0,1,2], 'indexer', [0,1,2], typs = ['empty'], fails = KeyError) + self.check_result('list lbl', 'loc', [0,2,3], 'ix', [0,2,3], typs = ['ints'], axes=0, fails = KeyError) + self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,9], typs = ['ints'], axes=1, fails = KeyError) + self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,12], typs = ['ints'], axes=2, fails = KeyError) + + # array like + self.check_result('array like', 'loc', Series(index=[0,2,4]).index, 'ix', [0,2,4], typs = ['ints'], axes=0) + self.check_result('array like', 'loc', Series(index=[3,6,9]).index, 'ix', [3,6,9], typs = ['ints'], axes=1) + self.check_result('array like', 'loc', Series(index=[4,8,12]).index, 'ix', [4,8,12], typs = ['ints'], axes=2) + + def test_loc_getitem_bool(self): + + # boolean indexers + b = [True,False,True,False] + self.check_result('bool', 'loc', b, 'ix', b, typs = ['ints','labels','mixed','ts','floats']) + self.check_result('bool', 'loc', b, 'ix', b, typs = ['empty'], fails = KeyError) + + def test_loc_getitem_int_slice(self): + + # int slices in int + self.check_result('int slice1', 'loc', slice(2,4), 'ix', { 0 : [2,4], 1: [3,6], 2: [4,8] }, typs = ['ints'], fails=KeyError) + + # ok + self.check_result('int slice2', 'loc', slice(2,4), 'ix', [2,4], typs = ['ints'], axes = 0) + self.check_result('int slice2', 'loc', slice(3,6), 'ix', [3,6], typs = ['ints'], axes = 1) + self.check_result('int slice2', 'loc', slice(4,8), 'ix', [4,8], typs = ['ints'], axes = 2) + + # GH 3053 + # loc should treat 
integer slices like label slices + from itertools import product + + index = MultiIndex.from_tuples([t for t in product([6,7,8], ['a', 'b'])]) + df = DataFrame(np.random.randn(6, 6), index, index) + result = df.loc[6:8,:] + expected = df.ix[6:8,:] + assert_frame_equal(result,expected) + + index = MultiIndex.from_tuples([t for t in product([10, 20, 30], ['a', 'b'])]) + df = DataFrame(np.random.randn(6, 6), index, index) + result = df.loc[20:30,:] + expected = df.ix[20:30,:] + assert_frame_equal(result,expected) + + # doc examples + result = df.loc[10,:] + expected = df.ix[10,:] + assert_frame_equal(result,expected) + + result = df.loc[:,10] + #expected = df.ix[:,10] (this fails) + expected = df[10] + assert_frame_equal(result,expected) + + def test_loc_to_fail(self): + + # GH3449 + df = DataFrame(np.random.random((3, 3)), + index=['a', 'b', 'c'], + columns=['e', 'f', 'g']) + + # raise a KeyError? + self.assertRaises(KeyError, df.loc.__getitem__, tuple([[1, 2], [1, 2]])) + + # GH 7496 + # loc should not fallback + + s = Series() + s.loc[1] = 1 + s.loc['a'] = 2 + + self.assertRaises(KeyError, lambda : s.loc[-1]) + + result = s.loc[[-1, -2]] + expected = Series(np.nan,index=[-1,-2]) + assert_series_equal(result, expected) + + result = s.loc[['4']] + expected = Series(np.nan,index=['4']) + assert_series_equal(result, expected) + + s.loc[-1] = 3 + result = s.loc[[-1,-2]] + expected = Series([3,np.nan],index=[-1,-2]) + assert_series_equal(result, expected) + + s['a'] = 2 + result = s.loc[[-2]] + expected = Series([np.nan],index=[-2]) + assert_series_equal(result, expected) + + del s['a'] + def f(): + s.loc[[-2]] = 0 + self.assertRaises(KeyError, f) + + def test_loc_getitem_label_slice(self): + + # label slices (with ints) + self.check_result('lab slice', 'loc', slice(1,3), 'ix', slice(1,3), typs = ['labels','mixed','ts','floats','empty'], fails=KeyError) + + # real label slices + self.check_result('lab slice', 'loc', slice('a','c'), 'ix', slice('a','c'), typs = ['labels'], axes=0) + self.check_result('lab slice', 'loc', slice('A','C'), 'ix', slice('A','C'), typs = ['labels'], axes=1) + self.check_result('lab slice', 'loc', slice('W','Z'), 'ix', slice('W','Z'), typs = ['labels'], axes=2) + + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=0) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=1, fails=KeyError) + self.check_result('ts slice', 'loc', slice('20130102','20130104'), 'ix', slice('20130102','20130104'), typs = ['ts'], axes=2, fails=KeyError) + + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=0, fails=KeyError) + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=1, fails=KeyError) + self.check_result('mixed slice', 'loc', slice(2,8), 'ix', slice(2,8), typs = ['mixed'], axes=2, fails=KeyError) + + self.check_result('mixed slice', 'loc', slice(2,4,2), 'ix', slice(2,4,2), typs = ['mixed'], axes=0) + + def test_loc_general(self): + + # GH 2922 (these are fails) + df = DataFrame(np.random.rand(4,4),columns=['A','B','C','D']) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,2),slice(0,2)])) + + df = DataFrame(np.random.rand(4,4),columns=['A','B','C','D'], index=['A','B','C','D']) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,2),df.columns[0:2]])) + + # want this to work + result = df.loc[:,"A":"B"].iloc[0:2,:] + 
self.assertTrue((result.columns == ['A','B']).all() == True) + self.assertTrue((result.index == ['A','B']).all() == True) + + # mixed type + result = DataFrame({ 'a' : [Timestamp('20130101')], 'b' : [1] }).iloc[0] + expected = Series([ Timestamp('20130101'), 1],index=['a','b']) + assert_series_equal(result,expected) + self.assertEqual(result.dtype, object) + + def test_loc_setitem_consistency(self): + + # GH 6149 + # coerce similary for setitem and loc when rows have a null-slice + expected = DataFrame({ 'date': Series(0,index=range(5),dtype=np.int64), + 'val' : Series(range(5),dtype=np.int64) }) + + df = DataFrame({ 'date': date_range('2000-01-01','2000-01-5'), + 'val' : Series(range(5),dtype=np.int64) }) + df.loc[:,'date'] = 0 + assert_frame_equal(df,expected) + + df = DataFrame({ 'date': date_range('2000-01-01','2000-01-5'), + 'val' : Series(range(5),dtype=np.int64) }) + df.loc[:,'date'] = np.array(0,dtype=np.int64) + assert_frame_equal(df,expected) + + df = DataFrame({ 'date': date_range('2000-01-01','2000-01-5'), + 'val' : Series(range(5),dtype=np.int64) }) + df.loc[:,'date'] = np.array([0,0,0,0,0],dtype=np.int64) + assert_frame_equal(df,expected) + + expected = DataFrame({ 'date': Series('foo',index=range(5)), + 'val' : Series(range(5),dtype=np.int64) }) + df = DataFrame({ 'date': date_range('2000-01-01','2000-01-5'), + 'val' : Series(range(5),dtype=np.int64) }) + df.loc[:,'date'] = 'foo' + assert_frame_equal(df,expected) + + expected = DataFrame({ 'date': Series(1.0,index=range(5)), + 'val' : Series(range(5),dtype=np.int64) }) + df = DataFrame({ 'date': date_range('2000-01-01','2000-01-5'), + 'val' : Series(range(5),dtype=np.int64) }) + df.loc[:,'date'] = 1.0 + assert_frame_equal(df,expected) + + # empty (essentially noops) + expected = DataFrame(columns=['x', 'y']) + df = DataFrame(columns=['x', 'y']) + df.loc[:, 'x'] = 1 + assert_frame_equal(df,expected) + + df = DataFrame(columns=['x', 'y']) + df['x'] = 1 + assert_frame_equal(df,expected) + + def test_loc_setitem_frame(self): + df = self.frame_labels + + result = df.iloc[0,0] + + df.loc['a','A'] = 1 + result = df.loc['a','A'] + self.assertEqual(result, 1) + + result = df.iloc[0,0] + self.assertEqual(result, 1) + + df.loc[:,'B':'D'] = 0 + expected = df.loc[:,'B':'D'] + result = df.ix[:,1:] + assert_frame_equal(result, expected) + + # GH 6254 + # setting issue + df = DataFrame(index=[3, 5, 4], columns=['A']) + df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3],dtype='int64') + expected = DataFrame(dict(A = Series([1,2,3],index=[4, 3, 5]))).reindex(index=[3,5,4]) + assert_frame_equal(df, expected) + + # GH 6252 + # setting with an empty frame + keys1 = ['@' + str(i) for i in range(5)] + val1 = np.arange(5,dtype='int64') + + keys2 = ['@' + str(i) for i in range(4)] + val2 = np.arange(4,dtype='int64') + + index = list(set(keys1).union(keys2)) + df = DataFrame(index = index) + df['A'] = nan + df.loc[keys1, 'A'] = val1 + + df['B'] = nan + df.loc[keys2, 'B'] = val2 + + expected = DataFrame(dict(A = Series(val1,index=keys1), B = Series(val2,index=keys2))).reindex(index=index) + assert_frame_equal(df, expected) + + # GH 6546 + # setting with mixed labels + df = DataFrame({1:[1,2],2:[3,4],'a':['a','b']}) + + result = df.loc[0,[1,2]] + expected = Series([1,3],index=[1,2],dtype=object) + assert_series_equal(result,expected) + + expected = DataFrame({1:[5,2],2:[6,4],'a':['a','b']}) + df.loc[0,[1,2]] = [5,6] + assert_frame_equal(df, expected) + + + def test_loc_setitem_frame_multiples(self): + + # multiple setting + df = DataFrame({ 'A' : 
['foo','bar','baz'], + 'B' : Series(range(3),dtype=np.int64) }) + df.loc[0:1] = df.loc[1:2] + expected = DataFrame({ 'A' : ['bar','baz','baz'], + 'B' : Series([1,2,2],dtype=np.int64) }) + assert_frame_equal(df, expected) + + + # multiple setting with frame on rhs (with M8) + df = DataFrame({ 'date' : date_range('2000-01-01','2000-01-5'), + 'val' : Series(range(5),dtype=np.int64) }) + expected = DataFrame({ 'date' : [Timestamp('20000101'),Timestamp('20000102'),Timestamp('20000101'), + Timestamp('20000102'),Timestamp('20000103')], + 'val' : Series([0,1,0,1,2],dtype=np.int64) }) + + df.loc[2:4] = df.loc[0:2] + assert_frame_equal(df, expected) + + def test_iloc_getitem_frame(self): + df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2), columns=lrange(0,8,2)) + + result = df.iloc[2] + exp = df.ix[4] + assert_series_equal(result, exp) + + result = df.iloc[2,2] + exp = df.ix[4,4] + self.assertEqual(result, exp) + + # slice + result = df.iloc[4:8] + expected = df.ix[8:14] + assert_frame_equal(result, expected) + + result = df.iloc[:,2:3] + expected = df.ix[:,4:5] + assert_frame_equal(result, expected) + + # list of integers + result = df.iloc[[0,1,3]] + expected = df.ix[[0,2,6]] + assert_frame_equal(result, expected) + + result = df.iloc[[0,1,3],[0,1]] + expected = df.ix[[0,2,6],[0,2]] + assert_frame_equal(result, expected) + + # neg indicies + result = df.iloc[[-1,1,3],[-1,1]] + expected = df.ix[[18,2,6],[6,2]] + assert_frame_equal(result, expected) + + # dups indicies + result = df.iloc[[-1,-1,1,3],[-1,1]] + expected = df.ix[[18,18,2,6],[6,2]] + assert_frame_equal(result, expected) + + # with index-like + s = Series(index=lrange(1,5)) + result = df.iloc[s.index] + expected = df.ix[[2,4,6,8]] + assert_frame_equal(result, expected) + + # try with labelled frame + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + result = df.iloc[1,1] + exp = df.ix['b','B'] + self.assertEqual(result, exp) + + result = df.iloc[:,2:3] + expected = df.ix[:,['C']] + assert_frame_equal(result, expected) + + # negative indexing + result = df.iloc[-1,-1] + exp = df.ix['j','D'] + self.assertEqual(result, exp) + + # out-of-bounds exception + self.assertRaises(IndexError, df.iloc.__getitem__, tuple([10,5])) + + # trying to use a label + self.assertRaises(ValueError, df.iloc.__getitem__, tuple(['j','D'])) + + def test_iloc_getitem_panel(self): + + # GH 7189 + p = Panel(np.arange(4*3*2).reshape(4,3,2), + items=['A','B','C','D'], + major_axis=['a','b','c'], + minor_axis=['one','two']) + + result = p.iloc[1] + expected = p.loc['B'] + assert_frame_equal(result, expected) + + result = p.iloc[1,1] + expected = p.loc['B','b'] + assert_series_equal(result, expected) + + result = p.iloc[1,1,1] + expected = p.loc['B','b','two'] + self.assertEqual(result,expected) + + # slice + result = p.iloc[1:3] + expected = p.loc[['B','C']] + assert_panel_equal(result, expected) + + result = p.iloc[:,0:2] + expected = p.loc[:,['a','b']] + assert_panel_equal(result, expected) + + # list of integers + result = p.iloc[[0,2]] + expected = p.loc[['A','C']] + assert_panel_equal(result, expected) + + # neg indicies + result = p.iloc[[-1,1],[-1,1]] + expected = p.loc[['D','B'],['c','b']] + assert_panel_equal(result, expected) + + # dups indicies + result = p.iloc[[-1,-1,1],[-1,1]] + expected = p.loc[['D','D','B'],['c','b']] + assert_panel_equal(result, expected) + + # combined + result = p.iloc[0,[True,True],[0,1]] + expected = p.loc['A',['a','b'],['one','two']] + assert_frame_equal(result, expected) + + # 
out-of-bounds exception + self.assertRaises(IndexError, p.iloc.__getitem__, tuple([10,5])) + def f(): + p.iloc[0,[True,True],[0,1,2]] + self.assertRaises(IndexError, f) + + # trying to use a label + self.assertRaises(ValueError, p.iloc.__getitem__, tuple(['j','D'])) + + # GH + p = Panel(np.random.rand(4,3,2), items=['A','B','C','D'], major_axis=['U','V','W'], minor_axis=['X','Y']) + expected = p['A'] + + result = p.iloc[0,:,:] + assert_frame_equal(result, expected) + + result = p.iloc[0,[True,True,True],:] + assert_frame_equal(result, expected) + + result = p.iloc[0,[True,True,True],[0,1]] + assert_frame_equal(result, expected) + + def f(): + p.iloc[0,[True,True,True],[0,1,2]] + self.assertRaises(IndexError, f) + + def f(): + p.iloc[0,[True,True,True],[2]] + self.assertRaises(IndexError, f) + + # GH 7199 + # Panel with multi-index + multi_index = pd.MultiIndex.from_tuples([('ONE', 'one'), + ('TWO', 'two'), + ('THREE', 'three')], + names=['UPPER', 'lower']) + + simple_index = [x[0] for x in multi_index] + wd1 = Panel(items=['First', 'Second'], + major_axis=['a', 'b', 'c', 'd'], + minor_axis=multi_index) + + wd2 = Panel(items=['First', 'Second'], + major_axis=['a', 'b', 'c', 'd'], + minor_axis=simple_index) + + expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] + result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG + assert_frame_equal(result1,expected1) + + expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] + result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] + assert_frame_equal(result2,expected2) + + expected1 = DataFrame(index=['a'],columns=multi_index,dtype='float64') + result1 = wd1.iloc[0,[0],[0,1,2]] + assert_frame_equal(result1,expected1) + + expected2 = DataFrame(index=['a'],columns=simple_index,dtype='float64') + result2 = wd2.iloc[0,[0],[0,1,2]] + assert_frame_equal(result2,expected2) + + # GH 7516 + mi = MultiIndex.from_tuples([(0,'x'), (1,'y'), (2,'z')]) + p = Panel(np.arange(3*3*3,dtype='int64').reshape(3,3,3), items=['a','b','c'], major_axis=mi, minor_axis=['u','v','w']) + result = p.iloc[:, 1, 0] + expected = Series([3,12,21],index=['a','b','c'], name='u') + assert_series_equal(result,expected) + + result = p.loc[:, (1,'y'), 'u'] + assert_series_equal(result,expected) + + def test_iloc_getitem_doc_issue(self): + + # multi axis slicing issue with single block + # surfaced in GH 6059 + + arr = np.random.randn(6,4) + index = date_range('20130101',periods=6) + columns = list('ABCD') + df = DataFrame(arr,index=index,columns=columns) + + # defines ref_locs + df.describe() + + result = df.iloc[3:5,0:2] + str(result) + result.dtypes + + expected = DataFrame(arr[3:5,0:2],index=index[3:5],columns=columns[0:2]) + assert_frame_equal(result,expected) + + # for dups + df.columns = list('aaaa') + result = df.iloc[3:5,0:2] + str(result) + result.dtypes + + expected = DataFrame(arr[3:5,0:2],index=index[3:5],columns=list('aa')) + assert_frame_equal(result,expected) + + # related + arr = np.random.randn(6,4) + index = list(range(0,12,2)) + columns = list(range(0,8,2)) + df = DataFrame(arr,index=index,columns=columns) + + df._data.blocks[0].mgr_locs + result = df.iloc[1:5,2:4] + str(result) + result.dtypes + expected = DataFrame(arr[1:5,2:4],index=index[1:5],columns=columns[2:4]) + assert_frame_equal(result,expected) + + def test_setitem_ndarray_1d(self): + # GH5508 + + # len of indexer vs length of the 1d ndarray + df = DataFrame(index=Index(lrange(1,11))) + df['foo'] = np.zeros(10, dtype=np.float64) + df['bar'] = np.zeros(10, dtype=np.complex) + + 
# invalid + def f(): + df.ix[2:5, 'bar'] = np.array([2.33j, 1.23+0.1j, 2.2]) + self.assertRaises(ValueError, f) + + # valid + df.ix[2:5, 'bar'] = np.array([2.33j, 1.23+0.1j, 2.2, 1.0]) + + result = df.ix[2:5, 'bar'] + expected = Series([2.33j, 1.23+0.1j, 2.2, 1.0],index=[2,3,4,5]) + assert_series_equal(result,expected) + + # dtype getting changed? + df = DataFrame(index=Index(lrange(1,11))) + df['foo'] = np.zeros(10, dtype=np.float64) + df['bar'] = np.zeros(10, dtype=np.complex) + + def f(): + df[2:5] = np.arange(1,4)*1j + self.assertRaises(ValueError, f) + + def test_iloc_setitem_series(self): + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD')) + + df.iloc[1,1] = 1 + result = df.iloc[1,1] + self.assertEqual(result, 1) + + df.iloc[:,2:3] = 0 + expected = df.iloc[:,2:3] + result = df.iloc[:,2:3] + assert_frame_equal(result, expected) + + s = Series(np.random.randn(10), index=lrange(0,20,2)) + + s.iloc[1] = 1 + result = s.iloc[1] + self.assertEqual(result, 1) + + s.iloc[:4] = 0 + expected = s.iloc[:4] + result = s.iloc[:4] + assert_series_equal(result, expected) + + def test_iloc_setitem_list_of_lists(self): + + # GH 7551 + # list-of-list is set incorrectly in mixed vs. single dtyped frames + df = DataFrame(dict(A = np.arange(5,dtype='int64'), B = np.arange(5,10,dtype='int64'))) + df.iloc[2:4] = [[10,11],[12,13]] + expected = DataFrame(dict(A = [0,1,10,12,4], B = [5,6,11,13,9])) + assert_frame_equal(df, expected) + + df = DataFrame(dict(A = list('abcde'), B = np.arange(5,10,dtype='int64'))) + df.iloc[2:4] = [['x',11],['y',13]] + expected = DataFrame(dict(A = ['a','b','x','y','e'], B = [5,6,11,13,9])) + assert_frame_equal(df, expected) + + def test_iloc_getitem_multiindex(self): + mi_labels = DataFrame(np.random.randn(4, 3), columns=[['i', 'i', 'j'], + ['A', 'A', 'B']], + index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y','Y']]) + + mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + + # the first row + rs = mi_int.iloc[0] + xp = mi_int.ix[4].ix[8] + assert_series_equal(rs, xp) + + # 2nd (last) columns + rs = mi_int.iloc[:,2] + xp = mi_int.ix[:,2] + assert_series_equal(rs, xp) + + # corner column + rs = mi_int.iloc[2,2] + xp = mi_int.ix[:,2].ix[2] + self.assertEqual(rs, xp) + + # this is basically regular indexing + rs = mi_labels.iloc[2,2] + xp = mi_labels.ix['j'].ix[:,'j'].ix[0,0] + self.assertEqual(rs, xp) + + def test_loc_multiindex(self): + + mi_labels = DataFrame(np.random.randn(3, 3), columns=[['i', 'i', 'j'], + ['A', 'A', 'B']], + index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + + mi_int = DataFrame(np.random.randn(3, 3), + columns=[[2,2,4],[6,8,10]], + index=[[4,4,8],[8,10,12]]) + + # the first row + rs = mi_labels.loc['i'] + xp = mi_labels.ix['i'] + assert_frame_equal(rs, xp) + + # 2nd (last) columns + rs = mi_labels.loc[:,'j'] + xp = mi_labels.ix[:,'j'] + assert_frame_equal(rs, xp) + + # corner column + rs = mi_labels.loc['j'].loc[:,'j'] + xp = mi_labels.ix['j'].ix[:,'j'] + assert_frame_equal(rs,xp) + + # with a tuple + rs = mi_labels.loc[('i','X')] + xp = mi_labels.ix[('i','X')] + assert_frame_equal(rs,xp) + + rs = mi_int.loc[4] + xp = mi_int.ix[4] + assert_frame_equal(rs,xp) + + # GH6788 + # multi-index indexer is None (meaning take all) + attributes = ['Attribute' + str(i) for i in range(1)] + attribute_values = ['Value' + str(i) for i in range(5)] + + index = MultiIndex.from_product([attributes,attribute_values]) + df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 + df = DataFrame(df, columns=index) + 
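+ # selecting with just the top-level labels should hand back the full frame (the missing lower level means 'take all')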
result = df[attributes] + assert_frame_equal(result, df) + + # GH 7349 + # loc with a multi-index seems to be doing fallback + df = DataFrame(np.arange(12).reshape(-1,1),index=pd.MultiIndex.from_product([[1,2,3,4],[1,2,3]])) + + expected = df.loc[([1,2],),:] + result = df.loc[[1,2]] + assert_frame_equal(result, expected) + + # GH 7399 + # incomplete indexers + s = pd.Series(np.arange(15,dtype='int64'),MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + expected = s.loc[:, 'a':'c'] + + result = s.loc[0:4, 'a':'c'] + assert_series_equal(result, expected) + assert_series_equal(result, expected) + + result = s.loc[:4, 'a':'c'] + assert_series_equal(result, expected) + assert_series_equal(result, expected) + + result = s.loc[0:, 'a':'c'] + assert_series_equal(result, expected) + assert_series_equal(result, expected) + + # GH 7400 + # multiindexer gettitem with list of indexers skips wrong element + s = pd.Series(np.arange(15,dtype='int64'),MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + expected = s.iloc[[6,7,8,12,13,14]] + result = s.loc[2:4:2, 'a':'c'] + assert_series_equal(result, expected) + + def test_series_getitem_multiindex(self): + + # GH 6018 + # series regression getitem with a multi-index + + s = Series([1,2,3]) + s.index = MultiIndex.from_tuples([(0,0),(1,1), (2,1)]) + + result = s[:,0] + expected = Series([1],index=[0]) + assert_series_equal(result,expected) + + result = s.ix[:,1] + expected = Series([2,3],index=[1,2]) + assert_series_equal(result,expected) + + # xs + result = s.xs(0,level=0) + expected = Series([1],index=[0]) + assert_series_equal(result,expected) + + result = s.xs(1,level=1) + expected = Series([2,3],index=[1,2]) + assert_series_equal(result,expected) + + # GH6258 + s = Series([1,3,4,1,3,4], + index=MultiIndex.from_product([list('AB'), + list(date_range('20130903',periods=3))])) + result = s.xs('20130903',level=1) + expected = Series([1,1],index=list('AB')) + assert_series_equal(result,expected) + + # GH5684 + idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), + ('b', 'one'), ('b', 'two')]) + s = Series([1, 2, 3, 4], index=idx) + s.index.set_names(['L1', 'L2'], inplace=True) + result = s.xs('one', level='L2') + expected = Series([1, 3], index=['a', 'b']) + expected.index.set_names(['L1'], inplace=True) + assert_series_equal(result, expected) + + def test_ix_general(self): + + # ix general issues + + # GH 2817 + data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} + df = DataFrame(data).set_index(keys=['col', 'year']) + key = 4.0, 2012 + + # this should raise correct error + with tm.assertRaises(KeyError): + df.ix[key] + + # this is ok + df.sortlevel(inplace=True) + res = df.ix[key] + index = MultiIndex.from_arrays([[4] * 3, [2012] * 3], + names=['col', 'year']) + expected = DataFrame({'amount': [222, 333, 444]}, index=index) + tm.assert_frame_equal(res, expected) + + def test_ix_weird_slicing(self): + ## http://stackoverflow.com/q/17056560/1240268 + df = DataFrame({'one' : [1, 2, 3, np.nan, np.nan], 'two' : [1, 2, 3, 4, 5]}) + df.ix[df['one']>1, 'two'] = -df['two'] + + expected = DataFrame({'one': {0: 1.0, 1: 2.0, 2: 3.0, 3: nan, 4: nan}, + 'two': {0: 1, 1: -2, 2: -3, 3: 4, 4: 5}}) + assert_frame_equal(df, expected) + + def test_xs_multiindex(self): + + # GH2903 + columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'hello'), ('b', 'world')], names=['lvl0', 'lvl1']) + df = DataFrame(np.random.randn(4, 4), 
columns=columns) + df.sortlevel(axis=1,inplace=True) + result = df.xs('a', level='lvl0', axis=1) + expected = df.iloc[:,0:2].loc[:,'a'] + assert_frame_equal(result,expected) + + result = df.xs('foo', level='lvl1', axis=1) + expected = df.iloc[:, 1:2].copy() + expected.columns = expected.columns.droplevel('lvl1') + assert_frame_equal(result, expected) + + def test_per_axis_per_level_getitem(self): + + # GH6134 + # example test case + ix = MultiIndex.from_product([_mklbl('A',5),_mklbl('B',7),_mklbl('C',4),_mklbl('D',2)]) + df = DataFrame(np.arange(len(ix.get_values())),index=ix) + + result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C2' or c == 'C3')]] + result = df.loc[(slice('A1','A3'),slice(None), slice('C1','C3')),:] + assert_frame_equal(result, expected) + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], + names=['one','two']) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + + df = DataFrame(np.arange(16,dtype='int64').reshape(4, 4), index=index, columns=columns) + df = df.sortlevel(axis=0).sortlevel(axis=1) + + # identity + result = df.loc[(slice(None),slice(None)),:] + assert_frame_equal(result, df) + result = df.loc[(slice(None),slice(None)),(slice(None),slice(None))] + assert_frame_equal(result, df) + result = df.loc[:,(slice(None),slice(None))] + assert_frame_equal(result, df) + + # index + result = df.loc[(slice(None),[1]),:] + expected = df.iloc[[0,3]] + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),1),:] + expected = df.iloc[[0,3]] + assert_frame_equal(result, expected) + + # columns + result = df.loc[:,(slice(None),['foo'])] + expected = df.iloc[:,[1,3]] + assert_frame_equal(result, expected) + + # both + result = df.loc[(slice(None),1),(slice(None),['foo'])] + expected = df.iloc[[0,3],[1,3]] + assert_frame_equal(result, expected) + + result = df.loc['A','a'] + expected = DataFrame(dict(bar = [1,5,9], foo = [0,4,8]), + index=Index([1,2,3],name='two'), + columns=Index(['bar','foo'],name='lvl1')) + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),[1,2]),:] + expected = df.iloc[[0,1,3]] + assert_frame_equal(result, expected) + + # multi-level series + s = Series(np.arange(len(ix.get_values())),index=ix) + result = s.loc['A1':'A3', :, ['C1','C3']] + expected = s.loc[[ tuple([a,b,c,d]) for a,b,c,d in s.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_series_equal(result, expected) + + # boolean indexers + result = df.loc[(slice(None),df.loc[:,('a','bar')]>5),:] + expected = df.iloc[[2,3]] + assert_frame_equal(result, expected) + + def f(): + df.loc[(slice(None),np.array([True,False])),:] + self.assertRaises(ValueError, f) + + # ambiguous cases + # these can be multiply interpreted + # but we can catch this in some cases + def f(): + df.loc[(slice(None),[1])] + self.assertRaises(KeyError, f) + + # not lexsorted + self.assertEqual(df.index.lexsort_depth,2) + df = df.sortlevel(level=1,axis=0) + self.assertEqual(df.index.lexsort_depth,0) + with tm.assertRaisesRegexp(KeyError, 'MultiIndex Slicing requires the index to 
be fully lexsorted tuple len \(2\), lexsort depth \(0\)'): + df.loc[(slice(None),df.loc[:,('a','bar')]>5),:] + + def test_multiindex_slicers_non_unique(self): + + # GH 7106 + # non-unique mi index support + df = DataFrame(dict(A = ['foo','foo','foo','foo'], + B = ['a','a','a','a'], + C = [1,2,1,3], + D = [1,2,3,4])).set_index(['A','B','C']).sortlevel() + self.assertFalse(df.index.is_unique) + expected = DataFrame(dict(A = ['foo','foo'], + B = ['a','a'], + C = [1,1], + D = [1,3])).set_index(['A','B','C']).sortlevel() + result = df.loc[(slice(None),slice(None),1),:] + assert_frame_equal(result, expected) + + # this is equivalent of an xs expression + result = df.xs(1,level=2,drop_level=False) + assert_frame_equal(result, expected) + + df = DataFrame(dict(A = ['foo','foo','foo','foo'], + B = ['a','a','a','a'], + C = [1,2,1,2], + D = [1,2,3,4])).set_index(['A','B','C']).sortlevel() + self.assertFalse(df.index.is_unique) + expected = DataFrame(dict(A = ['foo','foo'], + B = ['a','a'], + C = [1,1], + D = [1,3])).set_index(['A','B','C']).sortlevel() + result = df.loc[(slice(None),slice(None),1),:] + self.assertFalse(result.index.is_unique) + assert_frame_equal(result, expected) + + def test_multiindex_slicers_datetimelike(self): + + # GH 7429 + # buggy/inconsistent behavior when slicing with datetime-like + import datetime + dates = [datetime.datetime(2012,1,1,12,12,12) + datetime.timedelta(days=i) for i in range(6)] + freq = [1,2] + index = MultiIndex.from_product([dates,freq], names=['date','frequency']) + + df = DataFrame(np.arange(6*2*4,dtype='int64').reshape(-1,4),index=index,columns=list('ABCD')) + + # multi-axis slicing + idx = pd.IndexSlice + expected = df.iloc[[0,2,4],[0,1]] + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'),Timestamp('2012-01-03 12:12:12')),slice(1,1)), slice('A','B')] + assert_frame_equal(result,expected) + + result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp('2012-01-03 12:12:12')],idx[1:1]), slice('A','B')] + assert_frame_equal(result,expected) + + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'),Timestamp('2012-01-03 12:12:12')),1), slice('A','B')] + assert_frame_equal(result,expected) + + # with strings + result = df.loc[(slice('2012-01-01 12:12:12','2012-01-03 12:12:12'),slice(1,1)), slice('A','B')] + assert_frame_equal(result,expected) + + result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'],1), idx['A','B']] + assert_frame_equal(result,expected) + + def test_per_axis_per_level_doc_examples(self): + + # test index maker + idx = pd.IndexSlice + + # from indexing.rst / advanced + index = MultiIndex.from_product([_mklbl('A',4), + _mklbl('B',2), + _mklbl('C',4), + _mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns),dtype='int64').reshape((len(index),len(columns))), + index=index, + columns=columns) + result = df.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + result = df.loc[idx['A1':'A3',:,['C1','C3']],:] + assert_frame_equal(result, expected) + + result = df.loc[(slice(None),slice(None), ['C1','C3']),:] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + result = df.loc[idx[:,:,['C1','C3']],:] + assert_frame_equal(result, 
expected) + + # not sorted + def f(): + df.loc['A1',(slice(None),'foo')] + self.assertRaises(KeyError, f) + df = df.sortlevel(axis=1) + + # slicing + df.loc['A1',(slice(None),'foo')] + df.loc[(slice(None),slice(None), ['C1','C3']),(slice(None),'foo')] + + # setitem + df.loc(axis=0)[:,:,['C1','C3']] = -10 + + def test_loc_arguments(self): + + index = MultiIndex.from_product([_mklbl('A',4), + _mklbl('B',2), + _mklbl('C',4), + _mklbl('D',2)]) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'), + ('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + df = DataFrame(np.arange(len(index)*len(columns),dtype='int64').reshape((len(index),len(columns))), + index=index, + columns=columns).sortlevel().sortlevel(axis=1) + + + # axis 0 + result = df.loc(axis=0)['A1':'A3',:,['C1','C3']] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + a == 'A1' or a == 'A2' or a == 'A3') and (c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + result = df.loc(axis='index')[:,:,['C1','C3']] + expected = df.loc[[ tuple([a,b,c,d]) for a,b,c,d in df.index.values if ( + c == 'C1' or c == 'C3')]] + assert_frame_equal(result, expected) + + # axis 1 + result = df.loc(axis=1)[:,'foo'] + expected = df.loc[:,(slice(None),'foo')] + assert_frame_equal(result, expected) + + result = df.loc(axis='columns')[:,'foo'] + expected = df.loc[:,(slice(None),'foo')] + assert_frame_equal(result, expected) + + # invalid axis + def f(): + df.loc(axis=-1)[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis=2)[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def f(): + df.loc(axis='foo')[:,:,['C1','C3']] + self.assertRaises(ValueError, f) + + def test_per_axis_per_level_setitem(self): + + # test index maker + idx = pd.IndexSlice + + # test multi-index slicing with per axis and per index controls + index = MultiIndex.from_tuples([('A',1),('A',2),('A',3),('B',1)], + names=['one','two']) + columns = MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')], + names=['lvl0', 'lvl1']) + + df_orig = DataFrame(np.arange(16,dtype='int64').reshape(4, 4), index=index, columns=columns) + df_orig = df_orig.sortlevel(axis=0).sortlevel(axis=1) + + # identity + df = df_orig.copy() + df.loc[(slice(None),slice(None)),:] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:,:] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),slice(None)),(slice(None),slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[:,(slice(None),slice(None))] = 100 + expected = df_orig.copy() + expected.iloc[:,:] = 100 + assert_frame_equal(df, expected) + + # index + df = df_orig.copy() + df.loc[(slice(None),[1]),:] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),1),:] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc(axis=0)[:,1] = 100 + expected = df_orig.copy() + expected.iloc[[0,3]] = 100 + assert_frame_equal(df, expected) + + # columns + df = df_orig.copy() + df.loc[:,(slice(None),['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[:,[1,3]] = 100 + assert_frame_equal(df, expected) + + # both + df = df_orig.copy() + 
df.loc[(slice(None),1),(slice(None),['foo'])] = 100 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:,1],idx[:,['foo']]] = 100 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc['A','a'] = 100 + expected = df_orig.copy() + expected.iloc[0:3,0:2] = 100 + assert_frame_equal(df, expected) + + # setting with a list-like + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([[100, 100], [100, 100]],dtype='int64') + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = 100 + assert_frame_equal(df, expected) + + # not enough values + df = df_orig.copy() + def f(): + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([[100], [100, 100]],dtype='int64') + self.assertRaises(ValueError, f) + def f(): + df.loc[(slice(None),1),(slice(None),['foo'])] = np.array([100, 100, 100, 100],dtype='int64') + self.assertRaises(ValueError, f) + + # with an alignable rhs + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] = df.loc[(slice(None),1),(slice(None),['foo'])] * 5 + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] = expected.iloc[[0,3],[1,3]] * 5 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] *= df.loc[(slice(None),1),(slice(None),['foo'])] + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] *= expected.iloc[[0,3],[1,3]] + assert_frame_equal(df, expected) + + rhs = df_orig.loc[(slice(None),1),(slice(None),['foo'])].copy() + rhs.loc[:,('c','bah')] = 10 + df = df_orig.copy() + df.loc[(slice(None),1),(slice(None),['foo'])] *= rhs + expected = df_orig.copy() + expected.iloc[[0,3],[1,3]] *= expected.iloc[[0,3],[1,3]] + assert_frame_equal(df, expected) + + def test_multiindex_setitem(self): + + # GH 3738 + # setting with a multi-index right hand side + arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), + np.array(['one', 'two', 'one', 'one', 'two', 'one']), + np.arange(0, 6, 1)] + + df_orig = pd.DataFrame(np.random.randn(6, 3), + index=arrays, + columns=['A', 'B', 'C']).sort_index() + + expected = df_orig.loc[['bar']]*2 + df = df_orig.copy() + df.loc[['bar']] *= 2 + assert_frame_equal(df.loc[['bar']],expected) + + # raise because these have differing levels + def f(): + df.loc['bar'] *= 2 + self.assertRaises(TypeError, f) + + # from SO + #http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation + df_orig = DataFrame.from_dict({'price': { + ('DE', 'Coal', 'Stock'): 2, + ('DE', 'Gas', 'Stock'): 4, + ('DE', 'Elec', 'Demand'): 1, + ('FR', 'Gas', 'Stock'): 5, + ('FR', 'Solar', 'SupIm'): 0, + ('FR', 'Wind', 'SupIm'): 0}}) + df_orig.index = MultiIndex.from_tuples(df_orig.index, names=['Sit', 'Com', 'Type']) + + expected = df_orig.copy() + expected.iloc[[0,2,3]] *= 2 + + idx = pd.IndexSlice + df = df_orig.copy() + df.loc[idx[:,:,'Stock'],:] *= 2 + assert_frame_equal(df, expected) + + df = df_orig.copy() + df.loc[idx[:,:,'Stock'],'price'] *= 2 + assert_frame_equal(df, expected) + + def test_getitem_multiindex(self): + + # GH 5725 + # the 'A' happens to be a valid Timestamp so the doesn't raise the appropriate + # error, only in PY3 of course! 
+ index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], + labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + arr = np.random.randn(len(index),1) + df = DataFrame(arr,index=index,columns=['val']) + result = df.val['D'] + expected = Series(arr.ravel()[0:3],name='val',index=Index([26,37,57],name='day')) + assert_series_equal(result,expected) + + def f(): + df.val['A'] + self.assertRaises(KeyError, f) + + def f(): + df.val['X'] + self.assertRaises(KeyError, f) + + # A is treated as a special Timestamp + index = MultiIndex(levels=[['A', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], + labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=['tag', 'day']) + df = DataFrame(arr,index=index,columns=['val']) + result = df.val['A'] + expected = Series(arr.ravel()[0:3],name='val',index=Index([26,37,57],name='day')) + assert_series_equal(result,expected) + + def f(): + df.val['X'] + self.assertRaises(KeyError, f) + + def test_setitem_dtype_upcast(self): + + # GH3216 + df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) + df['c'] = np.nan + self.assertEqual(df['c'].dtype, np.float64) + + df.ix[0,'c'] = 'foo' + expected = DataFrame([{"a": 1, "c" : 'foo'}, {"a": 3, "b": 2, "c" : np.nan}]) + assert_frame_equal(df,expected) + + def test_setitem_iloc(self): + + + # setitem with an iloc list + df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"]) + df.iloc[[0,1],[1,2]] + df.iloc[[0,1],[1,2]] += 100 + + expected = DataFrame(np.array([0,101,102,3,104,105,6,7,8]).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"]) + assert_frame_equal(df,expected) + + def test_dups_fancy_indexing(self): + + # GH 3455 + from pandas.util.testing import makeCustomDataframe as mkdf + df= mkdf(10, 3) + df.columns = ['a','a','b'] + cols = ['b','a'] + result = df[['b','a']].columns + expected = Index(['b','a','a']) + self.assertTrue(result.equals(expected)) + + # across dtypes + df = DataFrame([[1,2,1.,2.,3.,'foo','bar']], columns=list('aaaaaaa')) + df.head() + str(df) + result = DataFrame([[1,2,1.,2.,3.,'foo','bar']]) + result.columns = list('aaaaaaa') + + df_v = df.iloc[:,4] + res_v = result.iloc[:,4] + + assert_frame_equal(df,result) + + # GH 3561, dups not in selected order + df = DataFrame({'test': [5,7,9,11], 'test1': [4.,5,6,7], 'other': list('abcd') }, index=['A', 'A', 'B', 'C']) + rows = ['C', 'B'] + expected = DataFrame({'test' : [11,9], 'test1': [ 7., 6], 'other': ['d','c']},index=rows) + result = df.ix[rows] + assert_frame_equal(result, expected) + + result = df.ix[Index(rows)] + assert_frame_equal(result, expected) + + rows = ['C','B','E'] + expected = DataFrame({'test' : [11,9,np.nan], 'test1': [7.,6,np.nan], 'other': ['d','c',np.nan]},index=rows) + result = df.ix[rows] + assert_frame_equal(result, expected) + + # see GH5553, make sure we use the right indexer + rows = ['F','G','H','C','B','E'] + expected = DataFrame({'test' : [np.nan,np.nan,np.nan,11,9,np.nan], + 'test1': [np.nan,np.nan,np.nan,7.,6,np.nan], + 'other': [np.nan,np.nan,np.nan,'d','c',np.nan]},index=rows) + result = df.ix[rows] + assert_frame_equal(result, expected) + + # inconsistent returns for unique/duplicate indices when values are missing + df = DataFrame(randn(4,3),index=list('ABCD')) + expected = df.ix[['E']] + + dfnu = DataFrame(randn(5,3),index=list('AABCD')) + result = dfnu.ix[['E']] + assert_frame_equal(result, expected) + + # GH 4619; duplicate indexer with missing label + df = DataFrame({"A": [0, 1, 2]}) + 
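+ # a duplicated indexer that includes a missing label should repeat the found row and return NaN for the missing one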
result = df.ix[[0,8,0]] + expected = DataFrame({"A": [0, np.nan, 0]},index=[0,8,0]) + assert_frame_equal(result,expected) + + df = DataFrame({"A": list('abc')}) + result = df.ix[[0,8,0]] + expected = DataFrame({"A": ['a', np.nan, 'a']},index=[0,8,0]) + assert_frame_equal(result,expected) + + # non unique with non unique selector + df = DataFrame({'test': [5,7,9,11]}, index=['A','A','B','C']) + expected = DataFrame({'test' : [5,7,5,7,np.nan]},index=['A','A','A','A','E']) + result = df.ix[['A','A','E']] + assert_frame_equal(result, expected) + + # GH 5835 + # dups on index and missing values + df = DataFrame(np.random.randn(5,5),columns=['A','B','B','B','A']) + + expected = pd.concat([df.ix[:,['A','B']],DataFrame(np.nan,columns=['C'],index=df.index)],axis=1) + result = df.ix[:,['A','B','C']] + assert_frame_equal(result, expected) + + # GH 6504, multi-axis indexing + df = DataFrame(np.random.randn(9,2), index=[1,1,1,2,2,2,3,3,3], columns=['a', 'b']) + + expected = df.iloc[0:6] + result = df.loc[[1, 2]] + assert_frame_equal(result, expected) + + expected = df + result = df.loc[:,['a', 'b']] + assert_frame_equal(result, expected) + + expected = df.iloc[0:6,:] + result = df.loc[[1, 2], ['a', 'b']] + assert_frame_equal(result, expected) + + def test_indexing_mixed_frame_bug(self): + + # GH3492 + df=DataFrame({'a':{1:'aaa',2:'bbb',3:'ccc'},'b':{1:111,2:222,3:333}}) + + # this works, new column is created correctly + df['test']=df['a'].apply(lambda x: '_' if x=='aaa' else x) + + # this does not work, ie column test is not changed + idx=df['test']=='_' + temp=df.ix[idx,'a'].apply(lambda x: '-----' if x=='aaa' else x) + df.ix[idx,'test']=temp + self.assertEqual(df.iloc[0,2], '-----') + + #if I look at df, then element [0,2] equals '_'. If instead I type df.ix[idx,'test'], I get '-----', finally by typing df.iloc[0,2] I get '_'. 
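The GH3492 test above depends on writing through a single indexer call (df.ix[idx, 'test'] = temp) so the assignment reaches the underlying frame rather than a temporary copy. As a minimal standalone sketch of the same pattern using .loc, with an illustrative frame and column names that are not taken from the imported suite:

import pandas as pd

# small mixed frame in the spirit of the test above
df = pd.DataFrame({'a': ['aaa', 'bbb', 'ccc'], 'b': [111, 222, 333]})

# derive a helper column, then overwrite the masked subset in one .loc call;
# chained assignment (df['test'][mask] = ...) may only touch a temporary copy
df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x)
mask = df['test'] == '_'
df.loc[mask, 'test'] = df.loc[mask, 'a'].apply(lambda x: '-----' if x == 'aaa' else x)

assert df.loc[0, 'test'] == '-----'

The final assert plays the same role as the test's self.assertEqual(df.iloc[0, 2], '-----') check: it confirms the write landed in the frame itself.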
+ + + def test_set_index_nan(self): + + # GH 3586 + df = DataFrame({'PRuid': {17: 'nonQC', 18: 'nonQC', 19: 'nonQC', 20: '10', 21: '11', 22: '12', 23: '13', + 24: '24', 25: '35', 26: '46', 27: '47', 28: '48', 29: '59', 30: '10'}, + 'QC': {17: 0.0, 18: 0.0, 19: 0.0, 20: nan, 21: nan, 22: nan, 23: nan, 24: 1.0, 25: nan, + 26: nan, 27: nan, 28: nan, 29: nan, 30: nan}, + 'data': {17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, 20: 0.86140349999999999, + 21: 0.87853110000000001, 22: 0.8427041999999999, 23: 0.78587700000000005, 24: 0.73062459999999996, + 25: 0.81668560000000001, 26: 0.81927080000000008, 27: 0.80705009999999999, 28: 0.81440240000000008, + 29: 0.80140849999999997, 30: 0.81307740000000006}, + 'year': {17: 2006, 18: 2007, 19: 2008, 20: 1985, 21: 1985, 22: 1985, 23: 1985, + 24: 1985, 25: 1985, 26: 1985, 27: 1985, 28: 1985, 29: 1985, 30: 1986}}).reset_index() + + result = df.set_index(['year','PRuid','QC']).reset_index().reindex(columns=df.columns) + assert_frame_equal(result,df) + + def test_multi_nan_indexing(self): + + # GH 3588 + df = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]}) + result = df.set_index(['a','b'], drop=False) + expected = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]}, + index = [Index(['R1','R2',np.nan,'R4'],name='a'),Index(['C1','C2','C3','C4'],name='b')]) + assert_frame_equal(result,expected) + + + def test_iloc_panel_issue(self): + + # GH 3617 + p = Panel(randn(4, 4, 4)) + + self.assertEqual(p.iloc[:3, :3, :3].shape, (3,3,3)) + self.assertEqual(p.iloc[1, :3, :3].shape, (3,3)) + self.assertEqual(p.iloc[:3, 1, :3].shape, (3,3)) + self.assertEqual(p.iloc[:3, :3, 1].shape, (3,3)) + self.assertEqual(p.iloc[1, 1, :3].shape, (3,)) + self.assertEqual(p.iloc[1, :3, 1].shape, (3,)) + self.assertEqual(p.iloc[:3, 1, 1].shape, (3,)) + + def test_panel_getitem(self): + # GH4016, date selection returns a frame when a partial string selection + ind = date_range(start="2000", freq="D", periods=1000) + df = DataFrame(np.random.randn(len(ind), 5), index=ind, columns=list('ABCDE')) + panel = Panel(dict([ ('frame_'+c,df) for c in list('ABC') ])) + + test2 = panel.ix[:, "2002":"2002-12-31"] + test1 = panel.ix[:, "2002"] + tm.assert_panel_equal(test1,test2) + + def test_panel_assignment(self): + + # GH3777 + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], major_axis=date_range('1/1/2000', periods=5), minor_axis=['A', 'B', 'C', 'D']) + wp2 = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], major_axis=date_range('1/1/2000', periods=5), minor_axis=['A', 'B', 'C', 'D']) + expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] + + def f(): + wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = wp2.loc[['Item1', 'Item2'], :, ['A', 'B']] + self.assertRaises(NotImplementedError, f) + + #wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = wp2.loc[['Item1', 'Item2'], :, ['A', 'B']] + #result = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] + #tm.assert_panel_equal(result,expected) + + def test_multiindex_assignment(self): + + # GH3777 part 2 + + # mixed dtype + df = DataFrame(np.random.randint(5,10,size=9).reshape(3, 3), + columns=list('abc'), + index=[[4,4,8],[8,10,12]]) + df['d'] = np.nan + arr = np.array([0.,1.]) + + df.ix[4,'d'] = arr + assert_series_equal(df.ix[4,'d'],Series(arr,index=[8,10],name='d')) + + # single dtype + df = DataFrame(np.random.randint(5,10,size=9).reshape(3, 3), + columns=list('abc'), + index=[[4,4,8],[8,10,12]]) + + df.ix[4,'c'] = arr + 
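+ # the float array is written into the existing int64 block, so the values read back with the block's int64 dtype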
assert_series_equal(df.ix[4,'c'],Series(arr,index=[8,10],name='c',dtype='int64')) + + # scalar ok + df.ix[4,'c'] = 10 + assert_series_equal(df.ix[4,'c'],Series(10,index=[8,10],name='c',dtype='int64')) + + # invalid assignments + def f(): + df.ix[4,'c'] = [0,1,2,3] + self.assertRaises(ValueError, f) + + def f(): + df.ix[4,'c'] = [0] + self.assertRaises(ValueError, f) + + # groupby example + NUM_ROWS = 100 + NUM_COLS = 10 + col_names = ['A'+num for num in map(str,np.arange(NUM_COLS).tolist())] + index_cols = col_names[:5] + + df = DataFrame(np.random.randint(5, size=(NUM_ROWS,NUM_COLS)), dtype=np.int64, columns=col_names) + df = df.set_index(index_cols).sort_index() + grp = df.groupby(level=index_cols[:4]) + df['new_col'] = np.nan + + f_index = np.arange(5) + def f(name,df2): + return Series(np.arange(df2.shape[0]),name=df2.index.values[0]).reindex(f_index) + new_df = pd.concat([ f(name,df2) for name, df2 in grp ],axis=1).T + + # we are actually operating on a copy here + # but in this case, that's ok + for name, df2 in grp: + new_vals = np.arange(df2.shape[0]) + df.ix[name, 'new_col'] = new_vals + + def test_multi_assign(self): + + # GH 3626, an assignement of a sub-df to a df + df = DataFrame({'FC':['a','b','a','b','a','b'], + 'PF':[0,0,0,0,1,1], + 'col1':lrange(6), + 'col2':lrange(6,12)}) + df.ix[1,0]=np.nan + df2 = df.copy() + + mask=~df2.FC.isnull() + cols=['col1', 'col2'] + + dft = df2 * 2 + dft.ix[3,3] = np.nan + + expected = DataFrame({'FC':['a',np.nan,'a','b','a','b'], + 'PF':[0,0,0,0,1,1], + 'col1':Series([0,1,4,6,8,10]), + 'col2':[12,7,16,np.nan,20,22]}) + + + # frame on rhs + df2.ix[mask, cols]= dft.ix[mask, cols] + assert_frame_equal(df2,expected) + + df2.ix[mask, cols]= dft.ix[mask, cols] + assert_frame_equal(df2,expected) + + # with an ndarray on rhs + df2 = df.copy() + df2.ix[mask, cols]= dft.ix[mask, cols].values + assert_frame_equal(df2,expected) + df2.ix[mask, cols]= dft.ix[mask, cols].values + assert_frame_equal(df2,expected) + + # broadcasting on the rhs is required + df = DataFrame(dict(A = [1,2,0,0,0],B=[0,0,0,10,11],C=[0,0,0,10,11],D=[3,4,5,6,7])) + + expected = df.copy() + mask = expected['A'] == 0 + for col in ['A','B']: + expected.loc[mask,col] = df['D'] + + df.loc[df['A']==0,['A','B']] = df['D'] + assert_frame_equal(df,expected) + + def test_ix_assign_column_mixed(self): + # GH #1142 + df = DataFrame(tm.getSeriesData()) + df['foo'] = 'bar' + + orig = df.ix[:, 'B'].copy() + df.ix[:, 'B'] = df.ix[:, 'B'] + 1 + assert_series_equal(df.B, orig + 1) + + # GH 3668, mixed frame with series value + df = DataFrame({'x':lrange(10), 'y':lrange(10,20),'z' : 'bar'}) + expected = df.copy() + + for i in range(5): + indexer = i*2 + v = 1000 + i*200 + expected.ix[indexer, 'y'] = v + self.assertEqual(expected.ix[indexer, 'y'], v) + + df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100 + assert_frame_equal(df,expected) + + # GH 4508, making sure consistency of assignments + df = DataFrame({'a':[1,2,3],'b':[0,1,2]}) + df.ix[[0,2,],'b'] = [100,-100] + expected = DataFrame({'a' : [1,2,3], 'b' : [100,1,-100] }) + assert_frame_equal(df,expected) + + df = pd.DataFrame({'a': lrange(4) }) + df['b'] = np.nan + df.ix[[1,3],'b'] = [100,-100] + expected = DataFrame({'a' : [0,1,2,3], 'b' : [np.nan,100,np.nan,-100] }) + assert_frame_equal(df,expected) + + # ok, but chained assignments are dangerous + df = pd.DataFrame({'a': lrange(4) }) + df['b'] = np.nan + df['b'].ix[[1,3]] = [100,-100] + assert_frame_equal(df,expected) + + def test_ix_get_set_consistency(self): + + # GH 4544 + # ix/loc 
get/set not consistent when + # a mixed int/string index + df = DataFrame(np.arange(16).reshape((4, 4)), + columns=['a', 'b', 8, 'c'], + index=['e', 7, 'f', 'g']) + + self.assertEqual(df.ix['e', 8], 2) + self.assertEqual(df.loc['e', 8], 2) + + df.ix['e', 8] = 42 + self.assertEqual(df.ix['e', 8], 42) + self.assertEqual(df.loc['e', 8], 42) + + df.loc['e', 8] = 45 + self.assertEqual(df.ix['e', 8], 45) + self.assertEqual(df.loc['e', 8], 45) + + def test_setitem_list(self): + + # GH 6043 + # ix with a list + df = DataFrame(index=[0,1], columns=[0]) + df.ix[1,0] = [1,2,3] + df.ix[1,0] = [1,2] + + result = DataFrame(index=[0,1], columns=[0]) + result.ix[1,0] = [1,2] + + assert_frame_equal(result,df) + + # ix with an object + class TO(object): + def __init__(self, value): + self.value = value + def __str__(self): + return "[{0}]".format(self.value) + __repr__ = __str__ + def __eq__(self, other): + return self.value == other.value + def view(self): + return self + + df = DataFrame(index=[0,1], columns=[0]) + df.ix[1,0] = TO(1) + df.ix[1,0] = TO(2) + + result = DataFrame(index=[0,1], columns=[0]) + result.ix[1,0] = TO(2) + + assert_frame_equal(result,df) + + # remains object dtype even after setting it back + df = DataFrame(index=[0,1], columns=[0]) + df.ix[1,0] = TO(1) + df.ix[1,0] = np.nan + result = DataFrame(index=[0,1], columns=[0]) + + assert_frame_equal(result, df) + + def test_iloc_mask(self): + + # GH 3631, iloc with a mask (of a series) should raise + df = DataFrame(lrange(5), list('ABCDE'), columns=['a']) + mask = (df.a%2 == 0) + self.assertRaises(ValueError, df.iloc.__getitem__, tuple([mask])) + mask.index = lrange(len(mask)) + self.assertRaises(NotImplementedError, df.iloc.__getitem__, tuple([mask])) + + # ndarray ok + result = df.iloc[np.array([True] * len(mask),dtype=bool)] + assert_frame_equal(result,df) + + # the possibilities + locs = np.arange(4) + nums = 2**locs + reps = lmap(bin, nums) + df = DataFrame({'locs':locs, 'nums':nums}, reps) + + expected = { + (None,'') : '0b1100', + (None,'.loc') : '0b1100', + (None,'.iloc') : '0b1100', + ('index','') : '0b11', + ('index','.loc') : '0b11', + ('index','.iloc') : 'iLocation based boolean indexing cannot use an indexable as a mask', + ('locs','') : 'Unalignable boolean Series key provided', + ('locs','.loc') : 'Unalignable boolean Series key provided', + ('locs','.iloc') : 'iLocation based boolean indexing on an integer type is not available', + } + + warnings.filterwarnings(action='ignore', category=UserWarning) + result = dict() + for idx in [None, 'index', 'locs']: + mask = (df.nums>2).values + if idx: + mask = Series(mask, list(reversed(getattr(df, idx)))) + for method in ['', '.loc', '.iloc']: + try: + if method: + accessor = getattr(df, method[1:]) + else: + accessor = df + ans = str(bin(accessor[mask]['nums'].sum())) + except Exception as e: + ans = str(e) + + key = tuple([idx,method]) + r = expected.get(key) + if r != ans: + raise AssertionError("[%s] does not match [%s], received [%s]" % + (key,ans,r)) + warnings.filterwarnings(action='always', category=UserWarning) + + def test_ix_slicing_strings(self): + ##GH3836 + data = {'Classification': ['SA EQUITY CFD', 'bbb', 'SA EQUITY', 'SA SSF', 'aaa'], + 'Random': [1,2,3,4,5], + 'X': ['correct', 'wrong','correct', 'correct','wrong']} + df = DataFrame(data) + x = df[~df.Classification.isin(['SA EQUITY CFD', 'SA EQUITY', 'SA SSF'])] + df.ix[x.index,'X'] = df['Classification'] + + expected = DataFrame({'Classification': {0: 'SA EQUITY CFD', 1: 'bbb', + 2: 'SA EQUITY', 3: 'SA SSF', 4: 
'aaa'}, + 'Random': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + 'X': {0: 'correct', 1: 'bbb', 2: 'correct', + 3: 'correct', 4: 'aaa'}}) # bug was 4: 'bbb' + + assert_frame_equal(df, expected) + + def test_non_unique_loc(self): + ## GH3659 + ## non-unique indexer with loc slice + ## https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs + + # these are going to raise becuase the we are non monotonic + df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3]) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,None)])) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,None)])) + self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,2)])) + + # monotonic are ok + df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3]).sort(axis=0) + result = df.loc[1:] + expected = DataFrame({'A' : [2,4,5,6], 'B' : [4, 6,7,8]}, index = [1,1,2,3]) + assert_frame_equal(result,expected) + + result = df.loc[0:] + assert_frame_equal(result,df) + + result = df.loc[1:2] + expected = DataFrame({'A' : [2,4,5], 'B' : [4,6,7]}, index = [1,1,2]) + assert_frame_equal(result,expected) + + def test_loc_name(self): + # GH 3880 + df = DataFrame([[1, 1], [1, 1]]) + df.index.name = 'index_name' + result = df.iloc[[0, 1]].index.name + self.assertEqual(result, 'index_name') + + result = df.ix[[0, 1]].index.name + self.assertEqual(result, 'index_name') + + result = df.loc[[0, 1]].index.name + self.assertEqual(result, 'index_name') + + def test_iloc_non_unique_indexing(self): + + #GH 4017, non-unique indexing (on the axis) + df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000}) + idx = np.array(lrange(30)) * 99 + expected = df.iloc[idx] + + df3 = pd.concat([df, 2*df, 3*df]) + result = df3.iloc[idx] + + assert_frame_equal(result, expected) + + df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000}) + df2 = pd.concat([df2, 2*df2, 3*df2]) + + sidx = df2.index.to_series() + expected = df2.iloc[idx[idx<=sidx.max()]] + + new_list = [] + for r, s in expected.iterrows(): + new_list.append(s) + new_list.append(s*2) + new_list.append(s*3) + + expected = DataFrame(new_list) + expected = pd.concat([ expected, DataFrame(index=idx[idx>sidx.max()]) ]) + result = df2.loc[idx] + assert_frame_equal(result, expected) + + def test_mi_access(self): + + # GH 4145 + data = """h1 main h3 sub h5 +0 a A 1 A1 1 +1 b B 2 B1 2 +2 c B 3 A1 3 +3 d A 4 B2 4 +4 e A 5 B2 5 +5 f B 6 A2 6 +""" + + df = pd.read_csv(StringIO(data),sep='\s+',index_col=0) + df2 = df.set_index(['main', 'sub']).T.sort_index(1) + index = Index(['h1','h3','h5']) + columns = MultiIndex.from_tuples([('A','A1')],names=['main','sub']) + expected = DataFrame([['a',1,1]],index=columns,columns=index).T + + result = df2.loc[:,('A','A1')] + assert_frame_equal(result,expected) + + result = df2[('A','A1')] + assert_frame_equal(result,expected) + + # GH 4146, not returning a block manager when selecting a unique index + # from a duplicate index + # as of 4879, this returns a Series (which is similar to what happens with a non-unique) + expected = Series(['a',1,1],index=['h1','h3','h5']) + result = df2['A']['A1'] + assert_series_equal(result,expected) + + # selecting a non_unique from the 2nd level + expected = DataFrame([['d',4,4],['e',5,5]],index=Index(['B2','B2'],name='sub'),columns=['h1','h3','h5'],).T + result = df2['A']['B2'] + assert_frame_equal(result,expected) + + def test_non_unique_loc_memory_error(self): + + # GH 4280 + # non_unique index with a large selection triggers a memory error + + columns = 
list('ABCDEFG') + def gen_test(l,l2): + return pd.concat([ DataFrame(randn(l,len(columns)),index=lrange(l),columns=columns), + DataFrame(np.ones((l2,len(columns))),index=[0]*l2,columns=columns) ]) + + + def gen_expected(df,mask): + l = len(mask) + return pd.concat([ + df.take([0],convert=False), + DataFrame(np.ones((l,len(columns))),index=[0]*l,columns=columns), + df.take(mask[1:],convert=False) ]) + + df = gen_test(900,100) + self.assertFalse(df.index.is_unique) + + mask = np.arange(100) + result = df.loc[mask] + expected = gen_expected(df,mask) + assert_frame_equal(result,expected) + + df = gen_test(900000,100000) + self.assertFalse(df.index.is_unique) + + mask = np.arange(100000) + result = df.loc[mask] + expected = gen_expected(df,mask) + assert_frame_equal(result,expected) + + def test_astype_assignment(self): + + # GH4312 (iloc) + df_orig = DataFrame([['1','2','3','.4',5,6.,'foo']],columns=list('ABCDEFG')) + + df = df_orig.copy() + df.iloc[:,0:2] = df.iloc[:,0:2].astype(np.int64) + expected = DataFrame([[1,2,'3','.4',5,6.,'foo']],columns=list('ABCDEFG')) + assert_frame_equal(df,expected) + + df = df_orig.copy() + df.iloc[:,0:2] = df.iloc[:,0:2].convert_objects(convert_numeric=True) + expected = DataFrame([[1,2,'3','.4',5,6.,'foo']],columns=list('ABCDEFG')) + assert_frame_equal(df,expected) + + # GH5702 (loc) + df = df_orig.copy() + df.loc[:,'A'] = df.loc[:,'A'].astype(np.int64) + expected = DataFrame([[1,'2','3','.4',5,6.,'foo']],columns=list('ABCDEFG')) + assert_frame_equal(df,expected) + + df = df_orig.copy() + df.loc[:,['B','C']] = df.loc[:,['B','C']].astype(np.int64) + expected = DataFrame([['1',2,3,'.4',5,6.,'foo']],columns=list('ABCDEFG')) + assert_frame_equal(df,expected) + + # full replacements / no nans + df = DataFrame({'A': [1., 2., 3., 4.]}) + df.iloc[:, 0] = df['A'].astype(np.int64) + expected = DataFrame({'A': [1, 2, 3, 4]}) + assert_frame_equal(df,expected) + + df = DataFrame({'A': [1., 2., 3., 4.]}) + df.loc[:, 'A'] = df['A'].astype(np.int64) + expected = DataFrame({'A': [1, 2, 3, 4]}) + assert_frame_equal(df,expected) + + def test_astype_assignment_with_dups(self): + + # GH 4686 + # assignment with dups that has a dtype change + df = DataFrame( + np.arange(3).reshape((1,3)), + columns=pd.MultiIndex.from_tuples( + [('A', '1'), ('B', '1'), ('A', '2')] + ), + dtype=object + ) + index = df.index.copy() + + df['A'] = df['A'].astype(np.float64) + result = df.get_dtype_counts().sort_index() + expected = Series({ 'float64' : 2, 'object' : 1 }).sort_index() + self.assertTrue(df.index.equals(index)) + + def test_dups_loc(self): + + # GH4726 + # dup indexing with iloc/loc + df = DataFrame([[1,2,'foo','bar',Timestamp('20130101')]], + columns=['a','a','a','a','a'],index=[1]) + expected = Series([1,2,'foo','bar',Timestamp('20130101')],index=['a','a','a','a','a']) + + result = df.iloc[0] + assert_series_equal(result,expected) + + result = df.loc[1] + assert_series_equal(result,expected) + + def test_partial_setting(self): + + # GH2578, allow ix and friends to partially set + + ### series ### + s_orig = Series([1,2,3]) + + s = s_orig.copy() + s[5] = 5 + expected = Series([1,2,3,5],index=[0,1,2,5]) + assert_series_equal(s,expected) + + s = s_orig.copy() + s.loc[5] = 5 + expected = Series([1,2,3,5],index=[0,1,2,5]) + assert_series_equal(s,expected) + + s = s_orig.copy() + s[5] = 5. + expected = Series([1,2,3,5.],index=[0,1,2,5]) + assert_series_equal(s,expected) + + s = s_orig.copy() + s.loc[5] = 5. 
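+        # Illustrative sketch (added comment, not from the original test): label
+        # based setters enlarge the object when the label is missing, e.g.
+        #   s = Series([1, 2, 3]); s.loc[5] = 5.  ->  index becomes [0, 1, 2, 5]
+        # while the positional setters iloc/iat never enlarge and raise
+        # IndexError instead, as checked below.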
+ expected = Series([1,2,3,5.],index=[0,1,2,5]) + assert_series_equal(s,expected) + + # iloc/iat raise + s = s_orig.copy() + def f(): + s.iloc[3] = 5. + self.assertRaises(IndexError, f) + def f(): + s.iat[3] = 5. + self.assertRaises(IndexError, f) + + ### frame ### + + df_orig = DataFrame(np.arange(6).reshape(3,2),columns=['A','B'],dtype='int64') + + # iloc/iat raise + df = df_orig.copy() + def f(): + df.iloc[4,2] = 5. + self.assertRaises(IndexError, f) + def f(): + df.iat[4,2] = 5. + self.assertRaises(IndexError, f) + + # row setting where it exists + expected = DataFrame(dict({ 'A' : [0,4,4], 'B' : [1,5,5] })) + df = df_orig.copy() + df.iloc[1] = df.iloc[2] + assert_frame_equal(df,expected) + + expected = DataFrame(dict({ 'A' : [0,4,4], 'B' : [1,5,5] })) + df = df_orig.copy() + df.loc[1] = df.loc[2] + assert_frame_equal(df,expected) + + expected = DataFrame(dict({ 'A' : [0,2,4,4], 'B' : [1,3,5,5] }),dtype='float64') + df = df_orig.copy() + df.loc[3] = df.loc[2] + assert_frame_equal(df,expected) + + # single dtype frame, overwrite + expected = DataFrame(dict({ 'A' : [0,2,4], 'B' : [0,2,4] })) + df = df_orig.copy() + df.ix[:,'B'] = df.ix[:,'A'] + assert_frame_equal(df,expected) + + # mixed dtype frame, overwrite + expected = DataFrame(dict({ 'A' : [0,2,4], 'B' : Series([0,2,4]) })) + df = df_orig.copy() + df['B'] = df['B'].astype(np.float64) + df.ix[:,'B'] = df.ix[:,'A'] + assert_frame_equal(df,expected) + + # single dtype frame, partial setting + expected = df_orig.copy() + expected['C'] = df['A'] + df = df_orig.copy() + df.ix[:,'C'] = df.ix[:,'A'] + assert_frame_equal(df,expected) + + # mixed frame, partial setting + expected = df_orig.copy() + expected['C'] = df['A'] + df = df_orig.copy() + df.ix[:,'C'] = df.ix[:,'A'] + assert_frame_equal(df,expected) + + ### panel ### + p_orig = Panel(np.arange(16).reshape(2,4,2),items=['Item1','Item2'],major_axis=pd.date_range('2001/1/12',periods=4),minor_axis=['A','B'],dtype='float64') + + # panel setting via item + p_orig = Panel(np.arange(16).reshape(2,4,2),items=['Item1','Item2'],major_axis=pd.date_range('2001/1/12',periods=4),minor_axis=['A','B'],dtype='float64') + expected = p_orig.copy() + expected['Item3'] = expected['Item1'] + p = p_orig.copy() + p.loc['Item3'] = p['Item1'] + assert_panel_equal(p,expected) + + # panel with aligned series + expected = p_orig.copy() + expected = expected.transpose(2,1,0) + expected['C'] = DataFrame({ 'Item1' : [30,30,30,30], 'Item2' : [32,32,32,32] },index=p_orig.major_axis) + expected = expected.transpose(2,1,0) + p = p_orig.copy() + p.loc[:,:,'C'] = Series([30,32],index=p_orig.items) + assert_panel_equal(p,expected) + + def test_series_partial_set(self): + # partial set with new index + # Regression from GH4825 + ser = Series([0.1, 0.2], index=[1, 2]) + + # loc + expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) + result = ser.loc[[3, 2, 3]] + assert_series_equal(result, expected) + + expected = Series([np.nan, np.nan, np.nan], index=[3, 3, 3]) + result = ser.loc[[3, 3, 3]] + assert_series_equal(result, expected) + + expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) + result = ser.loc[[2, 2, 3]] + assert_series_equal(result, expected) + + expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) + result = Series([0.1, 0.2, 0.3], index=[1,2,3]).loc[[3,4,4]] + assert_series_equal(result, expected) + + expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) + result = Series([0.1, 0.2, 0.3, 0.4], index=[1,2,3,4]).loc[[5,3,3]] + assert_series_equal(result, expected) + + expected = Series([np.nan, 
0.4, 0.4], index=[5, 4, 4]) + result = Series([0.1, 0.2, 0.3, 0.4], index=[1,2,3,4]).loc[[5,4,4]] + assert_series_equal(result, expected) + + expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) + result = Series([0.1, 0.2, 0.3, 0.4], index=[4,5,6,7]).loc[[7,2,2]] + assert_series_equal(result, expected) + + expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) + result = Series([0.1, 0.2, 0.3, 0.4], index=[1,2,3,4]).loc[[4,5,5]] + assert_series_equal(result, expected) + + # iloc + expected = Series([0.2,0.2,0.1,0.1], index=[2,2,1,1]) + result = ser.iloc[[1,1,0,0]] + assert_series_equal(result, expected) + + def test_partial_set_invalid(self): + + # GH 4940 + # allow only setting of 'valid' values + + df = tm.makeTimeDataFrame() + + # don't allow not string inserts + def f(): + df.loc[100.0, :] = df.ix[0] + self.assertRaises(ValueError, f) + def f(): + df.loc[100,:] = df.ix[0] + self.assertRaises(ValueError, f) + + def f(): + df.ix[100.0, :] = df.ix[0] + self.assertRaises(ValueError, f) + def f(): + df.ix[100,:] = df.ix[0] + self.assertRaises(ValueError, f) + + # allow object conversion here + df.loc['a',:] = df.ix[0] + + def test_partial_set_empty(self): + + # GH5226 + + # partially set with an empty object + # series + s = Series() + s.loc[1] = 1 + assert_series_equal(s,Series([1],index=[1])) + s.loc[3] = 3 + assert_series_equal(s,Series([1,3],index=[1,3])) + + s = Series() + s.loc[1] = 1. + assert_series_equal(s,Series([1.],index=[1])) + s.loc[3] = 3. + assert_series_equal(s,Series([1.,3.],index=[1,3])) + + s = Series() + s.loc['foo'] = 1 + assert_series_equal(s,Series([1],index=['foo'])) + s.loc['bar'] = 3 + assert_series_equal(s,Series([1,3],index=['foo','bar'])) + s.loc[3] = 4 + assert_series_equal(s,Series([1,3,4],index=['foo','bar',3])) + + # partially set with an empty object + # frame + df = DataFrame() + + def f(): + df.loc[1] = 1 + self.assertRaises(ValueError, f) + def f(): + df.loc[1] = Series([1],index=['foo']) + self.assertRaises(ValueError, f) + def f(): + df.loc[:,1] = 1 + self.assertRaises(ValueError, f) + + # these work as they don't really change + # anything but the index + # GH5632 + expected = DataFrame(columns=['foo']) + def f(): + df = DataFrame() + df['foo'] = Series([]) + return df + assert_frame_equal(f(), expected) + def f(): + df = DataFrame() + df['foo'] = Series(df.index) + return df + assert_frame_equal(f(), expected) + def f(): + df = DataFrame() + df['foo'] = Series(range(len(df))) + return df + assert_frame_equal(f(), expected) + def f(): + df = DataFrame() + df['foo'] = [] + return df + assert_frame_equal(f(), expected) + def f(): + df = DataFrame() + df['foo'] = df.index + return df + assert_frame_equal(f(), expected) + def f(): + df = DataFrame() + df['foo'] = range(len(df)) + return df + assert_frame_equal(f(), expected) + + df = DataFrame() + df2 = DataFrame() + df2[1] = Series([1],index=['foo']) + df.loc[:,1] = Series([1],index=['foo']) + assert_frame_equal(df,DataFrame([[1]],index=['foo'],columns=[1])) + assert_frame_equal(df,df2) + + df = DataFrame(columns=['A','B']) + df.loc[3] = [6,7] + assert_frame_equal(df,DataFrame([[6,7]],index=[3],columns=['A','B'])) + + # no label overlap + df = DataFrame(columns=['A','B']) + df.loc[0] = Series(1,index=range(4)) + assert_frame_equal(df,DataFrame(columns=['A','B'],index=[0])) + + # no index to start + expected = DataFrame({ 0 : Series(1,index=range(4)) },columns=['A','B',0]) + + df = DataFrame(columns=['A','B']) + df[0] = Series(1,index=range(4)) + df.dtypes + str(df) + 
assert_frame_equal(df,expected) + + df = DataFrame(columns=['A','B']) + df.loc[:,0] = Series(1,index=range(4)) + df.dtypes + str(df) + assert_frame_equal(df,expected) + + # GH5720, GH5744 + # don't create rows when empty + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + y['New'] = np.nan + assert_frame_equal(y,DataFrame(columns=['A','B','New'])) + + df = DataFrame(columns=['a', 'b', 'c c']) + df['d'] = 3 + assert_frame_equal(df,DataFrame(columns=['a','b','c c','d'])) + assert_series_equal(df['c c'],Series(name='c c',dtype=object)) + + # reindex columns is ok + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) + y = df[df.A > 5] + result = y.reindex(columns=['A','B','C']) + expected = DataFrame(columns=['A','B','C']) + assert_frame_equal(result,expected) + + # GH 5756 + # setting with empty Series + df = DataFrame(Series()) + assert_frame_equal(df, DataFrame({ 0 : Series() })) + + df = DataFrame(Series(name='foo')) + assert_frame_equal(df, DataFrame({ 'foo' : Series() })) + + # GH 5932 + # copy on empty with assignment fails + df = DataFrame(index=[0]) + df = df.copy() + df['a'] = 0 + expected = DataFrame(0,index=[0],columns=['a']) + assert_frame_equal(df, expected) + + # GH 6171 + # consistency on empty frames + df = DataFrame(columns=['x', 'y']) + df['x'] = [1, 2] + expected = DataFrame(dict(x = [1,2], y = [np.nan,np.nan])) + assert_frame_equal(df, expected, check_dtype=False) + + df = DataFrame(columns=['x', 'y']) + df['x'] = ['1', '2'] + expected = DataFrame(dict(x = ['1','2'], y = [np.nan,np.nan]),dtype=object) + assert_frame_equal(df, expected) + + df = DataFrame(columns=['x', 'y']) + df.loc[0, 'x'] = 1 + expected = DataFrame(dict(x = [1], y = [np.nan])) + assert_frame_equal(df, expected, check_dtype=False) + + def test_cache_updating(self): + # GH 4939, make sure to update the cache on setitem + + df = tm.makeDataFrame() + df['A'] # cache series + df.ix["Hello Friend"] = df.ix[0] + self.assertIn("Hello Friend", df['A'].index) + self.assertIn("Hello Friend", df['B'].index) + + panel = tm.makePanel() + panel.ix[0] # get first item into cache + panel.ix[:, :, 'A+1'] = panel.ix[:, :, 'A'] + 1 + self.assertIn("A+1", panel.ix[0].columns) + self.assertIn("A+1", panel.ix[1].columns) + + # 5216 + # make sure that we don't try to set a dead cache + a = np.random.rand(10, 3) + df = DataFrame(a, columns=['x', 'y', 'z']) + tuples = [(i, j) for i in range(5) for j in range(2)] + index = MultiIndex.from_tuples(tuples) + df.index = index + + # setting via chained assignment + # but actually works, since everything is a view + df.loc[0]['z'].iloc[0] = 1. 
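+        # Sketch for exposition (assumes nothing beyond the frame built above):
+        # the chained form df.loc[0]['z'].iloc[0] = 1. only works here because
+        # df.loc[0] happens to return a view; in general it may modify a
+        # temporary copy, so the single label-based call
+        #   df.loc[(0, 0), 'z'] = 1.
+        # is the reliable spelling, as the "correct setting" block below shows.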
+ result = df.loc[(0,0),'z'] + self.assertEqual(result, 1) + + # correct setting + df.loc[(0,0),'z'] = 2 + result = df.loc[(0,0),'z'] + self.assertEqual(result, 2) + + def test_slice_consolidate_invalidate_item_cache(self): + # #3970 + df = DataFrame({ "aa":lrange(5), "bb":[2.2]*5}) + + # Creates a second float block + df["cc"] = 0.0 + + # caches a reference to the 'bb' series + df["bb"] + + # repr machinery triggers consolidation + repr(df) + + # Assignment to wrong series + df['bb'].iloc[0] = 0.17 + df._clear_item_cache() + self.assertAlmostEqual(df['bb'][0], 0.17) + + def test_setitem_cache_updating(self): + # GH 5424 + cont = ['one', 'two','three', 'four', 'five', 'six', 'seven'] + + for do_ref in [False,False]: + df = DataFrame({'a' : cont, "b":cont[3:]+cont[:3] ,'c' : np.arange(7)}) + + # ref the cache + if do_ref: + df.ix[0,"c"] + + # set it + df.ix[7,'c'] = 1 + + self.assertEqual(df.ix[0,'c'], 0.0) + self.assertEqual(df.ix[7,'c'], 1.0) + + # GH 7084 + # not updating cache on series setting with slices + out = DataFrame({'A': [0, 0, 0]}, index=date_range('5/7/2014', '5/9/2014')) + df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]}) + + #loop through df to update out + six = Timestamp('5/7/2014') + eix = Timestamp('5/9/2014') + for ix, row in df.iterrows(): + out[row['C']][six:eix] = out[row['C']][six:eix] + row['D'] + + expected = DataFrame({'A': [600, 600, 600]}, index=date_range('5/7/2014', '5/9/2014')) + assert_frame_equal(out, expected) + assert_series_equal(out['A'], expected['A']) + + out = DataFrame({'A': [0, 0, 0]}, index=date_range('5/7/2014', '5/9/2014')) + for ix, row in df.iterrows(): + out.loc[six:eix,row['C']] += row['D'] + + assert_frame_equal(out, expected) + assert_series_equal(out['A'], expected['A']) + + def test_setitem_chained_setfault(self): + + # GH6026 + # setfaults under numpy 1.7.1 (ok on 1.8) + data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout'] + mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none'] + + df = DataFrame({'response': np.array(data)}) + mask = df.response == 'timeout' + df.response[mask] = 'none' + assert_frame_equal(df, DataFrame({'response': mdata })) + + recarray = np.rec.fromarrays([data], names=['response']) + df = DataFrame(recarray) + mask = df.response == 'timeout' + df.response[mask] = 'none' + assert_frame_equal(df, DataFrame({'response': mdata })) + + df = DataFrame({'response': data, 'response1' : data }) + mask = df.response == 'timeout' + df.response[mask] = 'none' + assert_frame_equal(df, DataFrame({'response': mdata, 'response1' : data })) + + # GH 6056 + expected = DataFrame(dict(A = [np.nan,'bar','bah','foo','bar'])) + df = DataFrame(dict(A = np.array(['foo','bar','bah','foo','bar']))) + df['A'].iloc[0] = np.nan + result = df.head() + assert_frame_equal(result, expected) + + df = DataFrame(dict(A = np.array(['foo','bar','bah','foo','bar']))) + df.A.iloc[0] = np.nan + result = df.head() + assert_frame_equal(result, expected) + + def test_detect_chained_assignment(self): + + pd.set_option('chained_assignment','raise') + + # work with the chain + expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) + df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64') + self.assertIsNone(df.is_copy) + + df['A'][0] = -5 + df['A'][1] = -6 + assert_frame_equal(df, expected) + + expected = DataFrame([[-5,2],[np.nan,3.]],columns=list('AB')) + df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) + self.assertIsNone(df.is_copy) + 
df['A'][0] = -5 + df['A'][1] = np.nan + assert_frame_equal(df, expected) + self.assertIsNone(df['A'].is_copy) + + # using a copy (the chain), fails + df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) + def f(): + df.loc[0]['A'] = -5 + self.assertRaises(com.SettingWithCopyError, f) + + # doc example + df = DataFrame({'a' : ['one', 'one', 'two', + 'three', 'two', 'one', 'six'], + 'c' : Series(range(7),dtype='int64') }) + self.assertIsNone(df.is_copy) + expected = DataFrame({'a' : ['one', 'one', 'two', + 'three', 'two', 'one', 'six'], + 'c' : [42,42,2,3,4,42,6]}) + + def f(): + indexer = df.a.str.startswith('o') + df[indexer]['c'] = 42 + self.assertRaises(com.SettingWithCopyError, f) + df['c'][df.a.str.startswith('o')] = 42 + assert_frame_equal(df,expected) + + expected = DataFrame({'A':[111,'bbb','ccc'],'B':[1,2,3]}) + df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + df['A'][0] = 111 + def f(): + df.loc[0]['A'] = 111 + self.assertRaises(com.SettingWithCopyError, f) + assert_frame_equal(df,expected) + + # warnings + pd.set_option('chained_assignment','warn') + df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + with tm.assert_produces_warning(expected_warning=com.SettingWithCopyWarning): + df.loc[0]['A'] = 111 + + # make sure that is_copy is picked up reconstruction + # GH5475 + df = DataFrame({"A": [1,2]}) + self.assertIsNone(df.is_copy) + with tm.ensure_clean('__tmp__pickle') as path: + df.to_pickle(path) + df2 = pd.read_pickle(path) + df2["B"] = df2["A"] + df2["B"] = df2["A"] + + # a suprious raise as we are setting the entire column here + # GH5597 + pd.set_option('chained_assignment','raise') + from string import ascii_letters as letters + + def random_text(nobs=100): + df = [] + for i in range(nobs): + idx= np.random.randint(len(letters), size=2) + idx.sort() + df.append([letters[idx[0]:idx[1]]]) + + return DataFrame(df, columns=['letters']) + + df = random_text(100000) + + # always a copy + x = df.iloc[[0,1,2]] + self.assertIsNotNone(x.is_copy) + x = df.iloc[[0,1,2,4]] + self.assertIsNotNone(x.is_copy) + + # explicity copy + indexer = df.letters.apply(lambda x : len(x) > 10) + df = df.ix[indexer].copy() + self.assertIsNone(df.is_copy) + df['letters'] = df['letters'].apply(str.lower) + + # implicity take + df = random_text(100000) + indexer = df.letters.apply(lambda x : len(x) > 10) + df = df.ix[indexer] + self.assertIsNotNone(df.is_copy) + df['letters'] = df['letters'].apply(str.lower) + + # implicity take 2 + df = random_text(100000) + indexer = df.letters.apply(lambda x : len(x) > 10) + df = df.ix[indexer] + self.assertIsNotNone(df.is_copy) + df.loc[:,'letters'] = df['letters'].apply(str.lower) + + # should be ok even though its a copy! 
+ self.assertIsNone(df.is_copy) + df['letters'] = df['letters'].apply(str.lower) + self.assertIsNone(df.is_copy) + + df = random_text(100000) + indexer = df.letters.apply(lambda x : len(x) > 10) + df.ix[indexer,'letters'] = df.ix[indexer,'letters'].apply(str.lower) + + # an identical take, so no copy + df = DataFrame({'a' : [1]}).dropna() + self.assertIsNone(df.is_copy) + df['a'] += 1 + + # inplace ops + # original from: http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug + a = [12, 23] + b = [123, None] + c = [1234, 2345] + d = [12345, 23456] + tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'), ('ears', 'right')] + events = {('eyes', 'left'): a, ('eyes', 'right'): b, ('ears', 'left'): c, ('ears', 'right'): d} + multiind = MultiIndex.from_tuples(tuples, names=['part', 'side']) + zed = DataFrame(events, index=['a', 'b'], columns=multiind) + def f(): + zed['eyes']['right'].fillna(value=555, inplace=True) + self.assertRaises(com.SettingWithCopyError, f) + + df = DataFrame(np.random.randn(10,4)) + s = df.iloc[:,0] + s = s.order() + assert_series_equal(s,df.iloc[:,0].order()) + assert_series_equal(s,df[0].order()) + + # operating on a copy + df = pd.DataFrame({'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}) + mask = pd.isnull(df.c) + + def f(): + df[['c']][mask] = df[['b']][mask] + self.assertRaises(com.SettingWithCopyError, f) + + # false positives GH6025 + df = DataFrame ({'column1':['a', 'a', 'a'], 'column2': [4,8,9] }) + str(df) + df['column1'] = df['column1'] + 'b' + str(df) + df = df [df['column2']!=8] + str(df) + df['column1'] = df['column1'] + 'c' + str(df) + + def test_float64index_slicing_bug(self): + # GH 5557, related to slicing a float index + ser = {256: 2321.0, 1: 78.0, 2: 2716.0, 3: 0.0, 4: 369.0, 5: 0.0, 6: 269.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 3536.0, 11: 0.0, 12: 24.0, 13: 0.0, 14: 931.0, 15: 0.0, 16: 101.0, 17: 78.0, 18: 9643.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 63761.0, 23: 0.0, 24: 446.0, 25: 0.0, 26: 34773.0, 27: 0.0, 28: 729.0, 29: 78.0, 30: 0.0, 31: 0.0, 32: 3374.0, 33: 0.0, 34: 1391.0, 35: 0.0, 36: 361.0, 37: 0.0, 38: 61808.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 6677.0, 43: 0.0, 44: 802.0, 45: 0.0, 46: 2691.0, 47: 0.0, 48: 3582.0, 49: 0.0, 50: 734.0, 51: 0.0, 52: 627.0, 53: 70.0, 54: 2584.0, 55: 0.0, 56: 324.0, 57: 0.0, 58: 605.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 3989.0, 63: 10.0, 64: 42.0, 65: 0.0, 66: 904.0, 67: 0.0, 68: 88.0, 69: 70.0, 70: 8172.0, 71: 0.0, 72: 0.0, 73: 0.0, 74: 64902.0, 75: 0.0, 76: 347.0, 77: 0.0, 78: 36605.0, 79: 0.0, 80: 379.0, 81: 70.0, 82: 0.0, 83: 0.0, 84: 3001.0, 85: 0.0, 86: 1630.0, 87: 7.0, 88: 364.0, 89: 0.0, 90: 67404.0, 91: 9.0, 92: 0.0, 93: 0.0, 94: 7685.0, 95: 0.0, 96: 1017.0, 97: 0.0, 98: 2831.0, 99: 0.0, 100: 2963.0, 101: 0.0, 102: 854.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: 0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: 0.0, 117: 0.0, 118: 0.0, 119: 0.0, 120: 0.0, 121: 0.0, 122: 0.0, 123: 0.0, 124: 0.0, 125: 0.0, 126: 67744.0, 127: 22.0, 128: 264.0, 129: 0.0, 260: 197.0, 268: 0.0, 265: 0.0, 269: 0.0, 261: 0.0, 266: 1198.0, 267: 0.0, 262: 2629.0, 258: 775.0, 257: 0.0, 263: 0.0, 259: 0.0, 264: 163.0, 250: 10326.0, 251: 0.0, 252: 1228.0, 253: 0.0, 254: 2769.0, 255: 0.0} + + # smoke test for the repr + s = Series(ser) + result = s.value_counts() + str(result) + + def test_floating_index_doc_example(self): + + index = Index([1.5, 2, 3, 4.5, 5]) + s = Series(range(5),index=index) + self.assertEqual(s[3], 
2) + self.assertEqual(s.ix[3], 2) + self.assertEqual(s.loc[3], 2) + self.assertEqual(s.iloc[3], 3) + + def test_floating_index(self): + + # related 236 + # scalar/slicing of a float index + s = Series(np.arange(5), index=np.arange(5) * 2.5, dtype=np.int64) + + # label based slicing + result1 = s[1.0:3.0] + result2 = s.ix[1.0:3.0] + result3 = s.loc[1.0:3.0] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + + # exact indexing when found + result1 = s[5.0] + result2 = s.loc[5.0] + result3 = s.ix[5.0] + self.assertEqual(result1, result2) + self.assertEqual(result1, result3) + + result1 = s[5] + result2 = s.loc[5] + result3 = s.ix[5] + self.assertEqual(result1, result2) + self.assertEqual(result1, result3) + + self.assertEqual(s[5.0], s[5]) + + # value not found (and no fallbacking at all) + + # scalar integers + self.assertRaises(KeyError, lambda : s.loc[4]) + self.assertRaises(KeyError, lambda : s.ix[4]) + self.assertRaises(KeyError, lambda : s[4]) + + # fancy floats/integers create the correct entry (as nan) + # fancy tests + expected = Series([2, 0], index=Float64Index([5.0, 0.0])) + for fancy_idx in [[5.0, 0.0], [5, 0], np.array([5.0, 0.0]), np.array([5, 0])]: + assert_series_equal(s[fancy_idx], expected) + assert_series_equal(s.loc[fancy_idx], expected) + assert_series_equal(s.ix[fancy_idx], expected) + + # all should return the same as we are slicing 'the same' + result1 = s.loc[2:5] + result2 = s.loc[2.0:5.0] + result3 = s.loc[2.0:5] + result4 = s.loc[2.1:5] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + assert_series_equal(result1, result4) + + # previously this did fallback indexing + result1 = s[2:5] + result2 = s[2.0:5.0] + result3 = s[2.0:5] + result4 = s[2.1:5] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + assert_series_equal(result1, result4) + + result1 = s.ix[2:5] + result2 = s.ix[2.0:5.0] + result3 = s.ix[2.0:5] + result4 = s.ix[2.1:5] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + assert_series_equal(result1, result4) + + # combined test + result1 = s.loc[2:5] + result2 = s.ix[2:5] + result3 = s[2:5] + + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + + # list selection + result1 = s[[0.0,5,10]] + result2 = s.loc[[0.0,5,10]] + result3 = s.ix[[0.0,5,10]] + result4 = s.iloc[[0,2,4]] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + assert_series_equal(result1, result4) + + result1 = s[[1.6,5,10]] + result2 = s.loc[[1.6,5,10]] + result3 = s.ix[[1.6,5,10]] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + assert_series_equal(result1, Series([np.nan,2,4],index=[1.6,5,10])) + + result1 = s[[0,1,2]] + result2 = s.ix[[0,1,2]] + result3 = s.loc[[0,1,2]] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + assert_series_equal(result1, Series([0.0,np.nan,np.nan],index=[0,1,2])) + + result1 = s.loc[[2.5, 5]] + result2 = s.ix[[2.5, 5]] + assert_series_equal(result1, result2) + assert_series_equal(result1, Series([1,2],index=[2.5,5.0])) + + result1 = s[[2.5]] + result2 = s.ix[[2.5]] + result3 = s.loc[[2.5]] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + assert_series_equal(result1, Series([1],index=[2.5])) + + def test_scalar_indexer(self): + # float indexing checked above + + def check_invalid(index, loc=None, iloc=None, ix=None, getitem=None): + + # related 236/4850 + # trying to access with a 
float index + s = Series(np.arange(len(index)),index=index) + + if iloc is None: + iloc = TypeError + self.assertRaises(iloc, lambda : s.iloc[3.5]) + if loc is None: + loc = TypeError + self.assertRaises(loc, lambda : s.loc[3.5]) + if ix is None: + ix = TypeError + self.assertRaises(ix, lambda : s.ix[3.5]) + if getitem is None: + getitem = TypeError + self.assertRaises(getitem, lambda : s[3.5]) + + for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + check_invalid(index()) + check_invalid(Index(np.arange(5) * 2.5),loc=KeyError, ix=KeyError, getitem=KeyError) + + def check_getitem(index): + + s = Series(np.arange(len(index)),index=index) + + # positional selection + result1 = s[5] + result2 = s[5.0] + result3 = s.iloc[5] + result4 = s.iloc[5.0] + + # by value + self.assertRaises(KeyError, lambda : s.loc[5]) + self.assertRaises(KeyError, lambda : s.loc[5.0]) + + # this is fallback, so it works + result5 = s.ix[5] + result6 = s.ix[5.0] + self.assertEqual(result1, result2) + self.assertEqual(result1, result3) + self.assertEqual(result1, result4) + self.assertEqual(result1, result5) + self.assertEqual(result1, result6) + + # all index types except float/int + for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + check_getitem(index()) + + # exact indexing when found on IntIndex + s = Series(np.arange(10),dtype='int64') + + result1 = s[5.0] + result2 = s.loc[5.0] + result3 = s.ix[5.0] + result4 = s[5] + result5 = s.loc[5] + result6 = s.ix[5] + self.assertEqual(result1, result2) + self.assertEqual(result1, result3) + self.assertEqual(result1, result4) + self.assertEqual(result1, result5) + self.assertEqual(result1, result6) + + def test_slice_indexer(self): + + def check_slicing_positional(index): + + s = Series(np.arange(len(index))+10,index=index) + + # these are all positional + result1 = s[2:5] + result2 = s.ix[2:5] + result3 = s.iloc[2:5] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + + # not in the index + self.assertRaises(KeyError, lambda : s.loc[2:5]) + + # make all float slicing fail + self.assertRaises(TypeError, lambda : s[2.0:5]) + self.assertRaises(TypeError, lambda : s[2.0:5.0]) + self.assertRaises(TypeError, lambda : s[2:5.0]) + + self.assertRaises(TypeError, lambda : s.ix[2.0:5]) + self.assertRaises(TypeError, lambda : s.ix[2.0:5.0]) + self.assertRaises(TypeError, lambda : s.ix[2:5.0]) + + self.assertRaises(KeyError, lambda : s.loc[2.0:5]) + self.assertRaises(KeyError, lambda : s.loc[2.0:5.0]) + self.assertRaises(KeyError, lambda : s.loc[2:5.0]) + + # these work for now + #self.assertRaises(TypeError, lambda : s.iloc[2.0:5]) + #self.assertRaises(TypeError, lambda : s.iloc[2.0:5.0]) + #self.assertRaises(TypeError, lambda : s.iloc[2:5.0]) + + # all index types except int, float + for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + check_slicing_positional(index()) + + # int + index = tm.makeIntIndex() + s = Series(np.arange(len(index))+10,index) + + # this is positional + result1 = s[2:5] + result4 = s.iloc[2:5] + assert_series_equal(result1, result4) + + # these are all value based + result2 = s.ix[2:5] + result3 = s.loc[2:5] + result4 = s.loc[2.0:5] + result5 = s.loc[2.0:5.0] + result6 = s.loc[2:5.0] + assert_series_equal(result2, result3) + assert_series_equal(result2, result4) + assert_series_equal(result2, result5) + assert_series_equal(result2, result6) + + # make all float slicing fail 
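+        # (Illustrative comment, not in the original file: on an integer index
+        # label and positional slices coincide for s[2:5], but float bounds
+        # such as s[2.0:5] are ambiguous and are rejected with TypeError by
+        # both [] and .ix, as asserted below.)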
+ self.assertRaises(TypeError, lambda : s[2.0:5]) + self.assertRaises(TypeError, lambda : s[2.0:5.0]) + self.assertRaises(TypeError, lambda : s[2:5.0]) + + self.assertRaises(TypeError, lambda : s.ix[2.0:5]) + self.assertRaises(TypeError, lambda : s.ix[2.0:5.0]) + self.assertRaises(TypeError, lambda : s.ix[2:5.0]) + + # these work for now + #self.assertRaises(TypeError, lambda : s.iloc[2.0:5]) + #self.assertRaises(TypeError, lambda : s.iloc[2.0:5.0]) + #self.assertRaises(TypeError, lambda : s.iloc[2:5.0]) + + # float + index = tm.makeFloatIndex() + s = Series(np.arange(len(index))+10,index=index) + + # these are all value based + result1 = s[2:5] + result2 = s.ix[2:5] + result3 = s.loc[2:5] + assert_series_equal(result1, result2) + assert_series_equal(result1, result3) + + # these are all valid + result1a = s[2.0:5] + result2a = s[2.0:5.0] + result3a = s[2:5.0] + assert_series_equal(result1a, result2a) + assert_series_equal(result1a, result3a) + + result1b = s.ix[2.0:5] + result2b = s.ix[2.0:5.0] + result3b = s.ix[2:5.0] + assert_series_equal(result1b, result2b) + assert_series_equal(result1b, result3b) + + result1c = s.loc[2.0:5] + result2c = s.loc[2.0:5.0] + result3c = s.loc[2:5.0] + assert_series_equal(result1c, result2c) + assert_series_equal(result1c, result3c) + + assert_series_equal(result1a, result1b) + assert_series_equal(result1a, result1c) + + # these work for now + #self.assertRaises(TypeError, lambda : s.iloc[2.0:5]) + #self.assertRaises(TypeError, lambda : s.iloc[2.0:5.0]) + #self.assertRaises(TypeError, lambda : s.iloc[2:5.0]) + + def test_set_ix_out_of_bounds_axis_0(self): + df = pd.DataFrame(randn(2, 5), index=["row%s" % i for i in range(2)], columns=["col%s" % i for i in range(5)]) + self.assertRaises(ValueError, df.ix.__setitem__, (2, 0), 100) + + def test_set_ix_out_of_bounds_axis_1(self): + df = pd.DataFrame(randn(5, 2), index=["row%s" % i for i in range(5)], columns=["col%s" % i for i in range(2)]) + self.assertRaises(ValueError, df.ix.__setitem__, (0 , 2), 100) + + def test_iloc_empty_list_indexer_is_ok(self): + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) + assert_frame_equal(df.iloc[:,[]], df.iloc[:, :0]) # vertical empty + assert_frame_equal(df.iloc[[],:], df.iloc[:0, :]) # horizontal empty + assert_frame_equal(df.iloc[[]], df.iloc[:0, :]) # horizontal empty + + # FIXME: fix loc & xs + def test_loc_empty_list_indexer_is_ok(self): + raise nose.SkipTest('loc discards columns names') + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) + assert_frame_equal(df.loc[:,[]], df.iloc[:, :0]) # vertical empty + assert_frame_equal(df.loc[[],:], df.iloc[:0, :]) # horizontal empty + assert_frame_equal(df.loc[[]], df.iloc[:0, :]) # horizontal empty + + def test_ix_empty_list_indexer_is_ok(self): + raise nose.SkipTest('ix discards columns names') + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) + assert_frame_equal(df.ix[:,[]], df.iloc[:, :0]) # vertical empty + assert_frame_equal(df.ix[[],:], df.iloc[:0, :]) # horizontal empty + assert_frame_equal(df.ix[[]], df.iloc[:0, :]) # horizontal empty + + def test_deprecate_float_indexers(self): + + # GH 4892 + # deprecate allowing float indexers that are equal to ints to be used + # as indexers in non-float indices + + import warnings + warnings.filterwarnings(action='error', category=FutureWarning) + + for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + + i = index(5) + + for s in [ 
Series(np.arange(len(i)),index=i), DataFrame(np.random.randn(len(i),len(i)),index=i,columns=i) ]: + self.assertRaises(FutureWarning, lambda : + s.iloc[3.0]) + + # setting + def f(): + s.iloc[3.0] = 0 + self.assertRaises(FutureWarning, f) + + # fallsback to position selection ,series only + s = Series(np.arange(len(i)),index=i) + s[3] + self.assertRaises(FutureWarning, lambda : + s[3.0]) + + # ints + i = index(5) + for s in [ Series(np.arange(len(i))), DataFrame(np.random.randn(len(i),len(i)),index=i,columns=i) ]: + self.assertRaises(FutureWarning, lambda : + s.iloc[3.0]) + + # on some arch's this doesn't provide a warning (and thus raise) + # and some it does + try: + s[3.0] + except: + pass + + # setting + def f(): + s.iloc[3.0] = 0 + self.assertRaises(FutureWarning, f) + + # floats: these are all ok! + i = np.arange(5.) + + for s in [ Series(np.arange(len(i)),index=i), DataFrame(np.random.randn(len(i),len(i)),index=i,columns=i) ]: + with tm.assert_produces_warning(False): + s[3.0] + + with tm.assert_produces_warning(False): + s[3] + + self.assertRaises(FutureWarning, lambda : + s.iloc[3.0]) + + with tm.assert_produces_warning(False): + s.iloc[3] + + with tm.assert_produces_warning(False): + s.loc[3.0] + + with tm.assert_produces_warning(False): + s.loc[3] + + def f(): + s.iloc[3.0] = 0 + self.assertRaises(FutureWarning, f) + + # slices + for index in [ tm.makeIntIndex, tm.makeFloatIndex, + tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + + index = index(5) + for s in [ Series(range(5),index=index), DataFrame(np.random.randn(5,2),index=index) ]: + + # getitem + self.assertRaises(FutureWarning, lambda : + s.iloc[3.0:4]) + self.assertRaises(FutureWarning, lambda : + s.iloc[3.0:4.0]) + self.assertRaises(FutureWarning, lambda : + s.iloc[3:4.0]) + + # setitem + def f(): + s.iloc[3.0:4] = 0 + self.assertRaises(FutureWarning, f) + def f(): + s.iloc[3:4.0] = 0 + self.assertRaises(FutureWarning, f) + def f(): + s.iloc[3.0:4.0] = 0 + self.assertRaises(FutureWarning, f) + + warnings.filterwarnings(action='ignore', category=FutureWarning) + + def test_float_index_to_mixed(self): + df = DataFrame({0.0: np.random.rand(10), + 1.0: np.random.rand(10)}) + df['a'] = 10 + tm.assert_frame_equal(DataFrame({0.0: df[0.0], + 1.0: df[1.0], + 'a': [10] * 10}), + df) + + def test_duplicate_ix_returns_series(self): + df = DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2], + columns=list('abc')) + r = df.ix[0.2, 'a'] + e = df.loc[0.2, 'a'] + tm.assert_series_equal(r, e) + + def test_float_index_non_scalar_assignment(self): + df = DataFrame({'a': [1,2,3], 'b': [3,4,5]},index=[1.,2.,3.]) + df.loc[df.index[:2]] = 1 + expected = DataFrame({'a':[1,1,3],'b':[1,1,5]},index=df.index) + tm.assert_frame_equal(expected, df) + + df = DataFrame({'a': [1,2,3], 'b': [3,4,5]},index=[1.,2.,3.]) + df2 = df.copy() + df.loc[df.index] = df.loc[df.index] + tm.assert_frame_equal(df,df2) + + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py new file mode 100644 index 00000000..8a901008 --- /dev/null +++ b/pandas/tests/test_internals.py @@ -0,0 +1,1010 @@ +# pylint: disable=W0102 + +import nose +import numpy as np + +from pandas import Index, MultiIndex, DataFrame, Series +from pandas.compat import OrderedDict, lrange +from pandas.sparse.array import SparseArray +from pandas.core.internals import * +import pandas.core.internals as internals +import 
pandas.util.testing as tm + +from pandas.util.testing import ( + assert_almost_equal, assert_frame_equal, randn) +from pandas.compat import zip, u + + +def assert_block_equal(left, right): + assert_almost_equal(left.values, right.values) + assert(left.dtype == right.dtype) + assert_almost_equal(left.mgr_locs, right.mgr_locs) + + +def get_numeric_mat(shape): + arr = np.arange(shape[0]) + return np.lib.stride_tricks.as_strided( + x=arr, shape=shape, + strides=(arr.itemsize,) + (0,) * (len(shape) - 1)).copy() + + +N = 10 + + +def create_block(typestr, placement, item_shape=None, num_offset=0): + """ + Supported typestr: + + * float, f8, f4, f2 + * int, i8, i4, i2, i1 + * uint, u8, u4, u2, u1 + * complex, c16, c8 + * bool + * object, string, O + * datetime, dt + * sparse (SparseArray with fill_value=0.0) + * sparse_na (SparseArray with fill_value=np.nan) + + """ + placement = BlockPlacement(placement) + num_items = len(placement) + + if item_shape is None: + item_shape = (N,) + + shape = (num_items,) + item_shape + + mat = get_numeric_mat(shape) + + if typestr in ('float', 'f8', 'f4', 'f2', + 'int', 'i8', 'i4', 'i2', 'i1', + 'uint', 'u8', 'u4', 'u2', 'u1'): + values = mat.astype(typestr) + num_offset + elif typestr in ('complex', 'c16', 'c8'): + values = 1.j * (mat.astype(typestr) + num_offset) + elif typestr in ('object', 'string', 'O'): + values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset], + shape) + elif typestr in ('bool'): + values = np.ones(shape, dtype=np.bool_) + elif typestr in ('datetime', 'dt'): + values = (mat * 1e9).astype('M8[ns]') + elif typestr in ('sparse', 'sparse_na'): + # FIXME: doesn't support num_rows != 10 + assert shape[-1] == 10 + assert all(s == 1 for s in shape[:-1]) + if typestr.endswith('_na'): + fill_value = np.nan + else: + fill_value = 0.0 + values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value, + 4, 5, fill_value, 6], fill_value=fill_value) + arr = values.sp_values.view() + arr += (num_offset - 1) + else: + raise ValueError('Unsupported typestr: "%s"' % typestr) + + return make_block(values, placement=placement, ndim=len(shape)) + + +def create_single_mgr(typestr, num_rows=None): + if num_rows is None: + num_rows = N + + return SingleBlockManager( + create_block(typestr, placement=slice(0, num_rows), item_shape=()), + np.arange(num_rows)) + + +def create_mgr(descr, item_shape=None): + """ + Construct BlockManager from string description. + + String description syntax looks similar to np.matrix initializer. 
It looks + like this:: + + a,b,c: f8; d,e,f: i8 + + Rules are rather simple: + + * see list of supported datatypes in `create_block` method + * components are semicolon-separated + * each component is `NAME,NAME,NAME: DTYPE_ID` + * whitespace around colons & semicolons are removed + * components with same DTYPE_ID are combined into single block + * to force multiple blocks with same dtype, use '-SUFFIX':: + + 'a:f8-1; b:f8-2; c:f8-foobar' + + """ + if item_shape is None: + item_shape = (N,) + + offset = 0 + mgr_items = [] + block_placements = OrderedDict() + for d in descr.split(';'): + d = d.strip() + names, blockstr = d.partition(':')[::2] + blockstr = blockstr.strip() + names = names.strip().split(',') + + mgr_items.extend(names) + placement = list(np.arange(len(names)) + offset) + try: + block_placements[blockstr].extend(placement) + except KeyError: + block_placements[blockstr] = placement + offset += len(names) + + mgr_items = Index(mgr_items) + + blocks = [] + num_offset = 0 + for blockstr, placement in block_placements.items(): + typestr = blockstr.split('-')[0] + blocks.append(create_block(typestr, placement, item_shape=item_shape, + num_offset=num_offset,)) + num_offset += len(placement) + + return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]), + [mgr_items] + [np.arange(n) for n in item_shape]) + + + +class TestBlock(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + # self.fblock = get_float_ex() # a,c,e + # self.cblock = get_complex_ex() # + # self.oblock = get_obj_ex() + # self.bool_block = get_bool_ex() + # self.int_block = get_int_ex() + + self.fblock = create_block('float', [0, 2, 4]) + self.cblock = create_block('complex', [7]) + self.oblock = create_block('object', [1, 3]) + self.bool_block = create_block('bool', [5]) + self.int_block = create_block('int', [6]) + + def test_constructor(self): + int32block = create_block('i4', [0]) + self.assertEqual(int32block.dtype, np.int32) + + def test_pickle(self): + import pickle + + def _check(blk): + pickled = pickle.dumps(blk) + unpickled = pickle.loads(pickled) + assert_block_equal(blk, unpickled) + + _check(self.fblock) + _check(self.cblock) + _check(self.oblock) + _check(self.bool_block) + + def test_mgr_locs(self): + assert_almost_equal(self.fblock.mgr_locs, [0, 2, 4]) + + def test_attrs(self): + self.assertEqual(self.fblock.shape, self.fblock.values.shape) + self.assertEqual(self.fblock.dtype, self.fblock.values.dtype) + self.assertEqual(len(self.fblock), len(self.fblock.values)) + + def test_merge(self): + avals = randn(2, 10) + bvals = randn(2, 10) + + ref_cols = Index(['e', 'a', 'b', 'd', 'f']) + + ablock = make_block(avals, + ref_cols.get_indexer(['e', 'b'])) + bblock = make_block(bvals, + ref_cols.get_indexer(['a', 'd'])) + merged = ablock.merge(bblock) + assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3]) + assert_almost_equal(merged.values[[0, 2]], avals) + assert_almost_equal(merged.values[[1, 3]], bvals) + + # TODO: merge with mixed type? 
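+    # Usage sketch of the create_block / create_mgr factories documented
+    # earlier in this file (for exposition only, not part of the test suite):
+    #
+    #   mgr = create_mgr('a,b: f8; c: object; d: i8', item_shape=(N,))
+    #   mgr.nblocks          ->  3   (one block each for f8, object and i8)
+    #   blk = create_block('float', [0, 2, 4])
+    #   blk.mgr_locs         ->  manager locations [0, 2, 4]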
+ + def test_copy(self): + cop = self.fblock.copy() + self.assertIsNot(cop, self.fblock) + assert_block_equal(self.fblock, cop) + + def test_reindex_index(self): + pass + + def test_reindex_cast(self): + pass + + def test_insert(self): + pass + + def test_delete(self): + newb = self.fblock.copy() + newb.delete(0) + assert_almost_equal(newb.mgr_locs, [2, 4]) + self.assertTrue((newb.values[0] == 1).all()) + + newb = self.fblock.copy() + newb.delete(1) + assert_almost_equal(newb.mgr_locs, [0, 4]) + self.assertTrue((newb.values[1] == 2).all()) + + newb = self.fblock.copy() + newb.delete(2) + assert_almost_equal(newb.mgr_locs, [0, 2]) + self.assertTrue((newb.values[1] == 1).all()) + + newb = self.fblock.copy() + self.assertRaises(Exception, newb.delete, 3) + + def test_split_block_at(self): + + # with dup column support this method was taken out + # GH3679 + raise nose.SkipTest("skipping for now") + + bs = list(self.fblock.split_block_at('a')) + self.assertEqual(len(bs), 1) + self.assertTrue(np.array_equal(bs[0].items, ['c', 'e'])) + + bs = list(self.fblock.split_block_at('c')) + self.assertEqual(len(bs), 2) + self.assertTrue(np.array_equal(bs[0].items, ['a'])) + self.assertTrue(np.array_equal(bs[1].items, ['e'])) + + bs = list(self.fblock.split_block_at('e')) + self.assertEqual(len(bs), 1) + self.assertTrue(np.array_equal(bs[0].items, ['a', 'c'])) + + bblock = get_bool_ex(['f']) + bs = list(bblock.split_block_at('f')) + self.assertEqual(len(bs), 0) + + def test_get(self): + pass + + def test_set(self): + pass + + def test_fillna(self): + pass + + def test_repr(self): + pass + + +class TestBlockManager(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.mgr = create_mgr('a: f8; b: object; c: f8; d: object; e: f8;' + 'f: bool; g: i8; h: complex') + + def test_constructor_corner(self): + pass + + def test_attrs(self): + mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2') + self.assertEqual(mgr.nblocks, 2) + self.assertEqual(len(mgr), 6) + + def test_is_mixed_dtype(self): + self.assertFalse(create_mgr('a,b:f8').is_mixed_type) + self.assertFalse(create_mgr('a:f8-1; b:f8-2').is_mixed_type) + + self.assertTrue(create_mgr('a,b:f8; c,d: f4').is_mixed_type) + self.assertTrue(create_mgr('a,b:f8; c,d: object').is_mixed_type) + + def test_is_indexed_like(self): + mgr1 = create_mgr('a,b: f8') + mgr2 = create_mgr('a:i8; b:bool') + mgr3 = create_mgr('a,b,c: f8') + self.assertTrue(mgr1._is_indexed_like(mgr1)) + self.assertTrue(mgr1._is_indexed_like(mgr2)) + self.assertTrue(mgr1._is_indexed_like(mgr3)) + + self.assertFalse(mgr1._is_indexed_like( + mgr1.get_slice(slice(-1), axis=1))) + + def test_duplicate_ref_loc_failure(self): + tmp_mgr = create_mgr('a:bool; a: f8') + + axes, blocks = tmp_mgr.axes, tmp_mgr.blocks + + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([0]) + # test trying to create block manager with overlapping ref locs + self.assertRaises(AssertionError, BlockManager, blocks, axes) + + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([1]) + mgr = BlockManager(blocks, axes) + mgr.iget(1) + + def test_contains(self): + self.assertIn('a', self.mgr) + self.assertNotIn('baz', self.mgr) + + def test_pickle(self): + import pickle + + pickled = pickle.dumps(self.mgr) + mgr2 = pickle.loads(pickled) + + # same result + assert_frame_equal(DataFrame(self.mgr), DataFrame(mgr2)) + + # share ref_items + # self.assertIs(mgr2.blocks[0].ref_items, mgr2.blocks[1].ref_items) + + # GH2431 + self.assertTrue(hasattr(mgr2, "_is_consolidated")) + 
self.assertTrue(hasattr(mgr2, "_known_consolidated")) + + # reset to False on load + self.assertFalse(mgr2._is_consolidated) + self.assertFalse(mgr2._known_consolidated) + + def test_non_unique_pickle(self): + import pickle + mgr = create_mgr('a,a,a:f8') + mgr2 = pickle.loads(pickle.dumps(mgr)) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + mgr = create_mgr('a: f8; a: i8') + mgr2 = pickle.loads(pickle.dumps(mgr)) + assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + + def test_get_scalar(self): + for item in self.mgr.items: + for i, index in enumerate(self.mgr.axes[1]): + res = self.mgr.get_scalar((item, index)) + exp = self.mgr.get(item, fastpath=False)[i] + assert_almost_equal(res, exp) + exp = self.mgr.get(item).values[i] + assert_almost_equal(res, exp) + + def test_get(self): + cols = Index(list('abc')) + values = np.random.rand(3, 3) + block = make_block(values=values.copy(), + placement=np.arange(3)) + mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) + + assert_almost_equal(mgr.get('a', fastpath=False), values[0]) + assert_almost_equal(mgr.get('b', fastpath=False), values[1]) + assert_almost_equal(mgr.get('c', fastpath=False), values[2]) + assert_almost_equal(mgr.get('a').values, values[0]) + assert_almost_equal(mgr.get('b').values, values[1]) + assert_almost_equal(mgr.get('c').values, values[2]) + + def test_set(self): + mgr = create_mgr('a,b,c: int', item_shape=(3,)) + + mgr.set('d', np.array(['foo'] * 3)) + mgr.set('b', np.array(['bar'] * 3)) + assert_almost_equal(mgr.get('a').values, [0] * 3) + assert_almost_equal(mgr.get('b').values, ['bar'] * 3) + assert_almost_equal(mgr.get('c').values, [2] * 3) + assert_almost_equal(mgr.get('d').values, ['foo'] * 3) + + def test_insert(self): + self.mgr.insert(0, 'inserted', np.arange(N)) + + self.assertEqual(self.mgr.items[0], 'inserted') + assert_almost_equal(self.mgr.get('inserted'), np.arange(N)) + + for blk in self.mgr.blocks: + yield self.assertIs, self.mgr.items, blk.ref_items + + def test_set_change_dtype(self): + self.mgr.set('baz', np.zeros(N, dtype=bool)) + + self.mgr.set('baz', np.repeat('foo', N)) + self.assertEqual(self.mgr.get('baz').dtype, np.object_) + + mgr2 = self.mgr.consolidate() + mgr2.set('baz', np.repeat('foo', N)) + self.assertEqual(mgr2.get('baz').dtype, np.object_) + + mgr2.set('quux', randn(N).astype(int)) + self.assertEqual(mgr2.get('quux').dtype, np.int_) + + mgr2.set('quux', randn(N)) + self.assertEqual(mgr2.get('quux').dtype, np.float_) + + def test_copy(self): + shallow = self.mgr.copy(deep=False) + + # we don't guaranteee block ordering + for blk in self.mgr.blocks: + found = False + for cp_blk in shallow.blocks: + if cp_blk.values is blk.values: + found = True + break + self.assertTrue(found) + + def test_sparse(self): + mgr = create_mgr('a: sparse-1; b: sparse-2') + + # what to test here? + self.assertEqual(mgr.as_matrix().dtype, np.float64) + + def test_sparse_mixed(self): + mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8') + self.assertEqual(len(mgr.blocks), 3) + self.assertIsInstance(mgr, BlockManager) + + # what to test here? 
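+    # Note (illustrative only): when blocks of different dtypes coexist,
+    # as_matrix() interleaves them into one ndarray and upcasts to a common
+    # dtype, e.g.
+    #   create_mgr('c: f4; d: f2').as_matrix().dtype         ->  float32
+    #   create_mgr('c: f4; d: f2; e: f8').as_matrix().dtype  ->  float64
+    # which is what the as_matrix tests below assert.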
+ + def test_as_matrix_float(self): + mgr = create_mgr('c: f4; d: f2; e: f8') + self.assertEqual(mgr.as_matrix().dtype, np.float64) + + mgr = create_mgr('c: f4; d: f2') + self.assertEqual(mgr.as_matrix().dtype, np.float32) + + def test_as_matrix_int_bool(self): + mgr = create_mgr('a: bool-1; b: bool-2') + self.assertEqual(mgr.as_matrix().dtype, np.bool_) + + mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1') + self.assertEqual(mgr.as_matrix().dtype, np.int64) + + mgr = create_mgr('c: i4; d: i2; e: u1') + self.assertEqual(mgr.as_matrix().dtype, np.int32) + + def test_as_matrix_datetime(self): + mgr = create_mgr('h: datetime-1; g: datetime-2') + self.assertEqual(mgr.as_matrix().dtype, 'M8[ns]') + + def test_astype(self): + # coerce all + mgr = create_mgr('c: f4; d: f2; e: f8') + for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + t = np.dtype(t) + tmgr = mgr.astype(t) + self.assertEqual(tmgr.get('c').dtype.type, t) + self.assertEqual(tmgr.get('d').dtype.type, t) + self.assertEqual(tmgr.get('e').dtype.type, t) + + # mixed + mgr = create_mgr('a,b: object; c: bool; d: datetime;' + 'e: f4; f: f2; g: f8') + for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + t = np.dtype(t) + tmgr = mgr.astype(t, raise_on_error=False) + self.assertEqual(tmgr.get('c').dtype.type, t) + self.assertEqual(tmgr.get('e').dtype.type, t) + self.assertEqual(tmgr.get('f').dtype.type, t) + self.assertEqual(tmgr.get('g').dtype.type, t) + + self.assertEqual(tmgr.get('a').dtype.type, np.object_) + self.assertEqual(tmgr.get('b').dtype.type, np.object_) + if t != np.int64: + self.assertEqual(tmgr.get('d').dtype.type, np.datetime64) + else: + self.assertEqual(tmgr.get('d').dtype.type, t) + + def test_convert(self): + def _compare(old_mgr, new_mgr): + """ compare the blocks, numeric compare ==, object don't """ + old_blocks = set(old_mgr.blocks) + new_blocks = set(new_mgr.blocks) + self.assertEqual(len(old_blocks), len(new_blocks)) + + # compare non-numeric + for b in old_blocks: + found = False + for nb in new_blocks: + if (b.values == nb.values).all(): + found = True + break + self.assertTrue(found) + + for b in new_blocks: + found = False + for ob in old_blocks: + if (b.values == ob.values).all(): + found = True + break + self.assertTrue(found) + + # noops + mgr = create_mgr('f: i8; g: f8') + new_mgr = mgr.convert() + _compare(mgr,new_mgr) + + mgr = create_mgr('a, b: object; f: i8; g: f8') + new_mgr = mgr.convert() + _compare(mgr,new_mgr) + + # convert + mgr = create_mgr('a,b,foo: object; f: i8; g: f8') + mgr.set('a', np.array(['1'] * N, dtype=np.object_)) + mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) + mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + new_mgr = mgr.convert(convert_numeric=True) + self.assertEqual(new_mgr.get('a').dtype, np.int64) + self.assertEqual(new_mgr.get('b').dtype, np.float64) + self.assertEqual(new_mgr.get('foo').dtype, np.object_) + self.assertEqual(new_mgr.get('f').dtype, np.int64) + self.assertEqual(new_mgr.get('g').dtype, np.float64) + + mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;' + 'i: i8; g: f8; h: f2') + mgr.set('a', np.array(['1'] * N, dtype=np.object_)) + mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) + mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + new_mgr = mgr.convert(convert_numeric=True) + self.assertEqual(new_mgr.get('a').dtype, np.int64) + self.assertEqual(new_mgr.get('b').dtype, np.float64) + self.assertEqual(new_mgr.get('foo').dtype, np.object_) + self.assertEqual(new_mgr.get('f').dtype, 
np.int32) + self.assertEqual(new_mgr.get('bool').dtype, np.bool_) + self.assertEqual(new_mgr.get('dt').dtype.type, np.datetime64) + self.assertEqual(new_mgr.get('i').dtype, np.int64) + self.assertEqual(new_mgr.get('g').dtype, np.float64) + self.assertEqual(new_mgr.get('h').dtype, np.float16) + + def test_interleave(self): + pass + + def test_interleave_non_unique_cols(self): + df = DataFrame([ + [Timestamp('20130101'), 3.5], + [Timestamp('20130102'), 4.5]], + columns=['x', 'x'], + index=[1, 2]) + + df_unique = df.copy() + df_unique.columns = ['x', 'y'] + np.testing.assert_array_equal(df_unique.values, df.values) + + def test_consolidate(self): + pass + + def test_consolidate_ordering_issues(self): + self.mgr.set('f', randn(N)) + self.mgr.set('d', randn(N)) + self.mgr.set('b', randn(N)) + self.mgr.set('g', randn(N)) + self.mgr.set('h', randn(N)) + + cons = self.mgr.consolidate() + self.assertEqual(cons.nblocks, 1) + assert_almost_equal(cons.blocks[0].mgr_locs, + np.arange(len(cons.items))) + + def test_reindex_index(self): + pass + + def test_reindex_items(self): + # mgr is not consolidated, f8 & f8-2 blocks + mgr = create_mgr('a: f8; b: i8; c: f8; d: i8; e: f8;' + 'f: bool; g: f8-2') + + reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0) + self.assertEqual(reindexed.nblocks, 2) + assert_almost_equal(reindexed.items, ['g', 'c', 'a', 'd']) + assert_almost_equal(mgr.get('g',fastpath=False), reindexed.get('g',fastpath=False)) + assert_almost_equal(mgr.get('c',fastpath=False), reindexed.get('c',fastpath=False)) + assert_almost_equal(mgr.get('a',fastpath=False), reindexed.get('a',fastpath=False)) + assert_almost_equal(mgr.get('d',fastpath=False), reindexed.get('d',fastpath=False)) + assert_almost_equal(mgr.get('g').values, reindexed.get('g').values) + assert_almost_equal(mgr.get('c').values, reindexed.get('c').values) + assert_almost_equal(mgr.get('a').values, reindexed.get('a').values) + assert_almost_equal(mgr.get('d').values, reindexed.get('d').values) + + def test_multiindex_xs(self): + mgr = create_mgr('a,b,c: f8; d,e,f: i8') + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + + mgr.set_axis(1, index) + result = mgr.xs('bar', axis=1) + self.assertEqual(result.shape, (6, 2)) + self.assertEqual(result.axes[1][0], ('bar', 'one')) + self.assertEqual(result.axes[1][1], ('bar', 'two')) + + def test_get_numeric_data(self): + mgr = create_mgr('int: int; float: float; complex: complex;' + 'str: object; bool: bool; obj: object; dt: datetime', + item_shape=(3,)) + mgr.set('obj', np.array([1, 2, 3], dtype=np.object_)) + + numeric = mgr.get_numeric_data() + assert_almost_equal(numeric.items, ['int', 'float', 'complex', 'bool']) + assert_almost_equal(mgr.get('float',fastpath=False), numeric.get('float',fastpath=False)) + assert_almost_equal(mgr.get('float').values, numeric.get('float').values) + + # Check sharing + numeric.set('float', np.array([100., 200., 300.])) + assert_almost_equal(mgr.get('float',fastpath=False), np.array([100., 200., 300.])) + assert_almost_equal(mgr.get('float').values, np.array([100., 200., 300.])) + + numeric2 = mgr.get_numeric_data(copy=True) + assert_almost_equal(numeric.items, ['int', 'float', 'complex', 'bool']) + numeric2.set('float', np.array([1000., 2000., 3000.])) + assert_almost_equal(mgr.get('float',fastpath=False), np.array([100., 200., 300.])) + assert_almost_equal(mgr.get('float').values, np.array([100., 200., 
300.])) + + def test_get_bool_data(self): + mgr = create_mgr('int: int; float: float; complex: complex;' + 'str: object; bool: bool; obj: object; dt: datetime', + item_shape=(3,)) + mgr.set('obj', np.array([True, False, True], dtype=np.object_)) + + bools = mgr.get_bool_data() + assert_almost_equal(bools.items, ['bool']) + assert_almost_equal(mgr.get('bool',fastpath=False), bools.get('bool',fastpath=False)) + assert_almost_equal(mgr.get('bool').values, bools.get('bool').values) + + bools.set('bool', np.array([True, False, True])) + assert_almost_equal(mgr.get('bool',fastpath=False), [True, False, True]) + assert_almost_equal(mgr.get('bool').values, [True, False, True]) + + # Check sharing + bools2 = mgr.get_bool_data(copy=True) + bools2.set('bool', np.array([False, True, False])) + assert_almost_equal(mgr.get('bool',fastpath=False), [True, False, True]) + assert_almost_equal(mgr.get('bool').values, [True, False, True]) + + def test_unicode_repr_doesnt_raise(self): + str_repr = repr(create_mgr(u('b,\u05d0: object'))) + + def test_missing_unicode_key(self): + df = DataFrame({"a": [1]}) + try: + df.ix[:, u("\u05d0")] # should not raise UnicodeEncodeError + except KeyError: + pass # this is the expected exception + + def test_equals(self): + # unique items + bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) + self.assertTrue(bm1.equals(bm2)) + + bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2') + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) + self.assertTrue(bm1.equals(bm2)) + + def test_single_mgr_ctor(self): + mgr = create_single_mgr('f8', num_rows=5) + self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.]) + + +class TestIndexing(object): + # Nosetests-style data-driven tests. + # + # This test applies different indexing routines to block managers and + # compares the outcome to the result of same operations on np.ndarray. + # + # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests + # and are disabled. 
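+    # Rough outline of the comparison pattern described above (a sketch using
+    # the create_mgr helper from this file; the real checks are the generator
+    # tests further down):
+    #
+    #   mgr    = create_mgr('a,b: f8', item_shape=(N,))
+    #   mat    = mgr.as_matrix()
+    #   sliced = mgr.get_slice(slice(1, 4), axis=1)
+    #   assert_almost_equal(mat[:, 1:4], sliced.as_matrix())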
+ + MANAGERS = [ + create_single_mgr('f8', N), + create_single_mgr('i8', N), + #create_single_mgr('sparse', N), + create_single_mgr('sparse_na', N), + + # 2-dim + create_mgr('a,b,c,d,e,f: f8', item_shape=(N,)), + create_mgr('a,b,c,d,e,f: i8', item_shape=(N,)), + create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N,)), + create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N,)), + #create_mgr('a: sparse', item_shape=(N,)), + create_mgr('a: sparse_na', item_shape=(N,)), + + # 3-dim + create_mgr('a,b,c,d,e,f: f8', item_shape=(N, N)), + create_mgr('a,b,c,d,e,f: i8', item_shape=(N, N)), + create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N, N)), + create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N, N)), + # create_mgr('a: sparse', item_shape=(1, N)), + ] + + # MANAGERS = [MANAGERS[6]] + + def test_get_slice(self): + def assert_slice_ok(mgr, axis, slobj): + # import pudb; pudb.set_trace() + mat = mgr.as_matrix() + sliced = mgr.get_slice(slobj, axis=axis) + mat_slobj = (slice(None),) * axis + (slobj,) + assert_almost_equal(mat[mat_slobj], sliced.as_matrix()) + assert_almost_equal(mgr.axes[axis][slobj], sliced.axes[axis]) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + # slice + yield assert_slice_ok, mgr, ax, slice(None) + yield assert_slice_ok, mgr, ax, slice(3) + yield assert_slice_ok, mgr, ax, slice(100) + yield assert_slice_ok, mgr, ax, slice(1, 4) + yield assert_slice_ok, mgr, ax, slice(3, 0, -2) + + # boolean mask + yield assert_slice_ok, mgr, ax, np.array([], dtype=np.bool_) + yield (assert_slice_ok, mgr, ax, + np.ones(mgr.shape[ax], dtype=np.bool_)) + yield (assert_slice_ok, mgr, ax, + np.zeros(mgr.shape[ax], dtype=np.bool_)) + + if mgr.shape[ax] >= 3: + yield (assert_slice_ok, mgr, ax, + np.arange(mgr.shape[ax]) % 3 == 0) + yield (assert_slice_ok, mgr, ax, + np.array([True, True, False], dtype=np.bool_)) + + # fancy indexer + yield assert_slice_ok, mgr, ax, [] + yield assert_slice_ok, mgr, ax, lrange(mgr.shape[ax]) + + if mgr.shape[ax] >= 3: + yield assert_slice_ok, mgr, ax, [0, 1, 2] + yield assert_slice_ok, mgr, ax, [-1, -2, -3] + + def test_take(self): + def assert_take_ok(mgr, axis, indexer): + mat = mgr.as_matrix() + taken = mgr.take(indexer, axis) + assert_almost_equal(np.take(mat, indexer, axis), + taken.as_matrix()) + assert_almost_equal(mgr.axes[axis].take(indexer), + taken.axes[axis]) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + # take/fancy indexer + yield assert_take_ok, mgr, ax, [] + yield assert_take_ok, mgr, ax, [0, 0, 0] + yield assert_take_ok, mgr, ax, lrange(mgr.shape[ax]) + + if mgr.shape[ax] >= 3: + yield assert_take_ok, mgr, ax, [0, 1, 2] + yield assert_take_ok, mgr, ax, [-1, -2, -3] + + def test_reindex_axis(self): + def assert_reindex_axis_is_ok(mgr, axis, new_labels, + fill_value): + mat = mgr.as_matrix() + indexer = mgr.axes[axis].get_indexer_for(new_labels) + + reindexed = mgr.reindex_axis(new_labels, axis, + fill_value=fill_value) + assert_almost_equal(com.take_nd(mat, indexer, axis, + fill_value=fill_value), + reindexed.as_matrix()) + assert_almost_equal(reindexed.axes[axis], new_labels) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + for fill_value in (None, np.nan, 100.): + yield assert_reindex_axis_is_ok, mgr, ax, [], fill_value + yield (assert_reindex_axis_is_ok, mgr, ax, + mgr.axes[ax], fill_value) + yield (assert_reindex_axis_is_ok, mgr, ax, + mgr.axes[ax][[0, 0, 0]], fill_value) + yield (assert_reindex_axis_is_ok, mgr, ax, + ['foo', 'bar', 'baz'], fill_value) + yield (assert_reindex_axis_is_ok, mgr, 
ax, + ['foo', mgr.axes[ax][0], 'baz'], fill_value) + + if mgr.shape[ax] >= 3: + yield (assert_reindex_axis_is_ok, mgr, ax, + mgr.axes[ax][:-3], fill_value) + yield (assert_reindex_axis_is_ok, mgr, ax, + mgr.axes[ax][-3::-1], fill_value) + yield (assert_reindex_axis_is_ok, mgr, ax, + mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value) + + def test_reindex_indexer(self): + def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, + fill_value): + mat = mgr.as_matrix() + reindexed_mat = com.take_nd(mat, indexer, axis, + fill_value=fill_value) + reindexed = mgr.reindex_indexer(new_labels, indexer, axis, + fill_value=fill_value) + assert_almost_equal(reindexed_mat, reindexed.as_matrix()) + assert_almost_equal(reindexed.axes[axis], new_labels) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + for fill_value in (None, np.nan, 100.): + yield (assert_reindex_indexer_is_ok, mgr, ax, + [], [], fill_value) + yield (assert_reindex_indexer_is_ok, mgr, ax, + mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value) + yield (assert_reindex_indexer_is_ok, mgr, ax, + ['foo'] * mgr.shape[ax], np.arange(mgr.shape[ax]), + fill_value) + + yield (assert_reindex_indexer_is_ok, mgr, ax, + mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), + fill_value) + yield (assert_reindex_indexer_is_ok, mgr, ax, + mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], + fill_value) + yield (assert_reindex_indexer_is_ok, mgr, ax, + ['foo', 'bar', 'baz'], [0, 0, 0], fill_value) + yield (assert_reindex_indexer_is_ok, mgr, ax, + ['foo', 'bar', 'baz'], [-1, 0, -1], fill_value) + yield (assert_reindex_indexer_is_ok, mgr, ax, + ['foo', mgr.axes[ax][0], 'baz'], [-1, -1, -1], + fill_value) + + if mgr.shape[ax] >= 3: + yield (assert_reindex_indexer_is_ok, mgr, ax, + ['foo', 'bar', 'baz'], [0, 1, 2], fill_value) + + + # test_get_slice(slice_like, axis) + # take(indexer, axis) + # reindex_axis(new_labels, axis) + # reindex_indexer(new_labels, indexer, axis) + + +class TestBlockPlacement(tm.TestCase): + _multiprocess_can_split_ = True + + def test_slice_len(self): + self.assertEqual(len(BlockPlacement(slice(0, 4))), 4) + self.assertEqual(len(BlockPlacement(slice(0, 4, 2))), 2) + self.assertEqual(len(BlockPlacement(slice(0, 3, 2))), 2) + + self.assertEqual(len(BlockPlacement(slice(0, 1, 2))), 1) + self.assertEqual(len(BlockPlacement(slice(1, 0, -1))), 1) + + def test_zero_step_raises(self): + self.assertRaises(ValueError, BlockPlacement, slice(1, 1, 0)) + self.assertRaises(ValueError, BlockPlacement, slice(1, 2, 0)) + + def test_unbounded_slice_raises(self): + def assert_unbounded_slice_error(slc): + # assertRaisesRegexp is not available in py2.6 + # self.assertRaisesRegexp(ValueError, "unbounded slice", + # lambda: BlockPlacement(slc)) + self.assertRaises(ValueError, BlockPlacement, slc) + + assert_unbounded_slice_error(slice(None, None)) + assert_unbounded_slice_error(slice(10, None)) + assert_unbounded_slice_error(slice(None, None, -1)) + assert_unbounded_slice_error(slice(None, 10, -1)) + + # These are "unbounded" because negative index will change depending on + # container shape. 
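+        # Illustrative note (not in the original comment): slice(-1, None)
+        # means "the last element" of whatever axis it is applied to, so the
+        # absolute positions it resolves to differ between a length-3 and a
+        # length-10 container; BlockPlacement therefore refuses to fix them
+        # into concrete locations and raises ValueError, as checked below.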
+ assert_unbounded_slice_error(slice(-1, None)) + assert_unbounded_slice_error(slice(None, -1)) + assert_unbounded_slice_error(slice(-1, -1)) + assert_unbounded_slice_error(slice(-1, None, -1)) + assert_unbounded_slice_error(slice(None, -1, -1)) + assert_unbounded_slice_error(slice(-1, -1, -1)) + + def test_not_slice_like_slices(self): + def assert_not_slice_like(slc): + self.assertTrue(not BlockPlacement(slc).is_slice_like) + + assert_not_slice_like(slice(0, 0)) + assert_not_slice_like(slice(100, 0)) + + assert_not_slice_like(slice(100, 100, -1)) + assert_not_slice_like(slice(0, 100, -1)) + + self.assertTrue(not BlockPlacement(slice(0, 0)).is_slice_like) + self.assertTrue(not BlockPlacement(slice(100, 100)).is_slice_like) + + def test_array_to_slice_conversion(self): + def assert_as_slice_equals(arr, slc): + self.assertEqual(BlockPlacement(arr).as_slice, slc) + + assert_as_slice_equals([0], slice(0, 1, 1)) + assert_as_slice_equals([100], slice(100, 101, 1)) + + assert_as_slice_equals([0, 1, 2], slice(0, 3, 1)) + assert_as_slice_equals([0, 5, 10], slice(0, 15, 5)) + assert_as_slice_equals([0, 100], slice(0, 200, 100)) + + assert_as_slice_equals([2, 1], slice(2, 0, -1)) + assert_as_slice_equals([2, 1, 0], slice(2, None, -1)) + assert_as_slice_equals([100, 0], slice(100, None, -100)) + + def test_not_slice_like_arrays(self): + def assert_not_slice_like(arr): + self.assertTrue(not BlockPlacement(arr).is_slice_like) + + assert_not_slice_like([]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, -2, -3]) + assert_not_slice_like([-10]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, 0, 1, 2]) + assert_not_slice_like([-2, 0, 2, 4]) + assert_not_slice_like([1, 0, -1]) + assert_not_slice_like([1, 1, 1]) + + def test_slice_iter(self): + self.assertEqual(list(BlockPlacement(slice(0, 3))), [0, 1, 2]) + self.assertEqual(list(BlockPlacement(slice(0, 0))), []) + self.assertEqual(list(BlockPlacement(slice(3, 0))), []) + + self.assertEqual(list(BlockPlacement(slice(3, 0, -1))), [3, 2, 1]) + self.assertEqual(list(BlockPlacement(slice(3, None, -1))), + [3, 2, 1, 0]) + + def test_slice_to_array_conversion(self): + def assert_as_array_equals(slc, asarray): + np.testing.assert_array_equal( + BlockPlacement(slc).as_array, + np.asarray(asarray)) + + assert_as_array_equals(slice(0, 3), [0, 1, 2]) + assert_as_array_equals(slice(0, 0), []) + assert_as_array_equals(slice(3, 0), []) + + assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) + assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0]) + assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1]) + + def test_blockplacement_add(self): + bpl = BlockPlacement(slice(0, 5)) + self.assertEqual(bpl.add(1).as_slice, slice(1, 6, 1)) + self.assertEqual(bpl.add(np.arange(5)).as_slice, + slice(0, 10, 2)) + self.assertEqual(list(bpl.add(np.arange(5, 0, -1))), + [5, 5, 5, 5, 5]) + + def test_blockplacement_add_int(self): + def assert_add_equals(val, inc, result): + self.assertEqual(list(BlockPlacement(val).add(inc)), + result) + + assert_add_equals(slice(0, 0), 0, []) + assert_add_equals(slice(1, 4), 0, [1, 2, 3]) + assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1]) + assert_add_equals(slice(2, None, -1), 0, [2, 1, 0]) + assert_add_equals([1, 2, 4], 0, [1, 2, 4]) + + assert_add_equals(slice(0, 0), 10, []) + assert_add_equals(slice(1, 4), 10, [11, 12, 13]) + assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11]) + assert_add_equals(slice(2, None, -1), 10, [12, 11, 10]) + assert_add_equals([1, 2, 4], 10, [11, 12, 14]) + + assert_add_equals(slice(0, 0), -1, 
[]) + assert_add_equals(slice(1, 4), -1, [0, 1, 2]) + assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0]) + assert_add_equals([1, 2, 4], -1, [0, 1, 3]) + + self.assertRaises(ValueError, + lambda: BlockPlacement(slice(1, 4)).add(-10)) + self.assertRaises(ValueError, + lambda: BlockPlacement([1, 2, 4]).add(-10)) + self.assertRaises(ValueError, + lambda: BlockPlacement(slice(2, None, -1)).add(-1)) + + # def test_blockplacement_array_add(self): + + # assert_add_equals(slice(0, 2), [0, 1, 1], [0, 2, 3]) + # assert_add_equals(slice(2, None, -1), [1, 1, 0], [3, 2, 0]) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_msgpack/__init__.py b/pandas/tests/test_msgpack/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tests/test_msgpack/test_buffer.py b/pandas/tests/test_msgpack/test_buffer.py new file mode 100644 index 00000000..940b6540 --- /dev/null +++ b/pandas/tests/test_msgpack/test_buffer.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# coding: utf-8 + +from pandas.msgpack import packb, unpackb + + +def test_unpack_buffer(): + from array import array + buf = array('b') + buf.fromstring(packb(('foo', 'bar'))) + obj = unpackb(buf, use_list=1) + assert [b'foo', b'bar'] == obj diff --git a/pandas/tests/test_msgpack/test_case.py b/pandas/tests/test_msgpack/test_case.py new file mode 100644 index 00000000..e78456b2 --- /dev/null +++ b/pandas/tests/test_msgpack/test_case.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# coding: utf-8 + +from pandas.msgpack import packb, unpackb + + +def check(length, obj): + v = packb(obj) + assert len(v) == length, \ + "%r length should be %r but get %r" % (obj, length, len(v)) + assert unpackb(v, use_list=0) == obj + +def test_1(): + for o in [None, True, False, 0, 1, (1 << 6), (1 << 7) - 1, -1, + -((1<<5)-1), -(1<<5)]: + check(1, o) + +def test_2(): + for o in [1 << 7, (1 << 8) - 1, + -((1<<5)+1), -(1<<7) + ]: + check(2, o) + +def test_3(): + for o in [1 << 8, (1 << 16) - 1, + -((1<<7)+1), -(1<<15)]: + check(3, o) + +def test_5(): + for o in [1 << 16, (1 << 32) - 1, + -((1<<15)+1), -(1<<31)]: + check(5, o) + +def test_9(): + for o in [1 << 32, (1 << 64) - 1, + -((1<<31)+1), -(1<<63), + 1.0, 0.1, -0.1, -1.0]: + check(9, o) + + +def check_raw(overhead, num): + check(num + overhead, b" " * num) + +def test_fixraw(): + check_raw(1, 0) + check_raw(1, (1<<5) - 1) + +def test_raw16(): + check_raw(3, 1<<5) + check_raw(3, (1<<16) - 1) + +def test_raw32(): + check_raw(5, 1<<16) + + +def check_array(overhead, num): + check(num + overhead, (None,) * num) + +def test_fixarray(): + check_array(1, 0) + check_array(1, (1 << 4) - 1) + +def test_array16(): + check_array(3, 1 << 4) + check_array(3, (1<<16)-1) + +def test_array32(): + check_array(5, (1<<16)) + + +def match(obj, buf): + assert packb(obj) == buf + assert unpackb(buf, use_list=0) == obj + +def test_match(): + cases = [ + (None, b'\xc0'), + (False, b'\xc2'), + (True, b'\xc3'), + (0, b'\x00'), + (127, b'\x7f'), + (128, b'\xcc\x80'), + (256, b'\xcd\x01\x00'), + (-1, b'\xff'), + (-33, b'\xd0\xdf'), + (-129, b'\xd1\xff\x7f'), + ({1:1}, b'\x81\x01\x01'), + (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"), + ((), b'\x90'), + (tuple(range(15)),b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e"), + (tuple(range(16)),b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), + ({}, b'\x80'), + (dict([(x,x) for x in range(15)]), 
b'\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e'), + (dict([(x,x) for x in range(16)]), b'\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e\x0f\x0f'), + ] + + for v, p in cases: + match(v, p) + +def test_unicode(): + assert unpackb(packb('foobar'), use_list=1) == b'foobar' diff --git a/pandas/tests/test_msgpack/test_except.py b/pandas/tests/test_msgpack/test_except.py new file mode 100644 index 00000000..a0239336 --- /dev/null +++ b/pandas/tests/test_msgpack/test_except.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# coding: utf-8 + +import unittest +import nose + +import datetime +from pandas.msgpack import packb, unpackb + +class DummyException(Exception): + pass + +class TestExceptions(unittest.TestCase): + + def test_raise_on_find_unsupported_value(self): + import datetime + self.assertRaises(TypeError, packb, datetime.datetime.now()) + + def test_raise_from_object_hook(self): + def hook(obj): + raise DummyException + self.assertRaises(DummyException, unpackb, packb({}), object_hook=hook) + self.assertRaises(DummyException, unpackb, packb({'fizz': 'buzz'}), object_hook=hook) + self.assertRaises(DummyException, unpackb, packb({'fizz': 'buzz'}), object_pairs_hook=hook) + self.assertRaises(DummyException, unpackb, packb({'fizz': {'buzz': 'spam'}}), object_hook=hook) + self.assertRaises(DummyException, unpackb, packb({'fizz': {'buzz': 'spam'}}), object_pairs_hook=hook) + + def test_invalidvalue(self): + self.assertRaises(ValueError, unpackb, b'\xd9\x97#DL_') diff --git a/pandas/tests/test_msgpack/test_format.py b/pandas/tests/test_msgpack/test_format.py new file mode 100644 index 00000000..a3a3afd0 --- /dev/null +++ b/pandas/tests/test_msgpack/test_format.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# coding: utf-8 + +from pandas.msgpack import unpackb + +def check(src, should, use_list=0): + assert unpackb(src, use_list=use_list) == should + +def testSimpleValue(): + check(b"\x93\xc0\xc2\xc3", + (None, False, True,)) + +def testFixnum(): + check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff", + ((0,64,127,), (-32,-16,-1,),) + ) + +def testFixArray(): + check(b"\x92\x90\x91\x91\xc0", + ((),((None,),),), + ) + +def testFixRaw(): + check(b"\x94\xa0\xa1a\xa2bc\xa3def", + (b"", b"a", b"bc", b"def",), + ) + +def testFixMap(): + check( + b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80", + {False: {None: None}, True:{None:{}}}, + ) + +def testUnsignedInt(): + check( + b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00" + b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00" + b"\xce\xff\xff\xff\xff", + (0, 128, 255, 0, 32768, 65535, 0, 2147483648, 4294967295,), + ) + +def testSignedInt(): + check(b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00" + b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00" + b"\xd2\xff\xff\xff\xff", + (0, -128, -1, 0, -32768, -1, 0, -2147483648, -1,)) + +def testRaw(): + check(b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00" + b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab", + (b"", b"a", b"ab", b"", b"a", b"ab")) + +def testArray(): + check(b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00" + b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02" + b"\xc2\xc3", + ((), (None,), (False,True), (), (None,), (False,True)) + ) + +def testMap(): + check( + b"\x96" + b"\xde\x00\x00" + b"\xde\x00\x01\xc0\xc2" + b"\xde\x00\x02\xc0\xc2\xc3\xc2" + b"\xdf\x00\x00\x00\x00" + b"\xdf\x00\x00\x00\x01\xc0\xc2" + 
b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2", + ({}, {None: False}, {True: False, None: False}, {}, + {None: False}, {True: False, None: False})) diff --git a/pandas/tests/test_msgpack/test_obj.py b/pandas/tests/test_msgpack/test_obj.py new file mode 100644 index 00000000..4a018bc8 --- /dev/null +++ b/pandas/tests/test_msgpack/test_obj.py @@ -0,0 +1,71 @@ +# coding: utf-8 + +import unittest +import nose + +import datetime +from pandas.msgpack import packb, unpackb + +class DecodeError(Exception): + pass + +class TestObj(unittest.TestCase): + + def _arr_to_str(self, arr): + return ''.join(str(c) for c in arr) + + def bad_complex_decoder(self, o): + raise DecodeError("Ooops!") + + def _decode_complex(self, obj): + if b'__complex__' in obj: + return complex(obj[b'real'], obj[b'imag']) + return obj + + def _encode_complex(self, obj): + if isinstance(obj, complex): + return {b'__complex__': True, b'real': 1, b'imag': 2} + return obj + + def test_encode_hook(self): + packed = packb([3, 1+2j], default=self._encode_complex) + unpacked = unpackb(packed, use_list=1) + assert unpacked[1] == {b'__complex__': True, b'real': 1, b'imag': 2} + + def test_decode_hook(self): + packed = packb([3, {b'__complex__': True, b'real': 1, b'imag': 2}]) + unpacked = unpackb(packed, object_hook=self._decode_complex, use_list=1) + assert unpacked[1] == 1+2j + + def test_decode_pairs_hook(self): + packed = packb([3, {1: 2, 3: 4}]) + prod_sum = 1 * 2 + 3 * 4 + unpacked = unpackb(packed, object_pairs_hook=lambda l: sum(k * v for k, v in l), use_list=1) + assert unpacked[1] == prod_sum + + def test_only_one_obj_hook(self): + self.assertRaises(ValueError, unpackb, b'', object_hook=lambda x: x, object_pairs_hook=lambda x: x) + + def test_bad_hook(self): + def f(): + packed = packb([3, 1+2j], default=lambda o: o) + unpacked = unpackb(packed, use_list=1) + self.assertRaises(ValueError, f) + + def test_array_hook(self): + packed = packb([1,2,3]) + unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1) + assert unpacked == '123' + + def test_an_exception_in_objecthook1(self): + def f(): + packed = packb({1: {'__complex__': True, 'real': 1, 'imag': 2}}) + unpackb(packed, object_hook=self.bad_complex_decoder) + self.assertRaises(DecodeError, f) + + + def test_an_exception_in_objecthook2(self): + def f(): + packed = packb({1: [{'__complex__': True, 'real': 1, 'imag': 2}]}) + unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1) + self.assertRaises(DecodeError, f) diff --git a/pandas/tests/test_msgpack/test_pack.py b/pandas/tests/test_msgpack/test_pack.py new file mode 100644 index 00000000..22df6df5 --- /dev/null +++ b/pandas/tests/test_msgpack/test_pack.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +# coding: utf-8 + +import unittest +import nose + +import struct +from pandas import compat +from pandas.compat import u, OrderedDict +from pandas.msgpack import packb, unpackb, Unpacker, Packer + +class TestPack(unittest.TestCase): + + def check(self, data, use_list=False): + re = unpackb(packb(data), use_list=use_list) + assert re == data + + def testPack(self): + test_data = [ + 0, 1, 127, 128, 255, 256, 65535, 65536, + -1, -32, -33, -128, -129, -32768, -32769, + 1.0, + b"", b"a", b"a"*31, b"a"*32, + None, True, False, + (), ((),), ((), None,), + {None: 0}, + (1<<23), + ] + for td in test_data: + self.check(td) + + def testPackUnicode(self): + test_data = [ + u(""), u("abcd"), [u("defgh")], u("Русский текст"), + ] + for td in test_data: + re = unpackb(packb(td, encoding='utf-8'), use_list=1, encoding='utf-8') + assert 
re == td + packer = Packer(encoding='utf-8') + data = packer.pack(td) + re = Unpacker(compat.BytesIO(data), encoding='utf-8', use_list=1).unpack() + assert re == td + + def testPackUTF32(self): + test_data = [ + compat.u(""), + compat.u("abcd"), + [compat.u("defgh")], + compat.u("Русский текст"), + ] + for td in test_data: + re = unpackb(packb(td, encoding='utf-32'), use_list=1, encoding='utf-32') + assert re == td + + def testPackBytes(self): + test_data = [ + b"", b"abcd", (b"defgh",), + ] + for td in test_data: + self.check(td) + + def testIgnoreUnicodeErrors(self): + re = unpackb(packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', use_list=1) + assert re == "abcdef" + + def testStrictUnicodeUnpack(self): + self.assertRaises(UnicodeDecodeError, unpackb, packb(b'abc\xeddef'), encoding='utf-8', use_list=1) + + def testStrictUnicodePack(self): + self.assertRaises(UnicodeEncodeError, packb, compat.u("abc\xeddef"), encoding='ascii', unicode_errors='strict') + + def testIgnoreErrorsPack(self): + re = unpackb(packb(compat.u("abcФФФdef"), encoding='ascii', unicode_errors='ignore'), encoding='utf-8', use_list=1) + assert re == compat.u("abcdef") + + def testNoEncoding(self): + self.assertRaises(TypeError, packb, compat.u("abc"), encoding=None) + + def testDecodeBinary(self): + re = unpackb(packb("abc"), encoding=None, use_list=1) + assert re == b"abc" + + def testPackFloat(self): + assert packb(1.0, use_single_float=True) == b'\xca' + struct.pack('>f', 1.0) + assert packb(1.0, use_single_float=False) == b'\xcb' + struct.pack('>d', 1.0) + + def testArraySize(self, sizes=[0, 5, 50, 1000]): + bio = compat.BytesIO() + packer = Packer() + for size in sizes: + bio.write(packer.pack_array_header(size)) + for i in range(size): + bio.write(packer.pack(i)) + + bio.seek(0) + unpacker = Unpacker(bio, use_list=1) + for size in sizes: + assert unpacker.unpack() == list(range(size)) + + def test_manualreset(self, sizes=[0, 5, 50, 1000]): + packer = Packer(autoreset=False) + for size in sizes: + packer.pack_array_header(size) + for i in range(size): + packer.pack(i) + + bio = compat.BytesIO(packer.bytes()) + unpacker = Unpacker(bio, use_list=1) + for size in sizes: + assert unpacker.unpack() == list(range(size)) + + packer.reset() + assert packer.bytes() == b'' + + def testMapSize(self, sizes=[0, 5, 50, 1000]): + bio = compat.BytesIO() + packer = Packer() + for size in sizes: + bio.write(packer.pack_map_header(size)) + for i in range(size): + bio.write(packer.pack(i)) # key + bio.write(packer.pack(i * 2)) # value + + bio.seek(0) + unpacker = Unpacker(bio) + for size in sizes: + assert unpacker.unpack() == dict((i, i * 2) for i in range(size)) + + + def test_odict(self): + seq = [(b'one', 1), (b'two', 2), (b'three', 3), (b'four', 4)] + od = OrderedDict(seq) + assert unpackb(packb(od), use_list=1) == dict(seq) + def pair_hook(seq): + return list(seq) + assert unpackb(packb(od), object_pairs_hook=pair_hook, use_list=1) == seq + + + def test_pairlist(self): + pairlist = [(b'a', 1), (2, b'b'), (b'foo', b'bar')] + packer = Packer() + packed = packer.pack_map_pairs(pairlist) + unpacked = unpackb(packed, object_pairs_hook=list) + assert pairlist == unpacked diff --git a/pandas/tests/test_msgpack/test_read_size.py b/pandas/tests/test_msgpack/test_read_size.py new file mode 100644 index 00000000..db3e1deb --- /dev/null +++ b/pandas/tests/test_msgpack/test_read_size.py @@ -0,0 +1,65 @@ +"""Test Unpacker's read_array_header and read_map_header methods""" +from pandas.msgpack import packb, Unpacker, OutOfData 
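+
+# Illustrative sketch (not part of the upstream msgpack tests): the streaming
+# pattern exercised below -- feed packed bytes into an Unpacker, read the
+# container header first, then pull that many elements, with OutOfData
+# signalling exhaustion.
+def _example_streaming_read():
+    unpacker = Unpacker()
+    unpacker.feed(packb([1, 2, 3]))
+    length = unpacker.read_array_header()          # -> 3
+    values = [unpacker.unpack() for _ in range(length)]
+    try:
+        unpacker.unpack()
+    except OutOfData:
+        pass                                       # stream exhausted
+    return length, values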
+UnexpectedTypeException = ValueError + +def test_read_array_header(): + unpacker = Unpacker() + unpacker.feed(packb(['a', 'b', 'c'])) + assert unpacker.read_array_header() == 3 + assert unpacker.unpack() == b'a' + assert unpacker.unpack() == b'b' + assert unpacker.unpack() == b'c' + try: + unpacker.unpack() + assert 0, 'should raise exception' + except OutOfData: + assert 1, 'okay' + + +def test_read_map_header(): + unpacker = Unpacker() + unpacker.feed(packb({'a': 'A'})) + assert unpacker.read_map_header() == 1 + assert unpacker.unpack() == B'a' + assert unpacker.unpack() == B'A' + try: + unpacker.unpack() + assert 0, 'should raise exception' + except OutOfData: + assert 1, 'okay' + +def test_incorrect_type_array(): + unpacker = Unpacker() + unpacker.feed(packb(1)) + try: + unpacker.read_array_header() + assert 0, 'should raise exception' + except UnexpectedTypeException: + assert 1, 'okay' + +def test_incorrect_type_map(): + unpacker = Unpacker() + unpacker.feed(packb(1)) + try: + unpacker.read_map_header() + assert 0, 'should raise exception' + except UnexpectedTypeException: + assert 1, 'okay' + +def test_correct_type_nested_array(): + unpacker = Unpacker() + unpacker.feed(packb({'a': ['b', 'c', 'd']})) + try: + unpacker.read_array_header() + assert 0, 'should raise exception' + except UnexpectedTypeException: + assert 1, 'okay' + +def test_incorrect_type_nested_map(): + unpacker = Unpacker() + unpacker.feed(packb([{'a': 'b'}])) + try: + unpacker.read_map_header() + assert 0, 'should raise exception' + except UnexpectedTypeException: + assert 1, 'okay' diff --git a/pandas/tests/test_msgpack/test_seq.py b/pandas/tests/test_msgpack/test_seq.py new file mode 100644 index 00000000..e5ee68c4 --- /dev/null +++ b/pandas/tests/test_msgpack/test_seq.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# coding: utf-8 + +from pandas import compat +from pandas.compat import u +import pandas.msgpack as msgpack + +binarydata = [chr(i) for i in range(256)] +binarydata = "".join(binarydata) +if compat.PY3: + binarydata = binarydata.encode('utf-8') + +def gen_binary_data(idx): + data = binarydata[:idx % 300] + return data + +def test_exceeding_unpacker_read_size(): + dumpf = compat.BytesIO() + + packer = msgpack.Packer() + + NUMBER_OF_STRINGS = 6 + read_size = 16 + # 5 ok for read_size=16, while 6 glibc detected *** python: double free or corruption (fasttop): + # 20 ok for read_size=256, while 25 segfaults / glibc detected *** python: double free or corruption (!prev) + # 40 ok for read_size=1024, while 50 introduces errors + # 7000 ok for read_size=1024*1024, while 8000 leads to glibc detected *** python: double free or corruption (!prev): + + for idx in range(NUMBER_OF_STRINGS): + data = gen_binary_data(idx) + dumpf.write(packer.pack(data)) + + f = compat.BytesIO(dumpf.getvalue()) + dumpf.close() + + unpacker = msgpack.Unpacker(f, read_size=read_size, use_list=1) + + read_count = 0 + for idx, o in enumerate(unpacker): + assert type(o) == bytes + assert o == gen_binary_data(idx) + read_count += 1 + + assert read_count == NUMBER_OF_STRINGS diff --git a/pandas/tests/test_msgpack/test_sequnpack.py b/pandas/tests/test_msgpack/test_sequnpack.py new file mode 100644 index 00000000..4c3ad363 --- /dev/null +++ b/pandas/tests/test_msgpack/test_sequnpack.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# coding: utf-8 + +import unittest +import nose + +from pandas import compat +from pandas.msgpack import Unpacker, BufferFull +from pandas.msgpack import OutOfData + +class TestPack(unittest.TestCase): + + def 
test_partialdata(self): + unpacker = Unpacker() + unpacker.feed(b'\xa5') + self.assertRaises(StopIteration, next, iter(unpacker)) + unpacker.feed(b'h') + self.assertRaises(StopIteration, next, iter(unpacker)) + unpacker.feed(b'a') + self.assertRaises(StopIteration, next, iter(unpacker)) + unpacker.feed(b'l') + self.assertRaises(StopIteration, next, iter(unpacker)) + unpacker.feed(b'l') + self.assertRaises(StopIteration, next, iter(unpacker)) + unpacker.feed(b'o') + assert next(iter(unpacker)) == b'hallo' + + def test_foobar(self): + unpacker = Unpacker(read_size=3, use_list=1) + unpacker.feed(b'foobar') + assert unpacker.unpack() == ord(b'f') + assert unpacker.unpack() == ord(b'o') + assert unpacker.unpack() == ord(b'o') + assert unpacker.unpack() == ord(b'b') + assert unpacker.unpack() == ord(b'a') + assert unpacker.unpack() == ord(b'r') + self.assertRaises(OutOfData, unpacker.unpack) + + unpacker.feed(b'foo') + unpacker.feed(b'bar') + + k = 0 + for o, e in zip(unpacker, 'foobarbaz'): + assert o == ord(e) + k += 1 + assert k == len(b'foobar') + + def test_foobar_skip(self): + unpacker = Unpacker(read_size=3, use_list=1) + unpacker.feed(b'foobar') + assert unpacker.unpack() == ord(b'f') + unpacker.skip() + assert unpacker.unpack() == ord(b'o') + unpacker.skip() + assert unpacker.unpack() == ord(b'a') + unpacker.skip() + self.assertRaises(OutOfData, unpacker.unpack) + + def test_maxbuffersize(self): + self.assertRaises(ValueError, Unpacker, read_size=5, max_buffer_size=3) + unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1) + unpacker.feed(b'fo') + self.assertRaises(BufferFull, unpacker.feed, b'ob') + unpacker.feed(b'o') + assert ord('f') == next(unpacker) + unpacker.feed(b'b') + assert ord('o') == next(unpacker) + assert ord('o') == next(unpacker) + assert ord('b') == next(unpacker) + + def test_readbytes(self): + unpacker = Unpacker(read_size=3) + unpacker.feed(b'foobar') + assert unpacker.unpack() == ord(b'f') + assert unpacker.read_bytes(3) == b'oob' + assert unpacker.unpack() == ord(b'a') + assert unpacker.unpack() == ord(b'r') + + # Test buffer refill + unpacker = Unpacker(compat.BytesIO(b'foobar'), read_size=3) + assert unpacker.unpack() == ord(b'f') + assert unpacker.read_bytes(3) == b'oob' + assert unpacker.unpack() == ord(b'a') + assert unpacker.unpack() == ord(b'r') diff --git a/pandas/tests/test_msgpack/test_subtype.py b/pandas/tests/test_msgpack/test_subtype.py new file mode 100644 index 00000000..0934b31c --- /dev/null +++ b/pandas/tests/test_msgpack/test_subtype.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# coding: utf-8 + +from pandas.msgpack import packb, unpackb +from collections import namedtuple + +class MyList(list): + pass + +class MyDict(dict): + pass + +class MyTuple(tuple): + pass + +MyNamedTuple = namedtuple('MyNamedTuple', 'x y') + +def test_types(): + assert packb(MyDict()) == packb(dict()) + assert packb(MyList()) == packb(list()) + assert packb(MyNamedTuple(1, 2)) == packb((1, 2)) diff --git a/pandas/tests/test_msgpack/test_unpack_raw.py b/pandas/tests/test_msgpack/test_unpack_raw.py new file mode 100644 index 00000000..0e96a79c --- /dev/null +++ b/pandas/tests/test_msgpack/test_unpack_raw.py @@ -0,0 +1,28 @@ +"""Tests for cases where the user seeks to obtain packed msgpack objects""" + +from pandas import compat +from pandas.msgpack import Unpacker, packb + +def test_write_bytes(): + unpacker = Unpacker() + unpacker.feed(b'abc') + f = compat.BytesIO() + assert unpacker.unpack(f.write) == ord('a') + assert f.getvalue() == b'a' + f = 
compat.BytesIO() + assert unpacker.skip(f.write) is None + assert f.getvalue() == b'b' + f = compat.BytesIO() + assert unpacker.skip() is None + assert f.getvalue() == b'' + + +def test_write_bytes_multi_buffer(): + long_val = (5) * 100 + expected = packb(long_val) + unpacker = Unpacker(compat.BytesIO(expected), read_size=3, max_buffer_size=3) + + f = compat.BytesIO() + unpacked = unpacker.unpack(f.write) + assert unpacked == long_val + assert f.getvalue() == expected diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py new file mode 100644 index 00000000..c0ca5451 --- /dev/null +++ b/pandas/tests/test_multilevel.py @@ -0,0 +1,2150 @@ +# pylint: disable-msg=W0612,E1101,W0141 +import datetime +import nose + +from numpy.random import randn +import numpy as np + +from pandas.core.index import Index, MultiIndex +from pandas import Panel, DataFrame, Series, notnull, isnull + +from pandas.util.testing import (assert_almost_equal, + assert_series_equal, + assert_frame_equal, + assertRaisesRegexp) +import pandas.core.common as com +import pandas.util.testing as tm +from pandas.compat import (range, lrange, StringIO, lzip, u, cPickle, + product as cart_product, zip) +import pandas as pd + +import pandas.index as _index + + +class TestMultiLevel(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + + self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], + names=['first']) + + # create test series object + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = lzip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + s[3] = np.NaN + self.series = s + + tm.N = 100 + self.tdf = tm.makeTimeDataFrame() + self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, + lambda x: x.day]).sum() + + # use Int64Index, to make sure things work + self.ymd.index.set_levels([lev.astype('i8') + for lev in self.ymd.index.levels], + inplace=True) + self.ymd.index.set_names(['year', 'month', 'day'], + inplace=True) + + def test_append(self): + a, b = self.frame[:5], self.frame[5:] + + result = a.append(b) + tm.assert_frame_equal(result, self.frame) + + result = a['A'].append(b['A']) + tm.assert_series_equal(result, self.frame['A']) + + def test_append_index(self): + tm._skip_if_no_pytz() + + idx1 = Index([1.1, 1.2, 1.3]) + idx2 = pd.date_range('2011-01-01', freq='D', periods=3, tz='Asia/Tokyo') + idx3 = Index(['A', 'B', 'C']) + + midx_lv2 = MultiIndex.from_arrays([idx1, idx2]) + midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3]) + + result = idx1.append(midx_lv2) + + # GH 7112 + import pytz + tz = pytz.timezone('Asia/Tokyo') + expected_tuples = [(1.1, datetime.datetime(2011, 1, 1, tzinfo=tz)), + (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz)), + (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz))] + expected = Index([1.1, 1.2, 1.3] + expected_tuples) + self.assert_(result.equals(expected)) + + result = midx_lv2.append(idx1) + expected = Index(expected_tuples + [1.1, 1.2, 1.3]) + self.assert_(result.equals(expected)) + + result = 
midx_lv2.append(midx_lv2) + expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)]) + self.assert_(result.equals(expected)) + + result = midx_lv2.append(midx_lv3) + self.assert_(result.equals(expected)) + + result = midx_lv3.append(midx_lv2) + expected = Index._simple_new( + np.array([(1.1, datetime.datetime(2011, 1, 1, tzinfo=tz), 'A'), + (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz), 'B'), + (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz), 'C')] + + expected_tuples), None) + self.assert_(result.equals(expected)) + + def test_dataframe_constructor(self): + multi = DataFrame(np.random.randn(4, 4), + index=[np.array(['a', 'a', 'b', 'b']), + np.array(['x', 'y', 'x', 'y'])]) + tm.assert_isinstance(multi.index, MultiIndex) + self.assertNotIsInstance(multi.columns, MultiIndex) + + multi = DataFrame(np.random.randn(4, 4), + columns=[['a', 'a', 'b', 'b'], + ['x', 'y', 'x', 'y']]) + tm.assert_isinstance(multi.columns, MultiIndex) + + def test_series_constructor(self): + multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']), + np.array(['x', 'y', 'x', 'y'])]) + tm.assert_isinstance(multi.index, MultiIndex) + + multi = Series(1., index=[['a', 'a', 'b', 'b'], + ['x', 'y', 'x', 'y']]) + tm.assert_isinstance(multi.index, MultiIndex) + + multi = Series(lrange(4), index=[['a', 'a', 'b', 'b'], + ['x', 'y', 'x', 'y']]) + tm.assert_isinstance(multi.index, MultiIndex) + + def test_reindex_level(self): + # axis=0 + month_sums = self.ymd.sum(level='month') + result = month_sums.reindex(self.ymd.index, level=1) + expected = self.ymd.groupby(level='month').transform(np.sum) + + assert_frame_equal(result, expected) + + # Series + result = month_sums['A'].reindex(self.ymd.index, level=1) + expected = self.ymd['A'].groupby(level='month').transform(np.sum) + assert_series_equal(result, expected) + + # axis=1 + month_sums = self.ymd.T.sum(axis=1, level='month') + result = month_sums.reindex(columns=self.ymd.index, level=1) + expected = self.ymd.groupby(level='month').transform(np.sum).T + assert_frame_equal(result, expected) + + def test_binops_level(self): + def _check_op(opname): + op = getattr(DataFrame, opname) + month_sums = self.ymd.sum(level='month') + result = op(self.ymd, month_sums, level='month') + + broadcasted = self.ymd.groupby(level='month').transform(np.sum) + expected = op(self.ymd, broadcasted) + assert_frame_equal(result, expected) + + # Series + op = getattr(Series, opname) + result = op(self.ymd['A'], month_sums['A'], level='month') + broadcasted = self.ymd['A'].groupby( + level='month').transform(np.sum) + expected = op(self.ymd['A'], broadcasted) + assert_series_equal(result, expected) + + _check_op('sub') + _check_op('add') + _check_op('mul') + _check_op('div') + + def test_pickle(self): + + def _test_roundtrip(frame): + pickled = cPickle.dumps(frame) + unpickled = cPickle.loads(pickled) + assert_frame_equal(frame, unpickled) + + _test_roundtrip(self.frame) + _test_roundtrip(self.frame.T) + _test_roundtrip(self.ymd) + _test_roundtrip(self.ymd.T) + + def test_reindex(self): + reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] + expected = self.frame.ix[[0, 3]] + assert_frame_equal(reindexed, expected) + + def test_reindex_preserve_levels(self): + new_index = self.ymd.index[::10] + chunk = self.ymd.reindex(new_index) + self.assertIs(chunk.index, new_index) + + chunk = self.ymd.ix[new_index] + self.assertIs(chunk.index, new_index) + + ymdT = self.ymd.T + chunk = ymdT.reindex(columns=new_index) + self.assertIs(chunk.columns, new_index) + + chunk = ymdT.ix[:, 
new_index] + self.assertIs(chunk.columns, new_index) + + def test_sort_index_preserve_levels(self): + result = self.frame.sort_index() + self.assertEqual(result.index.names, self.frame.index.names) + + def test_repr_to_string(self): + repr(self.frame) + repr(self.ymd) + repr(self.frame.T) + repr(self.ymd.T) + + buf = StringIO() + self.frame.to_string(buf=buf) + self.ymd.to_string(buf=buf) + self.frame.T.to_string(buf=buf) + self.ymd.T.to_string(buf=buf) + + def test_repr_name_coincide(self): + index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')], + names=['a', 'b', 'c']) + + df = DataFrame({'value': [0, 1]}, index=index) + + lines = repr(df).split('\n') + self.assertTrue(lines[2].startswith('a 0 foo')) + + def test_getitem_simple(self): + df = self.frame.T + + col = df['foo', 'one'] + assert_almost_equal(col.values, df.values[:, 0]) + self.assertRaises(KeyError, df.__getitem__, ('foo', 'four')) + self.assertRaises(KeyError, df.__getitem__, 'foobar') + + def test_series_getitem(self): + s = self.ymd['A'] + + result = s[2000, 3] + result2 = s.ix[2000, 3] + expected = s.reindex(s.index[42:65]) + expected.index = expected.index.droplevel(0).droplevel(0) + assert_series_equal(result, expected) + + result = s[2000, 3, 10] + expected = s[49] + self.assertEqual(result, expected) + + # fancy + result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] + expected = s.reindex(s.index[49:51]) + assert_series_equal(result, expected) + + # key error + self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4)) + + def test_series_getitem_corner(self): + s = self.ymd['A'] + + # don't segfault, GH #495 + # out of bounds access + self.assertRaises(IndexError, s.__getitem__, len(self.ymd)) + + # generator + result = s[(x > 0 for x in s)] + expected = s[s > 0] + assert_series_equal(result, expected) + + def test_series_setitem(self): + s = self.ymd['A'] + + s[2000, 3] = np.nan + self.assertTrue(isnull(s.values[42:65]).all()) + self.assertTrue(notnull(s.values[:42]).all()) + self.assertTrue(notnull(s.values[65:]).all()) + + s[2000, 3, 10] = np.nan + self.assertTrue(isnull(s[49])) + + def test_series_slice_partial(self): + pass + + def test_frame_getitem_setitem_boolean(self): + df = self.frame.T.copy() + values = df.values + + result = df[df > 0] + expected = df.where(df > 0) + assert_frame_equal(result, expected) + + df[df > 0] = 5 + values[values > 0] = 5 + assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + assert_almost_equal(df.values, values) + + with assertRaisesRegexp(TypeError, 'boolean values only'): + df[df * 0] = 2 + + def test_frame_getitem_setitem_slice(self): + # getitem + result = self.frame.ix[:4] + expected = self.frame[:4] + assert_frame_equal(result, expected) + + # setitem + cp = self.frame.copy() + cp.ix[:4] = 0 + + self.assertTrue((cp.values[:4] == 0).all()) + self.assertTrue((cp.values[4:] != 0).all()) + + def test_frame_getitem_setitem_multislice(self): + levels = [['t1', 't2'], ['a', 'b', 'c']] + labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] + midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id']) + df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) + + result = df.ix[:, 'value'] + assert_series_equal(df['value'], result) + + result = df.ix[1:3, 'value'] + assert_series_equal(df['value'][1:3], result) + + result = df.ix[:, :] + assert_frame_equal(df, result) + + result = df + df.ix[:, 'value'] 
= 10 + result['value'] = 10 + assert_frame_equal(df, result) + + df.ix[:, :] = 10 + assert_frame_equal(df, result) + + def test_frame_getitem_multicolumn_empty_level(self): + f = DataFrame({'a': ['1', '2', '3'], + 'b': ['2', '3', '4']}) + f.columns = [['level1 item1', 'level1 item2'], + ['', 'level2 item2'], + ['level3 item1', 'level3 item2']] + + result = f['level1 item1'] + expected = DataFrame([['1'], ['2'], ['3']], index=f.index, + columns=['level3 item1']) + assert_frame_equal(result, expected) + + def test_frame_setitem_multi_column(self): + df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'], + [0, 1, 0, 1]]) + + cp = df.copy() + cp['a'] = cp['b'] + assert_frame_equal(cp['a'], cp['b']) + + # set with ndarray + cp = df.copy() + cp['a'] = cp['b'].values + assert_frame_equal(cp['a'], cp['b']) + + #---------------------------------------- + # #1803 + columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')]) + df = DataFrame(index=[1, 3, 5], columns=columns) + + # Works, but adds a column instead of updating the two existing ones + df['A'] = 0.0 # Doesn't work + self.assertTrue((df['A'].values == 0).all()) + + # it broadcasts + df['B', '1'] = [1, 2, 3] + df['A'] = df['B', '1'] + assert_series_equal(df['A', '1'], df['B', '1']) + assert_series_equal(df['A', '2'], df['B', '1']) + + def test_getitem_tuple_plus_slice(self): + # GH #671 + df = DataFrame({'a': lrange(10), + 'b': lrange(10), + 'c': np.random.randn(10), + 'd': np.random.randn(10)}) + + idf = df.set_index(['a', 'b']) + + result = idf.ix[(0, 0), :] + expected = idf.ix[0, 0] + expected2 = idf.xs((0, 0)) + + assert_series_equal(result, expected) + assert_series_equal(result, expected2) + + def test_getitem_setitem_tuple_plus_columns(self): + # GH #1013 + + df = self.ymd[:5] + + result = df.ix[(2000, 1, 6), ['A', 'B', 'C']] + expected = df.ix[2000, 1, 6][['A', 'B', 'C']] + assert_series_equal(result, expected) + + def test_getitem_multilevel_index_tuple_unsorted(self): + index_columns = list("abc") + df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], + columns=index_columns + ["data"]) + df = df.set_index(index_columns) + query_index = df.index[:1] + rs = df.ix[query_index, "data"] + xp = Series(['x'], index=MultiIndex.from_tuples([(0, 1, 0)])) + assert_series_equal(rs, xp) + + def test_xs(self): + xs = self.frame.xs(('bar', 'two')) + xs2 = self.frame.ix[('bar', 'two')] + + assert_series_equal(xs, xs2) + assert_almost_equal(xs.values, self.frame.values[4]) + + # GH 6574 + # missing values in returned index should be preserrved + acc = [ + ('a','abcde',1), + ('b','bbcde',2), + ('y','yzcde',25), + ('z','xbcde',24), + ('z',None,26), + ('z','zbcde',25), + ('z','ybcde',26), + ] + df = DataFrame(acc, columns=['a1','a2','cnt']).set_index(['a1','a2']) + expected = DataFrame({ 'cnt' : [24,26,25,26] }, index=Index(['xbcde',np.nan,'zbcde','ybcde'],name='a2')) + result = df.xs('z',level='a1') + assert_frame_equal(result, expected) + + def test_xs_partial(self): + result = self.frame.xs('foo') + result2 = self.frame.ix['foo'] + expected = self.frame.T['foo'].T + assert_frame_equal(result, expected) + assert_frame_equal(result, result2) + + result = self.ymd.xs((2000, 4)) + expected = self.ymd.ix[2000, 4] + assert_frame_equal(result, expected) + + # ex from #1796 + index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], + labels=[[0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1]]) + df = DataFrame(np.random.randn(8, 4), index=index, + columns=list('abcd')) + + result = df.xs(['foo', 
'one']) + expected = df.ix['foo', 'one'] + assert_frame_equal(result, expected) + + def test_xs_level(self): + result = self.frame.xs('two', level='second') + expected = self.frame[self.frame.index.get_level_values(1) == 'two'] + expected.index = expected.index.droplevel(1) + + assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), + ('p', 'q', 'r')]) + df = DataFrame(np.random.randn(3, 5), index=index) + result = df.xs('c', level=2) + expected = df[1:2] + expected.index = expected.index.droplevel(2) + assert_frame_equal(result, expected) + + # this is a copy in 0.14 + result = self.frame.xs('two', level='second') + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + def f(x): + x[:] = 10 + self.assertRaises(com.SettingWithCopyError, f, result) + + def test_xs_level_multiple(self): + from pandas import read_table + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_table(StringIO(text), sep='\s+', engine='python') + + result = df.xs(('a', 4), level=['one', 'four']) + expected = df.xs('a').xs(4, level='four') + assert_frame_equal(result, expected) + + # this is a copy in 0.14 + result = df.xs(('a', 4), level=['one', 'four']) + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + def f(x): + x[:] = 10 + self.assertRaises(com.SettingWithCopyError, f, result) + + # GH2107 + dates = lrange(20111201, 20111205) + ids = 'abcde' + idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) + idx.names = ['date', 'secid'] + df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) + rs = df.xs(20111201, level='date') + xp = df.ix[20111201, :] + assert_frame_equal(rs, xp) + + def test_xs_level0(self): + from pandas import read_table + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_table(StringIO(text), sep='\s+', engine='python') + + result = df.xs('a', level=0) + expected = df.xs('a') + self.assertEqual(len(result), 2) + assert_frame_equal(result, expected) + + def test_xs_level_series(self): + s = self.frame['A'] + result = s[:, 'two'] + expected = self.frame.xs('two', level=1)['A'] + assert_series_equal(result, expected) + + s = self.ymd['A'] + result = s[2000, 5] + expected = self.ymd.ix[2000, 5]['A'] + assert_series_equal(result, expected) + + # not implementing this for now + + self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4))) + + # result = s[2000, 3:4] + # lv =s.index.get_level_values(1) + # expected = s[(lv == 3) | (lv == 4)] + # expected.index = expected.index.droplevel(0) + # assert_series_equal(result, expected) + + # can do this though + + def test_get_loc_single_level(self): + s = Series(np.random.randn(len(self.single_level)), + index=self.single_level) + for k in self.single_level.values: + s[k] + + def test_getitem_toplevel(self): + df = self.frame.T + + result = df['foo'] + expected = df.reindex(columns=df.columns[:3]) + expected.columns = expected.columns.droplevel(0) + assert_frame_equal(result, expected) + + result = df['bar'] + result2 = df.ix[:, 'bar'] + + expected = df.reindex(columns=df.columns[3:5]) + expected.columns = expected.columns.droplevel(0) + assert_frame_equal(result, expected) + 
assert_frame_equal(result, result2) + + def test_getitem_setitem_slice_integers(self): + index = MultiIndex(levels=[[0, 1, 2], [0, 2]], + labels=[[0, 0, 1, 1, 2, 2], + [0, 1, 0, 1, 0, 1]]) + + frame = DataFrame(np.random.randn(len(index), 4), index=index, + columns=['a', 'b', 'c', 'd']) + res = frame.ix[1:2] + exp = frame.reindex(frame.index[2:]) + assert_frame_equal(res, exp) + + frame.ix[1:2] = 7 + self.assertTrue((frame.ix[1:2] == 7).values.all()) + + series = Series(np.random.randn(len(index)), index=index) + + res = series.ix[1:2] + exp = series.reindex(series.index[2:]) + assert_series_equal(res, exp) + + series.ix[1:2] = 7 + self.assertTrue((series.ix[1:2] == 7).values.all()) + + def test_getitem_int(self): + levels = [[0, 1], [0, 1, 2]] + labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, labels=labels) + + frame = DataFrame(np.random.randn(6, 2), index=index) + + result = frame.ix[1] + expected = frame[-3:] + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + + # raises exception + self.assertRaises(KeyError, frame.ix.__getitem__, 3) + + # however this will work + result = self.frame.ix[2] + expected = self.frame.xs(self.frame.index[2]) + assert_series_equal(result, expected) + + def test_getitem_partial(self): + ymd = self.ymd.T + result = ymd[2000, 2] + + expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) + expected.columns = expected.columns.droplevel(0).droplevel(0) + assert_frame_equal(result, expected) + + def test_getitem_slice_not_sorted(self): + df = self.frame.sortlevel(1).T + + # buglet with int typechecking + result = df.ix[:, :np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + assert_frame_equal(result, expected) + + def test_setitem_change_dtype(self): + dft = self.frame.T + s = dft['foo', 'two'] + dft['foo', 'two'] = s > s.median() + assert_series_equal(dft['foo', 'two'], s > s.median()) + # tm.assert_isinstance(dft._data.blocks[1].items, MultiIndex) + + reindexed = dft.reindex(columns=[('foo', 'two')]) + assert_series_equal(reindexed['foo', 'two'], s > s.median()) + + def test_frame_setitem_ix(self): + self.frame.ix[('bar', 'two'), 'B'] = 5 + self.assertEqual(self.frame.ix[('bar', 'two'), 'B'], 5) + + # with integer labels + df = self.frame.copy() + df.columns = lrange(3) + df.ix[('bar', 'two'), 1] = 7 + self.assertEqual(df.ix[('bar', 'two'), 1], 7) + + def test_fancy_slice_partial(self): + result = self.frame.ix['bar':'baz'] + expected = self.frame[3:7] + assert_frame_equal(result, expected) + + result = self.ymd.ix[(2000, 2):(2000, 4)] + lev = self.ymd.index.labels[1] + expected = self.ymd[(lev >= 1) & (lev <= 3)] + assert_frame_equal(result, expected) + + def test_getitem_partial_column_select(self): + idx = MultiIndex(labels=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) + df = DataFrame(np.random.rand(3, 2), index=idx) + + result = df.ix[('a', 'y'), :] + expected = df.ix[('a', 'y')] + assert_frame_equal(result, expected) + + result = df.ix[('a', 'y'), [1, 0]] + expected = df.ix[('a', 'y')][[1, 0]] + assert_frame_equal(result, expected) + + self.assertRaises(KeyError, df.ix.__getitem__, + (('a', 'foo'), slice(None, None))) + + def test_sortlevel(self): + df = self.frame.copy() + df.index = np.arange(len(df)) + assertRaisesRegexp(TypeError, 'hierarchical index', df.sortlevel, 0) + + # axis=1 + + # series + a_sorted = self.frame['A'].sortlevel(0) + with assertRaisesRegexp(TypeError, 'hierarchical index'): + 
self.frame.reset_index()['A'].sortlevel() + + # preserve names + self.assertEqual(a_sorted.index.names, self.frame.index.names) + + # inplace + rs = self.frame.copy() + rs.sortlevel(0, inplace=True) + assert_frame_equal(rs, self.frame.sortlevel(0)) + + def test_sortlevel_large_cardinality(self): + + # #2684 (int64) + index = MultiIndex.from_arrays([np.arange(4000)]*3) + df = DataFrame(np.random.randn(4000), index=index, dtype = np.int64) + + # it works! + result = df.sortlevel(0) + self.assertTrue(result.index.lexsort_depth == 3) + + # #2684 (int32) + index = MultiIndex.from_arrays([np.arange(4000)]*3) + df = DataFrame(np.random.randn(4000), index=index, dtype = np.int32) + + # it works! + result = df.sortlevel(0) + self.assertTrue((result.dtypes.values == df.dtypes.values).all() == True) + self.assertTrue(result.index.lexsort_depth == 3) + + def test_delevel_infer_dtype(self): + tuples = [tuple for tuple in cart_product(['foo', 'bar'], + [10, 20], [1.0, 1.1])] + index = MultiIndex.from_tuples(tuples, + names=['prm0', 'prm1', 'prm2']) + df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'], + index=index) + deleveled = df.reset_index() + self.assertTrue(com.is_integer_dtype(deleveled['prm1'])) + self.assertTrue(com.is_float_dtype(deleveled['prm2'])) + + def test_reset_index_with_drop(self): + deleveled = self.ymd.reset_index(drop=True) + self.assertEqual(len(deleveled.columns), len(self.ymd.columns)) + + deleveled = self.series.reset_index() + tm.assert_isinstance(deleveled, DataFrame) + self.assertEqual(len(deleveled.columns), + len(self.series.index.levels) + 1) + + deleveled = self.series.reset_index(drop=True) + tm.assert_isinstance(deleveled, Series) + + def test_sortlevel_by_name(self): + self.frame.index.names = ['first', 'second'] + result = self.frame.sortlevel(level='second') + expected = self.frame.sortlevel(level=1) + assert_frame_equal(result, expected) + + def test_sortlevel_mixed(self): + sorted_before = self.frame.sortlevel(1) + + df = self.frame.copy() + df['foo'] = 'bar' + sorted_after = df.sortlevel(1) + assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1)) + + dft = self.frame.T + sorted_before = dft.sortlevel(1, axis=1) + dft['foo', 'three'] = 'bar' + + sorted_after = dft.sortlevel(1, axis=1) + assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), + sorted_after.drop([('foo', 'three')], axis=1)) + + def test_count_level(self): + def _check_counts(frame, axis=0): + index = frame._get_axis(axis) + for i in range(index.nlevels): + result = frame.count(axis=axis, level=i) + expected = frame.groupby(axis=axis, level=i).count(axis=axis) + expected = expected.reindex_like(result).astype('i8') + assert_frame_equal(result, expected) + + self.frame.ix[1, [1, 2]] = np.nan + self.frame.ix[7, [0, 1]] = np.nan + self.ymd.ix[1, [1, 2]] = np.nan + self.ymd.ix[7, [0, 1]] = np.nan + + _check_counts(self.frame) + _check_counts(self.ymd) + _check_counts(self.frame.T, axis=1) + _check_counts(self.ymd.T, axis=1) + + # can't call with level on regular DataFrame + df = tm.makeTimeDataFrame() + assertRaisesRegexp(TypeError, 'hierarchical', df.count, level=0) + + self.frame['D'] = 'foo' + result = self.frame.count(level=0, numeric_only=True) + assert_almost_equal(result.columns, ['A', 'B', 'C']) + + def test_count_level_series(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz'], + ['one', 'two', 'three', 'four']], + labels=[[0, 0, 0, 2, 2], + [2, 0, 1, 1, 2]]) + + s = Series(np.random.randn(len(index)), index=index) + + result = s.count(level=0) + 
expected = s.groupby(level=0).count() + assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + + result = s.count(level=1) + expected = s.groupby(level=1).count() + assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + + def test_count_level_corner(self): + s = self.frame['A'][:0] + result = s.count(level=0) + expected = Series(0, index=s.index.levels[0]) + assert_series_equal(result, expected) + + df = self.frame[:0] + result = df.count(level=0) + expected = DataFrame({}, index=s.index.levels[0], + columns=df.columns).fillna(0).astype(np.int64) + assert_frame_equal(result, expected) + + def test_unstack(self): + # just check that it works for now + unstacked = self.ymd.unstack() + unstacked2 = unstacked.unstack() + + # test that ints work + unstacked = self.ymd.astype(int).unstack() + + # test that int32 work + unstacked = self.ymd.astype(np.int32).unstack() + + def test_unstack_multiple_no_empty_columns(self): + index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0), + (1, 'baz', 1), (1, 'qux', 1)]) + + s = Series(np.random.randn(4), index=index) + + unstacked = s.unstack([1, 2]) + expected = unstacked.dropna(axis=1, how='all') + assert_frame_equal(unstacked, expected) + + def test_stack(self): + # regular roundtrip + unstacked = self.ymd.unstack() + restacked = unstacked.stack() + assert_frame_equal(restacked, self.ymd) + + unlexsorted = self.ymd.sortlevel(2) + + unstacked = unlexsorted.unstack(2) + restacked = unstacked.stack() + assert_frame_equal(restacked.sortlevel(0), self.ymd) + + unlexsorted = unlexsorted[::-1] + unstacked = unlexsorted.unstack(1) + restacked = unstacked.stack().swaplevel(1, 2) + assert_frame_equal(restacked.sortlevel(0), self.ymd) + + unlexsorted = unlexsorted.swaplevel(0, 1) + unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) + restacked = unstacked.stack(0).swaplevel(1, 2) + assert_frame_equal(restacked.sortlevel(0), self.ymd) + + # columns unsorted + unstacked = self.ymd.unstack() + unstacked = unstacked.sort(axis=1, ascending=False) + restacked = unstacked.stack() + assert_frame_equal(restacked, self.ymd) + + # more than 2 levels in the columns + unstacked = self.ymd.unstack(1).unstack(1) + + result = unstacked.stack(1) + expected = self.ymd.unstack() + assert_frame_equal(result, expected) + + result = unstacked.stack(2) + expected = self.ymd.unstack(1) + assert_frame_equal(result, expected) + + result = unstacked.stack(0) + expected = self.ymd.stack().unstack(1).unstack(1) + assert_frame_equal(result, expected) + + # not all levels present in each echelon + unstacked = self.ymd.unstack(2).ix[:, ::3] + stacked = unstacked.stack().stack() + ymd_stacked = self.ymd.stack() + assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) + + # stack with negative number + result = self.ymd.unstack(0).stack(-2) + expected = self.ymd.unstack(0).stack(0) + + def test_unstack_odd_failure(self): + data = """day,time,smoker,sum,len +Fri,Dinner,No,8.25,3. 
+Fri,Dinner,Yes,27.03,9 +Fri,Lunch,No,3.0,1 +Fri,Lunch,Yes,13.68,6 +Sat,Dinner,No,139.63,45 +Sat,Dinner,Yes,120.77,42 +Sun,Dinner,No,180.57,57 +Sun,Dinner,Yes,66.82,19 +Thur,Dinner,No,3.0,1 +Thur,Lunch,No,117.32,44 +Thur,Lunch,Yes,51.51,17""" + + df = pd.read_csv(StringIO(data)).set_index(['day', 'time', 'smoker']) + + # it works, #2100 + result = df.unstack(2) + + recons = result.stack() + assert_frame_equal(recons, df) + + def test_stack_mixed_dtype(self): + df = self.frame.T + df['foo', 'four'] = 'foo' + df = df.sortlevel(1, axis=1) + + stacked = df.stack() + assert_series_equal(stacked['foo'], df['foo'].stack()) + self.assertEqual(stacked['bar'].dtype, np.float_) + + def test_unstack_bug(self): + df = DataFrame({'state': ['naive', 'naive', 'naive', + 'activ', 'activ', 'activ'], + 'exp': ['a', 'b', 'b', 'b', 'a', 'a'], + 'barcode': [1, 2, 3, 4, 1, 3], + 'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'], + 'extra': np.arange(6.)}) + + result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len) + + unstacked = result.unstack() + restacked = unstacked.stack() + assert_series_equal(restacked, + result.reindex(restacked.index).astype(float)) + + def test_stack_unstack_preserve_names(self): + unstacked = self.frame.unstack() + self.assertEqual(unstacked.index.name, 'first') + self.assertEqual(unstacked.columns.names, ['exp', 'second']) + + restacked = unstacked.stack() + self.assertEqual(restacked.index.names, self.frame.index.names) + + def test_unstack_level_name(self): + result = self.frame.unstack('second') + expected = self.frame.unstack(level=1) + assert_frame_equal(result, expected) + + def test_stack_level_name(self): + unstacked = self.frame.unstack('second') + result = unstacked.stack('exp') + expected = self.frame.unstack().stack(0) + assert_frame_equal(result, expected) + + result = self.frame.stack('exp') + expected = self.frame.stack() + assert_series_equal(result, expected) + + def test_stack_unstack_multiple(self): + unstacked = self.ymd.unstack(['year', 'month']) + expected = self.ymd.unstack('year').unstack('month') + assert_frame_equal(unstacked, expected) + self.assertEqual(unstacked.columns.names, + expected.columns.names) + + # series + s = self.ymd['A'] + s_unstacked = s.unstack(['year', 'month']) + assert_frame_equal(s_unstacked, expected['A']) + + restacked = unstacked.stack(['year', 'month']) + restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) + restacked = restacked.sortlevel(0) + + assert_frame_equal(restacked, self.ymd) + self.assertEqual(restacked.index.names, self.ymd.index.names) + + # GH #451 + unstacked = self.ymd.unstack([1, 2]) + expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all') + assert_frame_equal(unstacked, expected) + + unstacked = self.ymd.unstack([2, 1]) + expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all') + assert_frame_equal(unstacked, expected.ix[:, unstacked.columns]) + + def test_unstack_period_series(self): + # GH 4342 + idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02', + '2013-03', '2013-03'], freq='M', name='period') + idx2 = Index(['A', 'B'] * 3, name='str') + value = [1, 2, 3, 4, 5, 6] + + idx = MultiIndex.from_arrays([idx1, idx2]) + s = Series(value, index=idx) + + result1 = s.unstack() + result2 = s.unstack(level=1) + result3 = s.unstack(level=0) + + e_idx = pd.PeriodIndex(['2013-01', '2013-02', '2013-03'], freq='M', name='period') + expected = DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=e_idx, + columns=['A', 'B']) + expected.columns.name = 'str' + + 
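+ # note: unstacking by the default (last) level and by level=1 should both give this 3x2 frame; unstacking level=0 should give its transpose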
assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected.T) + + idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02', + '2013-03', '2013-03'], freq='M', name='period1') + + idx2 = pd.PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', + '2013-08', '2013-07'], freq='M', name='period2') + idx = pd.MultiIndex.from_arrays([idx1, idx2]) + s = Series(value, index=idx) + + result1 = s.unstack() + result2 = s.unstack(level=1) + result3 = s.unstack(level=0) + + e_idx = pd.PeriodIndex(['2013-01', '2013-02', '2013-03'], freq='M', name='period1') + e_cols = pd.PeriodIndex(['2013-07', '2013-08', '2013-09', '2013-10', + '2013-11', '2013-12'], freq='M', name='period2') + expected = DataFrame([[np.nan, np.nan, np.nan, np.nan, 2, 1], + [np.nan, np.nan, 4, 3, np.nan, np.nan], + [6, 5, np.nan, np.nan, np.nan, np.nan]], + index=e_idx, columns=e_cols) + + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected.T) + + def test_unstack_period_frame(self): + # GH 4342 + idx1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-02', '2014-02', '2014-01', '2014-01'], + freq='M', name='period1') + idx2 = pd.PeriodIndex(['2013-12', '2013-12', '2014-02', '2013-10', '2013-10', '2014-02'], + freq='M', name='period2') + value = {'A': [1, 2, 3, 4, 5, 6], 'B': [6, 5, 4, 3, 2, 1]} + idx = pd.MultiIndex.from_arrays([idx1, idx2]) + df = pd.DataFrame(value, index=idx) + + result1 = df.unstack() + result2 = df.unstack(level=1) + result3 = df.unstack(level=0) + + e_1 = pd.PeriodIndex(['2014-01', '2014-02'], freq='M', name='period1') + e_2 = pd.PeriodIndex(['2013-10', '2013-12', '2014-02', '2013-10', + '2013-12', '2014-02'], freq='M', name='period2') + e_cols = pd.MultiIndex.from_arrays(['A A A B B B'.split(), e_2]) + expected = DataFrame([[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], + index=e_1, columns=e_cols) + + assert_frame_equal(result1, expected) + assert_frame_equal(result2, expected) + + e_1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-01', + '2014-02'], freq='M', name='period1') + e_2 = pd.PeriodIndex(['2013-10', '2013-12', '2014-02'], freq='M', name='period2') + e_cols = pd.MultiIndex.from_arrays(['A A B B'.split(), e_1]) + expected = DataFrame([[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], + index=e_2, columns=e_cols) + + assert_frame_equal(result3, expected) + + def test_stack_multiple_bug(self): + """ bug when some uniques are not present in the data #3170""" + id_col = ([1] * 3) + ([2] * 3) + name = (['a'] * 3) + (['b'] * 3) + date = pd.to_datetime(['2013-01-03', '2013-01-04', '2013-01-05'] * 2) + var1 = np.random.randint(0, 100, 6) + df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1)) + + multi = df.set_index(['DATE', 'ID']) + multi.columns.name = 'Params' + unst = multi.unstack('ID') + down = unst.resample('W-THU') + + rs = down.stack('ID') + xp = unst.ix[:, ['VAR1']].resample('W-THU').stack('ID') + xp.columns.name = 'Params' + assert_frame_equal(rs, xp) + + def test_stack_dropna(self): + # GH #3997 + df = pd.DataFrame({'A': ['a1', 'a2'], + 'B': ['b1', 'b2'], + 'C': [1, 1]}) + df = df.set_index(['A', 'B']) + + stacked = df.unstack().stack(dropna=False) + self.assertTrue(len(stacked) > len(stacked.dropna())) + + stacked = df.unstack().stack(dropna=True) + assert_frame_equal(stacked, stacked.dropna()) + + def test_unstack_multiple_hierarchical(self): + df = DataFrame(index=[[0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1]], + columns=[[0, 0, 1, 
1], [0, 1, 0, 1]]) + + df.index.names = ['a', 'b', 'c'] + df.columns.names = ['d', 'e'] + + # it works! + df.unstack(['b', 'c']) + + def test_groupby_transform(self): + s = self.frame['A'] + grouper = s.index.get_level_values(0) + + grouped = s.groupby(grouper) + + applied = grouped.apply(lambda x: x * 2) + expected = grouped.transform(lambda x: x * 2) + assert_series_equal(applied.reindex(expected.index), expected) + + def test_unstack_sparse_keyspace(self): + # memory problems with naive impl #2278 + # Generate Long File & Test Pivot + NUM_ROWS = 1000 + + df = DataFrame({'A': np.random.randint(100, size=NUM_ROWS), + 'B': np.random.randint(300, size=NUM_ROWS), + 'C': np.random.randint(-7, 7, size=NUM_ROWS), + 'D': np.random.randint(-19, 19, size=NUM_ROWS), + 'E': np.random.randint(3000, size=NUM_ROWS), + 'F': np.random.randn(NUM_ROWS)}) + + idf = df.set_index(['A', 'B', 'C', 'D', 'E']) + + # it works! is sufficient + idf.unstack('E') + + def test_unstack_unobserved_keys(self): + # related to #2278 refactoring + levels = [[0, 1], [0, 1, 2, 3]] + labels = [[0, 0, 1, 1], [0, 2, 0, 2]] + + index = MultiIndex(levels, labels) + + df = DataFrame(np.random.randn(4, 2), index=index) + + result = df.unstack() + self.assertEqual(len(result.columns), 4) + + recons = result.stack() + assert_frame_equal(recons, df) + + def test_groupby_corner(self): + midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']], + labels=[[0], [0], [0]], names=['one', 'two', 'three']) + df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'], + index=midx) + # should work + df.groupby(level='three') + + def test_groupby_level_no_obs(self): + # #1697 + midx = MultiIndex.from_tuples([('f1', 's1'), ('f1', 's2'), + ('f2', 's1'), ('f2', 's2'), + ('f3', 's1'), ('f3', 's2')]) + df = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) + df1 = df.select(lambda u: u[0] in ['f2', 'f3'], axis=1) + + grouped = df1.groupby(axis=1, level=0) + result = grouped.sum() + self.assertTrue((result.columns == ['f2', 'f3']).all()) + + def test_join(self): + a = self.frame.ix[:5, ['A']] + b = self.frame.ix[2:, ['B', 'C']] + + joined = a.join(b, how='outer').reindex(self.frame.index) + expected = self.frame.copy() + expected.values[np.isnan(joined.values)] = np.nan + + self.assertFalse(np.isnan(joined.values).all()) + + assert_frame_equal(joined, expected, check_names=False) # TODO what should join do with names ? 
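+ # note: swaplevel by position and by level name should agree, and swapping twice should restore the original index (Series case first, then columns via axis=1)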
+ + def test_swaplevel(self): + swapped = self.frame['A'].swaplevel(0, 1) + swapped2 = self.frame['A'].swaplevel('first', 'second') + self.assertFalse(swapped.index.equals(self.frame.index)) + assert_series_equal(swapped, swapped2) + + back = swapped.swaplevel(0, 1) + back2 = swapped.swaplevel('second', 'first') + self.assertTrue(back.index.equals(self.frame.index)) + assert_series_equal(back, back2) + + ft = self.frame.T + swapped = ft.swaplevel('first', 'second', axis=1) + exp = self.frame.swaplevel('first', 'second').T + assert_frame_equal(swapped, exp) + + def test_swaplevel_panel(self): + panel = Panel({'ItemA': self.frame, + 'ItemB': self.frame * 2}) + + result = panel.swaplevel(0, 1, axis='major') + expected = panel.copy() + expected.major_axis = expected.major_axis.swaplevel(0, 1) + tm.assert_panel_equal(result, expected) + + def test_reorder_levels(self): + result = self.ymd.reorder_levels(['month', 'day', 'year']) + expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2) + assert_frame_equal(result, expected) + + result = self.ymd['A'].reorder_levels(['month', 'day', 'year']) + expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2) + assert_series_equal(result, expected) + + result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1) + expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) + assert_frame_equal(result, expected) + + with assertRaisesRegexp(TypeError, 'hierarchical axis'): + self.ymd.reorder_levels([1, 2], axis=1) + + with assertRaisesRegexp(IndexError, 'Too many levels'): + self.ymd.index.reorder_levels([1, 2, 3]) + + def test_insert_index(self): + df = self.ymd[:5].T + df[2000, 1, 10] = df[2000, 1, 7] + tm.assert_isinstance(df.columns, MultiIndex) + self.assertTrue((df[2000, 1, 10] == df[2000, 1, 7]).all()) + + def test_alignment(self): + x = Series(data=[1, 2, 3], + index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)])) + + y = Series(data=[4, 5, 6], + index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)])) + + res = x - y + exp_index = x.index.union(y.index) + exp = x.reindex(exp_index) - y.reindex(exp_index) + assert_series_equal(res, exp) + + # hit non-monotonic code path + res = x[::-1] - y[::-1] + exp_index = x.index.union(y.index) + exp = x.reindex(exp_index) - y.reindex(exp_index) + assert_series_equal(res, exp) + + def test_is_lexsorted(self): + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2]]) + self.assertTrue(index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 2, 1]]) + self.assertFalse(index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 1, 0, 1, 1], + [0, 1, 0, 2, 2, 1]]) + self.assertFalse(index.is_lexsorted()) + self.assertEqual(index.lexsort_depth, 0) + + def test_frame_getitem_view(self): + df = self.frame.T.copy() + + # this works because we are modifying the underlying array + # really a no-no + df['foo'].values[:] = 0 + self.assertTrue((df['foo'].values == 0).all()) + + # but not if it's mixed-type + df['foo', 'four'] = 'foo' + df = df.sortlevel(0, axis=1) + + # this will work, but will raise/warn as its chained assignment + def f(): + df['foo']['one'] = 2 + return df + self.assertRaises(com.SettingWithCopyError, f) + + try: + df = f() + except: + pass + self.assertTrue((df['foo', 'one'] == 0).all()) + + def test_frame_getitem_not_sorted(self): + df = self.frame.T + df['foo', 'four'] = 'foo' + + arrays = [np.array(x) for x in zip(*df.columns._tuple_index)] + + 
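+ # note: selecting a top-level key from the unsorted MultiIndex columns should match an explicit reindex on that level, with the selected level dropped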
result = df['foo'] + result2 = df.ix[:, 'foo'] + expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + expected.columns = expected.columns.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + df = df.T + result = df.xs('foo') + result2 = df.ix['foo'] + expected = df.reindex(df.index[arrays[0] == 'foo']) + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = lzip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + + arrays = [np.array(x) for x in zip(*index._tuple_index)] + + result = s['qux'] + result2 = s.ix['qux'] + expected = s[arrays[0] == 'qux'] + expected.index = expected.index.droplevel(0) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + def test_count(self): + frame = self.frame.copy() + frame.index.names = ['a', 'b'] + + result = frame.count(level='b') + expect = self.frame.count(level=1) + assert_frame_equal(result, expect, check_names=False) + + result = frame.count(level='a') + expect = self.frame.count(level=0) + assert_frame_equal(result, expect, check_names=False) + + series = self.series.copy() + series.index.names = ['a', 'b'] + + result = series.count(level='b') + expect = self.series.count(level=1) + assert_series_equal(result, expect) + + result = series.count(level='a') + expect = self.series.count(level=0) + assert_series_equal(result, expect) + + self.assertRaises(KeyError, series.count, 'x') + self.assertRaises(KeyError, frame.count, level='x') + + AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', + 'mad', 'std', 'var', 'sem'] + + def test_series_group_min_max(self): + for op, level, skipna in cart_product(self.AGG_FUNCTIONS, + lrange(2), + [False, True]): + grouped = self.series.groupby(level=level) + aggf = lambda x: getattr(x, op)(skipna=skipna) + # skipna=True + leftside = grouped.agg(aggf) + rightside = getattr(self.series, op)(level=level, skipna=skipna) + assert_series_equal(leftside, rightside) + + def test_frame_group_ops(self): + self.frame.ix[1, [1, 2]] = np.nan + self.frame.ix[7, [0, 1]] = np.nan + + for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, + lrange(2), lrange(2), + [False, True]): + if axis == 0: + frame = self.frame + else: + frame = self.frame.T + + grouped = frame.groupby(level=level, axis=axis) + + pieces = [] + + def aggf(x): + pieces.append(x) + return getattr(x, op)(skipna=skipna, axis=axis) + leftside = grouped.agg(aggf) + rightside = getattr(frame, op)(level=level, axis=axis, + skipna=skipna) + + # for good measure, groupby detail + level_index = frame._get_axis(axis).levels[level] + + self.assertTrue(leftside._get_axis(axis).equals(level_index)) + self.assertTrue(rightside._get_axis(axis).equals(level_index)) + + assert_frame_equal(leftside, rightside) + + def test_stat_op_corner(self): + obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) + + result = obj.sum(level=0) + expected = Series([10.0], index=[2]) + assert_series_equal(result, expected) + + def test_frame_any_all_group(self): + df = DataFrame( + {'data': [False, False, True, False, True, False, True]}, + index=[ + ['one', 'one', 'two', 'one', 'two', 'two', 'two'], + [0, 1, 0, 2, 1, 2, 3]]) + + result = df.any(level=0) + ex = DataFrame({'data': [False, 
True]}, index=['one', 'two']) + assert_frame_equal(result, ex) + + result = df.all(level=0) + ex = DataFrame({'data': [False, False]}, index=['one', 'two']) + assert_frame_equal(result, ex) + + def test_std_var_pass_ddof(self): + index = MultiIndex.from_arrays([np.arange(5).repeat(10), + np.tile(np.arange(10), 5)]) + df = DataFrame(np.random.randn(len(index), 5), index=index) + + for meth in ['var', 'std']: + ddof = 4 + alt = lambda x: getattr(x, meth)(ddof=ddof) + + result = getattr(df[0], meth)(level=0, ddof=ddof) + expected = df[0].groupby(level=0).agg(alt) + assert_series_equal(result, expected) + + result = getattr(df, meth)(level=0, ddof=ddof) + expected = df.groupby(level=0).agg(alt) + assert_frame_equal(result, expected) + + def test_frame_series_agg_multiple_levels(self): + result = self.ymd.sum(level=['year', 'month']) + expected = self.ymd.groupby(level=['year', 'month']).sum() + assert_frame_equal(result, expected) + + result = self.ymd['A'].sum(level=['year', 'month']) + expected = self.ymd['A'].groupby(level=['year', 'month']).sum() + assert_series_equal(result, expected) + + def test_groupby_multilevel(self): + result = self.ymd.groupby(level=[0, 1]).mean() + + k1 = self.ymd.index.get_level_values(0) + k2 = self.ymd.index.get_level_values(1) + + expected = self.ymd.groupby([k1, k2]).mean() + + assert_frame_equal(result, expected, check_names=False) # TODO groupby with level_values drops names + self.assertEqual(result.index.names, self.ymd.index.names[:2]) + + result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean() + assert_frame_equal(result, result2) + + def test_groupby_multilevel_with_transform(self): + pass + + def test_multilevel_consolidate(self): + index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'), + ('bar', 'one'), ('bar', 'two')]) + df = DataFrame(np.random.randn(4, 4), index=index, columns=index) + df['Totals', ''] = df.sum(1) + df = df.consolidate() + + def test_ix_preserve_names(self): + result = self.ymd.ix[2000] + result2 = self.ymd['A'].ix[2000] + self.assertEqual(result.index.names, self.ymd.index.names[1:]) + self.assertEqual(result2.index.names, self.ymd.index.names[1:]) + + result = self.ymd.ix[2000, 2] + result2 = self.ymd['A'].ix[2000, 2] + self.assertEqual(result.index.name, self.ymd.index.names[2]) + self.assertEqual(result2.index.name, self.ymd.index.names[2]) + + def test_partial_set(self): + # GH #397 + df = self.ymd.copy() + exp = self.ymd.copy() + df.ix[2000, 4] = 0 + exp.ix[2000, 4].values[:] = 0 + assert_frame_equal(df, exp) + + df['A'].ix[2000, 4] = 1 + exp['A'].ix[2000, 4].values[:] = 1 + assert_frame_equal(df, exp) + + df.ix[2000] = 5 + exp.ix[2000].values[:] = 5 + assert_frame_equal(df, exp) + + # this works...for now + df['A'].ix[14] = 5 + self.assertEqual(df['A'][14], 5) + + def test_unstack_preserve_types(self): + # GH #403 + self.ymd['E'] = 'foo' + self.ymd['F'] = 2 + + unstacked = self.ymd.unstack('month') + self.assertEqual(unstacked['A', 1].dtype, np.float64) + self.assertEqual(unstacked['E', 1].dtype, np.object_) + self.assertEqual(unstacked['F', 1].dtype, np.float64) + + def test_unstack_group_index_overflow(self): + labels = np.tile(np.arange(500), 2) + level = np.arange(500) + + index = MultiIndex(levels=[level] * 8 + [[0, 1]], + labels=[labels] * 8 + [np.arange(2).repeat(500)]) + + s = Series(np.arange(1000), index=index) + result = s.unstack() + self.assertEqual(result.shape, (500, 2)) + + # test roundtrip + stacked = result.stack() + assert_series_equal(s, + stacked.reindex(s.index)) + + # put it at 
beginning + index = MultiIndex(levels=[[0, 1]] + [level] * 8, + labels=[np.arange(2).repeat(500)] + [labels] * 8) + + s = Series(np.arange(1000), index=index) + result = s.unstack(0) + self.assertEqual(result.shape, (500, 2)) + + # put it in middle + index = MultiIndex(levels=[level] * 4 + [[0, 1]] + [level] * 4, + labels=([labels] * 4 + [np.arange(2).repeat(500)] + + [labels] * 4)) + + s = Series(np.arange(1000), index=index) + result = s.unstack(4) + self.assertEqual(result.shape, (500, 2)) + + def test_getitem_lowerdim_corner(self): + self.assertRaises(KeyError, self.frame.ix.__getitem__, + (('bar', 'three'), 'B')) + + + # in theory should be inserting in a sorted space???? + self.frame.ix[('bar','three'),'B'] = 0 + self.assertEqual(self.frame.sortlevel().ix[('bar','three'),'B'], 0) + + #---------------------------------------------------------------------- + # AMBIGUOUS CASES! + + def test_partial_ix_missing(self): + raise nose.SkipTest("skipping for now") + + result = self.ymd.ix[2000, 0] + expected = self.ymd.ix[2000]['A'] + assert_series_equal(result, expected) + + # need to put in some work here + + # self.ymd.ix[2000, 0] = 0 + # self.assertTrue((self.ymd.ix[2000]['A'] == 0).all()) + + # Pretty sure the second (and maybe even the first) is already wrong. + self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6)) + self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0) + + #---------------------------------------------------------------------- + + def test_to_html(self): + self.ymd.columns.name = 'foo' + self.ymd.to_html() + self.ymd.T.to_html() + + def test_level_with_tuples(self): + index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0), + ('foo', 'qux', 0)], + [0, 1]], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + + series = Series(np.random.randn(6), index=index) + frame = DataFrame(np.random.randn(6, 4), index=index) + + result = series[('foo', 'bar', 0)] + result2 = series.ix[('foo', 'bar', 0)] + expected = series[:2] + expected.index = expected.index.droplevel(0) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + self.assertRaises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2)) + + result = frame.ix[('foo', 'bar', 0)] + result2 = frame.xs(('foo', 'bar', 0)) + expected = frame[:2] + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'), + ('foo', 'qux')], + [0, 1]], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + + series = Series(np.random.randn(6), index=index) + frame = DataFrame(np.random.randn(6, 4), index=index) + + result = series[('foo', 'bar')] + result2 = series.ix[('foo', 'bar')] + expected = series[:2] + expected.index = expected.index.droplevel(0) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + result = frame.ix[('foo', 'bar')] + result2 = frame.xs(('foo', 'bar')) + expected = frame[:2] + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_int_series_slicing(self): + s = self.ymd['A'] + result = s[5:] + expected = s.reindex(s.index[5:]) + assert_series_equal(result, expected) + + exp = self.ymd['A'].copy() + s[5:] = 0 + exp.values[5:] = 0 + self.assert_numpy_array_equal(s.values, exp.values) + + result = self.ymd[5:] + expected = self.ymd.reindex(s.index[5:]) + assert_frame_equal(result, expected) + + def test_mixed_depth_get(self): 
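+ # note: columns of unequal depth are padded with empty strings, so df['a'] should return the ('a', '', '') column with its name flattened back to 'a'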
+ arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + result = df['a'] + expected = df['a', '', ''] + assert_series_equal(result, expected) + self.assertEqual(result.name, 'a') + + result = df['routine1', 'result1'] + expected = df['routine1', 'result1', ''] + assert_series_equal(result, expected) + self.assertEqual(result.name, ('routine1', 'result1')) + + def test_mixed_depth_insert(self): + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + result = df.copy() + expected = df.copy() + result['b'] = [1, 2, 3, 4] + expected['b', '', ''] = [1, 2, 3, 4] + assert_frame_equal(result, expected) + + def test_mixed_depth_drop(self): + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + result = df.drop('a', axis=1) + expected = df.drop([('a', '', '')], axis=1) + assert_frame_equal(expected, result) + + result = df.drop(['top'], axis=1) + expected = df.drop([('top', 'OD', 'wx')], axis=1) + expected = expected.drop([('top', 'OD', 'wy')], axis=1) + assert_frame_equal(expected, result) + + result = df.drop(('top', 'OD', 'wx'), axis=1) + expected = df.drop([('top', 'OD', 'wx')], axis=1) + assert_frame_equal(expected, result) + + expected = df.drop([('top', 'OD', 'wy')], axis=1) + expected = df.drop('top', axis=1) + + result = df.drop('result1', level=1, axis=1) + expected = df.drop([('routine1', 'result1', ''), + ('routine2', 'result1', '')], axis=1) + assert_frame_equal(expected, result) + + def test_drop_nonunique(self): + df = DataFrame([["x-a", "x", "a", 1.5], ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], ["z-b", "z", "b", 2.1]], + columns=["var1", "var2", "var3", "var4"]) + + grp_size = df.groupby("var1").size() + drop_idx = grp_size.ix[grp_size == 1] + + idf = df.set_index(["var1", "var2", "var3"]) + + # it works! 
#2101 + result = idf.drop(drop_idx.index, level=0).reset_index() + expected = df[-df.var1.isin(drop_idx.index)] + + result.index = expected.index + + assert_frame_equal(result, expected) + + def test_mixed_depth_pop(self): + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + df1 = df.copy() + df2 = df.copy() + result = df1.pop('a') + expected = df2.pop(('a', '', '')) + assert_series_equal(expected, result) + assert_frame_equal(df1, df2) + self.assertEqual(result.name, 'a') + + expected = df1['top'] + df1 = df1.drop(['top'], axis=1) + result = df2.pop('top') + assert_frame_equal(expected, result) + assert_frame_equal(df1, df2) + + def test_reindex_level_partial_selection(self): + result = self.frame.reindex(['foo', 'qux'], level=0) + expected = self.frame.ix[[0, 1, 2, 7, 8, 9]] + assert_frame_equal(result, expected) + + result = self.frame.T.reindex_axis(['foo', 'qux'], axis=1, level=0) + assert_frame_equal(result, expected.T) + + result = self.frame.ix[['foo', 'qux']] + assert_frame_equal(result, expected) + + result = self.frame['A'].ix[['foo', 'qux']] + assert_series_equal(result, expected['A']) + + result = self.frame.T.ix[:, ['foo', 'qux']] + assert_frame_equal(result, expected.T) + + def test_setitem_multiple_partial(self): + expected = self.frame.copy() + result = self.frame.copy() + result.ix[['foo', 'bar']] = 0 + expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_frame_equal(result, expected) + + expected = self.frame.copy() + result = self.frame.copy() + result.ix['foo':'bar'] = 0 + expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_frame_equal(result, expected) + + expected = self.frame['A'].copy() + result = self.frame['A'].copy() + result.ix[['foo', 'bar']] = 0 + expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_series_equal(result, expected) + + expected = self.frame['A'].copy() + result = self.frame['A'].copy() + result.ix['foo':'bar'] = 0 + expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_series_equal(result, expected) + + def test_drop_level(self): + result = self.frame.drop(['bar', 'qux'], level='first') + expected = self.frame.ix[[0, 1, 2, 5, 6]] + assert_frame_equal(result, expected) + + result = self.frame.drop(['two'], level='second') + expected = self.frame.ix[[0, 2, 3, 6, 7, 9]] + assert_frame_equal(result, expected) + + result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') + expected = self.frame.ix[[0, 1, 2, 5, 6]].T + assert_frame_equal(result, expected) + + result = self.frame.T.drop(['two'], axis=1, level='second') + expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T + assert_frame_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]], + names=['one', 'two']) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + self.assertEqual(result.index.names, ('one', 'two')) + + def test_unicode_repr_issues(self): + levels = [Index([u('a/\u03c3'), u('b/\u03c3'), u('c/\u03c3')]), + Index([0, 1])] + labels = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] + index = MultiIndex(levels=levels, labels=labels) + + repr(index.levels) + + # NumPy bug + # repr(index.get_level_values(1)) + + def test_unicode_repr_level_names(self): + index = MultiIndex.from_tuples([(0, 0), (1, 1)], + names=[u('\u0394'), 
'i1']) + + s = Series(lrange(2), index=index) + df = DataFrame(np.random.randn(2, 4), index=index) + repr(s) + repr(df) + + def test_dataframe_insert_column_all_na(self): + # GH #1534 + mix = MultiIndex.from_tuples( + [('1a', '2a'), ('1a', '2b'), ('1a', '2c')]) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) + s = Series({(1, 1): 1, (1, 2): 2}) + df['new'] = s + self.assertTrue(df['new'].isnull().all()) + + def test_join_segfault(self): + # 1532 + df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]}) + df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]}) + df1 = df1.set_index(['a', 'b']) + df2 = df2.set_index(['a', 'b']) + # it works! + for how in ['left', 'right', 'outer']: + df1.join(df2, how=how) + + def test_set_column_scalar_with_ix(self): + subset = self.frame.index[[1, 4, 5]] + + self.frame.ix[subset] = 99 + self.assertTrue((self.frame.ix[subset].values == 99).all()) + + col = self.frame['B'] + col[subset] = 97 + self.assertTrue((self.frame.ix[subset, 'B'] == 97).all()) + + def test_frame_dict_constructor_empty_series(self): + s1 = Series([1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), + (2, 2), (2, 4)])) + s2 = Series([1, 2, 3, 4], + index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)])) + s3 = Series() + + # it works! + df = DataFrame({'foo': s1, 'bar': s2, 'baz': s3}) + df = DataFrame.from_dict({'foo': s1, 'baz': s3, 'bar': s2}) + + def test_indexing_ambiguity_bug_1678(self): + columns = MultiIndex.from_tuples([('Ohio', 'Green'), ('Ohio', 'Red'), + ('Colorado', 'Green')]) + index = MultiIndex.from_tuples( + [('a', 1), ('a', 2), ('b', 1), ('b', 2)]) + + frame = DataFrame(np.arange(12).reshape((4, 3)), index=index, + columns=columns) + + result = frame.ix[:, 1] + exp = frame.icol(1) + tm.assert_isinstance(result, Series) + assert_series_equal(result, exp) + + def test_nonunique_assignment_1750(self): + df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], + columns=list("ABCD")) + + df = df.set_index(['A', 'B']) + ix = MultiIndex.from_tuples([(1, 1)]) + + df.ix[ix, "C"] = '_' + + self.assertTrue((df.xs((1, 1))['C'] == '_').all()) + + def test_indexing_over_hashtable_size_cutoff(self): + n = 10000 + + old_cutoff = _index._SIZE_CUTOFF + _index._SIZE_CUTOFF = 20000 + + s = Series(np.arange(n), + MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + + # hai it works! 
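+ # note: with the temporary _SIZE_CUTOFF override in effect, scalar tuple lookups should still resolve to the right positions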
+ self.assertEqual(s[("a", 5)], 5) + self.assertEqual(s[("a", 6)], 6) + self.assertEqual(s[("a", 7)], 7) + + _index._SIZE_CUTOFF = old_cutoff + + def test_multiindex_na_repr(self): + # only an issue with long columns + + from numpy import nan + df3 = DataFrame({ + 'A' * 30: {('A', 'A0006000', 'nuit'): 'A0006000'}, + 'B' * 30: {('A', 'A0006000', 'nuit'): nan}, + 'C' * 30: {('A', 'A0006000', 'nuit'): nan}, + 'D' * 30: {('A', 'A0006000', 'nuit'): nan}, + 'E' * 30: {('A', 'A0006000', 'nuit'): 'A'}, + 'F' * 30: {('A', 'A0006000', 'nuit'): nan}, + }) + + idf = df3.set_index(['A' * 30, 'C' * 30]) + repr(idf) + + def test_assign_index_sequences(self): + # #2200 + df = DataFrame({"a": [1, 2, 3], + "b": [4, 5, 6], + "c": [7, 8, 9]}).set_index(["a", "b"]) + l = list(df.index) + l[0] = ("faz", "boo") + df.index = l + repr(df) + + # this travels an improper code path + l[0] = ["faz", "boo"] + df.index = l + repr(df) + + def test_tuples_have_na(self): + index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], + labels=[[1, 1, 1, 1, -1, 0, 0, 0], + [0, 1, 2, 3, 0, 1, 2, 3]]) + + self.assertTrue(isnull(index[4][0])) + self.assertTrue(isnull(index.values[4][0])) + + def test_duplicate_groupby_issues(self): + idx_tp = [('600809', '20061231'), ('600809', '20070331'), + ('600809', '20070630'), ('600809', '20070331')] + dt = ['demo','demo','demo','demo'] + + idx = MultiIndex.from_tuples(idx_tp,names = ['STK_ID','RPT_Date']) + s = Series(dt, index=idx) + + result = s.groupby(s.index).first() + self.assertEqual(len(result), 3) + + def test_duplicate_mi(self): + # GH 4516 + df = DataFrame([['foo','bar',1.0,1],['foo','bar',2.0,2],['bah','bam',3.0,3], + ['bah','bam',4.0,4],['foo','bar',5.0,5],['bah','bam',6.0,6]], + columns=list('ABCD')) + df = df.set_index(['A','B']) + df = df.sortlevel(0) + expected = DataFrame([['foo','bar',1.0,1],['foo','bar',2.0,2],['foo','bar',5.0,5]], + columns=list('ABCD')).set_index(['A','B']) + result = df.loc[('foo','bar')] + assert_frame_equal(result,expected) + + def test_multiindex_set_index(self): + # segfault in #3308 + d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]} + df = DataFrame(d) + tuples = [(0, 1), (0, 2), (1, 2)] + df['tuples'] = tuples + + index = MultiIndex.from_tuples(df['tuples']) + # it works! 
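+ # note: building the MultiIndex from the tuple column and calling set_index without error is the point of the test (segfault in #3308)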
+ df.set_index(index) + + def test_datetimeindex(self): + idx1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'] * 2, tz='Asia/Tokyo') + idx2 = pd.date_range('2010/01/01', periods=6, freq='M', tz='US/Eastern') + idx = MultiIndex.from_arrays([idx1, idx2]) + + expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00'], tz='Asia/Tokyo') + + self.assertTrue(idx.levels[0].equals(expected1)) + self.assertTrue(idx.levels[1].equals(idx2)) + + def test_set_index_datetime(self): + # GH 3950 + df = pd.DataFrame({'label':['a', 'a', 'a', 'b', 'b', 'b'], + 'datetime':['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 09:00:00'], + 'value':range(6)}) + df.index = pd.to_datetime(df.pop('datetime'), utc=True) + df.index = df.index.tz_localize('UTC').tz_convert('US/Pacific') + + expected = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']) + expected = expected.tz_localize('UTC').tz_convert('US/Pacific') + + df = df.set_index('label', append=True) + self.assertTrue(df.index.levels[0].equals(expected)) + self.assertTrue(df.index.levels[1].equals(pd.Index(['a', 'b']))) + + df = df.swaplevel(0, 1) + self.assertTrue(df.index.levels[0].equals(pd.Index(['a', 'b']))) + self.assertTrue(df.index.levels[1].equals(expected)) + + + df = DataFrame(np.random.random(6)) + idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 09:00:00'], tz='US/Eastern') + idx2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-01 09:00', '2012-04-01 09:00', + '2012-04-02 09:00', '2012-04-02 09:00', '2012-04-02 09:00'], + tz='US/Eastern') + idx3 = pd.date_range('2011-01-01 09:00', periods=6, tz='Asia/Tokyo') + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], tz='US/Eastern') + expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'], tz='US/Eastern') + + self.assertTrue(df.index.levels[0].equals(expected1)) + self.assertTrue(df.index.levels[1].equals(expected2)) + self.assertTrue(df.index.levels[2].equals(idx3)) + + # GH 7092 + self.assertTrue(df.index.get_level_values(0).equals(idx1)) + self.assertTrue(df.index.get_level_values(1).equals(idx2)) + self.assertTrue(df.index.get_level_values(2).equals(idx3)) + + def test_reset_index_datetime(self): + # GH 3950 + for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']: + idx1 = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, name='idx1') + idx2 = pd.Index(range(5), name='idx2',dtype='int64') + idx = pd.MultiIndex.from_arrays([idx1, idx2]) + df = pd.DataFrame({'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) + + expected = pd.DataFrame({'idx1': [datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5)], + 'idx2': np.arange(5,dtype='int64'), + 'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, + columns=['idx1', 'idx2', 'a', 'b']) + expected['idx1'] = expected['idx1'].apply(lambda d: pd.Timestamp(d, tz=tz)) + assert_frame_equal(df.reset_index(), expected) + + idx3 = pd.date_range('1/1/2012', periods=5, freq='MS', tz='Europe/Paris', name='idx3') + idx = pd.MultiIndex.from_arrays([idx1, idx2, idx3]) + df = 
pd.DataFrame({'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) + + expected = pd.DataFrame({'idx1': [datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5)], + 'idx2': np.arange(5,dtype='int64'), + 'idx3': [datetime.datetime(2012, 1, 1), + datetime.datetime(2012, 2, 1), + datetime.datetime(2012, 3, 1), + datetime.datetime(2012, 4, 1), + datetime.datetime(2012, 5, 1)], + 'a': np.arange(5,dtype='int64'), 'b': ['A', 'B', 'C', 'D', 'E']}, + columns=['idx1', 'idx2', 'idx3', 'a', 'b']) + expected['idx1'] = expected['idx1'].apply(lambda d: pd.Timestamp(d, tz=tz)) + expected['idx3'] = expected['idx3'].apply(lambda d: pd.Timestamp(d, tz='Europe/Paris')) + assert_frame_equal(df.reset_index(), expected) + + def test_set_index_period(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = pd.period_range('2011-01-01', periods=3, freq='M') + idx1 = idx1.append(idx1) + idx2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + idx2 = idx2.append(idx2).append(idx2) + idx3 = pd.period_range('2005', periods=6, freq='Y') + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = pd.period_range('2011-01-01', periods=3, freq='M') + expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + + self.assertTrue(df.index.levels[0].equals(expected1)) + self.assertTrue(df.index.levels[1].equals(expected2)) + self.assertTrue(df.index.levels[2].equals(idx3)) + + self.assertTrue(df.index.get_level_values(0).equals(idx1)) + self.assertTrue(df.index.get_level_values(1).equals(idx2)) + self.assertTrue(df.index.get_level_values(2).equals(idx3)) + + +if __name__ == '__main__': + + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py new file mode 100644 index 00000000..3e8a5fec --- /dev/null +++ b/pandas/tests/test_nanops.py @@ -0,0 +1,756 @@ +from __future__ import division, print_function + +from functools import partial + +import numpy as np + +from pandas.core.common import isnull +import pandas.core.nanops as nanops +import pandas.util.testing as tm + +nanops._USE_BOTTLENECK = False + + +class TestnanopsDataFrame(tm.TestCase): + def setUp(self): + np.random.seed(11235) + + self.arr_shape = (11, 7, 5) + + self.arr_float = np.random.randn(*self.arr_shape) + self.arr_float1 = np.random.randn(*self.arr_shape) + self.arr_complex = self.arr_float + self.arr_float1*1j + self.arr_int = np.random.randint(-10, 10, self.arr_shape) + self.arr_bool = np.random.randint(0, 2, self.arr_shape) == 0 + self.arr_str = np.abs(self.arr_float).astype('S') + self.arr_utf = np.abs(self.arr_float).astype('U') + self.arr_date = np.random.randint(0, 20000, + self.arr_shape).astype('M8[ns]') + self.arr_tdelta = np.random.randint(0, 20000, + self.arr_shape).astype('m8[ns]') + + self.arr_nan = np.tile(np.nan, self.arr_shape) + self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan]) + self.arr_float1_nan = np.vstack([self.arr_float1, self.arr_nan]) + self.arr_nan_float1 = np.vstack([self.arr_nan, self.arr_float1]) + self.arr_nan_nan = np.vstack([self.arr_nan, self.arr_nan]) + + self.arr_inf = self.arr_float*np.inf + self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf]) + self.arr_float1_inf = np.vstack([self.arr_float1, self.arr_inf]) + self.arr_inf_float1 = np.vstack([self.arr_inf, 
self.arr_float1]) + self.arr_inf_inf = np.vstack([self.arr_inf, self.arr_inf]) + + self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf]) + self.arr_float_nan_inf = np.vstack([self.arr_float, + self.arr_nan, + self.arr_inf]) + self.arr_nan_float1_inf = np.vstack([self.arr_float, + self.arr_inf, + self.arr_nan]) + self.arr_nan_nan_inf = np.vstack([self.arr_nan, + self.arr_nan, + self.arr_inf]) + self.arr_obj = np.vstack([self.arr_float.astype('O'), + self.arr_int.astype('O'), + self.arr_bool.astype('O'), + self.arr_complex.astype('O'), + self.arr_str.astype('O'), + self.arr_utf.astype('O'), + self.arr_date.astype('O'), + self.arr_tdelta.astype('O')]) + + self.arr_nan_nanj = self.arr_nan + self.arr_nan*1j + self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj]) + + self.arr_nan_infj = self.arr_inf*1j + self.arr_complex_nan_infj = np.vstack([self.arr_complex, + self.arr_nan_infj]) + + self.arr_float_2d = self.arr_float[:, :, 0] + self.arr_float1_2d = self.arr_float1[:, :, 0] + self.arr_complex_2d = self.arr_complex[:, :, 0] + self.arr_int_2d = self.arr_int[:, :, 0] + self.arr_bool_2d = self.arr_bool[:, :, 0] + self.arr_str_2d = self.arr_str[:, :, 0] + self.arr_utf_2d = self.arr_utf[:, :, 0] + self.arr_date_2d = self.arr_date[:, :, 0] + self.arr_tdelta_2d = self.arr_tdelta[:, :, 0] + + self.arr_nan_2d = self.arr_nan[:, :, 0] + self.arr_float_nan_2d = self.arr_float_nan[:, :, 0] + self.arr_float1_nan_2d = self.arr_float1_nan[:, :, 0] + self.arr_nan_float1_2d = self.arr_nan_float1[:, :, 0] + self.arr_nan_nan_2d = self.arr_nan_nan[:, :, 0] + self.arr_nan_nanj_2d = self.arr_nan_nanj[:, :, 0] + self.arr_complex_nan_2d = self.arr_complex_nan[:, :, 0] + + self.arr_inf_2d = self.arr_inf[:, :, 0] + self.arr_float_inf_2d = self.arr_float_inf[:, :, 0] + self.arr_nan_inf_2d = self.arr_nan_inf[:, :, 0] + self.arr_float_nan_inf_2d = self.arr_float_nan_inf[:, :, 0] + self.arr_nan_nan_inf_2d = self.arr_nan_nan_inf[:, :, 0] + + self.arr_float_1d = self.arr_float[:, 0, 0] + self.arr_float1_1d = self.arr_float1[:, 0, 0] + self.arr_complex_1d = self.arr_complex[:, 0, 0] + self.arr_int_1d = self.arr_int[:, 0, 0] + self.arr_bool_1d = self.arr_bool[:, 0, 0] + self.arr_str_1d = self.arr_str[:, 0, 0] + self.arr_utf_1d = self.arr_utf[:, 0, 0] + self.arr_date_1d = self.arr_date[:, 0, 0] + self.arr_tdelta_1d = self.arr_tdelta[:, 0, 0] + + self.arr_nan_1d = self.arr_nan[:, 0, 0] + self.arr_float_nan_1d = self.arr_float_nan[:, 0, 0] + self.arr_float1_nan_1d = self.arr_float1_nan[:, 0, 0] + self.arr_nan_float1_1d = self.arr_nan_float1[:, 0, 0] + self.arr_nan_nan_1d = self.arr_nan_nan[:, 0, 0] + self.arr_nan_nanj_1d = self.arr_nan_nanj[:, 0, 0] + self.arr_complex_nan_1d = self.arr_complex_nan[:, 0, 0] + + self.arr_inf_1d = self.arr_inf.ravel() + self.arr_float_inf_1d = self.arr_float_inf[:, 0, 0] + self.arr_nan_inf_1d = self.arr_nan_inf[:, 0, 0] + self.arr_float_nan_inf_1d = self.arr_float_nan_inf[:, 0, 0] + self.arr_nan_nan_inf_1d = self.arr_nan_nan_inf[:, 0, 0] + + def check_results(self, targ, res, axis): + res = getattr(res, 'asm8', res) + res = getattr(res, 'values', res) + if axis != 0 and hasattr(targ, 'shape') and targ.ndim: + res = np.split(res, [targ.shape[0]], axis=0)[0] + try: + tm.assert_almost_equal(targ, res) + except: + # There are sometimes rounding errors with + # complex and object dtypes. + # If it isn't one of those, re-raise the error. 
+ if not hasattr(res, 'dtype') or res.dtype.kind not in ['c', 'O']: + raise + # convert object dtypes to something that can be split into + # real and imaginary parts + if res.dtype.kind == 'O': + if targ.dtype.kind != 'O': + res = res.astype(targ.dtype) + else: + try: + res = res.astype('c16') + except: + res = res.astype('f8') + try: + targ = targ.astype('c16') + except: + targ = targ.astype('f8') + # there should never be a case where numpy returns an object + # but nanops doesn't, so make that an exception + elif targ.dtype.kind == 'O': + raise + tm.assert_almost_equal(targ.real, res.real) + tm.assert_almost_equal(targ.imag, res.imag) + + def check_fun_data(self, testfunc, targfunc, + testarval, targarval, targarnanval, **kwargs): + for axis in list(range(targarval.ndim))+[None]: + for skipna in [False, True]: + targartempval = targarval if skipna else targarnanval + try: + targ = targfunc(targartempval, axis=axis, **kwargs) + res = testfunc(testarval, axis=axis, skipna=skipna, + **kwargs) + self.check_results(targ, res, axis) + if skipna: + res = testfunc(testarval, axis=axis) + self.check_results(targ, res, axis) + if axis is None: + res = testfunc(testarval, skipna=skipna) + self.check_results(targ, res, axis) + if skipna and axis is None: + res = testfunc(testarval) + self.check_results(targ, res, axis) + except BaseException as exc: + exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1), + 'skipna: %s' % skipna, + 'kwargs: %s' % kwargs) + raise + + if testarval.ndim <= 1: + return + + try: + testarval2 = np.take(testarval, 0, axis=-1) + targarval2 = np.take(targarval, 0, axis=-1) + targarnanval2 = np.take(targarnanval, 0, axis=-1) + except ValueError: + return + self.check_fun_data(testfunc, targfunc, + testarval2, targarval2, targarnanval2, + **kwargs) + + def check_fun(self, testfunc, targfunc, + testar, targar=None, targarnan=None, + **kwargs): + if targar is None: + targar = testar + if targarnan is None: + targarnan = testar + testarval = getattr(self, testar) + targarval = getattr(self, targar) + targarnanval = getattr(self, targarnan) + try: + self.check_fun_data(testfunc, targfunc, + testarval, targarval, targarnanval, **kwargs) + except BaseException as exc: + exc.args += ('testar: %s' % testar, + 'targar: %s' % targar, + 'targarnan: %s' % targarnan) + raise + + def check_funs(self, testfunc, targfunc, + allow_complex=True, allow_all_nan=True, allow_str=True, + allow_date=True, allow_obj=True, + **kwargs): + self.check_fun(testfunc, targfunc, 'arr_float', **kwargs) + self.check_fun(testfunc, targfunc, 'arr_float_nan', 'arr_float', + **kwargs) + self.check_fun(testfunc, targfunc, 'arr_int', **kwargs) + self.check_fun(testfunc, targfunc, 'arr_bool', **kwargs) + objs = [self.arr_float.astype('O'), + self.arr_int.astype('O'), + self.arr_bool.astype('O')] + + if allow_all_nan: + self.check_fun(testfunc, targfunc, 'arr_nan', **kwargs) + + if allow_complex: + self.check_fun(testfunc, targfunc, 'arr_complex', **kwargs) + self.check_fun(testfunc, targfunc, + 'arr_complex_nan', 'arr_complex', **kwargs) + if allow_all_nan: + self.check_fun(testfunc, targfunc, 'arr_nan_nanj', **kwargs) + objs += [self.arr_complex.astype('O')] + + if allow_str: + self.check_fun(testfunc, targfunc, 'arr_str', **kwargs) + self.check_fun(testfunc, targfunc, 'arr_utf', **kwargs) + objs += [self.arr_str.astype('O'), + self.arr_utf.astype('O')] + + if allow_date: + try: + targfunc(self.arr_date) + except TypeError: + pass + else: + self.check_fun(testfunc, targfunc, 'arr_date', **kwargs) + objs += 
[self.arr_date.astype('O')] + try: + targfunc(self.arr_tdelta) + except TypeError: + pass + else: + self.check_fun(testfunc, targfunc, 'arr_tdelta', **kwargs) + objs += [self.arr_tdelta.astype('O')] + + if allow_obj: + self.arr_obj = np.vstack(objs) + # some nanops handle object dtypes better than their numpy + # counterparts, so the numpy functions need to be given something + # else + if allow_obj == 'convert': + targfunc = partial(self._badobj_wrap, + func=targfunc, allow_complex=allow_complex) + self.check_fun(testfunc, targfunc, 'arr_obj', **kwargs) + + def check_funs_ddof(self, testfunc, targfunc, + allow_complex=True, allow_all_nan=True, allow_str=True, + allow_date=True, allow_obj=True,): + for ddof in range(3): + try: + self.check_funs(self, testfunc, targfunc, + allow_complex, allow_all_nan, allow_str, + allow_date, allow_obj, + ddof=ddof) + except BaseException as exc: + exc.args += ('ddof %s' % ddof,) + + def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): + if value.dtype.kind == 'O': + if allow_complex: + value = value.astype('c16') + else: + value = value.astype('f8') + return func(value, **kwargs) + + def test_nanany(self): + self.check_funs(nanops.nanany, np.any, + allow_all_nan=False, allow_str=False, allow_date=False) + + def test_nanall(self): + self.check_funs(nanops.nanall, np.all, + allow_all_nan=False, allow_str=False, allow_date=False) + + def test_nansum(self): + self.check_funs(nanops.nansum, np.sum, + allow_str=False, allow_date=False) + + def test_nanmean(self): + self.check_funs(nanops.nanmean, np.mean, + allow_complex=False, allow_obj=False, + allow_str=False, allow_date=False) + + def test_nanmedian(self): + self.check_funs(nanops.nanmedian, np.median, + allow_complex=False, allow_str=False, allow_date=False, + allow_obj='convert') + + def test_nanvar(self): + self.check_funs_ddof(nanops.nanvar, np.var, + allow_complex=False, allow_date=False) + + def test_nansem(self): + tm.skip_if_no_package('scipy.stats') + self.check_funs_ddof(nanops.nansem, np.var, + allow_complex=False, allow_date=False) + + def _minmax_wrap(self, value, axis=None, func=None): + res = func(value, axis) + if res.dtype.kind == 'm': + res = np.atleast_1d(res) + return res + + def test_nanmin(self): + func = partial(self._minmax_wrap, func=np.min) + self.check_funs(nanops.nanmin, func, + allow_str=False, allow_obj=False) + + def test_nanmax(self): + func = partial(self._minmax_wrap, func=np.max) + self.check_funs(nanops.nanmax, func, + allow_str=False, allow_obj=False) + + def _argminmax_wrap(self, value, axis=None, func=None): + res = func(value, axis) + nans = np.min(value, axis) + nullnan = isnull(nans) + if res.ndim: + res[nullnan] = -1 + elif (hasattr(nullnan, 'all') and nullnan.all() or + not hasattr(nullnan, 'all') and nullnan): + res = -1 + return res + + def test_nanargmax(self): + func = partial(self._argminmax_wrap, func=np.argmax) + self.check_funs(nanops.nanargmax, func, + allow_str=False, allow_obj=False) + + def test_nanargmin(self): + func = partial(self._argminmax_wrap, func=np.argmin) + if tm.sys.version_info[0:2] == (2, 6): + self.check_funs(nanops.nanargmin, func, + allow_date=False, + allow_str=False, allow_obj=False) + else: + self.check_funs(nanops.nanargmin, func, + allow_str=False, allow_obj=False) + + def _skew_kurt_wrap(self, values, axis=None, func=None): + if not isinstance(values.dtype.type, np.floating): + values = values.astype('f8') + result = func(values, axis=axis, bias=False) + # fix for handling cases where all elements in an axis are the 
same + if isinstance(result, np.ndarray): + result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0 + return result + elif np.max(values) == np.min(values): + return 0. + return result + + def test_nanskew(self): + tm.skip_if_no_package('scipy.stats') + from scipy.stats import skew + func = partial(self._skew_kurt_wrap, func=skew) + self.check_funs(nanops.nanskew, func, + allow_complex=False, allow_str=False, allow_date=False) + + def test_nankurt(self): + tm.skip_if_no_package('scipy.stats') + from scipy.stats import kurtosis + func1 = partial(kurtosis, fisher=True) + func = partial(self._skew_kurt_wrap, func=func1) + self.check_funs(nanops.nankurt, func, + allow_complex=False, allow_str=False, allow_date=False) + + def test_nanprod(self): + self.check_funs(nanops.nanprod, np.prod, + allow_str=False, allow_date=False) + + def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): + res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, + **kwargs) + res01 = checkfun(self.arr_float_2d, self.arr_float1_2d, + min_periods=len(self.arr_float_2d)-1, + **kwargs) + tm.assert_almost_equal(targ0, res00) + tm.assert_almost_equal(targ0, res01) + + res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, + **kwargs) + res11 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, + min_periods=len(self.arr_float_2d)-1, + **kwargs) + tm.assert_almost_equal(targ1, res10) + tm.assert_almost_equal(targ1, res11) + + targ2 = np.nan + res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, + **kwargs) + res21 = checkfun(self.arr_float_2d, self.arr_nan_2d, + **kwargs) + res22 = checkfun(self.arr_nan_2d, self.arr_nan_2d, + **kwargs) + res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, + **kwargs) + res24 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, + min_periods=len(self.arr_float_2d)-1, + **kwargs) + res25 = checkfun(self.arr_float_2d, self.arr_float1_2d, + min_periods=len(self.arr_float_2d)+1, + **kwargs) + tm.assert_almost_equal(targ2, res20) + tm.assert_almost_equal(targ2, res21) + tm.assert_almost_equal(targ2, res22) + tm.assert_almost_equal(targ2, res23) + tm.assert_almost_equal(targ2, res24) + tm.assert_almost_equal(targ2, res25) + + def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): + res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, + **kwargs) + res01 = checkfun(self.arr_float_1d, self.arr_float1_1d, + min_periods=len(self.arr_float_1d)-1, + **kwargs) + tm.assert_almost_equal(targ0, res00) + tm.assert_almost_equal(targ0, res01) + + res10 = checkfun(self.arr_float_nan_1d, + self.arr_float1_nan_1d, + **kwargs) + res11 = checkfun(self.arr_float_nan_1d, + self.arr_float1_nan_1d, + min_periods=len(self.arr_float_1d)-1, + **kwargs) + tm.assert_almost_equal(targ1, res10) + tm.assert_almost_equal(targ1, res11) + + targ2 = np.nan + res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, + **kwargs) + res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, + **kwargs) + res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, + **kwargs) + res23 = checkfun(self.arr_float_nan_1d, + self.arr_nan_float1_1d, + **kwargs) + res24 = checkfun(self.arr_float_nan_1d, + self.arr_nan_float1_1d, + min_periods=len(self.arr_float_1d)-1, + **kwargs) + res25 = checkfun(self.arr_float_1d, + self.arr_float1_1d, + min_periods=len(self.arr_float_1d)+1, + **kwargs) + tm.assert_almost_equal(targ2, res20) + tm.assert_almost_equal(targ2, res21) + tm.assert_almost_equal(targ2, res22) + tm.assert_almost_equal(targ2, res23) + tm.assert_almost_equal(targ2, res24) + 
tm.assert_almost_equal(targ2, res25) + + def test_nancorr(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, + self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1) + targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] + targ1 = np.corrcoef(self.arr_float_1d.flat, + self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, + method='pearson') + + def test_nancorr_pearson(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, + self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, + method='pearson') + targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] + targ1 = np.corrcoef(self.arr_float_1d.flat, + self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, + method='pearson') + + def test_nancorr_kendall(self): + tm.skip_if_no_package('scipy.stats') + from scipy.stats import kendalltau + targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0] + targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, + method='kendall') + targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0] + targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, + method='kendall') + + def test_nancorr_spearman(self): + tm.skip_if_no_package('scipy.stats') + from scipy.stats import spearmanr + targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0] + targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, + method='spearman') + targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0] + targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, + method='spearman') + + def test_nancov(self): + targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancov, targ0, targ1) + targ0 = np.cov(self.arr_float_1d, self.arr_float1_1d)[0, 1] + targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1) + + def check_nancomp(self, checkfun, targ0): + arr_float = self.arr_float + arr_float1 = self.arr_float1 + arr_nan = self.arr_nan + arr_nan_nan = self.arr_nan_nan + arr_float_nan = self.arr_float_nan + arr_float1_nan = self.arr_float1_nan + arr_nan_float1 = self.arr_nan_float1 + + while targ0.ndim: + try: + res0 = checkfun(arr_float, arr_float1) + tm.assert_almost_equal(targ0, res0) + + if targ0.ndim > 1: + targ1 = np.vstack([targ0, arr_nan]) + else: + targ1 = np.hstack([targ0, arr_nan]) + res1 = checkfun(arr_float_nan, arr_float1_nan) + tm.assert_almost_equal(targ1, res1) + + targ2 = arr_nan_nan + res2 = checkfun(arr_float_nan, arr_nan_float1) + tm.assert_almost_equal(targ2, res2) + except Exception as exc: + exc.args += ('ndim: %s' % arr_float.ndim,) + raise + + try: + arr_float = np.take(arr_float, 0, axis=-1) + arr_float1 = np.take(arr_float1, 0, axis=-1) + arr_nan = np.take(arr_nan, 0, axis=-1) + arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1) + arr_float_nan = np.take(arr_float_nan, 0, axis=-1) + arr_float1_nan = np.take(arr_float1_nan, 
0, axis=-1) + arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1) + targ0 = np.take(targ0, 0, axis=-1) + except ValueError: + break + + def test_nangt(self): + targ0 = self.arr_float > self.arr_float1 + self.check_nancomp(nanops.nangt, targ0) + + def test_nange(self): + targ0 = self.arr_float >= self.arr_float1 + self.check_nancomp(nanops.nange, targ0) + + def test_nanlt(self): + targ0 = self.arr_float < self.arr_float1 + self.check_nancomp(nanops.nanlt, targ0) + + def test_nanle(self): + targ0 = self.arr_float <= self.arr_float1 + self.check_nancomp(nanops.nanle, targ0) + + def test_naneq(self): + targ0 = self.arr_float == self.arr_float1 + self.check_nancomp(nanops.naneq, targ0) + + def test_nanne(self): + targ0 = self.arr_float != self.arr_float1 + self.check_nancomp(nanops.nanne, targ0) + + def check_bool(self, func, value, correct, *args, **kwargs): + while getattr(value, 'ndim', True): + try: + res0 = func(value, *args, **kwargs) + if correct: + self.assertTrue(res0) + else: + self.assertFalse(res0) + except BaseException as exc: + exc.args += ('dim: %s' % getattr(value, 'ndim', value),) + raise + if not hasattr(value, 'ndim'): + break + try: + value = np.take(value, 0, axis=-1) + except ValueError: + break + + def test__has_infs(self): + pairs = [('arr_complex', False), + ('arr_int', False), + ('arr_bool', False), + ('arr_str', False), + ('arr_utf', False), + ('arr_complex', False), + ('arr_complex_nan', False), + + ('arr_nan_nanj', False), + ('arr_nan_infj', True), + ('arr_complex_nan_infj', True)] + pairs_float = [('arr_float', False), + ('arr_nan', False), + ('arr_float_nan', False), + ('arr_nan_nan', False), + + ('arr_float_inf', True), + ('arr_inf', True), + ('arr_nan_inf', True), + ('arr_float_nan_inf', True), + ('arr_nan_nan_inf', True)] + + for arr, correct in pairs: + val = getattr(self, arr) + try: + self.check_bool(nanops._has_infs, val, correct) + except BaseException as exc: + exc.args += (arr,) + raise + + for arr, correct in pairs_float: + val = getattr(self, arr) + try: + self.check_bool(nanops._has_infs, val, correct) + self.check_bool(nanops._has_infs, val.astype('f4'), correct) + self.check_bool(nanops._has_infs, val.astype('f2'), correct) + except BaseException as exc: + exc.args += (arr,) + raise + + def test__isfinite(self): + pairs = [('arr_complex', False), + ('arr_int', False), + ('arr_bool', False), + ('arr_str', False), + ('arr_utf', False), + ('arr_complex', False), + ('arr_complex_nan', True), + + ('arr_nan_nanj', True), + ('arr_nan_infj', True), + ('arr_complex_nan_infj', True)] + pairs_float = [('arr_float', False), + ('arr_nan', True), + ('arr_float_nan', True), + ('arr_nan_nan', True), + + ('arr_float_inf', True), + ('arr_inf', True), + ('arr_nan_inf', True), + ('arr_float_nan_inf', True), + ('arr_nan_nan_inf', True)] + + func1 = lambda x: np.any(nanops._isfinite(x).ravel()) + func2 = lambda x: np.any(nanops._isfinite(x).values.ravel()) + for arr, correct in pairs: + val = getattr(self, arr) + try: + self.check_bool(func1, val, correct) + except BaseException as exc: + exc.args += (arr,) + raise + + for arr, correct in pairs_float: + val = getattr(self, arr) + try: + self.check_bool(func1, val, correct) + self.check_bool(func1, val.astype('f4'), correct) + self.check_bool(func1, val.astype('f2'), correct) + except BaseException as exc: + exc.args += (arr,) + raise + + def test__bn_ok_dtype(self): + self.assertTrue(nanops._bn_ok_dtype(self.arr_float.dtype, 'test')) + self.assertTrue(nanops._bn_ok_dtype(self.arr_complex.dtype, 'test')) + 
self.assertTrue(nanops._bn_ok_dtype(self.arr_int.dtype, 'test')) + self.assertTrue(nanops._bn_ok_dtype(self.arr_bool.dtype, 'test')) + self.assertTrue(nanops._bn_ok_dtype(self.arr_str.dtype, 'test')) + self.assertTrue(nanops._bn_ok_dtype(self.arr_utf.dtype, 'test')) + self.assertFalse(nanops._bn_ok_dtype(self.arr_date.dtype, 'test')) + self.assertFalse(nanops._bn_ok_dtype(self.arr_tdelta.dtype, 'test')) + self.assertFalse(nanops._bn_ok_dtype(self.arr_obj.dtype, 'test')) + + +class TestEnsureNumeric(tm.TestCase): + def test_numeric_values(self): + # Test integer + self.assertEqual(nanops._ensure_numeric(1), 1, 'Failed for int') + # Test float + self.assertEqual(nanops._ensure_numeric(1.1), 1.1, 'Failed for float') + # Test complex + self.assertEqual(nanops._ensure_numeric(1 + 2j), 1 + 2j, + 'Failed for complex') + + def test_ndarray(self): + # Test numeric ndarray + values = np.array([1, 2, 3]) + self.assertTrue(np.allclose(nanops._ensure_numeric(values), values), + 'Failed for numeric ndarray') + + # Test object ndarray + o_values = values.astype(object) + self.assertTrue(np.allclose(nanops._ensure_numeric(o_values), values), + 'Failed for object ndarray') + + # Test convertible string ndarray + s_values = np.array(['1', '2', '3'], dtype=object) + self.assertTrue(np.allclose(nanops._ensure_numeric(s_values), values), + 'Failed for convertible string ndarray') + + # Test non-convertible string ndarray + s_values = np.array(['foo', 'bar', 'baz'], dtype=object) + self.assertRaises(ValueError, + lambda: nanops._ensure_numeric(s_values)) + + def test_convertable_values(self): + self.assertTrue(np.allclose(nanops._ensure_numeric('1'), 1.0), + 'Failed for convertible integer string') + self.assertTrue(np.allclose(nanops._ensure_numeric('1.1'), 1.1), + 'Failed for convertible float string') + self.assertTrue(np.allclose(nanops._ensure_numeric('1+1j'), 1 + 1j), + 'Failed for convertible complex string') + + def test_non_convertable_values(self): + self.assertRaises(TypeError, + lambda: nanops._ensure_numeric('foo')) + self.assertRaises(TypeError, + lambda: nanops._ensure_numeric({})) + self.assertRaises(TypeError, + lambda: nanops._ensure_numeric([])) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', + '-s'], exit=False) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py new file mode 100644 index 00000000..255da1af --- /dev/null +++ b/pandas/tests/test_panel.py @@ -0,0 +1,2345 @@ +# pylint: disable=W0612,E1101 + +from datetime import datetime +import operator +import nose + +import numpy as np + +from pandas import Series, DataFrame, Index, isnull, notnull, pivot, MultiIndex +from pandas.core.datetools import bday +from pandas.core.frame import group_agg +from pandas.core.panel import Panel +from pandas.core.series import remove_na +import pandas.core.common as com +from pandas import compat +from pandas.compat import range, lrange, StringIO, cPickle, OrderedDict + +from pandas.util.testing import (assert_panel_equal, + assert_frame_equal, + assert_series_equal, + assert_almost_equal, + ensure_clean, + assertRaisesRegexp, + makeCustomDataframe as mkdf, + makeMixedDataFrame + ) +import pandas.core.panel as panelm +import pandas.util.testing as tm + + +class PanelTests(object): + panel = None + + def test_pickle(self): + pickled = cPickle.dumps(self.panel) + unpickled = cPickle.loads(pickled) + assert_frame_equal(unpickled['ItemA'], self.panel['ItemA']) + + def test_cumsum(self): + cumsum = self.panel.cumsum() + 
assert_frame_equal(cumsum['ItemA'], self.panel['ItemA'].cumsum()) + + def not_hashable(self): + c_empty = Panel() + c = Panel(Panel([[[1]]])) + self.assertRaises(TypeError, hash, c_empty) + self.assertRaises(TypeError, hash, c) + + +class SafeForLongAndSparse(object): + _multiprocess_can_split_ = True + + def test_repr(self): + foo = repr(self.panel) + + def test_copy_names(self): + for attr in ('major_axis', 'minor_axis'): + getattr(self.panel, attr).name = None + cp = self.panel.copy() + getattr(cp, attr).name = 'foo' + self.assertIsNone(getattr(self.panel, attr).name) + + def test_iter(self): + tm.equalContents(list(self.panel), self.panel.items) + + def test_count(self): + f = lambda s: notnull(s).sum() + self._check_stat_op('count', f, obj=self.panel, has_skipna=False) + + def test_sum(self): + self._check_stat_op('sum', np.sum) + + def test_mean(self): + self._check_stat_op('mean', np.mean) + + def test_prod(self): + self._check_stat_op('prod', np.prod) + + def test_median(self): + def wrapper(x): + if isnull(x).any(): + return np.nan + return np.median(x) + + self._check_stat_op('median', wrapper) + + def test_min(self): + self._check_stat_op('min', np.min) + + def test_max(self): + self._check_stat_op('max', np.max) + + def test_skew(self): + try: + from scipy.stats import skew + except ImportError: + raise nose.SkipTest("no scipy.stats.skew") + + def this_skew(x): + if len(x) < 3: + return np.nan + return skew(x, bias=False) + self._check_stat_op('skew', this_skew) + + # def test_mad(self): + # f = lambda x: np.abs(x - x.mean()).mean() + # self._check_stat_op('mad', f) + + def test_var(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.var(x, ddof=1) + self._check_stat_op('var', alt) + + def test_std(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.std(x, ddof=1) + self._check_stat_op('std', alt) + + def test_sem(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + + # def test_skew(self): + # from scipy.stats import skew + + # def alt(x): + # if len(x) < 3: + # return np.nan + # return skew(x, bias=False) + + # self._check_stat_op('skew', alt) + + def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): + if obj is None: + obj = self.panel + + # # set some NAs + # obj.ix[5:10] = np.nan + # obj.ix[15:20, -2:] = np.nan + + f = getattr(obj, name) + + if has_skipna: + def skipna_wrapper(x): + nona = remove_na(x) + if len(nona) == 0: + return np.nan + return alternative(nona) + + def wrapper(x): + return alternative(np.asarray(x)) + + for i in range(obj.ndim): + result = f(axis=i, skipna=False) + assert_frame_equal(result, obj.apply(wrapper, axis=i)) + else: + skipna_wrapper = alternative + wrapper = alternative + + for i in range(obj.ndim): + result = f(axis=i) + assert_frame_equal(result, obj.apply(skipna_wrapper, axis=i)) + + self.assertRaises(Exception, f, axis=obj.ndim) + + +class SafeForSparse(object): + _multiprocess_can_split_ = True + + @classmethod + def assert_panel_equal(cls, x, y): + assert_panel_equal(x, y) + + def test_get_axis(self): + assert(self.panel._get_axis(0) is self.panel.items) + assert(self.panel._get_axis(1) is self.panel.major_axis) + assert(self.panel._get_axis(2) is self.panel.minor_axis) + + def test_set_axis(self): + new_items = Index(np.arange(len(self.panel.items))) + new_major = Index(np.arange(len(self.panel.major_axis))) + new_minor = Index(np.arange(len(self.panel.minor_axis))) + + # ensure propagate to potentially 
prior-cached items too + item = self.panel['ItemA'] + self.panel.items = new_items + + if hasattr(self.panel, '_item_cache'): + self.assertNotIn('ItemA', self.panel._item_cache) + self.assertIs(self.panel.items, new_items) + + item = self.panel[0] + self.panel.major_axis = new_major + self.assertIs(self.panel[0].index, new_major) + self.assertIs(self.panel.major_axis, new_major) + + item = self.panel[0] + self.panel.minor_axis = new_minor + self.assertIs(self.panel[0].columns, new_minor) + self.assertIs(self.panel.minor_axis, new_minor) + + def test_get_axis_number(self): + self.assertEqual(self.panel._get_axis_number('items'), 0) + self.assertEqual(self.panel._get_axis_number('major'), 1) + self.assertEqual(self.panel._get_axis_number('minor'), 2) + + def test_get_axis_name(self): + self.assertEqual(self.panel._get_axis_name(0), 'items') + self.assertEqual(self.panel._get_axis_name(1), 'major_axis') + self.assertEqual(self.panel._get_axis_name(2), 'minor_axis') + + def test_get_plane_axes(self): + # what to do here? + + index, columns = self.panel._get_plane_axes('items') + index, columns = self.panel._get_plane_axes('major_axis') + index, columns = self.panel._get_plane_axes('minor_axis') + index, columns = self.panel._get_plane_axes(0) + + def test_truncate(self): + dates = self.panel.major_axis + start, end = dates[1], dates[5] + + trunced = self.panel.truncate(start, end, axis='major') + expected = self.panel['ItemA'].truncate(start, end) + + assert_frame_equal(trunced['ItemA'], expected) + + trunced = self.panel.truncate(before=start, axis='major') + expected = self.panel['ItemA'].truncate(before=start) + + assert_frame_equal(trunced['ItemA'], expected) + + trunced = self.panel.truncate(after=end, axis='major') + expected = self.panel['ItemA'].truncate(after=end) + + assert_frame_equal(trunced['ItemA'], expected) + + # XXX test other axes + + def test_arith(self): + self._test_op(self.panel, operator.add) + self._test_op(self.panel, operator.sub) + self._test_op(self.panel, operator.mul) + self._test_op(self.panel, operator.truediv) + self._test_op(self.panel, operator.floordiv) + self._test_op(self.panel, operator.pow) + + self._test_op(self.panel, lambda x, y: y + x) + self._test_op(self.panel, lambda x, y: y - x) + self._test_op(self.panel, lambda x, y: y * x) + self._test_op(self.panel, lambda x, y: y / x) + self._test_op(self.panel, lambda x, y: y ** x) + + self._test_op(self.panel, lambda x, y: x + y) # panel + 1 + self._test_op(self.panel, lambda x, y: x - y) # panel - 1 + self._test_op(self.panel, lambda x, y: x * y) # panel * 1 + self._test_op(self.panel, lambda x, y: x / y) # panel / 1 + self._test_op(self.panel, lambda x, y: x ** y) # panel ** 1 + + self.assertRaises(Exception, self.panel.__add__, self.panel['ItemA']) + + @staticmethod + def _test_op(panel, op): + result = op(panel, 1) + assert_frame_equal(result['ItemA'], op(panel['ItemA'], 1)) + + def test_keys(self): + tm.equalContents(list(self.panel.keys()), self.panel.items) + + def test_iteritems(self): + # Test panel.iteritems(), aka panel.iteritems() + # just test that it works + for k, v in compat.iteritems(self.panel): + pass + + self.assertEqual(len(list(compat.iteritems(self.panel))), + len(self.panel.items)) + + def test_combineFrame(self): + def check_op(op, name): + # items + df = self.panel['ItemA'] + + func = getattr(self.panel, name) + + result = func(df, axis='items') + + assert_frame_equal(result['ItemB'], op(self.panel['ItemB'], df)) + + # major + xs = self.panel.major_xs(self.panel.major_axis[0]) + 
result = func(xs, axis='major') + + idx = self.panel.major_axis[1] + + assert_frame_equal(result.major_xs(idx), + op(self.panel.major_xs(idx), xs)) + + # minor + xs = self.panel.minor_xs(self.panel.minor_axis[0]) + result = func(xs, axis='minor') + + idx = self.panel.minor_axis[1] + + assert_frame_equal(result.minor_xs(idx), + op(self.panel.minor_xs(idx), xs)) + from pandas import SparsePanel + ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + if not compat.PY3: + ops.append('div') + # pow, mod not supported for SparsePanel as flex ops (for now) + if not isinstance(self.panel, SparsePanel): + ops.extend(['pow', 'mod']) + else: + idx = self.panel.minor_axis[1] + with assertRaisesRegexp(ValueError, "Simple arithmetic.*scalar"): + self.panel.pow(self.panel.minor_xs(idx), axis='minor') + with assertRaisesRegexp(ValueError, "Simple arithmetic.*scalar"): + self.panel.mod(self.panel.minor_xs(idx), axis='minor') + + for op in ops: + try: + check_op(getattr(operator, op), op) + except: + com.pprint_thing("Failing operation: %r" % op) + raise + if compat.PY3: + try: + check_op(operator.truediv, 'div') + except: + com.pprint_thing("Failing operation: %r" % name) + raise + + def test_combinePanel(self): + result = self.panel.add(self.panel) + self.assert_panel_equal(result, self.panel * 2) + + def test_neg(self): + self.assert_panel_equal(-self.panel, self.panel * -1) + + def test_select(self): + p = self.panel + + # select items + result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') + expected = p.reindex(items=['ItemA', 'ItemC']) + self.assert_panel_equal(result, expected) + + # select major_axis + result = p.select(lambda x: x >= datetime(2000, 1, 15), axis='major') + new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] + expected = p.reindex(major=new_major) + self.assert_panel_equal(result, expected) + + # select minor_axis + result = p.select(lambda x: x in ('D', 'A'), axis=2) + expected = p.reindex(minor=['A', 'D']) + self.assert_panel_equal(result, expected) + + # corner case, empty thing + result = p.select(lambda x: x in ('foo',), axis='items') + self.assert_panel_equal(result, p.reindex(items=[])) + + def test_get_value(self): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + result = self.panel.get_value(item, mjr, mnr) + expected = self.panel[item][mnr][mjr] + assert_almost_equal(result, expected) + + def test_abs(self): + result = self.panel.abs() + result2 = abs(self.panel) + expected = np.abs(self.panel) + self.assert_panel_equal(result, expected) + self.assert_panel_equal(result2, expected) + + df = self.panel['ItemA'] + result = df.abs() + result2 = abs(df) + expected = np.abs(df) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + s = df['A'] + result = s.abs() + result2 = abs(s) + expected = np.abs(s) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + +class CheckIndexing(object): + + _multiprocess_can_split_ = True + + def test_getitem(self): + self.assertRaises(Exception, self.panel.__getitem__, 'ItemQ') + + def test_delitem_and_pop(self): + expected = self.panel['ItemA'] + result = self.panel.pop('ItemA') + assert_frame_equal(expected, result) + self.assertNotIn('ItemA', self.panel.items) + + del self.panel['ItemB'] + self.assertNotIn('ItemB', self.panel.items) + self.assertRaises(Exception, self.panel.__delitem__, 'ItemB') + + values = np.empty((3, 3, 3)) + values[0] = 0 + values[1] = 1 + values[2] = 2 + + panel = 
Panel(values, lrange(3), lrange(3), lrange(3)) + + # did we delete the right row? + + panelc = panel.copy() + del panelc[0] + assert_frame_equal(panelc[1], panel[1]) + assert_frame_equal(panelc[2], panel[2]) + + panelc = panel.copy() + del panelc[1] + assert_frame_equal(panelc[0], panel[0]) + assert_frame_equal(panelc[2], panel[2]) + + panelc = panel.copy() + del panelc[2] + assert_frame_equal(panelc[1], panel[1]) + assert_frame_equal(panelc[0], panel[0]) + + def test_setitem(self): + # LongPanel with one item + lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() + with tm.assertRaises(ValueError): + self.panel['ItemE'] = lp + + # DataFrame + df = self.panel['ItemA'][2:].filter(items=['A', 'B']) + self.panel['ItemF'] = df + self.panel['ItemE'] = df + + df2 = self.panel['ItemF'] + + assert_frame_equal(df, df2.reindex(index=df.index, + columns=df.columns)) + + # scalar + self.panel['ItemG'] = 1 + self.panel['ItemE'] = True + self.assertEqual(self.panel['ItemG'].values.dtype, np.int64) + self.assertEqual(self.panel['ItemE'].values.dtype, np.bool_) + + # object dtype + self.panel['ItemQ'] = 'foo' + self.assertEqual(self.panel['ItemQ'].values.dtype, np.object_) + + # boolean dtype + self.panel['ItemP'] = self.panel['ItemA'] > 0 + self.assertEqual(self.panel['ItemP'].values.dtype, np.bool_) + + self.assertRaises(TypeError, self.panel.__setitem__, 'foo', + self.panel.ix[['ItemP']]) + + # bad shape + p = Panel(np.random.randn(4, 3, 2)) + with tm.assertRaisesRegexp(ValueError, + "shape of value must be \(3, 2\), " + "shape of given object was \(4, 2\)"): + p[0] = np.random.randn(4, 2) + + def test_setitem_ndarray(self): + from pandas import date_range, datetools + + timeidx = date_range(start=datetime(2009, 1, 1), + end=datetime(2009, 12, 31), + freq=datetools.MonthEnd()) + lons_coarse = np.linspace(-177.5, 177.5, 72) + lats_coarse = np.linspace(-87.5, 87.5, 36) + P = Panel(items=timeidx, major_axis=lons_coarse, + minor_axis=lats_coarse) + data = np.random.randn(72 * 36).reshape((72, 36)) + key = datetime(2009, 2, 28) + P[key] = data + + assert_almost_equal(P[key].values, data) + + def test_major_xs(self): + ref = self.panel['ItemA'] + + idx = self.panel.major_axis[5] + xs = self.panel.major_xs(idx) + + assert_series_equal(xs['ItemA'], ref.xs(idx)) + + # not contained + idx = self.panel.major_axis[0] - bday + self.assertRaises(Exception, self.panel.major_xs, idx) + + def test_major_xs_mixed(self): + self.panel['ItemD'] = 'foo' + xs = self.panel.major_xs(self.panel.major_axis[0]) + self.assertEqual(xs['ItemA'].dtype, np.float64) + self.assertEqual(xs['ItemD'].dtype, np.object_) + + def test_minor_xs(self): + ref = self.panel['ItemA'] + + idx = self.panel.minor_axis[1] + xs = self.panel.minor_xs(idx) + + assert_series_equal(xs['ItemA'], ref[idx]) + + # not contained + self.assertRaises(Exception, self.panel.minor_xs, 'E') + + def test_minor_xs_mixed(self): + self.panel['ItemD'] = 'foo' + + xs = self.panel.minor_xs('D') + self.assertEqual(xs['ItemA'].dtype, np.float64) + self.assertEqual(xs['ItemD'].dtype, np.object_) + + def test_xs(self): + itemA = self.panel.xs('ItemA', axis=0) + expected = self.panel['ItemA'] + assert_frame_equal(itemA, expected) + + # get a view by default + itemA_view = self.panel.xs('ItemA', axis=0) + itemA_view.values[:] = np.nan + self.assertTrue(np.isnan(self.panel['ItemA'].values).all()) + + # mixed-type yields a copy + self.panel['strings'] = 'foo' + result = self.panel.xs('D', axis=2) + self.assertIsNotNone(result.is_copy) + + def test_getitem_fancy_labels(self): + p = 
self.panel + + items = p.items[[1, 0]] + dates = p.major_axis[::2] + cols = ['D', 'C', 'F'] + + # all 3 specified + assert_panel_equal(p.ix[items, dates, cols], + p.reindex(items=items, major=dates, minor=cols)) + + # 2 specified + assert_panel_equal(p.ix[:, dates, cols], + p.reindex(major=dates, minor=cols)) + + assert_panel_equal(p.ix[items, :, cols], + p.reindex(items=items, minor=cols)) + + assert_panel_equal(p.ix[items, dates, :], + p.reindex(items=items, major=dates)) + + # only 1 + assert_panel_equal(p.ix[items, :, :], + p.reindex(items=items)) + + assert_panel_equal(p.ix[:, dates, :], + p.reindex(major=dates)) + + assert_panel_equal(p.ix[:, :, cols], + p.reindex(minor=cols)) + + def test_getitem_fancy_slice(self): + pass + + def test_getitem_fancy_ints(self): + p = self.panel + + # #1603 + result = p.ix[:, -1, :] + expected = p.ix[:, p.major_axis[-1], :] + assert_frame_equal(result, expected) + + def test_getitem_fancy_xs(self): + p = self.panel + item = 'ItemB' + + date = p.major_axis[5] + col = 'C' + + # get DataFrame + # item + assert_frame_equal(p.ix[item], p[item]) + assert_frame_equal(p.ix[item, :], p[item]) + assert_frame_equal(p.ix[item, :, :], p[item]) + + # major axis, axis=1 + assert_frame_equal(p.ix[:, date], p.major_xs(date)) + assert_frame_equal(p.ix[:, date, :], p.major_xs(date)) + + # minor axis, axis=2 + assert_frame_equal(p.ix[:, :, 'C'], p.minor_xs('C')) + + # get Series + assert_series_equal(p.ix[item, date], p[item].ix[date]) + assert_series_equal(p.ix[item, date, :], p[item].ix[date]) + assert_series_equal(p.ix[item, :, col], p[item][col]) + assert_series_equal(p.ix[:, date, col], p.major_xs(date).ix[col]) + + def test_getitem_fancy_xs_check_view(self): + item = 'ItemB' + date = self.panel.major_axis[5] + col = 'C' + + # make sure it's always a view + NS = slice(None, None) + + # DataFrames + comp = assert_frame_equal + self._check_view(item, comp) + self._check_view((item, NS), comp) + self._check_view((item, NS, NS), comp) + self._check_view((NS, date), comp) + self._check_view((NS, date, NS), comp) + self._check_view((NS, NS, 'C'), comp) + + # Series + comp = assert_series_equal + self._check_view((item, date), comp) + self._check_view((item, date, NS), comp) + self._check_view((item, NS, 'C'), comp) + self._check_view((NS, date, 'C'), comp) + + def test_ix_setitem_slice_dataframe(self): + a = Panel(items=[1, 2, 3], major_axis=[11, 22, 33], + minor_axis=[111, 222, 333]) + b = DataFrame(np.random.randn(2, 3), index=[111, 333], + columns=[1, 2, 3]) + + a.ix[:, 22, [111, 333]] = b + + assert_frame_equal(a.ix[:, 22, [111, 333]], b) + + def test_ix_align(self): + from pandas import Series + b = Series(np.random.randn(10)) + b.sort() + df_orig = Panel(np.random.randn(3, 10, 2)) + df = df_orig.copy() + + df.ix[0, :, 0] = b + assert_series_equal(df.ix[0, :, 0].reindex(b.index), b) + + df = df_orig.swapaxes(0, 1) + df.ix[:, 0, 0] = b + assert_series_equal(df.ix[:, 0, 0].reindex(b.index), b) + + df = df_orig.swapaxes(1, 2) + df.ix[0, 0, :] = b + assert_series_equal(df.ix[0, 0, :].reindex(b.index), b) + + def test_ix_frame_align(self): + from pandas import DataFrame + df = DataFrame(np.random.randn(2, 10)) + df.sort_index(inplace=True) + p_orig = Panel(np.random.randn(3, 10, 2)) + + p = p_orig.copy() + p.ix[0, :, :] = df + out = p.ix[0, :, :].T.reindex(df.index, columns=df.columns) + assert_frame_equal(out, df) + + p = p_orig.copy() + p.ix[0] = df + out = p.ix[0].T.reindex(df.index, columns=df.columns) + assert_frame_equal(out, df) + + p = p_orig.copy() + p.ix[0, [0, 
1, 3, 5], -2:] = df + out = p.ix[0, [0, 1, 3, 5], -2:] + assert_frame_equal(out, df.T.reindex([0, 1, 3, 5], p.minor_axis[-2:])) + + # GH3830, panel assignent by values/frame + for dtype in ['float64','int64']: + + panel = Panel(np.arange(40).reshape((2,4,5)), items=['a1','a2'], dtype=dtype) + df1 = panel.iloc[0] + df2 = panel.iloc[1] + + tm.assert_frame_equal(panel.loc['a1'], df1) + tm.assert_frame_equal(panel.loc['a2'], df2) + + # Assignment by Value Passes for 'a2' + panel.loc['a2'] = df1.values + tm.assert_frame_equal(panel.loc['a1'], df1) + tm.assert_frame_equal(panel.loc['a2'], df1) + + # Assignment by DataFrame Ok w/o loc 'a2' + panel['a2'] = df2 + tm.assert_frame_equal(panel.loc['a1'], df1) + tm.assert_frame_equal(panel.loc['a2'], df2) + + # Assignment by DataFrame Fails for 'a2' + panel.loc['a2'] = df2 + tm.assert_frame_equal(panel.loc['a1'], df1) + tm.assert_frame_equal(panel.loc['a2'], df2) + + def _check_view(self, indexer, comp): + cp = self.panel.copy() + obj = cp.ix[indexer] + obj.values[:] = 0 + self.assertTrue((obj.values == 0).all()) + comp(cp.ix[indexer].reindex_like(obj), obj) + + def test_logical_with_nas(self): + d = Panel({'ItemA': {'a': [np.nan, False]}, 'ItemB': { + 'a': [True, True]}}) + + result = d['ItemA'] | d['ItemB'] + expected = DataFrame({'a': [np.nan, True]}) + assert_frame_equal(result, expected) + + # this is autodowncasted here + result = d['ItemA'].fillna(False) | d['ItemB'] + expected = DataFrame({'a': [True, True]}) + assert_frame_equal(result, expected) + + def test_neg(self): + # what to do? + assert_panel_equal(-self.panel, -1 * self.panel) + + def test_invert(self): + assert_panel_equal(-(self.panel < 0), ~(self.panel < 0)) + + def test_comparisons(self): + p1 = tm.makePanel() + p2 = tm.makePanel() + + tp = p1.reindex(items=p1.items + ['foo']) + df = p1[p1.items[0]] + + def test_comp(func): + + # versus same index + result = func(p1, p2) + self.assert_numpy_array_equal(result.values, + func(p1.values, p2.values)) + + # versus non-indexed same objs + self.assertRaises(Exception, func, p1, tp) + + # versus different objs + self.assertRaises(Exception, func, p1, df) + + # versus scalar + result3 = func(self.panel, 0) + self.assert_numpy_array_equal(result3.values, + func(self.panel.values, 0)) + + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) + + def test_get_value(self): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + result = self.panel.get_value(item, mjr, mnr) + expected = self.panel[item][mnr][mjr] + assert_almost_equal(result, expected) + with tm.assertRaisesRegexp(TypeError, + "There must be an argument for each axis"): + self.panel.get_value('a') + + def test_set_value(self): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + self.panel.set_value(item, mjr, mnr, 1.) + assert_almost_equal(self.panel[item][mnr][mjr], 1.) 
+ + # resize + res = self.panel.set_value('ItemE', 'foo', 'bar', 1.5) + tm.assert_isinstance(res, Panel) + self.assertIsNot(res, self.panel) + self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) + + res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) + self.assertTrue(com.is_float_dtype(res3['ItemE'].values)) + with tm.assertRaisesRegexp(TypeError, + "There must be an argument for each axis" + " plus the value provided"): + self.panel.set_value('a') + +_panel = tm.makePanel() +tm.add_nans(_panel) + + +class TestPanel(tm.TestCase, PanelTests, CheckIndexing, + SafeForLongAndSparse, + SafeForSparse): + _multiprocess_can_split_ = True + + @classmethod + def assert_panel_equal(cls, x, y): + assert_panel_equal(x, y) + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + self.panel = _panel.copy() + self.panel.major_axis.name = None + self.panel.minor_axis.name = None + self.panel.items.name = None + + def test_panel_warnings(self): + with tm.assert_produces_warning(FutureWarning): + shifted1 = self.panel.shift(lags=1) + + with tm.assert_produces_warning(False): + shifted2 = self.panel.shift(periods=1) + + tm.assert_panel_equal(shifted1, shifted2) + + with tm.assert_produces_warning(False): + shifted3 = self.panel.shift() + + tm.assert_panel_equal(shifted1, shifted3) + + def test_constructor(self): + # with BlockManager + wp = Panel(self.panel._data) + self.assertIs(wp._data, self.panel._data) + + wp = Panel(self.panel._data, copy=True) + self.assertIsNot(wp._data, self.panel._data) + assert_panel_equal(wp, self.panel) + + # strings handled prop + wp = Panel([[['foo', 'foo', 'foo', ], + ['foo', 'foo', 'foo']]]) + self.assertEqual(wp.values.dtype, np.object_) + + vals = self.panel.values + + # no copy + wp = Panel(vals) + self.assertIs(wp.values, vals) + + # copy + wp = Panel(vals, copy=True) + self.assertIsNot(wp.values, vals) + + def test_constructor_cast(self): + zero_filled = self.panel.fillna(0) + + casted = Panel(zero_filled._data, dtype=int) + casted2 = Panel(zero_filled.values, dtype=int) + + exp_values = zero_filled.values.astype(int) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) + + casted = Panel(zero_filled._data, dtype=np.int32) + casted2 = Panel(zero_filled.values, dtype=np.int32) + + exp_values = zero_filled.values.astype(np.int32) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) + + # can't cast + data = [[['foo', 'bar', 'baz']]] + self.assertRaises(ValueError, Panel, data, dtype=float) + + def test_constructor_empty_panel(self): + empty = Panel() + self.assertEqual(len(empty.items), 0) + self.assertEqual(len(empty.major_axis), 0) + self.assertEqual(len(empty.minor_axis), 0) + + def test_constructor_observe_dtype(self): + # GH #411 + panel = Panel(items=lrange(3), major_axis=lrange(3), + minor_axis=lrange(3), dtype='O') + self.assertEqual(panel.values.dtype, np.object_) + + def test_constructor_dtypes(self): + # GH #797 + + def _check_dtype(panel, dtype): + for i in panel.items: + self.assertEqual(panel[i].values.dtype.name, dtype) + + # only nan holding types allowed here + for dtype in ['float64','float32','object']: + panel = Panel(items=lrange(2),major_axis=lrange(10),minor_axis=lrange(5),dtype=dtype) + _check_dtype(panel,dtype) + + for dtype in ['float64','float32','int64','int32','object']: + panel = 
Panel(np.array(np.random.randn(2,10,5),dtype=dtype),items=lrange(2),major_axis=lrange(10),minor_axis=lrange(5),dtype=dtype) + _check_dtype(panel,dtype) + + for dtype in ['float64','float32','int64','int32','object']: + panel = Panel(np.array(np.random.randn(2,10,5),dtype='O'),items=lrange(2),major_axis=lrange(10),minor_axis=lrange(5),dtype=dtype) + _check_dtype(panel,dtype) + + for dtype in ['float64','float32','int64','int32','object']: + panel = Panel(np.random.randn(2,10,5),items=lrange(2),major_axis=lrange(10),minor_axis=lrange(5),dtype=dtype) + _check_dtype(panel,dtype) + + def test_constructor_fails_with_not_3d_input(self): + with tm.assertRaisesRegexp(ValueError, + "The number of dimensions required is 3"): + Panel(np.random.randn(10, 2)) + + def test_consolidate(self): + self.assertTrue(self.panel._data.is_consolidated()) + + self.panel['foo'] = 1. + self.assertFalse(self.panel._data.is_consolidated()) + + panel = self.panel.consolidate() + self.assertTrue(panel._data.is_consolidated()) + + def test_ctor_dict(self): + itema = self.panel['ItemA'] + itemb = self.panel['ItemB'] + + d = {'A': itema, 'B': itemb[5:]} + d2 = {'A': itema._series, 'B': itemb[5:]._series} + d3 = {'A': None, + 'B': DataFrame(itemb[5:]._series), + 'C': DataFrame(itema._series)} + + wp = Panel.from_dict(d) + wp2 = Panel.from_dict(d2) # nested Dict + wp3 = Panel.from_dict(d3) + self.assertTrue(wp.major_axis.equals(self.panel.major_axis)) + assert_panel_equal(wp, wp2) + + # intersect + wp = Panel.from_dict(d, intersect=True) + self.assertTrue(wp.major_axis.equals(itemb.index[5:])) + + # use constructor + assert_panel_equal(Panel(d), Panel.from_dict(d)) + assert_panel_equal(Panel(d2), Panel.from_dict(d2)) + assert_panel_equal(Panel(d3), Panel.from_dict(d3)) + + # a pathological case + d4 = {'A': None, 'B': None} + wp4 = Panel.from_dict(d4) + assert_panel_equal(Panel(d4), Panel(items=['A', 'B'])) + + # cast + dcasted = dict((k, v.reindex(wp.major_axis).fillna(0)) + for k, v in compat.iteritems(d)) + result = Panel(dcasted, dtype=int) + expected = Panel(dict((k, v.astype(int)) + for k, v in compat.iteritems(dcasted))) + assert_panel_equal(result, expected) + + result = Panel(dcasted, dtype=np.int32) + expected = Panel(dict((k, v.astype(np.int32)) + for k, v in compat.iteritems(dcasted))) + assert_panel_equal(result, expected) + + def test_constructor_dict_mixed(self): + data = dict((k, v.values) for k, v in compat.iteritems(self.panel)) + result = Panel(data) + exp_major = Index(np.arange(len(self.panel.major_axis))) + self.assertTrue(result.major_axis.equals(exp_major)) + + result = Panel(data, items=self.panel.items, + major_axis=self.panel.major_axis, + minor_axis=self.panel.minor_axis) + assert_panel_equal(result, self.panel) + + data['ItemC'] = self.panel['ItemC'] + result = Panel(data) + assert_panel_equal(result, self.panel) + + # corner, blow up + data['ItemB'] = data['ItemB'][:-1] + self.assertRaises(Exception, Panel, data) + + data['ItemB'] = self.panel['ItemB'].values[:, :-1] + self.assertRaises(Exception, Panel, data) + + def test_ctor_orderedDict(self): + keys = list(set(np.random.randint(0,5000,100)))[:50] # unique random int keys + d = OrderedDict([(k,mkdf(10,5)) for k in keys]) + p = Panel(d) + self.assertTrue(list(p.items) == keys) + + p = Panel.from_dict(d) + self.assertTrue(list(p.items) == keys) + + def test_constructor_resize(self): + data = self.panel._data + items = self.panel.items[:-1] + major = self.panel.major_axis[:-1] + minor = self.panel.minor_axis[:-1] + + result = Panel(data, 
items=items, major_axis=major, + minor_axis=minor) + expected = self.panel.reindex(items=items, major=major, minor=minor) + assert_panel_equal(result, expected) + + result = Panel(data, items=items, major_axis=major) + expected = self.panel.reindex(items=items, major=major) + assert_panel_equal(result, expected) + + result = Panel(data, items=items) + expected = self.panel.reindex(items=items) + assert_panel_equal(result, expected) + + result = Panel(data, minor_axis=minor) + expected = self.panel.reindex(minor=minor) + assert_panel_equal(result, expected) + + def test_from_dict_mixed_orient(self): + df = tm.makeDataFrame() + df['foo'] = 'bar' + + data = {'k1': df, + 'k2': df} + + panel = Panel.from_dict(data, orient='minor') + + self.assertEqual(panel['foo'].values.dtype, np.object_) + self.assertEqual(panel['A'].values.dtype, np.float64) + + def test_constructor_error_msgs(self): + + def testit(): + Panel(np.random.randn(3,4,5), lrange(4), lrange(5), lrange(5)) + assertRaisesRegexp(ValueError, "Shape of passed values is \(3, 4, 5\), indices imply \(4, 5, 5\)", testit) + + def testit(): + Panel(np.random.randn(3,4,5), lrange(5), lrange(4), lrange(5)) + assertRaisesRegexp(ValueError, "Shape of passed values is \(3, 4, 5\), indices imply \(5, 4, 5\)", testit) + + def testit(): + Panel(np.random.randn(3,4,5), lrange(5), lrange(5), lrange(4)) + assertRaisesRegexp(ValueError, "Shape of passed values is \(3, 4, 5\), indices imply \(5, 5, 4\)", testit) + + def test_conform(self): + df = self.panel['ItemA'][:-5].filter(items=['A', 'B']) + conformed = self.panel.conform(df) + + assert(conformed.index.equals(self.panel.major_axis)) + assert(conformed.columns.equals(self.panel.minor_axis)) + + def test_convert_objects(self): + + # GH 4937 + p = Panel(dict(A = dict(a = ['1','1.0']))) + expected = Panel(dict(A = dict(a = [1,1.0]))) + result = p.convert_objects(convert_numeric='force') + assert_panel_equal(result, expected) + + def test_dtypes(self): + + result = self.panel.dtypes + expected = Series(np.dtype('float64'),index=self.panel.items) + assert_series_equal(result, expected) + + def test_apply(self): + # GH1148 + + from pandas import Series,DataFrame + + # ufunc + applied = self.panel.apply(np.sqrt) + self.assertTrue(assert_almost_equal(applied.values, + np.sqrt(self.panel.values))) + + # ufunc same shape + result = self.panel.apply(lambda x: x*2, axis='items') + expected = self.panel*2 + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x*2, axis='major_axis') + expected = self.panel*2 + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x*2, axis='minor_axis') + expected = self.panel*2 + assert_panel_equal(result, expected) + + # reduction to DataFrame + result = self.panel.apply(lambda x: x.dtype, axis='items') + expected = DataFrame(np.dtype('float64'),index=self.panel.major_axis,columns=self.panel.minor_axis) + assert_frame_equal(result,expected) + result = self.panel.apply(lambda x: x.dtype, axis='major_axis') + expected = DataFrame(np.dtype('float64'),index=self.panel.minor_axis,columns=self.panel.items) + assert_frame_equal(result,expected) + result = self.panel.apply(lambda x: x.dtype, axis='minor_axis') + expected = DataFrame(np.dtype('float64'),index=self.panel.major_axis,columns=self.panel.items) + assert_frame_equal(result,expected) + + # reductions via other dims + expected = self.panel.sum(0) + result = self.panel.apply(lambda x: x.sum(), axis='items') + assert_frame_equal(result,expected) + expected = self.panel.sum(1) + result = 
self.panel.apply(lambda x: x.sum(), axis='major_axis') + assert_frame_equal(result,expected) + expected = self.panel.sum(2) + result = self.panel.apply(lambda x: x.sum(), axis='minor_axis') + assert_frame_equal(result,expected) + + # pass kwargs + result = self.panel.apply(lambda x, y: x.sum() + y, axis='items', y=5) + expected = self.panel.sum(0) + 5 + assert_frame_equal(result,expected) + + def test_apply_slabs(self): + + # same shape as original + result = self.panel.apply(lambda x: x*2, axis = ['items','major_axis']) + expected = (self.panel*2).transpose('minor_axis','major_axis','items') + assert_panel_equal(result,expected) + result = self.panel.apply(lambda x: x*2, axis = ['major_axis','items']) + assert_panel_equal(result,expected) + + result = self.panel.apply(lambda x: x*2, axis = ['items','minor_axis']) + expected = (self.panel*2).transpose('major_axis','minor_axis','items') + assert_panel_equal(result,expected) + result = self.panel.apply(lambda x: x*2, axis = ['minor_axis','items']) + assert_panel_equal(result,expected) + + result = self.panel.apply(lambda x: x*2, axis = ['major_axis','minor_axis']) + expected = self.panel*2 + assert_panel_equal(result,expected) + result = self.panel.apply(lambda x: x*2, axis = ['minor_axis','major_axis']) + assert_panel_equal(result,expected) + + # reductions + result = self.panel.apply(lambda x: x.sum(0), axis = ['items','major_axis']) + expected = self.panel.sum(1).T + assert_frame_equal(result,expected) + + result = self.panel.apply(lambda x: x.sum(1), axis = ['items','major_axis']) + expected = self.panel.sum(0) + assert_frame_equal(result,expected) + + # transforms + f = lambda x: ((x.T-x.mean(1))/x.std(1)).T + + # make sure that we don't trigger any warnings + with tm.assert_produces_warning(False): + result = self.panel.apply(f, axis = ['items','major_axis']) + expected = Panel(dict([ (ax,f(self.panel.loc[:,:,ax])) for ax in self.panel.minor_axis ])) + assert_panel_equal(result,expected) + + result = self.panel.apply(f, axis = ['major_axis','minor_axis']) + expected = Panel(dict([ (ax,f(self.panel.loc[ax])) for ax in self.panel.items ])) + assert_panel_equal(result,expected) + + result = self.panel.apply(f, axis = ['minor_axis','items']) + expected = Panel(dict([ (ax,f(self.panel.loc[:,ax])) for ax in self.panel.major_axis ])) + assert_panel_equal(result,expected) + + # with multi-indexes + # GH7469 + index = MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ('two', 'a'), ('two', 'b')]) + dfa = DataFrame(np.array(np.arange(12, dtype='int64')).reshape(4,3), columns=list("ABC"), index=index) + dfb = DataFrame(np.array(np.arange(10, 22, dtype='int64')).reshape(4,3), columns=list("ABC"), index=index) + p = Panel({'f':dfa, 'g':dfb}) + result = p.apply(lambda x: x.sum(), axis=0) + expected = p.sum(0) + assert_frame_equal(result,expected) + + def test_reindex(self): + ref = self.panel['ItemB'] + + # items + result = self.panel.reindex(items=['ItemA', 'ItemB']) + assert_frame_equal(result['ItemB'], ref) + + # major + new_major = list(self.panel.major_axis[:10]) + result = self.panel.reindex(major=new_major) + assert_frame_equal(result['ItemB'], ref.reindex(index=new_major)) + + # raise exception put both major and major_axis + self.assertRaises(Exception, self.panel.reindex, + major_axis=new_major, major=new_major) + + # minor + new_minor = list(self.panel.minor_axis[:2]) + result = self.panel.reindex(minor=new_minor) + assert_frame_equal(result['ItemB'], ref.reindex(columns=new_minor)) + + # this ok + result = self.panel.reindex() + 
assert_panel_equal(result,self.panel) + self.assertFalse(result is self.panel) + + # with filling + smaller_major = self.panel.major_axis[::5] + smaller = self.panel.reindex(major=smaller_major) + + larger = smaller.reindex(major=self.panel.major_axis, + method='pad') + + assert_frame_equal(larger.major_xs(self.panel.major_axis[1]), + smaller.major_xs(smaller_major[0])) + + # don't necessarily copy + result = self.panel.reindex(major=self.panel.major_axis, copy=False) + assert_panel_equal(result,self.panel) + self.assertTrue(result is self.panel) + + def test_reindex_multi(self): + + # with and without copy full reindexing + result = self.panel.reindex(items=self.panel.items, + major=self.panel.major_axis, + minor=self.panel.minor_axis, + copy = False) + + self.assertIs(result.items, self.panel.items) + self.assertIs(result.major_axis, self.panel.major_axis) + self.assertIs(result.minor_axis, self.panel.minor_axis) + + result = self.panel.reindex(items=self.panel.items, + major=self.panel.major_axis, + minor=self.panel.minor_axis, + copy = False) + assert_panel_equal(result,self.panel) + + # multi-axis indexing consistency + # GH 5900 + df = DataFrame(np.random.randn(4,3)) + p = Panel({ 'Item1' : df }) + expected = Panel({ 'Item1' : df }) + expected['Item2'] = np.nan + + items = ['Item1','Item2'] + major_axis = np.arange(4) + minor_axis = np.arange(3) + + results = [] + results.append(p.reindex(items=items, major_axis=major_axis, copy=True)) + results.append(p.reindex(items=items, major_axis=major_axis, copy=False)) + results.append(p.reindex(items=items, minor_axis=minor_axis, copy=True)) + results.append(p.reindex(items=items, minor_axis=minor_axis, copy=False)) + results.append(p.reindex(items=items, major_axis=major_axis, minor_axis=minor_axis, copy=True)) + results.append(p.reindex(items=items, major_axis=major_axis, minor_axis=minor_axis, copy=False)) + + for i, r in enumerate(results): + assert_panel_equal(expected,r) + + def test_reindex_like(self): + # reindex_like + smaller = self.panel.reindex(items=self.panel.items[:-1], + major=self.panel.major_axis[:-1], + minor=self.panel.minor_axis[:-1]) + smaller_like = self.panel.reindex_like(smaller) + assert_panel_equal(smaller, smaller_like) + + def test_take(self): + # axis == 0 + result = self.panel.take([2, 0, 1], axis=0) + expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) + assert_panel_equal(result, expected) + + # axis >= 1 + result = self.panel.take([3, 0, 1, 2], axis=2) + expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) + assert_panel_equal(result, expected) + + # neg indicies ok + expected = self.panel.reindex(minor=['D', 'D', 'B', 'C']) + result = self.panel.take([3, -1, 1, 2], axis=2) + assert_panel_equal(result, expected) + + self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + + def test_sort_index(self): + import random + + ritems = list(self.panel.items) + rmajor = list(self.panel.major_axis) + rminor = list(self.panel.minor_axis) + random.shuffle(ritems) + random.shuffle(rmajor) + random.shuffle(rminor) + + random_order = self.panel.reindex(items=ritems) + sorted_panel = random_order.sort_index(axis=0) + assert_panel_equal(sorted_panel, self.panel) + + # descending + random_order = self.panel.reindex(items=ritems) + sorted_panel = random_order.sort_index(axis=0, ascending=False) + assert_panel_equal(sorted_panel, + self.panel.reindex(items=self.panel.items[::-1])) + + random_order = self.panel.reindex(major=rmajor) + sorted_panel = random_order.sort_index(axis=1) + 
assert_panel_equal(sorted_panel, self.panel) + + random_order = self.panel.reindex(minor=rminor) + sorted_panel = random_order.sort_index(axis=2) + assert_panel_equal(sorted_panel, self.panel) + + def test_fillna(self): + filled = self.panel.fillna(0) + self.assertTrue(np.isfinite(filled.values).all()) + + filled = self.panel.fillna(method='backfill') + assert_frame_equal(filled['ItemA'], + self.panel['ItemA'].fillna(method='backfill')) + + panel = self.panel.copy() + panel['str'] = 'foo' + + filled = panel.fillna(method='backfill') + assert_frame_equal(filled['ItemA'], + panel['ItemA'].fillna(method='backfill')) + + empty = self.panel.reindex(items=[]) + filled = empty.fillna(0) + assert_panel_equal(filled, empty) + + self.assertRaises(ValueError, self.panel.fillna) + self.assertRaises(ValueError, self.panel.fillna, 5, method='ffill') + + self.assertRaises(TypeError, self.panel.fillna, [1, 2]) + self.assertRaises(TypeError, self.panel.fillna, (1, 2)) + + # limit not implemented when only value is specified + p = Panel(np.random.randn(3,4,5)) + p.iloc[0:2,0:2,0:2] = np.nan + self.assertRaises(NotImplementedError, lambda : p.fillna(999,limit=1)) + + def test_ffill_bfill(self): + assert_panel_equal(self.panel.ffill(), + self.panel.fillna(method='ffill')) + assert_panel_equal(self.panel.bfill(), + self.panel.fillna(method='bfill')) + + def test_truncate_fillna_bug(self): + # #1823 + result = self.panel.truncate(before=None, after=None, axis='items') + + # it works! + result.fillna(value=0.0) + + def test_swapaxes(self): + result = self.panel.swapaxes('items', 'minor') + self.assertIs(result.items, self.panel.minor_axis) + + result = self.panel.swapaxes('items', 'major') + self.assertIs(result.items, self.panel.major_axis) + + result = self.panel.swapaxes('major', 'minor') + self.assertIs(result.major_axis, self.panel.minor_axis) + + panel = self.panel.copy() + result = panel.swapaxes('major', 'minor') + panel.values[0, 0, 1] = np.nan + expected = panel.swapaxes('major', 'minor') + assert_panel_equal(result, expected) + + # this should also work + result = self.panel.swapaxes(0, 1) + self.assertIs(result.items, self.panel.major_axis) + + # this works, but return a copy + result = self.panel.swapaxes('items', 'items') + assert_panel_equal(self.panel,result) + self.assertNotEqual(id(self.panel), id(result)) + + def test_transpose(self): + result = self.panel.transpose('minor', 'major', 'items') + expected = self.panel.swapaxes('items', 'minor') + assert_panel_equal(result, expected) + + # test kwargs + result = self.panel.transpose(items='minor', major='major', + minor='items') + expected = self.panel.swapaxes('items', 'minor') + assert_panel_equal(result, expected) + + # text mixture of args + result = self.panel.transpose('minor', major='major', minor='items') + expected = self.panel.swapaxes('items', 'minor') + assert_panel_equal(result, expected) + + result = self.panel.transpose('minor', 'major', minor='items') + expected = self.panel.swapaxes('items', 'minor') + assert_panel_equal(result, expected) + + # duplicate axes + with tm.assertRaisesRegexp(TypeError, 'not enough/duplicate arguments'): + self.panel.transpose('minor', maj='major', minor='items') + + with tm.assertRaisesRegexp(ValueError, 'repeated axis in transpose'): + self.panel.transpose('minor', 'major', major='minor', minor='items') + + result = self.panel.transpose(2, 1, 0) + assert_panel_equal(result, expected) + + result = self.panel.transpose('minor', 'items', 'major') + expected = self.panel.swapaxes('items', 'minor') + 
expected = expected.swapaxes('major', 'minor') + assert_panel_equal(result, expected) + + result = self.panel.transpose(2, 0, 1) + assert_panel_equal(result, expected) + + self.assertRaises(ValueError, self.panel.transpose, 0, 0, 1) + + def test_transpose_copy(self): + panel = self.panel.copy() + result = panel.transpose(2, 0, 1, copy=True) + expected = panel.swapaxes('items', 'minor') + expected = expected.swapaxes('major', 'minor') + assert_panel_equal(result, expected) + + panel.values[0, 1, 1] = np.nan + self.assertTrue(notnull(result.values[1, 0, 1])) + + def test_to_frame(self): + # filtered + filtered = self.panel.to_frame() + expected = self.panel.to_frame().dropna(how='any') + assert_frame_equal(filtered, expected) + + # unfiltered + unfiltered = self.panel.to_frame(filter_observations=False) + assert_panel_equal(unfiltered.to_panel(), self.panel) + + # names + self.assertEqual(unfiltered.index.names, ('major', 'minor')) + + # unsorted, round trip + df = self.panel.to_frame(filter_observations=False) + unsorted = df.take(np.random.permutation(len(df))) + pan = unsorted.to_panel() + assert_panel_equal(pan, self.panel) + + # preserve original index names + df = DataFrame(np.random.randn(6, 2), + index=[['a', 'a', 'b', 'b', 'c', 'c'], + [0, 1, 0, 1, 0, 1]], + columns=['one', 'two']) + df.index.names = ['foo', 'bar'] + df.columns.name = 'baz' + + rdf = df.to_panel().to_frame() + self.assertEqual(rdf.index.names, df.index.names) + self.assertEqual(rdf.columns.names, df.columns.names) + + def test_to_frame_mixed(self): + panel = self.panel.fillna(0) + panel['str'] = 'foo' + panel['bool'] = panel['ItemA'] > 0 + + lp = panel.to_frame() + wp = lp.to_panel() + self.assertEqual(wp['bool'].values.dtype, np.bool_) + # Previously, this was mutating the underlying index and changing its name + assert_frame_equal(wp['bool'], panel['bool'], check_names=False) + + def test_to_frame_multi_major(self): + idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), + (2, 'two')]) + df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]], + columns=['A', 'B', 'C'], index=idx) + wp = Panel({'i1': df, 'i2': df}) + expected_idx = MultiIndex.from_tuples([(1, 'one', 'A'), (1, 'one', 'B'), + (1, 'one', 'C'), (1, 'two', 'A'), + (1, 'two', 'B'), (1, 'two', 'C'), + (2, 'one', 'A'), (2, 'one', 'B'), + (2, 'one', 'C'), (2, 'two', 'A'), + (2, 'two', 'B'), (2, 'two', 'C')], + names=[None, None, 'minor']) + expected = DataFrame({'i1': [1, 'a', 1, 2, 'b', 1, 3, 'c', 1, 4, 'd', 1], + 'i2': [1, 'a', 1, 2, 'b', 1, 3, 'c', 1, 4, 'd', 1]}, + index=expected_idx) + result = wp.to_frame() + assert_frame_equal(result, expected) + + wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. 
GH #5773 + result = wp.to_frame() + assert_frame_equal(result, expected[1:]) + + idx = MultiIndex.from_tuples([(1, 'two'), (1, 'one'), (2, 'one'), + (np.nan, 'two')]) + df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]], + columns=['A', 'B', 'C'], index=idx) + wp = Panel({'i1': df, 'i2': df}) + ex_idx = MultiIndex.from_tuples([(1, 'two', 'A'), (1, 'two', 'B'), (1, 'two', 'C'), + (1, 'one', 'A'), (1, 'one', 'B'), (1, 'one', 'C'), + (2, 'one', 'A'), (2, 'one', 'B'), (2, 'one', 'C'), + (np.nan, 'two', 'A'), (np.nan, 'two', 'B'), + (np.nan, 'two', 'C')], + names=[None, None, 'minor']) + expected.index = ex_idx + result = wp.to_frame() + assert_frame_equal(result, expected) + + def test_to_frame_multi_major_minor(self): + cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), + (2, 'two'), (3, 'three'), (4, 'four')]) + df = DataFrame([[1, 2, 11, 12], [3, 4, 13, 14], ['a', 'b', 'w', 'x'], + ['c', 'd', 'y', 'z'], [-1, -2, -3, -4], [-5, -6, -7, -8] + ], columns=cols, index=idx) + wp = Panel({'i1': df, 'i2': df}) + + exp_idx = MultiIndex.from_tuples([(1, 'one', 'C_A', 'C_1'), (1, 'one', 'C_A', 'C_2'), + (1, 'one', 'C_B', 'C_1'), (1, 'one', 'C_B', 'C_2'), + (1, 'two', 'C_A', 'C_1'), (1, 'two', 'C_A', 'C_2'), + (1, 'two', 'C_B', 'C_1'), (1, 'two', 'C_B', 'C_2'), + (2, 'one', 'C_A', 'C_1'), (2, 'one', 'C_A', 'C_2'), + (2, 'one', 'C_B', 'C_1'), (2, 'one', 'C_B', 'C_2'), + (2, 'two', 'C_A', 'C_1'), (2, 'two', 'C_A', 'C_2'), + (2, 'two', 'C_B', 'C_1'), (2, 'two', 'C_B', 'C_2'), + (3, 'three', 'C_A', 'C_1'), (3, 'three', 'C_A', 'C_2'), + (3, 'three', 'C_B', 'C_1'), (3, 'three', 'C_B', 'C_2'), + (4, 'four', 'C_A', 'C_1'), (4, 'four', 'C_A', 'C_2'), + (4, 'four', 'C_B', 'C_1'), (4, 'four', 'C_B', 'C_2')], + names=[None, None, None, None]) + exp_val = [[1, 1], [2, 2], [11, 11], [12, 12], [3, 3], [4, 4], [13, 13], + [14, 14], ['a', 'a'], ['b', 'b'], ['w', 'w'], ['x', 'x'], + ['c', 'c'], ['d', 'd'], ['y', 'y'], ['z', 'z'], [-1, -1], + [-2, -2], [-3, -3], [-4, -4], [-5, -5], [-6, -6], [-7, -7], + [-8, -8]] + result = wp.to_frame() + expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx) + assert_frame_equal(result, expected) + + def test_to_frame_multi_drop_level(self): + idx = MultiIndex.from_tuples([(1, 'one'), (2, 'one'), (2, 'two')]) + df = DataFrame({'A': [np.nan, 1, 2]}, index=idx) + wp = Panel({'i1': df, 'i2': df}) + result = wp.to_frame() + exp_idx = MultiIndex.from_tuples([(2, 'one', 'A'), (2, 'two', 'A')], + names=[None, None, 'minor']) + expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=exp_idx) + assert_frame_equal(result, expected) + + def test_to_panel_na_handling(self): + df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)), + index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]]) + + panel = df.to_panel() + self.assertTrue(isnull(panel[0].ix[1, [0, 1]]).all()) + + def test_to_panel_duplicates(self): + # #2441 + df = DataFrame({'a': [0, 0, 1], 'b': [1, 1, 1], 'c': [1, 2, 3]}) + idf = df.set_index(['a', 'b']) + assertRaisesRegexp(ValueError, 'non-uniquely indexed', idf.to_panel) + + def test_panel_dups(self): + + # GH 4960 + # duplicates in an index + + # items + data = np.random.randn(5, 100, 5) + no_dup_panel = Panel(data, items=list("ABCDE")) + panel = Panel(data, items=list("AACDE")) + + expected = no_dup_panel['A'] + result = panel.iloc[0] + assert_frame_equal(result, expected) + + expected = no_dup_panel['E'] + 
result = panel.loc['E'] + assert_frame_equal(result, expected) + + expected = no_dup_panel.loc[['A','B']] + expected.items = ['A','A'] + result = panel.loc['A'] + assert_panel_equal(result, expected) + + # major + data = np.random.randn(5, 5, 5) + no_dup_panel = Panel(data, major_axis=list("ABCDE")) + panel = Panel(data, major_axis=list("AACDE")) + + expected = no_dup_panel.loc[:,'A'] + result = panel.iloc[:,0] + assert_frame_equal(result, expected) + + expected = no_dup_panel.loc[:,'E'] + result = panel.loc[:,'E'] + assert_frame_equal(result, expected) + + expected = no_dup_panel.loc[:,['A','B']] + expected.major_axis = ['A','A'] + result = panel.loc[:,'A'] + assert_panel_equal(result, expected) + + # minor + data = np.random.randn(5, 100, 5) + no_dup_panel = Panel(data, minor_axis=list("ABCDE")) + panel = Panel(data, minor_axis=list("AACDE")) + + expected = no_dup_panel.loc[:,:,'A'] + result = panel.iloc[:,:,0] + assert_frame_equal(result, expected) + + expected = no_dup_panel.loc[:,:,'E'] + result = panel.loc[:,:,'E'] + assert_frame_equal(result, expected) + + expected = no_dup_panel.loc[:,:,['A','B']] + expected.minor_axis = ['A','A'] + result = panel.loc[:,:,'A'] + assert_panel_equal(result, expected) + + def test_filter(self): + pass + + def test_compound(self): + compounded = self.panel.compound() + + assert_series_equal(compounded['ItemA'], + (1 + self.panel['ItemA']).product(0) - 1) + + def test_shift(self): + # major + idx = self.panel.major_axis[0] + idx_lag = self.panel.major_axis[1] + + shifted = self.panel.shift(1) + + assert_frame_equal(self.panel.major_xs(idx), + shifted.major_xs(idx_lag)) + + # minor + idx = self.panel.minor_axis[0] + idx_lag = self.panel.minor_axis[1] + + shifted = self.panel.shift(1, axis='minor') + + assert_frame_equal(self.panel.minor_xs(idx), + shifted.minor_xs(idx_lag)) + + self.assertRaises(Exception, self.panel.shift, 1, axis='items') + + # negative numbers, #2164 + result = self.panel.shift(-1) + expected = Panel(dict((i, f.shift(-1)[:-1]) + for i, f in compat.iteritems(self.panel))) + assert_panel_equal(result, expected) + + # mixed dtypes #6959 + data = [('item '+ch, makeMixedDataFrame()) for ch in list('abcde')] + data = dict(data) + mixed_panel = Panel.from_dict(data, orient='minor') + shifted = mixed_panel.shift(1) + assert_series_equal(mixed_panel.dtypes, shifted.dtypes) + + def test_tshift(self): + # PeriodIndex + ps = tm.makePeriodPanel() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + assert_panel_equal(unshifted, ps) + + shifted2 = ps.tshift(freq='B') + assert_panel_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=bday) + assert_panel_equal(shifted, shifted3) + + assertRaisesRegexp(ValueError, 'does not match', ps.tshift, freq='M') + + # DatetimeIndex + panel = _panel + shifted = panel.tshift(1) + unshifted = shifted.tshift(-1) + + assert_panel_equal(panel, unshifted) + + shifted2 = panel.tshift(freq=panel.major_axis.freq) + assert_panel_equal(shifted, shifted2) + + inferred_ts = Panel(panel.values, + items=panel.items, + major_axis=Index(np.asarray(panel.major_axis)), + minor_axis=panel.minor_axis) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + assert_panel_equal(shifted, panel.tshift(1)) + assert_panel_equal(unshifted, inferred_ts) + + no_freq = panel.ix[:, [0, 5, 7], :] + self.assertRaises(ValueError, no_freq.tshift) + + def test_pct_change(self): + df1 = DataFrame({'c1': [1, 2, 5], 'c2': [3, 4, 6]}) + df2 = df1 + 1 + df3 = DataFrame({'c1': [3, 4, 7], 'c2': [5, 6, 8]}) + wp = Panel({'i1': df1, 
'i2': df2, 'i3': df3}) + # major, 1 + result = wp.pct_change() # axis='major' + expected = Panel({'i1': df1.pct_change(), + 'i2': df2.pct_change(), + 'i3': df3.pct_change()}) + assert_panel_equal(result, expected) + result = wp.pct_change(axis=1) + assert_panel_equal(result, expected) + # major, 2 + result = wp.pct_change(periods=2) + expected = Panel({'i1': df1.pct_change(2), + 'i2': df2.pct_change(2), + 'i3': df3.pct_change(2)}) + assert_panel_equal(result, expected) + # minor, 1 + result = wp.pct_change(axis='minor') + expected = Panel({'i1': df1.pct_change(axis=1), + 'i2': df2.pct_change(axis=1), + 'i3': df3.pct_change(axis=1)}) + assert_panel_equal(result, expected) + result = wp.pct_change(axis=2) + assert_panel_equal(result, expected) + # minor, 2 + result = wp.pct_change(periods=2, axis='minor') + expected = Panel({'i1': df1.pct_change(periods=2, axis=1), + 'i2': df2.pct_change(periods=2, axis=1), + 'i3': df3.pct_change(periods=2, axis=1)}) + assert_panel_equal(result, expected) + # items, 1 + result = wp.pct_change(axis='items') + expected = Panel({'i1': DataFrame({'c1': [np.nan, np.nan, np.nan], + 'c2': [np.nan, np.nan, np.nan]}), + 'i2': DataFrame({'c1': [1, 0.5, .2], + 'c2': [1./3, 0.25, 1./6]}), + 'i3': DataFrame({'c1': [.5, 1./3, 1./6], + 'c2': [.25, .2, 1./7]})}) + assert_panel_equal(result, expected) + result = wp.pct_change(axis=0) + assert_panel_equal(result, expected) + # items, 2 + result = wp.pct_change(periods=2, axis='items') + expected = Panel({'i1': DataFrame({'c1': [np.nan, np.nan, np.nan], + 'c2': [np.nan, np.nan, np.nan]}), + 'i2': DataFrame({'c1': [np.nan, np.nan, np.nan], + 'c2': [np.nan, np.nan, np.nan]}), + 'i3': DataFrame({'c1': [2, 1, .4], + 'c2': [2./3, .5, 1./3]})}) + assert_panel_equal(result, expected) + + def test_multiindex_get(self): + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)], + names=['first', 'second']) + wp = Panel(np.random.random((4, 5, 5)), + items=ind, + major_axis=np.arange(5), + minor_axis=np.arange(5)) + f1 = wp['a'] + f2 = wp.ix['a'] + assert_panel_equal(f1, f2) + + self.assertTrue((f1.items == [1, 2]).all()) + self.assertTrue((f2.items == [1, 2]).all()) + + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + names=['first', 'second']) + + def test_multiindex_blocks(self): + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + names=['first', 'second']) + wp = Panel(self.panel._data) + wp.items = ind + f1 = wp['a'] + self.assertTrue((f1.items == [1, 2]).all()) + + f1 = wp[('b', 1)] + self.assertTrue((f1.columns == ['A', 'B', 'C', 'D']).all()) + + def test_repr_empty(self): + empty = Panel() + repr(empty) + + def test_rename(self): + mapper = { + 'ItemA': 'foo', + 'ItemB': 'bar', + 'ItemC': 'baz' + } + + renamed = self.panel.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assertTrue(renamed.items.equals(exp)) + + renamed = self.panel.rename_axis(str.lower, axis=2) + exp = Index(['a', 'b', 'c', 'd']) + self.assertTrue(renamed.minor_axis.equals(exp)) + + # don't copy + renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) + renamed_nocopy['foo'] = 3. 
+ self.assertTrue((self.panel['ItemA'].values == 3).all()) + + def test_get_attr(self): + assert_frame_equal(self.panel['ItemA'], self.panel.ItemA) + + # specific cases from #3440 + self.panel['a'] = self.panel['ItemA'] + assert_frame_equal(self.panel['a'], self.panel.a) + self.panel['i'] = self.panel['ItemA'] + assert_frame_equal(self.panel['i'], self.panel.i) + + def test_group_agg(self): + values = np.ones((10, 2)) * np.arange(10).reshape((10, 1)) + bounds = np.arange(5) * 2 + f = lambda x: x.mean(axis=0) + + agged = group_agg(values, bounds, f) + + assert(agged[1][0] == 2.5) + assert(agged[2][0] == 4.5) + + # test a function that doesn't aggregate + f2 = lambda x: np.zeros((2, 2)) + self.assertRaises(Exception, group_agg, values, bounds, f2) + + def test_from_frame_level1_unsorted(self): + tuples = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), + ('AAPL', 1), ('MSFT', 1)] + midx = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.rand(5, 4), index=midx) + p = df.to_panel() + assert_frame_equal(p.minor_xs(2), df.xs(2, level=1).sort_index()) + + def test_to_excel(self): + import os + try: + import xlwt + import xlrd + import openpyxl + from pandas.io.excel import ExcelFile + except ImportError: + raise nose.SkipTest("need xlwt xlrd openpyxl") + + for ext in ['xls', 'xlsx']: + path = '__tmp__.' + ext + with ensure_clean(path) as path: + self.panel.to_excel(path) + try: + reader = ExcelFile(path) + except ImportError: + raise nose.SkipTest("need xlwt xlrd openpyxl") + + for item, df in compat.iteritems(self.panel): + recdf = reader.parse(str(item), index_col=0) + assert_frame_equal(df, recdf) + + def test_to_excel_xlsxwriter(self): + try: + import xlrd + import xlsxwriter + from pandas.io.excel import ExcelFile + except ImportError: + raise nose.SkipTest("Requires xlrd and xlsxwriter. 
Skipping test.") + + path = '__tmp__.xlsx' + with ensure_clean(path) as path: + self.panel.to_excel(path, engine='xlsxwriter') + try: + reader = ExcelFile(path) + except ImportError as e: + raise nose.SkipTest("cannot write excel file: %s" % e) + + for item, df in compat.iteritems(self.panel): + recdf = reader.parse(str(item), index_col=0) + assert_frame_equal(df, recdf) + + def test_dropna(self): + p = Panel(np.random.randn(4, 5, 6), major_axis=list('abcde')) + p.ix[:, ['b', 'd'], 0] = np.nan + + result = p.dropna(axis=1) + exp = p.ix[:, ['a', 'c', 'e'], :] + assert_panel_equal(result, exp) + inp = p.copy() + inp.dropna(axis=1, inplace=True) + assert_panel_equal(inp, exp) + + result = p.dropna(axis=1, how='all') + assert_panel_equal(result, p) + + p.ix[:, ['b', 'd'], :] = np.nan + result = p.dropna(axis=1, how='all') + exp = p.ix[:, ['a', 'c', 'e'], :] + assert_panel_equal(result, exp) + + p = Panel(np.random.randn(4, 5, 6), items=list('abcd')) + p.ix[['b'], :, 0] = np.nan + + result = p.dropna() + exp = p.ix[['a', 'c', 'd']] + assert_panel_equal(result, exp) + + result = p.dropna(how='all') + assert_panel_equal(result, p) + + p.ix['b'] = np.nan + result = p.dropna(how='all') + exp = p.ix[['a', 'c', 'd']] + assert_panel_equal(result, exp) + + def test_drop(self): + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + panel = Panel({"One": df, "Two": df}) + + def check_drop(drop_val, axis_number, aliases, expected): + try: + actual = panel.drop(drop_val, axis=axis_number) + assert_panel_equal(actual, expected) + for alias in aliases: + actual = panel.drop(drop_val, axis=alias) + assert_panel_equal(actual, expected) + except AssertionError: + com.pprint_thing("Failed with axis_number %d and aliases: %s" % + (axis_number, aliases)) + raise + # Items + expected = Panel({"One": df}) + check_drop('Two', 0, ['items'], expected) + + # Major + exp_df = DataFrame({"A": [2], "B": [4]}, index=[1]) + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop(0, 1, ['major_axis', 'major'], expected) + + exp_df = DataFrame({"A": [1], "B": [3]}, index=[0]) + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop([1], 1, ['major_axis', 'major'], expected) + + # Minor + exp_df = df[['B']] + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop(["A"], 2, ['minor_axis', 'minor'], expected) + + exp_df = df[['A']] + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop("B", 2, ['minor_axis', 'minor'], expected) + + def test_update(self): + pan = Panel([[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) + + other = Panel([[[3.6, 2., np.nan], + [np.nan, np.nan, 7]]], items=[1]) + + pan.update(other) + + expected = Panel([[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[3.6, 2., 3], + [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) + + assert_panel_equal(pan, expected) + + def test_update_from_dict(self): + pan = Panel({'one': DataFrame([[1.5, np.nan, 3], + [1.5, np.nan, 3], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]), + 'two': DataFrame([[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]])}) + + other = {'two': DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]])} + + pan.update(other) + + expected = Panel({'two': DataFrame([[3.6, 2., 3], + [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]), + 'one': DataFrame([[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, 
np.nan, 3.]])}) + + assert_panel_equal(pan, expected) + + def test_update_nooverwrite(self): + pan = Panel([[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) + + other = Panel([[[3.6, 2., np.nan], + [np.nan, np.nan, 7]]], items=[1]) + + pan.update(other, overwrite=False) + + expected = Panel([[[1.5, np.nan, 3], + [1.5, np.nan, 3], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, 2., 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) + + assert_panel_equal(pan, expected) + + def test_update_filtered(self): + pan = Panel([[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) + + other = Panel([[[3.6, 2., np.nan], + [np.nan, np.nan, 7]]], items=[1]) + + pan.update(other, filter_func=lambda x: x > 2) + + expected = Panel([[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3], + [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) + + assert_panel_equal(pan, expected) + + def test_update_raise(self): + pan = Panel([[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) + + np.testing.assert_raises(Exception, pan.update, *(pan,), + **{'raise_conflict': True}) + + +class TestLongPanel(tm.TestCase): + """ + LongPanel no longer exists, but... + """ + _multiprocess_can_split_ = True + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + panel = tm.makePanel() + tm.add_nans(panel) + + self.panel = panel.to_frame() + self.unfiltered_panel = panel.to_frame(filter_observations=False) + + def test_ops_differently_indexed(self): + # trying to set non-identically indexed panel + wp = self.panel.to_panel() + wp2 = wp.reindex(major=wp.major_axis[:-1]) + lp2 = wp2.to_frame() + + result = self.panel + lp2 + assert_frame_equal(result.reindex(lp2.index), lp2 * 2) + + # careful, mutation + self.panel['foo'] = lp2['ItemA'] + assert_series_equal(self.panel['foo'].reindex(lp2.index), + lp2['ItemA']) + + def test_ops_scalar(self): + result = self.panel.mul(2) + expected = DataFrame.__mul__(self.panel, 2) + assert_frame_equal(result, expected) + + def test_combineFrame(self): + wp = self.panel.to_panel() + result = self.panel.add(wp['ItemA'].stack(), axis=0) + assert_frame_equal(result.to_panel()['ItemA'], wp['ItemA'] * 2) + + def test_combinePanel(self): + wp = self.panel.to_panel() + result = self.panel.add(self.panel) + wide_result = result.to_panel() + assert_frame_equal(wp['ItemA'] * 2, wide_result['ItemA']) + + # one item + result = self.panel.add(self.panel.filter(['ItemA'])) + + def test_combine_scalar(self): + result = self.panel.mul(2) + expected = DataFrame(self.panel._data) * 2 + assert_frame_equal(result, expected) + + def test_combine_series(self): + s = self.panel['ItemA'][:10] + result = self.panel.add(s, axis=0) + expected = DataFrame.add(self.panel, s, axis=0) + assert_frame_equal(result, expected) + + s = self.panel.ix[5] + result = self.panel + s + expected = DataFrame.add(self.panel, s, axis=1) + assert_frame_equal(result, expected) + + def test_operators(self): + wp = self.panel.to_panel() + result = (self.panel + 1).to_panel() + assert_frame_equal(wp['ItemA'] + 1, result['ItemA']) + + def 
test_arith_flex_panel(self): + ops = ['add', 'sub', 'mul', 'div', 'truediv', 'pow', 'floordiv', 'mod'] + if not compat.PY3: + aliases = {} + else: + aliases = {'div': 'truediv'} + self.panel = self.panel.to_panel() + + for n in [ np.random.randint(-50, -1), np.random.randint(1, 50), 0]: + for op in ops: + alias = aliases.get(op, op) + f = getattr(operator, alias) + exp = f(self.panel, n) + result = getattr(self.panel, op)(n) + assert_panel_equal(result, exp, check_panel_type=True) + + # rops + r_f = lambda x, y: f(y, x) + exp = r_f(self.panel, n) + result = getattr(self.panel, 'r' + op)(n) + assert_panel_equal(result, exp) + + def test_sort(self): + def is_sorted(arr): + return (arr[1:] > arr[:-1]).any() + + sorted_minor = self.panel.sortlevel(level=1) + self.assertTrue(is_sorted(sorted_minor.index.labels[1])) + + sorted_major = sorted_minor.sortlevel(level=0) + self.assertTrue(is_sorted(sorted_major.index.labels[0])) + + def test_to_string(self): + buf = StringIO() + self.panel.to_string(buf) + + def test_truncate(self): + dates = self.panel.index.levels[0] + start, end = dates[1], dates[5] + + trunced = self.panel.truncate(start, end).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(start, end) + + assert_frame_equal(trunced['ItemA'], expected, check_names=False) # TODO trucate drops index.names + + trunced = self.panel.truncate(before=start).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(before=start) + + assert_frame_equal(trunced['ItemA'], expected, check_names=False) # TODO trucate drops index.names + + trunced = self.panel.truncate(after=end).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(after=end) + + assert_frame_equal(trunced['ItemA'], expected, check_names=False) # TODO trucate drops index.names + + # truncate on dates that aren't in there + wp = self.panel.to_panel() + new_index = wp.major_axis[::5] + + wp2 = wp.reindex(major=new_index) + + lp2 = wp2.to_frame() + lp_trunc = lp2.truncate(wp.major_axis[2], wp.major_axis[-2]) + + wp_trunc = wp2.truncate(wp.major_axis[2], wp.major_axis[-2]) + + assert_panel_equal(wp_trunc, lp_trunc.to_panel()) + + # throw proper exception + self.assertRaises(Exception, lp2.truncate, wp.major_axis[-2], + wp.major_axis[2]) + + def test_axis_dummies(self): + from pandas.core.reshape import make_axis_dummies + + minor_dummies = make_axis_dummies(self.panel, 'minor') + self.assertEqual(len(minor_dummies.columns), + len(self.panel.index.levels[1])) + + major_dummies = make_axis_dummies(self.panel, 'major') + self.assertEqual(len(major_dummies.columns), + len(self.panel.index.levels[0])) + + mapping = {'A': 'one', + 'B': 'one', + 'C': 'two', + 'D': 'two'} + + transformed = make_axis_dummies(self.panel, 'minor', + transform=mapping.get) + self.assertEqual(len(transformed.columns), 2) + self.assert_numpy_array_equal(transformed.columns, ['one', 'two']) + + # TODO: test correctness + + def test_get_dummies(self): + from pandas.core.reshape import get_dummies, make_axis_dummies + + self.panel['Label'] = self.panel.index.labels[1] + minor_dummies = make_axis_dummies(self.panel, 'minor') + dummies = get_dummies(self.panel['Label']) + self.assert_numpy_array_equal(dummies.values, minor_dummies.values) + + def test_mean(self): + means = self.panel.mean(level='minor') + + # test versus Panel version + wide_means = self.panel.to_panel().mean('major') + assert_frame_equal(means, wide_means) + + def test_sum(self): + sums = self.panel.sum(level='minor') + + # test versus Panel version + wide_sums = 
self.panel.to_panel().sum('major') + assert_frame_equal(sums, wide_sums) + + def test_count(self): + index = self.panel.index + + major_count = self.panel.count(level=0)['ItemA'] + labels = index.labels[0] + for i, idx in enumerate(index.levels[0]): + self.assertEqual(major_count[i], (labels == i).sum()) + + minor_count = self.panel.count(level=1)['ItemA'] + labels = index.labels[1] + for i, idx in enumerate(index.levels[1]): + self.assertEqual(minor_count[i], (labels == i).sum()) + + def test_join(self): + lp1 = self.panel.filter(['ItemA', 'ItemB']) + lp2 = self.panel.filter(['ItemC']) + + joined = lp1.join(lp2) + + self.assertEqual(len(joined.columns), 3) + + self.assertRaises(Exception, lp1.join, + self.panel.filter(['ItemB', 'ItemC'])) + + def test_pivot(self): + from pandas.core.reshape import _slow_pivot + + one, two, three = (np.array([1, 2, 3, 4, 5]), + np.array(['a', 'b', 'c', 'd', 'e']), + np.array([1, 2, 3, 5, 4.])) + df = pivot(one, two, three) + self.assertEqual(df['a'][1], 1) + self.assertEqual(df['b'][2], 2) + self.assertEqual(df['c'][3], 3) + self.assertEqual(df['d'][4], 5) + self.assertEqual(df['e'][5], 4) + assert_frame_equal(df, _slow_pivot(one, two, three)) + + # weird overlap, TODO: test? + a, b, c = (np.array([1, 2, 3, 4, 4]), + np.array(['a', 'a', 'a', 'a', 'a']), + np.array([1., 2., 3., 4., 5.])) + self.assertRaises(Exception, pivot, a, b, c) + + # corner case, empty + df = pivot(np.array([]), np.array([]), np.array([])) + + +def test_monotonic(): + pos = np.array([1, 2, 3, 5]) + + def _monotonic(arr): + return not (arr[1:] < arr[:-1]).any() + + assert _monotonic(pos) + + neg = np.array([1, 2, 3, 4, 3]) + + assert not _monotonic(neg) + + neg2 = np.array([5, 1, 2, 3, 4, 5]) + + assert not _monotonic(neg2) + + +def test_panel_index(): + index = panelm.panel_index([1, 2, 3, 4], [1, 2, 3]) + expected = MultiIndex.from_arrays([np.tile([1, 2, 3, 4], 3), + np.repeat([1, 2, 3], 4)]) + assert(index.equals(expected)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py new file mode 100644 index 00000000..7dc5d9bd --- /dev/null +++ b/pandas/tests/test_panel4d.py @@ -0,0 +1,1055 @@ +from datetime import datetime +from pandas.compat import range, lrange +import os +import operator +import nose + +import numpy as np + +from pandas import Series, DataFrame, Index, isnull, notnull, pivot, MultiIndex +from pandas.core.datetools import bday +from pandas.core.frame import group_agg +from pandas.core.panel import Panel +from pandas.core.panel4d import Panel4D +from pandas.core.series import remove_na +import pandas.core.common as com +import pandas.core.panel as panelmod +from pandas import compat + +from pandas.util.testing import (assert_panel_equal, + assert_panel4d_equal, + assert_frame_equal, + assert_series_equal, + assert_almost_equal) +import pandas.util.testing as tm +import pandas.compat as compat + + +def add_nans(panel4d): + for l, label in enumerate(panel4d.labels): + panel = panel4d[label] + tm.add_nans(panel) + + +class SafeForLongAndSparse(object): + + _multiprocess_can_split_ = True + + def test_repr(self): + foo = repr(self.panel4d) + + def test_iter(self): + tm.equalContents(list(self.panel4d), self.panel4d.labels) + + def test_count(self): + f = lambda s: notnull(s).sum() + self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False) + + def test_sum(self): + self._check_stat_op('sum', np.sum) + + def 
test_mean(self): + self._check_stat_op('mean', np.mean) + + def test_prod(self): + self._check_stat_op('prod', np.prod) + + def test_median(self): + def wrapper(x): + if isnull(x).any(): + return np.nan + return np.median(x) + + self._check_stat_op('median', wrapper) + + def test_min(self): + self._check_stat_op('min', np.min) + + def test_max(self): + self._check_stat_op('max', np.max) + + def test_skew(self): + try: + from scipy.stats import skew + except ImportError: + raise nose.SkipTest("no scipy.stats.skew") + + def this_skew(x): + if len(x) < 3: + return np.nan + return skew(x, bias=False) + self._check_stat_op('skew', this_skew) + + # def test_mad(self): + # f = lambda x: np.abs(x - x.mean()).mean() + # self._check_stat_op('mad', f) + + def test_var(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.var(x, ddof=1) + self._check_stat_op('var', alt) + + def test_std(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.std(x, ddof=1) + self._check_stat_op('std', alt) + + def test_sem(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + + # def test_skew(self): + # from scipy.stats import skew + + # def alt(x): + # if len(x) < 3: + # return np.nan + # return skew(x, bias=False) + + # self._check_stat_op('skew', alt) + + def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): + if obj is None: + obj = self.panel4d + + # # set some NAs + # obj.ix[5:10] = np.nan + # obj.ix[15:20, -2:] = np.nan + + f = getattr(obj, name) + + if has_skipna: + def skipna_wrapper(x): + nona = remove_na(x) + if len(nona) == 0: + return np.nan + return alternative(nona) + + def wrapper(x): + return alternative(np.asarray(x)) + + for i in range(obj.ndim): + result = f(axis=i, skipna=False) + assert_panel_equal(result, obj.apply(wrapper, axis=i)) + else: + skipna_wrapper = alternative + wrapper = alternative + + for i in range(obj.ndim): + result = f(axis=i) + assert_panel_equal(result, obj.apply(skipna_wrapper, axis=i)) + + self.assertRaises(Exception, f, axis=obj.ndim) + + +class SafeForSparse(object): + + _multiprocess_can_split_ = True + + @classmethod + def assert_panel_equal(cls, x, y): + assert_panel_equal(x, y) + + @classmethod + def assert_panel4d_equal(cls, x, y): + assert_panel4d_equal(x, y) + + def test_get_axis(self): + assert(self.panel4d._get_axis(0) is self.panel4d.labels) + assert(self.panel4d._get_axis(1) is self.panel4d.items) + assert(self.panel4d._get_axis(2) is self.panel4d.major_axis) + assert(self.panel4d._get_axis(3) is self.panel4d.minor_axis) + + def test_set_axis(self): + new_labels = Index(np.arange(len(self.panel4d.labels))) + new_items = Index(np.arange(len(self.panel4d.items))) + new_major = Index(np.arange(len(self.panel4d.major_axis))) + new_minor = Index(np.arange(len(self.panel4d.minor_axis))) + + # ensure propagate to potentially prior-cached items too + label = self.panel4d['l1'] + self.panel4d.labels = new_labels + + if hasattr(self.panel4d, '_item_cache'): + self.assertNotIn('l1', self.panel4d._item_cache) + self.assertIs(self.panel4d.labels, new_labels) + + self.panel4d.major_axis = new_major + self.assertIs(self.panel4d[0].major_axis, new_major) + self.assertIs(self.panel4d.major_axis, new_major) + + self.panel4d.minor_axis = new_minor + self.assertIs(self.panel4d[0].minor_axis, new_minor) + self.assertIs(self.panel4d.minor_axis, new_minor) + + def test_get_axis_number(self): + self.assertEqual(self.panel4d._get_axis_number('labels'), 0) + 
self.assertEqual(self.panel4d._get_axis_number('items'), 1) + self.assertEqual(self.panel4d._get_axis_number('major'), 2) + self.assertEqual(self.panel4d._get_axis_number('minor'), 3) + + def test_get_axis_name(self): + self.assertEqual(self.panel4d._get_axis_name(0), 'labels') + self.assertEqual(self.panel4d._get_axis_name(1), 'items') + self.assertEqual(self.panel4d._get_axis_name(2), 'major_axis') + self.assertEqual(self.panel4d._get_axis_name(3), 'minor_axis') + + def test_arith(self): + self._test_op(self.panel4d, operator.add) + self._test_op(self.panel4d, operator.sub) + self._test_op(self.panel4d, operator.mul) + self._test_op(self.panel4d, operator.truediv) + self._test_op(self.panel4d, operator.floordiv) + self._test_op(self.panel4d, operator.pow) + + self._test_op(self.panel4d, lambda x, y: y + x) + self._test_op(self.panel4d, lambda x, y: y - x) + self._test_op(self.panel4d, lambda x, y: y * x) + self._test_op(self.panel4d, lambda x, y: y / x) + self._test_op(self.panel4d, lambda x, y: y ** x) + + self.assertRaises(Exception, self.panel4d.__add__, self.panel4d['l1']) + + @staticmethod + def _test_op(panel4d, op): + result = op(panel4d, 1) + assert_panel_equal(result['l1'], op(panel4d['l1'], 1)) + + def test_keys(self): + tm.equalContents(list(self.panel4d.keys()), self.panel4d.labels) + + def test_iteritems(self): + """Test panel4d.iteritems()""" + + self.assertEqual(len(list(compat.iteritems(self.panel4d))), + len(self.panel4d.labels)) + + def test_combinePanel4d(self): + result = self.panel4d.add(self.panel4d) + self.assert_panel4d_equal(result, self.panel4d * 2) + + def test_neg(self): + self.assert_panel4d_equal(-self.panel4d, self.panel4d * -1) + + def test_select(self): + p = self.panel4d + + # select labels + result = p.select(lambda x: x in ('l1', 'l3'), axis='labels') + expected = p.reindex(labels=['l1', 'l3']) + self.assert_panel4d_equal(result, expected) + + # select items + result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') + expected = p.reindex(items=['ItemA', 'ItemC']) + self.assert_panel4d_equal(result, expected) + + # select major_axis + result = p.select(lambda x: x >= datetime(2000, 1, 15), axis='major') + new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] + expected = p.reindex(major=new_major) + self.assert_panel4d_equal(result, expected) + + # select minor_axis + result = p.select(lambda x: x in ('D', 'A'), axis=3) + expected = p.reindex(minor=['A', 'D']) + self.assert_panel4d_equal(result, expected) + + # corner case, empty thing + result = p.select(lambda x: x in ('foo',), axis='items') + self.assert_panel4d_equal(result, p.reindex(items=[])) + + def test_get_value(self): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + result = self.panel.get_value(item, mjr, mnr) + expected = self.panel[item][mnr][mjr] + assert_almost_equal(result, expected) + + def test_abs(self): + result = self.panel4d.abs() + expected = np.abs(self.panel4d) + self.assert_panel4d_equal(result, expected) + + p = self.panel4d['l1'] + result = p.abs() + expected = np.abs(p) + assert_panel_equal(result, expected) + + df = p['ItemA'] + result = df.abs() + expected = np.abs(df) + assert_frame_equal(result, expected) + + +class CheckIndexing(object): + + _multiprocess_can_split_ = True + + def test_getitem(self): + self.assertRaises(Exception, self.panel4d.__getitem__, 'ItemQ') + + def test_delitem_and_pop(self): + expected = self.panel4d['l2'] + result = self.panel4d.pop('l2') + 
assert_panel_equal(expected, result) + self.assertNotIn('l2', self.panel4d.labels) + + del self.panel4d['l3'] + self.assertNotIn('l3', self.panel4d.labels) + self.assertRaises(Exception, self.panel4d.__delitem__, 'l3') + + values = np.empty((4, 4, 4, 4)) + values[0] = 0 + values[1] = 1 + values[2] = 2 + values[3] = 3 + + panel4d = Panel4D(values, lrange(4), lrange(4), lrange(4), lrange(4)) + + # did we delete the right row? + + panel4dc = panel4d.copy() + del panel4dc[0] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[1] + assert_panel_equal(panel4dc[0], panel4d[0]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[2] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[0], panel4d[0]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[3] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[0], panel4d[0]) + + def test_setitem(self): + ## LongPanel with one item + # lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() + # self.assertRaises(Exception, self.panel.__setitem__, + # 'ItemE', lp) + + # Panel + p = Panel(dict( + ItemA=self.panel4d['l1']['ItemA'][2:].filter(items=['A', 'B']))) + self.panel4d['l4'] = p + self.panel4d['l5'] = p + + p2 = self.panel4d['l4'] + + assert_panel_equal(p, p2.reindex(items=p.items, + major_axis=p.major_axis, + minor_axis=p.minor_axis)) + + # scalar + self.panel4d['lG'] = 1 + self.panel4d['lE'] = True + self.assertEqual(self.panel4d['lG'].values.dtype, np.int64) + self.assertEqual(self.panel4d['lE'].values.dtype, np.bool_) + + # object dtype + self.panel4d['lQ'] = 'foo' + self.assertEqual(self.panel4d['lQ'].values.dtype, np.object_) + + # boolean dtype + self.panel4d['lP'] = self.panel4d['l1'] > 0 + self.assertEqual(self.panel4d['lP'].values.dtype, np.bool_) + + def test_comparisons(self): + p1 = tm.makePanel4D() + p2 = tm.makePanel4D() + + tp = p1.reindex(labels=p1.labels + ['foo']) + p = p1[p1.labels[0]] + + def test_comp(func): + result = func(p1, p2) + self.assert_numpy_array_equal(result.values, + func(p1.values, p2.values)) + + # versus non-indexed same objs + self.assertRaises(Exception, func, p1, tp) + + # versus different objs + self.assertRaises(Exception, func, p1, p) + + result3 = func(self.panel4d, 0) + self.assert_numpy_array_equal(result3.values, + func(self.panel4d.values, 0)) + + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) + + def test_setitem_ndarray(self): + raise nose.SkipTest("skipping for now") + # from pandas import DateRange, datetools + + # timeidx = DateRange(start=datetime(2009,1,1), + # end=datetime(2009,12,31), + # offset=datetools.MonthEnd()) + # lons_coarse = np.linspace(-177.5, 177.5, 72) + # lats_coarse = np.linspace(-87.5, 87.5, 36) + # P = Panel(items=timeidx, major_axis=lons_coarse, minor_axis=lats_coarse) + # data = np.random.randn(72*36).reshape((72,36)) + # key = datetime(2009,2,28) + # P[key] = data# + + # assert_almost_equal(P[key].values, data) + + def test_major_xs(self): + ref = self.panel4d['l1']['ItemA'] + + idx = self.panel4d.major_axis[5] + xs = self.panel4d.major_xs(idx) + + assert_series_equal(xs['l1'].T['ItemA'], ref.xs(idx)) + + # not contained + idx = 
self.panel4d.major_axis[0] - bday + self.assertRaises(Exception, self.panel4d.major_xs, idx) + + def test_major_xs_mixed(self): + self.panel4d['l4'] = 'foo' + xs = self.panel4d.major_xs(self.panel4d.major_axis[0]) + self.assertEqual(xs['l1']['A'].dtype, np.float64) + self.assertEqual(xs['l4']['A'].dtype, np.object_) + + def test_minor_xs(self): + ref = self.panel4d['l1']['ItemA'] + + idx = self.panel4d.minor_axis[1] + xs = self.panel4d.minor_xs(idx) + + assert_series_equal(xs['l1'].T['ItemA'], ref[idx]) + + # not contained + self.assertRaises(Exception, self.panel4d.minor_xs, 'E') + + def test_minor_xs_mixed(self): + self.panel4d['l4'] = 'foo' + + xs = self.panel4d.minor_xs('D') + self.assertEqual(xs['l1'].T['ItemA'].dtype, np.float64) + self.assertEqual(xs['l4'].T['ItemA'].dtype, np.object_) + + def test_xs(self): + l1 = self.panel4d.xs('l1', axis=0) + expected = self.panel4d['l1'] + assert_panel_equal(l1, expected) + + # view if possible + l1_view = self.panel4d.xs('l1', axis=0) + l1_view.values[:] = np.nan + self.assertTrue(np.isnan(self.panel4d['l1'].values).all()) + + # mixed-type + self.panel4d['strings'] = 'foo' + result = self.panel4d.xs('D', axis=3) + self.assertIsNotNone(result.is_copy) + + def test_getitem_fancy_labels(self): + panel4d = self.panel4d + + labels = panel4d.labels[[1, 0]] + items = panel4d.items[[1, 0]] + dates = panel4d.major_axis[::2] + cols = ['D', 'C', 'F'] + + # all 4 specified + assert_panel4d_equal(panel4d.ix[labels, items, dates, cols], + panel4d.reindex(labels=labels, items=items, major=dates, minor=cols)) + + # 3 specified + assert_panel4d_equal(panel4d.ix[:, items, dates, cols], + panel4d.reindex(items=items, major=dates, minor=cols)) + + # 2 specified + assert_panel4d_equal(panel4d.ix[:, :, dates, cols], + panel4d.reindex(major=dates, minor=cols)) + + assert_panel4d_equal(panel4d.ix[:, items, :, cols], + panel4d.reindex(items=items, minor=cols)) + + assert_panel4d_equal(panel4d.ix[:, items, dates, :], + panel4d.reindex(items=items, major=dates)) + + # only 1 + assert_panel4d_equal(panel4d.ix[:, items, :, :], + panel4d.reindex(items=items)) + + assert_panel4d_equal(panel4d.ix[:, :, dates, :], + panel4d.reindex(major=dates)) + + assert_panel4d_equal(panel4d.ix[:, :, :, cols], + panel4d.reindex(minor=cols)) + + def test_getitem_fancy_slice(self): + pass + + def test_getitem_fancy_ints(self): + pass + + def test_getitem_fancy_xs(self): + raise nose.SkipTest("skipping for now") + # self.assertRaises(NotImplementedError, self.panel4d.major_xs) + # self.assertRaises(NotImplementedError, self.panel4d.minor_xs) + + def test_get_value(self): + for label in self.panel4d.labels: + for item in self.panel4d.items: + for mjr in self.panel4d.major_axis[::2]: + for mnr in self.panel4d.minor_axis: + result = self.panel4d.get_value( + label, item, mjr, mnr) + expected = self.panel4d[label][item][mnr][mjr] + assert_almost_equal(result, expected) + + def test_set_value(self): + for label in self.panel4d.labels: + for item in self.panel4d.items: + for mjr in self.panel4d.major_axis[::2]: + for mnr in self.panel4d.minor_axis: + self.panel4d.set_value(label, item, mjr, mnr, 1.) + assert_almost_equal( + self.panel4d[label][item][mnr][mjr], 1.) 
+ + # resize + res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5) + tm.assert_isinstance(res, Panel4D) + self.assertIsNot(res, self.panel4d) + self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) + + res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) + self.assertTrue(com.is_float_dtype(res3['l4'].values)) + + +class TestPanel4d(tm.TestCase, CheckIndexing, SafeForSparse, + SafeForLongAndSparse): + + _multiprocess_can_split_ = True + + @classmethod + def assert_panel4d_equal(cls, x, y): + assert_panel4d_equal(x, y) + + def setUp(self): + self.panel4d = tm.makePanel4D(nper=8) + add_nans(self.panel4d) + + def test_constructor(self): + # with BlockManager + panel4d = Panel4D(self.panel4d._data) + self.assertIs(panel4d._data, self.panel4d._data) + + panel4d = Panel4D(self.panel4d._data, copy=True) + self.assertIsNot(panel4d._data, self.panel4d._data) + assert_panel4d_equal(panel4d, self.panel4d) + + # strings handled prop + # panel4d = Panel4D([[['foo', 'foo', 'foo',], + # ['foo', 'foo', 'foo']]]) + # self.assertEqual(wp.values.dtype, np.object_) + + vals = self.panel4d.values + + # no copy + panel4d = Panel4D(vals) + self.assertIs(panel4d.values, vals) + + # copy + panel4d = Panel4D(vals, copy=True) + self.assertIsNot(panel4d.values, vals) + + def test_constructor_cast(self): + zero_filled = self.panel4d.fillna(0) + + casted = Panel4D(zero_filled._data, dtype=int) + casted2 = Panel4D(zero_filled.values, dtype=int) + + exp_values = zero_filled.values.astype(int) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) + + casted = Panel4D(zero_filled._data, dtype=np.int32) + casted2 = Panel4D(zero_filled.values, dtype=np.int32) + + exp_values = zero_filled.values.astype(np.int32) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) + + # can't cast + data = [[['foo', 'bar', 'baz']]] + self.assertRaises(ValueError, Panel, data, dtype=float) + + def test_constructor_empty_panel(self): + empty = Panel() + self.assertEqual(len(empty.items), 0) + self.assertEqual(len(empty.major_axis), 0) + self.assertEqual(len(empty.minor_axis), 0) + + def test_constructor_observe_dtype(self): + # GH #411 + panel = Panel(items=lrange(3), major_axis=lrange(3), + minor_axis=lrange(3), dtype='O') + self.assertEqual(panel.values.dtype, np.object_) + + def test_consolidate(self): + self.assertTrue(self.panel4d._data.is_consolidated()) + + self.panel4d['foo'] = 1. 
+ self.assertFalse(self.panel4d._data.is_consolidated()) + + panel4d = self.panel4d.consolidate() + self.assertTrue(panel4d._data.is_consolidated()) + + def test_ctor_dict(self): + l1 = self.panel4d['l1'] + l2 = self.panel4d['l2'] + + d = {'A': l1, 'B': l2.ix[['ItemB'], :, :]} + # d2 = {'A' : itema._series, 'B' : itemb[5:]._series} + # d3 = {'A' : DataFrame(itema._series), + # 'B' : DataFrame(itemb[5:]._series)} + + panel4d = Panel4D(d) + # wp2 = Panel.from_dict(d2) # nested Dict + # wp3 = Panel.from_dict(d3) + # self.assertTrue(wp.major_axis.equals(self.panel.major_axis)) + assert_panel_equal(panel4d['A'], self.panel4d['l1']) + assert_frame_equal(panel4d.ix['B', 'ItemB', :, :], + self.panel4d.ix['l2', ['ItemB'], :, :]['ItemB']) + + # intersect + # wp = Panel.from_dict(d, intersect=True) + # self.assertTrue(wp.major_axis.equals(itemb.index[5:])) + + # use constructor + # assert_panel_equal(Panel(d), Panel.from_dict(d)) + # assert_panel_equal(Panel(d2), Panel.from_dict(d2)) + # assert_panel_equal(Panel(d3), Panel.from_dict(d3)) + + # cast + # dcasted = dict((k, v.reindex(wp.major_axis).fillna(0)) + # for k, v in d.iteritems()) + # result = Panel(dcasted, dtype=int) + # expected = Panel(dict((k, v.astype(int)) + # for k, v in dcasted.iteritems())) + # assert_panel_equal(result, expected) + + def test_constructor_dict_mixed(self): + data = dict((k, v.values) for k, v in compat.iteritems(self.panel4d)) + result = Panel4D(data) + exp_major = Index(np.arange(len(self.panel4d.major_axis))) + self.assertTrue(result.major_axis.equals(exp_major)) + + result = Panel4D(data, + labels=self.panel4d.labels, + items=self.panel4d.items, + major_axis=self.panel4d.major_axis, + minor_axis=self.panel4d.minor_axis) + assert_panel4d_equal(result, self.panel4d) + + data['l2'] = self.panel4d['l2'] + result = Panel4D(data) + assert_panel4d_equal(result, self.panel4d) + + # corner, blow up + data['l2'] = data['l2']['ItemB'] + self.assertRaises(Exception, Panel4D, data) + + data['l2'] = self.panel4d['l2'].values[:, :, :-1] + self.assertRaises(Exception, Panel4D, data) + + def test_constructor_resize(self): + data = self.panel4d._data + labels = self.panel4d.labels[:-1] + items = self.panel4d.items[:-1] + major = self.panel4d.major_axis[:-1] + minor = self.panel4d.minor_axis[:-1] + + result = Panel4D(data, labels=labels, items=items, + major_axis=major, minor_axis=minor) + expected = self.panel4d.reindex( + labels=labels, items=items, major=major, minor=minor) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, items=items, major_axis=major) + expected = self.panel4d.reindex(items=items, major=major) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, items=items) + expected = self.panel4d.reindex(items=items) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, minor_axis=minor) + expected = self.panel4d.reindex(minor=minor) + assert_panel4d_equal(result, expected) + + def test_from_dict_mixed_orient(self): + raise nose.SkipTest("skipping for now") + # df = tm.makeDataFrame() + # df['foo'] = 'bar' + + # data = {'k1' : df, + # 'k2' : df} + + # panel = Panel.from_dict(data, orient='minor') + + # self.assertEqual(panel['foo'].values.dtype, np.object_) + # self.assertEqual(panel['A'].values.dtype, np.float64) + + def test_values(self): + self.assertRaises(Exception, Panel, np.random.randn(5, 5, 5), + lrange(5), lrange(5), lrange(4)) + + def test_conform(self): + p = self.panel4d['l1'].filter(items=['ItemA', 'ItemB']) + conformed = self.panel4d.conform(p) + + 
assert(conformed.items.equals(self.panel4d.labels)) + assert(conformed.major_axis.equals(self.panel4d.major_axis)) + assert(conformed.minor_axis.equals(self.panel4d.minor_axis)) + + def test_reindex(self): + ref = self.panel4d['l2'] + + # labels + result = self.panel4d.reindex(labels=['l1', 'l2']) + assert_panel_equal(result['l2'], ref) + + # items + result = self.panel4d.reindex(items=['ItemA', 'ItemB']) + assert_frame_equal(result['l2']['ItemB'], ref['ItemB']) + + # major + new_major = list(self.panel4d.major_axis[:10]) + result = self.panel4d.reindex(major=new_major) + assert_frame_equal( + result['l2']['ItemB'], ref['ItemB'].reindex(index=new_major)) + + # raise exception put both major and major_axis + self.assertRaises(Exception, self.panel4d.reindex, + major_axis=new_major, major=new_major) + + # minor + new_minor = list(self.panel4d.minor_axis[:2]) + result = self.panel4d.reindex(minor=new_minor) + assert_frame_equal( + result['l2']['ItemB'], ref['ItemB'].reindex(columns=new_minor)) + + result = self.panel4d.reindex(labels=self.panel4d.labels, + items=self.panel4d.items, + major=self.panel4d.major_axis, + minor=self.panel4d.minor_axis) + + # don't necessarily copy + result = self.panel4d.reindex() + assert_panel4d_equal(result,self.panel4d) + self.assertFalse(result is self.panel4d) + + # with filling + smaller_major = self.panel4d.major_axis[::5] + smaller = self.panel4d.reindex(major=smaller_major) + + larger = smaller.reindex(major=self.panel4d.major_axis, + method='pad') + + assert_panel_equal(larger.ix[:, :, self.panel4d.major_axis[1], :], + smaller.ix[:, :, smaller_major[0], :]) + + # don't necessarily copy + result = self.panel4d.reindex( + major=self.panel4d.major_axis, copy=False) + assert_panel4d_equal(result,self.panel4d) + self.assertTrue(result is self.panel4d) + + def test_not_hashable(self): + p4D_empty = Panel4D() + self.assertRaises(TypeError, hash, p4D_empty) + self.assertRaises(TypeError, hash, self.panel4d) + + def test_reindex_like(self): + # reindex_like + smaller = self.panel4d.reindex(labels=self.panel4d.labels[:-1], + items=self.panel4d.items[:-1], + major=self.panel4d.major_axis[:-1], + minor=self.panel4d.minor_axis[:-1]) + smaller_like = self.panel4d.reindex_like(smaller) + assert_panel4d_equal(smaller, smaller_like) + + def test_take(self): + raise nose.SkipTest("skipping for now") + + # # axis == 0 + # result = self.panel.take([2, 0, 1], axis=0) + # expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) + # assert_panel_equal(result, expected)# + + # # axis >= 1 + # result = self.panel.take([3, 0, 1, 2], axis=2) + # expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) + # assert_panel_equal(result, expected) + + # self.assertRaises(Exception, self.panel.take, [3, -1, 1, 2], axis=2) + # self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + + def test_sort_index(self): + import random + + rlabels = list(self.panel4d.labels) + ritems = list(self.panel4d.items) + rmajor = list(self.panel4d.major_axis) + rminor = list(self.panel4d.minor_axis) + random.shuffle(rlabels) + random.shuffle(ritems) + random.shuffle(rmajor) + random.shuffle(rminor) + + random_order = self.panel4d.reindex(labels=rlabels) + sorted_panel4d = random_order.sort_index(axis=0) + assert_panel4d_equal(sorted_panel4d, self.panel4d) + + # descending + # random_order = self.panel.reindex(items=ritems) + # sorted_panel = random_order.sort_index(axis=0, ascending=False) + # assert_panel_equal(sorted_panel, + # self.panel.reindex(items=self.panel.items[::-1])) + + # 
random_order = self.panel.reindex(major=rmajor) + # sorted_panel = random_order.sort_index(axis=1) + # assert_panel_equal(sorted_panel, self.panel) + + # random_order = self.panel.reindex(minor=rminor) + # sorted_panel = random_order.sort_index(axis=2) + # assert_panel_equal(sorted_panel, self.panel) + + def test_fillna(self): + self.assertFalse(np.isfinite(self.panel4d.values).all()) + filled = self.panel4d.fillna(0) + self.assertTrue(np.isfinite(filled.values).all()) + + self.assertRaises(NotImplementedError, self.panel4d.fillna, method='pad') + + def test_swapaxes(self): + result = self.panel4d.swapaxes('labels', 'items') + self.assertIs(result.items, self.panel4d.labels) + + result = self.panel4d.swapaxes('labels', 'minor') + self.assertIs(result.labels, self.panel4d.minor_axis) + + result = self.panel4d.swapaxes('items', 'minor') + self.assertIs(result.items, self.panel4d.minor_axis) + + result = self.panel4d.swapaxes('items', 'major') + self.assertIs(result.items, self.panel4d.major_axis) + + result = self.panel4d.swapaxes('major', 'minor') + self.assertIs(result.major_axis, self.panel4d.minor_axis) + + # this should also work + result = self.panel4d.swapaxes(0, 1) + self.assertIs(result.labels, self.panel4d.items) + + # this works, but return a copy + result = self.panel4d.swapaxes('items', 'items') + assert_panel4d_equal(self.panel4d,result) + self.assertNotEqual(id(self.panel4d), id(result)) + + def test_to_frame(self): + raise nose.SkipTest("skipping for now") + # # filtered + # filtered = self.panel.to_frame() + # expected = self.panel.to_frame().dropna(how='any') + # assert_frame_equal(filtered, expected) + + # # unfiltered + # unfiltered = self.panel.to_frame(filter_observations=False) + # assert_panel_equal(unfiltered.to_panel(), self.panel) + + # # names + # self.assertEqual(unfiltered.index.names, ('major', 'minor')) + + def test_to_frame_mixed(self): + raise nose.SkipTest("skipping for now") + # panel = self.panel.fillna(0) + # panel['str'] = 'foo' + # panel['bool'] = panel['ItemA'] > 0 + + # lp = panel.to_frame() + # wp = lp.to_panel() + # self.assertEqual(wp['bool'].values.dtype, np.bool_) + # assert_frame_equal(wp['bool'], panel['bool']) + + def test_update(self): + + p4d = Panel4D([[[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]]) + + other = Panel4D([[[[3.6, 2., np.nan]], + [[np.nan, np.nan, 7]]]]) + + p4d.update(other) + + expected = Panel4D([[[[3.6, 2, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]]) + + assert_panel4d_equal(p4d, expected) + + def test_filter(self): + raise nose.SkipTest("skipping for now") + + def test_apply(self): + raise nose.SkipTest("skipping for now") + + def test_dtypes(self): + + result = self.panel4d.dtypes + expected = Series(np.dtype('float64'),index=self.panel4d.labels) + assert_series_equal(result, expected) + + def test_compound(self): + raise nose.SkipTest("skipping for now") + # compounded = self.panel.compound() + + # assert_series_equal(compounded['ItemA'], + # (1 + self.panel['ItemA']).product(0) - 1) + + def test_shift(self): + raise nose.SkipTest("skipping for now") + # # major + # idx = self.panel.major_axis[0] + # idx_lag = self.panel.major_axis[1] + + # shifted = self.panel.shift(1) + + # assert_frame_equal(self.panel.major_xs(idx), + # shifted.major_xs(idx_lag)) + + # # minor + # idx = 
self.panel.minor_axis[0] + # idx_lag = self.panel.minor_axis[1] + + # shifted = self.panel.shift(1, axis='minor') + + # assert_frame_equal(self.panel.minor_xs(idx), + # shifted.minor_xs(idx_lag)) + + # self.assertRaises(Exception, self.panel.shift, 1, axis='items') + + def test_multiindex_get(self): + raise nose.SkipTest("skipping for now") + # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b',2)], + # names=['first', 'second']) + # wp = Panel(np.random.random((4,5,5)), + # items=ind, + # major_axis=np.arange(5), + # minor_axis=np.arange(5)) + # f1 = wp['a'] + # f2 = wp.ix['a'] + # assert_panel_equal(f1, f2) + + # self.assertTrue((f1.items == [1, 2]).all()) + # self.assertTrue((f2.items == [1, 2]).all()) + + # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + # names=['first', 'second']) + + def test_multiindex_blocks(self): + raise nose.SkipTest("skipping for now") + # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + # names=['first', 'second']) + # wp = Panel(self.panel._data) + # wp.items = ind + # f1 = wp['a'] + # self.assertTrue((f1.items == [1, 2]).all()) + + # f1 = wp[('b',1)] + # self.assertTrue((f1.columns == ['A', 'B', 'C', 'D']).all()) + + def test_repr_empty(self): + empty = Panel4D() + repr(empty) + + def test_rename(self): + mapper = { + 'l1': 'foo', + 'l2': 'bar', + 'l3': 'baz' + } + + renamed = self.panel4d.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assertTrue(renamed.labels.equals(exp)) + + renamed = self.panel4d.rename_axis(str.lower, axis=3) + exp = Index(['a', 'b', 'c', 'd']) + self.assertTrue(renamed.minor_axis.equals(exp)) + + # don't copy + renamed_nocopy = self.panel4d.rename_axis(mapper, axis=0, copy=False) + renamed_nocopy['foo'] = 3. + self.assertTrue((self.panel4d['l1'].values == 3).all()) + + def test_get_attr(self): + assert_panel_equal(self.panel4d['l1'], self.panel4d.l1) + + def test_group_agg(self): + values = np.ones((10, 2)) * np.arange(10).reshape((10, 1)) + bounds = np.arange(5) * 2 + f = lambda x: x.mean(axis=0) + + agged = group_agg(values, bounds, f) + + assert(agged[1][0] == 2.5) + assert(agged[2][0] == 4.5) + + # test a function that doesn't aggregate + f2 = lambda x: np.zeros((2, 2)) + self.assertRaises(Exception, group_agg, values, bounds, f2) + + def test_from_frame_level1_unsorted(self): + raise nose.SkipTest("skipping for now") + + def test_to_excel(self): + raise nose.SkipTest("skipping for now") + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', + '--with-timer'], + exit=False) diff --git a/pandas/tests/test_panelnd.py b/pandas/tests/test_panelnd.py new file mode 100644 index 00000000..92083afb --- /dev/null +++ b/pandas/tests/test_panelnd.py @@ -0,0 +1,110 @@ +from datetime import datetime +import os +import operator +import nose + +import numpy as np + +from pandas.core import panelnd +from pandas.core.panel import Panel +import pandas.core.common as com +from pandas import compat + +from pandas.util.testing import (assert_panel_equal, + assert_panel4d_equal, + assert_frame_equal, + assert_series_equal, + assert_almost_equal) +import pandas.util.testing as tm + + +class TestPanelnd(tm.TestCase): + + def setUp(self): + pass + + def test_4d_construction(self): + + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel, + 
aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) + + def test_4d_construction_alt(self): + + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer='Panel', + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) + + def test_4d_construction_error(self): + + # create a 4D + self.assertRaises(Exception, + panelnd.create_nd_panel_factory, + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', + 'minor_axis'], + slices={'items': 'items', + 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer='foo', + aliases={'major': 'major_axis', + 'minor': 'minor_axis'}, + stat_axis=2) + + def test_5d_construction(self): + + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels1', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) + + # create a 5D + Panel5D = panelnd.create_nd_panel_factory( + klass_name='Panel5D', + orders=['cool1', 'labels1', 'items', 'major_axis', + 'minor_axis'], + slices={'labels1': 'labels1', 'items': 'items', + 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel4D, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + p5d = Panel5D(dict(C1=p4d)) + + # slice back to 4d + results = p5d.ix['C1', :, :, 0:3, :] + expected = p4d.ix[:, :, 0:3, :] + assert_panel_equal(results['L1'], expected['L1']) + + # test a transpose + # results = p5d.transpose(1,2,3,4,0) + # expected = + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py new file mode 100644 index 00000000..42427617 --- /dev/null +++ b/pandas/tests/test_reshape.py @@ -0,0 +1,334 @@ +# pylint: disable-msg=W0612,E1101 +from copy import deepcopy +from datetime import datetime, timedelta +import operator +import os + +import nose + +from pandas import DataFrame, Series +import pandas as pd + +from numpy import nan +import numpy as np + +from pandas.util.testing import assert_frame_equal +from numpy.testing import assert_array_equal + +from pandas.core.reshape import (melt, convert_dummies, lreshape, get_dummies, + wide_to_long) +import pandas.util.testing as tm +from pandas.compat import StringIO, cPickle, range, u + +_multiprocess_can_split_ = True + + +class TestMelt(tm.TestCase): + + def setUp(self): + self.df = tm.makeTimeDataFrame()[:10] + self.df['id1'] = (self.df['A'] > 0).astype(np.int64) + self.df['id2'] = (self.df['B'] > 0).astype(np.int64) + + self.var_name = 'var' + self.value_name = 'val' + + self.df1 = pd.DataFrame([[ 1.067683, -1.110463, 0.20867 ], + [-1.321405, 0.368915, -1.055342], + [-0.807333, 0.08298 , -0.873361]]) + self.df1.columns = [list('ABC'), list('abc')] + self.df1.columns.names = ['CAP', 'low'] + + def test_default_col_names(self): + result = melt(self.df) + self.assertEqual(result.columns.tolist(), ['variable', 'value']) + + result1 = melt(self.df, id_vars=['id1']) + 
self.assertEqual(result1.columns.tolist(), ['id1', 'variable', 'value']) + + result2 = melt(self.df, id_vars=['id1', 'id2']) + self.assertEqual(result2.columns.tolist(), ['id1', 'id2', 'variable', 'value']) + + def test_value_vars(self): + result3 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A') + self.assertEqual(len(result3), 10) + + result4 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B']) + expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A']*10 + ['B']*10, + 'value': self.df['A'].tolist() + self.df['B'].tolist()}, + columns=['id1', 'id2', 'variable', 'value']) + tm.assert_frame_equal(result4, expected4) + + def test_custom_var_name(self): + result5 = melt(self.df, var_name=self.var_name) + self.assertEqual(result5.columns.tolist(), ['var', 'value']) + + result6 = melt(self.df, id_vars=['id1'], var_name=self.var_name) + self.assertEqual(result6.columns.tolist(), ['id1', 'var', 'value']) + + result7 = melt(self.df, id_vars=['id1', 'id2'], var_name=self.var_name) + self.assertEqual(result7.columns.tolist(), ['id1', 'id2', 'var', 'value']) + + result8 = melt(self.df, id_vars=['id1', 'id2'], + value_vars='A', var_name=self.var_name) + self.assertEqual(result8.columns.tolist(), ['id1', 'id2', 'var', 'value']) + + result9 = melt(self.df, id_vars=['id1', 'id2'], + value_vars=['A', 'B'], var_name=self.var_name) + expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + self.var_name: ['A']*10 + ['B']*10, + 'value': self.df['A'].tolist() + self.df['B'].tolist()}, + columns=['id1', 'id2', self.var_name, 'value']) + tm.assert_frame_equal(result9, expected9) + + def test_custom_value_name(self): + result10 = melt(self.df, value_name=self.value_name) + self.assertEqual(result10.columns.tolist(), ['variable', 'val']) + + result11 = melt(self.df, id_vars=['id1'], value_name=self.value_name) + self.assertEqual(result11.columns.tolist(), ['id1', 'variable', 'val']) + + result12 = melt(self.df, id_vars=['id1', 'id2'], value_name=self.value_name) + self.assertEqual(result12.columns.tolist(), ['id1', 'id2', 'variable', 'val']) + + result13 = melt(self.df, id_vars=['id1', 'id2'], + value_vars='A', value_name=self.value_name) + self.assertEqual(result13.columns.tolist(), ['id1', 'id2', 'variable', 'val']) + + result14 = melt(self.df, id_vars=['id1', 'id2'], + value_vars=['A', 'B'], value_name=self.value_name) + expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + 'variable': ['A']*10 + ['B']*10, + self.value_name: self.df['A'].tolist() + self.df['B'].tolist()}, + columns=['id1', 'id2', 'variable', self.value_name]) + tm.assert_frame_equal(result14, expected14) + + def test_custom_var_and_value_name(self): + + result15 = melt(self.df, var_name=self.var_name, value_name=self.value_name) + self.assertEqual(result15.columns.tolist(), ['var', 'val']) + + result16 = melt(self.df, id_vars=['id1'], var_name=self.var_name, value_name=self.value_name) + self.assertEqual(result16.columns.tolist(), ['id1', 'var', 'val']) + + result17 = melt(self.df, id_vars=['id1', 'id2'], + var_name=self.var_name, value_name=self.value_name) + self.assertEqual(result17.columns.tolist(), ['id1', 'id2', 'var', 'val']) + + result18 = melt(self.df, id_vars=['id1', 'id2'], + value_vars='A', var_name=self.var_name, value_name=self.value_name) + self.assertEqual(result18.columns.tolist(), ['id1', 'id2', 'var', 'val']) + + result19 = melt(self.df, id_vars=['id1', 'id2'], + 
value_vars=['A', 'B'], var_name=self.var_name, value_name=self.value_name) + expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2, + 'id2': self.df['id2'].tolist() * 2, + self.var_name: ['A']*10 + ['B']*10, + self.value_name: self.df['A'].tolist() + self.df['B'].tolist()}, + columns=['id1', 'id2', self.var_name, self.value_name]) + tm.assert_frame_equal(result19, expected19) + + df20 = self.df.copy() + df20.columns.name = 'foo' + result20 = melt(df20) + self.assertEqual(result20.columns.tolist(), ['foo', 'value']) + + def test_col_level(self): + res1 = melt(self.df1, col_level=0) + res2 = melt(self.df1, col_level='CAP') + self.assertEqual(res1.columns.tolist(), ['CAP', 'value']) + self.assertEqual(res1.columns.tolist(), ['CAP', 'value']) + + def test_multiindex(self): + res = pd.melt(self.df1) + self.assertEqual(res.columns.tolist(), ['CAP', 'low', 'value']) + + +class TestGetDummies(tm.TestCase): + def test_basic(self): + s_list = list('abc') + s_series = Series(s_list) + s_series_index = Series(s_list, list('ABC')) + + expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}, + 'c': {0: 0.0, 1: 0.0, 2: 1.0}}) + assert_frame_equal(get_dummies(s_list), expected) + assert_frame_equal(get_dummies(s_series), expected) + + expected.index = list('ABC') + assert_frame_equal(get_dummies(s_series_index), expected) + + def test_just_na(self): + just_na_list = [np.nan] + just_na_series = Series(just_na_list) + just_na_series_index = Series(just_na_list, index = ['A']) + + res_list = get_dummies(just_na_list) + res_series = get_dummies(just_na_series) + res_series_index = get_dummies(just_na_series_index) + + self.assertEqual(res_list.empty, True) + self.assertEqual(res_series.empty, True) + self.assertEqual(res_series_index.empty, True) + + self.assertEqual(res_list.index.tolist(), [0]) + self.assertEqual(res_series.index.tolist(), [0]) + self.assertEqual(res_series_index.index.tolist(), ['A']) + + def test_include_na(self): + s = ['a', 'b', np.nan] + res = get_dummies(s) + exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + assert_frame_equal(res, exp) + + res_na = get_dummies(s, dummy_na=True) + exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, + 'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(['a', 'b', nan], 1) + # hack (NaN handling in assert_index_equal) + exp_na.columns = res_na.columns + assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies([nan], dummy_na=True) + exp_just_na = DataFrame(Series(1.0,index=[0]),columns=[nan]) + assert_array_equal(res_just_na.values, exp_just_na.values) + + def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values + import unicodedata + e = 'e' + eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') + s = [e, eacute, eacute] + res = get_dummies(s, prefix='letter') + exp = DataFrame({'letter_e': {0: 1.0, 1: 0.0, 2: 0.0}, + u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}}) + assert_frame_equal(res, exp) + +class TestConvertDummies(tm.TestCase): + def test_convert_dummies(self): + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + result = convert_dummies(df, ['A', 'B']) + result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.') + + expected = DataFrame({'A_foo': [1, 0, 1, 0, 1, 0, 1, 1], + 'A_bar': [0, 1, 0, 1, 0, 1, 0, 0], + 'B_one': [1, 1, 0, 0, 0, 0, 1, 0], + 
'B_two': [0, 0, 1, 0, 1, 1, 0, 0], + 'B_three': [0, 0, 0, 1, 0, 0, 0, 1], + 'C': df['C'].values, + 'D': df['D'].values}, + columns=result.columns, dtype=float) + expected2 = expected.rename(columns=lambda x: x.replace('_', '.')) + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected2) + + +class TestLreshape(tm.TestCase): + + def test_pairs(self): + data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009'], + 'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], + 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt1': [1823, 3338, 1549, 3298, 4306], + 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], + 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} + + df = DataFrame(data) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + result = lreshape(df, spec) + + exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', '08jan2009', + '30dec2008', '21dec2008', '11jan2009', + '08jan2009', '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, + 1454, 3139, 4133, 1766, 3139, 4133], + 'id': [101, 102, 103, 104, 105, 101, + 103, 104, 105, 101, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Male', + 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', '29dec2008', + '20jan2009', '21jan2009', '22jan2009', '31dec2008', + '03feb2009', '05feb2009', '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, + 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + result = lreshape(df, spec, dropna=False) + exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', + '08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009', + '08jan2009', '20dec2008', '30dec2008', + '21dec2008', '11jan2009'], + 'birthwt': [1766, 3301, 1454, 3139, 4133, + 1766, 3301, 1454, 3139, 4133, + 1766, 3301, 1454, 3139, 4133], + 'id': [101, 102, 103, 104, 105, + 101, 102, 103, 104, 105, + 101, 102, 103, 104, 105], + 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female', + 'Male', 'Female', 'Female', 'Female', 'Female'], + 'visitdt': ['11jan2009', '22dec2008', '04jan2009', + '29dec2008', '20jan2009', + '21jan2009', nan, '22jan2009', + '31dec2008', '03feb2009', + '05feb2009', nan, nan, '02jan2009', '15feb2009'], + 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, + nan, 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, + 3377.0, 4805.0]} + exp = DataFrame(exp_data, columns=result.columns) + tm.assert_frame_equal(result, exp) + + spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], + 'wt': ['wt%d' % i for i in range(1, 4)]} + self.assertRaises(ValueError, lreshape, df, spec) + +class TestWideToLong(tm.TestCase): + def test_simple(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, + "A1980" : {0 : "d", 1 : "e", 2 : "f"}, + "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, + "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, + "X" : dict(zip(range(3), x)) + }) + df["id"] = df.index + exp_data = {"X" : 
x.tolist() + x.tolist(), + "A" : ['a', 'b', 'c', 'd', 'e', 'f'], + "B" : [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year" : [1970, 1970, 1970, 1980, 1980, 1980], + "id" : [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(long_frame, exp_frame) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_rplot.py b/pandas/tests/test_rplot.py new file mode 100644 index 00000000..ddfce477 --- /dev/null +++ b/pandas/tests/test_rplot.py @@ -0,0 +1,298 @@ +from pandas.compat import range +import pandas.tools.rplot as rplot +import pandas.util.testing as tm +from pandas import read_csv +import os + +import nose + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + + +def between(a, b, x): + """Check if x is in the somewhere between a and b. + + Parameters: + ----------- + a: float, interval start + b: float, interval end + x: float, value to test for + + Returns: + -------- + True if x is between a and b, False otherwise + """ + if a < b: + return x >= a and x <= b + else: + return x <= a and x >= b + + +@tm.mplskip +class TestUtilityFunctions(tm.TestCase): + """ + Tests for RPlot utility functions. + """ + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + + def test_make_aes1(self): + aes = rplot.make_aes() + self.assertTrue(aes['x'] is None) + self.assertTrue(aes['y'] is None) + self.assertTrue(aes['size'] is None) + self.assertTrue(aes['colour'] is None) + self.assertTrue(aes['shape'] is None) + self.assertTrue(aes['alpha'] is None) + self.assertTrue(isinstance(aes, dict)) + + def test_make_aes2(self): + self.assertRaises(ValueError, rplot.make_aes, + size=rplot.ScaleShape('test')) + self.assertRaises(ValueError, rplot.make_aes, + colour=rplot.ScaleShape('test')) + self.assertRaises(ValueError, rplot.make_aes, + shape=rplot.ScaleSize('test')) + self.assertRaises(ValueError, rplot.make_aes, + alpha=rplot.ScaleShape('test')) + + def test_dictionary_union(self): + dict1 = {1 : 1, 2 : 2, 3 : 3} + dict2 = {1 : 1, 2 : 2, 4 : 4} + union = rplot.dictionary_union(dict1, dict2) + self.assertEqual(len(union), 4) + keys = list(union.keys()) + self.assertTrue(1 in keys) + self.assertTrue(2 in keys) + self.assertTrue(3 in keys) + self.assertTrue(4 in keys) + self.assertEqual(rplot.dictionary_union(dict1, {}), dict1) + self.assertEqual(rplot.dictionary_union({}, dict1), dict1) + self.assertEqual(rplot.dictionary_union({}, {}), {}) + + def test_merge_aes(self): + layer1 = rplot.Layer(size=rplot.ScaleSize('test')) + layer2 = rplot.Layer(shape=rplot.ScaleShape('test')) + rplot.merge_aes(layer1, layer2) + self.assertTrue(isinstance(layer2.aes['size'], rplot.ScaleSize)) + self.assertTrue(isinstance(layer2.aes['shape'], rplot.ScaleShape)) + self.assertEqual(layer2.aes['size'], layer1.aes['size']) + for key in layer2.aes.keys(): + if key != 'size' and key != 'shape': + self.assertTrue(layer2.aes[key] is None) + + def test_sequence_layers(self): + layer1 = rplot.Layer(self.data) + layer2 = rplot.GeomPoint(x='SepalLength', y='SepalWidth', + size=rplot.ScaleSize('PetalLength')) + layer3 = rplot.GeomPolyFit(2) + result = rplot.sequence_layers([layer1, layer2, layer3]) + self.assertEqual(len(result), 3) + last = result[-1] + self.assertEqual(last.aes['x'], 'SepalLength') + 
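Editor's note (illustration only, not part of the imported patch): the surrounding test_sequence_layers assertions check that rplot.sequence_layers forward-fills data and aesthetic mappings from earlier layers into later ones. Below is a minimal stand-alone sketch of that forward-fill idea using plain dicts instead of the rplot classes; all names in it are illustrative and are not taken from the patch.

    def propagate_layers(layer_specs):
        # Forward-fill every non-None setting from earlier specs into later ones.
        carried = {}
        filled = []
        for spec in layer_specs:
            carried = dict(carried, **{k: v for k, v in spec.items() if v is not None})
            filled.append(dict(carried))
        return filled

    layers = [{'data': 'iris', 'x': None, 'y': None},
              {'data': None, 'x': 'SepalLength', 'y': 'SepalWidth'}]
    print(propagate_layers(layers)[-1])  # the last layer has inherited 'data' from the first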
self.assertEqual(last.aes['y'], 'SepalWidth') + self.assertTrue(isinstance(last.aes['size'], rplot.ScaleSize)) + self.assertTrue(self.data is last.data) + self.assertTrue(rplot.sequence_layers([layer1])[0] is layer1) + + +@tm.mplskip +class TestTrellis(tm.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/tips.csv') + self.data = read_csv(path, sep=',') + layer1 = rplot.Layer(self.data) + layer2 = rplot.GeomPoint(x='total_bill', y='tip') + layer3 = rplot.GeomPolyFit(2) + self.layers = rplot.sequence_layers([layer1, layer2, layer3]) + self.trellis1 = rplot.TrellisGrid(['sex', 'smoker']) + self.trellis2 = rplot.TrellisGrid(['sex', '.']) + self.trellis3 = rplot.TrellisGrid(['.', 'smoker']) + self.trellised1 = self.trellis1.trellis(self.layers) + self.trellised2 = self.trellis2.trellis(self.layers) + self.trellised3 = self.trellis3.trellis(self.layers) + + def test_grid_sizes(self): + self.assertEqual(len(self.trellised1), 3) + self.assertEqual(len(self.trellised2), 3) + self.assertEqual(len(self.trellised3), 3) + self.assertEqual(len(self.trellised1[0]), 2) + self.assertEqual(len(self.trellised1[0][0]), 2) + self.assertEqual(len(self.trellised2[0]), 2) + self.assertEqual(len(self.trellised2[0][0]), 1) + self.assertEqual(len(self.trellised3[0]), 1) + self.assertEqual(len(self.trellised3[0][0]), 2) + self.assertEqual(len(self.trellised1[1]), 2) + self.assertEqual(len(self.trellised1[1][0]), 2) + self.assertEqual(len(self.trellised2[1]), 2) + self.assertEqual(len(self.trellised2[1][0]), 1) + self.assertEqual(len(self.trellised3[1]), 1) + self.assertEqual(len(self.trellised3[1][0]), 2) + self.assertEqual(len(self.trellised1[2]), 2) + self.assertEqual(len(self.trellised1[2][0]), 2) + self.assertEqual(len(self.trellised2[2]), 2) + self.assertEqual(len(self.trellised2[2][0]), 1) + self.assertEqual(len(self.trellised3[2]), 1) + self.assertEqual(len(self.trellised3[2][0]), 2) + + def test_trellis_cols_rows(self): + self.assertEqual(self.trellis1.cols, 2) + self.assertEqual(self.trellis1.rows, 2) + self.assertEqual(self.trellis2.cols, 1) + self.assertEqual(self.trellis2.rows, 2) + self.assertEqual(self.trellis3.cols, 2) + self.assertEqual(self.trellis3.rows, 1) + + +@tm.mplskip +class TestScaleGradient(tm.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.gradient = rplot.ScaleGradient("SepalLength", colour1=(0.2, 0.3, + 0.4), + colour2=(0.8, 0.7, 0.6)) + + def test_gradient(self): + for index in range(len(self.data)): + row = self.data.irow(index) + r, g, b = self.gradient(self.data, index) + r1, g1, b1 = self.gradient.colour1 + r2, g2, b2 = self.gradient.colour2 + self.assertTrue(between(r1, r2, r)) + self.assertTrue(between(g1, g2, g)) + self.assertTrue(between(b1, b2, b)) + + +@tm.mplskip +class TestScaleGradient2(tm.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.gradient = rplot.ScaleGradient2("SepalLength", colour1=(0.2, 0.3, 0.4), colour2=(0.8, 0.7, 0.6), colour3=(0.5, 0.5, 0.5)) + + def test_gradient2(self): + for index in range(len(self.data)): + row = self.data.irow(index) + r, g, b = self.gradient(self.data, index) + r1, g1, b1 = self.gradient.colour1 + r2, g2, b2 = self.gradient.colour2 + r3, g3, b3 = self.gradient.colour3 + value = row[self.gradient.column] + a_ = min(self.data[self.gradient.column]) + b_ = max(self.data[self.gradient.column]) + scaled = (value - a_) / (b_ - a_) + if scaled < 0.5: + 
self.assertTrue(between(r1, r2, r)) + self.assertTrue(between(g1, g2, g)) + self.assertTrue(between(b1, b2, b)) + else: + self.assertTrue(between(r2, r3, r)) + self.assertTrue(between(g2, g3, g)) + self.assertTrue(between(b2, b3, b)) + + +@tm.mplskip +class TestScaleRandomColour(tm.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.colour = rplot.ScaleRandomColour('SepalLength') + + def test_random_colour(self): + for index in range(len(self.data)): + colour = self.colour(self.data, index) + self.assertEqual(len(colour), 3) + r, g, b = colour + self.assertTrue(r >= 0.0) + self.assertTrue(g >= 0.0) + self.assertTrue(b >= 0.0) + self.assertTrue(r <= 1.0) + self.assertTrue(g <= 1.0) + self.assertTrue(b <= 1.0) + + +@tm.mplskip +class TestScaleConstant(tm.TestCase): + def test_scale_constant(self): + scale = rplot.ScaleConstant(1.0) + self.assertEqual(scale(None, None), 1.0) + scale = rplot.ScaleConstant("test") + self.assertEqual(scale(None, None), "test") + + +class TestScaleSize(tm.TestCase): + def setUp(self): + path = os.path.join(curpath(), 'data/iris.csv') + self.data = read_csv(path, sep=',') + self.scale1 = rplot.ScaleShape('Name') + self.scale2 = rplot.ScaleShape('PetalLength') + + def test_scale_size(self): + for index in range(len(self.data)): + marker = self.scale1(self.data, index) + self.assertTrue(marker in ['o', '+', 's', '*', '^', '<', '>', 'v', '|', 'x']) + + def test_scale_overflow(self): + def f(): + for index in range(len(self.data)): + self.scale2(self.data, index) + + self.assertRaises(ValueError, f) + + +@tm.mplskip +class TestRPlot(tm.TestCase): + def test_rplot1(self): + import matplotlib.pyplot as plt + path = os.path.join(curpath(), 'data/tips.csv') + plt.figure() + self.data = read_csv(path, sep=',') + self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') + self.plot.add(rplot.TrellisGrid(['sex', 'smoker'])) + self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'))) + self.fig = plt.gcf() + self.plot.render(self.fig) + + def test_rplot2(self): + import matplotlib.pyplot as plt + path = os.path.join(curpath(), 'data/tips.csv') + plt.figure() + self.data = read_csv(path, sep=',') + self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') + self.plot.add(rplot.TrellisGrid(['.', 'smoker'])) + self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'))) + self.fig = plt.gcf() + self.plot.render(self.fig) + + def test_rplot3(self): + import matplotlib.pyplot as plt + path = os.path.join(curpath(), 'data/tips.csv') + plt.figure() + self.data = read_csv(path, sep=',') + self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') + self.plot.add(rplot.TrellisGrid(['sex', '.'])) + self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'))) + self.fig = plt.gcf() + self.plot.render(self.fig) + + def test_rplot_iris(self): + import matplotlib.pyplot as plt + path = os.path.join(curpath(), 'data/iris.csv') + plt.figure() + self.data = read_csv(path, sep=',') + plot = rplot.RPlot(self.data, x='SepalLength', y='SepalWidth') + plot.add(rplot.GeomPoint(colour=rplot.ScaleGradient('PetalLength', colour1=(0.0, 1.0, 0.5), colour2=(1.0, 0.0, 0.5)), + size=rplot.ScaleSize('PetalWidth', min_size=10.0, max_size=200.0), + shape=rplot.ScaleShape('Name'))) + self.fig = plt.gcf() + plot.render(self.fig) + + +if __name__ == '__main__': + import unittest + unittest.main() diff --git 
a/pandas/tests/test_series.py b/pandas/tests/test_series.py new file mode 100644 index 00000000..d08f7e1d --- /dev/null +++ b/pandas/tests/test_series.py @@ -0,0 +1,6050 @@ +# pylint: disable-msg=E1101,W0612 + +import sys +from datetime import datetime, timedelta +import operator +import string +from itertools import product, starmap +from distutils.version import LooseVersion + +import nose + +from numpy import nan +import numpy as np +import numpy.ma as ma +import pandas as pd + +from pandas import (Index, Series, DataFrame, isnull, notnull, + bdate_range, date_range, _np_version_under1p7) +from pandas.core.index import MultiIndex +from pandas.core.indexing import IndexingError +from pandas.tseries.index import Timestamp, DatetimeIndex +import pandas.core.common as com +import pandas.core.config as cf +import pandas.lib as lib + +import pandas.core.datetools as datetools +import pandas.core.nanops as nanops + +from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long +from pandas import compat +from pandas.util.testing import (assert_series_equal, + assert_almost_equal, + assert_frame_equal, + ensure_clean) +import pandas.util.testing as tm + + +#------------------------------------------------------------------------------ +# Series test cases + +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] + + +class CheckNameIntegration(object): + + _multiprocess_can_split_ = True + + def test_scalarop_preserve_name(self): + result = self.ts * 2 + self.assertEqual(result.name, self.ts.name) + + def test_copy_name(self): + result = self.ts.copy() + self.assertEqual(result.name, self.ts.name) + + def test_copy_index_name_checking(self): + # don't want to be able to modify the index stored elsewhere after + # making a copy + + self.ts.index.name = None + self.assertIsNone(self.ts.index.name) + self.assertIs(self.ts, self.ts) + cp = self.ts.copy() + cp.index.name = 'foo' + com.pprint_thing(self.ts.index.name) + self.assertIsNone(self.ts.index.name) + + def test_append_preserve_name(self): + result = self.ts[:5].append(self.ts[5:]) + self.assertEqual(result.name, self.ts.name) + + def test_binop_maybe_preserve_name(self): + + # names match, preserve + result = self.ts * self.ts + self.assertEqual(result.name, self.ts.name) + + result = self.ts * self.ts[:-2] + self.assertEqual(result.name, self.ts.name) + + # names don't match, don't preserve + cp = self.ts.copy() + cp.name = 'something else' + result = self.ts + cp + self.assertIsNone(result.name) + + def test_combine_first_name(self): + result = self.ts.combine_first(self.ts[:5]) + self.assertEqual(result.name, self.ts.name) + + def test_combine_first_dt64(self): + from pandas.tseries.tools import to_datetime + s0 = to_datetime(Series(["2010", np.NaN])) + s1 = to_datetime(Series([np.NaN, "2011"])) + rs = s0.combine_first(s1) + xp = to_datetime(Series(['2010', '2011'])) + assert_series_equal(rs, xp) + + s0 = to_datetime(Series(["2010", np.NaN])) + s1 = Series([np.NaN, "2011"]) + rs = s0.combine_first(s1) + xp = Series([datetime(2010, 1, 1), '2011']) + assert_series_equal(rs, xp) + + def test_get(self): + + # GH 6383 + s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, + 45, 51, 39, 55, 43, 54, 52, 51, 54])) + + result = s.get(25, 0) + expected = 0 + self.assertEqual(result,expected) + + s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, + 45, 51, 39, 55, 43, 54, 52, 51, 54]), + index=pd.Float64Index([25.0, 36.0, 49.0, 64.0, 81.0, 100.0, + 121.0, 144.0, 169.0, 196.0, 1225.0, + 1296.0, 1369.0, 1444.0, 
1521.0, 1600.0, + 1681.0, 1764.0, 1849.0, 1936.0], + dtype='object')) + + result = s.get(25, 0) + expected = 43 + self.assertEqual(result,expected) + + # GH 7407 + # with a boolean accessor + df = pd.DataFrame({'i':[0]*3, 'b':[False]*3}) + vc = df.i.value_counts() + result = vc.get(99,default='Missing') + self.assertEquals(result,'Missing') + + vc = df.b.value_counts() + result = vc.get(False,default='Missing') + self.assertEquals(result,3) + + result = vc.get(True,default='Missing') + self.assertEquals(result,'Missing') + + def test_delitem(self): + + # GH 5542 + # should delete the item inplace + s = Series(lrange(5)) + del s[0] + + expected = Series(lrange(1,5),index=lrange(1,5)) + assert_series_equal(s, expected) + + del s[1] + expected = Series(lrange(2,5),index=lrange(2,5)) + assert_series_equal(s, expected) + + # empty + s = Series() + def f(): + del s[0] + self.assertRaises(KeyError, f) + + # only 1 left, del, add, del + s = Series(1) + del s[0] + assert_series_equal(s, Series(dtype='int64')) + s[0] = 1 + assert_series_equal(s, Series(1)) + del s[0] + assert_series_equal(s, Series(dtype='int64')) + + def test_getitem_preserve_name(self): + result = self.ts[self.ts > 0] + self.assertEqual(result.name, self.ts.name) + + result = self.ts[[0, 2, 4]] + self.assertEqual(result.name, self.ts.name) + + result = self.ts[5:10] + self.assertEqual(result.name, self.ts.name) + + def test_getitem_setitem_ellipsis(self): + s = Series(np.random.randn(10)) + + np.fix(s) + + result = s[...] + assert_series_equal(result, s) + + s[...] = 5 + self.assertTrue((result == 5).all()) + + def test_getitem_negative_out_of_bounds(self): + s = Series([tm.rands(5) for _ in range(10)], + index=[tm.rands(10) for _ in range(10)]) + + self.assertRaises(IndexError, s.__getitem__, -11) + self.assertRaises(IndexError, s.__setitem__, -11, 'foo') + + def test_multilevel_name_print(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + s = Series(lrange(0, len(index)), index=index, name='sth') + expected = ["first second", + "foo one 0", + " two 1", + " three 2", + "bar one 3", + " two 4", + "baz two 5", + " three 6", + "qux one 7", + " two 8", + " three 9", + "Name: sth, dtype: int64"] + expected = "\n".join(expected) + self.assertEqual(repr(s), expected) + + def test_multilevel_preserve_name(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + s = Series(np.random.randn(len(index)), index=index, name='sth') + + result = s['foo'] + result2 = s.ix['foo'] + self.assertEqual(result.name, s.name) + self.assertEqual(result2.name, s.name) + + def test_name_printing(self): + # test small series + s = Series([0, 1, 2]) + s.name = "test" + self.assertIn("Name: test", repr(s)) + s.name = None + self.assertNotIn("Name:", repr(s)) + # test big series (diff code path) + s = Series(lrange(0, 1000)) + s.name = "test" + self.assertIn("Name: test", repr(s)) + s.name = None + self.assertNotIn("Name:", repr(s)) + + s = Series(index=date_range('20010101', '20020101'), name='test') + self.assertIn("Name: test", repr(s)) + + def test_pickle_preserve_name(self): + unpickled = self._pickle_roundtrip_name(self.ts) + self.assertEqual(unpickled.name, self.ts.name) + + def _pickle_roundtrip_name(self, obj): + + with ensure_clean() as path: + 
obj.to_pickle(path) + unpickled = pd.read_pickle(path) + return unpickled + + def test_argsort_preserve_name(self): + result = self.ts.argsort() + self.assertEqual(result.name, self.ts.name) + + def test_sort_index_name(self): + result = self.ts.sort_index(ascending=False) + self.assertEqual(result.name, self.ts.name) + + def test_to_sparse_pass_name(self): + result = self.ts.to_sparse() + self.assertEqual(result.name, self.ts.name) + + +class TestNanops(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_comparisons(self): + left = np.random.randn(10) + right = np.random.randn(10) + left[:3] = np.nan + + result = nanops.nangt(left, right) + expected = (left > right).astype('O') + expected[:3] = np.nan + + assert_almost_equal(result, expected) + + s = Series(['a', 'b', 'c']) + s2 = Series([False, True, False]) + + # it works! + s == s2 + s2 == s + + def test_none_comparison(self): + # bug brought up by #1079 + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + self.assertRaises(TypeError, s.__eq__, None) + + def test_sum_zero(self): + arr = np.array([]) + self.assertEqual(nanops.nansum(arr), 0) + + arr = np.empty((10, 0)) + self.assertTrue((nanops.nansum(arr, axis=1) == 0).all()) + + # GH #844 + s = Series([], index=[]) + self.assertEqual(s.sum(), 0) + + df = DataFrame(np.empty((10, 0))) + self.assertTrue((df.sum(1) == 0).all()) + + def test_nansum_buglet(self): + s = Series([1.0, np.nan], index=[0, 1]) + result = np.nansum(s) + assert_almost_equal(result, 1) + + def test_overflow(self): + + # GH 6915 + # overflowing on the smaller int dtypes + for dtype in ['int32','int64']: + v = np.arange(5000000,dtype=dtype) + s = Series(v) + + # no bottleneck + result = s.sum(skipna=False) + self.assertEqual(int(result),v.sum(dtype='int64')) + result = s.min(skipna=False) + self.assertEqual(int(result),0) + result = s.max(skipna=False) + self.assertEqual(int(result),v[-1]) + + # use bottleneck if available + result = s.sum() + self.assertEqual(int(result),v.sum(dtype='int64')) + result = s.min() + self.assertEqual(int(result),0) + result = s.max() + self.assertEqual(int(result),v[-1]) + + for dtype in ['float32','float64']: + v = np.arange(5000000,dtype=dtype) + s = Series(v) + + # no bottleneck + result = s.sum(skipna=False) + self.assertTrue(np.allclose(float(result),v.sum(dtype='float64'))) + result = s.min(skipna=False) + self.assertTrue(np.allclose(float(result),0.0)) + result = s.max(skipna=False) + self.assertTrue(np.allclose(float(result),v[-1])) + + # use bottleneck if available + result = s.sum() + self.assertTrue(np.allclose(float(result),v.sum(dtype='float64'))) + result = s.min() + self.assertTrue(np.allclose(float(result),0.0)) + result = s.max() + self.assertTrue(np.allclose(float(result),v[-1])) + +class SafeForSparse(object): + pass + +_ts = tm.makeTimeSeries() + +class TestSeries(tm.TestCase, CheckNameIntegration): + + _multiprocess_can_split_ = True + + def setUp(self): + import warnings + warnings.filterwarnings(action='ignore', category=FutureWarning) + + self.ts = _ts.copy() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.objSeries = tm.makeObjectSeries() + self.objSeries.name = 'objects' + + self.empty = Series([], index=[]) + + def test_scalar_conversion(self): + + # Pass in scalar is disabled + scalar = Series(0.5) + self.assertNotIsInstance(scalar, float) + + # coercion + self.assertEqual(float(Series([1.])), 1.0) + self.assertEqual(int(Series([1.])), 1) + self.assertEqual(long(Series([1.])), 1) + + def 
test_astype(self): + s = Series(np.random.randn(5),name='foo') + + for dtype in ['float32','float64','int64','int32']: + astyped = s.astype(dtype) + self.assertEqual(astyped.dtype, dtype) + self.assertEqual(astyped.name, s.name) + + def test_constructor(self): + # Recognize TimeSeries + self.assertTrue(self.ts.is_time_series) + + # Pass in Series + derived = Series(self.ts) + self.assertTrue(derived.is_time_series) + + self.assertTrue(tm.equalContents(derived.index, self.ts.index)) + # Ensure new index is not created + self.assertEqual(id(self.ts.index), id(derived.index)) + + # Mixed type Series + mixed = Series(['hello', np.NaN], index=[0, 1]) + self.assertEqual(mixed.dtype, np.object_) + self.assertIs(mixed[1], np.NaN) + + self.assertFalse(self.empty.is_time_series) + self.assertFalse(Series({}).is_time_series) + + self.assertRaises(Exception, Series, np.random.randn(3, 3), + index=np.arange(3)) + + mixed.name = 'Series' + rs = Series(mixed).name + xp = 'Series' + self.assertEqual(rs, xp) + + # raise on MultiIndex GH4187 + m = MultiIndex.from_arrays([[1, 2], [3, 4]]) + self.assertRaises(NotImplementedError, Series, m) + + def test_constructor_empty(self): + empty = Series() + empty2 = Series([]) + assert_series_equal(empty, empty2) + + empty = Series(index=lrange(10)) + empty2 = Series(np.nan, index=lrange(10)) + assert_series_equal(empty, empty2) + + def test_constructor_series(self): + index1 = ['d', 'b', 'a', 'c'] + index2 = sorted(index1) + s1 = Series([4, 7, -5, 3], index=index1) + s2 = Series(s1, index=index2) + + assert_series_equal(s2, s1.sort_index()) + + def test_constructor_iterator(self): + + expected = Series(list(range(10)),dtype='int64') + result = Series(range(10),dtype='int64') + assert_series_equal(result, expected) + + def test_constructor_generator(self): + gen = (i for i in range(10)) + + result = Series(gen) + exp = Series(lrange(10)) + assert_series_equal(result, exp) + + gen = (i for i in range(10)) + result = Series(gen, index=lrange(10, 20)) + exp.index = lrange(10, 20) + assert_series_equal(result, exp) + + def test_constructor_categorical(self): + cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c']) + res = Series(cat) + exp = Series({0: 'a', 1: 'b', 2: 'c', 3: 'a', 4: 'b', 5: 'c'}) + assert_series_equal(res, exp) + + cat.name = 'foo' + res = Series(cat) + self.assertEqual(res.name, cat.name) + + def test_constructor_maskedarray(self): + data = ma.masked_all((3,), dtype=float) + result = Series(data) + expected = Series([nan, nan, nan]) + assert_series_equal(result, expected) + + data[0] = 0.0 + data[2] = 2.0 + index = ['a', 'b', 'c'] + result = Series(data, index=index) + expected = Series([0.0, nan, 2.0], index=index) + assert_series_equal(result, expected) + + data[1] = 1.0 + result = Series(data, index=index) + expected = Series([0.0, 1.0, 2.0], index=index) + assert_series_equal(result, expected) + + data = ma.masked_all((3,), dtype=int) + result = Series(data) + expected = Series([nan, nan, nan], dtype=float) + assert_series_equal(result, expected) + + data[0] = 0 + data[2] = 2 + index = ['a', 'b', 'c'] + result = Series(data, index=index) + expected = Series([0, nan, 2], index=index, dtype=float) + assert_series_equal(result, expected) + + data[1] = 1 + result = Series(data, index=index) + expected = Series([0, 1, 2], index=index, dtype=int) + assert_series_equal(result, expected) + + data = ma.masked_all((3,), dtype=bool) + result = Series(data) + expected = Series([nan, nan, nan], dtype=object) + assert_series_equal(result, expected) + + data[0] 
= True + data[2] = False + index = ['a', 'b', 'c'] + result = Series(data, index=index) + expected = Series([True, nan, False], index=index, dtype=object) + assert_series_equal(result, expected) + + data[1] = True + result = Series(data, index=index) + expected = Series([True, True, False], index=index, dtype=bool) + assert_series_equal(result, expected) + + from pandas import tslib + data = ma.masked_all((3,), dtype='M8[ns]') + result = Series(data) + expected = Series([tslib.iNaT, tslib.iNaT, tslib.iNaT], dtype='M8[ns]') + assert_series_equal(result, expected) + + data[0] = datetime(2001, 1, 1) + data[2] = datetime(2001, 1, 3) + index = ['a', 'b', 'c'] + result = Series(data, index=index) + expected = Series([datetime(2001, 1, 1), tslib.iNaT, + datetime(2001, 1, 3)], index=index, dtype='M8[ns]') + assert_series_equal(result, expected) + + data[1] = datetime(2001, 1, 2) + result = Series(data, index=index) + expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2), + datetime(2001, 1, 3)], index=index, dtype='M8[ns]') + assert_series_equal(result, expected) + + def test_constructor_default_index(self): + s = Series([0, 1, 2]) + assert_almost_equal(s.index, np.arange(3)) + + def test_constructor_corner(self): + df = tm.makeTimeDataFrame() + objs = [df, df] + s = Series(objs, index=[0, 1]) + tm.assert_isinstance(s, Series) + + def test_constructor_sanitize(self): + s = Series(np.array([1., 1., 8.]), dtype='i8') + self.assertEqual(s.dtype, np.dtype('i8')) + + s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8') + self.assertEqual(s.dtype, np.dtype('f8')) + + def test_constructor_pass_none(self): + s = Series(None, index=lrange(5)) + self.assertEqual(s.dtype, np.float64) + + s = Series(None, index=lrange(5), dtype=object) + self.assertEqual(s.dtype, np.object_) + + # GH 7431 + # inference on the index + s = Series(index=np.array([None])) + expected = Series(index=Index([None])) + assert_series_equal(s,expected) + + def test_constructor_cast(self): + self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float) + + def test_constructor_dtype_nocast(self): + # 1572 + s = Series([1, 2, 3]) + + s2 = Series(s, dtype=np.int64) + + s2[1] = 5 + self.assertEqual(s[1], 5) + + def test_constructor_dtype_datetime64(self): + import pandas.tslib as tslib + + s = Series(tslib.iNaT, dtype='M8[ns]', index=lrange(5)) + self.assertTrue(isnull(s).all()) + + # in theory this should be all nulls, but since + # we are not specifying a dtype is ambiguous + s = Series(tslib.iNaT, index=lrange(5)) + self.assertFalse(isnull(s).all()) + + s = Series(nan, dtype='M8[ns]', index=lrange(5)) + self.assertTrue(isnull(s).all()) + + s = Series([datetime(2001, 1, 2, 0, 0), tslib.iNaT], dtype='M8[ns]') + self.assertTrue(isnull(s[1])) + self.assertEqual(s.dtype, 'M8[ns]') + + s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]') + self.assertTrue(isnull(s[1])) + self.assertEqual(s.dtype, 'M8[ns]') + + # GH3416 + dates = [ + np.datetime64(datetime(2013, 1, 1)), + np.datetime64(datetime(2013, 1, 2)), + np.datetime64(datetime(2013, 1, 3)), + ] + + s = Series(dates) + self.assertEqual(s.dtype, 'M8[ns]') + + s.ix[0] = np.nan + self.assertEqual(s.dtype, 'M8[ns]') + + # invalid astypes + for t in ['s', 'D', 'us', 'ms']: + self.assertRaises(TypeError, s.astype, 'M8[%s]' % t) + + # GH3414 related + self.assertRaises(TypeError, lambda x: Series( + Series(dates).astype('int') / 1000000, dtype='M8[ms]')) + self.assertRaises( + TypeError, lambda x: Series(dates, dtype='datetime64')) + + # invalid dates can be help 
as object + result = Series([datetime(2,1,1)]) + self.assertEqual(result[0], datetime(2,1,1,0,0)) + + result = Series([datetime(3000,1,1)]) + self.assertEqual(result[0], datetime(3000,1,1,0,0)) + + # don't mix types + result = Series([ Timestamp('20130101'), 1],index=['a','b']) + self.assertEqual(result['a'], Timestamp('20130101')) + self.assertEqual(result['b'], 1) + + # GH6529 + # coerce datetime64 non-ns properly + dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') + values2 = dates.view(np.ndarray).astype('datetime64[ns]') + expected = Series(values2, dates) + + # numpy < 1.7 is very odd about astyping + if not _np_version_under1p7: + for dtype in ['s','D','ms','us','ns']: + values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + result = Series(values1, dates) + assert_series_equal(result,expected) + + # leave datetime.date alone + dates2 = np.array([ d.date() for d in dates.to_pydatetime() ],dtype=object) + series1 = Series(dates2, dates) + self.assert_numpy_array_equal(series1.values,dates2) + self.assertEqual(series1.dtype,object) + + # these will correctly infer a datetime + s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001']) + self.assertEqual(s.dtype,'datetime64[ns]') + s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001']) + self.assertEqual(s.dtype,'datetime64[ns]') + s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001']) + self.assertEqual(s.dtype,'datetime64[ns]') + s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001']) + self.assertEqual(s.dtype,'datetime64[ns]') + + def test_constructor_dict(self): + d = {'a': 0., 'b': 1., 'c': 2.} + result = Series(d, index=['b', 'c', 'd', 'a']) + expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) + assert_series_equal(result, expected) + + pidx = tm.makePeriodIndex(100) + d = {pidx[0]: 0, pidx[1]: 1} + result = Series(d, index=pidx) + expected = Series(np.nan, pidx) + expected.ix[0] = 0 + expected.ix[1] = 1 + assert_series_equal(result, expected) + + def test_constructor_dict_multiindex(self): + check = lambda result, expected: tm.assert_series_equal( + result, expected, check_dtype=True, check_index_type=True, + check_series_type=True) + d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.} + _d = sorted(d.items()) + ser = Series(d) + expected = Series([x[1] for x in _d], + index=MultiIndex.from_tuples([x[0] for x in _d])) + check(ser, expected) + + d['z'] = 111. 
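Editor's note (illustration only, not part of the imported patch): test_constructor_dict above relies on the fact that building a Series from a dict while passing an explicit index reindexes by the dict's keys, so labels that are not keys of the dict come back as NaN. A minimal sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series({'a': 0.0, 'b': 1.0, 'c': 2.0}, index=['b', 'c', 'd', 'a'])
    assert s['b'] == 1.0 and np.isnan(s['d'])  # 'd' is not a key of the dict, so it is NaN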
+ _d.insert(0, ('z', d['z'])) + ser = Series(d) + expected = Series( + [x[1] for x in _d], + index=Index([x[0] for x in _d], tupleize_cols=False)) + ser = ser.reindex(index=expected.index) + check(ser, expected) + + def test_constructor_subclass_dict(self): + data = tm.TestSubDict((x, 10.0 * x) for x in range(10)) + series = Series(data) + refseries = Series(dict(compat.iteritems(data))) + assert_series_equal(refseries, series) + + def test_orderedDict_ctor(self): + # GH3283 + import pandas + import random + data = OrderedDict([('col%s' % i, random.random()) for i in range(12)]) + s = pandas.Series(data) + self.assertTrue(all(s.values == list(data.values()))) + + def test_orderedDict_subclass_ctor(self): + # GH3283 + import pandas + import random + + class A(OrderedDict): + pass + data = A([('col%s' % i, random.random()) for i in range(12)]) + s = pandas.Series(data) + self.assertTrue(all(s.values == list(data.values()))) + + def test_constructor_list_of_tuples(self): + data = [(1, 1), (2, 2), (2, 3)] + s = Series(data) + self.assertEqual(list(s), data) + + def test_constructor_tuple_of_tuples(self): + data = ((1, 1), (2, 2), (2, 3)) + s = Series(data) + self.assertEqual(tuple(s), data) + + def test_constructor_set(self): + values = set([1, 2, 3, 4, 5]) + self.assertRaises(TypeError, Series, values) + values = frozenset(values) + self.assertRaises(TypeError, Series, values) + + def test_fromDict(self): + data = {'a': 0, 'b': 1, 'c': 2, 'd': 3} + + series = Series(data) + self.assertTrue(tm.is_sorted(series.index)) + + data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()} + series = Series(data) + self.assertEqual(series.dtype, np.object_) + + data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'} + series = Series(data) + self.assertEqual(series.dtype, np.object_) + + data = {'a': '0', 'b': '1'} + series = Series(data, dtype=float) + self.assertEqual(series.dtype, np.float64) + + def test_setindex(self): + # wrong type + series = self.series.copy() + self.assertRaises(TypeError, setattr, series, 'index', None) + + # wrong length + series = self.series.copy() + self.assertRaises(Exception, setattr, series, 'index', + np.arange(len(series) - 1)) + + # works + series = self.series.copy() + series.index = np.arange(len(series)) + tm.assert_isinstance(series.index, Index) + + def test_array_finalize(self): + pass + + def test_pop(self): + # GH 6600 + df = DataFrame({ + 'A': 0, + 'B': np.arange(5,dtype='int64'), + 'C': 0, + }) + k = df.iloc[4] + + result = k.pop('B') + self.assertEqual(result, 4) + + expected = Series([0,0],index=['A','C']) + assert_series_equal(k, expected) + + def test_not_hashable(self): + s_empty = Series() + s = Series([1]) + self.assertRaises(TypeError, hash, s_empty) + self.assertRaises(TypeError, hash, s) + + def test_fromValue(self): + + nans = Series(np.NaN, index=self.ts.index) + self.assertEqual(nans.dtype, np.float_) + self.assertEqual(len(nans), len(self.ts)) + + strings = Series('foo', index=self.ts.index) + self.assertEqual(strings.dtype, np.object_) + self.assertEqual(len(strings), len(self.ts)) + + d = datetime.now() + dates = Series(d, index=self.ts.index) + self.assertEqual(dates.dtype, 'M8[ns]') + self.assertEqual(len(dates), len(self.ts)) + + def test_contains(self): + tm.assert_contains_all(self.ts.index, self.ts) + + def test_pickle(self): + unp_series = self._pickle_roundtrip(self.series) + unp_ts = self._pickle_roundtrip(self.ts) + assert_series_equal(unp_series, self.series) + assert_series_equal(unp_ts, self.ts) + + def _pickle_roundtrip(self, obj): + + 
with ensure_clean() as path: + obj.to_pickle(path) + unpickled = pd.read_pickle(path) + return unpickled + + def test_getitem_get(self): + idx1 = self.series.index[5] + idx2 = self.objSeries.index[5] + + self.assertEqual(self.series[idx1], self.series.get(idx1)) + self.assertEqual(self.objSeries[idx2], self.objSeries.get(idx2)) + + self.assertEqual(self.series[idx1], self.series[5]) + self.assertEqual(self.objSeries[idx2], self.objSeries[5]) + + self.assertEqual( + self.series.get(-1), self.series.get(self.series.index[-1])) + self.assertEqual(self.series[5], self.series.get(self.series.index[5])) + + # missing + d = self.ts.index[0] - datetools.bday + self.assertRaises(KeyError, self.ts.__getitem__, d) + + # None + # GH 5652 + for s in [Series(), Series(index=list('abc'))]: + result = s.get(None) + self.assertIsNone(result) + + def test_iget(self): + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + for i in range(len(s)): + result = s.iget(i) + exp = s[s.index[i]] + assert_almost_equal(result, exp) + + # pass a slice + result = s.iget(slice(1, 3)) + expected = s.ix[2:4] + assert_series_equal(result, expected) + + # test slice is a view + result[:] = 0 + self.assertTrue((s[1:3] == 0).all()) + + # list of integers + result = s.iget([0, 2, 3, 4, 5]) + expected = s.reindex(s.index[[0, 2, 3, 4, 5]]) + assert_series_equal(result, expected) + + def test_iget_nonunique(self): + s = Series([0, 1, 2], index=[0, 1, 0]) + self.assertEqual(s.iget(2), 2) + + def test_getitem_regression(self): + s = Series(lrange(5), index=lrange(5)) + result = s[lrange(5)] + assert_series_equal(result, s) + + def test_getitem_setitem_slice_bug(self): + s = Series(lrange(10), lrange(10)) + result = s[-12:] + assert_series_equal(result, s) + + result = s[-7:] + assert_series_equal(result, s[3:]) + + result = s[:-12] + assert_series_equal(result, s[:0]) + + s = Series(lrange(10), lrange(10)) + s[-12:] = 0 + self.assertTrue((s == 0).all()) + + s[:-12] = 5 + self.assertTrue((s == 0).all()) + + def test_getitem_int64(self): + idx = np.int64(5) + self.assertEqual(self.ts[idx], self.ts[5]) + + def test_getitem_fancy(self): + slice1 = self.series[[1, 2, 3]] + slice2 = self.objSeries[[1, 2, 3]] + self.assertEqual(self.series.index[2], slice1.index[1]) + self.assertEqual(self.objSeries.index[2], slice2.index[1]) + self.assertEqual(self.series[2], slice1[1]) + self.assertEqual(self.objSeries[2], slice2[1]) + + def test_getitem_boolean(self): + s = self.series + mask = s > s.median() + + # passing list is OK + result = s[list(mask)] + expected = s[mask] + assert_series_equal(result, expected) + self.assert_numpy_array_equal(result.index, s.index[mask]) + + def test_getitem_boolean_empty(self): + s = Series([], dtype=np.int64) + s.index.name = 'index_name' + s = s[s.isnull()] + self.assertEqual(s.index.name, 'index_name') + self.assertEqual(s.dtype, np.int64) + + # GH5877 + # indexing with empty series + s = Series(['A', 'B']) + expected = Series(np.nan,index=['C'],dtype=object) + result = s[Series(['C'], dtype=object)] + assert_series_equal(result, expected) + + s = Series(['A', 'B']) + expected = Series(dtype=object) + result = s[Series([], dtype=object)] + assert_series_equal(result, expected) + + # invalid because of the boolean indexer + # that's empty or not-aligned + def f(): + s[Series([], dtype=bool)] + self.assertRaises(IndexingError, f) + + def f(): + s[Series([True], dtype=bool)] + self.assertRaises(IndexingError, f) + + def test_getitem_generator(self): + gen = (x > 0 for x in self.series) + result = 
self.series[gen] + result2 = self.series[iter(self.series > 0)] + expected = self.series[self.series > 0] + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + def test_getitem_boolean_object(self): + # using column from DataFrame + + s = self.series + mask = s > s.median() + omask = mask.astype(object) + + # getitem + result = s[omask] + expected = s[mask] + assert_series_equal(result, expected) + + # setitem + s2 = s.copy() + cop = s.copy() + cop[omask] = 5 + s2[mask] = 5 + assert_series_equal(cop, s2) + + # nans raise exception + omask[5:10] = np.nan + self.assertRaises(Exception, s.__getitem__, omask) + self.assertRaises(Exception, s.__setitem__, omask, 5) + + def test_getitem_setitem_boolean_corner(self): + ts = self.ts + mask_shifted = ts.shift(1, freq=datetools.bday) > ts.median() + + # these used to raise...?? + + self.assertRaises(Exception, ts.__getitem__, mask_shifted) + self.assertRaises(Exception, ts.__setitem__, mask_shifted, 1) + #ts[mask_shifted] + #ts[mask_shifted] = 1 + + self.assertRaises(Exception, ts.ix.__getitem__, mask_shifted) + self.assertRaises(Exception, ts.ix.__setitem__, mask_shifted, 1) + #ts.ix[mask_shifted] + #ts.ix[mask_shifted] = 2 + + def test_getitem_setitem_slice_integers(self): + s = Series(np.random.randn(8), index=[2, 4, 6, 8, 10, 12, 14, 16]) + + result = s[:4] + expected = s.reindex([2, 4, 6, 8]) + assert_series_equal(result, expected) + + s[:4] = 0 + self.assertTrue((s[:4] == 0).all()) + self.assertTrue(not (s[4:] == 0).any()) + + def test_getitem_out_of_bounds(self): + # don't segfault, GH #495 + self.assertRaises(IndexError, self.ts.__getitem__, len(self.ts)) + + # GH #917 + s = Series([]) + self.assertRaises(IndexError, s.__getitem__, -1) + + def test_getitem_setitem_integers(self): + # caused bug without test + s = Series([1, 2, 3], ['a', 'b', 'c']) + + self.assertEqual(s.ix[0], s['a']) + s.ix[0] = 5 + self.assertAlmostEqual(s['a'], 5) + + def test_getitem_box_float64(self): + value = self.ts[5] + tm.assert_isinstance(value, np.float64) + + def test_getitem_ambiguous_keyerror(self): + s = Series(lrange(10), index=lrange(0, 20, 2)) + self.assertRaises(KeyError, s.__getitem__, 1) + self.assertRaises(KeyError, s.ix.__getitem__, 1) + + def test_getitem_unordered_dup(self): + obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b']) + self.assertTrue(np.isscalar(obj['c'])) + self.assertEqual(obj['c'], 0) + + def test_getitem_dups_with_missing(self): + + # breaks reindex, so need to use .ix internally + # GH 4246 + s = Series([1, 2, 3, 4], ['foo', 'bar', 'foo', 'bah']) + expected = s.ix[['foo', 'bar', 'bah', 'bam']] + result = s[['foo', 'bar', 'bah', 'bam']] + assert_series_equal(result, expected) + + def test_getitem_dups(self): + s = Series(range(5),index=['A','A','B','C','C'],dtype=np.int64) + expected = Series([3,4],index=['C','C'],dtype=np.int64) + result = s['C'] + assert_series_equal(result, expected) + + def test_setitem_ambiguous_keyerror(self): + s = Series(lrange(10), index=lrange(0, 20, 2)) + + # equivalent of an append + s2 = s.copy() + s2[1] = 5 + expected = s.append(Series([5],index=[1])) + assert_series_equal(s2,expected) + + s2 = s.copy() + s2.ix[1] = 5 + expected = s.append(Series([5],index=[1])) + assert_series_equal(s2,expected) + + def test_setitem_float_labels(self): + # note labels are floats + s = Series(['a', 'b', 'c'], index=[0, 0.5, 1]) + tmp = s.copy() + + s.ix[1] = 'zoo' + tmp.iloc[2] = 'zoo' + + assert_series_equal(s, tmp) + + def test_slice(self): + numSlice = self.series[10:20] + 
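Editor's note (illustration only, not part of the imported patch): the "test return view" check inside test_slice depends on a plain positional slice of a Series being a view on its parent in this pandas version, so writing through the slice mutates the original; later pandas releases with copy-on-write behave differently. A minimal sketch of the behaviour the test asserts:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(5.0))
    view = s[1:3]     # positional slice: a view on s in pandas 0.14.x
    view[:] = 0.0
    # under 0.14.x the parent is mutated through the view, i.e. s[1:3] is now all zeros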
numSliceEnd = self.series[-10:] + objSlice = self.objSeries[10:20] + + self.assertNotIn(self.series.index[9], numSlice.index) + self.assertNotIn(self.objSeries.index[9], objSlice.index) + + self.assertEqual(len(numSlice), len(numSlice.index)) + self.assertEqual(self.series[numSlice.index[0]], + numSlice[numSlice.index[0]]) + + self.assertEqual(numSlice.index[1], self.series.index[11]) + + self.assertTrue(tm.equalContents(numSliceEnd, + np.array(self.series)[-10:])) + + # test return view + sl = self.series[10:20] + sl[:] = 0 + self.assertTrue((self.series[10:20] == 0).all()) + + def test_slice_can_reorder_not_uniquely_indexed(self): + s = Series(1, index=['a', 'a', 'b', 'b', 'c']) + result = s[::-1] # it works! + + def test_slice_float_get_set(self): + + self.assertRaises(TypeError, lambda : self.ts[4.0:10.0]) + def f(): + self.ts[4.0:10.0] = 0 + self.assertRaises(TypeError, f) + + self.assertRaises(TypeError, self.ts.__getitem__, slice(4.5, 10.0)) + self.assertRaises(TypeError, self.ts.__setitem__, slice(4.5, 10.0), 0) + + def test_slice_floats2(self): + s = Series(np.random.rand(10), index=np.arange(10, 20, dtype=float)) + + self.assertEqual(len(s.ix[12.0:]), 8) + self.assertEqual(len(s.ix[12.5:]), 7) + + i = np.arange(10, 20, dtype=float) + i[2] = 12.2 + s.index = i + self.assertEqual(len(s.ix[12.0:]), 8) + self.assertEqual(len(s.ix[12.5:]), 7) + + def test_slice_float64(self): + + values = np.arange(10., 50., 2) + index = Index(values) + + start, end = values[[5, 15]] + + s = Series(np.random.randn(20), index=index) + + result = s[start:end] + expected = s.iloc[5:16] + assert_series_equal(result, expected) + + result = s.loc[start:end] + assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(20, 3), index=index) + + result = df[start:end] + expected = df.iloc[5:16] + tm.assert_frame_equal(result, expected) + + result = df.loc[start:end] + tm.assert_frame_equal(result, expected) + + def test_setitem(self): + self.ts[self.ts.index[5]] = np.NaN + self.ts[[1, 2, 17]] = np.NaN + self.ts[6] = np.NaN + self.assertTrue(np.isnan(self.ts[6])) + self.assertTrue(np.isnan(self.ts[2])) + self.ts[np.isnan(self.ts)] = 5 + self.assertFalse(np.isnan(self.ts[2])) + + # caught this bug when writing tests + series = Series(tm.makeIntIndex(20).astype(float), + index=tm.makeIntIndex(20)) + + series[::2] = 0 + self.assertTrue((series[::2] == 0).all()) + + # set item that's not contained + s = self.series.copy() + s['foobar'] = 1 + expected = self.series.append(Series([1],index=['foobar'])) + assert_series_equal(s,expected) + + def test_setitem_dtypes(self): + + # change dtypes + # GH 4463 + expected = Series([np.nan,2,3]) + + s = Series([1,2,3]) + s.iloc[0] = np.nan + assert_series_equal(s,expected) + + s = Series([1,2,3]) + s.loc[0] = np.nan + assert_series_equal(s,expected) + + s = Series([1,2,3]) + s[0] = np.nan + assert_series_equal(s,expected) + + s = Series([False]) + s.loc[0] = np.nan + assert_series_equal(s,Series([np.nan])) + + s = Series([False,True]) + s.loc[0] = np.nan + assert_series_equal(s,Series([np.nan,1.0])) + + def test_set_value(self): + idx = self.ts.index[10] + res = self.ts.set_value(idx, 0) + self.assertIs(res, self.ts) + self.assertEqual(self.ts[idx], 0) + + # equiv + s = self.series.copy() + res = s.set_value('foobar', 0) + self.assertIs(res, s) + self.assertEqual(res.index[-1], 'foobar') + self.assertEqual(res['foobar'], 0) + + s = self.series.copy() + s.loc['foobar'] = 0 + self.assertEqual(s.index[-1], 'foobar') + self.assertEqual(s['foobar'], 0) + + def 
test_setslice(self): + sl = self.ts[5:20] + self.assertEqual(len(sl), len(sl.index)) + self.assertTrue(sl.index.is_unique) + + def test_basic_getitem_setitem_corner(self): + # invalid tuples, e.g. self.ts[:, None] vs. self.ts[:, 2] + with tm.assertRaisesRegexp(ValueError, 'tuple-index'): + self.ts[:, 2] + with tm.assertRaisesRegexp(ValueError, 'tuple-index'): + self.ts[:, 2] = 2 + + # weird lists. [slice(0, 5)] will work but not two slices + result = self.ts[[slice(None, 5)]] + expected = self.ts[:5] + assert_series_equal(result, expected) + + # OK + self.assertRaises(Exception, self.ts.__getitem__, + [5, slice(None, None)]) + self.assertRaises(Exception, self.ts.__setitem__, + [5, slice(None, None)], 2) + + def test_reshape_non_2d(self): + # GH 4554 + x = Series(np.random.random(201), name='x') + self.assertTrue(x.reshape(x.shape,) is x) + + # GH 2719 + a = Series([1, 2, 3, 4]) + result = a.reshape(2, 2) + expected = a.values.reshape(2, 2) + np.testing.assert_array_equal(result, expected) + self.assertTrue(type(result) is type(expected)) + + def test_reshape_2d_return_array(self): + x = Series(np.random.random(201), name='x') + result = x.reshape((-1, 1)) + self.assertNotIsInstance(result, Series) + + result2 = np.reshape(x, (-1, 1)) + self.assertNotIsInstance(result, Series) + + result = x[:, None] + expected = x.reshape((-1, 1)) + assert_almost_equal(result, expected) + + def test_basic_getitem_with_labels(self): + indices = self.ts.index[[5, 10, 15]] + + result = self.ts[indices] + expected = self.ts.reindex(indices) + assert_series_equal(result, expected) + + result = self.ts[indices[0]:indices[2]] + expected = self.ts.ix[indices[0]:indices[2]] + assert_series_equal(result, expected) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + inds = [0, 2, 5, 7, 8] + arr_inds = np.array([0, 2, 5, 7, 8]) + result = s[inds] + expected = s.reindex(inds) + assert_series_equal(result, expected) + + result = s[arr_inds] + expected = s.reindex(arr_inds) + assert_series_equal(result, expected) + + def test_basic_setitem_with_labels(self): + indices = self.ts.index[[5, 10, 15]] + + cp = self.ts.copy() + exp = self.ts.copy() + cp[indices] = 0 + exp.ix[indices] = 0 + assert_series_equal(cp, exp) + + cp = self.ts.copy() + exp = self.ts.copy() + cp[indices[0]:indices[2]] = 0 + exp.ix[indices[0]:indices[2]] = 0 + assert_series_equal(cp, exp) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + inds = [0, 4, 6] + arr_inds = np.array([0, 4, 6]) + + cp = s.copy() + exp = s.copy() + s[inds] = 0 + s.ix[inds] = 0 + assert_series_equal(cp, exp) + + cp = s.copy() + exp = s.copy() + s[arr_inds] = 0 + s.ix[arr_inds] = 0 + assert_series_equal(cp, exp) + + inds_notfound = [0, 4, 5, 6] + arr_inds_notfound = np.array([0, 4, 5, 6]) + self.assertRaises(Exception, s.__setitem__, inds_notfound, 0) + self.assertRaises(Exception, s.__setitem__, arr_inds_notfound, 0) + + def test_ix_getitem(self): + inds = self.series.index[[3, 4, 7]] + assert_series_equal(self.series.ix[inds], self.series.reindex(inds)) + assert_series_equal(self.series.ix[5::2], self.series[5::2]) + + # slice with indices + d1, d2 = self.ts.index[[5, 15]] + result = self.ts.ix[d1:d2] + expected = self.ts.truncate(d1, d2) + assert_series_equal(result, expected) + + # boolean + mask = self.series > self.series.median() + assert_series_equal(self.series.ix[mask], self.series[mask]) + + # ask for index value + self.assertEqual(self.ts.ix[d1], self.ts[d1]) + 
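Editor's note (illustration only, not part of the imported patch): the .ix slicing exercised here is label-based, and label slices include both endpoints, unlike positional slices; test_ix_setitem further down notes the same thing ("because it's inclusive"). A minimal sketch using .loc, which shows the same endpoint-inclusive behaviour as the now-deprecated .ix:

    import pandas as pd

    s = pd.Series(range(5), index=list('abcde'))
    assert list(s.loc['b':'d']) == [1, 2, 3]   # label slice: both endpoints included
    assert list(s[1:3]) == [1, 2]              # positional slice: end point excluded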
self.assertEqual(self.ts.ix[d2], self.ts[d2]) + + def test_ix_getitem_not_monotonic(self): + d1, d2 = self.ts.index[[5, 15]] + + ts2 = self.ts[::2][::-1] + + self.assertRaises(KeyError, ts2.ix.__getitem__, slice(d1, d2)) + self.assertRaises(KeyError, ts2.ix.__setitem__, slice(d1, d2), 0) + + def test_ix_getitem_setitem_integer_slice_keyerrors(self): + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + + # this is OK + cp = s.copy() + cp.ix[4:10] = 0 + self.assertTrue((cp.ix[4:10] == 0).all()) + + # so is this + cp = s.copy() + cp.ix[3:11] = 0 + self.assertTrue((cp.ix[3:11] == 0).values.all()) + + result = s.ix[4:10] + result2 = s.ix[3:11] + expected = s.reindex([4, 6, 8, 10]) + + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + # non-monotonic, raise KeyError + s2 = s[::-1] + self.assertRaises(KeyError, s2.ix.__getitem__, slice(3, 11)) + self.assertRaises(KeyError, s2.ix.__setitem__, slice(3, 11), 0) + + def test_ix_getitem_iterator(self): + idx = iter(self.series.index[:10]) + result = self.series.ix[idx] + assert_series_equal(result, self.series[:10]) + + def test_where(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond).dropna() + rs2 = s[cond] + assert_series_equal(rs, rs2) + + rs = s.where(cond, -s) + assert_series_equal(rs, s.abs()) + + rs = s.where(cond) + assert(s.shape == rs.shape) + assert(rs is not s) + + # test alignment + cond = Series([True,False,False,True,False],index=s.index) + s2 = -(s.abs()) + + expected = s2[cond].reindex(s2.index[:3]).reindex(s2.index) + rs = s2.where(cond[:3]) + assert_series_equal(rs, expected) + + expected = s2.abs() + expected.ix[0] = s2[0] + rs = s2.where(cond[:3], -s2) + assert_series_equal(rs, expected) + + self.assertRaises(ValueError, s.where, 1) + self.assertRaises(ValueError, s.where, cond[:3].values, -s) + + # GH 2745 + s = Series([1, 2]) + s[[True, False]] = [0, 1] + expected = Series([0, 2]) + assert_series_equal(s, expected) + + # failures + self.assertRaises( + ValueError, s.__setitem__, tuple([[[True, False]]]), [0, 2, 3]) + self.assertRaises( + ValueError, s.__setitem__, tuple([[[True, False]]]), []) + + # unsafe dtype changes + for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]: + s = Series(np.arange(10), dtype=dtype) + mask = s < 5 + s[mask] = lrange(2, 7) + expected = Series(lrange(2, 7) + lrange(5, 10), dtype=dtype) + assert_series_equal(s, expected) + self.assertEqual(s.dtype, expected.dtype) + + # these are allowed operations, but are upcasted + for dtype in [np.int64, np.float64]: + s = Series(np.arange(10), dtype=dtype) + mask = s < 5 + values = [2.5, 3.5, 4.5, 5.5, 6.5] + s[mask] = values + expected = Series(values + lrange(5, 10), dtype='float64') + assert_series_equal(s, expected) + self.assertEqual(s.dtype, expected.dtype) + + # can't do these as we are forced to change the itemsize of the input + # to something we cannot + for dtype in [np.int8, np.int16, np.int32, np.float16, np.float32]: + s = Series(np.arange(10), dtype=dtype) + mask = s < 5 + values = [2.5, 3.5, 4.5, 5.5, 6.5] + self.assertRaises(Exception, s.__setitem__, tuple(mask), values) + + # GH3235 + s = Series(np.arange(10), dtype='int64') + mask = s < 5 + s[mask] = lrange(2, 7) + expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64') + assert_series_equal(s, expected) + self.assertEqual(s.dtype, expected.dtype) + + s = Series(np.arange(10), dtype='int64') + mask = s > 5 + s[mask] = [0] * 4 + expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, 
dtype='int64') + assert_series_equal(s, expected) + + s = Series(np.arange(10)) + mask = s > 5 + def f(): + s[mask] = [5,4,3,2,1] + self.assertRaises(ValueError, f) + def f(): + s[mask] = [0] * 5 + self.assertRaises(ValueError, f) + + # dtype changes + s = Series([1,2,3,4]) + result = s.where(s>2,np.nan) + expected = Series([np.nan,np.nan,3,4]) + assert_series_equal(result, expected) + + # GH 4667 + # setting with None changes dtype + s = Series(range(10)).astype(float) + s[8] = None + result = s[8] + self.assertTrue(isnull(result)) + + s = Series(range(10)).astype(float) + s[s > 8] = None + result = s[isnull(s)] + expected = Series(np.nan,index=[9]) + assert_series_equal(result, expected) + + def test_where_setitem_invalid(self): + + # GH 2702 + # make sure correct exceptions are raised on invalid list assignment + + # slice + s = Series(list('abc')) + def f(): + s[0:3] = list(range(27)) + self.assertRaises(ValueError, f) + + s[0:3] = list(range(3)) + expected = Series([0,1,2]) + assert_series_equal(s.astype(np.int64), expected, ) + + # slice with step + s = Series(list('abcdef')) + def f(): + s[0:4:2] = list(range(27)) + self.assertRaises(ValueError, f) + + s = Series(list('abcdef')) + s[0:4:2] = list(range(2)) + expected = Series([0,'b',1,'d','e','f']) + assert_series_equal(s, expected) + + # neg slices + s = Series(list('abcdef')) + def f(): + s[:-1] = list(range(27)) + self.assertRaises(ValueError, f) + + s[-3:-1] = list(range(2)) + expected = Series(['a','b','c',0,1,'f']) + assert_series_equal(s, expected) + + # list + s = Series(list('abc')) + def f(): + s[[0,1,2]] = list(range(27)) + self.assertRaises(ValueError, f) + + s = Series(list('abc')) + def f(): + s[[0,1,2]] = list(range(2)) + self.assertRaises(ValueError, f) + + # scalar + s = Series(list('abc')) + s[0] = list(range(10)) + expected = Series([list(range(10)),'b','c']) + assert_series_equal(s, expected) + + def test_where_broadcast(self): + # Test a variety of differently sized series + for size in range(2, 6): + # Test a variety of boolean indices + for selection in [np.resize([True, False, False, False, False], size), # First element should be set + # Set alternating elements] + np.resize([True, False], size), + np.resize([False], size)]: # No element should be set + # Test a variety of different numbers as content + for item in [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min]: + # Test numpy arrays, lists and tuples as the input to be + # broadcast + for arr in [np.array([item]), [item], (item,)]: + data = np.arange(size, dtype=float) + s = Series(data) + s[selection] = arr + # Construct the expected series by taking the source + # data or item based on the selection + expected = Series([item if use_item else data[i] + for i, use_item in enumerate(selection)]) + assert_series_equal(s, expected) + + def test_where_inplace(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.copy() + + rs.where(cond, inplace=True) + assert_series_equal(rs.dropna(), s[cond]) + assert_series_equal(rs, s.where(cond)) + + rs = s.copy() + rs.where(cond, -s, inplace=True) + assert_series_equal(rs, s.where(cond, -s)) + + def test_where_dups(self): + # GH 4550 + # where crashes with dups in index + s1 = Series(list(range(3))) + s2 = Series(list(range(3))) + comb = pd.concat([s1,s2]) + result = comb.where(comb < 2) + expected = Series([0,1,np.nan,0,1,np.nan],index=[0,1,2,0,1,2]) + assert_series_equal(result, expected) + + # GH 4548 + # inplace updating not working with dups + comb[comb<1] = 5 + expected = 
Series([5,1,2,5,1,2],index=[0,1,2,0,1,2]) + assert_series_equal(comb, expected) + + comb[comb<2] += 10 + expected = Series([5,11,2,5,11,2],index=[0,1,2,0,1,2]) + assert_series_equal(comb, expected) + + def test_mask(self): + s = Series(np.random.randn(5)) + cond = s > 0 + + rs = s.where(cond, np.nan) + assert_series_equal(rs, s.mask(~cond)) + + def test_drop(self): + + # unique + s = Series([1,2],index=['one','two']) + expected = Series([1],index=['one']) + result = s.drop(['two']) + assert_series_equal(result,expected) + result = s.drop('two', axis='rows') + assert_series_equal(result,expected) + + # non-unique + # GH 5248 + s = Series([1,1,2],index=['one','two','one']) + expected = Series([1,2],index=['one','one']) + result = s.drop(['two'], axis=0) + assert_series_equal(result,expected) + result = s.drop('two') + assert_series_equal(result,expected) + + expected = Series([1],index=['two']) + result = s.drop(['one']) + assert_series_equal(result,expected) + result = s.drop('one') + assert_series_equal(result,expected) + + # single string/tuple-like + s = Series(range(3),index=list('abc')) + self.assertRaises(ValueError, s.drop, 'bc') + self.assertRaises(ValueError, s.drop, ('a',)) + + # bad axis + self.assertRaises(ValueError, s.drop, 'one', axis='columns') + + def test_ix_setitem(self): + inds = self.series.index[[3, 4, 7]] + + result = self.series.copy() + result.ix[inds] = 5 + + expected = self.series.copy() + expected[[3, 4, 7]] = 5 + assert_series_equal(result, expected) + + result.ix[5:10] = 10 + expected[5:10] = 10 + assert_series_equal(result, expected) + + # set slice with indices + d1, d2 = self.series.index[[5, 15]] + result.ix[d1:d2] = 6 + expected[5:16] = 6 # because it's inclusive + assert_series_equal(result, expected) + + # set index value + self.series.ix[d1] = 4 + self.series.ix[d2] = 6 + self.assertEqual(self.series[d1], 4) + self.assertEqual(self.series[d2], 6) + + def test_setitem_boolean(self): + mask = self.series > self.series.median() + + # similiar indexed series + result = self.series.copy() + result[mask] = self.series * 2 + expected = self.series * 2 + assert_series_equal(result[mask], expected[mask]) + + # needs alignment + result = self.series.copy() + result[mask] = (self.series * 2)[0:5] + expected = (self.series * 2)[0:5].reindex_like(self.series) + expected[-mask] = self.series[mask] + assert_series_equal(result[mask], expected[mask]) + + def test_ix_setitem_boolean(self): + mask = self.series > self.series.median() + + result = self.series.copy() + result.ix[mask] = 0 + expected = self.series + expected[mask] = 0 + assert_series_equal(result, expected) + + def test_ix_setitem_corner(self): + inds = list(self.series.index[[5, 8, 12]]) + self.series.ix[inds] = 5 + self.assertRaises(Exception, self.series.ix.__setitem__, + inds + ['foo'], 5) + + def test_get_set_boolean_different_order(self): + ordered = self.series.order() + + # setting + copy = self.series.copy() + copy[ordered > 0] = 0 + + expected = self.series.copy() + expected[expected > 0] = 0 + + assert_series_equal(copy, expected) + + # getting + sel = self.series[ordered > 0] + exp = self.series[self.series > 0] + assert_series_equal(sel, exp) + + def test_repr(self): + str(self.ts) + str(self.series) + str(self.series.astype(int)) + str(self.objSeries) + + str(Series(tm.randn(1000), index=np.arange(1000))) + str(Series(tm.randn(1000), index=np.arange(1000, 0, step=-1))) + + # empty + str(self.empty) + + # with NaNs + self.series[5:7] = np.NaN + str(self.series) + + # with Nones + ots = 
self.ts.astype('O') + ots[::2] = None + repr(ots) + + # various names + for name in ['', 1, 1.2, 'foo', u('\u03B1\u03B2\u03B3'), + 'loooooooooooooooooooooooooooooooooooooooooooooooooooong', + ('foo', 'bar', 'baz'), + (1, 2), + ('foo', 1, 2.3), + (u('\u03B1'), u('\u03B2'), u('\u03B3')), + (u('\u03B1'), 'bar')]: + self.series.name = name + repr(self.series) + + biggie = Series(tm.randn(1000), index=np.arange(1000), + name=('foo', 'bar', 'baz')) + repr(biggie) + + # 0 as name + ser = Series(np.random.randn(100), name=0) + rep_str = repr(ser) + self.assertIn("Name: 0", rep_str) + + # tidy repr + ser = Series(np.random.randn(1001), name=0) + rep_str = repr(ser) + self.assertIn("Name: 0", rep_str) + + ser = Series(["a\n\r\tb"], name=["a\n\r\td"], index=["a\n\r\tf"]) + self.assertFalse("\t" in repr(ser)) + self.assertFalse("\r" in repr(ser)) + self.assertFalse("a\n" in repr(ser)) + + # with empty series (#4651) + s = Series([], dtype=np.int64, name='foo') + self.assertEqual(repr(s), 'Series([], name: foo, dtype: int64)') + + s = Series([], dtype=np.int64, name=None) + self.assertEqual(repr(s), 'Series([], dtype: int64)') + + def test_tidy_repr(self): + a = Series([u("\u05d0")] * 1000) + a.name = 'title1' + repr(a) # should not raise exception + + def test_repr_bool_fails(self): + s = Series([DataFrame(np.random.randn(2, 2)) for i in range(5)]) + + import sys + + buf = StringIO() + tmp = sys.stderr + sys.stderr = buf + try: + # it works (with no Cython exception barf)! + repr(s) + finally: + sys.stderr = tmp + self.assertEqual(buf.getvalue(), '') + + def test_repr_name_iterable_indexable(self): + s = Series([1, 2, 3], name=np.int64(3)) + + # it works! + repr(s) + + s.name = (u("\u05d0"),) * 2 + repr(s) + + def test_repr_should_return_str(self): + # http://docs.python.org/py3k/reference/datamodel.html#object.__repr__ + # http://docs.python.org/reference/datamodel.html#object.__repr__ + # ...The return value must be a string object. 
+ + # (str on py2.x, str (unicode) on py3) + + data = [8, 5, 3, 5] + index1 = [u("\u03c3"), u("\u03c4"), u("\u03c5"), u("\u03c6")] + df = Series(data, index=index1) + self.assertTrue(type(df.__repr__() == str)) # both py2 / 3 + + def test_repr_max_rows(self): + # GH 6863 + with pd.option_context('max_rows', None): + str(Series(range(1001))) # should not raise exception + + def test_unicode_string_with_unicode(self): + df = Series([u("\u05d0")], name=u("\u05d1")) + if compat.PY3: + str(df) + else: + compat.text_type(df) + + def test_bytestring_with_unicode(self): + df = Series([u("\u05d0")], name=u("\u05d1")) + if compat.PY3: + bytes(df) + else: + str(df) + + def test_timeseries_repr_object_dtype(self): + index = Index([datetime(2000, 1, 1) + timedelta(i) + for i in range(1000)], dtype=object) + ts = Series(np.random.randn(len(index)), index) + repr(ts) + + ts = tm.makeTimeSeries(1000) + self.assertTrue(repr(ts).splitlines()[-1].startswith('Freq:')) + + ts2 = ts.ix[np.random.randint(0, len(ts) - 1, 400)] + repr(ts).splitlines()[-1] + + def test_timeseries_periodindex(self): + # GH2891 + import pickle + from pandas import period_range + prng = period_range('1/1/2011', '1/1/2012', freq='M') + ts = Series(np.random.randn(len(prng)), prng) + new_ts = pickle.loads(pickle.dumps(ts)) + self.assertEqual(new_ts.index.freq, 'M') + + def test_iter(self): + for i, val in enumerate(self.series): + self.assertEqual(val, self.series[i]) + + for i, val in enumerate(self.ts): + self.assertEqual(val, self.ts[i]) + + def test_keys(self): + # HACK: By doing this in two stages, we avoid 2to3 wrapping the call + # to .keys() in a list() + getkeys = self.ts.keys + self.assertIs(getkeys(), self.ts.index) + + def test_values(self): + self.assert_numpy_array_equal(self.ts, self.ts.values) + + def test_iteritems(self): + for idx, val in compat.iteritems(self.series): + self.assertEqual(val, self.series[idx]) + + for idx, val in compat.iteritems(self.ts): + self.assertEqual(val, self.ts[idx]) + + # assert is lazy (genrators don't define reverse, lists do) + self.assertFalse(hasattr(self.series.iteritems(), 'reverse')) + + def test_sum(self): + self._check_stat_op('sum', np.sum) + + def test_sum_inf(self): + import pandas.core.nanops as nanops + + s = Series(np.random.randn(10)) + s2 = s.copy() + + s[5:8] = np.inf + s2[5:8] = np.nan + + self.assertTrue(np.isinf(s.sum())) + + arr = np.random.randn(100, 100).astype('f4') + arr[:, 2] = np.inf + + with cf.option_context("mode.use_inf_as_null", True): + assert_almost_equal(s.sum(), s2.sum()) + + res = nanops.nansum(arr, axis=1) + self.assertTrue(np.isinf(res).all()) + + def test_mean(self): + self._check_stat_op('mean', np.mean) + + def test_median(self): + self._check_stat_op('median', np.median) + + # test with integers, test failure + int_ts = Series(np.ones(10, dtype=int), index=lrange(10)) + self.assertAlmostEqual(np.median(int_ts), int_ts.median()) + + def test_mode(self): + s = Series([12, 12, 11, 10, 19, 11]) + exp = Series([11, 12]) + assert_series_equal(s.mode(), exp) + + assert_series_equal(Series([1, 2, 3]).mode(), Series([], dtype='int64')) + + lst = [5] * 20 + [1] * 10 + [6] * 25 + np.random.shuffle(lst) + s = Series(lst) + assert_series_equal(s.mode(), Series([6])) + + s = Series([5] * 10) + assert_series_equal(s.mode(), Series([5])) + + s = Series(lst) + s[0] = np.nan + assert_series_equal(s.mode(), Series([6.])) + + s = Series(list('adfasbasfwewefwefweeeeasdfasnbam')) + assert_series_equal(s.mode(), Series(['e'])) + + s = Series(['2011-01-03', '2013-01-02', 
'1900-05-03'], dtype='M8[ns]') + assert_series_equal(s.mode(), Series([], dtype="M8[ns]")) + s = Series(['2011-01-03', '2013-01-02', '1900-05-03', '2011-01-03', + '2013-01-02'], dtype='M8[ns]') + assert_series_equal(s.mode(), Series(['2011-01-03', '2013-01-02'], + dtype='M8[ns]')) + + def test_prod(self): + self._check_stat_op('prod', np.prod) + + def test_min(self): + self._check_stat_op('min', np.min, check_objects=True) + + def test_max(self): + self._check_stat_op('max', np.max, check_objects=True) + + def test_var_std(self): + alt = lambda x: np.std(x, ddof=1) + self._check_stat_op('std', alt) + + alt = lambda x: np.var(x, ddof=1) + self._check_stat_op('var', alt) + + result = self.ts.std(ddof=4) + expected = np.std(self.ts.values, ddof=4) + assert_almost_equal(result, expected) + + result = self.ts.var(ddof=4) + expected = np.var(self.ts.values, ddof=4) + assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = self.ts.iloc[[0]] + result = s.var(ddof=1) + self.assertTrue(isnull(result)) + + result = s.std(ddof=1) + self.assertTrue(isnull(result)) + + def test_sem(self): + alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x)) + self._check_stat_op('sem', alt) + + result = self.ts.sem(ddof=4) + expected = np.std(self.ts.values, ddof=4)/np.sqrt(len(self.ts.values)) + assert_almost_equal(result, expected) + + # 1 - element series with ddof=1 + s = self.ts.iloc[[0]] + result = s.sem(ddof=1) + self.assert_(isnull(result)) + + def test_skew(self): + tm._skip_if_no_scipy() + + from scipy.stats import skew + alt = lambda x: skew(x, bias=False) + self._check_stat_op('skew', alt) + + def test_kurt(self): + tm._skip_if_no_scipy() + + from scipy.stats import kurtosis + alt = lambda x: kurtosis(x, bias=False) + self._check_stat_op('kurt', alt) + + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + s = Series(np.random.randn(6), index=index) + self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar']) + + def test_argsort(self): + self._check_accum_op('argsort') + argsorted = self.ts.argsort() + self.assertTrue(issubclass(argsorted.dtype.type, np.integer)) + + # GH 2967 (introduced bug in 0.11-dev I think) + s = Series([Timestamp('201301%02d' % (i + 1)) for i in range(5)]) + self.assertEqual(s.dtype, 'datetime64[ns]') + shifted = s.shift(-1) + self.assertEqual(shifted.dtype, 'datetime64[ns]') + self.assertTrue(isnull(shifted[4])) + + result = s.argsort() + expected = Series(lrange(5), dtype='int64') + assert_series_equal(result, expected) + + result = shifted.argsort() + expected = Series(lrange(4) + [-1], dtype='int64') + assert_series_equal(result, expected) + + def test_argsort_stable(self): + s = Series(np.random.randint(0, 100, size=10000)) + mindexer = s.argsort(kind='mergesort') + qindexer = s.argsort() + + mexpected = np.argsort(s.values, kind='mergesort') + qexpected = np.argsort(s.values, kind='quicksort') + + self.assert_numpy_array_equal(mindexer, mexpected) + self.assert_numpy_array_equal(qindexer, qexpected) + self.assertFalse(np.array_equal(qindexer, mindexer)) + + def test_reorder_levels(self): + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]], + names=['L0', 'L1', 'L2']) + s = Series(np.arange(6), index=index) + + # no change, position + result = s.reorder_levels([0, 1, 2]) + assert_series_equal(s, result) + + # no change, labels + result = s.reorder_levels(['L0', 'L1', 
'L2']) + assert_series_equal(s, result) + + # rotate, position + result = s.reorder_levels([1, 2, 0]) + e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], + labels=[[0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0]], + names=['L1', 'L2', 'L0']) + expected = Series(np.arange(6), index=e_idx) + assert_series_equal(result, expected) + + result = s.reorder_levels([0, 0, 0]) + e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], + labels=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + names=['L0', 'L0', 'L0']) + expected = Series(range(6), index=e_idx) + assert_series_equal(result, expected) + + result = s.reorder_levels(['L0', 'L0', 'L0']) + assert_series_equal(result, expected) + + def test_cumsum(self): + self._check_accum_op('cumsum') + + def test_cumprod(self): + self._check_accum_op('cumprod') + + def test_cummin(self): + self.assert_numpy_array_equal(self.ts.cummin(), + np.minimum.accumulate(np.array(self.ts))) + ts = self.ts.copy() + ts[::2] = np.NaN + result = ts.cummin()[1::2] + expected = np.minimum.accumulate(ts.valid()) + + self.assert_numpy_array_equal(result, expected) + + def test_cummax(self): + self.assert_numpy_array_equal(self.ts.cummax(), + np.maximum.accumulate(np.array(self.ts))) + ts = self.ts.copy() + ts[::2] = np.NaN + result = ts.cummax()[1::2] + expected = np.maximum.accumulate(ts.valid()) + + self.assert_numpy_array_equal(result, expected) + + def test_npdiff(self): + raise nose.SkipTest("skipping due to Series no longer being an " + "ndarray") + + # no longer works as the return type of np.diff is now nd.array + s = Series(np.arange(5)) + + r = np.diff(s) + assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + + def _check_stat_op(self, name, alternate, check_objects=False): + import pandas.core.nanops as nanops + + def testit(): + f = getattr(Series, name) + + # add some NaNs + self.series[5:15] = np.NaN + + # idxmax, idxmin, min, and max are valid for dates + if not ('max' in name or 'min' in name): + ds = Series(date_range('1/1/2001', periods=10)) + self.assertRaises(TypeError, f, ds) + + # skipna or no + self.assertTrue(notnull(f(self.series))) + self.assertTrue(isnull(f(self.series, skipna=False))) + + # check the result is correct + nona = self.series.dropna() + assert_almost_equal(f(nona), alternate(nona.values)) + assert_almost_equal(f(self.series), alternate(nona.values)) + + allna = self.series * nan + self.assertTrue(np.isnan(f(allna))) + + # dtype=object with None, it works! 
+ s = Series([1, 2, 3, None, 5]) + f(s) + + # 2888 + l = [0] + l.extend(lrange(2 ** 40, 2 ** 40+1000)) + s = Series(l, dtype='int64') + assert_almost_equal(float(f(s)), float(alternate(s.values))) + + # check date range + if check_objects: + s = Series(bdate_range('1/1/2000', periods=10)) + res = f(s) + exp = alternate(s) + self.assertEqual(res, exp) + + testit() + + try: + import bottleneck as bn + nanops._USE_BOTTLENECK = False + testit() + nanops._USE_BOTTLENECK = True + except ImportError: + pass + + def _check_accum_op(self, name): + func = getattr(np, name) + self.assert_numpy_array_equal(func(self.ts), func(np.array(self.ts))) + + # with missing values + ts = self.ts.copy() + ts[::2] = np.NaN + + result = func(ts)[1::2] + expected = func(np.array(ts.valid())) + + self.assert_numpy_array_equal(result, expected) + + def test_round(self): + # numpy.round doesn't preserve metadata, probably a numpy bug, + # re: GH #314 + result = np.round(self.ts, 2) + expected = Series(np.round(self.ts.values, 2), index=self.ts.index) + assert_series_equal(result, expected) + self.assertEqual(result.name, self.ts.name) + + def test_prod_numpy16_bug(self): + s = Series([1., 1., 1.], index=lrange(3)) + result = s.prod() + self.assertNotIsInstance(result, Series) + + def test_quantile(self): + from numpy import percentile + + q = self.ts.quantile(0.1) + self.assertEqual(q, percentile(self.ts.valid(), 10)) + + q = self.ts.quantile(0.9) + self.assertEqual(q, percentile(self.ts.valid(), 90)) + + # object dtype + q = Series(self.ts,dtype=object).quantile(0.9) + self.assertEqual(q, percentile(self.ts.valid(), 90)) + + # datetime64[ns] dtype + dts = self.ts.index.to_series() + q = dts.quantile(.2) + self.assertEqual(q, Timestamp('2000-01-10 19:12:00')) + + if not _np_version_under1p7: + # timedelta64[ns] dtype + tds = dts.diff() + q = tds.quantile(.25) + self.assertEqual(q, pd.to_timedelta('24:00:00')) + + def test_quantile_multi(self): + from numpy import percentile + + qs = [.1, .9] + result = self.ts.quantile(qs) + expected = pd.Series([percentile(self.ts.valid(), 10), + percentile(self.ts.valid(), 90)], + index=qs) + assert_series_equal(result, expected) + + dts = self.ts.index.to_series() + result = dts.quantile((.2, .2)) + assert_series_equal(result, Series([Timestamp('2000-01-10 19:12:00'), + Timestamp('2000-01-10 19:12:00')], + index=[.2, .2])) + + def test_append(self): + appendedSeries = self.series.append(self.objSeries) + for idx, value in compat.iteritems(appendedSeries): + if idx in self.series.index: + self.assertEqual(value, self.series[idx]) + elif idx in self.objSeries.index: + self.assertEqual(value, self.objSeries[idx]) + else: + self.fail("orphaned index!") + + self.assertRaises(ValueError, self.ts.append, self.ts, + verify_integrity=True) + + def test_append_many(self): + pieces = [self.ts[:5], self.ts[5:10], self.ts[10:]] + + result = pieces[0].append(pieces[1:]) + assert_series_equal(result, self.ts) + + def test_all_any(self): + ts = tm.makeTimeSeries() + bool_series = ts > 0 + self.assertFalse(bool_series.all()) + self.assertTrue(bool_series.any()) + + def test_op_method(self): + def check(series, other, check_reverse=False): + simple_ops = ['add', 'sub', 'mul', 'floordiv', 'truediv', 'pow'] + if not compat.PY3: + simple_ops.append('div') + + for opname in simple_ops: + op = getattr(Series, opname) + + if op == 'div': + alt = operator.truediv + else: + alt = getattr(operator, opname) + + result = op(series, other) + expected = alt(series, other) + tm.assert_almost_equal(result, 
expected) + if check_reverse: + rop = getattr(Series, "r" + opname) + result = rop(series, other) + expected = alt(other, series) + tm.assert_almost_equal(result, expected) + + check(self.ts, self.ts * 2) + check(self.ts, self.ts[::2]) + check(self.ts, 5, check_reverse=True) + check(tm.makeFloatSeries(), tm.makeFloatSeries(), check_reverse=True) + + def test_neg(self): + assert_series_equal(-self.series, -1 * self.series) + + def test_invert(self): + assert_series_equal(-(self.series < 0), ~(self.series < 0)) + + def test_modulo(self): + + # GH3590, modulo as ints + p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = p['first'] % p['second'] + expected = Series(p['first'].values % + p['second'].values, dtype='float64') + expected.iloc[0:3] = np.nan + assert_series_equal(result, expected) + + result = p['first'] % 0 + expected = Series(np.nan, index=p.index) + assert_series_equal(result, expected) + + p = p.astype('float64') + result = p['first'] % p['second'] + expected = Series(p['first'].values % p['second'].values) + assert_series_equal(result, expected) + + p = p.astype('float64') + result = p['first'] % p['second'] + result2 = p['second'] % p['first'] + self.assertFalse(np.array_equal(result, result2)) + + def test_div(self): + + # no longer do integer div for any ops, but deal with the 0's + p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = p['first'] / p['second'] + expected = Series( + p['first'].values.astype(float) / p['second'].values, dtype='float64') + expected.iloc[0:3] = np.inf + assert_series_equal(result, expected) + + result = p['first'] / 0 + expected = Series(np.inf, index=p.index) + assert_series_equal(result, expected) + + p = p.astype('float64') + result = p['first'] / p['second'] + expected = Series(p['first'].values / p['second'].values) + assert_series_equal(result, expected) + + p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]}) + result = p['first'] / p['second'] + assert_series_equal(result, p['first'].astype('float64')) + self.assertFalse(np.array_equal(result, p['second'] / p['first'])) + + # inf signing + s = Series([np.nan,1.,-1.]) + result = s / 0 + expected = Series([np.nan,np.inf,-np.inf]) + assert_series_equal(result, expected) + + def test_operators(self): + + def _check_op(series, other, op, pos_only=False): + left = np.abs(series) if pos_only else series + right = np.abs(other) if pos_only else other + + cython_or_numpy = op(left, right) + python = left.combine(right, op) + tm.assert_almost_equal(cython_or_numpy, python) + + def check(series, other): + simple_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod'] + + for opname in simple_ops: + _check_op(series, other, getattr(operator, opname)) + + _check_op(series, other, operator.pow, pos_only=True) + + _check_op(series, other, lambda x, y: operator.add(y, x)) + _check_op(series, other, lambda x, y: operator.sub(y, x)) + _check_op(series, other, lambda x, y: operator.truediv(y, x)) + _check_op(series, other, lambda x, y: operator.floordiv(y, x)) + _check_op(series, other, lambda x, y: operator.mul(y, x)) + _check_op(series, other, lambda x, y: operator.pow(y, x), + pos_only=True) + _check_op(series, other, lambda x, y: operator.mod(y, x)) + + check(self.ts, self.ts * 2) + check(self.ts, self.ts * 0) + check(self.ts, self.ts[::2]) + check(self.ts, 5) + + def check_comparators(series, other): + _check_op(series, other, operator.gt) + _check_op(series, other, operator.ge) + _check_op(series, other, operator.eq) + _check_op(series, other, 
operator.lt) + _check_op(series, other, operator.le) + + check_comparators(self.ts, 5) + check_comparators(self.ts, self.ts + 1) + + def test_operators_empty_int_corner(self): + s1 = Series([], [], dtype=np.int32) + s2 = Series({'x': 0.}) + + # it works! + _ = s1 * s2 + + def test_constructor_dtype_timedelta64(self): + + # basic + td = Series([timedelta(days=i) for i in range(3)]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + td = Series([timedelta(days=1)]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + if not _np_version_under1p7: + td = Series([timedelta(days=1),timedelta(days=2),np.timedelta64(1,'s')]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + # mixed with NaT + from pandas import tslib + td = Series([timedelta(days=1),tslib.NaT ], dtype='m8[ns]' ) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + td = Series([timedelta(days=1),np.nan ], dtype='m8[ns]' ) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + td = Series([np.timedelta64(300000000), pd.NaT],dtype='m8[ns]') + self.assertEqual(td.dtype, 'timedelta64[ns]') + + # improved inference + # GH5689 + td = Series([np.timedelta64(300000000), pd.NaT]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + td = Series([np.timedelta64(300000000), tslib.iNaT]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + td = Series([np.timedelta64(300000000), np.nan]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + td = Series([pd.NaT, np.timedelta64(300000000)]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + if not _np_version_under1p7: + td = Series([np.timedelta64(1,'s')]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + # these are frequency conversion astypes + #for t in ['s', 'D', 'us', 'ms']: + # self.assertRaises(TypeError, td.astype, 'm8[%s]' % t) + + # valid astype + td.astype('int64') + + # invalid casting + self.assertRaises(TypeError, td.astype, 'int32') + + # this is an invalid casting + def f(): + Series([timedelta(days=1), 'foo'],dtype='m8[ns]') + self.assertRaises(Exception, f) + + # leave as object here + td = Series([timedelta(days=i) for i in range(3)] + ['foo']) + self.assertEqual(td.dtype, 'object') + + # these will correctly infer a timedelta + # but only on numpy > 1.7 as the cython path will only be used + if not _np_version_under1p7: + s = Series([None, pd.NaT, '1 Day']) + self.assertEqual(s.dtype,'timedelta64[ns]') + s = Series([np.nan, pd.NaT, '1 Day']) + self.assertEqual(s.dtype,'timedelta64[ns]') + s = Series([pd.NaT, None, '1 Day']) + self.assertEqual(s.dtype,'timedelta64[ns]') + s = Series([pd.NaT, np.nan, '1 Day']) + self.assertEqual(s.dtype,'timedelta64[ns]') + + def test_operators_timedelta64(self): + + # invalid ops + self.assertRaises(Exception, self.objSeries.__add__, 1) + self.assertRaises( + Exception, self.objSeries.__add__, np.array(1, dtype=np.int64)) + self.assertRaises(Exception, self.objSeries.__sub__, 1) + self.assertRaises( + Exception, self.objSeries.__sub__, np.array(1, dtype=np.int64)) + + # seriese ops + v1 = date_range('2012-1-1', periods=3, freq='D') + v2 = date_range('2012-1-2', periods=3, freq='D') + rs = Series(v2) - Series(v1) + xp = Series(1e9 * 3600 * 24, rs.index).astype( + 'int64').astype('timedelta64[ns]') + assert_series_equal(rs, xp) + self.assertEqual(rs.dtype, 'timedelta64[ns]') + + df = DataFrame(dict(A=v1)) + td = Series([timedelta(days=i) for i in range(3)]) + self.assertEqual(td.dtype, 'timedelta64[ns]') + + # series on the rhs + result = df['A'] - df['A'].shift() + self.assertEqual(result.dtype, 'timedelta64[ns]') + + result = df['A'] + td + 
self.assertEqual(result.dtype, 'M8[ns]') + + # scalar Timestamp on rhs + maxa = df['A'].max() + tm.assert_isinstance(maxa, Timestamp) + + resultb = df['A'] - df['A'].max() + self.assertEqual(resultb.dtype, 'timedelta64[ns]') + + # timestamp on lhs + result = resultb + df['A'] + expected = Series( + [Timestamp('20111230'), Timestamp('20120101'), Timestamp('20120103')]) + assert_series_equal(result, expected) + + # datetimes on rhs + result = df['A'] - datetime(2001, 1, 1) + expected = Series([timedelta(days=4017 + i) for i in range(3)]) + assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'm8[ns]') + + d = datetime(2001, 1, 1, 3, 4) + resulta = df['A'] - d + self.assertEqual(resulta.dtype, 'm8[ns]') + + # roundtrip + resultb = resulta + d + assert_series_equal(df['A'], resultb) + + # timedeltas on rhs + td = timedelta(days=1) + resulta = df['A'] + td + resultb = resulta - td + assert_series_equal(resultb, df['A']) + self.assertEqual(resultb.dtype, 'M8[ns]') + + # roundtrip + td = timedelta(minutes=5, seconds=3) + resulta = df['A'] + td + resultb = resulta - td + assert_series_equal(df['A'], resultb) + self.assertEqual(resultb.dtype, 'M8[ns]') + + # inplace + value = rs[2] + np.timedelta64(timedelta(minutes=5,seconds=1)) + rs[2] += np.timedelta64(timedelta(minutes=5,seconds=1)) + self.assertEqual(rs[2], value) + + def test_timedeltas_with_DateOffset(self): + + # GH 4532 + # operate with pd.offsets + s = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) + + result = s + pd.offsets.Second(5) + result2 = pd.offsets.Second(5) + s + expected = Series( + [Timestamp('20130101 9:01:05'), Timestamp('20130101 9:02:05')]) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + result = s + pd.offsets.Milli(5) + result2 = pd.offsets.Milli(5) + s + expected = Series( + [Timestamp('20130101 9:01:00.005'), Timestamp('20130101 9:02:00.005')]) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + result = s + pd.offsets.Minute(5) + pd.offsets.Milli(5) + expected = Series( + [Timestamp('20130101 9:06:00.005'), Timestamp('20130101 9:07:00.005')]) + assert_series_equal(result, expected) + + if not _np_version_under1p7: + + # operate with np.timedelta64 correctly + result = s + np.timedelta64(1, 's') + result2 = np.timedelta64(1, 's') + s + expected = Series( + [Timestamp('20130101 9:01:01'), Timestamp('20130101 9:02:01')]) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + result = s + np.timedelta64(5, 'ms') + result2 = np.timedelta64(5, 'ms') + s + expected = Series( + [Timestamp('20130101 9:01:00.005'), Timestamp('20130101 9:02:00.005')]) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + # valid DateOffsets + for do in [ 'Hour', 'Minute', 'Second', 'Day', 'Micro', + 'Milli', 'Nano' ]: + op = getattr(pd.offsets,do) + s + op(5) + op(5) + s + + # invalid DateOffsets + for do in [ 'Week', 'BDay', 'BQuarterEnd', 'BMonthEnd', 'BYearEnd', + 'BYearBegin','BQuarterBegin', 'BMonthBegin', + 'MonthEnd','YearBegin', 'YearEnd', + 'MonthBegin', 'QuarterBegin' ]: + op = getattr(pd.offsets,do) + self.assertRaises(TypeError, s.__add__, op(5)) + self.assertRaises(TypeError, s.__radd__, op(5)) + + def test_timedelta64_operations_with_timedeltas(self): + + # td operate with td + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td2 = timedelta(minutes=5, seconds=4) + result = td1 - td2 + expected = Series([timedelta(seconds=0)] * 3) -Series( + 
[timedelta(seconds=1)] * 3) + self.assertEqual(result.dtype, 'm8[ns]') + assert_series_equal(result, expected) + + result2 = td2 - td1 + expected = (Series([timedelta(seconds=1)] * 3) - + Series([timedelta(seconds=0)] * 3)) + assert_series_equal(result2, expected) + + # roundtrip + assert_series_equal(result + td2,td1) + + # Now again, using pd.to_timedelta, which should build + # a Series or a scalar, depending on input. + if not _np_version_under1p7: + td1 = Series(pd.to_timedelta(['00:05:03'] * 3)) + td2 = pd.to_timedelta('00:05:04') + result = td1 - td2 + expected = Series([timedelta(seconds=0)] * 3) -Series( + [timedelta(seconds=1)] * 3) + self.assertEqual(result.dtype, 'm8[ns]') + assert_series_equal(result, expected) + + result2 = td2 - td1 + expected = (Series([timedelta(seconds=1)] * 3) - + Series([timedelta(seconds=0)] * 3)) + assert_series_equal(result2, expected) + + # roundtrip + assert_series_equal(result + td2,td1) + + def test_timedelta64_operations_with_integers(self): + + # GH 4521 + # divide/multiply by integers + startdate = Series(date_range('2013-01-01', '2013-01-03')) + enddate = Series(date_range('2013-03-01', '2013-03-03')) + + s1 = enddate - startdate + s1[2] = np.nan + s2 = Series([2, 3, 4]) + expected = Series(s1.values.astype(np.int64) / s2, dtype='m8[ns]') + expected[2] = np.nan + result = s1 / s2 + assert_series_equal(result,expected) + + s2 = Series([20, 30, 40]) + expected = Series(s1.values.astype(np.int64) / s2, dtype='m8[ns]') + expected[2] = np.nan + result = s1 / s2 + assert_series_equal(result,expected) + + result = s1 / 2 + expected = Series(s1.values.astype(np.int64) / 2, dtype='m8[ns]') + expected[2] = np.nan + assert_series_equal(result,expected) + + s2 = Series([20, 30, 40]) + expected = Series(s1.values.astype(np.int64) * s2, dtype='m8[ns]') + expected[2] = np.nan + result = s1 * s2 + assert_series_equal(result,expected) + + for dtype in ['int32','int16','uint32','uint64','uint32','uint16','uint8']: + s2 = Series([20, 30, 40],dtype=dtype) + expected = Series(s1.values.astype(np.int64) * s2.astype(np.int64), dtype='m8[ns]') + expected[2] = np.nan + result = s1 * s2 + assert_series_equal(result,expected) + + result = s1 * 2 + expected = Series(s1.values.astype(np.int64) * 2, dtype='m8[ns]') + expected[2] = np.nan + assert_series_equal(result,expected) + + result = s1 * -1 + expected = Series(s1.values.astype(np.int64) * -1, dtype='m8[ns]') + expected[2] = np.nan + assert_series_equal(result,expected) + + # invalid ops + for op in ['__true_div__','__div__','__mul__']: + sop = getattr(s1,op,None) + if sop is not None: + self.assertRaises(TypeError, sop, s2.astype(float)) + self.assertRaises(TypeError, sop, 2.) 
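# Illustrative sketch of the behaviour exercised by
# test_timedelta64_operations_with_integers above; it is not part of the
# imported test module. Multiplying or dividing a timedelta64[ns] Series by
# integers keeps the timedelta64[ns] dtype, while adding a bare integer is
# rejected (as the surrounding assertions expect). Assumes only pandas under
# its usual alias.
import pandas as pd

deltas = pd.Series(pd.to_timedelta(['1 days', '2 days', '3 days']))
assert (deltas / 2).dtype == 'timedelta64[ns]'                     # divide by int keeps m8[ns]
assert (deltas * pd.Series([2, 3, 4])).dtype == 'timedelta64[ns]'  # elementwise int multiply
try:
    deltas + 1    # adding an integer to a timedelta Series raises
except TypeError:
    pass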
+ + for op in ['__add__','__sub__']: + sop = getattr(s1,op,None) + if sop is not None: + self.assertRaises(TypeError, sop, 1) + self.assertRaises(TypeError, sop, s2.values) + + def test_timedelta64_conversions(self): + tm._skip_if_not_numpy17_friendly() + + startdate = Series(date_range('2013-01-01', '2013-01-03')) + enddate = Series(date_range('2013-03-01', '2013-03-03')) + + s1 = enddate - startdate + s1[2] = np.nan + + for m in [1, 3, 10]: + for unit in ['D','h','m','s','ms','us','ns']: + + # op + expected = s1.apply(lambda x: x / np.timedelta64(m,unit)) + result = s1 / np.timedelta64(m,unit) + assert_series_equal(result, expected) + + if m == 1 and unit != 'ns': + + # astype + result = s1.astype("timedelta64[{0}]".format(unit)) + assert_series_equal(result, expected) + + # reverse op + expected = s1.apply(lambda x: np.timedelta64(m,unit) / x) + result = np.timedelta64(m,unit) / s1 + + def test_timedelta64_equal_timedelta_supported_ops(self): + ser = Series([Timestamp('20130301'), Timestamp('20130228 23:00:00'), + Timestamp('20130228 22:00:00'), + Timestamp('20130228 21:00:00')]) + + intervals = 'D', 'h', 'm', 's', 'us' + npy16_mappings = {'D': 24 * 60 * 60 * 1000000, 'h': 60 * 60 * 1000000, + 'm': 60 * 1000000, 's': 1000000, 'us': 1} + + def timedelta64(*args): + if _np_version_under1p7: + coeffs = np.array(args) + terms = np.array([npy16_mappings[interval] + for interval in intervals]) + return np.timedelta64(coeffs.dot(terms)) + return sum(starmap(np.timedelta64, zip(args, intervals))) + + for op, d, h, m, s, us in product([operator.add, operator.sub], + *([range(2)] * 5)): + nptd = timedelta64(d, h, m, s, us) + pytd = timedelta(days=d, hours=h, minutes=m, seconds=s, + microseconds=us) + lhs = op(ser, nptd) + rhs = op(ser, pytd) + + try: + assert_series_equal(lhs, rhs) + except: + raise AssertionError( + "invalid comparison [op->{0},d->{1},h->{2},m->{3},s->{4},us->{5}]\n{6}\n{7}\n".format(op, d, h, m, s, us, lhs, rhs)) + + def test_operators_datetimelike(self): + + def run_ops(ops, get_ser, test_ser): + for op in ops: + try: + op = getattr(get_ser, op, None) + if op is not None: + self.assertRaises(TypeError, op, test_ser) + except: + com.pprint_thing("Failed on op %r" % op) + raise + ### timedelta64 ### + td1 = Series([timedelta(minutes=5,seconds=3)]*3) + td2 = timedelta(minutes=5,seconds=4) + ops = ['__mul__','__floordiv__','__pow__', + '__rmul__','__rfloordiv__','__rpow__'] + run_ops(ops, td1, td2) + td1 + td2 + td2 + td1 + td1 - td2 + td2 - td1 + td1 / td2 + td2 / td1 + + ### datetime64 ### + dt1 = Series([Timestamp('20111230'), Timestamp('20120101'), + Timestamp('20120103')]) + dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), + Timestamp('20120104')]) + ops = ['__add__', '__mul__', '__floordiv__', '__truediv__', '__div__', + '__pow__', '__radd__', '__rmul__', '__rfloordiv__', + '__rtruediv__', '__rdiv__', '__rpow__'] + run_ops(ops, dt1, dt2) + dt1 - dt2 + dt2 - dt1 + + ### datetime64 with timedelta ### + ops = ['__mul__', '__floordiv__', '__truediv__', '__div__', '__pow__', + '__rmul__', '__rfloordiv__', '__rtruediv__', '__rdiv__', + '__rpow__'] + run_ops(ops, dt1, td1) + dt1 + td1 + td1 + dt1 + dt1 - td1 + # TODO: Decide if this ought to work.
+ # td1 - dt1 + + ### timetimedelta with datetime64 ### + ops = ['__sub__', '__mul__', '__floordiv__', '__truediv__', '__div__', + '__pow__', '__rsub__', '__rmul__', '__rfloordiv__', + '__rtruediv__', '__rdiv__', '__rpow__'] + run_ops(ops, td1, dt1) + td1 + dt1 + dt1 + td1 + + def test_ops_datetimelike_align(self): + tm._skip_if_not_numpy17_friendly() + + # GH 7500 + # datetimelike ops need to align + dt = Series(date_range('2012-1-1', periods=3, freq='D')) + dt.iloc[2] = np.nan + dt2 = dt[::-1] + + expected = Series([timedelta(0),timedelta(0),pd.NaT]) + + result = dt2-dt + assert_series_equal(result,expected) + + result = (dt2.to_frame()-dt.to_frame())[0] + assert_series_equal(result,expected) + + def test_timedelta64_functions(self): + + from datetime import timedelta + from pandas import date_range + + # index min/max + td = Series(date_range('2012-1-1', periods=3, freq='D')) - \ + Timestamp('20120101') + + result = td.idxmin() + self.assertEqual(result, 0) + + result = td.idxmax() + self.assertEqual(result, 2) + + # GH 2982 + # with NaT + td[0] = np.nan + + result = td.idxmin() + self.assertEqual(result, 1) + + result = td.idxmax() + self.assertEqual(result, 2) + + # abs + s1 = Series(date_range('20120101', periods=3)) + s2 = Series(date_range('20120102', periods=3)) + expected = Series(s2 - s1) + + # this fails as numpy returns timedelta64[us] + #result = np.abs(s1-s2) + # assert_frame_equal(result,expected) + + result = (s1 - s2).abs() + assert_series_equal(result, expected) + + # max/min + result = td.max() + expected = Series([timedelta(2)], dtype='timedelta64[ns]') + assert_series_equal(result, expected) + + result = td.min() + expected = Series([timedelta(1)], dtype='timedelta64[ns]') + assert_series_equal(result, expected) + + def test_timedelta_fillna(self): + tm._skip_if_not_numpy17_friendly() + + #GH 3371 + s = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130102'), Timestamp('20130103 9:01:01')]) + td = s.diff() + + # reg fillna + result = td.fillna(0) + expected = Series([timedelta(0), timedelta(0), timedelta(1), + timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) + + # interprested as seconds + result = td.fillna(1) + expected = Series([timedelta(seconds=1), timedelta(0), + timedelta(1), timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) + + result = td.fillna(timedelta(days=1, seconds=1)) + expected = Series([timedelta(days=1, seconds=1), timedelta(0), + timedelta(1), timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) + + result = td.fillna(np.timedelta64(int(1e9))) + expected = Series([timedelta(seconds=1), timedelta(0), timedelta(1), + timedelta(days=1, seconds=9*3600+60+1)]) + assert_series_equal(result, expected) + + from pandas import tslib + result = td.fillna(tslib.NaT) + expected = Series([tslib.NaT, timedelta(0), timedelta(1), + timedelta(days=1, seconds=9*3600+60+1)], dtype='m8[ns]') + assert_series_equal(result, expected) + + # ffill + td[2] = np.nan + result = td.ffill() + expected = td.fillna(0) + expected[0] = np.nan + assert_series_equal(result, expected) + + # bfill + td[2] = np.nan + result = td.bfill() + expected = td.fillna(0) + expected[2] = timedelta(days=1, seconds=9*3600+60+1) + assert_series_equal(result, expected) + + def test_datetime64_fillna(self): + + s = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130102'), Timestamp('20130103 9:01:01')]) + s[2] = np.nan + + # reg fillna + result = 
s.fillna(Timestamp('20130104')) + expected = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130104'), Timestamp('20130103 9:01:01')]) + assert_series_equal(result, expected) + + from pandas import tslib + result = s.fillna(tslib.NaT) + expected = s + assert_series_equal(result, expected) + + # ffill + result = s.ffill() + expected = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130101'), Timestamp('20130103 9:01:01')]) + assert_series_equal(result, expected) + + # bfill + result = s.bfill() + expected = Series([Timestamp('20130101'), Timestamp('20130101'), + Timestamp('20130103 9:01:01'), + Timestamp('20130103 9:01:01')]) + assert_series_equal(result, expected) + + # GH 6587 + # make sure that we are treating as integer when filling + # this also tests inference of a datetime-like with NaT's + s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001']) + expected = Series(['2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001'], dtype='M8[ns]') + result = s.fillna(method='backfill') + assert_series_equal(result, expected) + + def test_fillna_int(self): + s = Series(np.random.randint(-100, 100, 50)) + s.fillna(method='ffill', inplace=True) + assert_series_equal(s.fillna(method='ffill', inplace=False), s) + + def test_fillna_raise(self): + s = Series(np.random.randint(-100, 100, 50)) + self.assertRaises(TypeError, s.fillna, [1, 2]) + self.assertRaises(TypeError, s.fillna, (1, 2)) + + def test_raise_on_info(self): + s = Series(np.random.randn(10)) + with tm.assertRaises(AttributeError): + s.info() + + def test_isnull_for_inf(self): + s = Series(['a', np.inf, np.nan, 1.0]) + with pd.option_context('mode.use_inf_as_null', True): + r = s.isnull() + dr = s.dropna() + e = Series([False, True, True, False]) + de = Series(['a', 1.0], index=[0, 3]) + tm.assert_series_equal(r, e) + tm.assert_series_equal(dr, de) + + +# TimeSeries-specific + + def test_fillna(self): + ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + + self.assert_numpy_array_equal(ts, ts.fillna(method='ffill')) + + ts[2] = np.NaN + + self.assert_numpy_array_equal(ts.fillna(method='ffill'), + [0., 1., 1., 3., 4.]) + self.assert_numpy_array_equal(ts.fillna(method='backfill'), + [0., 1., 3., 3., 4.]) + + self.assert_numpy_array_equal(ts.fillna(value=5), [0., 1., 5., 3., 4.]) + + self.assertRaises(ValueError, ts.fillna) + self.assertRaises(ValueError, self.ts.fillna, value=0, method='ffill') + + # GH 5703 + s1 = Series([np.nan]) + s2 = Series([1]) + result = s1.fillna(s2) + expected = Series([1.]) + assert_series_equal(result,expected) + result = s1.fillna({}) + assert_series_equal(result,s1) + result = s1.fillna(Series(())) + assert_series_equal(result,s1) + result = s2.fillna(s1) + assert_series_equal(result,s2) + result = s1.fillna({ 0 : 1}) + assert_series_equal(result,expected) + result = s1.fillna({ 1 : 1}) + assert_series_equal(result,Series([np.nan])) + result = s1.fillna({ 0 : 1, 1 : 1}) + assert_series_equal(result,expected) + result = s1.fillna(Series({ 0 : 1, 1 : 1})) + assert_series_equal(result,expected) + result = s1.fillna(Series({ 0 : 1, 1 : 1},index=[4,5])) + assert_series_equal(result,s1) + + s1 = Series([0, 1, 2], list('abc')) + s2 = Series([0, np.nan, 2], list('bac')) + result = s2.fillna(s1) + expected = Series([0,0,2.], list('bac')) + assert_series_equal(result,expected) + + # limit + s = Series(np.nan,index=[0,1,2]) + result = s.fillna(999,limit=1) + expected = Series([999,np.nan,np.nan],index=[0,1,2]) + 
assert_series_equal(result,expected) + + result = s.fillna(999,limit=2) + expected = Series([999,999,np.nan],index=[0,1,2]) + assert_series_equal(result,expected) + + def test_fillna_bug(self): + x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) + filled = x.fillna(method='ffill') + expected = Series([nan, 1., 1., 3., 3.], x.index) + assert_series_equal(filled, expected) + + filled = x.fillna(method='bfill') + expected = Series([1., 1., 3., 3., nan], x.index) + assert_series_equal(filled, expected) + + def test_fillna_inplace(self): + x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) + y = x.copy() + + y.fillna(value=0, inplace=True) + + expected = x.fillna(value=0) + assert_series_equal(y, expected) + + def test_fillna_invalid_method(self): + try: + self.ts.fillna(method='ffil') + except ValueError as inst: + self.assertIn('ffil', str(inst)) + + def test_ffill(self): + ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + assert_series_equal(ts.ffill(), ts.fillna(method='ffill')) + + def test_bfill(self): + ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + assert_series_equal(ts.bfill(), ts.fillna(method='bfill')) + + def test_sub_of_datetime_from_TimeSeries(self): + tm._skip_if_not_numpy17_friendly() + + from pandas.tseries.timedeltas import _possibly_cast_to_timedelta + from datetime import datetime + a = Timestamp(datetime(1993, 0o1, 0o7, 13, 30, 00)) + b = datetime(1993, 6, 22, 13, 30) + a = Series([a]) + result = _possibly_cast_to_timedelta(np.abs(a - b)) + self.assertEqual(result.dtype, 'timedelta64[ns]') + + result = _possibly_cast_to_timedelta(np.abs(b - a)) + self.assertEqual(result.dtype, 'timedelta64[ns]') + + def test_datetime64_with_index(self): + + # arithmetic integer ops with an index + s = Series(np.random.randn(5)) + expected = s-s.index.to_series() + result = s-s.index + assert_series_equal(result,expected) + + # GH 4629 + # arithmetic datetime64 ops with an index + s = Series(date_range('20130101',periods=5),index=date_range('20130101',periods=5)) + expected = s-s.index.to_series() + result = s-s.index + assert_series_equal(result,expected) + + result = s-s.index.to_period() + assert_series_equal(result,expected) + + df = DataFrame(np.random.randn(5,2),index=date_range('20130101',periods=5)) + df['date'] = Timestamp('20130102') + df['expected'] = df['date'] - df.index.to_series() + df['result'] = df['date'] - df.index + assert_series_equal(df['result'],df['expected']) + + def test_timedelta64_nan(self): + + from pandas import tslib + td = Series([timedelta(days=i) for i in range(10)]) + + # nan ops on timedeltas + td1 = td.copy() + td1[0] = np.nan + self.assertTrue(isnull(td1[0])) + self.assertEqual(td1[0].view('i8'), tslib.iNaT) + td1[0] = td[0] + self.assertFalse(isnull(td1[0])) + + td1[1] = tslib.iNaT + self.assertTrue(isnull(td1[1])) + self.assertEqual(td1[1].view('i8'), tslib.iNaT) + td1[1] = td[1] + self.assertFalse(isnull(td1[1])) + + td1[2] = tslib.NaT + self.assertTrue(isnull(td1[2])) + self.assertEqual(td1[2].view('i8'), tslib.iNaT) + td1[2] = td[2] + self.assertFalse(isnull(td1[2])) + + # boolean setting + # this doesn't work, not sure numpy even supports it + #result = td[(td>np.timedelta64(timedelta(days=3))) & (td= -0.5) & (self.ts <= 0.5) + # assert_series_equal(selector, expected) + + def test_operators_na_handling(self): + from decimal import Decimal + from datetime import date + s = Series([Decimal('1.3'), Decimal('2.3')], + index=[date(2012, 1, 1), date(2012, 1, 2)]) + 
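# Illustrative sketch of the NA propagation that the surrounding
# test_operators_na_handling asserts; it is not part of the imported test
# module. Adding a Series to a shifted copy of itself leaves NaN wherever the
# shift introduced a missing value. Plain floats stand in for the Decimal
# values used by the actual test; assumes only numpy and pandas under their
# usual aliases.
import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
summed = s + s.shift(1)          # shift(1) leaves a hole at position 0
assert np.isnan(summed.iloc[0])  # no aligned counterpart -> NaN
assert summed.iloc[1] == 3.0     # 2.0 + 1.0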
+ result = s + s.shift(1) + result2 = s.shift(1) + s + self.assertTrue(isnull(result[0])) + self.assertTrue(isnull(result2[0])) + + s = Series(['foo', 'bar', 'baz', np.nan]) + result = 'prefix_' + s + expected = Series(['prefix_foo', 'prefix_bar', 'prefix_baz', np.nan]) + assert_series_equal(result, expected) + + result = s + '_suffix' + expected = Series(['foo_suffix', 'bar_suffix', 'baz_suffix', np.nan]) + assert_series_equal(result, expected) + + def test_object_comparisons(self): + s = Series(['a', 'b', np.nan, 'c', 'a']) + + result = s == 'a' + expected = Series([True, False, False, False, True]) + assert_series_equal(result, expected) + + result = s < 'a' + expected = Series([False, False, False, False, False]) + assert_series_equal(result, expected) + + result = s != 'a' + expected = -(s == 'a') + assert_series_equal(result, expected) + + def test_comparison_operators_with_nas(self): + s = Series(bdate_range('1/1/2000', periods=10), dtype=object) + s[::2] = np.nan + + # test that comparisons work + ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + for op in ops: + val = s[5] + + f = getattr(operator, op) + result = f(s, val) + + expected = f(s.dropna(), val).reindex(s.index) + + if op == 'ne': + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) + + assert_series_equal(result, expected) + + # fffffffuuuuuuuuuuuu + # result = f(val, s) + # expected = f(val, s.dropna()).reindex(s.index) + # assert_series_equal(result, expected) + + # boolean &, |, ^ should work with object arrays and propagate NAs + + ops = ['and_', 'or_', 'xor'] + mask = s.isnull() + for bool_op in ops: + f = getattr(operator, bool_op) + + filled = s.fillna(s[0]) + + result = f(s < s[9], s > s[3]) + + expected = f(filled < filled[9], filled > filled[3]) + expected[mask] = False + assert_series_equal(result, expected) + + def test_comparison_object_numeric_nas(self): + s = Series(np.random.randn(10), dtype=object) + shifted = s.shift(2) + + ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + for op in ops: + f = getattr(operator, op) + + result = f(s, shifted) + expected = f(s.astype(float), shifted.astype(float)) + assert_series_equal(result, expected) + + def test_comparison_invalid(self): + + # GH4968 + # invalid date/int comparisons + s = Series(range(5)) + s2 = Series(date_range('20010101', periods=5)) + + for (x, y) in [(s,s2),(s2,s)]: + self.assertRaises(TypeError, lambda : x == y) + self.assertRaises(TypeError, lambda : x != y) + self.assertRaises(TypeError, lambda : x >= y) + self.assertRaises(TypeError, lambda : x > y) + self.assertRaises(TypeError, lambda : x < y) + self.assertRaises(TypeError, lambda : x <= y) + + def test_more_na_comparisons(self): + left = Series(['a', np.nan, 'c']) + right = Series(['a', np.nan, 'd']) + + result = left == right + expected = Series([True, False, False]) + assert_series_equal(result, expected) + + result = left != right + expected = Series([False, True, True]) + assert_series_equal(result, expected) + + result = left == np.nan + expected = Series([False, False, False]) + assert_series_equal(result, expected) + + result = left != np.nan + expected = Series([True, True, True]) + assert_series_equal(result, expected) + + def test_comparison_different_length(self): + a = Series(['a', 'b', 'c']) + b = Series(['b', 'a']) + self.assertRaises(ValueError, a.__lt__, b) + + a = Series([1, 2]) + b = Series([2, 3, 4]) + self.assertRaises(ValueError, a.__eq__, b) + + def test_comparison_label_based(self): + + # GH 4947 + # comparisons should be 
label based + + a = Series([True, False, True], list('bca')) + b = Series([False, True, False], list('abc')) + + expected = Series([True, False, False], list('bca')) + result = a & b + assert_series_equal(result,expected) + + expected = Series([True, False, True], list('bca')) + result = a | b + assert_series_equal(result,expected) + + expected = Series([False, False, True], list('bca')) + result = a ^ b + assert_series_equal(result,expected) + + # rhs is bigger + a = Series([True, False, True], list('bca')) + b = Series([False, True, False, True], list('abcd')) + + expected = Series([True, False, False], list('bca')) + result = a & b + assert_series_equal(result,expected) + + expected = Series([True, False, True], list('bca')) + result = a | b + assert_series_equal(result,expected) + + # filling + + # vs empty + result = a & Series([]) + expected = Series([False, False, False], list('bca')) + assert_series_equal(result,expected) + + result = a | Series([]) + expected = Series([True, False, True], list('bca')) + assert_series_equal(result,expected) + + # vs non-matching + result = a & Series([1],['z']) + expected = Series([False, False, False], list('bca')) + assert_series_equal(result,expected) + + result = a | Series([1],['z']) + expected = Series([True, False, True], list('bca')) + assert_series_equal(result,expected) + + # identity + # we would like s[s|e] == s to hold for any e, whether empty or not + for e in [Series([]),Series([1],['z']),Series(['z']),Series(np.nan,b.index),Series(np.nan,a.index)]: + result = a[a | e] + assert_series_equal(result,a[a]) + + # vs scalars + index = list('bca') + t = Series([True,False,True]) + + for v in [True,1,2]: + result = Series([True,False,True],index=index) | v + expected = Series([True,True,True],index=index) + assert_series_equal(result,expected) + + for v in [np.nan,'foo']: + self.assertRaises(TypeError, lambda : t | v) + + for v in [False,0]: + result = Series([True,False,True],index=index) | v + expected = Series([True,False,True],index=index) + assert_series_equal(result,expected) + + for v in [True,1]: + result = Series([True,False,True],index=index) & v + expected = Series([True,False,True],index=index) + assert_series_equal(result,expected) + + for v in [False,0]: + result = Series([True,False,True],index=index) & v + expected = Series([False,False,False],index=index) + assert_series_equal(result,expected) + for v in [np.nan]: + self.assertRaises(TypeError, lambda : t & v) + + def test_between(self): + s = Series(bdate_range('1/1/2000', periods=20).asobject) + s[::2] = np.nan + + result = s[s.between(s[3], s[17])] + expected = s[3:18].dropna() + assert_series_equal(result, expected) + + result = s[s.between(s[3], s[17], inclusive=False)] + expected = s[5:16].dropna() + assert_series_equal(result, expected) + + def test_setitem_na(self): + # these induce dtype changes + expected = Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]) + s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) + s[::2] = np.nan + assert_series_equal(s, expected) + + # get's coerced to float, right? 
+ expected = Series([np.nan, 1, np.nan, 0]) + s = Series([True, True, False, False]) + s[::2] = np.nan + assert_series_equal(s, expected) + + expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]) + s = Series(np.arange(10)) + s[:5] = np.nan + assert_series_equal(s, expected) + + def test_scalar_na_cmp_corners(self): + s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10]) + + def tester(a, b): + return a & b + + self.assertRaises(TypeError, tester, s, datetime(2005, 1, 1)) + + s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)]) + s[::2] = np.nan + + expected = Series(True,index=s.index) + expected[::2] = False + assert_series_equal(tester(s, list(s)), expected) + + d = DataFrame({'A': s}) + # TODO: Fix this exception - needs to be fixed! (see GH5035) + # (previously this was a TypeError because series returned + # NotImplemented + self.assertRaises(ValueError, tester, s, d) + + def test_idxmin(self): + # test idxmin + # _check_stat_op approach can not be used here because of isnull check. + + # add some NaNs + self.series[5:15] = np.NaN + + # skipna or no + self.assertEqual(self.series[self.series.idxmin()], self.series.min()) + self.assertTrue(isnull(self.series.idxmin(skipna=False))) + + # no NaNs + nona = self.series.dropna() + self.assertEqual(nona[nona.idxmin()], nona.min()) + self.assertEqual(nona.index.values.tolist().index(nona.idxmin()), + nona.values.argmin()) + + # all NaNs + allna = self.series * nan + self.assertTrue(isnull(allna.idxmin())) + + # datetime64[ns] + from pandas import date_range + s = Series(date_range('20130102', periods=6)) + result = s.idxmin() + self.assertEqual(result, 0) + + s[0] = np.nan + result = s.idxmin() + self.assertEqual(result, 1) + + def test_idxmax(self): + # test idxmax + # _check_stat_op approach can not be used here because of isnull check. 
+ + # add some NaNs + self.series[5:15] = np.NaN + + # skipna or no + self.assertEqual(self.series[self.series.idxmax()], self.series.max()) + self.assertTrue(isnull(self.series.idxmax(skipna=False))) + + # no NaNs + nona = self.series.dropna() + self.assertEqual(nona[nona.idxmax()], nona.max()) + self.assertEqual(nona.index.values.tolist().index(nona.idxmax()), + nona.values.argmax()) + + # all NaNs + allna = self.series * nan + self.assertTrue(isnull(allna.idxmax())) + + from pandas import date_range + s = Series(date_range('20130102', periods=6)) + result = s.idxmax() + self.assertEqual(result, 5) + + s[5] = np.nan + result = s.idxmax() + self.assertEqual(result, 4) + + # Float64Index + # GH 5914 + s = pd.Series([1,2,3],[1.1,2.1,3.1]) + result = s.idxmax() + self.assertEqual(result, 3.1) + result = s.idxmin() + self.assertEqual(result, 1.1) + + s = pd.Series(s.index, s.index) + result = s.idxmax() + self.assertEqual(result, 3.1) + result = s.idxmin() + self.assertEqual(result, 1.1) + + def test_ndarray_compat(self): + + # test numpy compat with Series as sub-class of NDFrame + tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], + index=date_range('1/1/2000', periods=1000)) + + def f(x): + return x[x.argmax()] + result = tsdf.apply(f) + expected = tsdf.max() + assert_series_equal(result,expected) + + # .item() + s = Series([1]) + result = s.item() + self.assertEqual(result, 1) + self.assertEqual(s.item(), s.iloc[0]) + + # using an ndarray like function + s = Series(np.random.randn(10)) + result = np.ones_like(s) + expected = Series(1,index=range(10),dtype='float64') + #assert_series_equal(result,expected) + + # ravel + s = Series(np.random.randn(10)) + tm.assert_almost_equal(s.ravel(order='F'),s.values.ravel(order='F')) + + # compress + # GH 6658 + s = Series([0,1.,-1],index=list('abc')) + result = np.compress(s>0,s) + assert_series_equal(result, Series([1.],index=['b'])) + + result = np.compress(s<-1,s) + assert_series_equal(result, Series([],dtype='float64')) + + def test_complexx(self): + + # GH4819 + # complex access for ndarray compat + a = np.arange(5) + b = Series(a + 4j*a) + tm.assert_almost_equal(a,b.real) + tm.assert_almost_equal(4*a,b.imag) + + b.real = np.arange(5)+5 + tm.assert_almost_equal(a+5,b.real) + tm.assert_almost_equal(4*a,b.imag) + + def test_underlying_data_conversion(self): + + # GH 4080 + df = DataFrame(dict((c, [1,2,3]) for c in ['a', 'b', 'c'])) + df.set_index(['a', 'b', 'c'], inplace=True) + s = Series([1], index=[(2,2,2)]) + df['val'] = 0 + df + df['val'].update(s) + + expected = DataFrame(dict(a = [1,2,3], b = [1,2,3], c = [1,2,3], val = [0,1,0])) + expected.set_index(['a', 'b', 'c'], inplace=True) + tm.assert_frame_equal(df,expected) + + # GH 3970 + df = DataFrame({ "aa":range(5), "bb":[2.2]*5}) + df["cc"] = 0.0 + ck = [True]*len(df) + df["bb"].iloc[0] = .13 + df_tmp = df.iloc[ck] + df["bb"].iloc[0] = .15 + self.assertEqual(df['bb'].iloc[0], 0.15) + + # GH 3217 + df = DataFrame(dict(a = [1,3], b = [np.nan, 2])) + df['c'] = np.nan + df['c'].update(pd.Series(['foo'],index=[0])) + + expected = DataFrame(dict(a = [1,3], b = [np.nan, 2], c = ['foo',np.nan])) + tm.assert_frame_equal(df,expected) + + def test_operators_corner(self): + series = self.ts + + empty = Series([], index=Index([])) + + result = series + empty + self.assertTrue(np.isnan(result).all()) + + result = empty + Series([], index=Index([])) + self.assertEqual(len(result), 0) + + # TODO: this returned NotImplemented earlier, what to do? 
+ # deltas = Series([timedelta(1)] * 5, index=np.arange(5)) + # sub_deltas = deltas[::2] + # deltas5 = deltas * 5 + # deltas = deltas + sub_deltas + + # float + int + int_ts = self.ts.astype(int)[:-5] + added = self.ts + int_ts + expected = self.ts.values[:-5] + int_ts.values + self.assert_numpy_array_equal(added[:-5], expected) + + def test_operators_reverse_object(self): + # GH 56 + arr = Series(np.random.randn(10), index=np.arange(10), + dtype=object) + + def _check_op(arr, op): + result = op(1., arr) + expected = op(1., arr.astype(float)) + assert_series_equal(result.astype(float), expected) + + _check_op(arr, operator.add) + _check_op(arr, operator.sub) + _check_op(arr, operator.mul) + _check_op(arr, operator.truediv) + _check_op(arr, operator.floordiv) + + def test_series_frame_radd_bug(self): + from pandas.util.testing import rands + import operator + + # GH 353 + vals = Series([rands(5) for _ in range(10)]) + result = 'foo_' + vals + expected = vals.map(lambda x: 'foo_' + x) + assert_series_equal(result, expected) + + frame = DataFrame({'vals': vals}) + result = 'foo_' + frame + expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)}) + tm.assert_frame_equal(result, expected) + + # really raise this time + self.assertRaises(TypeError, operator.add, datetime.now(), self.ts) + + def test_operators_frame(self): + # rpow does not work with DataFrame + df = DataFrame({'A': self.ts}) + + tm.assert_almost_equal(self.ts + self.ts, (self.ts + df)['A']) + tm.assert_almost_equal(self.ts ** self.ts, (self.ts ** df)['A']) + tm.assert_almost_equal(self.ts < self.ts, (self.ts < df)['A']) + tm.assert_almost_equal(self.ts / self.ts, (self.ts / df)['A']) + + def test_operators_combine(self): + def _check_fill(meth, op, a, b, fill_value=0): + exp_index = a.index.union(b.index) + a = a.reindex(exp_index) + b = b.reindex(exp_index) + + amask = isnull(a) + bmask = isnull(b) + + exp_values = [] + for i in range(len(exp_index)): + if amask[i]: + if bmask[i]: + exp_values.append(nan) + continue + exp_values.append(op(fill_value, b[i])) + elif bmask[i]: + if amask[i]: + exp_values.append(nan) + continue + exp_values.append(op(a[i], fill_value)) + else: + exp_values.append(op(a[i], b[i])) + + result = meth(a, b, fill_value=fill_value) + expected = Series(exp_values, exp_index) + assert_series_equal(result, expected) + + a = Series([nan, 1., 2., 3., nan], index=np.arange(5)) + b = Series([nan, 1, nan, 3, nan, 4.], index=np.arange(6)) + + pairings = [] + for op in ['add', 'sub', 'mul', 'pow', 'truediv', 'floordiv']: + fv = 0 + lop = getattr(Series, op) + lequiv = getattr(operator, op) + rop = getattr(Series, 'r' + op) + # bind op at definition time... 
+ requiv = lambda x, y, op=op: getattr(operator, op)(y, x) + pairings.append((lop, lequiv, fv)) + pairings.append((rop, requiv, fv)) + + if compat.PY3: + pairings.append((Series.div, operator.truediv, 1)) + pairings.append((Series.rdiv, lambda x, y: operator.truediv(y, x), 1)) + else: + pairings.append((Series.div, operator.div, 1)) + pairings.append((Series.rdiv, lambda x, y: operator.div(y, x), 1)) + + for op, equiv_op, fv in pairings: + result = op(a, b) + exp = equiv_op(a, b) + assert_series_equal(result, exp) + _check_fill(op, equiv_op, a, b, fill_value=fv) + # should accept axis=0 or axis='rows' + op(a, b, axis=0) + + def test_combine_first(self): + values = tm.makeIntIndex(20).values.astype(float) + series = Series(values, index=tm.makeIntIndex(20)) + + series_copy = series * 2 + series_copy[::2] = np.NaN + + # nothing used from the input + combined = series.combine_first(series_copy) + + self.assert_numpy_array_equal(combined, series) + + # Holes filled from input + combined = series_copy.combine_first(series) + self.assertTrue(np.isfinite(combined).all()) + + self.assert_numpy_array_equal(combined[::2], series[::2]) + self.assert_numpy_array_equal(combined[1::2], series_copy[1::2]) + + # mixed types + index = tm.makeStringIndex(20) + floats = Series(tm.randn(20), index=index) + strings = Series(tm.makeStringIndex(10), index=index[::2]) + + combined = strings.combine_first(floats) + + tm.assert_dict_equal(strings, combined, compare_keys=False) + tm.assert_dict_equal(floats[1::2], combined, compare_keys=False) + + # corner case + s = Series([1., 2, 3], index=[0, 1, 2]) + result = s.combine_first(Series([], index=[])) + assert_series_equal(s, result) + + def test_update(self): + s = Series([1.5, nan, 3., 4., nan]) + s2 = Series([nan, 3.5, nan, 5.]) + s.update(s2) + + expected = Series([1.5, 3.5, 3., 5., np.nan]) + assert_series_equal(s, expected) + + # GH 3217 + df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) + df['c'] = np.nan + + # this will fail as long as series is a sub-class of ndarray + # df['c'].update(Series(['foo'],index=[0])) ##### + + def test_corr(self): + tm._skip_if_no_scipy() + + import scipy.stats as stats + + # full overlap + self.assertAlmostEqual(self.ts.corr(self.ts), 1) + + # partial overlap + self.assertAlmostEqual(self.ts[:15].corr(self.ts[5:]), 1) + + self.assertTrue(isnull(self.ts[:15].corr(self.ts[5:], min_periods=12))) + + ts1 = self.ts[:15].reindex(self.ts.index) + ts2 = self.ts[5:].reindex(self.ts.index) + self.assertTrue(isnull(ts1.corr(ts2, min_periods=12))) + + # No overlap + self.assertTrue(np.isnan(self.ts[::2].corr(self.ts[1::2]))) + + # all NA + cp = self.ts[:10].copy() + cp[:] = np.nan + self.assertTrue(isnull(cp.corr(cp))) + + A = tm.makeTimeSeries() + B = tm.makeTimeSeries() + result = A.corr(B) + expected, _ = stats.pearsonr(A, B) + self.assertAlmostEqual(result, expected) + + def test_corr_rank(self): + tm._skip_if_no_scipy() + + import scipy + import scipy.stats as stats + + # kendall and spearman + A = tm.makeTimeSeries() + B = tm.makeTimeSeries() + A[-5:] = A[:5] + result = A.corr(B, method='kendall') + expected = stats.kendalltau(A, B)[0] + self.assertAlmostEqual(result, expected) + + result = A.corr(B, method='spearman') + expected = stats.spearmanr(A, B)[0] + self.assertAlmostEqual(result, expected) + + # these methods got rewritten in 0.8 + if scipy.__version__ < LooseVersion('0.9'): + raise nose.SkipTest("skipping corr rank because of scipy version " + "{0}".format(scipy.__version__)) + + # results from R + A = Series([-0.89926396, 
0.94209606, -1.03289164, -0.95445587, + 0.76910310, -0.06430576, -2.09704447, 0.40660407, + -0.89926396, 0.94209606]) + B = Series([-1.01270225, -0.62210117, -1.56895827, 0.59592943, + -0.01680292, 1.17258718, -1.06009347, -0.10222060, + -0.89076239, 0.89372375]) + kexp = 0.4319297 + sexp = 0.5853767 + self.assertAlmostEqual(A.corr(B, method='kendall'), kexp) + self.assertAlmostEqual(A.corr(B, method='spearman'), sexp) + + def test_cov(self): + # full overlap + self.assertAlmostEqual(self.ts.cov(self.ts), self.ts.std() ** 2) + + # partial overlap + self.assertAlmostEqual( + self.ts[:15].cov(self.ts[5:]), self.ts[5:15].std() ** 2) + + # No overlap + self.assertTrue(np.isnan(self.ts[::2].cov(self.ts[1::2]))) + + # all NA + cp = self.ts[:10].copy() + cp[:] = np.nan + self.assertTrue(isnull(cp.cov(cp))) + + # min_periods + self.assertTrue(isnull(self.ts[:15].cov(self.ts[5:], min_periods=12))) + + ts1 = self.ts[:15].reindex(self.ts.index) + ts2 = self.ts[5:].reindex(self.ts.index) + self.assertTrue(isnull(ts1.cov(ts2, min_periods=12))) + + def test_copy(self): + ts = self.ts.copy() + + ts[::2] = np.NaN + + # Did not modify original Series + self.assertFalse(np.isnan(self.ts[0])) + + def test_count(self): + self.assertEqual(self.ts.count(), len(self.ts)) + + self.ts[::2] = np.NaN + + self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum()) + + def test_dtype(self): + + self.assertEqual(self.ts.dtype, np.dtype('float64')) + self.assertEqual(self.ts.dtypes, np.dtype('float64')) + self.assertEqual(self.ts.ftype, 'float64:dense') + self.assertEqual(self.ts.ftypes, 'float64:dense') + assert_series_equal(self.ts.get_dtype_counts(),Series(1,['float64'])) + assert_series_equal(self.ts.get_ftype_counts(),Series(1,['float64:dense'])) + + def test_dot(self): + a = Series(np.random.randn(4), index=['p', 'q', 'r', 's']) + b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'], + columns=['p', 'q', 'r', 's']).T + + result = a.dot(b) + expected = Series(np.dot(a.values, b.values), + index=['1', '2', '3']) + assert_series_equal(result, expected) + + # Check index alignment + b2 = b.reindex(index=reversed(b.index)) + result = a.dot(b) + assert_series_equal(result, expected) + + # Check ndarray argument + result = a.dot(b.values) + self.assertTrue(np.all(result == expected.values)) + assert_almost_equal(a.dot(b['2'].values), expected['2']) + + # Check series argument + assert_almost_equal(a.dot(b['1']), expected['1']) + assert_almost_equal(a.dot(b2['1']), expected['1']) + + self.assertRaises(Exception, a.dot, a.values[:3]) + self.assertRaises(ValueError, a.dot, b.T) + + def test_value_counts_nunique(self): + + # basics.rst doc example + series = Series(np.random.randn(500)) + series[20:500] = np.nan + series[10:20] = 5000 + result = series.nunique() + self.assertEqual(result, 11) + + def test_unique(self): + + # 714 also, dtype=float + s = Series([1.2345] * 100) + s[::2] = np.nan + result = s.unique() + self.assertEqual(len(result), 2) + + s = Series([1.2345] * 100, dtype='f4') + s[::2] = np.nan + result = s.unique() + self.assertEqual(len(result), 2) + + # NAs in object arrays #714 + s = Series(['foo'] * 100, dtype='O') + s[::2] = np.nan + result = s.unique() + self.assertEqual(len(result), 2) + + # decision about None + s = Series([1, 2, 3, None, None, None], dtype=object) + result = s.unique() + expected = np.array([1, 2, 3, None], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_dropna_empty(self): + s = Series([]) + self.assertEqual(len(s.dropna()), 0) + 
s.dropna(inplace=True) + self.assertEqual(len(s), 0) + + # invalid axis + self.assertRaises(ValueError, s.dropna, axis=1) + + def test_axis_alias(self): + s = Series([1, 2, np.nan]) + assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index')) + self.assertEqual(s.dropna().sum('rows'), 3) + self.assertEqual(s._get_axis_number('rows'), 0) + self.assertEqual(s._get_axis_name('rows'), 'index') + + def test_drop_duplicates(self): + s = Series([1, 2, 3, 3]) + + result = s.duplicated() + expected = Series([False, False, False, True]) + assert_series_equal(result, expected) + + result = s.duplicated(take_last=True) + expected = Series([False, False, True, False]) + assert_series_equal(result, expected) + + result = s.drop_duplicates() + expected = s[[True, True, True, False]] + assert_series_equal(result, expected) + sc = s.copy() + sc.drop_duplicates(inplace=True) + assert_series_equal(sc, expected) + + result = s.drop_duplicates(take_last=True) + expected = s[[True, True, False, True]] + assert_series_equal(result, expected) + sc = s.copy() + sc.drop_duplicates(take_last=True, inplace=True) + assert_series_equal(sc, expected) + + def test_sort(self): + ts = self.ts.copy() + ts.sort() + + self.assert_numpy_array_equal(ts, self.ts.order()) + self.assert_numpy_array_equal(ts.index, self.ts.order().index) + + ts.sort(ascending=False) + self.assert_numpy_array_equal(ts, self.ts.order(ascending=False)) + self.assert_numpy_array_equal(ts.index, + self.ts.order(ascending=False).index) + + # GH 5856/5853 + # Series.sort operating on a view + df = DataFrame(np.random.randn(10,4)) + s = df.iloc[:,0] + def f(): + s.sort() + self.assertRaises(ValueError, f) + + # test order/sort inplace + # GH6859 + ts1 = self.ts.copy() + ts1.sort(ascending=False) + ts2 = self.ts.copy() + ts2.order(ascending=False,inplace=True) + assert_series_equal(ts1,ts2) + + ts1 = self.ts.copy() + ts1 = ts1.sort(ascending=False,inplace=False) + ts2 = self.ts.copy() + ts2 = ts.order(ascending=False) + assert_series_equal(ts1,ts2) + + def test_sort_index(self): + import random + + rindex = list(self.ts.index) + random.shuffle(rindex) + + random_order = self.ts.reindex(rindex) + sorted_series = random_order.sort_index() + assert_series_equal(sorted_series, self.ts) + + # descending + sorted_series = random_order.sort_index(ascending=False) + assert_series_equal(sorted_series, + self.ts.reindex(self.ts.index[::-1])) + + def test_order(self): + ts = self.ts.copy() + ts[:5] = np.NaN + vals = ts.values + + result = ts.order() + self.assertTrue(np.isnan(result[-5:]).all()) + self.assert_numpy_array_equal(result[:-5], np.sort(vals[5:])) + + result = ts.order(na_position='first') + self.assertTrue(np.isnan(result[:5]).all()) + self.assert_numpy_array_equal(result[5:], np.sort(vals[5:])) + + # something object-type + ser = Series(['A', 'B'], [1, 2]) + # no failure + ser.order() + + # ascending=False + ordered = ts.order(ascending=False) + expected = np.sort(ts.valid().values)[::-1] + assert_almost_equal(expected, ordered.valid().values) + ordered = ts.order(ascending=False, na_position='first') + assert_almost_equal(expected, ordered.valid().values) + + def test_nsmallest_nlargest(self): + # float, int, datetime64 (use i8), timedelts64 (same), + # object that are numbers, object that are strings + + base = [3, 2, 1, 2, 5] + + s_list = [ + Series(base, dtype='int8'), + Series(base, dtype='int16'), + Series(base, dtype='int32'), + Series(base, dtype='int64'), + Series(base, dtype='float32'), + Series(base, dtype='float64'), + Series(base, 
dtype='uint8'), + Series(base, dtype='uint16'), + Series(base, dtype='uint32'), + Series(base, dtype='uint64'), + Series(base).astype('timedelta64[ns]'), + Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005'])), + ] + + raising = [ + Series([3., 2, 1, 2, '5'], dtype='object'), + Series([3., 2, 1, 2, 5], dtype='object'), + # not supported on some archs + # Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3., 2, 1, 2, 5], dtype='complex128'), + ] + + for r in raising: + dt = r.dtype + msg = "Cannot use method 'n(larg|small)est' with dtype %s" % dt + args = 2, len(r), 0, -1 + methods = r.nlargest, r.nsmallest + for method, arg in product(methods, args): + with tm.assertRaisesRegexp(TypeError, msg): + method(arg) + + for s in s_list: + + assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) + assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]]) + + assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]]) + assert_series_equal(s.nlargest(3, take_last=True), + s.iloc[[4, 0, 3]]) + + empty = s.iloc[0:0] + assert_series_equal(s.nsmallest(0), empty) + assert_series_equal(s.nsmallest(-1), empty) + assert_series_equal(s.nlargest(0), empty) + assert_series_equal(s.nlargest(-1), empty) + + assert_series_equal(s.nsmallest(len(s)), s.order()) + assert_series_equal(s.nsmallest(len(s) + 1), s.order()) + assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), + s.iloc[[4, 0, 1, 3, 2]]) + + s = Series([3., np.nan, 1, 2, 5]) + assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) + assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) + + def test_rank(self): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + self.ts[::2] = np.nan + self.ts[:10][::3] = 4. + + ranks = self.ts.rank() + oranks = self.ts.astype('O').rank() + + assert_series_equal(ranks, oranks) + + mask = np.isnan(self.ts) + filled = self.ts.fillna(np.inf) + + # rankdata returns a ndarray + exp = Series(rankdata(filled),index=filled.index) + exp[mask] = np.nan + + assert_almost_equal(ranks, exp) + + iseries = Series(np.arange(5).repeat(2)) + + iranks = iseries.rank() + exp = iseries.astype(float).rank() + assert_series_equal(iranks, exp) + iseries = Series(np.arange(5)) + 1.0 + exp = iseries / 5.0 + iranks = iseries.rank(pct=True) + + assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(1, 100)) + exp = Series(np.repeat(0.505, 100)) + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries[1] = np.nan + exp = Series(np.repeat(50.0 / 99.0, 100)) + exp[1] = np.nan + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.arange(5)) + 1.0 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.repeat(np.nan, 100)) + exp = iseries.copy() + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series(np.arange(5)) + 1 + iseries[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + rng = date_range('1/1/1990', periods=5) + iseries = Series(np.arange(5), rng) + 1 + iseries.ix[4] = np.nan + exp = iseries / 4.0 + iranks = iseries.rank(pct=True) + assert_series_equal(iranks, exp) + + iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1]) + exp = Series([2, 1, 3.5, 5, 3.5, 6]) + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + values = np.array([-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40], 
dtype='float64') + random_order = np.random.permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(random_order + 1.0, dtype='float64') + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + def test_rank_inf(self): + raise nose.SkipTest('DataFrame.rank does not currently rank np.inf and -np.inf properly') + + values = np.array([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40, np.inf], dtype='float64') + random_order = np.random.permutation(len(values)) + iseries = Series(values[random_order]) + exp = Series(random_order + 1.0, dtype='float64') + iranks = iseries.rank() + assert_series_equal(iranks, exp) + + + def test_from_csv(self): + + with ensure_clean() as path: + self.ts.to_csv(path) + ts = Series.from_csv(path) + assert_series_equal(self.ts, ts) + self.assertTrue(ts.index.name is None) + + self.series.to_csv(path) + series = Series.from_csv(path) + self.assertIsNone(series.name) + self.assertIsNone(series.index.name) + assert_series_equal(self.series, series) + + outfile = open(path, 'w') + outfile.write('1998-01-01|1.0\n1999-01-01|2.0') + outfile.close() + series = Series.from_csv(path, sep='|') + checkseries = Series( + {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0}) + assert_series_equal(checkseries, series) + + series = Series.from_csv(path, sep='|', parse_dates=False) + checkseries = Series({'1998-01-01': 1.0, '1999-01-01': 2.0}) + assert_series_equal(checkseries, series) + + def test_to_csv(self): + import io + + with ensure_clean() as path: + self.ts.to_csv(path) + + lines = io.open(path, newline=None).readlines() + assert(lines[1] != '\n') + + self.ts.to_csv(path, index=False) + arr = np.loadtxt(path) + assert_almost_equal(arr, self.ts.values) + + def test_to_csv_unicode_index(self): + buf = StringIO() + s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")]) + + s.to_csv(buf, encoding='UTF-8') + buf.seek(0) + + s2 = Series.from_csv(buf, index_col=0, encoding='UTF-8') + + assert_series_equal(s, s2) + + def test_tolist(self): + rs = self.ts.tolist() + xp = self.ts.values.tolist() + assert_almost_equal(rs, xp) + + # datetime64 + s = Series(self.ts.index) + rs = s.tolist() + self.assertEqual(self.ts.index[0], rs[0]) + + def test_to_frame(self): + self.ts.name = None + rs = self.ts.to_frame() + xp = pd.DataFrame(self.ts.values, index=self.ts.index) + assert_frame_equal(rs, xp) + + self.ts.name = 'testname' + rs = self.ts.to_frame() + xp = pd.DataFrame(dict(testname=self.ts.values), index=self.ts.index) + assert_frame_equal(rs, xp) + + rs = self.ts.to_frame(name='testdifferent') + xp = pd.DataFrame(dict(testdifferent=self.ts.values), index=self.ts.index) + assert_frame_equal(rs, xp) + + def test_to_dict(self): + self.assert_numpy_array_equal(Series(self.ts.to_dict()), self.ts) + + def test_to_csv_float_format(self): + + with ensure_clean() as filename: + ser = Series([0.123456, 0.234567, 0.567567]) + ser.to_csv(filename, float_format='%.2f') + + rs = Series.from_csv(filename) + xp = Series([0.12, 0.23, 0.57]) + assert_series_equal(rs, xp) + + def test_to_csv_list_entries(self): + s = Series(['jack and jill', 'jesse and frank']) + + split = s.str.split(r'\s+and\s+') + + buf = StringIO() + split.to_csv(buf) + + def test_clip(self): + val = self.ts.median() + + self.assertEqual(self.ts.clip_lower(val).min(), val) + self.assertEqual(self.ts.clip_upper(val).max(), val) + + self.assertEqual(self.ts.clip(lower=val).min(), val) + self.assertEqual(self.ts.clip(upper=val).max(), val) + + result = 
self.ts.clip(-0.5, 0.5) + expected = np.clip(self.ts, -0.5, 0.5) + assert_series_equal(result, expected) + tm.assert_isinstance(expected, Series) + + def test_clip_types_and_nulls(self): + + sers = [Series([np.nan, 1.0, 2.0, 3.0]), + Series([None, 'a', 'b', 'c']), + Series(pd.to_datetime([np.nan, 1, 2, 3], unit='D'))] + + for s in sers: + thresh = s[2] + l = s.clip_lower(thresh) + u = s.clip_upper(thresh) + self.assertEqual(l[notnull(l)].min(), thresh) + self.assertEqual(u[notnull(u)].max(), thresh) + self.assertEqual(list(isnull(s)), list(isnull(l))) + self.assertEqual(list(isnull(s)), list(isnull(u))) + + def test_valid(self): + ts = self.ts.copy() + ts[::2] = np.NaN + + result = ts.valid() + self.assertEqual(len(result), ts.count()) + + tm.assert_dict_equal(result, ts, compare_keys=False) + + def test_isnull(self): + ser = Series([0, 5.4, 3, nan, -0.001]) + np.array_equal( + ser.isnull(), Series([False, False, False, True, False]).values) + ser = Series(["hi", "", nan]) + np.array_equal(ser.isnull(), Series([False, False, True]).values) + + def test_notnull(self): + ser = Series([0, 5.4, 3, nan, -0.001]) + np.array_equal( + ser.notnull(), Series([True, True, True, False, True]).values) + ser = Series(["hi", "", nan]) + np.array_equal(ser.notnull(), Series([True, True, False]).values) + + def test_shift(self): + shifted = self.ts.shift(1) + unshifted = shifted.shift(-1) + + tm.assert_dict_equal(unshifted.valid(), self.ts, compare_keys=False) + + offset = datetools.bday + shifted = self.ts.shift(1, freq=offset) + unshifted = shifted.shift(-1, freq=offset) + + assert_series_equal(unshifted, self.ts) + + unshifted = self.ts.shift(0, freq=offset) + assert_series_equal(unshifted, self.ts) + + shifted = self.ts.shift(1, freq='B') + unshifted = shifted.shift(-1, freq='B') + + assert_series_equal(unshifted, self.ts) + + # corner case + unshifted = self.ts.shift(0) + assert_series_equal(unshifted, self.ts) + + # Shifting with PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + tm.assert_dict_equal(unshifted.valid(), ps, compare_keys=False) + + shifted2 = ps.shift(1, 'B') + shifted3 = ps.shift(1, datetools.bday) + assert_series_equal(shifted2, shifted3) + assert_series_equal(ps, shifted2.shift(-1, 'B')) + + self.assertRaises(ValueError, ps.shift, freq='D') + + # legacy support + shifted4 = ps.shift(1, timeRule='B') + assert_series_equal(shifted2, shifted4) + + shifted5 = ps.shift(1, offset=datetools.bday) + assert_series_equal(shifted5, shifted4) + + def test_tshift(self): + # PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + assert_series_equal(unshifted, ps) + + shifted2 = ps.tshift(freq='B') + assert_series_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=datetools.bday) + assert_series_equal(shifted, shifted3) + + self.assertRaises(ValueError, ps.tshift, freq='M') + + # DatetimeIndex + shifted = self.ts.tshift(1) + unshifted = shifted.tshift(-1) + + assert_series_equal(self.ts, unshifted) + + shifted2 = self.ts.tshift(freq=self.ts.index.freq) + assert_series_equal(shifted, shifted2) + + inferred_ts = Series(self.ts.values, Index(np.asarray(self.ts.index))) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + assert_series_equal(shifted, self.ts.tshift(1)) + assert_series_equal(unshifted, inferred_ts) + + no_freq = self.ts[[0, 5, 7]] + self.assertRaises(ValueError, no_freq.tshift) + + def test_shift_int(self): + ts = self.ts.astype(int) + shifted = ts.shift(1) + expected = 
ts.astype(float).shift(1) + assert_series_equal(shifted, expected) + + def test_truncate(self): + offset = datetools.bday + + ts = self.ts[::3] + + start, end = self.ts.index[3], self.ts.index[6] + start_missing, end_missing = self.ts.index[2], self.ts.index[7] + + # neither specified + truncated = ts.truncate() + assert_series_equal(truncated, ts) + + # both specified + expected = ts[1:3] + + truncated = ts.truncate(start, end) + assert_series_equal(truncated, expected) + + truncated = ts.truncate(start_missing, end_missing) + assert_series_equal(truncated, expected) + + # start specified + expected = ts[1:] + + truncated = ts.truncate(before=start) + assert_series_equal(truncated, expected) + + truncated = ts.truncate(before=start_missing) + assert_series_equal(truncated, expected) + + # end specified + expected = ts[:3] + + truncated = ts.truncate(after=end) + assert_series_equal(truncated, expected) + + truncated = ts.truncate(after=end_missing) + assert_series_equal(truncated, expected) + + # corner case, empty series returned + truncated = ts.truncate(after=self.ts.index[0] - offset) + assert(len(truncated) == 0) + + truncated = ts.truncate(before=self.ts.index[-1] + offset) + assert(len(truncated) == 0) + + self.assertRaises(ValueError, ts.truncate, + before=self.ts.index[-1] + offset, + after=self.ts.index[0] - offset) + + def test_ptp(self): + N = 1000 + arr = np.random.randn(N) + ser = Series(arr) + self.assertEqual(np.ptp(ser), np.ptp(arr)) + + def test_asof(self): + # array or list or dates + N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='25s') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + val = result[result.index[result.index >= ub][0]] + self.assertEqual(ts[ub], val) + + self.ts[5:10] = np.NaN + self.ts[15:20] = np.NaN + + val1 = self.ts.asof(self.ts.index[7]) + val2 = self.ts.asof(self.ts.index[19]) + + self.assertEqual(val1, self.ts[4]) + self.assertEqual(val2, self.ts[14]) + + # accepts strings + val1 = self.ts.asof(str(self.ts.index[7])) + self.assertEqual(val1, self.ts[4]) + + # in there + self.assertEqual(self.ts.asof(self.ts.index[3]), self.ts[3]) + + # no as of value + d = self.ts.index[0] - datetools.bday + self.assertTrue(np.isnan(self.ts.asof(d))) + + def test_getitem_setitem_datetimeindex(self): + from pandas import date_range + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04:00:00"] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00"] = 0 + result["1990-01-01 04:00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0 + result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8] + assert_series_equal(result, ts) + + lb = "1990-01-01 04:00:00" + rb = "1990-01-01 07:00:00" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + 
assert_series_equal(result, expected) + + # repeat all the above with naive datetimes + result = ts[datetime(1990, 1, 1, 4)] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4)] = 0 + result[datetime(1990, 1, 1, 4)] = ts[4] + assert_series_equal(result, ts) + + result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 + result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] + assert_series_equal(result, ts) + + lb = datetime(1990, 1, 1, 4) + rb = datetime(1990, 1, 1, 7) + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts[ts.index[4]] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts[ts.index[4:8]] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + assert_series_equal(result, ts) + + # also test partial date slicing + result = ts["1990-01-02"] + expected = ts[24:48] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-02"] = 0 + result["1990-01-02"] = ts[24:48] + assert_series_equal(result, ts) + + def test_getitem_setitem_datetime_tz_pytz(self): + tm._skip_if_no_pytz(); + from pytz import timezone as tz + + from pandas import date_range + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + + # comparison dates with datetime MUST be localized! 
+ date = tz('US/Central').localize(datetime(1990, 1, 1, 3)) + result[date] = 0 + result[date] = ts[4] + assert_series_equal(result, ts) + + + def test_getitem_setitem_datetime_tz_dateutil(self): + tm._skip_if_no_dateutil(); + from dateutil.tz import gettz, tzutc + tz = lambda x: tzutc() if x == 'UTC' else gettz(x) # handle special case for utc in dateutil + + from pandas import date_range + N = 50 + # testing with timezone, GH #2785 + rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(N), index=rng) + + # also test Timestamp tz handling, GH #2789 + result = ts.copy() + result["1990-01-01 09:00:00+00:00"] = 0 + result["1990-01-01 09:00:00+00:00"] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result["1990-01-01 03:00:00-06:00"] = 0 + result["1990-01-01 03:00:00-06:00"] = ts[4] + assert_series_equal(result, ts) + + # repeat with datetimes + result = ts.copy() + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + assert_series_equal(result, ts) + + result = ts.copy() + result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = ts[4] + assert_series_equal(result, ts) + + def test_getitem_setitem_periodindex(self): + from pandas import period_range + N = 50 + rng = period_range('1/1/1990', periods=N, freq='H') + ts = Series(np.random.randn(N), index=rng) + + result = ts["1990-01-01 04"] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts.copy() + result["1990-01-01 04"] = 0 + result["1990-01-01 04"] = ts[4] + assert_series_equal(result, ts) + + result = ts["1990-01-01 04":"1990-01-01 07"] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result["1990-01-01 04":"1990-01-01 07"] = 0 + result["1990-01-01 04":"1990-01-01 07"] = ts[4:8] + assert_series_equal(result, ts) + + lb = "1990-01-01 04" + rb = "1990-01-01 07" + result = ts[(ts.index >= lb) & (ts.index <= rb)] + expected = ts[4:8] + assert_series_equal(result, expected) + + # GH 2782 + result = ts[ts.index[4]] + expected = ts[4] + self.assertEqual(result, expected) + + result = ts[ts.index[4:8]] + expected = ts[4:8] + assert_series_equal(result, expected) + + result = ts.copy() + result[ts.index[4:8]] = 0 + result[4:8] = ts[4:8] + assert_series_equal(result, ts) + + def test_asof_periodindex(self): + from pandas import period_range, PeriodIndex + # array or list or dates + N = 50 + rng = period_range('1/1/1990', periods=N, freq='H') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='37min') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + pix = PeriodIndex(result.index.values, freq='H') + mask = (pix >= lb) & (pix < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + ts[5:10] = np.NaN + ts[15:20] = np.NaN + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + self.assertEqual(val1, ts[4]) + self.assertEqual(val2, ts[14]) + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + self.assertEqual(val1, ts[4]) + + # in there + self.assertEqual(ts.asof(ts.index[3]), ts[3]) + + # no as of value + d = ts.index[0].to_timestamp() - datetools.bday + self.assertTrue(np.isnan(ts.asof(d))) + + def test_asof_more(self): + from pandas 
import date_range + s = Series([nan, nan, 1, 2, nan, nan, 3, 4, 5], + index=date_range('1/1/2000', periods=9)) + + dates = s.index[[4, 5, 6, 2, 1]] + + result = s.asof(dates) + expected = Series([2, 2, 3, 1, np.nan], index=dates) + + assert_series_equal(result, expected) + + s = Series([1.5, 2.5, 1, 2, nan, nan, 3, 4, 5], + index=date_range('1/1/2000', periods=9)) + result = s.asof(s.index[0]) + self.assertEqual(result, s[0]) + + def test_cast_on_putmask(self): + + # GH 2746 + + # need to upcast + s = Series([1, 2], index=[1, 2], dtype='int64') + s[[True, False]] = Series([0], index=[1], dtype='int64') + expected = Series([0, 2], index=[1, 2], dtype='int64') + + assert_series_equal(s, expected) + + def test_astype_cast_nan_int(self): + df = Series([1.0, 2.0, 3.0, np.nan]) + self.assertRaises(ValueError, df.astype, np.int64) + + def test_astype_cast_object_int(self): + arr = Series(["car", "house", "tree", "1"]) + + self.assertRaises(ValueError, arr.astype, int) + self.assertRaises(ValueError, arr.astype, np.int64) + self.assertRaises(ValueError, arr.astype, np.int8) + + arr = Series(['1', '2', '3', '4'], dtype=object) + result = arr.astype(int) + self.assert_numpy_array_equal(result, np.arange(1, 5)) + + def test_astype_datetimes(self): + import pandas.tslib as tslib + + s = Series(tslib.iNaT, dtype='M8[ns]', index=lrange(5)) + s = s.astype('O') + self.assertEqual(s.dtype, np.object_) + + s = Series([datetime(2001, 1, 2, 0, 0)]) + s = s.astype('O') + self.assertEqual(s.dtype, np.object_) + + s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)]) + s[1] = np.nan + self.assertEqual(s.dtype, 'M8[ns]') + s = s.astype('O') + self.assertEqual(s.dtype, np.object_) + + def test_astype_str(self): + # GH4405 + digits = string.digits + s1 = Series([digits * 10, tm.rands(63), tm.rands(64), + tm.rands(1000)]) + s2 = Series([digits * 10, tm.rands(63), tm.rands(64), nan, 1.0]) + types = (compat.text_type,) + (np.str_, np.unicode_) + for typ in types: + for s in (s1, s2): + res = s.astype(typ) + expec = s.map(compat.text_type) + assert_series_equal(res, expec) + + def test_map(self): + index, data = tm.getMixedTypeDict() + + source = Series(data['B'], index=data['C']) + target = Series(data['C'][:4], index=data['D'][:4]) + + merged = target.map(source) + + for k, v in compat.iteritems(merged): + self.assertEqual(v, source[target[k]]) + + # input could be a dict + merged = target.map(source.to_dict()) + + for k, v in compat.iteritems(merged): + self.assertEqual(v, source[target[k]]) + + # function + result = self.ts.map(lambda x: x * 2) + self.assert_numpy_array_equal(result, self.ts * 2) + + def test_map_int(self): + left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4}) + right = Series({1: 11, 2: 22, 3: 33}) + + self.assertEqual(left.dtype, np.float_) + self.assertTrue(issubclass(right.dtype.type, np.integer)) + + merged = left.map(right) + self.assertEqual(merged.dtype, np.float_) + self.assertTrue(isnull(merged['d'])) + self.assertTrue(not isnull(merged['c'])) + + def test_map_type_inference(self): + s = Series(lrange(3)) + s2 = s.map(lambda x: np.where(x == 0, 0, 1)) + self.assertTrue(issubclass(s2.dtype.type, np.integer)) + + def test_map_decimal(self): + from decimal import Decimal + + result = self.series.map(lambda x: Decimal(str(x))) + self.assertEqual(result.dtype, np.object_) + tm.assert_isinstance(result[0], Decimal) + + def test_map_na_exclusion(self): + s = Series([1.5, np.nan, 3, np.nan, 5]) + + result = s.map(lambda x: x * 2, na_action='ignore') + exp = s * 2 + assert_series_equal(result, 
exp) + + def test_map_dict_with_tuple_keys(self): + ''' + Due to new MultiIndex-ing behaviour in v0.14.0, + dicts with tuple keys passed to map were being + converted to a multi-index, preventing tuple values + from being mapped properly. + ''' + df = pd.DataFrame({'a': [(1,), (2,), (3, 4), (5, 6)]}) + label_mappings = { + (1,): 'A', + (2,): 'B', + (3, 4): 'A', + (5, 6): 'B' + } + df['labels'] = df['a'].map(label_mappings) + df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) + # All labels should be filled now + tm.assert_series_equal(df['labels'], df['expected_labels']) + + def test_apply(self): + assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) + + # elementwise-apply + import math + assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) + + # how to handle Series result, #2316 + result = self.ts.apply(lambda x: Series([x, x ** 2], + index=['x', 'x^2'])) + expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) + tm.assert_frame_equal(result, expected) + + # empty series + s = Series() + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) + + # index but no data + s = Series(index=[1, 2, 3]) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) + + def test_apply_same_length_inference_bug(self): + s = Series([1, 2]) + f = lambda x: (x, x + 1) + + result = s.apply(f) + expected = s.map(f) + assert_series_equal(result, expected) + + s = Series([1, 2, 3]) + result = s.apply(f) + expected = s.map(f) + assert_series_equal(result, expected) + + def test_apply_dont_convert_dtype(self): + s = Series(np.random.randn(10)) + + f = lambda x: x if x > 0 else np.nan + result = s.apply(f, convert_dtype=False) + self.assertEqual(result.dtype, object) + + def test_convert_objects(self): + + s = Series([1., 2, 3], index=['a', 'b', 'c']) + result = s.convert_objects(convert_dates=False, convert_numeric=True) + assert_series_equal(result, s) + + # force numeric conversion + r = s.copy().astype('O') + r['a'] = '1' + result = r.convert_objects(convert_dates=False, convert_numeric=True) + assert_series_equal(result, s) + + r = s.copy().astype('O') + r['a'] = '1.' 
+ result = r.convert_objects(convert_dates=False, convert_numeric=True) + assert_series_equal(result, s) + + r = s.copy().astype('O') + r['a'] = 'garbled' + expected = s.copy() + expected['a'] = np.nan + result = r.convert_objects(convert_dates=False, convert_numeric=True) + assert_series_equal(result, expected) + + # GH 4119, not converting a mixed type (e.g.floats and object) + s = Series([1, 'na', 3, 4]) + result = s.convert_objects(convert_numeric=True) + expected = Series([1, np.nan, 3, 4]) + assert_series_equal(result, expected) + + s = Series([1, '', 3, 4]) + result = s.convert_objects(convert_numeric=True) + expected = Series([1, np.nan, 3, 4]) + assert_series_equal(result, expected) + + # dates + s = Series( + [datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), datetime(2001, 1, 3, 0, 0)]) + s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), datetime( + 2001, 1, 3, 0, 0), 'foo', 1.0, 1, Timestamp('20010104'), '20010105'], dtype='O') + + result = s.convert_objects(convert_dates=True, convert_numeric=False) + expected = Series( + [Timestamp('20010101'), Timestamp('20010102'), Timestamp('20010103')], dtype='M8[ns]') + assert_series_equal(result, expected) + + result = s.convert_objects( + convert_dates='coerce', convert_numeric=False) + result = s.convert_objects( + convert_dates='coerce', convert_numeric=True) + assert_series_equal(result, expected) + + expected = Series( + [Timestamp( + '20010101'), Timestamp('20010102'), Timestamp('20010103'), + lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]') + result = s2.convert_objects( + convert_dates='coerce', convert_numeric=False) + assert_series_equal(result, expected) + result = s2.convert_objects( + convert_dates='coerce', convert_numeric=True) + assert_series_equal(result, expected) + + # preserver all-nans (if convert_dates='coerce') + s = Series(['foo', 'bar', 1, 1.0], dtype='O') + result = s.convert_objects( + convert_dates='coerce', convert_numeric=False) + assert_series_equal(result, s) + + # preserver if non-object + s = Series([1], dtype='float32') + result = s.convert_objects( + convert_dates='coerce', convert_numeric=False) + assert_series_equal(result, s) + + #r = s.copy() + #r[0] = np.nan + #result = r.convert_objects(convert_dates=True,convert_numeric=False) + #self.assertEqual(result.dtype, 'M8[ns]') + + # dateutil parses some single letters into today's value as a date + for x in 'abcdefghijklmnopqrstuvwxyz': + s = Series([x]) + result = s.convert_objects(convert_dates='coerce') + assert_series_equal(result, s) + s = Series([x.upper()]) + result = s.convert_objects(convert_dates='coerce') + assert_series_equal(result, s) + + def test_convert_objects_preserve_bool(self): + s = Series([1, True, 3, 5], dtype=object) + r = s.convert_objects(convert_numeric=True) + e = Series([1, 1, 3, 5], dtype='i8') + tm.assert_series_equal(r, e) + + def test_convert_objects_preserve_all_bool(self): + s = Series([False, True, False, False], dtype=object) + r = s.convert_objects(convert_numeric=True) + e = Series([False, True, False, False], dtype=bool) + tm.assert_series_equal(r, e) + + def test_apply_args(self): + s = Series(['foo,bar']) + + result = s.apply(str.split, args=(',',)) + self.assertEqual(result[0], ['foo', 'bar']) + tm.assert_isinstance(result[0], list) + + def test_align(self): + def _check_align(a, b, how='left', fill=None): + aa, ab = a.align(b, join=how, fill_value=fill) + + join_index = a.index.join(b.index, how=how) + if fill is not None: + diff_a = 
aa.index.diff(join_index) + diff_b = ab.index.diff(join_index) + if len(diff_a) > 0: + self.assertTrue((aa.reindex(diff_a) == fill).all()) + if len(diff_b) > 0: + self.assertTrue((ab.reindex(diff_b) == fill).all()) + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) + + for kind in JOIN_TYPES: + _check_align(self.ts[2:], self.ts[:-5], how=kind) + _check_align(self.ts[2:], self.ts[:-5], how=kind, fill=-1) + + # empty left + _check_align(self.ts[:0], self.ts[:-5], how=kind) + + # empty right + _check_align(self.ts[:-5], self.ts[:0], how=kind) + + # both empty + _check_align(self.ts[:0], self.ts[:0], how=kind) + + def test_align_fill_method(self): + def _check_align(a, b, how='left', method='pad', limit=None): + aa, ab = a.align(b, join=how, method=method, limit=limit) + + join_index = a.index.join(b.index, how=how) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) + + for kind in JOIN_TYPES: + for meth in ['pad', 'bfill']: + _check_align(self.ts[2:], self.ts[:-5], how=kind, method=meth) + _check_align(self.ts[2:], self.ts[:-5], how=kind, + method=meth, limit=1) + + # empty left + _check_align(self.ts[:0], self.ts[:-5], how=kind, method=meth) + _check_align(self.ts[:0], self.ts[:-5], how=kind, method=meth, + limit=1) + + # empty right + _check_align(self.ts[:-5], self.ts[:0], how=kind, method=meth) + _check_align(self.ts[:-5], self.ts[:0], how=kind, method=meth, + limit=1) + + # both empty + _check_align(self.ts[:0], self.ts[:0], how=kind, method=meth) + _check_align(self.ts[:0], self.ts[:0], how=kind, method=meth, + limit=1) + + def test_align_nocopy(self): + b = self.ts[:5].copy() + + # do copy + a = self.ts.copy() + ra, _ = a.align(b, join='left') + ra[:5] = 5 + self.assertFalse((a[:5] == 5).any()) + + # do not copy + a = self.ts.copy() + ra, _ = a.align(b, join='left', copy=False) + ra[:5] = 5 + self.assertTrue((a[:5] == 5).all()) + + # do copy + a = self.ts.copy() + b = self.ts[:5].copy() + _, rb = a.align(b, join='right') + rb[:3] = 5 + self.assertFalse((b[:3] == 5).any()) + + # do not copy + a = self.ts.copy() + b = self.ts[:5].copy() + _, rb = a.align(b, join='right', copy=False) + rb[:2] = 5 + self.assertTrue((b[:2] == 5).all()) + + def test_align_sameindex(self): + a, b = self.ts.align(self.ts, copy=False) + self.assertIs(a.index, self.ts.index) + self.assertIs(b.index, self.ts.index) + + # a, b = self.ts.align(self.ts, copy=True) + # self.assertIsNot(a.index, self.ts.index) + # self.assertIsNot(b.index, self.ts.index) + + def test_reindex(self): + identity = self.series.reindex(self.series.index) + self.assertTrue(np.may_share_memory(self.series.index, identity.index)) + self.assertTrue(identity.index.is_(self.series.index)) + + subIndex = self.series.index[10:20] + subSeries = self.series.reindex(subIndex) + + for idx, val in compat.iteritems(subSeries): + self.assertEqual(val, self.series[idx]) + + subIndex2 = self.ts.index[10:20] + subTS = self.ts.reindex(subIndex2) + + for idx, val in compat.iteritems(subTS): + self.assertEqual(val, self.ts[idx]) + stuffSeries = self.ts.reindex(subIndex) + + self.assertTrue(np.isnan(stuffSeries).all()) + + # This is extremely important for the Cython code to not screw up + nonContigIndex = self.ts.index[::2] + subNonContig = 
self.ts.reindex(nonContigIndex) + for idx, val in compat.iteritems(subNonContig): + self.assertEqual(val, self.ts[idx]) + + # return a copy the same index here + result = self.ts.reindex() + self.assertFalse((result is self.ts)) + + def test_reindex_corner(self): + # (don't forget to fix this) I think it's fixed + reindexed_dep = self.empty.reindex(self.ts.index, method='pad') + + # corner case: pad empty series + reindexed = self.empty.reindex(self.ts.index, method='pad') + + # pass non-Index + reindexed = self.ts.reindex(list(self.ts.index)) + assert_series_equal(self.ts, reindexed) + + # bad fill method + ts = self.ts[::2] + self.assertRaises(Exception, ts.reindex, self.ts.index, method='foo') + + def test_reindex_pad(self): + + s = Series(np.arange(10),dtype='int64') + s2 = s[::2] + + reindexed = s2.reindex(s.index, method='pad') + reindexed2 = s2.reindex(s.index, method='ffill') + assert_series_equal(reindexed, reindexed2) + + expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10)) + assert_series_equal(reindexed, expected) + + # GH4604 + s = Series([1,2,3,4,5], index=['a', 'b', 'c', 'd', 'e']) + new_index = ['a','g','c','f'] + expected = Series([1,1,3,3],index=new_index) + + # this changes dtype because the ffill happens after + result = s.reindex(new_index).ffill() + assert_series_equal(result, expected.astype('float64')) + + result = s.reindex(new_index).ffill(downcast='infer') + assert_series_equal(result, expected) + + # invalid because we can't forward fill on this type of index + self.assertRaises(ValueError, lambda : s.reindex(new_index, method='ffill')) + + # inferrence of new dtype + s = Series([True,False,False,True],index=list('abcd')) + new_index='agc' + result = s.reindex(list(new_index)).ffill() + expected = Series([True,True,False],index=list(new_index)) + assert_series_equal(result, expected) + + # GH4618 shifted series downcasting + s = Series(False,index=lrange(0,5)) + result = s.shift(1).fillna(method='bfill') + expected = Series(False,index=lrange(0,5)) + assert_series_equal(result, expected) + + def test_reindex_backfill(self): + pass + + def test_reindex_int(self): + ts = self.ts[::2] + int_ts = Series(np.zeros(len(ts), dtype=int), index=ts.index) + + # this should work fine + reindexed_int = int_ts.reindex(self.ts.index) + + # if NaNs introduced + self.assertEqual(reindexed_int.dtype, np.float_) + + # NO NaNs introduced + reindexed_int = int_ts.reindex(int_ts.index[::2]) + self.assertEqual(reindexed_int.dtype, np.int_) + + def test_reindex_bool(self): + + # A series other than float, int, string, or object + ts = self.ts[::2] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + + # this should work fine + reindexed_bool = bool_ts.reindex(self.ts.index) + + # if NaNs introduced + self.assertEqual(reindexed_bool.dtype, np.object_) + + # NO NaNs introduced + reindexed_bool = bool_ts.reindex(bool_ts.index[::2]) + self.assertEqual(reindexed_bool.dtype, np.bool_) + + def test_reindex_bool_pad(self): + # fail + ts = self.ts[5:] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + filled_bool = bool_ts.reindex(self.ts.index, method='pad') + self.assertTrue(isnull(filled_bool[:5]).all()) + + def test_reindex_like(self): + other = self.ts[::2] + assert_series_equal(self.ts.reindex(other.index), + self.ts.reindex_like(other)) + + # GH 7179 + day1 = datetime(2013,3,5) + day2 = datetime(2013,5,5) + day3 = datetime(2014,3,5) + + series1 = Series([5, None, None],[day1, day2, day3]) + series2 = Series([None, None], [day1, day3]) + + 
result = series1.reindex_like(series2, method='pad') + expected = Series([5, np.nan], index=[day1, day3]) + assert_series_equal(result, expected) + + def test_reindex_fill_value(self): + #------------------------------------------------------------ + # floats + floats = Series([1., 2., 3.]) + result = floats.reindex([1, 2, 3]) + expected = Series([2., 3., np.nan], index=[1, 2, 3]) + assert_series_equal(result, expected) + + result = floats.reindex([1, 2, 3], fill_value=0) + expected = Series([2., 3., 0], index=[1, 2, 3]) + assert_series_equal(result, expected) + + #------------------------------------------------------------ + # ints + ints = Series([1, 2, 3]) + + result = ints.reindex([1, 2, 3]) + expected = Series([2., 3., np.nan], index=[1, 2, 3]) + assert_series_equal(result, expected) + + # don't upcast + result = ints.reindex([1, 2, 3], fill_value=0) + expected = Series([2, 3, 0], index=[1, 2, 3]) + self.assertTrue(issubclass(result.dtype.type, np.integer)) + assert_series_equal(result, expected) + + #------------------------------------------------------------ + # objects + objects = Series([1, 2, 3], dtype=object) + + result = objects.reindex([1, 2, 3]) + expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + result = objects.reindex([1, 2, 3], fill_value='foo') + expected = Series([2, 3, 'foo'], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + #------------------------------------------------------------ + # bools + bools = Series([True, False, True]) + + result = bools.reindex([1, 2, 3]) + expected = Series([False, True, np.nan], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + result = bools.reindex([1, 2, 3], fill_value=False) + expected = Series([False, True, False], index=[1, 2, 3]) + assert_series_equal(result, expected) + + def test_rename(self): + renamer = lambda x: x.strftime('%Y%m%d') + renamed = self.ts.rename(renamer) + self.assertEqual(renamed.index[0], renamer(self.ts.index[0])) + + # dict + rename_dict = dict(zip(self.ts.index, renamed.index)) + renamed2 = self.ts.rename(rename_dict) + assert_series_equal(renamed, renamed2) + + # partial dict + s = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64') + renamed = s.rename({'b': 'foo', 'd': 'bar'}) + self.assert_numpy_array_equal(renamed.index, ['a', 'foo', 'c', 'bar']) + + # index with name + renamer = Series( + np.arange(4), index=Index(['a', 'b', 'c', 'd'], name='name'), dtype='int64') + renamed = renamer.rename({}) + self.assertEqual(renamed.index.name, renamer.index.name) + + def test_rename_inplace(self): + renamer = lambda x: x.strftime('%Y%m%d') + expected = renamer(self.ts.index[0]) + + self.ts.rename(renamer, inplace=True) + self.assertEqual(self.ts.index[0], expected) + + def test_preserveRefs(self): + seq = self.ts[[5, 10, 15]] + seq[1] = np.NaN + self.assertFalse(np.isnan(self.ts[10])) + + def test_ne(self): + ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) + expected = [True, True, False, True, True] + self.assertTrue(tm.equalContents(ts.index != 5, expected)) + self.assertTrue(tm.equalContents(~(ts.index == 5), expected)) + + def test_pad_nan(self): + x = Series([np.nan, 1., np.nan, 3., np.nan], + ['z', 'a', 'b', 'c', 'd'], dtype=float) + + x.fillna(method='pad', inplace=True) + + expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], + ['z', 'a', 'b', 'c', 'd'], dtype=float) + assert_series_equal(x[1:], expected[1:]) + self.assertTrue(np.isnan(x[0]), np.isnan(expected[0])) + + def 
test_unstack(self): + from numpy import nan + from pandas.util.testing import assert_frame_equal + + index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']], + labels=[[1, 1, 0, 0], [0, 1, 0, 2]]) + + s = Series(np.arange(4.), index=index) + unstacked = s.unstack() + + expected = DataFrame([[2., nan, 3.], [0., 1., nan]], + index=['bar', 'foo'], + columns=['one', 'three', 'two']) + + assert_frame_equal(unstacked, expected) + + unstacked = s.unstack(level=0) + assert_frame_equal(unstacked, expected.T) + + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + s = Series(np.random.randn(6), index=index) + exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]], + labels=[[0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + expected = DataFrame({'bar': s.values}, index=exp_index).sortlevel(0) + unstacked = s.unstack(0) + assert_frame_equal(unstacked, expected) + + def test_sortlevel(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + res = s.sortlevel('A') + assert_series_equal(backwards, res) + + res = s.sortlevel(['A', 'B']) + assert_series_equal(backwards, res) + + res = s.sortlevel('A', sort_remaining=False) + assert_series_equal(s, res) + + res = s.sortlevel(['A', 'B'], sort_remaining=False) + assert_series_equal(s, res) + + def test_head_tail(self): + assert_series_equal(self.series.head(), self.series[:5]) + assert_series_equal(self.series.tail(), self.series[-5:]) + + def test_isin(self): + s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) + + result = s.isin(['A', 'C']) + expected = Series([True, False, True, False, False, False, True, True]) + assert_series_equal(result, expected) + + def test_isin_with_string_scalar(self): + # GH4763 + s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) + with tm.assertRaises(TypeError): + s.isin('a') + + with tm.assertRaises(TypeError): + s = Series(['aaa', 'b', 'c']) + s.isin('aaa') + + def test_isin_with_i8(self): + # GH 5021 + + expected = Series([True,True,False,False,False]) + expected2 = Series([False,True,False,False,False]) + + # datetime64[ns] + s = Series(date_range('jan-01-2013','jan-05-2013')) + + result = s.isin(s[0:2]) + assert_series_equal(result, expected) + + result = s.isin(s[0:2].values) + assert_series_equal(result, expected) + + # fails on dtype conversion in the first place + if not _np_version_under1p7: + result = s.isin(s[0:2].values.astype('datetime64[D]')) + assert_series_equal(result, expected) + + result = s.isin([s[1]]) + assert_series_equal(result, expected2) + + result = s.isin([np.datetime64(s[1])]) + assert_series_equal(result, expected2) + + # timedelta64[ns] + if not _np_version_under1p7: + s = Series(pd.to_timedelta(lrange(5),unit='d')) + result = s.isin(s[0:2]) + assert_series_equal(result, expected) + +#------------------------------------------------------------------------------ +# TimeSeries-specific + def test_cummethods_bool(self): + # GH 6270 + # looks like a buggy np.maximum.accumulate for numpy 1.6.1, py 3.2 + if _np_version_under1p7 and sys.version_info[0] == 3 and sys.version_info[1] == 2: + raise nose.SkipTest("failure of GH6270 on numpy < 1.7 and py 3.2") + + def cummin(x): + return np.minimum.accumulate(x) + + def cummax(x): + return np.maximum.accumulate(x) + + a = pd.Series([False, False, False, True, True, False, False]) + b = ~a + c = pd.Series([False] * len(b)) + d = ~c + methods = {'cumsum': np.cumsum, 
'cumprod': np.cumprod, + 'cummin': cummin, 'cummax': cummax} + args = product((a, b, c, d), methods) + for s, method in args: + expected = Series(methods[method](s.values)) + result = getattr(s, method)() + assert_series_equal(result, expected) + + e = pd.Series([False, True, nan, False]) + cse = pd.Series([0, 1, nan, 1], dtype=object) + cpe = pd.Series([False, 0, nan, 0]) + cmin = pd.Series([False, False, nan, False]) + cmax = pd.Series([False, True, nan, True]) + expecteds = {'cumsum': cse, 'cumprod': cpe, 'cummin': cmin, + 'cummax': cmax} + + for method in methods: + res = getattr(e, method)() + assert_series_equal(res, expecteds[method]) + + def test_replace(self): + N = 100 + ser = Series(np.random.randn(N)) + ser[0:4] = np.nan + ser[6:10] = 0 + + # replace list with a single value + ser.replace([np.nan], -1, inplace=True) + + exp = ser.fillna(-1) + assert_series_equal(ser, exp) + + rs = ser.replace(0., np.nan) + ser[ser == 0.] = np.nan + assert_series_equal(rs, ser) + + ser = Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), + dtype=object) + ser[:5] = np.nan + ser[6:10] = 'foo' + ser[20:30] = 'bar' + + # replace list with a single value + rs = ser.replace([np.nan, 'foo', 'bar'], -1) + + self.assertTrue((rs[:5] == -1).all()) + self.assertTrue((rs[6:10] == -1).all()) + self.assertTrue((rs[20:30] == -1).all()) + self.assertTrue((isnull(ser[:5])).all()) + + # replace with different values + rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) + + self.assertTrue((rs[:5] == -1).all()) + self.assertTrue((rs[6:10] == -2).all()) + self.assertTrue((rs[20:30] == -3).all()) + self.assertTrue((isnull(ser[:5])).all()) + + # replace with different values with 2 lists + rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + assert_series_equal(rs, rs2) + + # replace inplace + ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + + self.assertTrue((ser[:5] == -1).all()) + self.assertTrue((ser[6:10] == -1).all()) + self.assertTrue((ser[20:30] == -1).all()) + + ser = Series([np.nan, 0, np.inf]) + assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + ser = Series([np.nan, 0, 'foo', 'bar', np.inf, None, lib.NaT]) + assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + filled = ser.copy() + filled[4] = 0 + assert_series_equal(ser.replace(np.inf, 0), filled) + + ser = Series(self.ts.index) + assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + # malformed + self.assertRaises(ValueError, ser.replace, [1, 2, 3], [np.nan, 0]) + + # make sure that we aren't just masking a TypeError because bools don't + # implement indexing + with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): + ser.replace([1, 2], [np.nan, 0]) + + ser = Series([0, 1, 2, 3, 4]) + result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) + assert_series_equal(result, Series([4, 3, 2, 1, 0])) + + # API change from 0.12? 
+ # GH 5319 + ser = Series([0, np.nan, 2, 3, 4]) + expected = ser.ffill() + result = ser.replace([np.nan]) + assert_series_equal(result, expected) + + ser = Series([0, np.nan, 2, 3, 4]) + expected = ser.ffill() + result = ser.replace(np.nan) + assert_series_equal(result, expected) + #GH 5797 + ser = Series(date_range('20130101', periods=5)) + expected = ser.copy() + expected.loc[2] = Timestamp('20120101') + result = ser.replace({Timestamp('20130103'): + Timestamp('20120101')}) + assert_series_equal(result, expected) + result = ser.replace(Timestamp('20130103'), Timestamp('20120101')) + assert_series_equal(result, expected) + + def test_replace_with_single_list(self): + ser = Series([0, 1, 2, 3, 4]) + result = ser.replace([1,2,3]) + assert_series_equal(result, Series([0,0,0,0,4])) + + s = ser.copy() + s.replace([1,2,3],inplace=True) + assert_series_equal(s, Series([0,0,0,0,4])) + + # make sure things don't get corrupted when fillna call fails + s = ser.copy() + with tm.assertRaises(ValueError): + s.replace([1,2,3],inplace=True,method='crash_cymbal') + assert_series_equal(s, ser) + + + def test_replace_mixed_types(self): + s = Series(np.arange(5),dtype='int64') + + def check_replace(to_rep, val, expected): + sc = s.copy() + r = s.replace(to_rep, val) + sc.replace(to_rep, val, inplace=True) + assert_series_equal(expected, r) + assert_series_equal(expected, sc) + + # should NOT upcast to float + e = Series([0,1,2,3,4]) + tr, v = [3], [3.0] + check_replace(tr, v, e) + + # MUST upcast to float + e = Series([0,1,2,3.5,4]) + tr, v = [3], [3.5] + check_replace(tr, v, e) + + # casts to object + e = Series([0,1,2,3.5,'a']) + tr, v = [3,4], [3.5,'a'] + check_replace(tr, v, e) + + # again casts to object + e = Series([0,1,2,3.5,Timestamp('20130101')]) + tr, v = [3,4],[3.5,Timestamp('20130101')] + check_replace(tr, v, e) + + # casts to float + e = Series([0,1,2,3.5,1]) + tr, v = [3,4],[3.5,True] + check_replace(tr, v, e) + + # test an object with dates + floats + integers + strings + dr = date_range('1/1/2001', '1/10/2001', + freq='D').to_series().reset_index(drop=True) + r = dr.astype(object).replace([dr[0],dr[1],dr[2]], [1.0,2,'a']) + assert_series_equal(r, Series([1.0,2,'a'] + + dr[3:].tolist(),dtype=object)) + + def test_replace_bool_with_string_no_op(self): + s = Series([True, False, True]) + result = s.replace('fun', 'in-the-sun') + tm.assert_series_equal(s, result) + + def test_replace_bool_with_string(self): + # nonexistent elements + s = Series([True, False, True]) + result = s.replace(True, '2u') + expected = Series(['2u', False, '2u']) + tm.assert_series_equal(expected, result) + + def test_replace_bool_with_bool(self): + s = Series([True, False, True]) + result = s.replace(True, False) + expected = Series([False] * len(s)) + tm.assert_series_equal(expected, result) + + def test_replace_with_dict_with_bool_keys(self): + s = Series([True, False, True]) + with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): + s.replace({'asdf': 'asdb', True: 'yes'}) + + def test_asfreq(self): + ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), + datetime(2009, 11, 30), + datetime(2009, 12, 31)]) + + daily_ts = ts.asfreq('B') + monthly_ts = daily_ts.asfreq('BM') + self.assert_numpy_array_equal(monthly_ts, ts) + + daily_ts = ts.asfreq('B', method='pad') + monthly_ts = daily_ts.asfreq('BM') + self.assert_numpy_array_equal(monthly_ts, ts) + + daily_ts = ts.asfreq(datetools.bday) + monthly_ts = daily_ts.asfreq(datetools.bmonthEnd) + self.assert_numpy_array_equal(monthly_ts, ts) + + result = 
ts[:0].asfreq('M') + self.assertEqual(len(result), 0) + self.assertIsNot(result, ts) + + def test_diff(self): + # Just run the function + self.ts.diff() + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = s.diff() + self.assertEqual(rs[1], 1) + + # neg n + rs = self.ts.diff(-1) + xp = self.ts - self.ts.shift(-1) + assert_series_equal(rs, xp) + + # 0 + rs = self.ts.diff(0) + xp = self.ts - self.ts + assert_series_equal(rs, xp) + + # datetime diff (GH3100) + s = Series(date_range('20130102', periods=5)) + rs = s - s.shift(1) + xp = s.diff() + assert_series_equal(rs, xp) + + # timedelta diff + nrs = rs - rs.shift(1) + nxp = xp.diff() + assert_series_equal(nrs, nxp) + + def test_pct_change(self): + rs = self.ts.pct_change(fill_method=None) + assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) + + rs = self.ts.pct_change(2) + filled = self.ts.fillna(method='pad') + assert_series_equal(rs, filled / filled.shift(2) - 1) + + rs = self.ts.pct_change(fill_method='bfill', limit=1) + filled = self.ts.fillna(method='bfill', limit=1) + assert_series_equal(rs, filled / filled.shift(1) - 1) + + rs = self.ts.pct_change(freq='5D') + filled = self.ts.fillna(method='pad') + assert_series_equal(rs, filled / filled.shift(freq='5D') - 1) + + def test_pct_change_shift_over_nas(self): + s = Series([1., 1.5, np.nan, 2.5, 3.]) + + chg = s.pct_change() + expected = Series([np.nan, 0.5, np.nan, 2.5 / 1.5 - 1, .2]) + assert_series_equal(chg, expected) + + def test_autocorr(self): + # Just run the function + self.ts.autocorr() + + def test_first_last_valid(self): + ts = self.ts.copy() + ts[:5] = np.NaN + + index = ts.first_valid_index() + self.assertEqual(index, ts.index[5]) + + ts[-5:] = np.NaN + index = ts.last_valid_index() + self.assertEqual(index, ts.index[-6]) + + ts[:] = np.nan + self.assertIsNone(ts.last_valid_index()) + self.assertIsNone(ts.first_valid_index()) + + ser = Series([], index=[]) + self.assertIsNone(ser.last_valid_index()) + self.assertIsNone(ser.first_valid_index()) + + def test_mpl_compat_hack(self): + result = self.ts[:, np.newaxis] + expected = self.ts.values[:, np.newaxis] + assert_almost_equal(result, expected) + +#------------------------------------------------------------------------------ +# GroupBy + + def test_select(self): + n = len(self.ts) + result = self.ts.select(lambda x: x >= self.ts.index[n // 2]) + expected = self.ts.reindex(self.ts.index[n // 2:]) + assert_series_equal(result, expected) + + result = self.ts.select(lambda x: x.weekday() == 2) + expected = self.ts[self.ts.index.weekday == 2] + assert_series_equal(result, expected) + +#------------------------------------------------------------------------------ +# Misc not safe for sparse + + def test_dropna_preserve_name(self): + self.ts[:5] = np.nan + result = self.ts.dropna() + self.assertEqual(result.name, self.ts.name) + name = self.ts.name + ts = self.ts.copy() + ts.dropna(inplace=True) + self.assertEqual(ts.name, name) + + def test_numpy_unique(self): + # it works! 
+ result = np.unique(self.ts) + + def test_concat_empty_series_dtypes(self): + self.assertEqual(pd.concat([Series(dtype=np.float64)]).dtype, np.float64) + self.assertEqual(pd.concat([Series(dtype=np.int8)]).dtype, np.int8) + self.assertEqual(pd.concat([Series(dtype=np.bool_)]).dtype, np.bool_) + + self.assertEqual(pd.concat([Series(dtype=np.bool_), + Series(dtype=np.int32)]).dtype, np.int32) + + + +class TestSeriesNonUnique(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + pass + + def test_basic_indexing(self): + s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) + + self.assertRaises(IndexError, s.__getitem__, 5) + self.assertRaises(IndexError, s.__setitem__, 5, 0) + + self.assertRaises(KeyError, s.__getitem__, 'c') + + s = s.sort_index() + + self.assertRaises(IndexError, s.__getitem__, 5) + self.assertRaises(IndexError, s.__setitem__, 5, 0) + + + def test_int_indexing(self): + s = Series(np.random.randn(6), index=[0, 0, 1, 1, 2, 2]) + + self.assertRaises(KeyError, s.__getitem__, 5) + + self.assertRaises(KeyError, s.__getitem__, 'c') + + # not monotonic + s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1]) + + self.assertRaises(KeyError, s.__getitem__, 5) + + self.assertRaises(KeyError, s.__getitem__, 'c') + + def test_datetime_indexing(self): + from pandas import date_range + + index = date_range('1/1/2000', '1/7/2000') + index = index.repeat(3) + + s = Series(len(index), index=index) + stamp = Timestamp('1/8/2000') + + self.assertRaises(KeyError, s.__getitem__, stamp) + s[stamp] = 0 + self.assertEqual(s[stamp], 0) + + # not monotonic + s = Series(len(index), index=index) + s = s[::-1] + + self.assertRaises(KeyError, s.__getitem__, stamp) + s[stamp] = 0 + self.assertEqual(s[stamp], 0) + + def test_reset_index(self): + df = tm.makeDataFrame()[:5] + ser = df.stack() + ser.index.names = ['hash', 'category'] + + ser.name = 'value' + df = ser.reset_index() + self.assertIn('value', df) + + df = ser.reset_index(name='value2') + self.assertIn('value2', df) + + # check inplace + s = ser.reset_index(drop=True) + s2 = ser + s2.reset_index(drop=True, inplace=True) + assert_series_equal(s, s2) + + # level + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + s = Series(np.random.randn(6), index=index) + rs = s.reset_index(level=1) + self.assertEqual(len(rs.columns), 2) + + rs = s.reset_index(level=[0, 2], drop=True) + self.assertTrue(rs.index.equals(Index(index.get_level_values(1)))) + tm.assert_isinstance(rs, Series) + + def test_set_index_makes_timeseries(self): + idx = tm.makeDateIndex(10) + + s = Series(lrange(10)) + s.index = idx + + self.assertTrue(s.is_time_series == True) + + def test_timeseries_coercion(self): + idx = tm.makeDateIndex(10000) + ser = Series(np.random.randn(len(idx)), idx.astype(object)) + self.assertTrue(ser.is_time_series) + self.assertIsInstance(ser.index, DatetimeIndex) + + def test_replace(self): + N = 100 + ser = Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), + dtype=object) + ser[:5] = np.nan + ser[6:10] = 'foo' + ser[20:30] = 'bar' + + # replace list with a single value + rs = ser.replace([np.nan, 'foo', 'bar'], -1) + + self.assertTrue((rs[:5] == -1).all()) + self.assertTrue((rs[6:10] == -1).all()) + self.assertTrue((rs[20:30] == -1).all()) + self.assertTrue((isnull(ser[:5])).all()) + + # replace with different values + rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) + + self.assertTrue((rs[:5] == -1).all()) + 
self.assertTrue((rs[6:10] == -2).all()) + self.assertTrue((rs[20:30] == -3).all()) + self.assertTrue((isnull(ser[:5])).all()) + + # replace with different values with 2 lists + rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + assert_series_equal(rs, rs2) + + # replace inplace + ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + self.assertTrue((ser[:5] == -1).all()) + self.assertTrue((ser[6:10] == -1).all()) + self.assertTrue((ser[20:30] == -1).all()) + + def test_repeat(self): + s = Series(np.random.randn(3), index=['a', 'b', 'c']) + + reps = s.repeat(5) + exp = Series(s.values.repeat(5), index=s.index.values.repeat(5)) + assert_series_equal(reps, exp) + + to_rep = [2, 3, 4] + reps = s.repeat(to_rep) + exp = Series(s.values.repeat(to_rep), + index=s.index.values.repeat(to_rep)) + assert_series_equal(reps, exp) + + def test_unique_data_ownership(self): + # it works! #1807 + Series(Series(["a", "c", "b"]).unique()).sort() + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py new file mode 100644 index 00000000..cb3fdcaf --- /dev/null +++ b/pandas/tests/test_stats.py @@ -0,0 +1,137 @@ +from pandas import compat +import nose + +from numpy import nan +import numpy as np + +from pandas import Series, DataFrame + +from pandas.compat import product +from pandas.util.testing import (assert_frame_equal, + assert_series_equal, + assert_almost_equal) +import pandas.util.testing as tm + +class TestRank(tm.TestCase): + _multiprocess_can_split_ = True + s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + df = DataFrame({'A': s, 'B': s}) + + results = { + 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, + 3.5, 1.5, 8.0, nan, 5.5]), + 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), + } + + def test_rank_tie_methods(self): + s = self.s + + def _check(s, expected, method='average'): + result = s.rank(method=method) + assert_almost_equal(result, expected) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, dtype in product(results, dtypes): + if (dtype, method) in disabled: + continue + series = s if dtype is None else s.astype(dtype) + _check(series, results[method], method=method) + + def test_rank_dense_method(self): + dtypes = ['O', 'f8', 'i8'] + in_out = [([1], [1]), + ([2], [1]), + ([0], [1]), + ([2,2], [1,1]), + ([1,2,3], [1,2,3]), + ([4,2,1], [3,2,1],), + ([1,1,5,5,3], [1,1,3,3,2]), + ([-5,-4,-3,-2,-1], [1,2,3,4,5])] + + for ser, exp in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) + result = s.rank(method='dense') + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + def test_rank_descending(self): + dtypes = ['O', 'f8', 'i8'] + + for dtype, method in product(dtypes, self.results): + if 'i' in dtype: + s = self.s.dropna() + df = self.df.dropna() + else: + s = self.s.astype(dtype) + df = self.df.astype(dtype) + + res = s.rank(ascending=False) + expected = (s.max() - s).rank() + assert_series_equal(res, expected) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + assert_frame_equal(res, expected) + + if method == 'first' and dtype == 'O': + continue + + expected = (s.max() - s).rank(method=method) + res2 = s.rank(method=method, ascending=False) + 
assert_series_equal(res2, expected) + + expected = (df.max() - df).rank(method=method) + + if dtype != 'O': + res2 = df.rank(method=method, ascending=False, + numeric_only=True) + assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, + numeric_only=False) + assert_frame_equal(res3, expected) + + def test_rank_2d_tie_methods(self): + s = self.s + df = self.df + + def _check2d(df, expected, method='average', axis=0): + exp_df = DataFrame({'A': expected, 'B': expected}) + + if axis == 1: + df = df.T + exp_df = exp_df.T + + result = df.rank(method=method, axis=axis) + assert_frame_equal(result, exp_df) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, axis, dtype in product(results, [0, 1], dtypes): + if (dtype, method) in disabled: + continue + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, results[method], method=method, axis=axis) + + def test_rank_int(self): + s = self.s.dropna().astype('i8') + + for method, res in compat.iteritems(self.results): + result = s.rank(method=method) + expected = Series(res).dropna() + expected.index = result.index + assert_series_equal(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py new file mode 100644 index 00000000..971d7acf --- /dev/null +++ b/pandas/tests/test_strings.py @@ -0,0 +1,1199 @@ +# pylint: disable-msg=E1101,W0612 + +from datetime import datetime, timedelta, date +import os +import operator +import re +import warnings + +import nose + +from numpy import nan as NA +import numpy as np +from numpy.testing import assert_array_equal +from numpy.random import randint + +from pandas.compat import range, lrange, u +import pandas.compat as compat +from pandas import (Index, Series, TimeSeries, DataFrame, isnull, notnull, + bdate_range, date_range) +import pandas.core.common as com + +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +import pandas.core.strings as strings + + +class TestStringMethods(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_api(self): + + # GH 6106 + self.assertIsNone(Series.str) + + def test_iter(self): + # GH3638 + strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel' + ds = Series(strs) + + for s in ds.str: + # iter must yield a Series + tm.assert_isinstance(s, Series) + + # indices of each yielded Series should be equal to the index of + # the original Series + assert_array_equal(s.index, ds.index) + + for el in s: + # each element of the series is either a basestring/str or nan + self.assertTrue(isinstance(el, compat.string_types) or isnull(el)) + + # desired behavior is to iterate until everything would be nan on the + # next iter so make sure the last element of the iterator was 'l' in + # this case since 'wikitravel' is the longest string + self.assertEqual(s.dropna().values.item(), 'l') + + def test_iter_empty(self): + ds = Series([], dtype=object) + + i, s = 100, 1 + + for i, s in enumerate(ds.str): + pass + + # nothing to iterate over so nothing defined values should remain + # unchanged + self.assertEqual(i, 100) + self.assertEqual(s, 1) + + def test_iter_single_element(self): + ds = Series(['a']) + + for i, s in enumerate(ds.str): + pass + + self.assertFalse(i) + assert_series_equal(ds, s) + + def test_iter_numeric_try_string(self): + # behavior identical to empty series + dsi 
= Series(lrange(4)) + + i, s = 100, 'h' + + for i, s in enumerate(dsi.str): + pass + + self.assertEqual(i, 100) + self.assertEqual(s, 'h') + + dsf = Series(np.arange(4.)) + + for i, s in enumerate(dsf.str): + pass + + self.assertEqual(i, 100) + self.assertEqual(s, 'h') + + def test_iter_object_try_string(self): + ds = Series([slice(None, randint(10), randint(10, 20)) + for _ in range(4)]) + + i, s = 100, 'h' + + for i, s in enumerate(ds.str): + pass + + self.assertEqual(i, 100) + self.assertEqual(s, 'h') + + def test_cat(self): + one = ['a', 'a', 'b', 'b', 'c', NA] + two = ['a', NA, 'b', 'd', 'foo', NA] + + # single array + result = strings.str_cat(one) + self.assertTrue(isnull(result)) + + result = strings.str_cat(one, na_rep='NA') + exp = 'aabbcNA' + self.assertEqual(result, exp) + + result = strings.str_cat(one, na_rep='-') + exp = 'aabbc-' + self.assertEqual(result, exp) + + result = strings.str_cat(one, sep='_', na_rep='NA') + exp = 'a_a_b_b_c_NA' + self.assertEqual(result, exp) + + # Multiple arrays + result = strings.str_cat(one, [two], na_rep='NA') + exp = ['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'] + self.assert_numpy_array_equal(result, exp) + + result = strings.str_cat(one, two) + exp = ['aa', NA, 'bb', 'bd', 'cfoo', NA] + tm.assert_almost_equal(result, exp) + + def test_count(self): + values = ['foo', 'foofoo', NA, 'foooofooofommmfoo'] + + result = strings.str_count(values, 'f[o]+') + exp = [1, 2, NA, 4] + tm.assert_almost_equal(result, exp) + + result = Series(values).str.count('f[o]+') + tm.assert_isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + # mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] + rs = strings.str_count(mixed, 'a') + xp = [1, NA, 0, NA, NA, 0, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.count('a') + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = [u('foo'), u('foofoo'), NA, u('foooofooofommmfoo')] + + result = strings.str_count(values, 'f[o]+') + exp = [1, 2, NA, 4] + tm.assert_almost_equal(result, exp) + + result = Series(values).str.count('f[o]+') + tm.assert_isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + def test_contains(self): + values = ['foo', NA, 'fooommm__foo', 'mmm_', 'foommm[_]+bar'] + pat = 'mmm[_]+' + + result = strings.str_contains(values, pat) + expected = [False, NA, True, True, False] + tm.assert_almost_equal(result, expected) + + result = strings.str_contains(values, pat, regex=False) + expected = [False, NA, False, False, True] + tm.assert_almost_equal(result, expected) + + values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + result = strings.str_contains(values, pat) + expected = [False, False, True, True] + self.assertEqual(result.dtype, np.bool_) + tm.assert_almost_equal(result, expected) + + # case insensitive using regex + values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_'] + result = strings.str_contains(values, 'FOO|mmm', case=False) + expected = [True, False, True, True] + tm.assert_almost_equal(result, expected) + + # case insensitive without regex + result = strings.str_contains(values, 'foo', regex=False, case=False) + expected = [True, False, True, False] + tm.assert_almost_equal(result, expected) + + # mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
+ rs = strings.str_contains(mixed, 'o') + xp = [False, NA, False, NA, NA, True, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.contains('o') + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = [u('foo'), NA, u('fooommm__foo'), u('mmm_')] + pat = 'mmm[_]+' + + result = strings.str_contains(values, pat) + expected = [False, np.nan, True, True] + tm.assert_almost_equal(result, expected) + + result = strings.str_contains(values, pat, na=False) + expected = [False, False, True, True] + tm.assert_almost_equal(result, expected) + + values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + result = strings.str_contains(values, pat) + expected = [False, False, True, True] + self.assertEqual(result.dtype, np.bool_) + tm.assert_almost_equal(result, expected) + + # na + values = Series(['om', 'foo',np.nan]) + res = values.str.contains('foo', na="foo") + self.assertEqual (res.ix[2], "foo") + + def test_startswith(self): + values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + + result = values.str.startswith('foo') + exp = Series([False, NA, True, False, False, NA, True]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] + rs = strings.str_startswith(mixed, 'f') + xp = [False, NA, False, NA, NA, True, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.startswith('f') + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA, + u('foo')]) + + result = values.str.startswith('foo') + exp = Series([False, NA, True, False, False, NA, True]) + tm.assert_series_equal(result, exp) + + result = values.str.startswith('foo', na=True) + tm.assert_series_equal(result, exp.fillna(True).astype(bool)) + + def test_endswith(self): + values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + + result = values.str.endswith('foo') + exp = Series([False, NA, False, False, True, NA, True]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
+ rs = strings.str_endswith(mixed, 'f') + xp = [False, NA, False, NA, NA, False, NA, NA, NA] + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.endswith('f') + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA, + u('foo')]) + + result = values.str.endswith('foo') + exp = Series([False, NA, False, False, True, NA, True]) + tm.assert_series_equal(result, exp) + + result = values.str.endswith('foo', na=False) + tm.assert_series_equal(result, exp.fillna(False).astype(bool)) + + def test_title(self): + values = Series(["FOO", "BAR", NA, "Blah", "blurg"]) + + result = values.str.title() + exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(["FOO", NA, "bar", True, datetime.today(), + "blah", None, 1, 2.]) + mixed = mixed.str.title() + exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) + tm.assert_almost_equal(mixed, exp) + + # unicode + values = Series([u("FOO"), NA, u("bar"), u("Blurg")]) + + results = values.str.title() + exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")]) + + tm.assert_series_equal(results, exp) + + def test_lower_upper(self): + values = Series(['om', NA, 'nom', 'nom']) + + result = values.str.upper() + exp = Series(['OM', NA, 'NOM', 'NOM']) + tm.assert_series_equal(result, exp) + + result = result.str.lower() + tm.assert_series_equal(result, values) + + # mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, + 1, 2.]) + mixed = mixed.str.upper() + rs = Series(mixed).str.lower() + xp = ['a', NA, 'b', NA, NA, 'foo', NA, NA, NA] + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('om'), NA, u('nom'), u('nom')]) + + result = values.str.upper() + exp = Series([u('OM'), NA, u('NOM'), u('NOM')]) + tm.assert_series_equal(result, exp) + + result = result.str.lower() + tm.assert_series_equal(result, values) + + def test_replace(self): + values = Series(['fooBAD__barBAD', NA]) + + result = values.str.replace('BAD[_]*', '') + exp = Series(['foobar', NA]) + tm.assert_series_equal(result, exp) + + result = values.str.replace('BAD[_]*', '', n=1) + exp = Series(['foobarBAD', NA]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', + None, 1, 2.]) + + rs = Series(mixed).str.replace('BAD[_]*', '') + xp = ['a', NA, 'b', NA, NA, 'foo', NA, NA, NA] + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('fooBAD__barBAD'), NA]) + + result = values.str.replace('BAD[_]*', '') + exp = Series([u('foobar'), NA]) + tm.assert_series_equal(result, exp) + + result = values.str.replace('BAD[_]*', '', n=1) + exp = Series([u('foobarBAD'), NA]) + tm.assert_series_equal(result, exp) + + #flags + unicode + values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) + exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) + result = values.str.replace("(?<=\w),(?=\w)", ", ", flags=re.UNICODE) + tm.assert_series_equal(result, exp) + + def test_repeat(self): + values = Series(['a', 'b', NA, 'c', NA, 'd']) + + result = values.str.repeat(3) + exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd']) + tm.assert_series_equal(result, exp) + + result = values.str.repeat([1, 2, 3, 4, 5, 6]) + exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd']) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', + None, 1, 2.]) + 
+ rs = Series(mixed).str.repeat(3) + xp = ['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA] + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('a'), u('b'), NA, u('c'), NA, + u('d')]) + + result = values.str.repeat(3) + exp = Series([u('aaa'), u('bbb'), NA, u('ccc'), NA, + u('ddd')]) + tm.assert_series_equal(result, exp) + + result = values.str.repeat([1, 2, 3, 4, 5, 6]) + exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, + u('dddddd')]) + tm.assert_series_equal(result, exp) + + def test_deprecated_match(self): + # Old match behavior, deprecated (but still default) in 0.13 + values = Series(['fooBAD__barBAD', NA, 'foo']) + + with tm.assert_produces_warning(): + result = values.str.match('.*(BAD[_]+).*(BAD)') + exp = Series([('BAD__', 'BAD'), NA, []]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), + 'foo', None, 1, 2.]) + + with tm.assert_produces_warning(): + rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') + xp = [('BAD_', 'BAD'), NA, ('BAD_', 'BAD'), NA, NA, [], NA, NA, NA] + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('fooBAD__barBAD'), NA, u('foo')]) + + with tm.assert_produces_warning(): + result = values.str.match('.*(BAD[_]+).*(BAD)') + exp = Series([(u('BAD__'), u('BAD')), NA, []]) + tm.assert_series_equal(result, exp) + + def test_match(self): + # New match behavior introduced in 0.13 + values = Series(['fooBAD__barBAD', NA, 'foo']) + with tm.assert_produces_warning(): + result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + exp = Series([True, NA, False]) + tm.assert_series_equal(result, exp) + + # If no groups, use new behavior even when as_indexer is False. + # (Old behavior is pretty much useless in this case.) + values = Series(['fooBAD__barBAD', NA, 'foo']) + result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False) + exp = Series([True, NA, False]) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), + 'foo', None, 1, 2.]) + + with tm.assert_produces_warning(): + rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + xp = [True, NA, True, NA, NA, False, NA, NA, NA] + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('fooBAD__barBAD'), NA, u('foo')]) + + with tm.assert_produces_warning(): + result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True) + exp = Series([True, NA, False]) + tm.assert_series_equal(result, exp) + + # na GH #6609 + res = Series(['a', 0, np.nan]).str.match('a', na=False) + exp = Series([True, False, False]) + assert_series_equal(exp, res) + res = Series(['a', 0, np.nan]).str.match('a') + exp = Series([True, np.nan, np.nan]) + assert_series_equal(exp, res) + + def test_extract(self): + # Contains tests like those in test_match and some others. 
+
+        values = Series(['fooBAD__barBAD', NA, 'foo'])
+        er = [NA, NA] # empty row
+
+        result = values.str.extract('.*(BAD[_]+).*(BAD)')
+        exp = DataFrame([['BAD__', 'BAD'], er, er])
+        tm.assert_frame_equal(result, exp)
+
+        # mixed
+        mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
+                        'foo', None, 1, 2.])
+
+        rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)')
+        exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er,
+                         er, er, er, er])
+        tm.assert_frame_equal(rs, exp)
+
+        # unicode
+        values = Series([u('fooBAD__barBAD'), NA, u('foo')])
+
+        result = values.str.extract('.*(BAD[_]+).*(BAD)')
+        exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
+        tm.assert_frame_equal(result, exp)
+
+        # no groups
+        s = Series(['A1', 'B2', 'C3'])
+        f = lambda: s.str.extract('[ABC][123]')
+        self.assertRaises(ValueError, f)
+
+        # only non-capturing groups
+        f = lambda: s.str.extract('(?:[AB]).*')
+        self.assertRaises(ValueError, f)
+
+        # one group, no matches
+        result = s.str.extract('(_)')
+        exp = Series([NA, NA, NA], dtype=object)
+        tm.assert_series_equal(result, exp)
+
+        # two groups, no matches
+        result = s.str.extract('(_)(_)')
+        exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
+        tm.assert_frame_equal(result, exp)
+
+        # one group, some matches
+        result = s.str.extract('([AB])[123]')
+        exp = Series(['A', 'B', NA])
+        tm.assert_series_equal(result, exp)
+
+        # two groups, some matches
+        result = s.str.extract('([AB])([123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # named group/groups
+        result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+        result = s.str.extract('(?P<letter>[AB])')
+        exp = Series(['A', 'B', NA], name='letter')
+        tm.assert_series_equal(result, exp)
+
+        # mix named and unnamed groups
+        result = s.str.extract('([AB])(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
+        tm.assert_frame_equal(result, exp)
+
+        # one normal group, one non-capturing group
+        result = s.str.extract('([AB])(?:[123])')
+        exp = Series(['A', 'B', NA])
+        tm.assert_series_equal(result, exp)
+
+        # two normal groups, one non-capturing group
+        result = Series(['A11', 'B22', 'C33']).str.extract('([AB])([123])(?:[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # one optional group followed by one normal group
+        result = Series(['A1', 'B2', '3']).str.extract('(?P<letter>[AB])?(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+
+        # one normal group followed by one optional group
+        result = Series(['A1', 'B2', 'C']).str.extract('(?P<letter>[ABC])(?P<number>[123])?')
+        exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+
+        # single group renames series properly
+        s = Series(['A1', 'A2'])
+        result = s.str.extract(r'(?P<uno>A)\d')
+        tm.assert_equal(result.name, 'uno')
+
+        # GH6348
+        # not passing index to the extractor
+        def check_index(index):
+            data = ['A1', 'B2', 'C']
+            index = index[:len(data)]
+            result = Series(data, index=index).str.extract('(\d)')
+            exp = Series(['1', '2', NA], index=index)
+            tm.assert_series_equal(result, exp)
+
+            result = Series(data, index=index).str.extract('(?P<letter>\D)(?P<number>\d)?')
+            exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'], index=index)
+            tm.assert_frame_equal(result, exp)
+
+
+        for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
+                       tm.makeDateIndex, tm.makePeriodIndex ]:
+            check_index(index())
+
+    def test_extract_single_series_name_is_preserved(self):
+        s = Series(['a3', 'b3', 'c2'], name='bob')
+        r = s.str.extract(r'(?P<sue>[a-z])')
+        e = Series(['a', 'b', 'c'], name='sue')
+        tm.assert_series_equal(r, e)
+        self.assertEqual(r.name, e.name)
+
+    def test_empty_str_methods(self):
+        empty_str = empty = Series(dtype=str)
+        empty_int = Series(dtype=int)
+        empty_bool = Series(dtype=bool)
+        empty_list = Series(dtype=list)
+        empty_bytes = Series(dtype=object)
+
+        # GH7241
+        # (extract) on empty series
+
+        tm.assert_series_equal(empty_str, empty.str.cat(empty))
+        tm.assert_equal('', empty.str.cat())
+        tm.assert_series_equal(empty_str, empty.str.title())
+        tm.assert_series_equal(empty_int, empty.str.count('a'))
+        tm.assert_series_equal(empty_bool, empty.str.contains('a'))
+        tm.assert_series_equal(empty_bool, empty.str.startswith('a'))
+        tm.assert_series_equal(empty_bool, empty.str.endswith('a'))
+        tm.assert_series_equal(empty_str, empty.str.lower())
+        tm.assert_series_equal(empty_str, empty.str.upper())
+        tm.assert_series_equal(empty_str, empty.str.replace('a','b'))
+        tm.assert_series_equal(empty_str, empty.str.repeat(3))
+        tm.assert_series_equal(empty_bool, empty.str.match('^a'))
+        tm.assert_series_equal(empty_str, empty.str.extract('()'))
+        tm.assert_frame_equal(DataFrame(columns=[0,1], dtype=str), empty.str.extract('()()'))
+        tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies())
+        tm.assert_series_equal(empty_str, empty_list.str.join(''))
+        tm.assert_series_equal(empty_int, empty.str.len())
+        tm.assert_series_equal(empty_list, empty_list.str.findall('a'))
+        tm.assert_series_equal(empty_str, empty.str.pad(42))
+        tm.assert_series_equal(empty_str, empty.str.center(42))
+        tm.assert_series_equal(empty_list, empty.str.split('a'))
+        tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
+        tm.assert_series_equal(empty_str, empty.str.strip())
+        tm.assert_series_equal(empty_str, empty.str.lstrip())
+        tm.assert_series_equal(empty_str, empty.str.rstrip())
+        tm.assert_series_equal(empty_str, empty.str.rstrip())
+        tm.assert_series_equal(empty_str, empty.str.wrap(42))
+        tm.assert_series_equal(empty_str, empty.str.get(0))
+        tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii'))
+        tm.assert_series_equal(empty_bytes, empty.str.encode('ascii'))
+
+    def test_get_dummies(self):
+        s = Series(['a|b', 'a|c', np.nan])
+        result = s.str.get_dummies('|')
+        expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+                             columns=list('abc'))
+        tm.assert_frame_equal(result, expected)
+
+        s = Series(['a;b', 'a', 7])
+        result = s.str.get_dummies(';')
+        expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]],
+                             columns=list('7ab'))
+        tm.assert_frame_equal(result, expected)
+
+    def test_join(self):
+        values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
+        result = values.str.split('_').str.join('_')
+        tm.assert_series_equal(values, result)
+
+        # mixed
+        mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
+                        'foo', None, 1, 2.])
+
+        rs = Series(mixed).str.split('_').str.join('_')
+        xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA])
+
+        tm.assert_isinstance(rs, Series)
+        tm.assert_almost_equal(rs, xp)
+
+        # unicode
+        values = Series([u('a_b_c'), u('c_d_e'), np.nan,
+                         u('f_g_h')])
+        result = values.str.split('_').str.join('_')
+        tm.assert_series_equal(values, result)
+
+    def test_len(self):
+        values = Series(['foo', 'fooo', 'fooooo',
np.nan, 'fooooooo']) + + result = values.str.len() + exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), + 'foo', None, 1, 2.]) + + rs = Series(mixed).str.len() + xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('foo'), u('fooo'), u('fooooo'), np.nan, + u('fooooooo')]) + + result = values.str.len() + exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + tm.assert_series_equal(result, exp) + + def test_findall(self): + values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD']) + + result = values.str.findall('BAD[_]*') + exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']]) + tm.assert_almost_equal(result, exp) + + # mixed + mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(), + 'BAD', None, 1, 2.]) + + rs = Series(mixed).str.findall('BAD[_]*') + xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('fooBAD__barBAD'), NA, u('foo'), + u('BAD')]) + + result = values.str.findall('BAD[_]*') + exp = Series([[u('BAD__'), u('BAD')], NA, [], [u('BAD')]]) + tm.assert_almost_equal(result, exp) + + def test_pad(self): + values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + + result = values.str.pad(5, side='left') + exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee']) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='right') + exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='both') + exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) + tm.assert_almost_equal(result, exp) + + # mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'ee', None, 1, 2.]) + + rs = Series(mixed).str.pad(5, side='left') + xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'ee', None, 1, 2.]) + + rs = Series(mixed).str.pad(5, side='right') + xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'ee', None, 1, 2.]) + + rs = Series(mixed).str.pad(5, side='both') + xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('a'), u('b'), NA, u('c'), NA, + u('eeeeee')]) + + result = values.str.pad(5, side='left') + exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, + u('eeeeee')]) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='right') + exp = Series([u('a '), u('b '), NA, u('c '), NA, + u('eeeeee')]) + tm.assert_almost_equal(result, exp) + + result = values.str.pad(5, side='both') + exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, + u('eeeeee')]) + tm.assert_almost_equal(result, exp) + + def test_center(self): + values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + + result = values.str.center(5) + exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) + tm.assert_almost_equal(result, exp) + + # mixed + mixed = Series(['a', NA, 'b', True, datetime.today(), + 'c', 'eee', None, 1, 2.]) + + rs = Series(mixed).str.center(5) + xp = Series([' a ', NA, ' b ', NA, NA, ' c 
', ' eee ', NA, NA, + NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('a'), u('b'), NA, u('c'), NA, + u('eeeeee')]) + + result = values.str.center(5) + exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, + u('eeeeee')]) + tm.assert_almost_equal(result, exp) + + def test_split(self): + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + + result = values.str.split('_') + exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + tm.assert_series_equal(result, exp) + + # more than one char + values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) + result = values.str.split('__') + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.split('_') + xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, + NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) + + result = values.str.split('_') + exp = Series([[u('a'), u('b'), u('c')], + [u('c'), u('d'), u('e')], NA, + [u('f'), u('g'), u('h')]]) + tm.assert_series_equal(result, exp) + + def test_split_noargs(self): + # #1859 + s = Series(['Wes McKinney', 'Travis Oliphant']) + + result = s.str.split() + self.assertEqual(result[1], ['Travis', 'Oliphant']) + + def test_split_maxsplit(self): + # re.split 0, str.split -1 + s = Series(['bd asdf jfg', 'kjasdflqw asdfnfk']) + + result = s.str.split(n=-1) + xp = s.str.split() + tm.assert_series_equal(result, xp) + + result = s.str.split(n=0) + tm.assert_series_equal(result, xp) + + xp = s.str.split('asdf') + result = s.str.split('asdf', n=0) + tm.assert_series_equal(result, xp) + + result = s.str.split('asdf', n=-1) + tm.assert_series_equal(result, xp) + + def test_split_no_pat_with_nonzero_n(self): + s = Series(['split once', 'split once too!']) + result = s.str.split(n=1) + expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']}) + tm.assert_series_equal(expected, result) + + def test_pipe_failures(self): + # #2119 + s = Series(['A|B|C']) + + result = s.str.split('|') + exp = Series([['A', 'B', 'C']]) + + tm.assert_series_equal(result, exp) + + result = s.str.replace('|', ' ') + exp = Series(['A B C']) + + tm.assert_series_equal(result, exp) + + def test_slice(self): + values = Series(['aafootwo', 'aabartwo', NA, 'aabazqux']) + + result = values.str.slice(2, 5) + exp = Series(['foo', 'bar', NA, 'baz']) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.slice(2, 5) + xp = Series(['foo', NA, 'bar', NA, NA, + NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('aafootwo'), u('aabartwo'), NA, + u('aabazqux')]) + + result = values.str.slice(2, 5) + exp = Series([u('foo'), u('bar'), NA, u('baz')]) + tm.assert_series_equal(result, exp) + + def test_slice_replace(self): + pass + + def test_strip_lstrip_rstrip(self): + values = Series([' aa ', ' bb \n', NA, 'cc ']) + + result = values.str.strip() + exp = Series(['aa', 'bb', NA, 'cc']) + tm.assert_series_equal(result, exp) + + result = values.str.lstrip() + exp = Series(['aa ', 'bb \n', NA, 'cc ']) + tm.assert_series_equal(result, exp) + + result = values.str.rstrip() + exp = Series([' aa', ' bb', NA, 'cc']) + tm.assert_series_equal(result, exp) + + def test_strip_lstrip_rstrip_mixed(self): + # mixed 
+ mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.strip() + xp = Series(['aa', NA, 'bb', NA, NA, + NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.lstrip() + xp = Series(['aa ', NA, 'bb \t\n', NA, NA, + NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + rs = Series(mixed).str.rstrip() + xp = Series([' aa', NA, ' bb', NA, NA, + NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + def test_strip_lstrip_rstrip_unicode(self): + # unicode + values = Series([u(' aa '), u(' bb \n'), NA, + u('cc ')]) + + result = values.str.strip() + exp = Series([u('aa'), u('bb'), NA, u('cc')]) + tm.assert_series_equal(result, exp) + + result = values.str.lstrip() + exp = Series([u('aa '), u('bb \n'), NA, u('cc ')]) + tm.assert_series_equal(result, exp) + + result = values.str.rstrip() + exp = Series([u(' aa'), u(' bb'), NA, u('cc')]) + tm.assert_series_equal(result, exp) + + def test_strip_lstrip_rstrip_args(self): + values = Series(['xxABCxx', 'xx BNSD', 'LDFJH xx']) + + rs = values.str.strip('x') + xp = Series(['ABC', ' BNSD', 'LDFJH ']) + assert_series_equal(rs, xp) + + rs = values.str.lstrip('x') + xp = Series(['ABCxx', ' BNSD', 'LDFJH xx']) + assert_series_equal(rs, xp) + + rs = values.str.rstrip('x') + xp = Series(['xxABC', 'xx BNSD', 'LDFJH ']) + assert_series_equal(rs, xp) + + def test_strip_lstrip_rstrip_args_unicode(self): + values = Series([u('xxABCxx'), u('xx BNSD'), + u('LDFJH xx')]) + + rs = values.str.strip(u('x')) + xp = Series(['ABC', ' BNSD', 'LDFJH ']) + assert_series_equal(rs, xp) + + rs = values.str.lstrip(u('x')) + xp = Series(['ABCxx', ' BNSD', 'LDFJH xx']) + assert_series_equal(rs, xp) + + rs = values.str.rstrip(u('x')) + xp = Series(['xxABC', 'xx BNSD', 'LDFJH ']) + assert_series_equal(rs, xp) + + def test_wrap(self): + # test values are: two words less than width, two words equal to width, + # two words greater than width, one word less than width, one word + # equal to width, one word greater than width, multiple tokens with trailing + # whitespace equal to width + values = Series([u('hello world'), u('hello world!'), + u('hello world!!'), u('abcdefabcde'), + u('abcdefabcdef'), u('abcdefabcdefa'), + u('ab ab ab ab '), u('ab ab ab ab a'), + u('\t')]) + + # expected values + xp = Series([u('hello world'), u('hello world!'), + u('hello\nworld!!'), u('abcdefabcde'), + u('abcdefabcdef'), u('abcdefabcdef\na'), + u('ab ab ab ab'), u('ab ab ab ab\na'), + u('')]) + + rs = values.str.wrap(12, break_long_words=True) + assert_series_equal(rs, xp) + + # test with pre and post whitespace (non-unicode), NaN, and non-ascii Unicode + values = Series([' pre ', np.nan, u('\xac\u20ac\U00008000 abadcafe')]) + xp = Series([' pre', NA, u('\xac\u20ac\U00008000 ab\nadcafe')]) + rs = values.str.wrap(6) + assert_series_equal(rs, xp) + + def test_get(self): + values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + + result = values.str.split('_').str.get(1) + expected = Series(['b', 'd', np.nan, 'g']) + tm.assert_series_equal(result, expected) + + # mixed + mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), + None, 1, 2.]) + + rs = Series(mixed).str.split('_').str.get(1) + xp = Series(['b', NA, 'd', NA, NA, + NA, NA, NA]) + + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + + # unicode + values = Series([u('a_b_c'), u('c_d_e'), np.nan, + u('f_g_h')]) + + result = values.str.split('_').str.get(1) + 
expected = Series([u('b'), u('d'), np.nan, u('g')]) + tm.assert_series_equal(result, expected) + + def test_more_contains(self): + # PR #1179 + import re + + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, + 'CABA', 'dog', 'cat']) + + result = s.str.contains('a') + expected = Series([False, False, False, True, True, False, np.nan, + False, False, True]) + assert_series_equal(result, expected) + + result = s.str.contains('a', case=False) + expected = Series([True, False, False, True, True, False, np.nan, + True, False, True]) + assert_series_equal(result, expected) + + result = s.str.contains('Aa') + expected = Series([False, False, False, True, False, False, np.nan, + False, False, False]) + assert_series_equal(result, expected) + + result = s.str.contains('ba') + expected = Series([False, False, False, True, False, False, np.nan, + False, False, False]) + assert_series_equal(result, expected) + + result = s.str.contains('ba', case=False) + expected = Series([False, False, False, True, True, False, np.nan, + True, False, False]) + assert_series_equal(result, expected) + + def test_more_replace(self): + # PR #1179 + import re + s = Series(['A', 'B', 'C', 'Aaba', 'Baca', + '', NA, 'CABA', 'dog', 'cat']) + + result = s.str.replace('A', 'YYY') + expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA, + 'CYYYBYYY', 'dog', 'cat']) + assert_series_equal(result, expected) + + result = s.str.replace('A', 'YYY', case=False) + expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA, + 'CYYYBYYY', 'dog', 'cYYYt']) + assert_series_equal(result, expected) + + result = s.str.replace('^.a|dog', 'XX-XX ', case=False) + expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA, + 'XX-XX BA', 'XX-XX ', 'XX-XX t']) + assert_series_equal(result, expected) + + def test_string_slice_get_syntax(self): + s = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', NA, + 'CYYYBYYY', 'dog', 'cYYYt']) + + result = s.str[0] + expected = s.str.get(0) + assert_series_equal(result, expected) + + result = s.str[:3] + expected = s.str.slice(stop=3) + assert_series_equal(result, expected) + + def test_string_slice_out_of_bounds(self): + s = Series([(1, 2), (1,), (3,4,5)]) + + result = s.str[1] + expected = Series([2, np.nan, 4]) + + assert_series_equal(result, expected) + + s = Series(['foo', 'b', 'ba']) + result = s.str[1] + expected = Series(['o', np.nan, 'a']) + assert_series_equal(result, expected) + + def test_match_findall_flags(self): + data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', + 'Rob': 'rob@gmail.com', 'Wes': np.nan} + data = Series(data) + + pat = pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})' + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + result = data.str.match(pat, flags=re.IGNORECASE) + assert issubclass(w[-1].category, UserWarning) + self.assertEqual(result[0], ('dave', 'google', 'com')) + + result = data.str.findall(pat, flags=re.IGNORECASE) + self.assertEqual(result[0][0], ('dave', 'google', 'com')) + + result = data.str.count(pat, flags=re.IGNORECASE) + self.assertEqual(result[0], 1) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + result = data.str.contains(pat, flags=re.IGNORECASE) + assert issubclass(w[-1].category, UserWarning) + self.assertEqual(result[0], True) + + def test_encode_decode(self): + base = Series([u('a'), u('b'), u('a\xe4')]) + series = base.str.encode('utf-8') + + f = lambda x: x.decode('utf-8') + result = series.str.decode('utf-8') + exp = series.map(f) 
+ + tm.assert_series_equal(result, exp) + + def test_encode_decode_errors(self): + encodeBase = Series([u('a'), u('b'), u('a\x9d')]) + + self.assertRaises(UnicodeEncodeError, + encodeBase.str.encode, 'cp1252') + + f = lambda x: x.encode('cp1252', 'ignore') + result = encodeBase.str.encode('cp1252', 'ignore') + exp = encodeBase.map(f) + tm.assert_series_equal(result, exp) + + decodeBase = Series([b'a', b'b', b'a\x9d']) + + self.assertRaises(UnicodeDecodeError, + decodeBase.str.decode, 'cp1252') + + f = lambda x: x.decode('cp1252', 'ignore') + result = decodeBase.str.decode('cp1252', 'ignore') + exp = decodeBase.map(f) + + tm.assert_series_equal(result, exp) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py new file mode 100644 index 00000000..298fa73c --- /dev/null +++ b/pandas/tests/test_testing.py @@ -0,0 +1,186 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +import pandas as pd +import unittest +import warnings +import nose +import numpy as np +import sys +from pandas import Series +from pandas.util.testing import ( + assert_almost_equal, assertRaisesRegexp, raise_with_traceback, assert_series_equal, + RNGContext +) + +# let's get meta. + +class TestAssertAlmostEqual(unittest.TestCase): + _multiprocess_can_split_ = True + + def _assert_almost_equal_both(self, a, b, **kwargs): + assert_almost_equal(a, b, **kwargs) + assert_almost_equal(b, a, **kwargs) + + def _assert_not_almost_equal_both(self, a, b, **kwargs): + self.assertRaises(AssertionError, assert_almost_equal, a, b, **kwargs) + self.assertRaises(AssertionError, assert_almost_equal, b, a, **kwargs) + + def test_assert_almost_equal_numbers(self): + self._assert_almost_equal_both(1.1, 1.1) + self._assert_almost_equal_both(1.1, 1.100001) + self._assert_almost_equal_both(np.int16(1), 1.000001) + self._assert_almost_equal_both(np.float64(1.1), 1.1) + self._assert_almost_equal_both(np.uint32(5), 5) + + self._assert_not_almost_equal_both(1.1, 1) + self._assert_not_almost_equal_both(1.1, True) + self._assert_not_almost_equal_both(1, 2) + self._assert_not_almost_equal_both(1.0001, np.int16(1)) + + def test_assert_almost_equal_numbers_with_zeros(self): + self._assert_almost_equal_both(0, 0) + self._assert_almost_equal_both(0.000001, 0) + + self._assert_not_almost_equal_both(0.001, 0) + self._assert_not_almost_equal_both(1, 0) + + def test_assert_almost_equal_numbers_with_mixed(self): + self._assert_not_almost_equal_both(1, 'abc') + self._assert_not_almost_equal_both(1, [1,]) + self._assert_not_almost_equal_both(1, object()) + + def test_assert_almost_equal_edge_case_ndarrays(self): + self._assert_almost_equal_both(np.array([], dtype='M8[ns]'), + np.array([], dtype='float64')) + self._assert_almost_equal_both(np.array([], dtype=str), + np.array([], dtype='int64')) + + def test_assert_almost_equal_dicts(self): + self._assert_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 2}) + + self._assert_not_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 3}) + self._assert_not_almost_equal_both( + {'a': 1, 'b': 2}, {'a': 1, 'b': 2, 'c': 3} + ) + self._assert_not_almost_equal_both({'a': 1}, 1) + self._assert_not_almost_equal_both({'a': 1}, 'abc') + self._assert_not_almost_equal_both({'a': 1}, [1,]) + + def test_assert_almost_equal_dict_like_object(self): + class DictLikeObj(object): + def keys(self): + return ('a',) + + def __getitem__(self, item): + if item == 'a': + return 1 + + 
self._assert_almost_equal_both({'a': 1}, DictLikeObj()) + + self._assert_not_almost_equal_both({'a': 2}, DictLikeObj()) + + def test_assert_almost_equal_strings(self): + self._assert_almost_equal_both('abc', 'abc') + + self._assert_not_almost_equal_both('abc', 'abcd') + self._assert_not_almost_equal_both('abc', 'abd') + self._assert_not_almost_equal_both('abc', 1) + self._assert_not_almost_equal_both('abc', [1,]) + + def test_assert_almost_equal_iterables(self): + self._assert_almost_equal_both([1, 2, 3], [1, 2, 3]) + self._assert_almost_equal_both(np.array([1, 2, 3]), [1, 2, 3]) + + # Can't compare generators + self._assert_not_almost_equal_both(iter([1, 2, 3]), [1, 2, 3]) + + self._assert_not_almost_equal_both([1, 2, 3], [1, 2, 4]) + self._assert_not_almost_equal_both([1, 2, 3], [1, 2, 3, 4]) + self._assert_not_almost_equal_both([1, 2, 3], 1) + + def test_assert_almost_equal_null(self): + self._assert_almost_equal_both(None, None) + self._assert_almost_equal_both(None, np.NaN) + + self._assert_not_almost_equal_both(None, 0) + self._assert_not_almost_equal_both(np.NaN, 0) + + def test_assert_almost_equal_inf(self): + self._assert_almost_equal_both(np.inf, np.inf) + self._assert_almost_equal_both(np.inf, float("inf")) + + self._assert_not_almost_equal_both(np.inf, 0) + +class TestUtilTesting(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_raise_with_traceback(self): + with assertRaisesRegexp(LookupError, "error_text"): + try: + raise ValueError("THIS IS AN ERROR") + except ValueError as e: + e = LookupError("error_text") + raise_with_traceback(e) + with assertRaisesRegexp(LookupError, "error_text"): + try: + raise ValueError("This is another error") + except ValueError: + e = LookupError("error_text") + _, _, traceback = sys.exc_info() + raise_with_traceback(e, traceback) + +class TestAssertSeriesEqual(unittest.TestCase): + _multiprocess_can_split_ = True + + def _assert_equal(self, x, y, **kwargs): + assert_series_equal(x,y,**kwargs) + assert_series_equal(y,x,**kwargs) + + def _assert_not_equal(self, a, b, **kwargs): + self.assertRaises(AssertionError, assert_series_equal, a, b, **kwargs) + self.assertRaises(AssertionError, assert_series_equal, b, a, **kwargs) + + def test_equal(self): + self._assert_equal(Series(range(3)),Series(range(3))) + self._assert_equal(Series(list('abc')),Series(list('abc'))) + + def test_not_equal(self): + self._assert_not_equal(Series(range(3)),Series(range(3))+1) + self._assert_not_equal(Series(list('abc')),Series(list('xyz'))) + self._assert_not_equal(Series(range(3)),Series(range(4))) + self._assert_not_equal(Series(range(3)),Series(range(3),dtype='float64')) + self._assert_not_equal(Series(range(3)),Series(range(3),index=[1,2,4])) + + # ATM meta data is not checked in assert_series_equal + # self._assert_not_equal(Series(range(3)),Series(range(3),name='foo'),check_names=True) + + def test_less_precise(self): + s1 = Series([0.12345],dtype='float64') + s2 = Series([0.12346],dtype='float64') + + self.assertRaises(AssertionError, assert_series_equal, s1, s2) + self._assert_equal(s1,s2,check_less_precise=True) + + s1 = Series([0.12345],dtype='float32') + s2 = Series([0.12346],dtype='float32') + + self.assertRaises(AssertionError, assert_series_equal, s1, s2) + self._assert_equal(s1,s2,check_less_precise=True) + + # even less than less precise + s1 = Series([0.1235],dtype='float32') + s2 = Series([0.1236],dtype='float32') + + self.assertRaises(AssertionError, assert_series_equal, s1, s2) + self.assertRaises(AssertionError, assert_series_equal, 
s1, s2, True) + +class TestRNGContext(unittest.TestCase): + + def test_RNGContext(self): + expected0 = 1.764052345967664 + expected1 = 1.6243453636632417 + + with RNGContext(0): + with RNGContext(1): + self.assertEqual(np.random.randn(), expected1) + self.assertEqual(np.random.randn(), expected0) diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py new file mode 100644 index 00000000..d5f7a536 --- /dev/null +++ b/pandas/tests/test_tseries.py @@ -0,0 +1,722 @@ + +import nose +from numpy import nan +import numpy as np +from pandas import Index, isnull, Timestamp +from pandas.util.testing import assert_almost_equal +import pandas.util.testing as tm +from pandas.compat import range, lrange, zip +import pandas.lib as lib +import pandas.algos as algos + + +class TestTseriesUtil(tm.TestCase): + _multiprocess_can_split_ = True + + def test_combineFunc(self): + pass + + def test_reindex(self): + pass + + def test_isnull(self): + pass + + def test_groupby(self): + pass + + def test_groupby_withnull(self): + pass + + def test_backfill(self): + old = Index([1, 5, 10]) + new = Index(lrange(12)) + + filler = algos.backfill_int64(old, new) + + expect_filler = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1] + self.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([1, 4]) + new = Index(lrange(5, 10)) + filler = algos.backfill_int64(old, new) + + expect_filler = [-1, -1, -1, -1, -1] + self.assert_numpy_array_equal(filler, expect_filler) + + def test_pad(self): + old = Index([1, 5, 10]) + new = Index(lrange(12)) + + filler = algos.pad_int64(old, new) + + expect_filler = [-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2] + self.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([5, 10]) + new = Index(lrange(5)) + filler = algos.pad_int64(old, new) + expect_filler = [-1, -1, -1, -1, -1] + self.assert_numpy_array_equal(filler, expect_filler) + + +def test_left_join_indexer_unique(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + + result = algos.left_join_indexer_unique_int64(b, a) + expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) + assert(np.array_equal(result, expected)) + + +def test_left_outer_join_bug(): + left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, + 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, + 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, + 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, + 3, 1, 2, 0, 2], dtype=np.int64) + + right = np.array([3, 1], dtype=np.int64) + max_groups = 4 + + lidx, ridx = algos.left_outer_join(left, right, max_groups, sort=False) + + exp_lidx = np.arange(len(left)) + exp_ridx = -np.ones(len(left)) + exp_ridx[left == 1] = 1 + exp_ridx[left == 3] = 0 + + assert(np.array_equal(lidx, exp_lidx)) + assert(np.array_equal(ridx, exp_ridx)) + + +def test_inner_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = algos.inner_join_indexer_int64(a, b) + + index_exp = np.array([3, 5], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([2, 4]) + bexp = np.array([1, 2]) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = algos.inner_join_indexer_int64(a, b) + assert_almost_equal(index, [5]) + assert_almost_equal(ares, [0]) + 
assert_almost_equal(bres, [0]) + + +def test_outer_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = algos.outer_join_indexer_int64(a, b) + + index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4]) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = algos.outer_join_indexer_int64(a, b) + assert_almost_equal(index, [5]) + assert_almost_equal(ares, [0]) + assert_almost_equal(bres, [0]) + + +def test_left_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = algos.left_join_indexer_int64(a, b) + + assert_almost_equal(index, a) + + aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = algos.left_join_indexer_int64(a, b) + assert_almost_equal(index, [5]) + assert_almost_equal(ares, [0]) + assert_almost_equal(bres, [0]) + + +def test_left_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = algos.left_join_indexer_int64(idx2, idx) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_outer_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = algos.outer_join_indexer_int64(idx2, idx) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_inner_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = algos.inner_join_indexer_int64(idx2, idx) + + exp_res = np.array([1, 1, 2, 5], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_is_lexsorted(): + failure = [ + np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, + 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]), + np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 
24, 23, 22, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, + 4, 3, 2, 1, 0])] + + assert(not algos.is_lexsorted(failure)) + +# def test_get_group_index(): +# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) +# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64) +# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) + +# result = lib.get_group_index([a, b], (3, 4)) + +# assert(np.array_equal(result, expected)) + + +def test_groupsort_indexer(): + a = np.random.randint(0, 1000, 100).astype(np.int64) + b = np.random.randint(0, 1000, 100).astype(np.int64) + + result = algos.groupsort_indexer(a, 1000)[0] + + # need to use a stable sort + expected = np.argsort(a, kind='mergesort') + assert(np.array_equal(result, expected)) + + # compare with lexsort + key = a * 1000 + b + result = algos.groupsort_indexer(key, 1000000)[0] + expected = np.lexsort((b, a)) + assert(np.array_equal(result, expected)) + + +def test_ensure_platform_int(): + arr = np.arange(100) + + result = algos.ensure_platform_int(arr) + assert(result is arr) + + +def test_duplicated_with_nas(): + keys = np.array([0, 1, nan, 0, 2, nan], dtype=object) + + result = lib.duplicated(keys) + expected = [False, False, False, True, False, True] + assert(np.array_equal(result, expected)) + + result = lib.duplicated(keys, take_last=True) + expected = [True, False, True, False, False, False] + assert(np.array_equal(result, expected)) + + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, nan, nan] * 2, [0, nan, 0, nan] * 2)): + keys[i] = t + + result = lib.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = falses + trues + assert(np.array_equal(result, expected)) + + result = lib.duplicated(keys, take_last=True) + expected = trues + falses + assert(np.array_equal(result, expected)) + + +def test_maybe_booleans_to_slice(): + arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8) + result = lib.maybe_booleans_to_slice(arr) + assert(result.dtype == np.bool_) + + result = lib.maybe_booleans_to_slice(arr[:0]) + assert(result == slice(0, 0)) + + +def test_convert_objects(): + arr = np.array(['a', 'b', nan, nan, 'd', 'e', 'f'], dtype='O') + result = lib.maybe_convert_objects(arr) + assert(result.dtype == np.object_) + + +def test_convert_infs(): + arr = np.array(['inf', 'inf', 'inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + assert(result.dtype == np.float64) + + arr = np.array(['-inf', '-inf', '-inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + assert(result.dtype == np.float64) + + +def test_convert_objects_ints(): + # test that we can detect many kinds of integers + dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + + for dtype_str in dtypes: + arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') + assert(arr[0].dtype == np.dtype(dtype_str)) + result = lib.maybe_convert_objects(arr) + assert(issubclass(result.dtype.type, np.integer)) + + +def test_convert_objects_complex_number(): + for dtype in np.sctypes['complex']: + arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') + assert(arr[0].dtype == np.dtype(dtype)) + result = lib.maybe_convert_objects(arr) + assert(issubclass(result.dtype.type, np.complexfloating)) + + +def test_rank(): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + def _check(arr): + mask = ~np.isfinite(arr) + arr = arr.copy() + result = algos.rank_1d_float64(arr) + arr[mask] = np.inf + exp = rankdata(arr) + exp[mask] = nan + assert_almost_equal(result, 
exp) + + _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) + _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + + +def test_get_reverse_indexer(): + indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) + result = lib.get_reverse_indexer(indexer, 5) + expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) + assert(np.array_equal(result, expected)) + + +def test_pad_backfill_object_segfault(): + from datetime import datetime + old = np.array([], dtype='O') + new = np.array([datetime(2010, 12, 31)], dtype='O') + + result = algos.pad_object(old, new) + expected = np.array([-1], dtype=np.int64) + assert(np.array_equal(result, expected)) + + result = algos.pad_object(new, old) + expected = np.array([], dtype=np.int64) + assert(np.array_equal(result, expected)) + + result = algos.backfill_object(old, new) + expected = np.array([-1], dtype=np.int64) + assert(np.array_equal(result, expected)) + + result = algos.backfill_object(new, old) + expected = np.array([], dtype=np.int64) + assert(np.array_equal(result, expected)) + + +def test_arrmap(): + values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') + result = algos.arrmap_object(values, lambda x: x in ['foo', 'bar']) + assert(result.dtype == np.bool_) + + +def test_series_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + grouper = lib.SeriesGrouper(obj, np.mean, labels, 2, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + + +def test_series_bin_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + bins = np.array([3, 6]) + + grouper = lib.SeriesBinGrouper(obj, np.mean, bins, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + + +class TestBinGroupers(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.obj = np.random.randn(10, 1) + self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) + self.bins = np.array([3, 6], dtype=np.int64) + + def test_generate_bins(self): + from pandas.core.groupby import generate_bins_generic + values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + binner = np.array([0, 3, 6, 9], dtype=np.int64) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + bins = func(values, binner, closed='left') + assert((bins == np.array([2, 5, 6])).all()) + + bins = func(values, binner, closed='right') + assert((bins == np.array([3, 6, 6])).all()) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + binner = np.array([0, 3, 6], dtype=np.int64) + + bins = func(values, binner, closed='right') + assert((bins == np.array([3, 6])).all()) + + self.assertRaises(ValueError, generate_bins_generic, values, [], + 'right') + self.assertRaises(ValueError, generate_bins_generic, values[:0], + binner, 'right') + + self.assertRaises(ValueError, generate_bins_generic, + values, [4], 'right') + self.assertRaises(ValueError, generate_bins_generic, + values, [-3, -1], 'right') + + def test_group_bin_functions(self): + + dtypes = 
['float32','float64'] + funcs = ['add', 'mean', 'prod', 'min', 'max', 'var'] + + np_funcs = { + 'add': np.sum, + 'mean': np.mean, + 'prod': np.prod, + 'min': np.min, + 'max': np.max, + 'var': lambda x: x.var(ddof=1) if len(x) >= 2 else np.nan + } + + for fname in funcs: + for d in dtypes: + check_less_precise = False + if d == 'float32': + check_less_precise = True + args = [getattr(algos, 'group_%s_%s' % (fname,d)), + getattr(algos, 'group_%s_bin_%s' % (fname,d)), + np_funcs[fname], + d, + check_less_precise] + self._check_versions(*args) + + def _check_versions(self, irr_func, bin_func, np_func, dtype, check_less_precise): + obj = self.obj.astype(dtype) + + cts = np.zeros(3, dtype=np.int64) + exp = np.zeros((3, 1), dtype) + irr_func(exp, cts, obj, self.labels) + + # bin-based version + bins = np.array([3, 6], dtype=np.int64) + out = np.zeros((3, 1), dtype) + counts = np.zeros(len(out), dtype=np.int64) + bin_func(out, counts, obj, bins) + + assert_almost_equal(out, exp, check_less_precise=check_less_precise) + + bins = np.array([3, 9, 10], dtype=np.int64) + out = np.zeros((3, 1), dtype) + counts = np.zeros(len(out), dtype=np.int64) + bin_func(out, counts, obj, bins) + exp = np.array([np_func(obj[:3]), np_func(obj[3:9]), + np_func(obj[9:])], + dtype=dtype) + assert_almost_equal(out.squeeze(), exp, check_less_precise=check_less_precise) + + # duplicate bins + bins = np.array([3, 6, 10, 10], dtype=np.int64) + out = np.zeros((4, 1), dtype) + counts = np.zeros(len(out), dtype=np.int64) + bin_func(out, counts, obj, bins) + exp = np.array([np_func(obj[:3]), np_func(obj[3:6]), + np_func(obj[6:10]), np.nan], + dtype=dtype) + assert_almost_equal(out.squeeze(), exp, check_less_precise=check_less_precise) + + +def test_group_ohlc(): + + def _check(dtype): + obj = np.array(np.random.randn(20),dtype=dtype) + + bins = np.array([6, 12], dtype=np.int64) + out = np.zeros((3, 4), dtype) + counts = np.zeros(len(out), dtype=np.int64) + + func = getattr(algos,'group_ohlc_%s' % dtype) + func(out, counts, obj[:, None], bins) + + def _ohlc(group): + if isnull(group).all(): + return np.repeat(nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), + _ohlc(obj[12:])]) + + assert_almost_equal(out, expected) + assert_almost_equal(counts, [6, 6, 8]) + + obj[:6] = nan + func(out, counts, obj[:, None], bins) + expected[0] = nan + assert_almost_equal(out, expected) + + _check('float32') + _check('float64') + +def test_try_parse_dates(): + from dateutil.parser import parse + + arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) + + result = lib.try_parse_dates(arr, dayfirst=True) + expected = [parse(d, dayfirst=True) for d in arr] + assert(np.array_equal(result, expected)) + + +class TestTypeInference(tm.TestCase): + _multiprocess_can_split_ = True + + def test_length_zero(self): + result = lib.infer_dtype(np.array([], dtype='i4')) + self.assertEqual(result, 'integer') + + result = lib.infer_dtype([]) + self.assertEqual(result, 'empty') + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='i4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + def test_bools(self): + arr = np.array([True, False, True, True, True], 
dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([True, False, True, 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + arr = np.array([True, False, True], dtype=bool) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + def test_floats(self): + arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='f4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, 4, 5], dtype='f8') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + def test_string(self): + pass + + def test_unicode(self): + pass + + def test_datetime(self): + import datetime + dates = [datetime.datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'datetime64') + + def test_date(self): + import datetime + dates = [datetime.date(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'date') + + def test_to_object_array_tuples(self): + r = (5, 6) + values = [r] + result = lib.to_object_array_tuples(values) + + try: + # make sure record array works + from collections import namedtuple + record = namedtuple('record', 'x y') + r = record(5, 6) + values = [r] + result = lib.to_object_array_tuples(values) + except ImportError: + pass + + def test_object(self): + + # GH 7431 + # cannot infer more than this as only a single element + arr = np.array([None],dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + +class TestMoments(tm.TestCase): + pass + + +class TestReducer(tm.TestCase): + + def test_int_index(self): + from pandas.core.series import Series + + arr = np.random.randn(100, 4) + result = lib.reduce(arr, np.sum, labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + result = lib.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(100)) + result = lib.reduce( + arr, np.sum, dummy=dummy, labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(4)) + result = lib.reduce(arr, np.sum, axis=1, + dummy=dummy, labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + + +class TestTsUtil(tm.TestCase): + def test_min_valid(self): + # Ensure that Timestamp.min is a valid Timestamp + Timestamp(Timestamp.min) + + def test_max_valid(self): + # Ensure that Timestamp.max is a valid Timestamp + Timestamp(Timestamp.max) + + def test_to_datetime_bijective(self): + # Ensure that converting to datetime and back only loses precision + # by going from nanoseconds to microseconds. 
+ self.assertEqual(Timestamp(Timestamp.max.to_pydatetime()).value/1000, Timestamp.max.value/1000) + self.assertEqual(Timestamp(Timestamp.min.to_pydatetime()).value/1000, Timestamp.min.value/1000) + +class TestPeriodField(tm.TestCase): + + def test_get_period_field_raises_on_out_of_range(self): + from pandas import tslib + self.assertRaises(ValueError, tslib.get_period_field, -1, 0, 0) + + def test_get_period_field_array_raises_on_out_of_range(self): + from pandas import tslib + self.assertRaises(ValueError, tslib.get_period_field_arr, -1, np.empty(1), 0) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/__init__.py b/pandas/tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tools/describe.py b/pandas/tools/describe.py new file mode 100644 index 00000000..eca5a800 --- /dev/null +++ b/pandas/tools/describe.py @@ -0,0 +1,17 @@ +from pandas.core.series import Series + + +def value_range(df): + """ + Return the minimum and maximum of a dataframe in a series object + + Parameters + ---------- + df : DataFrame + + Returns + ------- + (maximum, minimum) : Series + + """ + return Series((min(df.min()), max(df.max())), ('Minimum', 'Maximum')) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py new file mode 100644 index 00000000..d17e2e2d --- /dev/null +++ b/pandas/tools/merge.py @@ -0,0 +1,1093 @@ +""" +SQL-style merge routines +""" +import types + +import numpy as np +from pandas.compat import range, long, lrange, lzip, zip +import pandas.compat as compat +from pandas.core.categorical import Categorical +from pandas.core.frame import DataFrame, _merge_doc +from pandas.core.generic import NDFrame +from pandas.core.groupby import get_group_index +from pandas.core.series import Series +from pandas.core.index import (Index, MultiIndex, _get_combined_index, + _ensure_index, _get_consensus_names, + _all_indexes_same) +from pandas.core.internals import (items_overlap_with_suffix, + concatenate_block_managers) +from pandas.util.decorators import Appender, Substitution +from pandas.core.common import ABCSeries +from pandas.io.parsers import TextFileReader + +import pandas.core.common as com + +import pandas.lib as lib +import pandas.algos as algos +import pandas.hashtable as _hash + + +@Substitution('\nleft : DataFrame') +@Appender(_merge_doc, indents=0) +def merge(left, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=False, + suffixes=('_x', '_y'), copy=True): + op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, + right_on=right_on, left_index=left_index, + right_index=right_index, sort=sort, suffixes=suffixes, + copy=copy) + return op.get_result() +if __debug__: + merge.__doc__ = _merge_doc % '\nleft : DataFrame' + + +class MergeError(Exception): + pass + + +def ordered_merge(left, right, on=None, left_by=None, right_by=None, + left_on=None, right_on=None, + fill_method=None, suffixes=('_x', '_y')): + """Perform merge with optional filling/interpolation designed for ordered + data like time series data. Optionally perform group-wise merge (see + examples) + + Parameters + ---------- + left : DataFrame + right : DataFrame + fill_method : {'ffill', None}, default None + Interpolation method for data + on : label or list + Field names to join on. Must be found in both DataFrames. + left_on : label or list, or array-like + Field names to join on in left DataFrame. 
Can be a vector or list of + vectors of the length of the DataFrame to use a particular vector as + the join key instead of columns + right_on : label or list, or array-like + Field names to join on in right DataFrame or vector/list of vectors per + left_on docs + left_by : column name or list of column names + Group left DataFrame by group columns and merge piece by piece with + right DataFrame + right_by : column name or list of column names + Group right DataFrame by group columns and merge piece by piece with + left DataFrame + suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively + + Examples + -------- + >>> A >>> B + key lvalue group key rvalue + 0 a 1 a 0 b 1 + 1 c 2 a 1 c 2 + 2 e 3 a 2 d 3 + 3 a 1 b + 4 c 2 b + 5 e 3 b + + >>> ordered_merge(A, B, fill_method='ffill', left_by='group') + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1 + 2 c 2 a 2 + 3 d 2 a 3 + 4 e 3 a 3 + 5 f 3 a 4 + 6 a 1 b NaN + 7 b 1 b 1 + 8 c 2 b 2 + 9 d 2 b 3 + 10 e 3 b 3 + 11 f 3 b 4 + + Returns + ------- + merged : DataFrame + """ + def _merger(x, y): + op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, + # left_index=left_index, right_index=right_index, + suffixes=suffixes, fill_method=fill_method) + return op.get_result() + + if left_by is not None and right_by is not None: + raise ValueError('Can only group either left or right frames') + elif left_by is not None: + if not isinstance(left_by, (list, tuple)): + left_by = [left_by] + pieces = [] + for key, xpiece in left.groupby(left_by): + merged = _merger(xpiece, right) + for k in left_by: + # May have passed ndarray + try: + if k in merged: + merged[k] = key + except: + pass + pieces.append(merged) + return concat(pieces, ignore_index=True) + elif right_by is not None: + if not isinstance(right_by, (list, tuple)): + right_by = [right_by] + pieces = [] + for key, ypiece in right.groupby(right_by): + merged = _merger(left, ypiece) + for k in right_by: + try: + if k in merged: + merged[k] = key + except: + pass + pieces.append(merged) + return concat(pieces, ignore_index=True) + else: + return _merger(left, right) + + +# TODO: transformations?? 
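[Editorial note: a minimal usage sketch of the group-wise ordered merge documented above. The frames and column names (A, B, 'key', 'lvalue', 'rvalue', 'group') are illustrative only and mirror the docstring example; the import path simply follows the file added in this patch (pandas/tools/merge.py), and behaviour is assumed to match the pandas 0.14.x API shown here.]

    import pandas as pd
    from pandas.tools.merge import ordered_merge

    # Left frame: ordered observations, repeated once per group.
    A = pd.DataFrame({'key': ['a', 'c', 'e', 'a', 'c', 'e'],
                      'lvalue': [1, 2, 3, 1, 2, 3],
                      'group': ['a'] * 3 + ['b'] * 3})

    # Right frame: values keyed on the same ordered key space.
    B = pd.DataFrame({'key': ['b', 'c', 'd'],
                      'rvalue': [1, 2, 3]})

    # Merge each 'group' of A against B as an outer, ordered merge,
    # forward-filling the gaps introduced by keys missing on either side.
    merged = ordered_merge(A, B, fill_method='ffill', left_by='group')
    print(merged)

The result is one outer-joined block per value of 'group', with 'lvalue' and 'rvalue' forward-filled within each block, as in the docstring example above.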
+# TODO: only copy DataFrames when modification necessary +class _MergeOperation(object): + """ + Perform a database (SQL) merge operation between two DataFrame objects + using either columns as keys or their row indexes + """ + + def __init__(self, left, right, how='inner', on=None, + left_on=None, right_on=None, axis=1, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True): + self.left = self.orig_left = left + self.right = self.orig_right = right + self.how = how + self.axis = axis + + self.on = com._maybe_make_list(on) + self.left_on = com._maybe_make_list(left_on) + self.right_on = com._maybe_make_list(right_on) + + self.copy = copy + self.suffixes = suffixes + self.sort = sort + + self.left_index = left_index + self.right_index = right_index + + # note this function has side effects + (self.left_join_keys, + self.right_join_keys, + self.join_names) = self._get_merge_keys() + + def get_result(self): + join_index, left_indexer, right_indexer = self._get_join_info() + + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) + + lindexers = {1: left_indexer} if left_indexer is not None else {} + rindexers = {1: right_indexer} if right_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) + + result = DataFrame(result_data).__finalize__(self, method='merge') + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): + # insert group keys + + keys = zip(self.join_names, self.left_on, self.right_on) + for i, (name, lname, rname) in enumerate(keys): + if not _should_fill(lname, rname): + continue + + if name in result: + key_col = result[name] + + if left_indexer is not None and right_indexer is not None: + + if name in self.left: + na_indexer = (left_indexer == -1).nonzero()[0] + if len(na_indexer) == 0: + continue + + right_na_indexer = right_indexer.take(na_indexer) + key_col.put( + na_indexer, com.take_1d(self.right_join_keys[i], + right_na_indexer)) + elif name in self.right: + na_indexer = (right_indexer == -1).nonzero()[0] + if len(na_indexer) == 0: + continue + + left_na_indexer = left_indexer.take(na_indexer) + key_col.put(na_indexer, com.take_1d(self.left_join_keys[i], + left_na_indexer)) + + elif left_indexer is not None: + if name is None: + name = 'key_%d' % i + + # a faster way? 
+ key_col = com.take_1d(self.left_join_keys[i], left_indexer) + na_indexer = (left_indexer == -1).nonzero()[0] + right_na_indexer = right_indexer.take(na_indexer) + key_col.put(na_indexer, com.take_1d(self.right_join_keys[i], + right_na_indexer)) + result.insert(i, name, key_col) + + def _get_join_info(self): + left_ax = self.left._data.axes[self.axis] + right_ax = self.right._data.axes[self.axis] + if self.left_index and self.right_index: + join_index, left_indexer, right_indexer = \ + left_ax.join(right_ax, how=self.how, return_indexers=True) + elif self.right_index and self.how == 'left': + join_index, left_indexer, right_indexer = \ + _left_join_on_index(left_ax, right_ax, self.left_join_keys, + sort=self.sort) + + elif self.left_index and self.how == 'right': + join_index, right_indexer, left_indexer = \ + _left_join_on_index(right_ax, left_ax, self.right_join_keys, + sort=self.sort) + else: + (left_indexer, + right_indexer) = _get_join_indexers(self.left_join_keys, + self.right_join_keys, + sort=self.sort, how=self.how) + + if self.right_index: + join_index = self.left.index.take(left_indexer) + elif self.left_index: + join_index = self.right.index.take(right_indexer) + else: + join_index = Index(np.arange(len(left_indexer))) + + return join_index, left_indexer, right_indexer + + def _get_merge_data(self): + """ + Handles overlapping column names etc. + """ + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf) + + if not llabels.equals(ldata.items): + ldata = ldata.copy(deep=False) + ldata.set_axis(0, llabels) + + if not rlabels.equals(rdata.items): + rdata = rdata.copy(deep=False) + rdata.set_axis(0, rlabels) + + return ldata, rdata + + def _get_merge_keys(self): + """ + Note: has side effects (copy/delete key columns) + + Parameters + ---------- + left + right + on + + Returns + ------- + left_keys, right_keys + """ + self._validate_specification() + + left_keys = [] + right_keys = [] + join_names = [] + right_drop = [] + left_drop = [] + left, right = self.left, self.right + + is_lkey = lambda x: isinstance(x, (np.ndarray, ABCSeries)) and len(x) == len(left) + is_rkey = lambda x: isinstance(x, (np.ndarray, ABCSeries)) and len(x) == len(right) + + # ugh, spaghetti re #733 + if _any(self.left_on) and _any(self.right_on): + for lk, rk in zip(self.left_on, self.right_on): + if is_lkey(lk): + left_keys.append(lk) + if is_rkey(rk): + right_keys.append(rk) + join_names.append(None) # what to do? 
+ else: + right_keys.append(right[rk].values) + join_names.append(rk) + else: + if not is_rkey(rk): + right_keys.append(right[rk].values) + if lk == rk: + # avoid key upcast in corner case (length-0) + if len(left) > 0: + right_drop.append(rk) + else: + left_drop.append(lk) + else: + right_keys.append(rk) + left_keys.append(left[lk].values) + join_names.append(lk) + elif _any(self.left_on): + for k in self.left_on: + if is_lkey(k): + left_keys.append(k) + join_names.append(None) + else: + left_keys.append(left[k].values) + join_names.append(k) + if isinstance(self.right.index, MultiIndex): + right_keys = [lev.values.take(lab) + for lev, lab in zip(self.right.index.levels, + self.right.index.labels)] + else: + right_keys = [self.right.index.values] + elif _any(self.right_on): + for k in self.right_on: + if is_rkey(k): + right_keys.append(k) + join_names.append(None) + else: + right_keys.append(right[k].values) + join_names.append(k) + if isinstance(self.left.index, MultiIndex): + left_keys = [lev.values.take(lab) + for lev, lab in zip(self.left.index.levels, + self.left.index.labels)] + else: + left_keys = [self.left.index.values] + + if left_drop: + self.left = self.left.drop(left_drop, axis=1) + + if right_drop: + self.right = self.right.drop(right_drop, axis=1) + + return left_keys, right_keys, join_names + + def _validate_specification(self): + # Hm, any way to make this logic less complicated?? + if (self.on is None and self.left_on is None + and self.right_on is None): + + if self.left_index and self.right_index: + self.left_on, self.right_on = (), () + elif self.left_index: + if self.right_on is None: + raise MergeError('Must pass right_on or right_index=True') + elif self.right_index: + if self.left_on is None: + raise MergeError('Must pass left_on or left_index=True') + else: + if not self.left.columns.is_unique: + raise MergeError("Left data columns not unique: %s" + % repr(self.left.columns)) + + if not self.right.columns.is_unique: + raise MergeError("Right data columns not unique: %s" + % repr(self.right.columns)) + + # use the common columns + common_cols = self.left.columns.intersection( + self.right.columns) + if len(common_cols) == 0: + raise MergeError('No common columns to perform merge on') + self.left_on = self.right_on = common_cols + elif self.on is not None: + if self.left_on is not None or self.right_on is not None: + raise MergeError('Can only pass on OR left_on and ' + 'right_on') + self.left_on = self.right_on = self.on + elif self.left_on is not None: + n = len(self.left_on) + if self.right_index: + if len(self.left_on) != self.right.index.nlevels: + raise ValueError('len(left_on) must equal the number ' + 'of levels in the index of "right"') + self.right_on = [None] * n + elif self.right_on is not None: + n = len(self.right_on) + if self.left_index: + if len(self.right_on) != self.left.index.nlevels: + raise ValueError('len(right_on) must equal the number ' + 'of levels in the index of "left"') + self.left_on = [None] * n + if len(self.right_on) != len(self.left_on): + raise ValueError("len(right_on) must equal len(left_on)") + + +def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): + """ + + Parameters + ---------- + + Returns + ------- + + """ + if len(left_keys) != len(right_keys): + raise AssertionError('left_key and right_keys must be the same length') + + left_labels = [] + right_labels = [] + group_sizes = [] + + for lk, rk in zip(left_keys, right_keys): + llab, rlab, count = _factorize_keys(lk, rk, sort=sort) + + 
left_labels.append(llab) + right_labels.append(rlab) + group_sizes.append(count) + + max_groups = long(1) + for x in group_sizes: + max_groups *= long(x) + + if max_groups > 2 ** 63: # pragma: no cover + left_group_key, right_group_key, max_groups = \ + _factorize_keys(lib.fast_zip(left_labels), + lib.fast_zip(right_labels)) + else: + left_group_key = get_group_index(left_labels, group_sizes) + right_group_key = get_group_index(right_labels, group_sizes) + + left_group_key, right_group_key, max_groups = \ + _factorize_keys(left_group_key, right_group_key, sort=sort) + + join_func = _join_functions[how] + return join_func(left_group_key, right_group_key, max_groups) + + +class _OrderedMerge(_MergeOperation): + + def __init__(self, left, right, on=None, by=None, left_on=None, + right_on=None, axis=1, left_index=False, right_index=False, + suffixes=('_x', '_y'), copy=True, + fill_method=None): + + self.fill_method = fill_method + + _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, + right_on=right_on, axis=axis, + left_index=left_index, + right_index=right_index, + how='outer', suffixes=suffixes, + sort=True # sorts when factorizing + ) + + def get_result(self): + join_index, left_indexer, right_indexer = self._get_join_info() + + # this is a bit kludgy + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) + + if self.fill_method == 'ffill': + left_join_indexer = algos.ffill_indexer(left_indexer) + right_join_indexer = algos.ffill_indexer(right_indexer) + else: + left_join_indexer = left_indexer + right_join_indexer = right_indexer + + lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) + + result = DataFrame(result_data) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + +def _get_multiindex_indexer(join_keys, index, sort=False): + shape = [] + labels = [] + for level, key in zip(index.levels, join_keys): + llab, rlab, count = _factorize_keys(level, key, sort=False) + labels.append(rlab) + shape.append(count) + + left_group_key = get_group_index(labels, shape) + right_group_key = get_group_index(index.labels, shape) + + left_group_key, right_group_key, max_groups = \ + _factorize_keys(left_group_key, right_group_key, + sort=False) + + left_indexer, right_indexer = \ + algos.left_outer_join(com._ensure_int64(left_group_key), + com._ensure_int64(right_group_key), + max_groups, sort=False) + + return left_indexer, right_indexer + + +def _get_single_indexer(join_key, index, sort=False): + left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) + + left_indexer, right_indexer = \ + algos.left_outer_join(com._ensure_int64(left_key), + com._ensure_int64(right_key), + count, sort=sort) + + return left_indexer, right_indexer + + +def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): + join_index = left_ax + left_indexer = None + + if len(join_keys) > 1: + if not ((isinstance(right_ax, MultiIndex) and + len(join_keys) == right_ax.nlevels)): + raise AssertionError("If more than one join key is given then " + "'right_ax' must be a MultiIndex and the " + "number of join keys must be the number of " + "levels in right_ax") + + left_tmp, 
right_indexer = \ + _get_multiindex_indexer(join_keys, right_ax, + sort=sort) + if sort: + left_indexer = left_tmp + join_index = left_ax.take(left_indexer) + else: + jkey = join_keys[0] + if sort: + left_indexer, right_indexer = \ + _get_single_indexer(jkey, right_ax, sort=sort) + join_index = left_ax.take(left_indexer) + else: + right_indexer = right_ax.get_indexer(jkey) + + return join_index, left_indexer, right_indexer + + +def _right_outer_join(x, y, max_groups): + right_indexer, left_indexer = algos.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer + +_join_functions = { + 'inner': algos.inner_join, + 'left': algos.left_outer_join, + 'right': _right_outer_join, + 'outer': algos.full_outer_join, +} + + +def _factorize_keys(lk, rk, sort=True): + if com._is_int_or_datetime_dtype(lk) and com._is_int_or_datetime_dtype(rk): + klass = _hash.Int64Factorizer + lk = com._ensure_int64(lk) + rk = com._ensure_int64(rk) + else: + klass = _hash.Factorizer + lk = com._ensure_object(lk) + rk = com._ensure_object(rk) + + rizer = klass(max(len(lk), len(rk))) + + llab = rizer.factorize(lk) + rlab = rizer.factorize(rk) + + count = rizer.get_count() + + if sort: + uniques = rizer.uniques.to_array() + llab, rlab = _sort_labels(uniques, llab, rlab) + + # NA group + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + + if lany or rany: + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 + + return llab, rlab, count + + +def _sort_labels(uniques, left, right): + if not isinstance(uniques, np.ndarray): + # tuplesafe + uniques = Index(uniques).values + + sorter = uniques.argsort() + + reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + new_left = reverse_indexer.take(com._ensure_platform_int(left)) + np.putmask(new_left, left == -1, -1) + + new_right = reverse_indexer.take(com._ensure_platform_int(right)) + np.putmask(new_right, right == -1, -1) + + return new_left, new_right + + +#---------------------------------------------------------------------- +# Concatenate DataFrame objects + + +def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, + keys=None, levels=None, names=None, verify_integrity=False): + """ + Concatenate pandas objects along a particular axis with optional set logic + along the other axes. Can also add a layer of hierarchical indexing on the + concatenation axis, which may be useful if the labels are the same (or + overlapping) on the passed axis number + + Parameters + ---------- + objs : list or dict of Series, DataFrame, or Panel objects + If a dict is passed, the sorted keys will be used as the `keys` + argument, unless it is passed, in which case the values will be + selected (see below). Any None objects will be dropped silently unless + they are all None in which case an Exception will be raised + axis : {0, 1, ...}, default 0 + The axis to concatenate along + join : {'inner', 'outer'}, default 'outer' + How to handle indexes on other axis(es) + join_axes : list of Index objects + Specific indexes to use for the other n - 1 axes instead of performing + inner/outer set logic + verify_integrity : boolean, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation + keys : sequence, default None + If multiple levels passed, should contain tuples. 
Construct + hierarchical index using the passed keys as the outermost level + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys + names : list, default None + Names for the levels in the resulting hierarchical index + ignore_index : boolean, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the the index values on the other + axes are still respected in the join. + + Notes + ----- + The keys, levels, and names arguments are all optional + + Returns + ------- + concatenated : type of objects + """ + op = _Concatenator(objs, axis=axis, join_axes=join_axes, + ignore_index=ignore_index, join=join, + keys=keys, levels=levels, names=names, + verify_integrity=verify_integrity) + return op.get_result() + + +class _Concatenator(object): + """ + Orchestrates a concatenation operation for BlockManagers + """ + + def __init__(self, objs, axis=0, join='outer', join_axes=None, + keys=None, levels=None, names=None, + ignore_index=False, verify_integrity=False): + if not isinstance(objs, (list,tuple,types.GeneratorType,dict,TextFileReader)): + raise TypeError('first argument must be a list-like of pandas ' + 'objects, you passed an object of type ' + '"{0}"'.format(type(objs).__name__)) + + if join == 'outer': + self.intersect = False + elif join == 'inner': + self.intersect = True + else: # pragma: no cover + raise ValueError('Only can inner (intersect) or outer (union) ' + 'join the other axis') + + if isinstance(objs, dict): + if keys is None: + keys = sorted(objs) + objs = [objs[k] for k in keys] + + if keys is None: + objs = [obj for obj in objs if obj is not None ] + else: + # #1649 + clean_keys = [] + clean_objs = [] + for k, v in zip(keys, objs): + if v is None: + continue + clean_keys.append(k) + clean_objs.append(v) + objs = clean_objs + keys = clean_keys + + if len(objs) == 0: + raise Exception('All objects passed were None') + + # consolidate data & figure out what our result ndim is going to be + ndims = set() + for obj in objs: + if not isinstance(obj, NDFrame): + raise TypeError("cannot concatenate a non-NDFrame object") + + # consolidate + obj.consolidate(inplace=True) + ndims.add(obj.ndim) + + # get the sample + # want the higest ndim that we have, and must be non-empty + # unless all objs are empty + sample = None + if len(ndims) > 1: + max_ndim = max(ndims) + for obj in objs: + if obj.ndim == max_ndim and np.sum(obj.shape): + sample = obj + break + + else: + # filter out the empties + # if we have not multi-index possibiltes + df = DataFrame([ obj.shape for obj in objs ]).sum(1) + non_empties = df[df!=0] + if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None): + objs = [ objs[i] for i in non_empties.index ] + sample = objs[0] + + if sample is None: + sample = objs[0] + self.objs = objs + + # Need to flip BlockManager axis in the DataFrame special case + self._is_frame = isinstance(sample, DataFrame) + if self._is_frame: + axis = 1 if axis == 0 else 0 + + self._is_series = isinstance(sample, ABCSeries) + if not 0 <= axis <= sample.ndim: + raise AssertionError("axis must be between 0 and {0}, " + "input was {1}".format(sample.ndim, axis)) + + # if we have mixed ndims, then convert to highest ndim + # creating column 
numbers as needed + if len(ndims) > 1: + current_column = 0 + max_ndim = sample.ndim + self.objs, objs = [], self.objs + for obj in objs: + + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim-1: + raise ValueError("cannot concatenate unaligned mixed " + "dimensional NDFrame objects") + + else: + name = getattr(obj,'name',None) + if ignore_index or name is None: + name = current_column + current_column += 1 + + # doing a row-wise concatenation so need everything + # to line up + if self._is_frame and axis == 1: + name = 0 + obj = sample._constructor({ name : obj }) + + self.objs.append(obj) + + # note: this is the BlockManager axis (since DataFrame is transposed) + self.axis = axis + self.join_axes = join_axes + self.keys = keys + self.names = names + self.levels = levels + + self.ignore_index = ignore_index + self.verify_integrity = verify_integrity + + self.new_axes = self._get_new_axes() + + def get_result(self): + if self._is_series: + if self.axis == 0: + new_data = com._concat_compat([x.get_values() for x in self.objs]) + name = com._consensus_name_attr(self.objs) + return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat') + else: + data = dict(zip(range(len(self.objs)), self.objs)) + index, columns = self.new_axes + tmpdf = DataFrame(data, index=index) + if columns is not None: + tmpdf.columns = columns + return tmpdf.__finalize__(self, method='concat') + else: + mgrs_indexers = [] + for obj in self.objs: + mgr = obj._data + indexers = {} + for ax, new_labels in enumerate(self.new_axes): + if ax == self.axis: + # Suppress reindexing on concat axis + continue + + obj_labels = mgr.axes[ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.reindex(new_labels)[1] + + mgrs_indexers.append((obj._data, indexers)) + + new_data = concatenate_block_managers( + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=True) + + return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') + + def _get_result_dim(self): + if self._is_series and self.axis == 1: + return 2 + else: + return self.objs[0].ndim + + def _get_new_axes(self): + ndim = self._get_result_dim() + new_axes = [None] * ndim + + if self.join_axes is None: + for i in range(ndim): + if i == self.axis: + continue + new_axes[i] = self._get_comb_axis(i) + else: + if len(self.join_axes) != ndim - 1: + raise AssertionError("length of join_axes must not be " + "equal to {0}".format(ndim - 1)) + + # ufff... + indices = lrange(ndim) + indices.remove(self.axis) + + for i, ax in zip(indices, self.join_axes): + new_axes[i] = ax + + new_axes[self.axis] = self._get_concat_axis() + return new_axes + + def _get_comb_axis(self, i): + if self._is_series: + all_indexes = [x.index for x in self.objs] + else: + try: + all_indexes = [x._data.axes[i] for x in self.objs] + except IndexError: + types = [type(x).__name__ for x in self.objs] + raise TypeError("Cannot concatenate list of %s" % types) + + return _get_combined_index(all_indexes, intersect=self.intersect) + + def _get_concat_axis(self): + """ + Return index to be used along concatenation axis. 
+ """ + if self._is_series: + if self.axis == 0: + indexes = [x.index for x in self.objs] + elif self.ignore_index: + idx = Index(np.arange(len(self.objs))) + idx.is_unique = True # arange is always unique + return idx + elif self.keys is None: + names = [] + for x in self.objs: + if not isinstance(x, Series): + raise TypeError("Cannot concatenate type 'Series' " + "with object of type " + "%r" % type(x).__name__) + if x.name is not None: + names.append(x.name) + else: + idx = Index(np.arange(len(self.objs))) + idx.is_unique = True + return idx + + return Index(names) + else: + return _ensure_index(self.keys) + else: + indexes = [x._data.axes[self.axis] for x in self.objs] + + if self.ignore_index: + idx = Index(np.arange(sum(len(i) for i in indexes))) + idx.is_unique = True + return idx + + if self.keys is None: + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, self.keys, + self.levels, self.names) + + self._maybe_check_integrity(concat_axis) + + return concat_axis + + def _maybe_check_integrity(self, concat_index): + if self.verify_integrity: + if not concat_index.is_unique: + overlap = concat_index.get_duplicates() + raise ValueError('Indexes have overlapping values: %s' + % str(overlap)) + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + + +def _make_concat_multiindex(indexes, keys, levels=None, names=None): + if ((levels is None and isinstance(keys[0], tuple)) or + (levels is not None and len(levels) > 1)): + zipped = lzip(*keys) + if names is None: + names = [None] * len(zipped) + + if levels is None: + levels = [Categorical.from_array(zp).levels for zp in zipped] + else: + levels = [_ensure_index(x) for x in levels] + else: + zipped = [keys] + if names is None: + names = [None] + + if levels is None: + levels = [_ensure_index(keys)] + else: + levels = [_ensure_index(x) for x in levels] + + if not _all_indexes_same(indexes): + label_list = [] + + # things are potentially different sizes, so compute the exact labels + # for each level and pass those to MultiIndex.from_arrays + + for hlevel, level in zip(zipped, levels): + to_concat = [] + for key, index in zip(hlevel, indexes): + try: + i = level.get_loc(key) + except KeyError: + raise ValueError('Key %s not in level %s' + % (str(key), str(level))) + + to_concat.append(np.repeat(i, len(index))) + label_list.append(np.concatenate(to_concat)) + + concat_index = _concat_indexes(indexes) + + # these go at the end + if isinstance(concat_index, MultiIndex): + levels.extend(concat_index.levels) + label_list.extend(concat_index.labels) + else: + factor = Categorical.from_array(concat_index) + levels.append(factor.levels) + label_list.append(factor.labels) + + if len(names) == len(levels): + names = list(names) + else: + # make sure that all of the passed indices have the same nlevels + if not len(set([ i.nlevels for i in indexes ])) == 1: + raise AssertionError("Cannot concat indices that do" + " not have the same number of levels") + + # also copies + names = names + _get_consensus_names(indexes) + + return MultiIndex(levels=levels, labels=label_list, names=names, + verify_integrity=False) + + new_index = indexes[0] + n = len(new_index) + kpieces = len(indexes) + + # also copies + new_names = list(names) + new_levels = list(levels) + + # construct labels + new_labels = [] + + # do something a bit more speedy + + for hlevel, level in zip(zipped, levels): + hlevel = _ensure_index(hlevel) + mapped = level.get_indexer(hlevel) + + mask = mapped == -1 + if mask.any(): + 
raise ValueError('Values not found in passed level: %s' + % str(hlevel[mask])) + + new_labels.append(np.repeat(mapped, n)) + + if isinstance(new_index, MultiIndex): + new_levels.extend(new_index.levels) + new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) + else: + new_levels.append(new_index) + new_labels.append(np.tile(np.arange(n), kpieces)) + + if len(new_names) < len(new_levels): + new_names.extend(new_index.names) + + return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, + verify_integrity=False) + + +def _should_fill(lname, rname): + if not isinstance(lname, compat.string_types) or not isinstance(rname, compat.string_types): + return True + return lname == rname + + +def _any(x): + return x is not None and len(x) > 0 and any([y is not None for y in x]) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py new file mode 100644 index 00000000..9132fea0 --- /dev/null +++ b/pandas/tools/pivot.py @@ -0,0 +1,409 @@ +# pylint: disable=E1103 + +import warnings + +from pandas import Series, DataFrame +from pandas.core.index import MultiIndex +from pandas.core.groupby import Grouper +from pandas.tools.merge import concat +from pandas.tools.util import cartesian_product +from pandas.compat import range, lrange, zip +from pandas.util.decorators import deprecate_kwarg +from pandas import compat +import pandas.core.common as com +import numpy as np + +@deprecate_kwarg(old_arg_name='cols', new_arg_name='columns') +@deprecate_kwarg(old_arg_name='rows', new_arg_name='index') +def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', + fill_value=None, margins=False, dropna=True): + """ + Create a spreadsheet-style pivot table as a DataFrame. The levels in the + pivot table will be stored in MultiIndex objects (hierarchical indexes) on + the index and columns of the result DataFrame + + Parameters + ---------- + data : DataFrame + values : column to aggregate, optional + index : a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table index. + If an array is passed, it is being used as the same manner as column values. + columns : a column, Grouper, array which has the same length as data, or list of them. + Keys to group by on the pivot table column. + If an array is passed, it is being used as the same manner as column values. + aggfunc : function, default numpy.mean, or list of functions + If list of functions passed, the resulting pivot table will have + hierarchical columns whose top level are the function names (inferred + from the function objects themselves) + fill_value : scalar, default None + Value to replace missing values with + margins : boolean, default False + Add all row / columns (e.g. for subtotal / grand totals) + dropna : boolean, default True + Do not include columns whose entries are all NaN + rows : kwarg only alias of index [deprecated] + cols : kwarg only alias of columns [deprecated] + + Examples + -------- + >>> df + A B C D + 0 foo one small 1 + 1 foo one large 2 + 2 foo one large 2 + 3 foo two small 3 + 4 foo two small 3 + 5 bar one large 4 + 6 bar one small 5 + 7 bar two small 6 + 8 bar two large 7 + + >>> table = pivot_table(df, values='D', index=['A', 'B'], + ... 
columns=['C'], aggfunc=np.sum) + >>> table + small large + foo one 1 4 + two 6 NaN + bar one 5 4 + two 6 7 + + Returns + ------- + table : DataFrame + """ + index = _convert_by(index) + columns = _convert_by(columns) + + if isinstance(aggfunc, list): + pieces = [] + keys = [] + for func in aggfunc: + table = pivot_table(data, values=values, index=index, columns=columns, + fill_value=fill_value, aggfunc=func, + margins=margins) + pieces.append(table) + keys.append(func.__name__) + return concat(pieces, keys=keys, axis=1) + + keys = index + columns + + values_passed = values is not None + if values_passed: + if isinstance(values, (list, tuple)): + values_multi = True + else: + values_multi = False + values = [values] + else: + values = list(data.columns.drop(keys)) + + if values_passed: + to_filter = [] + for x in keys + values: + if isinstance(x, Grouper): + x = x.key + try: + if x in data: + to_filter.append(x) + except TypeError: + pass + if len(to_filter) < len(data.columns): + data = data[to_filter] + + grouped = data.groupby(keys) + agged = grouped.agg(aggfunc) + + table = agged + if table.index.nlevels > 1: + to_unstack = [agged.index.names[i] + for i in range(len(index), len(keys))] + table = agged.unstack(to_unstack) + + if not dropna: + try: + m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) + table = table.reindex_axis(m, axis=0) + except AttributeError: + pass # it's a single level + + try: + m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) + table = table.reindex_axis(m, axis=1) + except AttributeError: + pass # it's a single level or a series + + if isinstance(table, DataFrame): + if isinstance(table.columns, MultiIndex): + table = table.sortlevel(axis=1) + else: + table = table.sort_index(axis=1) + + if fill_value is not None: + table = table.fillna(value=fill_value, downcast='infer') + + if margins: + table = _add_margins(table, data, values, rows=index, + cols=columns, aggfunc=aggfunc) + + # discard the top level + if values_passed and not values_multi: + table = table[values[0]] + + if len(index) == 0 and len(columns) > 0: + table = table.T + + return table + + +DataFrame.pivot_table = pivot_table + + +def _add_margins(table, data, values, rows, cols, aggfunc): + + grand_margin = _compute_grand_margin(data, values, aggfunc) + + if not values and isinstance(table, Series): + # If there are no values and the table is a series, then there is only + # one column in the data. Compute grand margin and return it. 
+ row_key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All' + return table.append(Series({row_key: grand_margin['All']})) + + if values: + marginal_result_set = _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin) + if not isinstance(marginal_result_set, tuple): + return marginal_result_set + result, margin_keys, row_margin = marginal_result_set + else: + marginal_result_set = _generate_marginal_results_without_values(table, data, rows, cols, aggfunc) + if not isinstance(marginal_result_set, tuple): + return marginal_result_set + result, margin_keys, row_margin = marginal_result_set + + key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All' + + row_margin = row_margin.reindex(result.columns) + # populate grand margin + for k in margin_keys: + if isinstance(k, compat.string_types): + row_margin[k] = grand_margin[k] + else: + row_margin[k] = grand_margin[k[0]] + + margin_dummy = DataFrame(row_margin, columns=[key]).T + + row_names = result.index.names + result = result.append(margin_dummy) + result.index.names = row_names + + return result + + +def _compute_grand_margin(data, values, aggfunc): + + if values: + grand_margin = {} + for k, v in data[values].iteritems(): + try: + if isinstance(aggfunc, compat.string_types): + grand_margin[k] = getattr(v, aggfunc)() + else: + grand_margin[k] = aggfunc(v) + except TypeError: + pass + return grand_margin + else: + return {'All': aggfunc(data.index)} + + +def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin): + if len(cols) > 0: + # need to "interleave" the margins + table_pieces = [] + margin_keys = [] + + def _all_key(key): + return (key, 'All') + ('',) * (len(cols) - 1) + + if len(rows) > 0: + margin = data[rows + values].groupby(rows).agg(aggfunc) + cat_axis = 1 + for key, piece in table.groupby(level=0, axis=cat_axis): + all_key = _all_key(key) + piece[all_key] = margin[key] + table_pieces.append(piece) + margin_keys.append(all_key) + else: + margin = grand_margin + cat_axis = 0 + for key, piece in table.groupby(level=0, axis=cat_axis): + all_key = _all_key(key) + table_pieces.append(piece) + table_pieces.append(Series(margin[key], index=[all_key])) + margin_keys.append(all_key) + + result = concat(table_pieces, axis=cat_axis) + + if len(rows) == 0: + return result + else: + result = table + margin_keys = table.columns + + if len(cols) > 0: + row_margin = data[cols + values].groupby(cols).agg(aggfunc) + row_margin = row_margin.stack() + + # slight hack + new_order = [len(cols)] + lrange(len(cols)) + row_margin.index = row_margin.index.reorder_levels(new_order) + else: + row_margin = Series(np.nan, index=result.columns) + + return result, margin_keys, row_margin + + +def _generate_marginal_results_without_values(table, data, rows, cols, aggfunc): + if len(cols) > 0: + # need to "interleave" the margins + margin_keys = [] + + def _all_key(): + if len(cols) == 1: + return 'All' + return ('All', ) + ('', ) * (len(cols) - 1) + + if len(rows) > 0: + margin = data[rows].groupby(rows).apply(aggfunc) + all_key = _all_key() + table[all_key] = margin + result = table + margin_keys.append(all_key) + + else: + margin = data.groupby(level=0, axis=0).apply(aggfunc) + all_key = _all_key() + table[all_key] = margin + result = table + margin_keys.append(all_key) + return result + else: + result = table + margin_keys = table.columns + + if len(cols): + row_margin = data[cols].groupby(cols).apply(aggfunc) + else: + row_margin = Series(np.nan, index=result.columns) + + 
return result, margin_keys, row_margin + + +def _convert_by(by): + if by is None: + by = [] + elif (np.isscalar(by) or isinstance(by, (np.ndarray, Series, Grouper)) + or hasattr(by, '__call__')): + by = [by] + else: + by = list(by) + return by + +@deprecate_kwarg(old_arg_name='cols', new_arg_name='columns') +@deprecate_kwarg(old_arg_name='rows', new_arg_name='index') +def crosstab(index, columns, values=None, rownames=None, colnames=None, + aggfunc=None, margins=False, dropna=True): + """ + Compute a simple cross-tabulation of two (or more) factors. By default + computes a frequency table of the factors unless an array of values and an + aggregation function are passed + + Parameters + ---------- + index : array-like, Series, or list of arrays/Series + Values to group by in the rows + columns : array-like, Series, or list of arrays/Series + Values to group by in the columns + values : array-like, optional + Array of values to aggregate according to the factors + aggfunc : function, optional + If no values array is passed, computes a frequency table + rownames : sequence, default None + If passed, must match number of row arrays passed + colnames : sequence, default None + If passed, must match number of column arrays passed + margins : boolean, default False + Add row/column margins (subtotals) + dropna : boolean, default True + Do not include columns whose entries are all NaN + rows : kwarg only alias of index [deprecated] + cols : kwarg only alias of columns [deprecated] + + Notes + ----- + Any Series passed will have their name attributes used unless row or column + names for the cross-tabulation are specified + + Examples + -------- + >>> a + array([foo, foo, foo, foo, bar, bar, + bar, bar, foo, foo, foo], dtype=object) + >>> b + array([one, one, one, two, one, one, + one, two, two, two, one], dtype=object) + >>> c + array([dull, dull, shiny, dull, dull, shiny, + shiny, dull, shiny, shiny, shiny], dtype=object) + + >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + b one two + c dull shiny dull shiny + a + bar 1 2 1 0 + foo 2 2 1 2 + + Returns + ------- + crosstab : DataFrame + """ + + index = com._maybe_make_list(index) + columns = com._maybe_make_list(columns) + + rownames = _get_names(index, rownames, prefix='row') + colnames = _get_names(columns, colnames, prefix='col') + + data = {} + data.update(zip(rownames, index)) + data.update(zip(colnames, columns)) + + if values is None: + df = DataFrame(data) + df['__dummy__'] = 0 + table = df.pivot_table('__dummy__', index=rownames, columns=colnames, + aggfunc=len, margins=margins, dropna=dropna) + return table.fillna(0).astype(np.int64) + else: + data['__dummy__'] = values + df = DataFrame(data) + table = df.pivot_table('__dummy__', index=rownames, columns=colnames, + aggfunc=aggfunc, margins=margins, dropna=dropna) + return table + + +def _get_names(arrs, names, prefix='row'): + if names is None: + names = [] + for i, arr in enumerate(arrs): + if isinstance(arr, Series) and arr.name is not None: + names.append(arr.name) + else: + names.append('%s_%d' % (prefix, i)) + else: + if len(names) != len(arrs): + raise AssertionError('arrays and names must have the same length') + if not isinstance(names, list): + names = list(names) + + return names diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py new file mode 100644 index 00000000..d3ea809b --- /dev/null +++ b/pandas/tools/plotting.py @@ -0,0 +1,3075 @@ +# being a bit too dynamic +# pylint: disable=E1101 +import datetime +import warnings +import re +from 
collections import namedtuple +from contextlib import contextmanager +from distutils.version import LooseVersion + +import numpy as np + +from pandas.util.decorators import cache_readonly, deprecate_kwarg +import pandas.core.common as com +from pandas.core.generic import _shared_docs, _shared_doc_kwargs +from pandas.core.index import MultiIndex +from pandas.core.series import Series, remove_na +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex, Period +from pandas.tseries.frequencies import get_period_alias, get_base_alias +from pandas.tseries.offsets import DateOffset +from pandas.compat import range, lrange, lmap, map, zip, string_types +import pandas.compat as compat +from pandas.util.decorators import Appender + +try: # mpl optional + import pandas.tseries.converter as conv + conv.register() # needs to override so set_xlim works with str/number +except ImportError: + pass + +# Extracted from https://gist.github.com/huyng/816622 +# this is the rcParams set when setting display.with_mpl_style +# to True. +mpl_stylesheet = { + 'axes.axisbelow': True, + 'axes.color_cycle': ['#348ABD', + '#7A68A6', + '#A60628', + '#467821', + '#CF4457', + '#188487', + '#E24A33'], + 'axes.edgecolor': '#bcbcbc', + 'axes.facecolor': '#eeeeee', + 'axes.grid': True, + 'axes.labelcolor': '#555555', + 'axes.labelsize': 'large', + 'axes.linewidth': 1.0, + 'axes.titlesize': 'x-large', + 'figure.edgecolor': 'white', + 'figure.facecolor': 'white', + 'figure.figsize': (6.0, 4.0), + 'figure.subplot.hspace': 0.5, + 'font.family': 'monospace', + 'font.monospace': ['Andale Mono', + 'Nimbus Mono L', + 'Courier New', + 'Courier', + 'Fixed', + 'Terminal', + 'monospace'], + 'font.size': 10, + 'interactive': True, + 'keymap.all_axes': ['a'], + 'keymap.back': ['left', 'c', 'backspace'], + 'keymap.forward': ['right', 'v'], + 'keymap.fullscreen': ['f'], + 'keymap.grid': ['g'], + 'keymap.home': ['h', 'r', 'home'], + 'keymap.pan': ['p'], + 'keymap.save': ['s'], + 'keymap.xscale': ['L', 'k'], + 'keymap.yscale': ['l'], + 'keymap.zoom': ['o'], + 'legend.fancybox': True, + 'lines.antialiased': True, + 'lines.linewidth': 1.0, + 'patch.antialiased': True, + 'patch.edgecolor': '#EEEEEE', + 'patch.facecolor': '#348ABD', + 'patch.linewidth': 0.5, + 'toolbar': 'toolbar2', + 'xtick.color': '#555555', + 'xtick.direction': 'in', + 'xtick.major.pad': 6.0, + 'xtick.major.size': 0.0, + 'xtick.minor.pad': 6.0, + 'xtick.minor.size': 0.0, + 'ytick.color': '#555555', + 'ytick.direction': 'in', + 'ytick.major.pad': 6.0, + 'ytick.major.size': 0.0, + 'ytick.minor.pad': 6.0, + 'ytick.minor.size': 0.0 +} + +def _get_standard_kind(kind): + return {'density': 'kde'}.get(kind, kind) + +def _get_standard_colors(num_colors=None, colormap=None, color_type='default', + color=None): + import matplotlib.pyplot as plt + + if color is None and colormap is not None: + if isinstance(colormap, compat.string_types): + import matplotlib.cm as cm + cmap = colormap + colormap = cm.get_cmap(colormap) + if colormap is None: + raise ValueError("Colormap {0} is not recognized".format(cmap)) + colors = lmap(colormap, np.linspace(0, 1, num=num_colors)) + elif color is not None: + if colormap is not None: + warnings.warn("'color' and 'colormap' cannot be used " + "simultaneously. 
Using 'color'") + colors = color + else: + if color_type == 'default': + colors = plt.rcParams.get('axes.color_cycle', list('bgrcmyk')) + if isinstance(colors, compat.string_types): + colors = list(colors) + elif color_type == 'random': + import random + def random_color(column): + random.seed(column) + return [random.random() for _ in range(3)] + + colors = lmap(random_color, lrange(num_colors)) + else: + raise NotImplementedError + + if len(colors) != num_colors: + multiple = num_colors//len(colors) - 1 + mod = num_colors % len(colors) + + colors += multiple * colors + colors += colors[:mod] + + return colors + +class _Options(dict): + """ + Stores pandas plotting options. + Allows for parameter aliasing so you can just use parameter names that are + the same as the plot function parameters, but is stored in a canonical + format that makes it easy to breakdown into groups later + """ + + # alias so the names are same as plotting method parameter names + _ALIASES = {'x_compat': 'xaxis.compat'} + _DEFAULT_KEYS = ['xaxis.compat'] + + def __init__(self): + self['xaxis.compat'] = False + + def __getitem__(self, key): + key = self._get_canonical_key(key) + if key not in self: + raise ValueError('%s is not a valid pandas plotting option' % key) + return super(_Options, self).__getitem__(key) + + def __setitem__(self, key, value): + key = self._get_canonical_key(key) + return super(_Options, self).__setitem__(key, value) + + def __delitem__(self, key): + key = self._get_canonical_key(key) + if key in self._DEFAULT_KEYS: + raise ValueError('Cannot remove default parameter %s' % key) + return super(_Options, self).__delitem__(key) + + def __contains__(self, key): + key = self._get_canonical_key(key) + return super(_Options, self).__contains__(key) + + def reset(self): + """ + Reset the option store to its initial state + + Returns + ------- + None + """ + self.__init__() + + def _get_canonical_key(self, key): + return self._ALIASES.get(key, key) + + @contextmanager + def use(self, key, value): + """ + Temporarily set a parameter value using the with statement. + Aliasing allowed. + """ + old_value = self[key] + try: + self[key] = value + yield self + finally: + self[key] = old_value + + +plot_params = _Options() + + +def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, + diagonal='hist', marker='.', density_kwds=None, + hist_kwds=None, range_padding=0.05, **kwds): + """ + Draw a matrix of scatter plots. + + Parameters + ---------- + frame : DataFrame + alpha : float, optional + amount of transparency applied + figsize : (float,float), optional + a tuple (width, height) in inches + ax : Matplotlib axis object, optional + grid : bool, optional + setting this to True will show the grid + diagonal : {'hist', 'kde'} + pick between 'kde' and 'hist' for + either Kernel Density Estimation or Histogram + plot in the diagonal + marker : str, optional + Matplotlib marker type, default '.' 
+ hist_kwds : other plotting keyword arguments + To be passed to hist function + density_kwds : other plotting keyword arguments + To be passed to kernel density estimate plot + range_padding : float, optional + relative extension of axis range in x and y + with respect to (x_max - x_min) or (y_max - y_min), + default 0.05 + kwds : other plotting keyword arguments + To be passed to scatter function + + Examples + -------- + >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) + >>> scatter_matrix(df, alpha=0.2) + """ + import matplotlib.pyplot as plt + from matplotlib.artist import setp + + df = frame._get_numeric_data() + n = df.columns.size + fig, axes = _subplots(nrows=n, ncols=n, figsize=figsize, ax=ax, + squeeze=False) + + # no gaps between subplots + fig.subplots_adjust(wspace=0, hspace=0) + + mask = com.notnull(df) + + marker = _get_marker_compat(marker) + + hist_kwds = hist_kwds or {} + density_kwds = density_kwds or {} + + # workaround because `c='b'` is hardcoded in matplotlibs scatter method + kwds.setdefault('c', plt.rcParams['patch.facecolor']) + + boundaries_list = [] + for a in df.columns: + values = df[a].values[mask[a].values] + rmin_, rmax_ = np.min(values), np.max(values) + rdelta_ext = (rmax_ - rmin_) * range_padding / 2. + boundaries_list.append((rmin_ - rdelta_ext, rmax_+ rdelta_ext)) + + for i, a in zip(lrange(n), df.columns): + for j, b in zip(lrange(n), df.columns): + ax = axes[i, j] + + if i == j: + values = df[a].values[mask[a].values] + + # Deal with the diagonal by drawing a histogram there. + if diagonal == 'hist': + ax.hist(values, **hist_kwds) + + elif diagonal in ('kde', 'density'): + from scipy.stats import gaussian_kde + y = values + gkde = gaussian_kde(y) + ind = np.linspace(y.min(), y.max(), 1000) + ax.plot(ind, gkde.evaluate(ind), **density_kwds) + + ax.set_xlim(boundaries_list[i]) + + else: + common = (mask[a] & mask[b]).values + + ax.scatter(df[b][common], df[a][common], + marker=marker, alpha=alpha, **kwds) + + ax.set_xlim(boundaries_list[j]) + ax.set_ylim(boundaries_list[i]) + + ax.set_xlabel('') + ax.set_ylabel('') + + _label_axis(ax, kind='x', label=b, position='bottom', rotate=True) + + _label_axis(ax, kind='y', label=a, position='left') + + if j!= 0: + ax.yaxis.set_visible(False) + if i != n-1: + ax.xaxis.set_visible(False) + + for ax in axes.flat: + setp(ax.get_xticklabels(), fontsize=8) + setp(ax.get_yticklabels(), fontsize=8) + + return axes + +def _label_axis(ax, kind='x', label='', position='top', + ticks=True, rotate=False): + + from matplotlib.artist import setp + if kind == 'x': + ax.set_xlabel(label, visible=True) + ax.xaxis.set_visible(True) + ax.xaxis.set_ticks_position(position) + ax.xaxis.set_label_position(position) + if rotate: + setp(ax.get_xticklabels(), rotation=90) + elif kind == 'y': + ax.yaxis.set_visible(True) + ax.set_ylabel(label, visible=True) + # ax.set_ylabel(a) + ax.yaxis.set_ticks_position(position) + ax.yaxis.set_label_position(position) + return + + + + + +def _gca(): + import matplotlib.pyplot as plt + return plt.gca() + + +def _gcf(): + import matplotlib.pyplot as plt + return plt.gcf() + +def _get_marker_compat(marker): + import matplotlib.lines as mlines + import matplotlib as mpl + if mpl.__version__ < '1.1.0' and marker == '.': + return 'o' + if marker not in mlines.lineMarkers: + return 'o' + return marker + +def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): + """RadViz - a multivariate data visualization algorithm + + Parameters: + ----------- + frame: 
DataFrame + class_column: str + Column name containing class names + ax: Matplotlib axis object, optional + color: list or tuple, optional + Colors to use for the different classes + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that name + from matplotlib. + kwds: keywords + Options to pass to matplotlib scatter plotting method + + Returns: + -------- + ax: Matplotlib axis object + """ + import matplotlib.pyplot as plt + import matplotlib.patches as patches + + def normalize(series): + a = min(series) + b = max(series) + return (series - a) / (b - a) + + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] + df = frame.drop(class_column, axis=1).apply(normalize) + + if ax is None: + ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) + + to_plot = {} + colors = _get_standard_colors(num_colors=len(classes), colormap=colormap, + color_type='random', color=color) + + for kls in classes: + to_plot[kls] = [[], []] + + n = len(frame.columns) - 1 + s = np.array([(np.cos(t), np.sin(t)) + for t in [2.0 * np.pi * (i / float(n)) + for i in range(n)]]) + + for i in range(n): + row = df.iloc[i].values + row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1) + y = (s * row_).sum(axis=0) / row.sum() + kls = class_col.iat[i] + to_plot[kls][0].append(y[0]) + to_plot[kls][1].append(y[1]) + + for i, kls in enumerate(classes): + ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i], + label=com.pprint_thing(kls), **kwds) + ax.legend() + + ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none')) + + for xy, name in zip(s, df.columns): + + ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray')) + + if xy[0] < 0.0 and xy[1] < 0.0: + ax.text(xy[0] - 0.025, xy[1] - 0.025, name, + ha='right', va='top', size='small') + elif xy[0] < 0.0 and xy[1] >= 0.0: + ax.text(xy[0] - 0.025, xy[1] + 0.025, name, + ha='right', va='bottom', size='small') + elif xy[0] >= 0.0 and xy[1] < 0.0: + ax.text(xy[0] + 0.025, xy[1] - 0.025, name, + ha='left', va='top', size='small') + elif xy[0] >= 0.0 and xy[1] >= 0.0: + ax.text(xy[0] + 0.025, xy[1] + 0.025, name, + ha='left', va='bottom', size='small') + + ax.axis('equal') + return ax + +@deprecate_kwarg(old_arg_name='data', new_arg_name='frame') +def andrews_curves(frame, class_column, ax=None, samples=200, color=None, + colormap=None, **kwds): + """ + Parameters: + ----------- + frame : DataFrame + Data to be plotted, preferably normalized to (0.0, 1.0) + class_column : Name of the column containing class names + ax : matplotlib axes object, default None + samples : Number of points to plot in each curve + color: list or tuple, optional + Colors to use for the different classes + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that name + from matplotlib. 
+ kwds: keywords + Options to pass to matplotlib plotting method + + Returns: + -------- + ax: Matplotlib axis object + + """ + from math import sqrt, pi, sin, cos + import matplotlib.pyplot as plt + + def function(amplitudes): + def f(x): + x1 = amplitudes[0] + result = x1 / sqrt(2.0) + harmonic = 1.0 + for x_even, x_odd in zip(amplitudes[1::2], amplitudes[2::2]): + result += (x_even * sin(harmonic * x) + + x_odd * cos(harmonic * x)) + harmonic += 1.0 + if len(amplitudes) % 2 != 0: + result += amplitudes[-1] * sin(harmonic * x) + return result + return f + + n = len(frame) + class_col = frame[class_column] + classes = frame[class_column].drop_duplicates() + df = frame.drop(class_column, axis=1) + x = [-pi + 2.0 * pi * (t / float(samples)) for t in range(samples)] + used_legends = set([]) + + color_values = _get_standard_colors(num_colors=len(classes), + colormap=colormap, color_type='random', + color=color) + colors = dict(zip(classes, color_values)) + if ax is None: + ax = plt.gca(xlim=(-pi, pi)) + for i in range(n): + row = df.iloc[i].values + f = function(row) + y = [f(t) for t in x] + kls = class_col.iat[i] + label = com.pprint_thing(kls) + if label not in used_legends: + used_legends.add(label) + ax.plot(x, y, color=colors[kls], label=label, **kwds) + else: + ax.plot(x, y, color=colors[kls], **kwds) + + ax.legend(loc='upper right') + ax.grid() + return ax + + +def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): + """Bootstrap plot. + + Parameters: + ----------- + series: Time series + fig: matplotlib figure object, optional + size: number of data points to consider during each sampling + samples: number of times the bootstrap procedure is performed + kwds: optional keyword arguments for plotting commands, must be accepted + by both hist and plot + + Returns: + -------- + fig: matplotlib figure + """ + import random + import matplotlib.pyplot as plt + + # random.sample(ndarray, int) fails on python 3.3, sigh + data = list(series.values) + samplings = [random.sample(data, size) for _ in range(samples)] + + means = np.array([np.mean(sampling) for sampling in samplings]) + medians = np.array([np.median(sampling) for sampling in samplings]) + midranges = np.array([(min(sampling) + max(sampling)) * 0.5 + for sampling in samplings]) + if fig is None: + fig = plt.figure() + x = lrange(samples) + axes = [] + ax1 = fig.add_subplot(2, 3, 1) + ax1.set_xlabel("Sample") + axes.append(ax1) + ax1.plot(x, means, **kwds) + ax2 = fig.add_subplot(2, 3, 2) + ax2.set_xlabel("Sample") + axes.append(ax2) + ax2.plot(x, medians, **kwds) + ax3 = fig.add_subplot(2, 3, 3) + ax3.set_xlabel("Sample") + axes.append(ax3) + ax3.plot(x, midranges, **kwds) + ax4 = fig.add_subplot(2, 3, 4) + ax4.set_xlabel("Mean") + axes.append(ax4) + ax4.hist(means, **kwds) + ax5 = fig.add_subplot(2, 3, 5) + ax5.set_xlabel("Median") + axes.append(ax5) + ax5.hist(medians, **kwds) + ax6 = fig.add_subplot(2, 3, 6) + ax6.set_xlabel("Midrange") + axes.append(ax6) + ax6.hist(midranges, **kwds) + for axis in axes: + plt.setp(axis.get_xticklabels(), fontsize=8) + plt.setp(axis.get_yticklabels(), fontsize=8) + return fig + +@deprecate_kwarg(old_arg_name='colors', new_arg_name='color') +@deprecate_kwarg(old_arg_name='data', new_arg_name='frame') +def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, + use_columns=False, xticks=None, colormap=None, + **kwds): + """Parallel coordinates plotting. 
+ + Parameters + ---------- + frame: DataFrame + class_column: str + Column name containing class names + cols: list, optional + A list of column names to use + ax: matplotlib.axis, optional + matplotlib axis object + color: list or tuple, optional + Colors to use for the different classes + use_columns: bool, optional + If true, columns will be used as xticks + xticks: list or tuple, optional + A list of values to use for xticks + colormap: str or matplotlib colormap, default None + Colormap to use for line colors. + kwds: keywords + Options to pass to matplotlib plotting method + + Returns + ------- + ax: matplotlib axis object + + Examples + -------- + >>> from pandas import read_csv + >>> from pandas.tools.plotting import parallel_coordinates + >>> from matplotlib import pyplot as plt + >>> df = read_csv('https://raw.github.com/pydata/pandas/master/pandas/tests/data/iris.csv') + >>> parallel_coordinates(df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')) + >>> plt.show() + """ + import matplotlib.pyplot as plt + + n = len(frame) + classes = frame[class_column].drop_duplicates() + class_col = frame[class_column] + + if cols is None: + df = frame.drop(class_column, axis=1) + else: + df = frame[cols] + + used_legends = set([]) + + ncols = len(df.columns) + + # determine values to use for xticks + if use_columns is True: + if not np.all(np.isreal(list(df.columns))): + raise ValueError('Columns must be numeric to be used as xticks') + x = df.columns + elif xticks is not None: + if not np.all(np.isreal(xticks)): + raise ValueError('xticks specified must be numeric') + elif len(xticks) != ncols: + raise ValueError('Length of xticks must match number of columns') + x = xticks + else: + x = lrange(ncols) + + if ax is None: + ax = plt.gca() + + color_values = _get_standard_colors(num_colors=len(classes), + colormap=colormap, color_type='random', + color=color) + + colors = dict(zip(classes, color_values)) + + for i in range(n): + y = df.iloc[i].values + kls = class_col.iat[i] + label = com.pprint_thing(kls) + if label not in used_legends: + used_legends.add(label) + ax.plot(x, y, color=colors[kls], label=label, **kwds) + else: + ax.plot(x, y, color=colors[kls], **kwds) + + for i in x: + ax.axvline(i, linewidth=1, color='black') + + ax.set_xticks(x) + ax.set_xticklabels(df.columns) + ax.set_xlim(x[0], x[-1]) + ax.legend(loc='upper right') + ax.grid() + return ax + + +def lag_plot(series, lag=1, ax=None, **kwds): + """Lag plot for time series. + + Parameters: + ----------- + series: Time series + lag: lag of the scatter plot, default 1 + ax: Matplotlib axis object, optional + kwds: Matplotlib scatter method keyword arguments, optional + + Returns: + -------- + ax: Matplotlib axis object + """ + import matplotlib.pyplot as plt + + # workaround because `c='b'` is hardcoded in matplotlibs scatter method + kwds.setdefault('c', plt.rcParams['patch.facecolor']) + + data = series.values + y1 = data[:-lag] + y2 = data[lag:] + if ax is None: + ax = plt.gca() + ax.set_xlabel("y(t)") + ax.set_ylabel("y(t + %s)" % lag) + ax.scatter(y1, y2, **kwds) + return ax + + +def autocorrelation_plot(series, ax=None, **kwds): + """Autocorrelation plot for time series. 
+ + Parameters: + ----------- + series: Time series + ax: Matplotlib axis object, optional + kwds : keywords + Options to pass to matplotlib plotting method + + Returns: + ----------- + ax: Matplotlib axis object + """ + import matplotlib.pyplot as plt + n = len(series) + data = np.asarray(series) + if ax is None: + ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0)) + mean = np.mean(data) + c0 = np.sum((data - mean) ** 2) / float(n) + + def r(h): + return ((data[:n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 + x = np.arange(n) + 1 + y = lmap(r, x) + z95 = 1.959963984540054 + z99 = 2.5758293035489004 + ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey') + ax.axhline(y=z95 / np.sqrt(n), color='grey') + ax.axhline(y=0.0, color='black') + ax.axhline(y=-z95 / np.sqrt(n), color='grey') + ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey') + ax.set_xlabel("Lag") + ax.set_ylabel("Autocorrelation") + ax.plot(x, y, **kwds) + if 'label' in kwds: + ax.legend() + ax.grid() + return ax + + +class MPLPlot(object): + """ + Base class for assembling a pandas plot using matplotlib + + Parameters + ---------- + data : + + """ + _default_rot = 0 + + _pop_attributes = ['label', 'style', 'logy', 'logx', 'loglog', + 'mark_right'] + _attr_defaults = {'logy': False, 'logx': False, 'loglog': False, + 'mark_right': True} + + def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, + sharey=False, use_index=True, + figsize=None, grid=None, legend=True, rot=None, + ax=None, fig=None, title=None, xlim=None, ylim=None, + xticks=None, yticks=None, + sort_columns=False, fontsize=None, + secondary_y=False, colormap=None, + table=False, **kwds): + + self.data = data + self.by = by + + self.kind = kind + + self.sort_columns = sort_columns + + self.subplots = subplots + self.sharex = sharex + self.sharey = sharey + self.figsize = figsize + + self.xticks = xticks + self.yticks = yticks + self.xlim = xlim + self.ylim = ylim + self.title = title + self.use_index = use_index + + self.fontsize = fontsize + self.rot = rot + + if grid is None: + grid = False if secondary_y else True + + self.grid = grid + self.legend = legend + self.legend_handles = [] + self.legend_labels = [] + + for attr in self._pop_attributes: + value = kwds.pop(attr, self._attr_defaults.get(attr, None)) + setattr(self, attr, value) + + self.ax = ax + self.fig = fig + self.axes = None + + # parse errorbar input if given + xerr = kwds.pop('xerr', None) + yerr = kwds.pop('yerr', None) + self.errors = {} + for kw, err in zip(['xerr', 'yerr'], [xerr, yerr]): + self.errors[kw] = self._parse_errorbars(kw, err) + + if not isinstance(secondary_y, (bool, tuple, list, np.ndarray)): + secondary_y = [secondary_y] + self.secondary_y = secondary_y + + # ugly TypeError if user passes matplotlib's `cmap` name. + # Probably better to accept either. + if 'cmap' in kwds and colormap: + raise TypeError("Only specify one of `cmap` and `colormap`.") + elif 'cmap' in kwds: + self.colormap = kwds.pop('cmap') + else: + self.colormap = colormap + + self.table = table + + self.kwds = kwds + + self._validate_color_args() + + def _validate_color_args(self): + from pandas import DataFrame + if 'color' not in self.kwds and 'colors' in self.kwds: + warnings.warn(("'colors' is being deprecated. 
Please use 'color'" + "instead of 'colors'")) + colors = self.kwds.pop('colors') + self.kwds['color'] = colors + + if ('color' in self.kwds and + (isinstance(self.data, Series) or + isinstance(self.data, DataFrame) and len(self.data.columns) == 1)): + # support series.plot(color='green') + self.kwds['color'] = [self.kwds['color']] + + if ('color' in self.kwds or 'colors' in self.kwds) and \ + self.colormap is not None: + warnings.warn("'color' and 'colormap' cannot be used " + "simultaneously. Using 'color'") + + if 'color' in self.kwds and self.style is not None: + # need only a single match + if re.match('^[a-z]+?', self.style) is not None: + raise ValueError("Cannot pass 'style' string with a color " + "symbol and 'color' keyword argument. Please" + " use one or the other or pass 'style' " + "without a color symbol") + + def _iter_data(self, data=None, keep_index=False): + if data is None: + data = self.data + + from pandas.core.frame import DataFrame + if isinstance(data, (Series, np.ndarray)): + if keep_index is True: + yield self.label, data + else: + yield self.label, np.asarray(data) + elif isinstance(data, DataFrame): + if self.sort_columns: + columns = com._try_sort(data.columns) + else: + columns = data.columns + + for col in columns: + # # is this right? + # empty = df[col].count() == 0 + # values = df[col].values if not empty else np.zeros(len(df)) + + if keep_index is True: + yield col, data[col] + else: + yield col, data[col].values + + @property + def nseries(self): + if self.data.ndim == 1: + return 1 + else: + return self.data.shape[1] + + def draw(self): + self.plt.draw_if_interactive() + + def generate(self): + self._args_adjust() + self._compute_plot_data() + self._setup_subplots() + self._make_plot() + self._add_table() + self._make_legend() + self._post_plot_logic() + self._adorn_subplots() + + def _args_adjust(self): + pass + + def _maybe_right_yaxis(self, ax): + if hasattr(ax, 'right_ax'): + return ax.right_ax + else: + orig_ax, new_ax = ax, ax.twinx() + new_ax._get_lines.color_cycle = orig_ax._get_lines.color_cycle + + orig_ax.right_ax, new_ax.left_ax = new_ax, orig_ax + new_ax.right_ax = new_ax + + if len(orig_ax.get_lines()) == 0: # no data on left y + orig_ax.get_yaxis().set_visible(False) + return new_ax + + def _setup_subplots(self): + if self.subplots: + nrows, ncols = self._get_layout() + fig, axes = _subplots(nrows=nrows, ncols=ncols, + sharex=self.sharex, sharey=self.sharey, + figsize=self.figsize, ax=self.ax) + if not com.is_list_like(axes): + axes = np.array([axes]) + else: + if self.ax is None: + fig = self.plt.figure(figsize=self.figsize) + ax = fig.add_subplot(111) + else: + fig = self.ax.get_figure() + if self.figsize is not None: + fig.set_size_inches(self.figsize) + ax = self.ax + axes = [ax] + + if self.logx or self.loglog: + [a.set_xscale('log') for a in axes] + if self.logy or self.loglog: + [a.set_yscale('log') for a in axes] + + self.fig = fig + self.axes = axes + + def _get_layout(self): + from pandas.core.frame import DataFrame + if isinstance(self.data, DataFrame): + return (len(self.data.columns), 1) + else: + return (1, 1) + + def _compute_plot_data(self): + numeric_data = self.data.convert_objects()._get_numeric_data() + + try: + is_empty = numeric_data.empty + except AttributeError: + is_empty = not len(numeric_data) + + # no empty frames or series allowed + if is_empty: + raise TypeError('Empty {0!r}: no numeric data to ' + 'plot'.format(numeric_data.__class__.__name__)) + + self.data = numeric_data + + def _make_plot(self): + raise 
NotImplementedError + + def _add_table(self): + if self.table is False: + return + elif self.table is True: + from pandas.core.frame import DataFrame + if isinstance(self.data, Series): + data = DataFrame(self.data, columns=[self.data.name]) + elif isinstance(self.data, DataFrame): + data = self.data + data = data.transpose() + else: + data = self.table + ax = self._get_ax(0) + table(ax, data) + + def _post_plot_logic(self): + pass + + def _adorn_subplots(self): + to_adorn = self.axes + + # todo: sharex, sharey handling? + + for ax in to_adorn: + if self.yticks is not None: + ax.set_yticks(self.yticks) + + if self.xticks is not None: + ax.set_xticks(self.xticks) + + if self.ylim is not None: + ax.set_ylim(self.ylim) + + if self.xlim is not None: + ax.set_xlim(self.xlim) + + ax.grid(self.grid) + + if self.title: + if self.subplots: + self.fig.suptitle(self.title) + else: + self.axes[0].set_title(self.title) + + if self._need_to_set_index: + labels = [com.pprint_thing(key) for key in self.data.index] + labels = dict(zip(range(len(self.data.index)), labels)) + + for ax_ in self.axes: + # ax_.set_xticks(self.xticks) + xticklabels = [labels.get(x, '') for x in ax_.get_xticks()] + ax_.set_xticklabels(xticklabels, rotation=self.rot) + + @property + def legend_title(self): + if hasattr(self.data, 'columns'): + if not isinstance(self.data.columns, MultiIndex): + name = self.data.columns.name + if name is not None: + name = com.pprint_thing(name) + return name + else: + stringified = map(com.pprint_thing, + self.data.columns.names) + return ','.join(stringified) + else: + return None + + def _add_legend_handle(self, handle, label, index=None): + if not label is None: + if self.mark_right and index is not None: + if self.on_right(index): + label = label + ' (right)' + self.legend_handles.append(handle) + self.legend_labels.append(label) + + def _make_legend(self): + ax, leg = self._get_ax_legend(self.axes[0]) + + handles = [] + labels = [] + title = '' + + if not self.subplots: + if not leg is None: + title = leg.get_title().get_text() + handles = leg.legendHandles + labels = [x.get_text() for x in leg.get_texts()] + + if self.legend: + if self.legend == 'reverse': + self.legend_handles = reversed(self.legend_handles) + self.legend_labels = reversed(self.legend_labels) + + handles += self.legend_handles + labels += self.legend_labels + if not self.legend_title is None: + title = self.legend_title + + if len(handles) > 0: + ax.legend(handles, labels, loc='best', title=title) + + elif self.subplots and self.legend: + for ax in self.axes: + ax.legend(loc='best') + + + def _get_ax_legend(self, ax): + leg = ax.get_legend() + other_ax = (getattr(ax, 'right_ax', None) or + getattr(ax, 'left_ax', None)) + other_leg = None + if other_ax is not None: + other_leg = other_ax.get_legend() + if leg is None and other_leg is not None: + leg = other_leg + ax = other_ax + return ax, leg + + @cache_readonly + def plt(self): + import matplotlib.pyplot as plt + return plt + + _need_to_set_index = False + + def _get_xticks(self, convert_period=False): + index = self.data.index + is_datetype = index.inferred_type in ('datetime', 'date', + 'datetime64', 'time') + + if self.use_index: + if convert_period and isinstance(index, PeriodIndex): + self.data = self.data.reindex(index=index.order()) + x = self.data.index.to_timestamp()._mpl_repr() + elif index.is_numeric(): + """ + Matplotlib supports numeric values or datetime objects as + xaxis values. 
Taking LBYL approach here, by the time + matplotlib raises exception when using non numeric/datetime + values for xaxis, several actions are already taken by plt. + """ + x = index._mpl_repr() + elif is_datetype: + self.data = self.data.sort_index() + x = self.data.index._mpl_repr() + else: + self._need_to_set_index = True + x = lrange(len(index)) + else: + x = lrange(len(index)) + + return x + + def _is_datetype(self): + index = self.data.index + return (isinstance(index, (PeriodIndex, DatetimeIndex)) or + index.inferred_type in ('datetime', 'date', 'datetime64', + 'time')) + + def _get_plot_function(self): + ''' + Returns the matplotlib plotting function (plot or errorbar) based on + the presence of errorbar keywords. + ''' + + if all(e is None for e in self.errors.values()): + plotf = self.plt.Axes.plot + else: + plotf = self.plt.Axes.errorbar + + return plotf + + def _get_index_name(self): + if isinstance(self.data.index, MultiIndex): + name = self.data.index.names + if any(x is not None for x in name): + name = ','.join([com.pprint_thing(x) for x in name]) + else: + name = None + else: + name = self.data.index.name + if name is not None: + name = com.pprint_thing(name) + + return name + + def _get_ax(self, i): + # get the twinx ax if appropriate + if self.subplots: + ax = self.axes[i] + + if self.on_right(i): + ax = self._maybe_right_yaxis(ax) + self.axes[i] = ax + else: + ax = self.axes[0] + + if self.on_right(i): + ax = self._maybe_right_yaxis(ax) + + sec_true = isinstance(self.secondary_y, bool) and self.secondary_y + all_sec = (com.is_list_like(self.secondary_y) and + len(self.secondary_y) == self.nseries) + if sec_true or all_sec: + self.axes[0] = ax + + ax.get_yaxis().set_visible(True) + return ax + + def on_right(self, i): + from pandas.core.frame import DataFrame + if isinstance(self.secondary_y, bool): + return self.secondary_y + + if (isinstance(self.data, DataFrame) and + isinstance(self.secondary_y, (tuple, list, np.ndarray))): + return self.data.columns[i] in self.secondary_y + + def _get_style(self, i, col_name): + style = '' + if self.subplots: + style = 'k' + + if self.style is not None: + if isinstance(self.style, list): + try: + style = self.style[i] + except IndexError: + pass + elif isinstance(self.style, dict): + style = self.style.get(col_name, style) + else: + style = self.style + + return style or None + + def _get_colors(self, num_colors=None, color_kwds='color'): + from pandas.core.frame import DataFrame + if num_colors is None: + if isinstance(self.data, DataFrame): + num_colors = len(self.data.columns) + else: + num_colors = 1 + + return _get_standard_colors(num_colors=num_colors, + colormap=self.colormap, + color=self.kwds.get(color_kwds)) + + def _maybe_add_color(self, colors, kwds, style, i): + has_color = 'color' in kwds or self.colormap is not None + if has_color and (style is None or re.match('[a-z]+', style) is None): + kwds['color'] = colors[i % len(colors)] + + def _parse_errorbars(self, label, err): + ''' + Look for error keyword arguments and return the actual errorbar data + or return the error DataFrame/dict + + Error bars can be specified in several ways: + Series: the user provides a pandas.Series object of the same + length as the data + ndarray: provides a np.ndarray of the same length as the data + DataFrame/dict: error values are paired with keys matching the + key in the plotted DataFrame + str: the name of the column within the plotted DataFrame + ''' + + if err is None: + return None + + from pandas import DataFrame, Series + + def 
match_labels(data, e): + e = e.reindex_axis(data.index) + return e + + # key-matched DataFrame + if isinstance(err, DataFrame): + + err = match_labels(self.data, err) + # key-matched dict + elif isinstance(err, dict): + pass + + # Series of error values + elif isinstance(err, Series): + # broadcast error series across data + err = match_labels(self.data, err) + err = np.atleast_2d(err) + err = np.tile(err, (self.nseries, 1)) + + # errors are a column in the dataframe + elif isinstance(err, string_types): + evalues = self.data[err].values + self.data = self.data[self.data.columns.drop(err)] + err = np.atleast_2d(evalues) + err = np.tile(err, (self.nseries, 1)) + + elif com.is_list_like(err): + if com.is_iterator(err): + err = np.atleast_2d(list(err)) + else: + # raw error values + err = np.atleast_2d(err) + + err_shape = err.shape + + # asymmetrical error bars + if err.ndim == 3: + if (err_shape[0] != self.nseries) or \ + (err_shape[1] != 2) or \ + (err_shape[2] != len(self.data)): + msg = "Asymmetrical error bars should be provided " + \ + "with the shape (%u, 2, %u)" % \ + (self.nseries, len(self.data)) + raise ValueError(msg) + + # broadcast errors to each data series + if len(err) == 1: + err = np.tile(err, (self.nseries, 1)) + + elif com.is_number(err): + err = np.tile([err], (self.nseries, len(self.data))) + + else: + msg = "No valid %s detected" % label + raise ValueError(msg) + + return err + + def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): + from pandas import DataFrame + errors = {} + + for kw, flag in zip(['xerr', 'yerr'], [xerr, yerr]): + if flag: + err = self.errors[kw] + # user provided label-matched dataframe of errors + if isinstance(err, (DataFrame, dict)): + if label is not None and label in err.keys(): + err = err[label] + else: + err = None + elif index is not None and err is not None: + err = err[index] + + if err is not None: + errors[kw] = err + return errors + + +class KdePlot(MPLPlot): + def __init__(self, data, bw_method=None, ind=None, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + self.bw_method=bw_method + self.ind=ind + + def _make_plot(self): + from scipy.stats import gaussian_kde + from scipy import __version__ as spv + from distutils.version import LooseVersion + plotf = self.plt.Axes.plot + colors = self._get_colors() + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + style = self._get_style(i, label) + + label = com.pprint_thing(label) + + if LooseVersion(spv) >= '0.11.0': + gkde = gaussian_kde(y, bw_method=self.bw_method) + else: + gkde = gaussian_kde(y) + if self.bw_method is not None: + msg = ('bw_method was added in Scipy 0.11.0.' + + ' Scipy version in use is %s.' 
% spv) + warnings.warn(msg) + + sample_range = max(y) - min(y) + + if self.ind is None: + ind = np.linspace(min(y) - 0.5 * sample_range, + max(y) + 0.5 * sample_range, 1000) + else: + ind = self.ind + + ax.set_ylabel("Density") + + y = gkde.evaluate(ind) + kwds = self.kwds.copy() + kwds['label'] = label + self._maybe_add_color(colors, kwds, style, i) + if style is None: + args = (ax, ind, y) + else: + args = (ax, ind, y, style) + + newlines = plotf(*args, **kwds) + self._add_legend_handle(newlines[0], label) + + +class ScatterPlot(MPLPlot): + def __init__(self, data, x, y, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + self.kwds.setdefault('c', self.plt.rcParams['patch.facecolor']) + if x is None or y is None: + raise ValueError( 'scatter requires and x and y column') + if com.is_integer(x) and not self.data.columns.holds_integer(): + x = self.data.columns[x] + if com.is_integer(y) and not self.data.columns.holds_integer(): + y = self.data.columns[y] + self.x = x + self.y = y + + def _get_layout(self): + return (1, 1) + + def _make_plot(self): + x, y, data = self.x, self.y, self.data + ax = self.axes[0] + + if self.legend and hasattr(self, 'label'): + label = self.label + else: + label = None + scatter = ax.scatter(data[x].values, data[y].values, label=label, + **self.kwds) + + self._add_legend_handle(scatter, label) + + errors_x = self._get_errorbars(label=x, index=0, yerr=False) + errors_y = self._get_errorbars(label=y, index=1, xerr=False) + if len(errors_x) > 0 or len(errors_y) > 0: + err_kwds = dict(errors_x, **errors_y) + if 'color' in self.kwds: + err_kwds['color'] = self.kwds['color'] + ax.errorbar(data[x].values, data[y].values, linestyle='none', **err_kwds) + + def _post_plot_logic(self): + ax = self.axes[0] + x, y = self.x, self.y + ax.set_ylabel(com.pprint_thing(y)) + ax.set_xlabel(com.pprint_thing(x)) + + +class HexBinPlot(MPLPlot): + def __init__(self, data, x, y, C=None, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + + if x is None or y is None: + raise ValueError('hexbin requires and x and y column') + if com.is_integer(x) and not self.data.columns.holds_integer(): + x = self.data.columns[x] + if com.is_integer(y) and not self.data.columns.holds_integer(): + y = self.data.columns[y] + + if com.is_integer(C) and not self.data.columns.holds_integer(): + C = self.data.columns[C] + + self.x = x + self.y = y + self.C = C + + def _get_layout(self): + return (1, 1) + + def _make_plot(self): + import matplotlib.pyplot as plt + + x, y, data, C = self.x, self.y, self.data, self.C + ax = self.axes[0] + # pandas uses colormap, matplotlib uses cmap. 
+ cmap = self.colormap or 'BuGn' + cmap = plt.cm.get_cmap(cmap) + cb = self.kwds.pop('colorbar', True) + + if C is None: + c_values = None + else: + c_values = data[C].values + + ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, + **self.kwds) + if cb: + img = ax.collections[0] + self.fig.colorbar(img, ax=ax) + + def _post_plot_logic(self): + ax = self.axes[0] + x, y = self.x, self.y + ax.set_ylabel(com.pprint_thing(y)) + ax.set_xlabel(com.pprint_thing(x)) + + +class LinePlot(MPLPlot): + + def __init__(self, data, **kwargs): + self.stacked = kwargs.pop('stacked', False) + if self.stacked: + data = data.fillna(value=0) + + MPLPlot.__init__(self, data, **kwargs) + self.x_compat = plot_params['x_compat'] + if 'x_compat' in self.kwds: + self.x_compat = bool(self.kwds.pop('x_compat')) + + def _index_freq(self): + from pandas.core.frame import DataFrame + if isinstance(self.data, (Series, DataFrame)): + freq = getattr(self.data.index, 'freq', None) + if freq is None: + freq = getattr(self.data.index, 'inferred_freq', None) + if freq == 'B': + weekdays = np.unique(self.data.index.dayofweek) + if (5 in weekdays) or (6 in weekdays): + freq = None + return freq + + def _is_dynamic_freq(self, freq): + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = get_base_alias(freq) + freq = get_period_alias(freq) + return freq is not None and self._no_base(freq) + + def _no_base(self, freq): + # hack this for 0.10.1, creating more technical debt...sigh + from pandas.core.frame import DataFrame + if (isinstance(self.data, (Series, DataFrame)) + and isinstance(self.data.index, DatetimeIndex)): + import pandas.tseries.frequencies as freqmod + base = freqmod.get_freq(freq) + x = self.data.index + if (base <= freqmod.FreqGroup.FR_DAY): + return x[:1].is_normalized + + return Period(x[0], freq).to_timestamp(tz=x.tz) == x[0] + return True + + def _use_dynamic_x(self): + freq = self._index_freq() + + ax = self._get_ax(0) + ax_freq = getattr(ax, 'freq', None) + if freq is None: # convert irregular if axes has freq info + freq = ax_freq + else: # do not use tsplot if irregular was plotted first + if (ax_freq is None) and (len(ax.get_lines()) > 0): + return False + + return (freq is not None) and self._is_dynamic_freq(freq) + + def _is_ts_plot(self): + # this is slightly deceptive + return not self.x_compat and self.use_index and self._use_dynamic_x() + + def _make_plot(self): + self._pos_prior = np.zeros(len(self.data)) + self._neg_prior = np.zeros(len(self.data)) + + if self._is_ts_plot(): + data = self._maybe_convert_index(self.data) + self._make_ts_plot(data) + else: + x = self._get_xticks(convert_period=True) + + plotf = self._get_plot_function() + colors = self._get_colors() + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + style = self._get_style(i, label) + kwds = self.kwds.copy() + self._maybe_add_color(colors, kwds, style, i) + + errors = self._get_errorbars(label=label, index=i) + kwds = dict(kwds, **errors) + + label = com.pprint_thing(label) # .encode('utf-8') + kwds['label'] = label + + y_values = self._get_stacked_values(y, label) + + if not self.stacked: + mask = com.isnull(y_values) + if mask.any(): + y_values = np.ma.array(y_values) + y_values = np.ma.masked_where(mask, y_values) + + # prevent style kwarg from going to errorbar, where it is unsupported + if style is not None and plotf.__name__ != 'errorbar': + args = (ax, x, y_values, style) + else: + args = (ax, x, y_values) + + newlines = plotf(*args, **kwds) + 
self._add_legend_handle(newlines[0], label, index=i) + + if self.stacked and not self.subplots: + if (y >= 0).all(): + self._pos_prior += y + elif (y <= 0).all(): + self._neg_prior += y + + lines = _get_all_lines(ax) + left, right = _get_xlim(lines) + ax.set_xlim(left, right) + + def _get_stacked_values(self, y, label): + if self.stacked: + if (y >= 0).all(): + return self._pos_prior + y + elif (y <= 0).all(): + return self._neg_prior + y + else: + raise ValueError('When stacked is True, each column must be either all positive or negative.' + '{0} contains both positive and negative values'.format(label)) + else: + return y + + def _get_ts_plot_function(self): + from pandas.tseries.plotting import tsplot + plotf = self._get_plot_function() + + def _plot(data, ax, label, style, **kwds): + # errorbar function does not support style argument + if plotf.__name__ == 'errorbar': + lines = tsplot(data, plotf, ax=ax, label=label, + **kwds) + return lines + else: + lines = tsplot(data, plotf, ax=ax, label=label, + style=style, **kwds) + return lines + return _plot + + def _make_ts_plot(self, data, **kwargs): + colors = self._get_colors() + plotf = self._get_ts_plot_function() + + it = self._iter_data(data=data, keep_index=True) + for i, (label, y) in enumerate(it): + ax = self._get_ax(i) + style = self._get_style(i, label) + kwds = self.kwds.copy() + + self._maybe_add_color(colors, kwds, style, i) + + errors = self._get_errorbars(label=label, index=i, xerr=False) + kwds = dict(kwds, **errors) + + label = com.pprint_thing(label) + + y_values = self._get_stacked_values(y, label) + + newlines = plotf(y_values, ax, label, style, **kwds) + self._add_legend_handle(newlines[0], label, index=i) + + if self.stacked and not self.subplots: + if (y >= 0).all(): + self._pos_prior += y + elif (y <= 0).all(): + self._neg_prior += y + + def _maybe_convert_index(self, data): + # tsplot converts automatically, but don't want to convert index + # over and over for DataFrames + from pandas.core.frame import DataFrame + if (isinstance(data.index, DatetimeIndex) and + isinstance(data, DataFrame)): + freq = getattr(data.index, 'freq', None) + + if freq is None: + freq = getattr(data.index, 'inferred_freq', None) + if isinstance(freq, DateOffset): + freq = freq.rule_code + freq = get_base_alias(freq) + freq = get_period_alias(freq) + + if freq is None: + ax = self._get_ax(0) + freq = getattr(ax, 'freq', None) + + if freq is None: + raise ValueError('Could not get frequency alias for plotting') + + data = DataFrame(data.values, + index=data.index.to_period(freq=freq), + columns=data.columns) + return data + + def _post_plot_logic(self): + df = self.data + + condition = (not self._use_dynamic_x() + and df.index.is_all_dates + and not self.subplots + or (self.subplots and self.sharex)) + + index_name = self._get_index_name() + + rot = 30 + if self.rot is not None: + rot = self.rot + + for ax in self.axes: + if condition: + format_date_labels(ax, rot=rot) + elif self.rot is not None: + for l in ax.get_xticklabels(): + l.set_rotation(self.rot) + + if index_name is not None: + ax.set_xlabel(index_name) + + +class AreaPlot(LinePlot): + + def __init__(self, data, **kwargs): + kwargs.setdefault('stacked', True) + data = data.fillna(value=0) + LinePlot.__init__(self, data, **kwargs) + + if not self.stacked: + # use smaller alpha to distinguish overlap + self.kwds.setdefault('alpha', 0.5) + + def _get_plot_function(self): + if self.logy or self.loglog: + raise ValueError("Log-y scales are not supported in area plot") + else: + f = 
LinePlot._get_plot_function(self) + + def plotf(*args, **kwds): + lines = f(*args, **kwds) + + # insert fill_between starting point + y = args[2] + if (y >= 0).all(): + start = self._pos_prior + elif (y <= 0).all(): + start = self._neg_prior + else: + start = np.zeros(len(y)) + + # get x data from the line + # to retrieve x coodinates of tsplot + xdata = lines[0].get_data()[0] + # remove style + args = (args[0], xdata, start, y) + + if not 'color' in kwds: + kwds['color'] = lines[0].get_color() + + self.plt.Axes.fill_between(*args, **kwds) + return lines + + return plotf + + def _add_legend_handle(self, handle, label, index=None): + from matplotlib.patches import Rectangle + # Because fill_between isn't supported in legend, + # specifically add Rectangle handle here + alpha = self.kwds.get('alpha', 0.5) + handle = Rectangle((0, 0), 1, 1, fc=handle.get_color(), alpha=alpha) + LinePlot._add_legend_handle(self, handle, label, index=index) + + def _post_plot_logic(self): + LinePlot._post_plot_logic(self) + + if self._is_ts_plot(): + pass + else: + if self.xlim is None: + for ax in self.axes: + lines = _get_all_lines(ax) + left, right = _get_xlim(lines) + ax.set_xlim(left, right) + + if self.ylim is None: + if (self.data >= 0).all().all(): + for ax in self.axes: + ax.set_ylim(0, None) + elif (self.data <= 0).all().all(): + for ax in self.axes: + ax.set_ylim(None, 0) + + +class BarPlot(MPLPlot): + + _default_rot = {'bar': 90, 'barh': 0} + + def __init__(self, data, **kwargs): + self.stacked = kwargs.pop('stacked', False) + + self.bar_width = kwargs.pop('width', 0.5) + + pos = kwargs.pop('position', 0.5) + + kwargs.setdefault('align', 'center') + self.tick_pos = np.arange(len(data)) + + self.bottom = kwargs.pop('bottom', None) + self.left = kwargs.pop('left', None) + + self.log = kwargs.pop('log',False) + MPLPlot.__init__(self, data, **kwargs) + + if self.stacked or self.subplots: + self.tickoffset = self.bar_width * pos + if kwargs['align'] == 'edge': + self.lim_offset = self.bar_width / 2 + else: + self.lim_offset = 0 + else: + if kwargs['align'] == 'edge': + w = self.bar_width / self.nseries + self.tickoffset = self.bar_width * (pos - 0.5) + w * 0.5 + self.lim_offset = w * 0.5 + else: + self.tickoffset = self.bar_width * pos + self.lim_offset = 0 + + self.ax_pos = self.tick_pos - self.tickoffset + + def _args_adjust(self): + if self.rot is None: + self.rot = self._default_rot[self.kind] + + if com.is_list_like(self.bottom): + self.bottom = np.array(self.bottom) + if com.is_list_like(self.left): + self.left = np.array(self.left) + + def _get_plot_function(self): + if self.kind == 'bar': + def f(ax, x, y, w, start=None, **kwds): + if self.bottom is not None: + start = start + self.bottom + return ax.bar(x, y, w, bottom=start,log=self.log, **kwds) + elif self.kind == 'barh': + def f(ax, x, y, w, start=None, log=self.log, **kwds): + if self.left is not None: + start = start + self.left + return ax.barh(x, y, w, left=start, **kwds) + else: + raise NotImplementedError + + return f + + def _make_plot(self): + import matplotlib as mpl + # mpl decided to make their version string unicode across all Python + # versions for mpl >= 1.3 so we have to call str here for python 2 + mpl_le_1_2_1 = str(mpl.__version__) <= LooseVersion('1.2.1') + + colors = self._get_colors() + ncolors = len(colors) + + bar_f = self._get_plot_function() + pos_prior = neg_prior = np.zeros(len(self.data)) + K = self.nseries + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + kwds = self.kwds.copy() + 
kwds['color'] = colors[i % ncolors] + + errors = self._get_errorbars(label=label, index=i) + kwds = dict(kwds, **errors) + + label = com.pprint_thing(label) + + if (('yerr' in kwds) or ('xerr' in kwds)) \ + and (kwds.get('ecolor') is None): + kwds['ecolor'] = mpl.rcParams['xtick.color'] + + start = 0 + if self.log: + start = 1 + if any(y < 1): + # GH3254 + start = 0 if mpl_le_1_2_1 else None + + if self.subplots: + w = self.bar_width / 2 + rect = bar_f(ax, self.ax_pos + w, y, self.bar_width, + start=start, label=label, **kwds) + ax.set_title(label) + elif self.stacked: + mask = y > 0 + start = np.where(mask, pos_prior, neg_prior) + w = self.bar_width / 2 + rect = bar_f(ax, self.ax_pos + w, y, self.bar_width, + start=start, label=label, **kwds) + pos_prior = pos_prior + np.where(mask, y, 0) + neg_prior = neg_prior + np.where(mask, 0, y) + else: + w = self.bar_width / K + rect = bar_f(ax, self.ax_pos + (i + 0.5) * w, y, w, + start=start, label=label, **kwds) + self._add_legend_handle(rect, label, index=i) + + def _post_plot_logic(self): + for ax in self.axes: + if self.use_index: + str_index = [com.pprint_thing(key) for key in self.data.index] + else: + str_index = [com.pprint_thing(key) for key in + range(self.data.shape[0])] + name = self._get_index_name() + + s_edge = self.ax_pos[0] - 0.25 + self.lim_offset + e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset + + if self.kind == 'bar': + ax.set_xlim((s_edge, e_edge)) + ax.set_xticks(self.tick_pos) + ax.set_xticklabels(str_index, rotation=self.rot, + fontsize=self.fontsize) + if not self.log: # GH3254+ + ax.axhline(0, color='k', linestyle='--') + if name is not None: + ax.set_xlabel(name) + elif self.kind == 'barh': + # horizontal bars + ax.set_ylim((s_edge, e_edge)) + ax.set_yticks(self.tick_pos) + ax.set_yticklabels(str_index, rotation=self.rot, + fontsize=self.fontsize) + ax.axvline(0, color='k', linestyle='--') + if name is not None: + ax.set_ylabel(name) + else: + raise NotImplementedError(self.kind) + + +class PiePlot(MPLPlot): + + def __init__(self, data, kind=None, **kwargs): + data = data.fillna(value=0) + if (data < 0).any().any(): + raise ValueError("{0} doesn't allow negative values".format(kind)) + MPLPlot.__init__(self, data, kind=kind, **kwargs) + + def _args_adjust(self): + self.grid = False + self.logy = False + self.logx = False + self.loglog = False + + def _get_layout(self): + from pandas import DataFrame + if isinstance(self.data, DataFrame): + return (1, len(self.data.columns)) + else: + return (1, 1) + + def _validate_color_args(self): + pass + + def _make_plot(self): + self.kwds.setdefault('colors', self._get_colors(num_colors=len(self.data), + color_kwds='colors')) + + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + if label is not None: + label = com.pprint_thing(label) + ax.set_ylabel(label) + + kwds = self.kwds.copy() + + idx = [com.pprint_thing(v) for v in self.data.index] + labels = kwds.pop('labels', idx) + # labels is used for each wedge's labels + results = ax.pie(y, labels=labels, **kwds) + + if kwds.get('autopct', None) is not None: + patches, texts, autotexts = results + else: + patches, texts = results + autotexts = [] + + if self.fontsize is not None: + for t in texts + autotexts: + t.set_fontsize(self.fontsize) + + # leglabels is used for legend labels + leglabels = labels if labels is not None else idx + for p, l in zip(patches, leglabels): + self._add_legend_handle(p, l) + + +class BoxPlot(MPLPlot): + pass + + +class HistPlot(MPLPlot): + pass + +# kinds 
supported by both dataframe and series +_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area'] +# kinds supported by dataframe +_dataframe_kinds = ['scatter', 'hexbin'] +# kinds supported only by series or dataframe single column +_series_kinds = ['pie'] +_all_kinds = _common_kinds + _dataframe_kinds + _series_kinds + +_plot_klass = {'line': LinePlot, 'bar': BarPlot, 'barh': BarPlot, + 'kde': KdePlot, + 'scatter': ScatterPlot, 'hexbin': HexBinPlot, + 'area': AreaPlot, 'pie': PiePlot} + + +def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, + sharey=False, use_index=True, figsize=None, grid=None, + legend=True, rot=None, ax=None, style=None, title=None, + xlim=None, ylim=None, logx=False, logy=False, xticks=None, + yticks=None, kind='line', sort_columns=False, fontsize=None, + secondary_y=False, **kwds): + + """ + Make line, bar, or scatter plots of DataFrame series with the index on the x-axis + using matplotlib / pylab. + + Parameters + ---------- + frame : DataFrame + x : label or position, default None + y : label or position, default None + Allows plotting of one column versus another + yerr : DataFrame (with matching labels), Series, list-type (tuple, list, + ndarray), or str of column name containing y error values + xerr : similar functionality as yerr, but for x error values + subplots : boolean, default False + Make separate subplots for each time series + sharex : boolean, default True + In case subplots=True, share x axis + sharey : boolean, default False + In case subplots=True, share y axis + use_index : boolean, default True + Use index as ticks for x axis + stacked : boolean, default False + If True, create stacked bar plot. Only valid for DataFrame input + sort_columns: boolean, default False + Sort column names to determine plot ordering + title : string + Title to use for the plot + grid : boolean, default None (matlab style default) + Axis grid lines + legend : False/True/'reverse' + Place legend on axis subplots + + ax : matplotlib axis object, default None + style : list or dict + matplotlib line style per column + kind : {'line', 'bar', 'barh', 'kde', 'density', 'area', scatter', 'hexbin'} + line : line plot + bar : vertical bar plot + barh : horizontal bar plot + kde/density : Kernel Density Estimation plot + area : area plot + scatter : scatter plot + hexbin : hexbin plot + logx : boolean, default False + Use log scaling on x axis + logy : boolean, default False + Use log scaling on y axis + loglog : boolean, default False + Use log scaling on both x and y axes + xticks : sequence + Values to use for the xticks + yticks : sequence + Values to use for the yticks + xlim : 2-tuple/list + ylim : 2-tuple/list + rot : int, default None + Rotation for ticks + secondary_y : boolean or sequence, default False + Whether to plot on the secondary y-axis + If a list/tuple, which columns to plot on secondary y-axis + mark_right: boolean, default True + When using a secondary_y axis, should the legend label the axis of + the various columns automatically + colormap : str or matplotlib colormap object, default None + Colormap to select colors from. If string, load colormap with that name + from matplotlib. + position : float + Specify relative alignments for bar plot layout. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) + table : boolean, Series or DataFrame, default False + If True, draw a table using the data in the DataFrame and the data will + be transposed to meet matplotlib's default layout. 
+ If a Series or DataFrame is passed, use passed data to draw a table. + kwds : keywords + Options to pass to matplotlib plotting method + + Returns + ------- + ax_or_axes : matplotlib.AxesSubplot or list of them + + Notes + ----- + + If `kind`='hexbin', you can control the size of the bins with the + `gridsize` argument. By default, a histogram of the counts around each + `(x, y)` point is computed. You can specify alternative aggregations + by passing values to the `C` and `reduce_C_function` arguments. + `C` specifies the value at each `(x, y)` point and `reduce_C_function` + is a function of one argument that reduces all the values in a bin to + a single number (e.g. `mean`, `max`, `sum`, `std`). + """ + + kind = _get_standard_kind(kind.lower().strip()) + if kind in _all_kinds: + klass = _plot_klass[kind] + else: + raise ValueError('Invalid chart type given %s' % kind) + + if kind in _dataframe_kinds: + plot_obj = klass(frame, x=x, y=y, kind=kind, subplots=subplots, + rot=rot,legend=legend, ax=ax, style=style, + fontsize=fontsize, use_index=use_index, sharex=sharex, + sharey=sharey, xticks=xticks, yticks=yticks, + xlim=xlim, ylim=ylim, title=title, grid=grid, + figsize=figsize, logx=logx, logy=logy, + sort_columns=sort_columns, secondary_y=secondary_y, + **kwds) + elif kind in _series_kinds: + if y is None and subplots is False: + msg = "{0} requires either y column or 'subplots=True'" + raise ValueError(msg.format(kind)) + elif y is not None: + if com.is_integer(y) and not frame.columns.holds_integer(): + y = frame.columns[y] + frame = frame[y] # converted to series actually + frame.index.name = y + + plot_obj = klass(frame, kind=kind, subplots=subplots, + rot=rot,legend=legend, ax=ax, style=style, + fontsize=fontsize, use_index=use_index, sharex=sharex, + sharey=sharey, xticks=xticks, yticks=yticks, + xlim=xlim, ylim=ylim, title=title, grid=grid, + figsize=figsize, + sort_columns=sort_columns, + **kwds) + else: + if x is not None: + if com.is_integer(x) and not frame.columns.holds_integer(): + x = frame.columns[x] + frame = frame.set_index(x) + + if y is not None: + if com.is_integer(y) and not frame.columns.holds_integer(): + y = frame.columns[y] + label = x if x is not None else frame.index.name + label = kwds.pop('label', label) + ser = frame[y] + ser.index.name = label + + for kw in ['xerr', 'yerr']: + if (kw in kwds) and \ + (isinstance(kwds[kw], string_types) or com.is_integer(kwds[kw])): + try: + kwds[kw] = frame[kwds[kw]] + except (IndexError, KeyError, TypeError): + pass + + return plot_series(ser, label=label, kind=kind, + use_index=use_index, + rot=rot, xticks=xticks, yticks=yticks, + xlim=xlim, ylim=ylim, ax=ax, style=style, + grid=grid, logx=logx, logy=logy, + secondary_y=secondary_y, title=title, + figsize=figsize, fontsize=fontsize, **kwds) + + else: + plot_obj = klass(frame, kind=kind, subplots=subplots, rot=rot, + legend=legend, ax=ax, style=style, fontsize=fontsize, + use_index=use_index, sharex=sharex, sharey=sharey, + xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, + title=title, grid=grid, figsize=figsize, logx=logx, + logy=logy, sort_columns=sort_columns, + secondary_y=secondary_y, **kwds) + + plot_obj.generate() + plot_obj.draw() + if subplots: + return plot_obj.axes + else: + return plot_obj.axes[0] + + +def plot_series(series, label=None, kind='line', use_index=True, rot=None, + xticks=None, yticks=None, xlim=None, ylim=None, + ax=None, style=None, grid=None, legend=False, logx=False, + logy=False, secondary_y=False, **kwds): + """ + Plot the input series 
with the index on the x-axis using matplotlib + + Parameters + ---------- + label : label argument to provide to plot + kind : {'line', 'bar', 'barh', 'kde', 'density', 'area'} + line : line plot + bar : vertical bar plot + barh : horizontal bar plot + kde/density : Kernel Density Estimation plot + area : area plot + use_index : boolean, default True + Plot index as axis tick labels + rot : int, default None + Rotation for tick labels + xticks : sequence + Values to use for the xticks + yticks : sequence + Values to use for the yticks + xlim : 2-tuple/list + ylim : 2-tuple/list + ax : matplotlib axis object + If not passed, uses gca() + style : string, default matplotlib default + matplotlib line style to use + grid : matplotlib grid + legend: matplotlib legend + logx : boolean, default False + Use log scaling on x axis + logy : boolean, default False + Use log scaling on y axis + loglog : boolean, default False + Use log scaling on both x and y axes + secondary_y : boolean or sequence of ints, default False + If True then y-axis will be on the right + figsize : a tuple (width, height) in inches + position : float + Specify relative alignments for bar plot layout. + From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5 (center) + table : boolean, Series or DataFrame, default False + If True, draw a table using the data in the Series and the data will + be transposed to meet matplotlib's default layout. + If a Series or DataFrame is passed, use passed data to draw a table. + kwds : keywords + Options to pass to matplotlib plotting method + + Notes + ----- + See matplotlib documentation online for more on this subject + """ + + kind = _get_standard_kind(kind.lower().strip()) + if kind in _common_kinds or kind in _series_kinds: + klass = _plot_klass[kind] + else: + raise ValueError('Invalid chart type given %s' % kind) + + """ + If no axis is specified, we check whether there are existing figures. + If so, we get the current axis and check whether yaxis ticks are on the + right. Ticks for the plot of the series will be on the right unless + there is at least one axis with ticks on the left. + + If we do not check for whether there are existing figures, _gca() will + create a figure with the default figsize, causing the figsize= parameter to + be ignored. + """ + import matplotlib.pyplot as plt + if ax is None and len(plt.get_fignums()) > 0: + ax = _gca() + ax = getattr(ax, 'left_ax', ax) + + # is there harm in this? 
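+ # illustrative usage sketch:
+ #   s = pd.Series(np.random.randn(100), name='noise')
+ #   ax = plot_series(s)   # the fallback below picks up s.name as the label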
+ if label is None: + label = series.name + + plot_obj = klass(series, kind=kind, rot=rot, logx=logx, logy=logy, + ax=ax, use_index=use_index, style=style, + xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, + legend=legend, grid=grid, label=label, + secondary_y=secondary_y, **kwds) + + plot_obj.generate() + plot_obj.draw() + + # plot_obj.ax is None if we created the first figure + return plot_obj.axes[0] + + +_shared_docs['boxplot'] = """ + Make a box plot from DataFrame column optionally grouped by some columns or + other inputs + + Parameters + ---------- + data : the pandas object holding the data + column : column name or list of names, or vector + Can be any valid input to groupby + by : string or sequence + Column in the DataFrame to group by + ax : Matplotlib axes object, optional + fontsize : int or string + rot : label rotation angle + figsize : A tuple (width, height) in inches + grid : Setting this to True will show the grid + layout : tuple (optional) + (rows, columns) for the layout of the plot + return_type : {'axes', 'dict', 'both'}, default 'dict' + The kind of object to return. 'dict' returns a dictionary + whose values are the matplotlib Lines of the boxplot; + 'axes' returns the matplotlib axes the boxplot is drawn on; + 'both' returns a namedtuple with the axes and dict. + + When grouping with ``by``, a dict mapping columns to ``return_type`` + is returned. + + kwds : other plotting keyword arguments to be passed to matplotlib boxplot + function + + Returns + ------- + lines : dict + ax : matplotlib Axes + (ax, lines): namedtuple + + Notes + ----- + Use ``return_type='dict'`` when you want to tweak the appearance + of the lines after plotting. In this case a dict containing the Lines + making up the boxes, caps, fliers, medians, and whiskers is returned. 
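+
+ For example (when not grouping with ``by``), ``return_type='both'`` yields
+ a namedtuple whose ``ax`` attribute is the Axes the boxes were drawn on and
+ whose ``lines`` dict holds the artists (``lines['boxes']``,
+ ``lines['whiskers']``, ...).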
+ """ + + +@Appender(_shared_docs['boxplot'] % _shared_doc_kwargs) +def boxplot(data, column=None, by=None, ax=None, fontsize=None, + rot=0, grid=True, figsize=None, layout=None, return_type=None, + **kwds): + + # validate return_type: + valid_types = (None, 'axes', 'dict', 'both') + if return_type not in valid_types: + raise ValueError("return_type") + + from pandas import Series, DataFrame + if isinstance(data, Series): + data = DataFrame({'x': data}) + column = 'x' + + def _get_colors(): + return _get_standard_colors(color=kwds.get('color'), num_colors=1) + + def maybe_color_bp(bp): + if 'color' not in kwds : + from matplotlib.artist import setp + setp(bp['boxes'],color=colors[0],alpha=1) + setp(bp['whiskers'],color=colors[0],alpha=1) + setp(bp['medians'],color=colors[2],alpha=1) + + BP = namedtuple("Boxplot", ['ax', 'lines']) # namedtuple to hold results + + def plot_group(keys, values, ax): + keys = [com.pprint_thing(x) for x in keys] + values = [remove_na(v) for v in values] + bp = ax.boxplot(values, **kwds) + if kwds.get('vert', 1): + ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) + else: + ax.set_yticklabels(keys, rotation=rot, fontsize=fontsize) + maybe_color_bp(bp) + + # Return axes in multiplot case, maybe revisit later # 985 + if return_type == 'dict': + return bp + elif return_type == 'both': + return BP(ax=ax, lines=bp) + else: + return ax + + colors = _get_colors() + if column is None: + columns = None + else: + if isinstance(column, (list, tuple)): + columns = column + else: + columns = [column] + + if by is not None: + result = _grouped_plot_by_column(plot_group, data, columns=columns, + by=by, grid=grid, figsize=figsize, + ax=ax, layout=layout, return_type=return_type) + else: + if layout is not None: + raise ValueError("The 'layout' keyword is not supported when " + "'by' is None") + + if return_type is None: + msg = ("\nThe default value for 'return_type' will change to " + "'axes' in a future release.\n To use the future behavior " + "now, set return_type='axes'.\n To keep the previous " + "behavior and silence this warning, set " + "return_type='dict'.") + warnings.warn(msg, FutureWarning) + return_type = 'dict' + if ax is None: + ax = _gca() + data = data._get_numeric_data() + if columns is None: + columns = data.columns + else: + data = data[columns] + + result = plot_group(columns, data.values.T, ax) + ax.grid(grid) + + return result + + +def format_date_labels(ax, rot): + # mini version of autofmt_xdate + try: + for label in ax.get_xticklabels(): + label.set_ha('right') + label.set_rotation(rot) + fig = ax.get_figure() + fig.subplots_adjust(bottom=0.2) + except Exception: # pragma: no cover + pass + + +def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False, **kwargs): + """ + Make a scatter plot from two DataFrame columns + + Parameters + ---------- + data : DataFrame + x : Column name for the x-axis values + y : Column name for the y-axis values + ax : Matplotlib axis object + figsize : A tuple (width, height) in inches + grid : Setting this to True will show the grid + kwargs : other plotting keyword arguments + To be passed to scatter function + + Returns + ------- + fig : matplotlib.Figure + """ + import matplotlib.pyplot as plt + + # workaround because `c='b'` is hardcoded in matplotlibs scatter method + kwargs.setdefault('c', plt.rcParams['patch.facecolor']) + + def plot_group(group, ax): + xvals = group[x].values + yvals = group[y].values + ax.scatter(xvals, yvals, **kwargs) + ax.grid(grid) + + if by is not None: + fig = 
_grouped_plot(plot_group, data, by=by, figsize=figsize, ax=ax) + else: + if ax is None: + fig = plt.figure() + ax = fig.add_subplot(111) + else: + fig = ax.get_figure() + plot_group(data, ax) + ax.set_ylabel(com.pprint_thing(y)) + ax.set_xlabel(com.pprint_thing(x)) + + ax.grid(grid) + + return fig + + +def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, + xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, + sharey=False, figsize=None, layout=None, bins=10, **kwds): + """ + Draw histogram of the DataFrame's series using matplotlib / pylab. + + Parameters + ---------- + data : DataFrame + column : string or sequence + If passed, will be used to limit data to a subset of columns + by : object, optional + If passed, then used to form histograms for separate groups + grid : boolean, default True + Whether to show axis grid lines + xlabelsize : int, default None + If specified changes the x-axis label size + xrot : float, default None + rotation of x axis labels + ylabelsize : int, default None + If specified changes the y-axis label size + yrot : float, default None + rotation of y axis labels + ax : matplotlib axes object, default None + sharex : bool, if True, the X axis will be shared amongst all subplots. + sharey : bool, if True, the Y axis will be shared amongst all subplots. + figsize : tuple + The size of the figure to create in inches by default + layout: (optional) a tuple (rows, columns) for the layout of the histograms + bins: integer, default 10 + Number of histogram bins to be used + kwds : other plotting keyword arguments + To be passed to hist function + """ + + if by is not None: + axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize, + sharex=sharex, sharey=sharey, layout=layout, bins=bins, + xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, + **kwds) + return axes + + if column is not None: + if not isinstance(column, (list, np.ndarray)): + column = [column] + data = data[column] + data = data._get_numeric_data() + naxes = len(data.columns) + + nrows, ncols = _get_layout(naxes, layout=layout) + fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, ax=ax, squeeze=False, + sharex=sharex, sharey=sharey, figsize=figsize) + + for i, col in enumerate(com._try_sort(data.columns)): + ax = axes[i // ncols, i % ncols] + ax.hist(data[col].dropna().values, bins=bins, **kwds) + ax.set_title(col) + ax.grid(grid) + + _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, + ylabelsize=ylabelsize, yrot=yrot) + fig.subplots_adjust(wspace=0.3, hspace=0.3) + + return axes + + +def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, + xrot=None, ylabelsize=None, yrot=None, figsize=None, bins=10, **kwds): + """ + Draw histogram of the input series using matplotlib + + Parameters + ---------- + by : object, optional + If passed, then used to form histograms for separate groups + ax : matplotlib axis object + If not passed, uses gca() + grid : boolean, default True + Whether to show axis grid lines + xlabelsize : int, default None + If specified changes the x-axis label size + xrot : float, default None + rotation of x axis labels + ylabelsize : int, default None + If specified changes the y-axis label size + yrot : float, default None + rotation of y axis labels + figsize : tuple, default None + figure size in inches by default + bins: integer, default 10 + Number of histogram bins to be used + kwds : keywords + To be passed to the actual plotting function + + Notes + ----- + See matplotlib 
documentation online for more on this + + """ + import matplotlib.pyplot as plt + + if by is None: + if kwds.get('layout', None) is not None: + raise ValueError("The 'layout' keyword is not supported when " + "'by' is None") + # hack until the plotting interface is a bit more unified + fig = kwds.pop('figure', plt.gcf() if plt.get_fignums() else + plt.figure(figsize=figsize)) + if (figsize is not None and tuple(figsize) != + tuple(fig.get_size_inches())): + fig.set_size_inches(*figsize, forward=True) + if ax is None: + ax = fig.gca() + elif ax.get_figure() != fig: + raise AssertionError('passed axis not bound to passed figure') + values = self.dropna().values + + ax.hist(values, bins=bins, **kwds) + ax.grid(grid) + axes = np.array([ax]) + + _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, + ylabelsize=ylabelsize, yrot=yrot) + + else: + if 'figure' in kwds: + raise ValueError("Cannot pass 'figure' when using the " + "'by' argument, since a new 'Figure' instance " + "will be created") + axes = grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize, bins=bins, + xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, + **kwds) + + if axes.ndim == 1 and len(axes) == 1: + return axes[0] + return axes + + +def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, + layout=None, sharex=False, sharey=False, rot=90, grid=True, + xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, + **kwargs): + """ + Grouped histogram + + Parameters + ---------- + data: Series/DataFrame + column: object, optional + by: object, optional + ax: axes, optional + bins: int, default 50 + figsize: tuple, optional + layout: optional + sharex: boolean, default False + sharey: boolean, default False + rot: int, default 90 + grid: bool, default True + kwargs: dict, keyword arguments passed to matplotlib.Axes.hist + + Returns + ------- + axes: collection of Matplotlib Axes + """ + def plot_group(group, ax): + ax.hist(group.dropna().values, bins=bins, **kwargs) + + xrot = xrot or rot + + fig, axes = _grouped_plot(plot_group, data, column=column, + by=by, sharex=sharex, sharey=sharey, + figsize=figsize, layout=layout, rot=rot) + + _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, + ylabelsize=ylabelsize, yrot=yrot) + + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, + hspace=0.5, wspace=0.3) + return axes + + +def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, + rot=0, grid=True, ax=None, figsize=None, + layout=None, **kwds): + """ + Make box plots from DataFrameGroupBy data. 
+ + Parameters + ---------- + grouped : Grouped DataFrame + subplots : + * ``False`` - no subplots will be used + * ``True`` - create a subplot for each group + column : column name or list of names, or vector + Can be any valid input to groupby + fontsize : int or string + rot : label rotation angle + grid : Setting this to True will show the grid + figsize : A tuple (width, height) in inches + layout : tuple (optional) + (rows, columns) for the layout of the plot + kwds : other plotting keyword arguments to be passed to matplotlib boxplot + function + + Returns + ------- + dict of key/value = group key/DataFrame.boxplot return value + or DataFrame.boxplot return value in case subplots=figures=False + + Examples + -------- + >>> import pandas + >>> import numpy as np + >>> import itertools + >>> + >>> tuples = [t for t in itertools.product(range(1000), range(4))] + >>> index = pandas.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) + >>> data = np.random.randn(len(index),4) + >>> df = pandas.DataFrame(data, columns=list('ABCD'), index=index) + >>> + >>> grouped = df.groupby(level='lvl1') + >>> boxplot_frame_groupby(grouped) + >>> + >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1) + >>> boxplot_frame_groupby(grouped, subplots=False) + """ + if subplots is True: + naxes = len(grouped) + nrows, ncols = _get_layout(naxes, layout=layout) + fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, squeeze=False, + ax=ax, sharex=False, sharey=True, figsize=figsize) + axes = _flatten(axes) + + ret = compat.OrderedDict() + for (key, group), ax in zip(grouped, axes): + d = group.boxplot(ax=ax, column=column, fontsize=fontsize, + rot=rot, grid=grid, **kwds) + ax.set_title(com.pprint_thing(key)) + ret[key] = d + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + else: + from pandas.tools.merge import concat + keys, frames = zip(*grouped) + if grouped.axis == 0: + df = concat(frames, keys=keys, axis=1) + else: + if len(frames) > 1: + df = frames[0].join(frames[1::]) + else: + df = frames[0] + ret = df.boxplot(column=column, fontsize=fontsize, rot=rot, + grid=grid, ax=ax, figsize=figsize, layout=layout, **kwds) + return ret + + +def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, + figsize=None, sharex=True, sharey=True, layout=None, + rot=0, ax=None, **kwargs): + from pandas import DataFrame + + if figsize == 'default': + # allowed to specify mpl default with 'default' + warnings.warn("figsize='default' is deprecated. 
Specify figure" + "size by tuple instead", FutureWarning) + figsize = None + + grouped = data.groupby(by) + if column is not None: + grouped = grouped[column] + + naxes = len(grouped) + nrows, ncols = _get_layout(naxes, layout=layout) + fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, + figsize=figsize, sharex=sharex, sharey=sharey, ax=ax) + + ravel_axes = _flatten(axes) + + for i, (key, group) in enumerate(grouped): + ax = ravel_axes[i] + if numeric_only and isinstance(group, DataFrame): + group = group._get_numeric_data() + plotf(group, ax, **kwargs) + ax.set_title(com.pprint_thing(key)) + + return fig, axes + + +def _grouped_plot_by_column(plotf, data, columns=None, by=None, + numeric_only=True, grid=False, + figsize=None, ax=None, layout=None, return_type=None, + **kwargs): + grouped = data.groupby(by) + if columns is None: + if not isinstance(by, (list, tuple)): + by = [by] + columns = data._get_numeric_data().columns - by + naxes = len(columns) + nrows, ncols = _get_layout(naxes, layout=layout) + fig, axes = _subplots(nrows=nrows, ncols=ncols, naxes=naxes, + sharex=True, sharey=True, + figsize=figsize, ax=ax) + + ravel_axes = _flatten(axes) + + result = compat.OrderedDict() + for i, col in enumerate(columns): + ax = ravel_axes[i] + gp_col = grouped[col] + keys, values = zip(*gp_col) + re_plotf = plotf(keys, values, ax, **kwargs) + ax.set_title(col) + ax.set_xlabel(com.pprint_thing(by)) + result[col] = re_plotf + ax.grid(grid) + + # Return axes in multiplot case, maybe revisit later # 985 + if return_type is None: + result = axes + + byline = by[0] if len(by) == 1 else by + fig.suptitle('Boxplot grouped by %s' % byline) + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + + return result + + +def table(ax, data, rowLabels=None, colLabels=None, + **kwargs): + + """ + Helper function to convert DataFrame and Series to matplotlib.table + + Parameters + ---------- + `ax`: Matplotlib axes object + `data`: DataFrame or Series + data for table contents + `kwargs`: keywords, optional + keyword arguments which passed to matplotlib.table.table. + If `rowLabels` or `colLabels` is not specified, data index or column name will be used. 
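+
+ For example, `table(ax, np.round(df.describe(), 2), loc='upper right')`
+ draws the summary statistics of a DataFrame `df` onto an existing Axes.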
+ + Returns + ------- + matplotlib table object + """ + from pandas import DataFrame + if isinstance(data, Series): + data = DataFrame(data, columns=[data.name]) + elif isinstance(data, DataFrame): + pass + else: + raise ValueError('Input data must be DataFrame or Series') + + if rowLabels is None: + rowLabels = data.index + + if colLabels is None: + colLabels = data.columns + + cellText = data.values + + import matplotlib.table + table = matplotlib.table.table(ax, cellText=cellText, + rowLabels=rowLabels, colLabels=colLabels, **kwargs) + return table + + +def _get_layout(nplots, layout=None): + if layout is not None: + if not isinstance(layout, (tuple, list)) or len(layout) != 2: + raise ValueError('Layout must be a tuple of (rows, columns)') + + nrows, ncols = layout + if nrows * ncols < nplots: + raise ValueError('Layout of %sx%s must be larger than required size %s' % + (nrows, ncols, nplots)) + + return layout + + if nplots == 1: + return (1, 1) + elif nplots == 2: + return (1, 2) + elif nplots < 4: + return (2, 2) + + k = 1 + while k ** 2 < nplots: + k += 1 + + if (k - 1) * k >= nplots: + return k, (k - 1) + else: + return k, k + +# copied from matplotlib/pyplot.py for compatibility with matplotlib < 1.0 + + +def _subplots(nrows=1, ncols=1, naxes=None, sharex=False, sharey=False, squeeze=True, + subplot_kw=None, ax=None, **fig_kw): + """Create a figure with a set of subplots already made. + + This utility wrapper makes it convenient to create common layouts of + subplots, including the enclosing figure object, in a single call. + + Keyword arguments: + + nrows : int + Number of rows of the subplot grid. Defaults to 1. + + ncols : int + Number of columns of the subplot grid. Defaults to 1. + + naxes : int + Number of required axes. Exceeded axes are set invisible. Default is nrows * ncols. + + sharex : bool + If True, the X axis will be shared amongst all subplots. + + sharey : bool + If True, the Y axis will be shared amongst all subplots. + + squeeze : bool + + If True, extra dimensions are squeezed out from the returned axis object: + - if only one subplot is constructed (nrows=ncols=1), the resulting + single Axis object is returned as a scalar. + - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object + array of Axis objects are returned as numpy 1-d arrays. + - for NxM subplots with N>1 and M>1 are returned as a 2d array. + + If False, no squeezing at all is done: the returned axis object is always + a 2-d array containing Axis instances, even if it ends up being 1x1. + + subplot_kw : dict + Dict with keywords passed to the add_subplot() call used to create each + subplots. + + ax : Matplotlib axis object, optional + + fig_kw : Other keyword arguments to be passed to the figure() call. + Note that all keywords not recognized above will be + automatically included here. + + + Returns: + + fig, ax : tuple + - fig is the Matplotlib Figure object + - ax can be either a single axis object or an array of axis objects if + more than one subplot was created. The dimensions of the resulting array + can be controlled with the squeeze keyword, see above. 
+ + **Examples:** + + x = np.linspace(0, 2*np.pi, 400) + y = np.sin(x**2) + + # Just a figure and one subplot + f, ax = plt.subplots() + ax.plot(x, y) + ax.set_title('Simple plot') + + # Two subplots, unpack the output array immediately + f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) + ax1.plot(x, y) + ax1.set_title('Sharing Y axis') + ax2.scatter(x, y) + + # Four polar axes + plt.subplots(2, 2, subplot_kw=dict(polar=True)) + """ + import matplotlib.pyplot as plt + from pandas.core.frame import DataFrame + + if subplot_kw is None: + subplot_kw = {} + + # Create empty object array to hold all axes. It's easiest to make it 1-d + # so we can just append subplots upon creation, and then + nplots = nrows * ncols + + if naxes is None: + naxes = nrows * ncols + elif nplots < naxes: + raise ValueError("naxes {0} is larger than layour size defined by nrows * ncols".format(naxes)) + + if ax is None: + fig = plt.figure(**fig_kw) + else: + fig = ax.get_figure() + # if ax is passed and a number of subplots is 1, return ax as it is + if naxes == 1: + if squeeze: + return fig, ax + else: + return fig, _flatten(ax) + else: + warnings.warn("To output multiple subplots, the figure containing the passed axes " + "is being cleared", UserWarning) + fig.clear() + + axarr = np.empty(nplots, dtype=object) + + # Create first subplot separately, so we can share it if requested + ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) + + if sharex: + subplot_kw['sharex'] = ax0 + if sharey: + subplot_kw['sharey'] = ax0 + axarr[0] = ax0 + + # Note off-by-one counting because add_subplot uses the MATLAB 1-based + # convention. + for i in range(1, nplots): + ax = fig.add_subplot(nrows, ncols, i + 1, **subplot_kw) + axarr[i] = ax + + if nplots > 1: + if sharex and nrows > 1: + for ax in axarr[:naxes][:-ncols]: # only bottom row + for label in ax.get_xticklabels(): + label.set_visible(False) + ax.xaxis.get_label().set_visible(False) + if sharey and ncols > 1: + for i, ax in enumerate(axarr): + if (i % ncols) != 0: # only first column + for label in ax.get_yticklabels(): + label.set_visible(False) + ax.yaxis.get_label().set_visible(False) + + if naxes != nplots: + for ax in axarr[naxes:]: + ax.set_visible(False) + + if squeeze: + # Reshape the array to have the final desired dimension (nrow,ncol), + # though discarding unneeded dimensions that equal 1. If we only have + # one subplot, just return it instead of a 1-element array. 
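+ # e.g. a 1x3 grid squeezes to a flat array of 3 Axes, a 2x2 grid keeps its
+ # (2, 2) shape, and a single subplot comes back as a bare Axes object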
+ if nplots == 1: + axes = axarr[0] + else: + axes = axarr.reshape(nrows, ncols).squeeze() + else: + # returned axis array will be always 2-d, even if nrows=ncols=1 + axes = axarr.reshape(nrows, ncols) + + return fig, axes + + +def _flatten(axes): + if not com.is_list_like(axes): + axes = [axes] + elif isinstance(axes, np.ndarray): + axes = axes.ravel() + return axes + + +def _get_all_lines(ax): + lines = ax.get_lines() + + # check for right_ax, which can oddly sometimes point back to ax + if hasattr(ax, 'right_ax') and ax.right_ax != ax: + lines += ax.right_ax.get_lines() + + # no such risk with left_ax + if hasattr(ax, 'left_ax'): + lines += ax.left_ax.get_lines() + + return lines + + +def _get_xlim(lines): + left, right = np.inf, -np.inf + for l in lines: + x = l.get_xdata(orig=False) + left = min(x[0], left) + right = max(x[-1], right) + return left, right + + +def _set_ticks_props(axes, xlabelsize=None, xrot=None, + ylabelsize=None, yrot=None): + import matplotlib.pyplot as plt + + for ax in _flatten(axes): + if xlabelsize is not None: + plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + if xrot is not None: + plt.setp(ax.get_xticklabels(), rotation=xrot) + if ylabelsize is not None: + plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + if yrot is not None: + plt.setp(ax.get_yticklabels(), rotation=yrot) + return axes + + +if __name__ == '__main__': + # import pandas.rpy.common as com + # sales = com.load_data('sanfrancisco.home.sales', package='nutshell') + # top10 = sales['zip'].value_counts()[:10].index + # sales2 = sales[sales.zip.isin(top10)] + # _ = scatter_plot(sales2, 'squarefeet', 'price', by='zip') + + # plt.show() + + import matplotlib.pyplot as plt + + import pandas.tools.plotting as plots + import pandas.core.frame as fr + reload(plots) + reload(fr) + from pandas.core.frame import DataFrame + + data = DataFrame([[3, 6, -5], [4, 8, 2], [4, 9, -6], + [4, 9, -3], [2, 5, -1]], + columns=['A', 'B', 'C']) + data.plot(kind='barh', stacked=True) + + plt.show() diff --git a/pandas/tools/rplot.py b/pandas/tools/rplot.py new file mode 100644 index 00000000..1c3d17ee --- /dev/null +++ b/pandas/tools/rplot.py @@ -0,0 +1,885 @@ +import random +from copy import deepcopy +from pandas.core.common import _values_from_object + +import numpy as np +from pandas.compat import range, zip +# +# TODO: +# * Make sure legends work properly +# + +class Scale: + """ + Base class for mapping between graphical and data attributes. + """ + pass + +class ScaleGradient(Scale): + """ + A mapping between a data attribute value and a + point in colour space between two specified colours. + """ + def __init__(self, column, colour1, colour2): + """Initialize ScaleGradient instance. + + Parameters: + ----------- + column: string, pandas DataFrame column name + colour1: tuple, 3 element tuple with float values representing an RGB colour + colour2: tuple, 3 element tuple with float values representing an RGB colour + """ + self.column = column + self.colour1 = colour1 + self.colour2 = colour2 + self.categorical = False + + def __call__(self, data, index): + """Return a colour corresponding to data attribute value. 
+ + Parameters: + ----------- + data: pandas DataFrame + index: pandas DataFrame row index + + Returns: + -------- + A three element tuple representing an RGB somewhere between colour1 and colour2 + """ + x = data[self.column].iget(index) + a = min(data[self.column]) + b = max(data[self.column]) + r1, g1, b1 = self.colour1 + r2, g2, b2 = self.colour2 + x_scaled = (x - a) / (b - a) + return (r1 + (r2 - r1) * x_scaled, + g1 + (g2 - g1) * x_scaled, + b1 + (b2 - b1) * x_scaled) + +class ScaleGradient2(Scale): + """ + Create a mapping between a data attribute value and a + point in colour space in a line of three specified colours. + """ + def __init__(self, column, colour1, colour2, colour3): + """Initialize ScaleGradient2 instance. + + Parameters: + ----------- + column: string, pandas DataFrame column name + colour1: tuple, 3 element tuple with float values representing an RGB colour + colour2: tuple, 3 element tuple with float values representing an RGB colour + colour3: tuple, 3 element tuple with float values representing an RGB colour + """ + self.column = column + self.colour1 = colour1 + self.colour2 = colour2 + self.colour3 = colour3 + self.categorical = False + + def __call__(self, data, index): + """Return a colour corresponding to data attribute value. + + Parameters: + ----------- + data: pandas DataFrame + index: pandas DataFrame row index + + Returns: + -------- + A three element tuple representing an RGB somewhere along the line + of colour1, colour2 and colour3 + """ + x = data[self.column].iget(index) + a = min(data[self.column]) + b = max(data[self.column]) + r1, g1, b1 = self.colour1 + r2, g2, b2 = self.colour2 + r3, g3, b3 = self.colour3 + x_scaled = (x - a) / (b - a) + if x_scaled < 0.5: + x_scaled *= 2.0 + return (r1 + (r2 - r1) * x_scaled, + g1 + (g2 - g1) * x_scaled, + b1 + (b2 - b1) * x_scaled) + else: + x_scaled = (x_scaled - 0.5) * 2.0 + return (r2 + (r3 - r2) * x_scaled, + g2 + (g3 - g2) * x_scaled, + b2 + (b3 - b2) * x_scaled) + +class ScaleSize(Scale): + """ + Provide a mapping between a DataFrame column and matplotlib + scatter plot shape size. + """ + def __init__(self, column, min_size=5.0, max_size=100.0, transform=lambda x: x): + """Initialize ScaleSize instance. + + Parameters: + ----------- + column: string, a column name + min_size: float, minimum point size + max_size: float, maximum point size + transform: a one argument function of form float -> float (e.g. lambda x: log(x)) + """ + self.column = column + self.min_size = min_size + self.max_size = max_size + self.transform = transform + self.categorical = False + + def __call__(self, data, index): + """Return matplotlib scatter plot marker shape size. + + Parameters: + ----------- + data: pandas DataFrame + index: pandas DataFrame row index + """ + x = data[self.column].iget(index) + a = float(min(data[self.column])) + b = float(max(data[self.column])) + return self.transform(self.min_size + ((x - a) / (b - a)) * + (self.max_size - self.min_size)) + +class ScaleShape(Scale): + """ + Provides a mapping between matplotlib marker shapes + and attribute values. + """ + def __init__(self, column): + """Initialize ScaleShape instance. + + Parameters: + ----------- + column: string, pandas DataFrame column name + """ + self.column = column + self.shapes = ['o', '+', 's', '*', '^', '<', '>', 'v', '|', 'x'] + self.legends = set([]) + self.categorical = True + + def __call__(self, data, index): + """Returns a matplotlib marker identifier. 
+ + Parameters: + ----------- + data: pandas DataFrame + index: pandas DataFrame row index + + Returns: + -------- + a matplotlib marker identifier + """ + values = sorted(list(set(data[self.column]))) + if len(values) > len(self.shapes): + raise ValueError("Too many different values of the categorical attribute for ScaleShape") + x = data[self.column].iget(index) + return self.shapes[values.index(x)] + +class ScaleRandomColour(Scale): + """ + Maps a random colour to a DataFrame attribute. + """ + def __init__(self, column): + """Initialize ScaleRandomColour instance. + + Parameters: + ----------- + column: string, pandas DataFrame column name + """ + self.column = column + self.categorical = True + + def __call__(self, data, index): + """Return a tuple of three floats, representing + an RGB colour. + + Parameters: + ----------- + data: pandas DataFrame + index: pandas DataFrame row index + """ + random.seed(data[self.column].iget(index)) + return [random.random() for _ in range(3)] + +class ScaleConstant(Scale): + """ + Constant returning scale. Usually used automatically. + """ + def __init__(self, value): + """Initialize ScaleConstant instance. + + Parameters: + ----------- + value: any Python value to be returned when called + """ + self.value = value + self.categorical = False + + def __call__(self, data, index): + """Return the constant value. + + Parameters: + ----------- + data: pandas DataFrame + index: pandas DataFrame row index + + Returns: + -------- + A constant value specified during initialisation + """ + return self.value + +def default_aes(x=None, y=None): + """Create the default aesthetics dictionary. + + Parameters: + ----------- + x: string, DataFrame column name + y: string, DataFrame column name + + Returns: + -------- + a dictionary with aesthetics bindings + """ + return { + 'x' : x, + 'y' : y, + 'size' : ScaleConstant(40.0), + 'colour' : ScaleConstant('grey'), + 'shape' : ScaleConstant('o'), + 'alpha' : ScaleConstant(1.0), + } + +def make_aes(x=None, y=None, size=None, colour=None, shape=None, alpha=None): + """Create an empty aesthetics dictionary. 
+ + Parameters: + ----------- + x: string, DataFrame column name + y: string, DataFrame column name + size: function, binding for size attribute of Geoms + colour: function, binding for colour attribute of Geoms + shape: function, binding for shape attribute of Geoms + alpha: function, binding for alpha attribute of Geoms + + Returns: + -------- + a dictionary with aesthetics bindings + """ + if not hasattr(size, '__call__') and size is not None: + size = ScaleConstant(size) + if not hasattr(colour, '__call__') and colour is not None: + colour = ScaleConstant(colour) + if not hasattr(shape, '__call__') and shape is not None: + shape = ScaleConstant(shape) + if not hasattr(alpha, '__call__') and alpha is not None: + alpha = ScaleConstant(alpha) + if any([isinstance(size, scale) for scale in [ScaleConstant, ScaleSize]]) or size is None: + pass + else: + raise ValueError('size mapping should be done through ScaleConstant or ScaleSize') + if any([isinstance(colour, scale) for scale in [ScaleConstant, ScaleGradient, ScaleGradient2, ScaleRandomColour]]) or colour is None: + pass + else: + raise ValueError('colour mapping should be done through ScaleConstant, ScaleRandomColour, ScaleGradient or ScaleGradient2') + if any([isinstance(shape, scale) for scale in [ScaleConstant, ScaleShape]]) or shape is None: + pass + else: + raise ValueError('shape mapping should be done through ScaleConstant or ScaleShape') + if any([isinstance(alpha, scale) for scale in [ScaleConstant]]) or alpha is None: + pass + else: + raise ValueError('alpha mapping should be done through ScaleConstant') + return { + 'x' : x, + 'y' : y, + 'size' : size, + 'colour' : colour, + 'shape' : shape, + 'alpha' : alpha, + } + +class Layer: + """ + Layer object representing a single plot layer. + """ + def __init__(self, data=None, **kwds): + """Initialize layer object. + + Parameters: + ----------- + data: pandas DataFrame instance + aes: aesthetics dictionary with bindings + """ + self.data = data + self.aes = make_aes(**kwds) + self.legend = {} + + def work(self, fig=None, ax=None): + """Do the drawing (usually) work. + + Parameters: + ----------- + fig: matplotlib figure + ax: matplotlib axis object + + Returns: + -------- + a tuple with the same figure and axis instances + """ + return fig, ax + +class GeomPoint(Layer): + def work(self, fig=None, ax=None): + """Render the layer on a matplotlib axis. + You can specify either a figure or an axis to draw on. 
+ + Parameters: + ----------- + fig: matplotlib figure object + ax: matplotlib axis object to draw on + + Returns: + -------- + fig, ax: matplotlib figure and axis objects + """ + if ax is None: + if fig is None: + return fig, ax + else: + ax = fig.gca() + for index in range(len(self.data)): + row = self.data.irow(index) + x = row[self.aes['x']] + y = row[self.aes['y']] + size_scaler = self.aes['size'] + colour_scaler = self.aes['colour'] + shape_scaler = self.aes['shape'] + alpha = self.aes['alpha'] + size_value = size_scaler(self.data, index) + colour_value = colour_scaler(self.data, index) + marker_value = shape_scaler(self.data, index) + alpha_value = alpha(self.data, index) + patch = ax.scatter(x, y, + s=size_value, + c=colour_value, + marker=marker_value, + alpha=alpha_value) + label = [] + if colour_scaler.categorical: + label += [colour_scaler.column, row[colour_scaler.column]] + if shape_scaler.categorical: + label += [shape_scaler.column, row[shape_scaler.column]] + self.legend[tuple(label)] = patch + ax.set_xlabel(self.aes['x']) + ax.set_ylabel(self.aes['y']) + return fig, ax + +class GeomPolyFit(Layer): + """ + Draw a polynomial fit of specified degree. + """ + def __init__(self, degree, lw=2.0, colour='grey'): + """Initialize GeomPolyFit object. + + Parameters: + ----------- + degree: an integer, polynomial degree + lw: line width + colour: matplotlib colour + """ + self.degree = degree + self.lw = lw + self.colour = colour + Layer.__init__(self) + + def work(self, fig=None, ax=None): + """Draw the polynomial fit on matplotlib figure or axis + + Parameters: + ----------- + fig: matplotlib figure + ax: matplotlib axis + + Returns: + -------- + a tuple with figure and axis objects + """ + if ax is None: + if fig is None: + return fig, ax + else: + ax = fig.gca() + from numpy.polynomial.polynomial import polyfit + from numpy.polynomial.polynomial import polyval + x = self.data[self.aes['x']] + y = self.data[self.aes['y']] + min_x = min(x) + max_x = max(x) + c = polyfit(x, y, self.degree) + x_ = np.linspace(min_x, max_x, len(x)) + y_ = polyval(x_, c) + ax.plot(x_, y_, lw=self.lw, c=self.colour) + return fig, ax + +class GeomScatter(Layer): + """ + An efficient scatter plot, use this instead of GeomPoint for speed. + """ + def __init__(self, marker='o', colour='lightblue', alpha=1.0): + """Initialize GeomScatter instance. + + Parameters: + ----------- + marker: matplotlib marker string + colour: matplotlib colour + alpha: matplotlib alpha + """ + self.marker = marker + self.colour = colour + self.alpha = alpha + Layer.__init__(self) + + def work(self, fig=None, ax=None): + """Draw a scatter plot on matplotlib figure or axis + + Parameters: + ----------- + fig: matplotlib figure + ax: matplotlib axis + + Returns: + -------- + a tuple with figure and axis objects + """ + if ax is None: + if fig is None: + return fig, ax + else: + ax = fig.gca() + x = self.data[self.aes['x']] + y = self.data[self.aes['y']] + ax.scatter(x, y, marker=self.marker, c=self.colour, alpha=self.alpha) + return fig, ax + +class GeomHistogram(Layer): + """ + An efficient histogram, use this instead of GeomBar for speed. + """ + def __init__(self, bins=10, colour='lightblue'): + """Initialize GeomHistogram instance. 
+ + Parameters: + ----------- + bins: integer, number of histogram bins + colour: matplotlib colour + """ + self.bins = bins + self.colour = colour + Layer.__init__(self) + + def work(self, fig=None, ax=None): + """Draw a histogram on matplotlib figure or axis + + Parameters: + ----------- + fig: matplotlib figure + ax: matplotlib axis + + Returns: + -------- + a tuple with figure and axis objects + """ + if ax is None: + if fig is None: + return fig, ax + else: + ax = fig.gca() + x = self.data[self.aes['x']] + ax.hist(_values_from_object(x), self.bins, facecolor=self.colour) + ax.set_xlabel(self.aes['x']) + return fig, ax + +class GeomDensity(Layer): + """ + A kernel density estimation plot. + """ + def work(self, fig=None, ax=None): + """Draw a one dimensional kernel density plot. + You can specify either a figure or an axis to draw on. + + Parameters: + ----------- + fig: matplotlib figure object + ax: matplotlib axis object to draw on + + Returns: + -------- + fig, ax: matplotlib figure and axis objects + """ + if ax is None: + if fig is None: + return fig, ax + else: + ax = fig.gca() + from scipy.stats import gaussian_kde + x = self.data[self.aes['x']] + gkde = gaussian_kde(x) + ind = np.linspace(x.min(), x.max(), 200) + ax.plot(ind, gkde.evaluate(ind)) + return fig, ax + +class GeomDensity2D(Layer): + def work(self, fig=None, ax=None): + """Draw a two dimensional kernel density plot. + You can specify either a figure or an axis to draw on. + + Parameters: + ----------- + fig: matplotlib figure object + ax: matplotlib axis object to draw on + + Returns: + -------- + fig, ax: matplotlib figure and axis objects + """ + if ax is None: + if fig is None: + return fig, ax + else: + ax = fig.gca() + x = self.data[self.aes['x']] + y = self.data[self.aes['y']] + rvs = np.array([x, y]) + x_min = x.min() + x_max = x.max() + y_min = y.min() + y_max = y.max() + X, Y = np.mgrid[x_min:x_max:200j, y_min:y_max:200j] + positions = np.vstack([X.ravel(), Y.ravel()]) + values = np.vstack([x, y]) + import scipy.stats as stats + kernel = stats.gaussian_kde(values) + Z = np.reshape(kernel(positions).T, X.shape) + ax.contour(Z, extent=[x_min, x_max, y_min, y_max]) + return fig, ax + +class TrellisGrid(Layer): + def __init__(self, by): + """Initialize TreelisGrid instance. + + Parameters: + ----------- + by: column names to group by + """ + if len(by) != 2: + raise ValueError("You must give a list of length 2 to group by") + elif by[0] == '.' and by[1] == '.': + raise ValueError("At least one of grouping attributes must be not a dot") + self.by = by + + def trellis(self, layers): + """Create a trellis structure for a list of layers. + Each layer will be cloned with different data in to a two dimensional grid. + + Parameters: + ----------- + layers: a list of Layer objects + + Returns: + -------- + trellised_layers: Clones of each layer in the list arranged in a trellised latice + """ + trellised_layers = [] + for layer in layers: + data = layer.data + if self.by[0] == '.': + grouped = data.groupby(self.by[1]) + elif self.by[1] == '.': + grouped = data.groupby(self.by[0]) + else: + grouped = data.groupby(self.by) + groups = list(grouped.groups.keys()) + if self.by[0] == '.' 
or self.by[1] == '.': + shingle1 = set([g for g in groups]) + else: + shingle1 = set([g[0] for g in groups]) + shingle2 = set([g[1] for g in groups]) + if self.by[0] == '.': + self.rows = 1 + self.cols = len(shingle1) + elif self.by[1] == '.': + self.rows = len(shingle1) + self.cols = 1 + else: + self.rows = len(shingle1) + self.cols = len(shingle2) + trellised = [[None for _ in range(self.cols)] for _ in range(self.rows)] + self.group_grid = [[None for _ in range(self.cols)] for _ in range(self.rows)] + row = 0 + col = 0 + for group, data in grouped: + new_layer = deepcopy(layer) + new_layer.data = data + trellised[row][col] = new_layer + self.group_grid[row][col] = group + col += 1 + if col >= self.cols: + col = 0 + row += 1 + trellised_layers.append(trellised) + return trellised_layers + +def dictionary_union(dict1, dict2): + """Take two dictionaries, return dictionary union. + + Parameters: + ----------- + dict1: Python dictionary + dict2: Python dictionary + + Returns: + -------- + A union of the dictionaries. It assumes that values + with the same keys are identical. + """ + keys1 = list(dict1.keys()) + keys2 = list(dict2.keys()) + result = {} + for key1 in keys1: + result[key1] = dict1[key1] + for key2 in keys2: + result[key2] = dict2[key2] + return result + +def merge_aes(layer1, layer2): + """Merges the aesthetics dictionaries for the two layers. + Look up sequence_layers function. Which layer is first and which + one is second is important. + + Parameters: + ----------- + layer1: Layer object + layer2: Layer object + """ + for key in layer2.aes.keys(): + if layer2.aes[key] is None: + layer2.aes[key] = layer1.aes[key] + +def sequence_layers(layers): + """Go through the list of layers and fill in the missing bits of information. + The basic rules are this: + * If the current layer has data set to None, take the data from previous layer. + * For each aesthetic mapping, if that mapping is set to None, take it from previous layer. + + Parameters: + ----------- + layers: a list of Layer objects + """ + for layer1, layer2 in zip(layers[:-1], layers[1:]): + if layer2.data is None: + layer2.data = layer1.data + merge_aes(layer1, layer2) + return layers + +def sequence_grids(layer_grids): + """Go through the list of layer girds and perform the same thing as sequence_layers. + + Parameters: + ----------- + layer_grids: a list of two dimensional layer grids + """ + for grid1, grid2 in zip(layer_grids[:-1], layer_grids[1:]): + for row1, row2 in zip(grid1, grid2): + for layer1, layer2 in zip(row1, row2): + if layer2.data is None: + layer2.data = layer1.data + merge_aes(layer1, layer2) + return layer_grids + +def work_grid(grid, fig): + """Take a two dimensional grid, add subplots to a figure for each cell and do layer work. + + Parameters: + ----------- + grid: a two dimensional grid of layers + fig: matplotlib figure to draw on + + Returns: + -------- + axes: a two dimensional list of matplotlib axes + """ + nrows = len(grid) + ncols = len(grid[0]) + axes = [[None for _ in range(ncols)] for _ in range(nrows)] + for row in range(nrows): + for col in range(ncols): + axes[row][col] = fig.add_subplot(nrows, ncols, ncols * row + col + 1) + grid[row][col].work(ax=axes[row][col]) + return axes + +def adjust_subplots(fig, axes, trellis, layers): + """Adjust the subtplots on matplotlib figure with the + fact that we have a trellis plot in mind. 
+ + Parameters: + ----------- + fig: matplotlib figure + axes: a two dimensional grid of matplotlib axes + trellis: TrellisGrid object + layers: last grid of layers in the plot + """ + # Flatten the axes grid + axes = [ax for row in axes for ax in row] + min_x = min([ax.get_xlim()[0] for ax in axes]) + max_x = max([ax.get_xlim()[1] for ax in axes]) + min_y = min([ax.get_ylim()[0] for ax in axes]) + max_y = max([ax.get_ylim()[1] for ax in axes]) + [ax.set_xlim(min_x, max_x) for ax in axes] + [ax.set_ylim(min_y, max_y) for ax in axes] + for index, axis in enumerate(axes): + if index % trellis.cols == 0: + pass + else: + axis.get_yaxis().set_ticks([]) + axis.set_ylabel('') + if index / trellis.cols == trellis.rows - 1: + pass + else: + axis.get_xaxis().set_ticks([]) + axis.set_xlabel('') + if trellis.by[0] == '.': + label1 = "%s = %s" % (trellis.by[1], trellis.group_grid[index // trellis.cols][index % trellis.cols]) + label2 = None + elif trellis.by[1] == '.': + label1 = "%s = %s" % (trellis.by[0], trellis.group_grid[index // trellis.cols][index % trellis.cols]) + label2 = None + else: + label1 = "%s = %s" % (trellis.by[0], trellis.group_grid[index // trellis.cols][index % trellis.cols][0]) + label2 = "%s = %s" % (trellis.by[1], trellis.group_grid[index // trellis.cols][index % trellis.cols][1]) + if label2 is not None: + axis.table(cellText=[[label1], [label2]], + loc='top', cellLoc='center', + cellColours=[['lightgrey'], ['lightgrey']]) + else: + axis.table(cellText=[[label1]], loc='top', cellLoc='center', cellColours=[['lightgrey']]) + # Flatten the layer grid + layers = [layer for row in layers for layer in row] + legend = {} + for layer in layers: + legend = dictionary_union(legend, layer.legend) + patches = [] + labels = [] + if len(list(legend.keys())) == 0: + key_function = lambda tup: tup + elif len(list(legend.keys())[0]) == 2: + key_function = lambda tup: (tup[1]) + else: + key_function = lambda tup: (tup[1], tup[3]) + for key in sorted(list(legend.keys()), key=key_function): + value = legend[key] + patches.append(value) + if len(key) == 2: + col, val = key + labels.append("%s" % str(val)) + elif len(key) == 4: + col1, val1, col2, val2 = key + labels.append("%s, %s" % (str(val1), str(val2))) + else: + raise ValueError("Maximum 2 categorical attributes to display a lengend of") + if len(legend): + fig.legend(patches, labels, loc='upper right') + fig.subplots_adjust(wspace=0.05, hspace=0.2) + +class RPlot: + """ + The main plot object. Add layers to an instance of this object to create a plot. + """ + def __init__(self, data, x=None, y=None): + """Initialize RPlot instance. + + Parameters: + ----------- + data: pandas DataFrame instance + x: string, DataFrame column name + y: string, DataFrame column name + """ + self.layers = [Layer(data, **default_aes(x=x, y=y))] + trellised = False + + def add(self, layer): + """Add a layer to RPlot instance. + + Parameters: + ----------- + layer: Layer instance + """ + if not isinstance(layer, Layer): + raise TypeError("The operand on the right side of + must be a Layer instance") + self.layers.append(layer) + + def render(self, fig=None): + """Render all the layers on a matplotlib figure. 
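+
+ Illustrative usage sketch (df being any DataFrame containing the named
+ columns):
+
+ import matplotlib.pyplot as plt
+ fig = plt.figure()
+ plot = RPlot(df, x='height', y='weight')
+ plot.add(GeomPoint(colour=ScaleRandomColour('group')))
+ plot.render(fig)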
+ + Parameters: + ----------- + fig: matplotlib figure + """ + import matplotlib.pyplot as plt + if fig is None: + fig = plt.gcf() + # Look for the last TrellisGrid instance in the layer list + last_trellis = None + for layer in self.layers: + if isinstance(layer, TrellisGrid): + last_trellis = layer + if last_trellis is None: + # We have a simple, non-trellised plot + new_layers = sequence_layers(self.layers) + for layer in new_layers: + layer.work(fig=fig) + legend = {} + for layer in new_layers: + legend = dictionary_union(legend, layer.legend) + patches = [] + labels = [] + if len(list(legend.keys())) == 0: + key_function = lambda tup: tup + elif len(list(legend.keys())[0]) == 2: + key_function = lambda tup: (tup[1]) + else: + key_function = lambda tup: (tup[1], tup[3]) + for key in sorted(list(legend.keys()), key=key_function): + value = legend[key] + patches.append(value) + if len(key) == 2: + col, val = key + labels.append("%s" % str(val)) + elif len(key) == 4: + col1, val1, col2, val2 = key + labels.append("%s, %s" % (str(val1), str(val2))) + else: + raise ValueError("Maximum 2 categorical attributes to display a lengend of") + if len(legend): + fig.legend(patches, labels, loc='upper right') + else: + # We have a trellised plot. + # First let's remove all other TrellisGrid instances from the layer list, + # including this one. + new_layers = [] + for layer in self.layers: + if not isinstance(layer, TrellisGrid): + new_layers.append(layer) + new_layers = sequence_layers(new_layers) + # Now replace the old layers by their trellised versions + new_layers = last_trellis.trellis(new_layers) + # Prepare the subplots and draw on them + new_layers = sequence_grids(new_layers) + axes_grids = [work_grid(grid, fig) for grid in new_layers] + axes_grid = axes_grids[-1] + adjust_subplots(fig, axes_grid, last_trellis, new_layers[-1]) + # And we're done + return fig diff --git a/pandas/tools/tests/__init__.py b/pandas/tools/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tools/tests/cut_data.csv b/pandas/tools/tests/cut_data.csv new file mode 100644 index 00000000..7d9d4805 --- /dev/null +++ b/pandas/tools/tests/cut_data.csv @@ -0,0 +1 @@ +1.001 0.994 0.9951 0.9956 0.9956 0.9951 0.9949 1.001 0.994 0.9938 0.9908 0.9947 0.992 0.9912 1.0002 0.9914 0.9928 0.9892 0.9917 0.9955 0.9892 0.9912 0.993 0.9937 0.9951 0.9955 0.993 0.9961 0.9914 0.9906 0.9974 0.9934 0.992 0.9939 0.9962 0.9905 0.9934 0.9906 0.9999 0.9999 0.9937 0.9937 0.9954 0.9934 0.9934 0.9931 0.994 0.9939 0.9954 0.995 0.9917 0.9914 0.991 0.9911 0.993 0.9908 0.9962 0.9972 0.9931 0.9926 0.9951 0.9972 0.991 0.9931 0.9927 0.9934 0.9903 0.992 0.9926 0.9962 0.9956 0.9958 0.9964 0.9941 0.9926 0.9962 0.9898 0.9912 0.9961 0.9949 0.9929 0.9985 0.9946 0.9966 0.9974 0.9975 0.9974 0.9972 0.9974 0.9975 0.9974 0.9957 0.99 0.9899 0.9916 0.9969 0.9979 0.9913 0.9956 0.9979 0.9975 0.9962 0.997 1 0.9975 0.9974 0.9962 0.999 0.999 0.9927 0.9959 1 0.9982 0.9968 0.9968 0.994 0.9914 0.9911 0.9982 0.9982 0.9934 0.9984 0.9952 0.9952 0.9928 0.9912 0.994 0.9958 0.9924 0.9924 0.994 0.9958 0.9979 0.9982 0.9961 0.9979 0.992 0.9975 0.9917 0.9923 0.9927 0.9975 0.992 0.9947 0.9921 0.9905 0.9918 0.9951 0.9917 0.994 0.9934 0.9968 0.994 0.9919 0.9966 0.9979 0.9979 0.9898 0.9894 0.9894 0.9898 0.998 0.9932 0.9979 0.997 0.9972 0.9974 0.9896 0.9968 0.9958 0.9906 0.9917 0.9902 0.9918 0.999 0.9927 0.991 0.9972 0.9931 0.995 0.9951 0.9936 1.001 0.9979 0.997 0.9972 0.9954 0.9924 0.9906 0.9962 0.9962 1.001 0.9928 0.9942 0.9942 0.9942 0.9942 0.9961 0.998 
0.9961 0.9984 0.998 0.9973 0.9949 0.9924 0.9972 0.9958 0.9968 0.9938 0.993 0.994 0.9918 0.9958 0.9944 0.9912 0.9961 0.9939 0.9961 0.9989 0.9938 0.9939 0.9971 0.9912 0.9936 0.9929 0.9998 0.9938 0.9969 0.9938 0.9998 0.9972 0.9976 0.9976 0.9979 0.9979 0.9979 0.9979 0.9972 0.9918 0.9982 0.9985 0.9944 0.9903 0.9934 0.9975 0.9923 0.99 0.9905 0.9905 0.996 0.9964 0.998 0.9975 0.9913 0.9932 0.9935 0.9927 0.9927 0.9912 0.9904 0.9939 0.9996 0.9944 0.9977 0.9912 0.9996 0.9965 0.9944 0.9945 0.9944 0.9965 0.9944 0.9972 0.9949 0.9966 0.9954 0.9954 0.9915 0.9919 0.9916 0.99 0.9909 0.9938 0.9982 0.9988 0.9961 0.9978 0.9979 0.9979 0.9979 0.9979 0.9945 1 0.9957 0.9968 0.9934 0.9976 0.9932 0.997 0.9923 0.9914 0.992 0.9914 0.9914 0.9949 0.9949 0.995 0.995 0.9927 0.9928 0.9917 0.9918 0.9954 0.9941 0.9941 0.9934 0.9927 0.9938 0.9933 0.9934 0.9927 0.9938 0.9927 0.9946 0.993 0.9946 0.9976 0.9944 0.9978 0.992 0.9912 0.9927 0.9906 0.9954 0.9923 0.9906 0.991 0.9972 0.9945 0.9934 0.9964 0.9948 0.9962 0.9931 0.993 0.9942 0.9906 0.9995 0.998 0.997 0.9914 0.992 0.9924 0.992 0.9937 0.9978 0.9978 0.9927 0.994 0.9935 0.9968 0.9941 0.9942 0.9978 0.9923 0.9912 0.9923 0.9927 0.9931 0.9941 0.9927 0.9931 0.9934 0.9936 0.9893 0.9893 0.9919 0.9924 0.9927 0.9919 0.9924 0.9975 0.9969 0.9936 0.991 0.9893 0.9906 0.9941 0.995 0.9983 0.9983 0.9916 0.9957 0.99 0.9976 0.992 0.9917 0.9917 0.9993 0.9908 0.9917 0.9976 0.9934 1 0.9918 0.992 0.9896 0.9932 0.992 0.9917 0.9999 0.998 0.9918 0.9918 0.9999 0.998 0.9927 0.9959 0.9927 0.9929 0.9898 0.9954 0.9954 0.9954 0.9954 0.9954 0.9954 0.9974 0.9936 0.9978 0.9974 0.9927 0.9934 0.9938 0.9922 0.992 0.9935 0.9906 0.9934 0.9934 0.9913 0.9938 0.9898 0.9975 0.9975 0.9937 0.9914 0.9982 0.9982 0.9929 0.9971 0.9921 0.9931 0.9924 0.9929 0.9982 0.9892 0.9956 0.9924 0.9971 0.9956 0.9982 0.9973 0.9932 0.9976 0.9962 0.9956 0.9932 0.9976 0.9992 0.9983 0.9937 0.99 0.9944 0.9938 0.9965 0.9893 0.9927 0.994 0.9928 0.9964 0.9917 0.9972 0.9964 0.9954 0.993 0.9928 0.9916 0.9936 0.9962 0.9899 0.9898 0.996 0.9907 0.994 0.9913 0.9976 0.9904 0.992 0.9976 0.999 0.9975 0.9937 0.9937 0.998 0.998 0.9944 0.9938 0.9907 0.9938 0.9921 0.9908 0.9931 0.9915 0.9952 0.9926 0.9934 0.992 0.9918 0.9942 0.9942 0.9942 0.9901 0.9898 0.9902 0.9934 0.9906 0.9898 0.9896 0.9922 0.9947 0.9945 0.9976 0.9976 0.9976 0.9987 0.9987 0.9976 0.992 0.9955 0.9953 0.9976 0.992 0.9952 0.9983 0.9933 0.9958 0.9922 0.9928 0.9976 0.9976 0.9916 0.9901 0.9976 0.9901 0.9916 0.9982 0.993 0.9969 0.991 0.9953 0.9924 0.9969 0.9928 0.9945 0.9967 0.9944 0.9928 0.9929 0.9948 0.9976 0.9912 0.9987 0.99 0.991 0.9933 0.9933 0.9899 0.9912 0.9912 0.9976 0.994 0.9947 0.9954 0.993 0.9954 0.9963 0.992 0.9926 0.995 0.9983 0.992 0.9968 0.9905 0.9904 0.9926 0.9968 0.9928 0.9949 0.9909 0.9937 0.9914 0.9905 0.9904 0.9924 0.9924 0.9965 0.9965 0.9993 0.9965 0.9908 0.992 0.9978 0.9978 0.9978 0.9978 0.9912 0.9928 0.9928 0.993 0.9993 0.9965 0.9937 0.9913 0.9934 0.9952 0.9983 0.9957 0.9957 0.9916 0.9999 0.9999 0.9936 0.9972 0.9933 0.9934 0.9931 0.9976 0.9937 0.9937 0.991 0.9979 0.9971 0.9969 0.9968 0.9961 0.993 0.9973 0.9944 0.9986 0.9986 0.9986 0.9986 0.9972 0.9917 0.992 0.9932 0.9936 0.9915 0.9922 0.9934 0.9952 0.9972 0.9934 0.9958 0.9944 0.9908 0.9958 0.9925 0.9966 0.9972 0.9912 0.995 0.9928 0.9968 0.9955 0.9981 0.991 0.991 0.991 0.992 0.9931 0.997 0.9948 0.9923 0.9976 0.9938 0.9984 0.9972 0.9922 0.9935 0.9944 0.9942 0.9944 0.9997 0.9977 0.9912 0.9982 0.9982 0.9983 0.998 0.9894 0.9927 0.9917 0.9904 0.993 0.9941 0.9943 0.99855 0.99345 0.998 0.9916 0.9916 0.99475 0.99325 0.9933 0.9969 
1.0002 0.9933 0.9937 0.99685 0.99455 0.9917 0.99035 0.9914 0.99225 0.99155 0.9954 0.99455 0.9924 0.99695 0.99655 0.9934 0.998 0.9971 0.9948 0.998 0.9971 0.99215 0.9948 0.9915 0.99115 0.9932 0.9977 0.99535 0.99165 0.9953 0.9928 0.9958 0.9928 0.9928 0.9964 0.9987 0.9953 0.9932 0.9907 0.99755 0.99935 0.9932 0.9932 0.9958 0.99585 1.00055 0.9985 0.99505 0.992 0.9988 0.99175 0.9962 0.9962 0.9942 0.9927 0.9927 0.99985 0.997 0.9918 0.99215 0.99865 0.9992 1.0006 0.99135 0.99715 0.9992 1.0006 0.99865 0.99815 0.99815 0.99815 0.9949 0.99815 0.99815 0.99225 0.99445 0.99225 0.99335 0.99625 0.9971 0.9983 0.99445 0.99085 0.9977 0.9953 0.99775 0.99795 0.99505 0.9977 0.9975 0.99745 0.9976 0.99775 0.9953 0.9932 0.99405 1 0.99785 0.9939 0.9939 0.99675 0.9939 0.99675 0.98965 0.9971 0.99445 0.9945 0.9939 0.9958 0.9956 0.99055 0.9959 0.9925 0.9963 0.9935 0.99105 0.99045 0.9963 0.99155 0.99085 0.99085 0.99085 0.9924 0.9924 0.99975 0.99975 0.99315 0.9917 0.9917 0.99845 0.9921 0.99975 0.9909 0.99315 0.99855 0.9934 0.9978 0.9934 0.9949 0.99855 0.9986 0.99725 0.9946 0.99255 0.9996 0.9939 0.99 0.9937 0.9886 0.9934 1 0.9994 0.9926 0.9956 0.9978 0.9915 0.9939 0.9932 0.993 0.9898 0.9921 0.9932 0.9919 0.993 0.9953 0.9928 0.9928 0.9976 0.9906 0.9918 0.99185 0.9918 0.99185 0.994 0.9908 0.9928 0.9896 0.9908 0.9918 0.9952 0.9923 0.9915 0.9952 0.9947 0.9983 0.9975 0.995 0.9944 0.994 0.9944 0.9908 0.99795 0.9985 0.99425 0.99425 0.9943 0.9924 0.9946 0.9924 0.995 0.9919 0.99 0.9923 0.9956 0.9978 0.9978 0.9967 0.9934 0.9936 0.9932 0.9934 0.998 0.9978 0.9929 0.9974 0.99685 0.99495 0.99745 0.99505 0.992 0.9978 0.9956 0.9982 0.99485 0.9971 0.99265 0.9904 0.9965 0.9946 0.99965 0.9935 0.996 0.9942 0.9936 0.9965 0.9928 0.9928 0.9965 0.9936 0.9938 0.9926 0.9926 0.9983 0.9983 0.992 0.9983 0.9923 0.9972 0.9928 0.9928 0.9994 0.991 0.9906 0.9894 0.9898 0.9994 0.991 0.9925 0.9956 0.9946 0.9966 0.9951 0.9927 0.9927 0.9951 0.9894 0.9907 0.9925 0.9928 0.9941 0.9941 0.9925 0.9935 0.9932 0.9944 0.9972 0.994 0.9956 0.9927 0.9924 0.9966 0.9997 0.9936 0.9936 0.9952 0.9952 0.9928 0.9911 0.993 0.9911 0.9932 0.993 0.993 0.9932 0.9932 0.9943 0.9968 0.9994 0.9926 0.9968 0.9932 0.9916 0.9946 0.9925 0.9925 0.9935 0.9962 0.9928 0.993 0.993 0.9956 0.9941 0.9972 0.9948 0.9955 0.9972 0.9972 0.9983 0.9942 0.9936 0.9956 0.9953 0.9918 0.995 0.992 0.9952 1.001 0.9924 0.9932 0.9937 0.9918 0.9934 0.991 0.9962 0.9932 0.9908 0.9962 0.9918 0.9941 0.9931 0.9981 0.9931 0.9944 0.992 0.9966 0.9956 0.9956 0.9949 1.0002 0.9942 0.9923 0.9917 0.9931 0.992 1.0002 0.9953 0.9951 0.9974 0.9904 0.9974 0.9944 1.0004 0.9952 0.9956 0.995 0.995 0.9995 0.9942 0.9977 0.992 0.992 0.9995 0.9934 1.0006 0.9982 0.9928 0.9945 0.9963 0.9906 0.9956 0.9942 0.9962 0.9894 0.995 0.9908 0.9914 0.9938 0.9977 0.9922 0.992 0.9903 0.9893 0.9952 0.9903 0.9912 0.9983 0.9937 0.9932 0.9928 0.9922 0.9976 0.9922 0.9974 0.998 0.9931 0.9911 0.9944 0.9937 0.9974 0.989 0.992 0.9928 0.9918 0.9936 0.9944 0.9988 0.994 0.9953 0.9986 0.9914 0.9934 0.996 0.9937 0.9921 0.998 0.996 0.9933 0.9933 0.9959 0.9936 0.9953 0.9938 0.9952 0.9959 0.9959 0.9937 0.992 0.9967 0.9944 0.9998 0.9998 0.9942 0.9998 0.9945 0.9998 0.9946 0.9942 0.9928 0.9946 0.9927 0.9938 0.9918 0.9945 0.9966 0.9954 0.9913 0.9931 0.9986 0.9965 0.9984 0.9952 0.9956 0.9949 0.9954 0.996 0.9931 0.992 0.9912 0.9978 0.9938 0.9914 0.9932 0.9944 0.9913 0.9948 0.998 0.9998 0.9964 0.9992 0.9948 0.9998 0.998 0.9939 0.992 0.9922 0.9955 0.9917 0.9917 0.9954 0.9986 0.9955 0.9917 0.9907 0.9922 0.9958 0.993 0.9917 0.9926 0.9959 0.9906 0.9993 0.993 0.9906 0.992 0.992 0.994 
0.9959 0.9908 0.9902 0.9908 0.9943 0.9921 0.9911 0.9986 0.992 0.992 0.9943 0.9937 0.993 0.9902 0.9928 0.9896 0.998 0.9954 0.9938 0.9918 0.9896 0.9944 0.9999 0.9953 0.992 0.9925 0.9981 0.9952 0.9927 0.9927 0.9911 0.9936 0.9959 0.9946 0.9948 0.9955 0.9951 0.9952 0.9946 0.9946 0.9944 0.9938 0.9963 0.991 1.0003 0.9966 0.9993 1.0003 0.9938 0.9965 0.9938 0.9993 0.9938 1.0003 0.9966 0.9942 0.9928 0.991 0.9911 0.9977 0.9927 0.9911 0.991 0.9912 0.9907 0.9902 0.992 0.994 0.9966 0.993 0.993 0.993 0.9966 0.9942 0.9925 0.9925 0.9928 0.995 0.9939 0.9958 0.9952 1 0.9948 0.99 0.9958 0.9948 0.9949 0.997 0.9927 0.9938 0.9949 0.9953 0.997 0.9932 0.9927 0.9932 0.9955 0.9914 0.991 0.992 0.9924 0.9927 0.9911 0.9958 0.9928 0.9902 0.994 0.994 0.9972 1.0004 0.991 0.9918 0.995 0.9941 0.9956 0.9956 0.9959 0.9922 0.9931 0.9959 0.9984 0.9908 0.991 0.9928 0.9936 0.9941 0.9924 0.9917 0.9906 0.995 0.9956 0.9955 0.9907 1 0.9953 0.9911 0.9922 0.9951 0.9948 0.9906 0.994 0.9907 0.9927 0.9914 0.9958 1 0.9984 0.9941 0.9944 0.998 0.998 0.9902 0.9911 0.9929 0.993 0.9918 0.992 0.9932 0.992 0.994 0.9923 0.993 0.9956 0.9907 0.99 0.9918 0.9926 0.995 0.99 0.99 0.9946 0.9907 0.9898 0.9918 0.9986 0.9986 0.9928 0.9986 0.9979 0.994 0.9937 0.9938 0.9942 0.9944 0.993 0.9986 0.9932 0.9934 0.9928 0.9925 0.9944 0.9909 0.9932 0.9934 1.0001 0.992 0.9916 0.998 0.9919 0.9925 0.9977 0.9944 0.991 0.99 0.9917 0.9923 0.9928 0.9923 0.9928 0.9902 0.9893 0.9917 0.9982 1.0005 0.9923 0.9951 0.9956 0.998 0.9928 0.9938 0.9914 0.9955 0.9924 0.9911 0.9917 0.9917 0.9932 0.9955 0.9929 0.9955 0.9958 1.0012 0.9968 0.9911 0.9924 0.991 0.9946 0.9928 0.9946 0.9917 0.9918 0.9926 0.9931 0.9932 0.9903 0.9928 0.9929 0.9958 0.9955 0.9911 0.9938 0.9942 0.9945 0.9962 0.992 0.9927 0.9948 0.9945 0.9942 0.9952 0.9942 0.9958 0.9918 0.9932 1.0004 0.9972 0.9998 0.9918 0.9918 0.9964 0.9936 0.9931 0.9938 0.9934 0.99 0.9914 0.9904 0.994 0.9938 0.9933 0.9909 0.9942 0.9945 0.9954 0.996 0.9991 0.993 0.9942 0.9934 0.9939 0.9937 0.994 0.9926 0.9951 0.9952 0.9935 0.9938 0.9939 0.9933 0.9927 0.998 0.9997 0.9981 0.992 0.9954 0.992 0.9997 0.9981 0.9943 0.9941 0.9936 0.9996 0.9932 0.9926 0.9936 0.992 0.9936 0.9996 0.993 0.9924 0.9928 0.9926 0.9952 0.9945 0.9945 0.9903 0.9932 0.9953 0.9936 0.9912 0.9962 0.9965 0.9932 0.9967 0.9953 0.9963 0.992 0.991 0.9958 0.99 0.991 0.9958 0.9938 0.9996 0.9946 0.9974 0.9945 0.9946 0.9974 0.9957 0.9931 0.9947 0.9953 0.9931 0.9946 0.9978 0.9989 1.0004 0.9938 0.9934 0.9978 0.9956 0.9982 0.9948 0.9956 0.9982 0.9926 0.991 0.9945 0.9916 0.9953 0.9938 0.9956 0.9906 0.9956 0.9932 0.9914 0.9938 0.996 0.9906 0.98815 0.9942 0.9903 0.9906 0.9935 1.0024 0.9968 0.9906 0.9941 0.9919 0.9928 0.9958 0.9932 0.9957 0.9937 0.9982 0.9928 0.9919 0.9956 0.9957 0.9954 0.993 0.9954 0.9987 0.9956 0.9928 0.9951 0.993 0.9928 0.9926 0.9938 1.0001 0.9933 0.9952 0.9934 0.9988 0.993 0.9952 0.9948 0.9998 0.9971 0.9998 0.9962 0.9948 0.99 0.9942 0.9965 0.9912 0.9978 0.9928 1.0103 0.9956 0.9936 0.9929 0.9966 0.9964 0.996 0.9959 0.9954 0.9914 1.0103 1.0004 0.9911 0.9938 0.9927 0.9922 0.9924 0.9963 0.9936 0.9951 0.9951 0.9955 0.9961 0.9936 0.992 0.9944 0.9944 1.0008 0.9962 0.9986 0.9986 1 0.9986 0.9982 1 0.9949 0.9915 0.9951 0.9986 0.9927 0.9955 0.9952 0.9928 0.9982 0.9914 0.9927 0.9918 0.9944 0.9969 0.9955 0.9954 0.9955 0.9921 0.9934 0.9998 0.9946 0.9984 0.9924 0.9939 0.995 0.9957 0.9953 0.9912 0.9939 0.9921 0.9954 0.9933 0.9941 0.995 0.9977 0.9912 0.9945 0.9952 0.9924 0.9986 0.9953 0.9939 0.9929 0.9988 0.9906 0.9914 0.9978 0.9928 0.9948 0.9978 0.9946 0.9908 0.9954 0.9906 0.99705 0.9982 0.9932 
0.9977 0.994 0.9982 0.9929 0.9924 0.9966 0.9921 0.9967 0.9934 0.9914 0.99705 0.9961 0.9967 0.9926 0.99605 0.99435 0.9948 0.9916 0.997 0.9961 0.9967 0.9961 0.9955 0.9922 0.9918 0.9955 0.9941 0.9955 0.9955 0.9924 0.9973 0.999 0.9941 0.9922 0.9922 0.9953 0.9945 0.9945 0.9957 0.9932 0.9945 0.9913 0.9909 0.9939 0.991 0.9954 0.9943 0.993 1.0002 0.9946 0.9953 0.9918 0.9936 0.9984 0.9956 0.9966 0.9942 0.9984 0.9956 0.9966 0.9974 0.9944 1.0008 0.9974 1.0008 0.9928 0.9944 0.9908 0.9917 0.9911 0.9912 0.9953 0.9932 0.9896 0.9889 0.9912 0.9926 0.9911 0.9964 0.9974 0.9944 0.9974 0.9964 0.9963 0.9948 0.9948 0.9953 0.9948 0.9953 0.9949 0.9988 0.9954 0.992 0.9984 0.9954 0.9926 0.992 0.9976 0.9972 0.991 0.998 0.9966 0.998 1.0007 0.992 0.9925 0.991 0.9934 0.9955 0.9944 0.9981 0.9968 0.9946 0.9946 0.9981 0.9946 0.997 0.9924 0.9958 0.994 0.9958 0.9984 0.9948 0.9932 0.9952 0.9924 0.9945 0.9976 0.9976 0.9938 0.9997 0.994 0.9921 0.9986 0.9987 0.9991 0.9987 0.9991 0.9991 0.9948 0.9987 0.993 0.9988 1 0.9932 0.9991 0.9989 1 1 0.9952 0.9969 0.9966 0.9966 0.9976 0.99 0.9988 0.9942 0.9984 0.9932 0.9969 0.9966 0.9933 0.9916 0.9914 0.9966 0.9958 0.9926 0.9939 0.9953 0.9906 0.9914 0.9958 0.9926 0.9991 0.9994 0.9976 0.9966 0.9953 0.9923 0.993 0.9931 0.9932 0.9926 0.9938 0.9966 0.9974 0.9924 0.9948 0.9964 0.9924 0.9966 0.9974 0.9938 0.9928 0.9959 1.0001 0.9959 1.0001 0.9968 0.9932 0.9954 0.9992 0.9932 0.9939 0.9952 0.9996 0.9966 0.9925 0.996 0.9996 0.9973 0.9937 0.9966 1.0017 0.993 0.993 0.9959 0.9958 1.0017 0.9958 0.9979 0.9941 0.997 0.9934 0.9927 0.9944 0.9927 0.9963 1.0011 1.0011 0.9959 0.9973 0.9966 0.9932 0.9984 0.999 0.999 0.999 0.999 0.999 1.0006 0.9937 0.9954 0.997 0.9912 0.9939 0.999 0.9957 0.9926 0.9994 1.0004 0.9994 1.0004 1.0004 1.0002 0.9922 0.9922 0.9934 0.9926 0.9941 0.9994 1.0004 0.9924 0.9948 0.9935 0.9918 0.9948 0.9924 0.9979 0.993 0.994 0.991 0.993 0.9922 0.9979 0.9937 0.9928 0.9965 0.9928 0.9991 0.9948 0.9925 0.9958 0.9962 0.9965 0.9951 0.9944 0.9916 0.9987 0.9928 0.9926 0.9934 0.9944 0.9949 0.9926 0.997 0.9949 0.9948 0.992 0.9964 0.9926 0.9982 0.9955 0.9955 0.9958 0.9997 1.0001 1.0001 0.9918 0.9918 0.9931 1.0001 0.9926 0.9966 0.9932 0.9969 0.9925 0.9914 0.996 0.9952 0.9934 0.9939 0.9939 0.9906 0.9901 0.9948 0.995 0.9953 0.9953 0.9952 0.996 0.9948 0.9951 0.9931 0.9962 0.9948 0.9959 0.9962 0.9958 0.9948 0.9948 0.994 0.9942 0.9942 0.9948 0.9964 0.9958 0.9932 0.9986 0.9986 0.9988 0.9953 0.9983 1 0.9951 0.9983 0.9906 0.9981 0.9936 0.9951 0.9953 1.0005 0.9972 1 0.9969 1.0001 1.0001 1.0001 0.9934 0.9969 1.0001 0.9902 0.993 0.9914 0.9941 0.9967 0.9918 0.998 0.9967 0.9918 0.9957 0.9986 0.9958 0.9948 0.9918 0.9923 0.9998 0.9998 0.9914 0.9939 0.9966 0.995 0.9966 0.994 0.9972 0.9998 0.9998 0.9982 0.9924 0.9972 0.997 0.9954 0.9962 0.9972 0.9921 0.9905 0.9998 0.993 0.9941 0.9994 0.9962 0.992 0.9922 0.994 0.9897 0.9954 0.99 0.9948 0.9922 0.998 0.9944 0.9944 0.9986 0.9986 0.9986 0.9986 0.9986 0.996 0.9999 0.9986 0.9986 0.996 0.9951 0.9999 0.993 0.9982 0.992 0.9963 0.995 0.9956 0.997 0.9936 0.9935 0.9963 0.9967 0.9912 0.9981 0.9966 0.9967 0.9963 0.9935 0.9902 0.99 0.996 0.9966 0.9962 0.994 0.996 0.994 0.9944 0.9974 0.996 0.9922 0.9917 0.9918 0.9936 0.9938 0.9918 0.9939 0.9917 0.9981 0.9941 0.9928 0.9952 0.9898 0.9914 0.9981 0.9957 0.998 0.9957 0.9986 0.9983 0.9982 0.997 0.9947 0.997 0.9947 0.99416 0.99516 0.99496 0.9974 0.99579 0.9983 0.99471 0.9974 0.99644 0.99579 0.99699 0.99758 0.9977 0.99397 0.9983 0.99471 0.99243 0.9962 1.00182 0.99384 0.99582 0.9962 0.9924 0.99466 0.99212 0.99449 0.99748 0.99449 0.99748 0.99475 
0.99189 0.99827 0.99752 0.99827 0.99479 0.99752 0.99642 1.00047 0.99382 0.99784 0.99486 0.99537 0.99382 0.99838 0.99566 0.99268 0.99566 0.99468 0.9933 0.99307 0.99907 0.99907 0.99907 0.99907 0.99471 0.99471 0.99907 0.99148 0.99383 0.99365 0.99272 0.99148 0.99235 0.99508 0.9946 0.99674 0.99018 0.99235 0.99084 0.99856 0.99591 0.9975 0.9944 0.99173 0.99378 0.99805 0.99534 0.99232 0.99805 0.99078 0.99534 0.99061 0.99182 0.9966 0.9912 0.99779 0.99814 0.99096 0.99379 0.99426 0.99228 0.99335 0.99595 0.99297 0.99687 0.99297 0.99687 0.99445 0.9986 0.99154 0.9981 0.98993 1.00241 0.99716 0.99437 0.9972 0.99756 0.99509 0.99572 0.99756 0.99175 0.99254 0.99509 0.99676 0.9979 0.99194 0.99077 0.99782 0.99942 0.99708 0.99353 0.99256 0.99199 0.9918 0.99354 0.99244 0.99831 0.99396 0.99724 0.99524 0.9927 0.99802 0.99512 0.99438 0.99679 0.99652 0.99698 0.99474 0.99511 0.99582 0.99125 0.99256 0.9911 0.99168 0.9911 0.99556 1.00098 0.99516 0.99516 0.99518 0.99347 0.9929 0.99347 0.99841 0.99362 0.99361 0.9914 0.99114 0.9925 0.99453 0.9938 0.9938 0.99806 0.9961 1.00016 0.9916 0.99116 0.99319 0.99517 0.99514 0.99566 0.99166 0.99587 0.99558 0.99117 0.99399 0.99741 0.99405 0.99622 1.00051 0.99803 0.99405 0.99773 0.99397 0.99622 0.99713 0.99274 1.00118 0.99176 0.9969 0.99771 0.99411 0.99771 0.99411 0.99194 0.99558 0.99194 0.99558 0.99577 0.99564 0.99578 0.99888 1.00014 0.99441 0.99594 0.99437 0.99594 0.9979 0.99434 0.99203 0.998 0.99316 0.998 0.99314 0.99316 0.99612 0.99295 0.99394 0.99642 0.99642 0.99248 0.99268 0.99954 0.99692 0.99592 0.99592 0.99692 0.99822 0.99822 0.99402 0.99404 0.99787 0.99347 0.99838 0.99839 0.99375 0.99155 0.9936 0.99434 0.9922 0.99571 0.99658 0.99076 0.99496 0.9937 0.99076 0.99542 0.99825 0.99289 0.99432 0.99523 0.99542 0.9959 0.99543 0.99662 0.99088 0.99088 0.99922 0.9966 0.99466 0.99922 0.99836 0.99836 0.99238 0.99645 1 1 0.99376 1 0.99513 0.99556 0.99556 0.99543 0.99886 0.99526 0.99166 0.99691 0.99732 0.99573 0.99656 0.99112 0.99214 0.99165 0.99004 0.99463 0.99683 0.99004 0.99596 0.99898 0.99114 0.99508 0.99306 0.99898 0.99508 0.99114 0.99342 0.99345 0.99772 0.99239 0.99502 0.99502 0.99479 0.99207 0.99497 0.99828 0.99542 0.99542 0.99228 0.99706 0.99497 0.99669 0.99828 0.99269 0.99196 0.99662 0.99475 0.99544 0.99944 0.99475 0.99544 0.9966 0.99066 0.9907 0.99066 0.998 0.9907 0.99066 0.99307 0.99106 0.99696 0.99106 0.99307 0.99167 0.99902 0.98992 0.99182 0.99556 0.99582 0.99182 0.98972 0.99352 0.9946 0.99273 0.99628 0.99582 0.99553 0.98914 0.99354 0.99976 0.99808 0.99808 0.99808 0.99808 0.99808 0.99808 0.9919 0.99808 0.99499 0.99655 0.99615 0.99296 0.99482 0.99079 0.99366 0.99434 0.98958 0.99434 0.99938 0.99059 0.99835 0.98958 0.99159 0.99159 0.98931 0.9938 0.99558 0.99563 0.98931 0.99691 0.9959 0.99159 0.99628 0.99076 0.99678 0.99678 0.99678 0.99089 0.99537 1.0002 0.99628 0.99089 0.99678 0.99076 0.99332 0.99316 0.99272 0.99636 0.99202 0.99148 0.99064 0.99884 0.99773 1.00013 0.98974 0.99773 1.00013 0.99112 0.99136 0.99132 0.99642 0.99488 0.99527 0.99578 0.99352 0.99199 0.99198 0.99756 0.99578 0.99561 0.99347 0.98936 0.99786 0.99705 0.9942 0.9948 0.99116 0.99688 0.98974 0.99542 0.99154 0.99118 0.99044 0.9914 0.9979 0.98892 0.99114 0.99188 0.99583 0.98892 0.98892 0.99704 0.9911 0.99334 0.99334 0.99094 0.99014 0.99304 0.99652 0.98944 0.99772 0.99367 0.99304 0.99183 0.99126 0.98944 0.99577 0.99772 0.99652 0.99428 0.99388 0.99208 0.99256 0.99388 0.9925 0.99904 0.99216 0.99208 0.99428 0.99165 0.99924 0.99924 0.99924 0.9956 0.99562 0.9972 0.99924 0.9958 0.99976 0.99976 0.99296 0.9957 0.9958 0.99579 
0.99541 0.99976 0.99518 0.99168 0.99276 0.99085 0.99873 0.99172 0.99312 0.99276 0.9972 0.99278 0.99092 0.9962 0.99053 0.99858 0.9984 0.99335 0.99053 0.9949 0.9962 0.99092 0.99532 0.99727 0.99026 0.99668 0.99727 0.9952 0.99144 0.99144 0.99015 0.9914 0.99693 0.99035 0.99693 0.99035 0.99006 0.99126 0.98994 0.98985 0.9971 0.99882 0.99477 0.99478 0.99576 0.99578 0.99354 0.99244 0.99084 0.99612 0.99356 0.98952 0.99612 0.99084 0.99244 0.99955 0.99374 0.9892 0.99144 0.99352 0.99352 0.9935 0.99237 0.99144 0.99022 0.99032 1.03898 0.99587 0.99587 0.99587 0.99976 0.99354 0.99976 0.99552 0.99552 0.99587 0.99604 0.99584 0.98894 0.9963 0.993 0.98894 0.9963 0.99068 0.98964 0.99604 0.99584 0.9923 0.99437 0.993 0.99238 0.99801 0.99802 0.99566 0.99067 0.99066 0.9929 0.9934 0.99067 0.98912 0.99066 0.99228 0.98912 0.9958 0.99052 0.99312 0.9968 0.99502 0.99084 0.99573 0.99256 0.9959 0.99084 0.99084 0.99644 0.99526 0.9954 0.99095 0.99188 0.9909 0.99256 0.9959 0.99581 0.99132 0.98936 0.99136 0.99142 0.99232 0.99232 0.993 0.99311 0.99132 0.98993 0.99208 0.99776 0.99839 0.99574 0.99093 0.99156 0.99278 0.9924 0.98984 0.99035 0.9924 0.99165 0.9923 0.99278 0.99008 0.98964 0.99156 0.9909 0.98984 0.9889 0.99178 0.99076 0.9889 0.99046 0.98999 0.98946 0.98976 0.99046 0.99672 0.99482 0.98945 0.98883 0.99362 0.99075 0.99436 0.98988 0.99158 0.99265 0.99195 0.99168 0.9918 0.99313 0.9895 0.9932 0.99848 0.9909 0.99014 0.9952 0.99652 0.99848 0.99104 0.99772 0.9922 0.99076 0.99622 0.9902 0.99114 0.9938 0.99594 0.9902 0.99035 0.99032 0.99558 0.99622 0.99076 0.99413 0.99043 0.99043 0.98982 0.98934 0.9902 0.99449 0.99629 0.9948 0.98984 0.99326 0.99834 0.99555 0.98975 0.99216 0.99216 0.99834 0.9901 0.98975 0.99573 0.99326 0.99215 0.98993 0.99218 0.99555 0.99564 0.99564 0.99397 0.99576 0.99601 0.99564 0.99397 0.98713 0.99308 0.99308 0.99582 0.99494 0.9929 0.99471 0.9929 0.9929 0.99037 0.99304 0.99026 0.98986 0.99471 0.98951 0.99634 0.99368 0.99792 0.99026 0.99362 0.98919 0.99835 0.99835 0.99038 0.99104 0.99038 0.99286 0.99296 0.99835 0.9954 0.9914 0.99286 0.99604 0.99604 0.99119 0.99007 0.99507 0.99596 0.99011 0.99184 0.99469 0.99469 0.99406 0.99305 0.99096 0.98956 0.9921 0.99496 0.99406 0.99406 0.9888 0.98942 0.99082 0.98802 17.3 1.4 1.3 1.6 5.25 2.4 14.6 11.8 1.5 1.8 7.7 2 1.8 1.4 16.7 8.1 8 4.7 8.1 2.1 16.7 6.4 1.5 7.6 1.5 12.4 1.3 1.7 8.1 7.1 7.6 2.3 6.5 1.4 12.7 1.6 1.1 1.2 6.5 4.6 0.6 10.6 4.6 4.8 2.7 12.6 0.6 9.2 6.6 7 8.45 11.1 18.15 18.15 4.1 4.1 4.6 18.15 4.9 8.3 1.4 11.5 1.8 1.6 2.4 4.9 1.8 4.3 4.4 1.4 1.6 1.3 5.2 5.6 5.3 4.9 2.4 1.6 2.1 1.4 7.1 1.6 10.7 11.1 10.7 1.6 1.6 1.5 1.5 1.6 1.6 8 7.7 2.7 15.1 15.1 8.9 6 12.3 13.1 6.7 12.3 2.3 11.1 1.5 6.7 6 15.2 10.2 13.1 10.7 17.1 17.1 17.1 1.9 10.7 17.1 1.2 1.2 3.1 1.5 10.7 4.9 12.6 10.7 4.9 12.15 12 1.7 2.6 1.4 1.9 16.9 16.9 2.1 7 7.1 5.9 7.1 8.7 13.2 15.3 15.3 13.2 2.7 10.65 10 6.8 15.6 13.2 5.1 3 15.3 2.1 1.9 8.6 8.75 3.6 4.7 1.3 1.8 9.7 4 2.4 4.7 18.8 1.8 1.8 12.8 12.8 12.8 12.8 12.8 7.8 16.75 12.8 12.8 7.8 5.4 16.75 1.3 10.1 3.8 10.9 6.6 9.8 11.7 1.2 1.4 9.6 12.2 2.6 10.7 4.9 12.2 9.6 1.4 1.1 1 8.2 11.3 7.3 2.3 8.2 2.1 2 10 15.75 3.9 2 1.5 1.6 1.4 1.5 1.4 2 13.8 1.3 3.8 6.9 2.2 1.6 13.8 10.8 12.8 10.8 15.3 12.1 12 11.6 9.2 11.6 9.2 2.8 1.6 6.1 8.5 7.8 14.9 6.2 8.5 8.2 7.8 10.6 11.2 11.6 7.1 14.9 6.2 1.7 7.7 17.3 1.4 7.7 7.7 3.4 1.6 1.4 1.4 10.4 1.4 10.4 4.1 2.8 15.7 10.9 15.7 6.5 10.9 5.9 17.3 1.4 13.5 8.5 6.2 1.4 14.95 7.7 1.3 7.7 1.3 1.3 1.3 15.6 15.6 15.6 15.6 4.9 5 15.6 6.5 1.4 2.7 1.2 6.5 6.4 6.9 7.2 10.6 3.5 6.4 2.3 12.05 7 11.8 1.4 5 2.2 14.6 1.6 1.3 14.6 2.8 1.6 3.3 
6.3 8.1 1.6 10.6 11.8 1.7 8.1 1.4 1.3 1.8 7.2 1.1 11.95 1.1 11.95 2.2 12.7 1.4 10.6 1.9 17.8 10.2 4.8 9.8 8.4 7.2 4.8 8.4 4.5 1.4 7.2 11 11.1 2.6 2 10.1 13.3 11.4 1.3 1.4 1.4 7 2 1.2 12.9 5 10.1 3.75 1.7 12.6 1.3 1.6 7.6 8.1 14.9 6 6 7.2 3 1.2 2 4.9 2 8.9 16.45 2 1.9 5.1 4.4 5.8 4.4 12.9 1.3 1.3 1.2 2.7 1.7 8.2 1.5 1.5 12.9 3.9 17.75 4.9 1.6 1.4 2 2 8.2 2.1 1.8 8.5 4.45 5.8 13 2.7 7.3 19.1 8.8 2.7 7.4 2.3 6.85 11.4 0.9 19.35 7.9 11.75 7.7 3 7.7 3 1.5 7.5 1.5 7.5 8.3 7.05 8.4 13.9 17.5 5.6 9.4 4.8 9.4 9.7 6.3 1.6 14.6 2.5 14.6 2.6 2.5 8.2 1.5 2.3 10 10 1.6 1.6 16 10.4 7.4 7.4 10.4 16.05 16.05 2.6 2.5 10.8 1.2 12.1 11.95 1.7 0.8 1.4 1.3 6.3 10.3 15.55 1.5 1.5 1.4 1.5 7.9 13 1 4.85 7.1 7.9 7.5 7.6 10.3 1.7 1.7 19.95 7.7 5.3 19.95 12.7 12.7 1.5 11.3 18.1 18.1 7 18.1 6.4 1.4 1.4 3.1 14.1 7.7 5.2 11.6 10.4 7.5 11.2 0.8 1.4 4.7 3.1 4 11.3 3.1 8.1 14.8 1.4 8.1 3.5 14.8 8.1 1.4 1.5 1.5 12.8 1.6 7.1 7.1 11.2 1.7 6.7 17.3 8.6 8.6 1.5 12.1 6.7 10.7 17.3 1.8 1.4 7.5 4.8 7.1 16.9 4.8 7.1 11.3 1.1 1.2 1.1 12.9 1.2 1.1 1.2 2.3 10 2.3 1.2 1.4 14.9 1.8 1.8 7 8.6 1.8 1.1 1.3 4.9 1.9 10.4 10 8.6 1.7 1.7 18.95 12.8 12.8 12.8 12.8 12.8 12.8 0.7 12.8 1.4 13.3 8.5 1.5 11.7 5 1.2 2.1 1.4 2.1 16 1.1 15.3 1.4 2.8 2.8 0.9 2.5 8.1 8.2 0.9 11.1 7.8 2.8 10.1 3.2 14.2 14.2 14.2 2.9 6 20.4 10.1 2.9 14.2 3.2 0.95 1.7 1.7 9 1.3 1.4 2.4 16 11.4 14.35 2.1 11.4 14.35 1.1 1.1 1.2 15.8 5.2 5.2 9.6 5.2 1.2 0.8 14.45 9.6 6.9 3.4 2.3 11 5.95 5.1 5.4 1.2 12.6 1 6.6 1.5 1 1.1 6.6 8.2 2 1.4 2 7.5 2 2 13.3 2.85 5.6 5.6 1 3.2 1 7.1 2.4 11.2 9.5 1 1.8 2.6 2.4 8 11.2 7.1 3.3 10.3 1.2 1.6 10.3 9.65 16.4 1.5 1.2 3.3 5 16.3 16.3 16.3 6.5 6.4 10.2 16.3 7.4 13.7 13.7 1.3 7.4 7.4 7.45 7.2 13.7 10.4 1.1 6.5 4.6 13.9 5.2 1.7 6.5 16.4 3.6 1.5 12.4 1.7 6.2 6.2 2.6 1.7 9.3 12.4 1.5 9.1 12 4.8 12.3 12 2.7 3.6 3.6 4.3 1.8 11.8 1.8 11.8 1.8 1.4 6.6 1.55 0.7 6.4 11.8 4.3 5.1 5.8 5.9 1.3 1.4 1.2 7.4 10.8 1.8 7.4 1.2 1.4 14.4 1.7 3.6 3.6 10.05 10.05 10.5 1.9 3.6 1.65 1.9 65.8 6.85 7.4 7.4 20.2 11 20.2 6.2 6.2 6.85 8 8.2 2.2 10.1 7.2 2.2 10.1 1.6 1.3 8 8.2 5.3 14 7.2 1.6 11.8 9.6 6.1 2.7 3.6 1.7 1.6 2.7 1 0.9 1.6 1 10.6 2 1.2 6.2 9.2 5 6.3 3.3 8 1.2 1.2 16.2 11.6 7.2 1.1 3.4 1.4 3.3 8 9.3 2.3 0.9 3.5 1.7 1.3 1.3 5.6 7.4 2.3 1 1.5 10 14.9 9.3 1 1 5.9 5 1.25 3.9 5 0.8 1 5.9 1.6 1.3 1 1.1 1.25 1.4 1.2 5 1.4 1.7 1.8 1.6 1.5 1.7 13.9 5.9 2.1 1.1 6.7 2.7 6.7 3.95 7.75 10.6 1.6 2.5 0.7 11.1 5.15 4.7 9.7 1.7 1.4 2 7.5 9.7 0.8 13.1 1.1 2.2 8.9 1.1 0.9 1.7 6.9 1.1 1 1 7.6 8.9 2.2 1.2 1 1 3.1 1.95 2.2 8.75 11.9 2.7 5.45 6.3 14.4 7.8 1.6 9.1 9.1 14.4 1.3 1.6 11.3 6.3 0.7 1.25 0.7 7.8 10.3 10.3 7.8 8.7 8.3 10.3 7.8 1.2 8.3 8.3 6.2 5 1.8 1.6 1.8 1.8 2.9 6 0.9 1.1 1.6 5.45 14.05 8 13.1 4.9 1.3 2.2 14.9 14.9 0.95 1.4 0.95 1.7 5.6 14.9 7.1 1.2 9.6 11.4 11.4 7.9 5 11.1 8 3.8 10.55 10.2 10.2 9.8 6.3 1.1 4.5 6.3 10.9 9.8 9.8 0.8 0.8 1.2 1.3 9.8 10.2 10.9 6.3 6.3 1.2 0.9 1.1 4.5 3.7 18.1 1.35 5.5 3.1 12.85 19.8 8.25 12.85 3.8 6.9 8.25 11.7 4.6 4 19.8 12.85 1.2 8.9 11.7 6.2 14.8 14.8 10.8 1.6 8.3 8.4 2.5 3.5 17.2 2.1 12.2 11.8 16.8 17.2 1.1 14.7 5.5 6.1 1.2 1.3 8.7 1.7 8.7 10.2 4.5 5.9 1.7 1.4 5.4 7.9 1.1 7 7 7.6 7 12.3 15.3 12.3 1.2 2.3 6.1 7.6 10.2 4.1 2.9 8.5 1.5 3.1 7.9 3.5 4.9 1.1 7 1.2 4.5 2.6 9.9 4.5 9.5 1.5 3.2 2.6 11.2 3.2 2.3 4.9 4.9 1.4 1.5 6.7 2.1 4.3 10.9 7 2.3 2.5 2.6 3.2 2.5 14.7 4.5 2.2 1.9 1.6 17.3 4.2 4.2 2.5 1.9 1.4 0.8 8 1.6 1.7 5.5 17.3 8.6 6.9 2.1 2.2 1.5 2.5 17.6 4.2 2.9 4.8 11.9 0.9 1.3 6.4 4.3 11.9 8.1 1.3 0.9 17.2 17.2 17.2 8.7 17.2 8.7 7.5 17.2 4.6 3.7 2.2 7.4 15.1 7.4 4.8 7.9 1 15.1 7.4 4.8 4.6 1.4 6.2 6.1 5.1 6.3 0.9 2.3 6.6 7.5 8.6 11.9 2.3 7.1 4.3 
1.1 1 7.9 1 1 1 7.3 1.7 1.3 6.4 1.8 1.5 3.8 7.9 1 1.2 5.3 9.1 6.5 9.1 6.3 5.1 6.5 2.4 9.1 7.5 5 6.75 1.2 1.6 16.05 5 12.4 0.95 4.6 1.7 1 1.3 5 2.5 2.6 2.1 12.75 1.1 12.4 3.7 2.65 2.5 8.2 7.3 1.1 6.6 7 14.5 11.8 3 3.7 6 4.6 2.5 3.3 1 1.1 1.4 3.3 8.55 2.5 6.7 3.8 4.5 4.6 4.2 11.3 5.5 4.2 2.2 14.5 14.5 14.5 14.5 14.5 14.5 1.5 18.75 3.6 1.4 5.1 10.5 2 2.6 9.2 1.8 5.7 2.4 1.9 1.4 0.9 4.6 1.4 9.2 1.4 1.8 2.3 2.3 4.4 6.4 2.9 2.8 2.9 4.4 8.2 1 2.9 7 1.8 1.5 7 8.2 7.6 2.3 8.7 1 2.9 6.7 5 1.9 2 1.9 8.5 12.6 5.2 2.1 1.1 1.3 1.1 9.2 1.2 1.1 8.3 1.8 1.4 15.7 4.35 1.8 1.6 2 5 1.8 1.3 1 1.4 8.1 8.6 3.7 5.7 2.35 13.65 13.65 13.65 15.2 4.6 1.2 4.6 6.65 13.55 13.65 9.8 10.3 6.7 15.2 9.9 7.2 1.1 8.3 11.25 12.8 9.65 12.6 12.2 8.3 11.25 1.3 9.9 7.2 1.1 1.1 4.8 1.1 1.4 1.7 10.6 1.4 1.1 5.55 2.1 1.7 9 1.7 1.8 4.7 11.3 3.6 6.9 3.6 4.9 6.95 1.9 4.7 11.3 1.8 11.3 8.2 8.3 9.55 8.4 7.8 7.8 10.2 5.5 7.8 7.4 3.3 5 3.3 5 1.3 1.2 7.4 7.8 9.9 0.7 4.6 5.6 9.5 14.8 4.6 2.1 11.6 1.2 11.6 2.1 20.15 4.7 4.3 14.5 4.9 14.55 14.55 10.05 4.9 14.5 14.55 15.25 3.15 1.3 5.2 1.1 7.1 8.8 18.5 8.8 1.4 1.2 5 1.6 18.75 6 9.4 9.7 4.75 6 5.35 5.35 6.8 6.9 1.4 0.9 1.2 1.3 2.6 12 9.85 3.85 2 1.6 7.8 1.9 2 10.3 1.1 12 3.85 9.85 2 4 1.1 10.4 6.1 1.8 10.4 4.7 4 1.1 6.4 8.15 6.1 4.8 1.2 1.1 1.4 7.4 1.8 1 15.5 15.5 8.4 2.4 3.95 19.95 2 3 15.5 8.4 14.3 4.2 1.4 3 4.9 2.4 14.3 10.7 11 1.4 1.2 12.9 10.8 1.3 2 1.8 1.2 7.5 9.7 3.8 7.2 9.7 6.3 6.3 0.8 8.6 6.3 3.1 7.2 7.1 6.4 14.7 7.2 7.1 1.9 1.2 4.8 1.2 3.4 4.3 8.5 1.8 1.8 19.5 8.5 19.9 8.3 1.8 1.1 16.65 16.65 16.65 0.9 6.1 10.2 0.9 16.65 3.85 4.4 4.5 3.2 4.5 4.4 9.7 4.2 4.2 1.1 9.7 4.2 5.6 4.2 1.6 1.6 1.1 14.6 2.6 1.2 7.25 6.55 7 1.5 1.4 7.25 1 4.2 17.5 17.5 17.5 1.5 1.3 3.9 4.2 7.6 1 1.1 11.8 1.4 9.7 12.9 1.6 7.2 7.1 1.9 8.8 7.2 1.4 14.3 14.3 8.8 1.4 1.8 14.3 7.2 1.2 11.8 0.9 12.6 26.05 4.7 12.6 1.2 26.05 6.1 11.8 0.9 5.6 5.3 5.7 8 8 17.6 8 8.8 1.5 1.4 4.8 2.4 3.7 4.9 5.7 5.7 4.9 2 5.1 4.5 3.2 6.65 1.6 4 17.75 1.4 17.75 7.2 5.7 8.5 11.4 5.4 2.7 4.3 1.2 1.8 1.3 5.7 2.7 11.7 4.3 11 1.6 11.6 6.2 1.8 1.2 1 2.4 1.2 8.2 18.8 9.6 12.9 9.2 1.2 12.9 8 12.9 1.6 12 2.5 9.2 4.4 8.8 9.6 8 18.8 1.3 1.2 12.9 1.2 1.6 1.5 18.15 13.1 13.1 13.1 13.1 1 1.6 11.8 1.4 1 13.1 10.6 10.4 1.1 7.4 1.2 3.4 18.15 8 2.5 2 2 6.9 1.2 9.4 2.9 6.9 5.4 1.3 20.8 10.3 1.3 1.6 13.1 1.8 8 1.6 1.4 14.7 14.7 14.7 14.7 14.7 14.7 14.7 1.8 10.6 12.5 6.8 14.7 2.9 1.4 1.4 2.1 7.4 2.9 1.4 1.4 7.4 5 2.5 6.1 2.7 2.1 12.9 12.9 12.9 13.7 12.9 2.4 9.8 13.7 1.3 12.1 6.1 7.7 6.1 1.4 7.7 12.1 6.8 9.2 8.3 17.4 2.7 12.8 8.2 8.1 8.2 8.3 8 11.8 12 1.7 17.4 13.9 10.7 2 2.2 1.3 1.1 2 6.4 1.3 1.1 10.7 6.4 6.3 6.4 15.1 2 2 2.2 12.1 8.8 8.8 5.1 6.8 6.8 3.7 12.2 5.7 8.1 2.5 4 6.8 1 5.1 5.8 10.6 3.5 3.5 16.4 4.8 3.3 1.2 1.2 4.8 3.3 2.5 8.7 1.6 4 2.5 16.2 9 16.2 1.4 7 9 3.1 1.5 4.6 4.8 4.6 1.5 2.7 6.3 7.2 7.2 12.4 6.6 6.6 4 4.8 1.3 7.2 11.1 12.4 9.8 6.6 13.3 11.7 8 1.6 16.55 1.5 10.2 6.6 17.8 17.8 1.5 7.4 17.8 2 7.4 2 17.8 12.1 8.2 1.5 8.7 3.5 6.4 2.1 7.7 12.3 1.3 8.7 3.5 1.1 2.8 3.5 1.9 3.8 3.8 2.4 4.8 4.8 6.2 1.3 3.8 1.5 4.8 1.9 6.2 7.9 1.6 1.4 2.6 14.8 2.4 0.9 0.9 1.2 9.9 3.9 15.6 15.6 1.5 1.6 7.8 5.6 1.3 16.7 7.95 6.7 1.1 6.3 8.9 1 1.5 6.6 6.2 6.3 2.1 2.2 5.4 8.9 1 17.9 2.6 1.3 17.9 2.6 2.3 4.3 7.1 7.1 11.9 11.7 5.8 3.8 12.4 6.5 7.1 7.6 7.9 2.8 10.6 2.8 1.5 7.6 7.9 1.7 7.6 7.5 1.7 1.7 12.1 4.5 1.7 8 7.6 8.6 8.6 14.6 1.6 8.6 14.6 1.1 3.7 8.9 8.9 4.7 8.9 3.1 5.8 5.8 5.8 1 15.8 1.5 5.2 1.5 2.5 1 15.8 5.9 3.1 3.1 5.8 11.5 18 4.8 8.5 1.6 18 4.8 5.9 1.1 8.5 13.1 4.1 2.9 13.1 1.1 1.5 7.75 1.15 1 17.8 5.7 17.8 7.4 1.4 1.4 1 4.4 1.6 7.9 15.5 15.5 15.5 15.5 17.55 13.5 13.5 
1.3 15.5 11.6 7.9 15.5 17.55 11.6 13.15 1.9 13.5 1.3 6.1 6.1 1.9 1.9 1.6 11.3 8.4 8.3 8.4 12.2 8 1.3 12.7 1.3 10.5 12.5 9.6 1.5 1.5 7.8 10.8 12.5 8.6 1.2 14.5 3.7 1.1 1.1 3.8 4.6 10.2 7.9 2.4 10.7 4.9 10.7 1.1 7.9 5.6 2.4 14.2 9.5 9.5 4.1 4.7 1.4 0.9 20.3 3.5 2.7 1.2 1.2 2 1.1 1.5 1.2 18.1 18.1 3.6 3.5 12.1 17.45 12.1 3 1.6 5.7 5.6 6.8 15.6 6 1.8 8.6 8.6 11.5 7.8 2.4 5 8.6 1.5 5.4 11.9 11.9 9 10 11.9 11.9 15.5 5.4 15 1.4 9.4 3.7 15 1.4 6.5 1.4 6.3 13.7 13.7 13.7 13.7 13.7 13.7 1.5 1.6 1.4 3.5 1 1.4 1.5 13.7 1.6 5.2 1.4 11.9 2.4 3.2 1.7 4.2 15.4 13 5.6 9.7 2.5 4 15.4 1.2 2 1.2 5.1 1.4 1.2 6.5 1.3 6.5 2.7 1.3 7.4 12.9 1.3 1.2 2.6 2.3 1.3 10.5 2.6 14.4 1.2 3.1 1.7 6 11.8 6.2 1.4 12.1 12.1 12.1 3.9 4.6 12.1 1.2 8.1 3.9 1.1 6.5 10.1 10.7 3.2 12.4 5.2 5 2.5 9.2 6.9 2 15 15 1.2 15 1.8 10.8 3.9 4.2 2 13.5 13.3 2.2 1.4 1.6 2.2 14.8 1.8 14.8 1.3 9.9 5.1 5.1 1.5 1.5 11.1 5.25 2.3 7.9 8 1.4 5.25 2.3 2.3 3.5 13.7 9.9 15.4 16 16 16 16 2.4 5.5 2.3 16.8 16 17.8 17.8 6.8 6.8 6.8 6.8 1.6 4.7 11.8 17.8 15.7 5.8 15.7 9 15.7 5.8 8.8 10.2 6.6 6.5 8.9 11.1 4.2 1.6 7.4 11.5 1.6 2 4.8 9.8 1.9 4.2 1.6 7.3 5.4 10.4 1.9 7.3 5.4 7.7 11.5 1.2 2.2 1 8.2 8.3 8.2 9.3 8.1 8.2 8.3 13.9 13.9 13.9 13.9 13.9 13.9 13.9 2 13.9 15.7 1.2 1.5 1.2 3.2 1.2 2.6 13.2 10.4 5.7 2.5 1.6 1.4 7.4 2.5 5.6 3.6 7.5 5.8 1.6 1.5 2.9 11.2 9.65 10.1 3.2 11.2 11.45 9.65 4.5 2.7 3.5 1.7 2.1 4.8 5 2.6 6.6 5 7.3 5 1.7 2.6 8.2 8.2 5 1.2 7.1 9.5 15.8 15.5 15.8 17.05 12.7 12.3 11.8 11.8 11.8 12.3 11.8 13.6 5.2 6.2 7.9 7.9 3.3 2.8 7.9 3.3 6.3 4.9 10.4 4.9 10.4 16 6.3 2.2 17.3 17.3 17.3 17.3 2.2 2.2 17.3 6.6 6.5 12.3 5 2.8 13.6 2.8 5.4 10.9 1.7 9.15 4.5 9.15 1.4 5.9 16.4 1.2 16.4 5.9 7.8 7.8 2.8 2.9 2.5 12.8 12.2 7.7 2.8 2.9 17.3 19.3 19.3 19.3 2.7 6.4 17.3 2.4 2.8 1.7 15.4 15.4 4.1 6.6 1.2 2.1 1 1.1 1.4 1.6 9.8 1.9 1.3 7.9 7.9 4.5 22.6 7.9 3.5 1.2 4.5 2 7.8 0.9 2.9 2.9 3.5 4.2 9.7 10.5 1.1 16.1 1.1 8.1 6.2 7.7 2.4 16.3 2.3 8.4 8.5 6 1.1 1.75 2.6 1.3 2.1 1.1 1.1 2.8 9 2.8 2.2 5.1 3.5 12.7 7.5 2 3.5 14.3 9.8 12.7 12.7 5.1 3.5 12.7 12.9 12.9 1.3 10.5 1.5 12.7 12.9 1.2 6.2 8.8 3.9 1.3 9.1 9.1 3.9 1.8 2.1 1.4 14.7 9.1 1.9 1.8 9.6 3.9 1.3 11.8 1.9 12 7.9 9.3 4.6 2.2 10.2 10.6 1.4 9.1 11.1 9.1 4.4 2.8 1.1 1.3 1.2 3.3 9.7 2.3 1.1 11.4 1.2 14.7 13.8 1.3 6.3 7.9 2 11.8 1.2 10 5.2 1.2 7.2 9.9 5.3 13.55 2.2 9.9 4.3 13 13.55 1 1.1 6.9 13.4 4.6 9.9 3 5.8 12.9 3.2 0.8 2.5 2.4 7.2 7.3 6.3 4.25 1.2 2 4.25 4.7 4.5 1.4 4.1 5.3 4.2 6.65 8.2 2.6 2.6 2 12.2 2.3 8.2 5 10.7 10.8 1.7 1.3 1.7 12.7 1.3 1.2 1.3 5.7 3.4 1.1 1 1 1.65 6.8 6.8 4.9 1.4 2.5 10.8 10.8 10.8 10.8 2.8 1.3 2 1.1 8.2 6 6.1 8.2 8.8 6.1 6 1.2 11.4 1.3 1.3 6.2 3.2 4.5 9.9 6.2 11.4 1.3 1.3 0.9 0.7 1 1 10.4 1.3 12.5 12.5 12.5 12.5 19.25 1.1 12.5 19.25 9 1.2 9 1.3 12.8 12.8 7.6 7.6 1.4 8.3 9 1.85 12.55 1.4 1.8 4 12.55 9 3 1.85 7.9 2.6 1.2 7.1 7.9 1.3 10.7 7.7 8.4 10.7 12.7 1.8 7.7 10.5 1.6 1.85 10.5 10.5 1 1.2 1.7 1.6 9 1.9 1.2 1.5 3.9 3.6 1.2 5 2.9 10.4 11.4 18.35 18.4 1.2 7.1 1.3 1.5 10.2 2.2 3.5 3.5 3.9 7.4 7.4 11 1.5 3.9 5.4 1.5 5 1.2 13 13 13 13 8.6 1.7 1.2 1.2 1.2 2 19.4 0.8 6.3 6.4 12.1 12.1 12.9 2.4 4.3 4.2 12.9 1.7 2.2 12.1 3.4 7.4 7.3 1.1 1.1 1.4 14.5 8 1.1 1.1 2.2 5.8 0.9 6.4 10.9 7.3 8.3 1.3 3.3 1 1.1 1 5.1 3.2 12.6 3.7 1.7 5.1 1 1.3 1.5 4.6 10.3 6.1 6.1 1.2 10.3 9.9 1.6 1.1 1.5 1.2 1.5 1.1 11.5 7.8 7.4 1.45 8.9 1.1 1 2.5 1.1 2.4 2.3 5.1 2.5 8.9 2.5 8.9 1.6 1.4 3.9 13.7 13.7 9.2 7.8 7.6 7.7 3 1.3 4 1.1 2 1.9 1.4 4.5 10.1 6.6 1.9 12.4 1.6 2.5 1.2 2.5 0.8 0.9 8.1 8.1 11.75 1.3 1.9 8.3 8.1 5.7 1.9 1.2 11.75 2.2 0.9 1.3 1.6 8 1.2 1.1 0.8 \ No newline at end of file diff --git 
a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py new file mode 100644 index 00000000..4601ad07 --- /dev/null +++ b/pandas/tools/tests/test_merge.py @@ -0,0 +1,2158 @@ +# pylint: disable=E1103 + +import nose + +from datetime import datetime +from numpy.random import randn +from numpy import nan +import numpy as np +import random + +import pandas as pd +from pandas.compat import range, lrange, lzip, zip, StringIO +from pandas import compat, _np_version_under1p7 +from pandas.tseries.index import DatetimeIndex +from pandas.tools.merge import merge, concat, ordered_merge, MergeError +from pandas.util.testing import (assert_frame_equal, assert_series_equal, + assert_almost_equal, rands, + makeCustomDataframe as mkdf, + assertRaisesRegexp) +from pandas import isnull, DataFrame, Index, MultiIndex, Panel, Series, date_range, read_table, read_csv +import pandas.algos as algos +import pandas.util.testing as tm + +a_ = np.array + +N = 50 +NGROUPS = 8 +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] + + +def get_test_data(ngroups=NGROUPS, n=N): + unique_groups = lrange(ngroups) + arr = np.asarray(np.tile(unique_groups, n // ngroups)) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[:n - len(arr)]) + + random.shuffle(arr) + return arr + + +class TestMerge(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + # aggregate multiple columns + self.df = DataFrame({'key1': get_test_data(), + 'key2': get_test_data(), + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + + # exclude a couple keys for fun + self.df = self.df[self.df['key2'] > 1] + + self.df2 = DataFrame({'key1': get_test_data(n=N // 5), + 'key2': get_test_data(ngroups=NGROUPS // 2, + n=N // 5), + 'value': np.random.randn(N // 5)}) + + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, + index=data['C']) + + self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], + 'v1': np.random.randn(7)}) + self.right = DataFrame({'v2': np.random.randn(4)}, + index=['d', 'b', 'c', 'a']) + + def test_cython_left_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = algos.left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5, -1, -1]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) + + def test_cython_right_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = algos.left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + # 0 1 1 1 + exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, + # 2 2 4 + 6, 7, 8, 6, 7, 8, -1]) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, + 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, 
exp_rs) + + def test_cython_inner_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = algos.inner_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) + + def test_left_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') + + joined_both = merge(self.df, self.df2) + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='left') + + def test_right_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='right') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') + + joined_both = merge(self.df, self.df2, how='right') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='right') + + def test_full_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='outer') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') + + joined_both = merge(self.df, self.df2, how='outer') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='outer') + + def test_inner_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='inner') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') + + joined_both = merge(self.df, self.df2, how='inner') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='inner') + + def test_handle_overlap(self): + joined = merge(self.df, self.df2, on='key2', + suffixes=['.foo', '.bar']) + + self.assertIn('key1.foo', joined) + self.assertIn('key1.bar', joined) + + def test_handle_overlap_arbitrary_key(self): + joined = merge(self.df, self.df2, + left_on='key2', right_on='key1', + suffixes=['.foo', '.bar']) + self.assertIn('key1.foo', joined) + self.assertIn('key2.bar', joined) + + def test_merge_common(self): + joined = merge(self.df, self.df2) + exp = merge(self.df, self.df2, on=['key1', 'key2']) + tm.assert_frame_equal(joined, exp) + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on='C') + self.assert_numpy_array_equal(merged['MergedA'], target['A']) + self.assert_numpy_array_equal(merged['MergedD'], target['D']) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + joined = df.join(df2, on='key') + expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], + 'value': [0, 0, 1, 1, 2]}) + assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], + columns=['one']) + df_b = DataFrame([['foo'], ['bar']], index=[1, 2], + columns=['two']) + df_c = DataFrame([[1], [2]], index=[1, 2], + columns=['three']) + joined = df_a.join(df_b, on='one') + joined = joined.join(df_c, on='one') + self.assertTrue(np.isnan(joined['two']['c'])) + self.assertTrue(np.isnan(joined['three']['c'])) + + # merge column not present + self.assertRaises(Exception, target.join, source, on='E') + + # overlap +
source_copy = source.copy() + source_copy['A'] = 0 + self.assertRaises(Exception, target.join, source_copy, on='A') + + def test_join_on_fails_with_different_right_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': tm.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, left_on='a', right_index=True) + + def test_join_on_fails_with_different_left_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': tm.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}, + index=tm.makeCustomIndex(10, 2)) + df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}) + merge(df, df2, right_on='b', left_index=True) + + def test_join_on_fails_with_different_column_counts(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': tm.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': tm.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, right_on='a', left_on=['a', 'b']) + + def test_join_on_pass_vector(self): + expected = self.target.join(self.source, on='C') + del expected['C'] + + join_col = self.target.pop('C') + result = self.target.join(self.source, on=join_col) + assert_frame_equal(result, expected) + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on='C') + for col in self.source: + self.assertIn(col, merged) + self.assertTrue(merged[col].isnull().all()) + + merged2 = self.target.join(self.source.reindex([]), on='C', + how='inner') + self.assertTrue(merged2.columns.equals(merged.columns)) + self.assertEqual(len(merged2), 0) + + def test_join_on_inner(self): + df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) + + joined = df.join(df2, on='key', how='inner') + + expected = df.join(df2, on='key') + expected = expected[expected['value'].notnull()] + self.assert_numpy_array_equal(joined['key'], expected['key']) + self.assert_numpy_array_equal(joined['value'], expected['value']) + self.assertTrue(joined.index.equals(expected.index)) + + def test_join_on_singlekey_list(self): + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, on='key') + + assert_frame_equal(joined, expected) + + def test_join_on_series(self): + result = self.target.join(self.source['MergedA'], on='C') + expected = self.target.join(self.source[['MergedA']], on='C') + assert_frame_equal(result, expected) + + def test_join_on_series_buglet(self): + # GH #638 + df = DataFrame({'a': [1, 1]}) + ds = Series([2], index=[1], name='b') + result = df.join(ds, on='a') + expected = DataFrame({'a': [1, 1], + 'b': [2, 2]}, index=df.index) + tm.assert_frame_equal(result, expected) + + def test_join_index_mixed(self): + df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(10), + columns=['A', 'B', 'C', 'D']) + self.assertEqual(df1['B'].dtype, np.int64) + self.assertEqual(df1['D'].dtype, np.bool_) + + df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(0, 10, 2), + columns=['A', 'B', 'C', 'D']) + + # overlap + joined = df1.join(df2, lsuffix='_one', rsuffix='_two') + expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', + 'A_two', 'B_two', 'C_two', 'D_two'] + 
df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + assert_frame_equal(joined, expected) + + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1['bool'] = True + df1['string'] = 'foo' + + df2 = DataFrame(index=np.arange(5, 15)) + df2['int'] = 1 + df2['float'] = 1. + + for kind in JOIN_TYPES: + + joined = df1.join(df2, how=kind) + expected = _join_by_hand(df1, df2, how=kind) + assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=kind) + expected = _join_by_hand(df2, df1, how=kind) + assert_frame_equal(joined, expected) + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + + def test_join_unconsolidated(self): + # GH #331 + a = DataFrame(randn(30, 2), columns=['a', 'b']) + c = Series(randn(30)) + a['c'] = c + d = DataFrame(randn(30, 1), columns=['q']) + + # it works! + a.join(d) + d.join(a) + + def test_join_multiindex(self): + index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + df1 = DataFrame(data=np.random.randn(6), index=index1, + columns=['var X']) + df2 = DataFrame(data=np.random.randn(6), index=index2, + columns=['var Y']) + + df1 = df1.sortlevel(0) + df2 = df2.sortlevel(0) + + joined = df1.join(df2, how='outer') + ex_index = index1._tuple_index + index2._tuple_index + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + df1 = df1.sortlevel(1) + df2 = df2.sortlevel(1) + + joined = df1.join(df2, how='outer').sortlevel(0) + ex_index = index1._tuple_index + index2._tuple_index + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + def test_join_inner_multiindex(self): + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + joined = data.join(to_join, on=['key1', 'key2'], how='inner') + expected = merge(data, to_join.reset_index(), + left_on=['key1', 'key2'], + right_on=['first', 'second'], how='inner', + sort=False) + + expected2 = merge(to_join, data, + right_on=['key1', 'key2'], left_index=True, + how='inner', sort=False) + assert_frame_equal(joined, expected2.reindex_like(joined)) + + expected2 = merge(to_join, data, right_on=['key1', 'key2'], + left_index=True, how='inner', sort=False) + + expected = expected.drop(['first', 'second'], axis=1) + expected.index = joined.index + + self.assertTrue(joined.index.is_monotonic) + assert_frame_equal(joined, expected) + + # _assert_same_contents(expected, expected2.ix[:, expected.columns]) + + def test_join_hierarchical_mixed(self): + df = DataFrame([(1, 2, 3), (4, 5, 6)], 
columns=['a', 'b', 'c']) + new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) + other_df = DataFrame( + [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) + other_df.set_index('a', inplace=True) + + result = merge(new_df, other_df, left_index=True, right_index=True) + self.assertTrue(('b', 'mean') in result) + self.assertTrue('b' in result) + + def test_join_float64_float32(self): + + a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype = np.float64) + b = DataFrame(randn(10, 1), columns=['c'], dtype = np.float32) + joined = a.join(b) + self.assertEqual(joined.dtypes['a'], 'float64') + self.assertEqual(joined.dtypes['b'], 'float64') + self.assertEqual(joined.dtypes['c'], 'float32') + + a = np.random.randint(0, 5, 100).astype('int64') + b = np.random.random(100).astype('float64') + c = np.random.random(100).astype('float32') + df = DataFrame({'a': a, 'b': b, 'c': c}) + xpdf = DataFrame({'a': a, 'b': b, 'c': c }) + s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) + rs = df.merge(s, left_on='a', right_index=True) + self.assertEqual(rs.dtypes['a'], 'int64') + self.assertEqual(rs.dtypes['b'], 'float64') + self.assertEqual(rs.dtypes['c'], 'float32') + self.assertEqual(rs.dtypes['md'], 'float32') + + xp = xpdf.merge(s, left_on='a', right_index=True) + assert_frame_equal(rs, xp) + + def test_join_many_non_unique_index(self): + df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) + df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) + df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + + result = idf1.join([idf2, idf3], how='outer') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') + + result = result.reset_index() + + result['a'] = result['a'].astype(np.float64) + result['b'] = result['b'].astype(np.float64) + + assert_frame_equal(result, expected.ix[:, result.columns]) + + df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) + df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) + df3 = DataFrame( + {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + result = idf1.join([idf2, idf3], how='inner') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') + + result = result.reset_index() + + assert_frame_equal(result, expected.ix[:, result.columns]) + + def test_merge_index_singlekey_right_vs_left(self): + left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], + 'v1': np.random.randn(7)}) + right = DataFrame({'v2': np.random.randn(4)}, + index=['d', 'b', 'c', 'a']) + + merged1 = merge(left, right, left_on='key', + right_index=True, how='left', sort=False) + merged2 = merge(right, left, right_on='key', + left_index=True, how='right', sort=False) + assert_frame_equal(merged1, merged2.ix[:, merged1.columns]) + + merged1 = merge(left, right, left_on='key', + right_index=True, how='left', sort=True) + merged2 = merge(right, left, right_on='key', + left_index=True, how='right', sort=True) + assert_frame_equal(merged1, merged2.ix[:, merged1.columns]) + + def test_merge_index_singlekey_inner(self): + left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], + 'v1': np.random.randn(7)}) + right = DataFrame({'v2': 
np.random.randn(4)}, + index=['d', 'b', 'c', 'a']) + + # inner join + result = merge(left, right, left_on='key', right_index=True, + how='inner') + expected = left.join(right, on='key').ix[result.index] + assert_frame_equal(result, expected) + + result = merge(right, left, right_on='key', left_index=True, + how='inner') + expected = left.join(right, on='key').ix[result.index] + assert_frame_equal(result, expected.ix[:, result.columns]) + + def test_merge_misspecified(self): + self.assertRaises(Exception, merge, self.left, self.right, + left_index=True) + self.assertRaises(Exception, merge, self.left, self.right, + right_index=True) + + self.assertRaises(Exception, merge, self.left, self.left, + left_on='key', on='key') + + self.assertRaises(Exception, merge, self.df, self.df2, + left_on=['key1'], right_on=['key1', 'key2']) + + def test_merge_overlap(self): + merged = merge(self.left, self.left, on='key') + exp_len = (self.left['key'].value_counts() ** 2).sum() + self.assertEqual(len(merged), exp_len) + self.assertIn('v1_x', merged) + self.assertIn('v1_y', merged) + + def test_merge_different_column_key_names(self): + left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], + 'value': [1, 2, 3, 4]}) + right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'], + 'value': [5, 6, 7, 8]}) + + merged = left.merge(right, left_on='lkey', right_on='rkey', + how='outer', sort=True) + + assert_almost_equal(merged['lkey'], + ['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan]) + assert_almost_equal(merged['rkey'], + ['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux']) + assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan]) + assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7]) + + def test_merge_copy(self): + left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) + right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) + + merged = merge(left, right, left_index=True, + right_index=True, copy=True) + + merged['a'] = 6 + self.assertTrue((left['a'] == 0).all()) + + merged['d'] = 'peekaboo' + self.assertTrue((right['d'] == 'bar').all()) + + def test_merge_nocopy(self): + left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) + right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) + + merged = merge(left, right, left_index=True, + right_index=True, copy=False) + + merged['a'] = 6 + self.assertTrue((left['a'] == 6).all()) + + merged['d'] = 'peekaboo' + self.assertTrue((right['d'] == 'peekaboo').all()) + + def test_join_sort(self): + left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], + 'value': [1, 2, 3, 4]}) + right = DataFrame({'value2': ['a', 'b', 'c']}, + index=['bar', 'baz', 'foo']) + + joined = left.join(right, on='key', sort=True) + expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], + 'value': [2, 3, 1, 4], + 'value2': ['a', 'b', 'c', 'c']}, + index=[1, 2, 0, 3]) + assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on='key', sort=False) + self.assert_numpy_array_equal(joined.index, lrange(4)) + + def test_intelligently_handle_join_key(self): + # #733, be a bit more 1337 about not returning unconsolidated DataFrame + + left = DataFrame({'key': [1, 1, 2, 2, 3], + 'value': lrange(5)}, columns=['value', 'key']) + right = DataFrame({'key': [1, 1, 2, 3, 4, 5], + 'rvalue': lrange(6)}) + + joined = merge(left, right, on='key', how='outer') + expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.], + 'value': np.array([0, 0, 1, 1, 2, 3, 4, + np.nan, np.nan]), + 'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])}, + columns=['value', 
'key', 'rvalue']) + assert_frame_equal(joined, expected, check_dtype=False) + + self.assertTrue(joined._data.is_consolidated()) + + def test_handle_join_key_pass_array(self): + left = DataFrame({'key': [1, 1, 2, 2, 3], + 'value': lrange(5)}, columns=['value', 'key']) + right = DataFrame({'rvalue': lrange(6)}) + key = np.array([1, 1, 2, 3, 4, 5]) + + merged = merge(left, right, left_on='key', right_on=key, how='outer') + merged2 = merge(right, left, left_on=key, right_on='key', how='outer') + + assert_series_equal(merged['key'], merged2['key']) + self.assertTrue(merged['key'].notnull().all()) + self.assertTrue(merged2['key'].notnull().all()) + + left = DataFrame({'value': lrange(5)}, columns=['value']) + right = DataFrame({'rvalue': lrange(6)}) + lkey = np.array([1, 1, 2, 2, 3]) + rkey = np.array([1, 1, 2, 3, 4, 5]) + + merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') + self.assert_numpy_array_equal(merged['key_0'], + np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])) + + left = DataFrame({'value': lrange(3)}) + right = DataFrame({'rvalue': lrange(6)}) + + key = np.array([0, 1, 1, 2, 2, 3]) + merged = merge(left, right, left_index=True, right_on=key, how='outer') + self.assert_numpy_array_equal(merged['key_0'], key) + + def test_mixed_type_join_with_suffix(self): + # GH #916 + df = DataFrame(np.random.randn(20, 6), + columns=['a', 'b', 'c', 'd', 'e', 'f']) + df.insert(0, 'id', 0) + df.insert(5, 'dt', 'foo') + + grouped = df.groupby('id') + mn = grouped.mean() + cn = grouped.count() + + # it works! + mn.join(cn, rsuffix='_right') + + def test_no_overlap_more_informative_error(self): + dt = datetime.now() + df1 = DataFrame({'x': ['a']}, index=[dt]) + + df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + self.assertRaises(MergeError, merge, df1, df2) + + def test_merge_non_unique_indexes(self): + + dt = datetime(2012, 5, 1) + dt2 = datetime(2012, 5, 2) + dt3 = datetime(2012, 5, 3) + dt4 = datetime(2012, 5, 4) + + df1 = DataFrame({'x': ['a']}, index=[dt]) + df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + _check_merge(df1, df2) + + # Not monotonic + df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4]) + df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']}, + index=[dt3, dt3, dt2, dt2, dt, dt]) + _check_merge(df1, df2) + + df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt]) + df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt]) + _check_merge(df1, df2) + + def test_merge_non_unique_index_many_to_many(self): + dt = datetime(2012, 5, 1) + dt2 = datetime(2012, 5, 2) + dt3 = datetime(2012, 5, 3) + df1 = DataFrame({'x': ['a', 'b', 'c', 'd']}, + index=[dt2, dt2, dt, dt]) + df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']}, + index=[dt2, dt2, dt3, dt, dt]) + _check_merge(df1, df2) + + def test_left_merge_empty_dataframe(self): + left = DataFrame({'key': [1], 'value': [2]}) + right = DataFrame({'key': []}) + + result = merge(left, right, on='key', how='left') + assert_frame_equal(result, left) + + result = merge(right, left, on='key', how='right') + assert_frame_equal(result, left) + + def test_merge_nosort(self): + # #2098, anything to do? 
+ + from datetime import datetime + + d = {"var1": np.random.randint(0, 10, size=10), + "var2": np.random.randint(0, 10, size=10), + "var3": [datetime(2012, 1, 12), datetime(2011, 2, 4), + datetime( + 2010, 2, 3), datetime(2012, 1, 12), + datetime( + 2011, 2, 4), datetime(2012, 4, 3), + datetime( + 2012, 3, 4), datetime(2008, 5, 1), + datetime(2010, 2, 3), datetime(2012, 2, 3)]} + df = DataFrame.from_dict(d) + var3 = df.var3.unique() + var3.sort() + new = DataFrame.from_dict({"var3": var3, + "var8": np.random.random(7)}) + + result = df.merge(new, on="var3", sort=False) + exp = merge(df, new, on='var3', sort=False) + assert_frame_equal(result, exp) + + self.assertTrue((df.var3.unique() == result.var3.unique()).all()) + + def test_merge_nan_right(self): + df1 = DataFrame({"i1" : [0, 1], "i2" : [0, 1]}) + df2 = DataFrame({"i1" : [0], "i3" : [0]}) + result = df1.join(df2, on="i1", rsuffix="_") + expected = DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1}, + 'i1_': {0: 0, 1: np.nan}, 'i3': {0: 0.0, 1: np.nan}, + None: {0: 0, 1: 0}}).set_index(None).reset_index()[['i1', 'i2', 'i1_', 'i3']] + assert_frame_equal(result, expected, check_dtype=False) + + df1 = DataFrame({"i1" : [0, 1], "i2" : [0.5, 1.5]}) + df2 = DataFrame({"i1" : [0], "i3" : [0.7]}) + result = df1.join(df2, rsuffix="_", on='i1') + expected = DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan}, + 'i2': {0: 0.5, 1: 1.5}, 'i3': {0: 0.69999999999999996, + 1: nan}})[['i1', 'i2', 'i1_', 'i3']] + assert_frame_equal(result, expected) + + + def test_append_dtype_coerce(self): + + # GH 4993 + # appending with datetime will incorrectly convert datetime64 + import datetime as dt + from pandas import NaT + + df1 = DataFrame(index=[1,2], data=[dt.datetime(2013,1,1,0,0), + dt.datetime(2013,1,2,0,0)], + columns=['start_time']) + df2 = DataFrame(index=[4,5], data=[[dt.datetime(2013,1,3,0,0), + dt.datetime(2013,1,3,6,10)], + [dt.datetime(2013,1,4,0,0), + dt.datetime(2013,1,4,7,10)]], + columns=['start_time','end_time']) + + expected = concat([ + Series([NaT,NaT,dt.datetime(2013,1,3,6,10),dt.datetime(2013,1,4,7,10)],name='end_time'), + Series([dt.datetime(2013,1,1,0,0),dt.datetime(2013,1,2,0,0),dt.datetime(2013,1,3,0,0),dt.datetime(2013,1,4,0,0)],name='start_time'), + ],axis=1) + result = df1.append(df2,ignore_index=True) + assert_frame_equal(result, expected) + + def test_join_append_timedeltas(self): + + import datetime as dt + from pandas import NaT + + # timedelta64 issues with join/merge + # GH 5695 + tm._skip_if_not_numpy17_friendly() + + d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)} + df = DataFrame(columns=list('dt')) + df = df.append(d, ignore_index=True) + result = df.append(d, ignore_index=True) + expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56), + dt.datetime(2013, 11, 5, 5, 56) ], + 't': [ dt.timedelta(0, 22500), + dt.timedelta(0, 22500) ]}) + assert_frame_equal(result, expected) + + td = np.timedelta64(300000000) + lhs = DataFrame(Series([td,td],index=["A","B"])) + rhs = DataFrame(Series([td],index=["A"])) + + from pandas import NaT + result = lhs.join(rhs,rsuffix='r', how="left") + expected = DataFrame({ '0' : Series([td,td],index=list('AB')), '0r' : Series([td,NaT],index=list('AB')) }) + assert_frame_equal(result, expected) + + def test_overlapping_columns_error_message(self): + # #2649 + df = DataFrame({'key': [1, 2, 3], + 'v1': [4, 5, 6], + 'v2': [7, 8, 9]}) + df2 = DataFrame({'key': [1, 2, 3], + 'v1': [4, 5, 6], + 'v2': [7, 8, 9]}) + + df.columns = ['key', 'foo', 'foo'] + df2.columns = 
['key', 'bar', 'bar'] + + self.assertRaises(Exception, merge, df, df2) + +def _check_merge(x, y): + for how in ['inner', 'left', 'outer']: + result = x.join(y, how=how) + + expected = merge(x.reset_index(), y.reset_index(), how=how, + sort=True) + expected = expected.set_index('index') + + assert_frame_equal(result, expected, check_names=False) # TODO check_names on merge? + + +class TestMergeMulti(tm.TestCase): + + def setUp(self): + self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + self.data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + def test_merge_on_multikey(self): + joined = self.data.join(self.to_join, on=['key1', 'key2']) + + join_key = Index(lzip(self.data['key1'], self.data['key2'])) + indexer = self.to_join.index.get_indexer(join_key) + ex_values = self.to_join.values.take(indexer, axis=0) + ex_values[indexer == -1] = np.nan + expected = self.data.join(DataFrame(ex_values, + columns=self.to_join.columns)) + + # TODO: columns aren't in the same order yet + assert_frame_equal(joined, expected.ix[:, joined.columns]) + + def test_merge_right_vs_left(self): + # compare left vs right merge with multikey + merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'], + right_index=True, how='left') + merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'], + left_index=True, how='right') + merged2 = merged2.ix[:, merged1.columns] + assert_frame_equal(merged1, merged2) + + def test_compress_group_combinations(self): + + # ~ 40000000 possible unique groups + key1 = np.array([rands(10) for _ in range(10000)], dtype='O') + key1 = np.tile(key1, 2) + key2 = key1[::-1] + + df = DataFrame({'key1': key1, 'key2': key2, + 'value1': np.random.randn(20000)}) + + df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], + 'value2': np.random.randn(10000)}) + + # just to hit the label compression code path + merged = merge(df, df2, how='outer') + + def test_left_join_index_preserve_order(self): + + left = DataFrame({'k1': [0, 1, 2] * 8, + 'k2': ['foo', 'bar'] * 12, + 'v': np.array(np.arange(24),dtype=np.int64) }) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': [5, 7]}, index=index) + + result = left.join(right, on=['k1', 'k2']) + + expected = left.copy() + expected['v2'] = np.nan + expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5 + expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7 + + tm.assert_frame_equal(result, expected) + + # test join with multi dtypes blocks + left = DataFrame({'k1': [0, 1, 2] * 8, + 'k2': ['foo', 'bar'] * 12, + 'k3' : np.array([0, 1, 2]*8, dtype=np.float32), + 'v': np.array(np.arange(24),dtype=np.int32) }) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': [5, 7]}, index=index) + + result = left.join(right, on=['k1', 'k2']) + + expected = left.copy() + expected['v2'] = np.nan + expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5 + expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7 + + tm.assert_frame_equal(result, expected) 
+ + # do a right join for an extra test + joined = merge(right, left, left_index=True, + right_on=['k1', 'k2'], how='right') + tm.assert_frame_equal(joined.ix[:, expected.columns], expected) + + def test_join_multi_dtypes(self): + + # test with multi dtypes in the join index + def _test(dtype1,dtype2): + left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1), + 'k2': ['foo', 'bar'] * 12, + 'v': np.array(np.arange(24),dtype=np.int64) }) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index) + + result = left.join(right, on=['k1', 'k2']) + + expected = left.copy() + + if dtype2.kind == 'i': + dtype2 = np.dtype('float64') + expected['v2'] = np.array(np.nan,dtype=dtype2) + expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5 + expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7 + + tm.assert_frame_equal(result, expected) + + for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]: + for d2 in [np.int64,np.float64,np.float32,np.float16]: + _test(np.dtype(d1),np.dtype(d2)) + + def test_left_merge_na_buglet(self): + left = DataFrame({'id': list('abcde'), 'v1': randn(5), + 'v2': randn(5), 'dummy': list('abcde'), + 'v3': randn(5)}, + columns=['id', 'v1', 'v2', 'dummy', 'v3']) + right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], + 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) + + merged = merge(left, right, on='id', how='left') + + rdf = right.drop(['id'], axis=1) + expected = left.join(rdf) + tm.assert_frame_equal(merged, expected) + + def test_merge_na_keys(self): + data = [[1950, "A", 1.5], + [1950, "B", 1.5], + [1955, "B", 1.5], + [1960, "B", np.nan], + [1970, "B", 4.], + [1950, "C", 4.], + [1960, "C", np.nan], + [1965, "C", 3.], + [1970, "C", 4.]] + + frame = DataFrame(data, columns=["year", "panel", "data"]) + + other_data = [[1960, 'A', np.nan], + [1970, 'A', np.nan], + [1955, 'A', np.nan], + [1965, 'A', np.nan], + [1965, 'B', np.nan], + [1955, 'C', np.nan]] + other = DataFrame(other_data, columns=['year', 'panel', 'data']) + + result = frame.merge(other, how='outer') + + expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') + expected = expected.replace(-999, np.nan) + + tm.assert_frame_equal(result, expected) + + def test_int64_overflow_issues(self): + # #2690, combinatorial explosion + df1 = DataFrame(np.random.randn(1000, 7), + columns=list('ABCDEF') + ['G1']) + df2 = DataFrame(np.random.randn(1000, 7), + columns=list('ABCDEF') + ['G2']) + + # it works! 
+ result = merge(df1, df2, how='outer') + self.assertTrue(len(result) == 2000) + + def test_join_multi_levels(self): + + # GH 3662 + # merge multi-levels + + household = DataFrame(dict(household_id = [1,2,3], + male = [0,1,0], + wealth = [196087.3,316478.7,294750]), + columns = ['household_id','male','wealth']).set_index('household_id') + portfolio = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000289783","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + name = ["ABN Amro","Robeco","Royal Dutch Shell","Royal Dutch Shell","AAB Eastern Europe Equity Fund","Postbank BioTech Fonds",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','name','share']).set_index(['household_id','asset_id']) + result = household.join(portfolio, how='inner') + expected = DataFrame(dict(male = [0,1,1,0,0,0], + wealth = [ 196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0 ], + name = ['ABN Amro','Robeco','Royal Dutch Shell','Royal Dutch Shell','AAB Eastern Europe Equity Fund','Postbank BioTech Fonds'], + share = [1.00,0.40,0.60,0.15,0.60,0.25], + household_id = [1,2,2,3,3,3], + asset_id = ['nl0000301109','nl0000289783','gb00b03mlx29','gb00b03mlx29','lu0197800237','nl0000289965']), + ).set_index(['household_id','asset_id']).reindex(columns=['male','wealth','name','share']) + assert_frame_equal(result,expected) + + assert_frame_equal(result,expected) + + # equivalency + result2 = merge(household.reset_index(),portfolio.reset_index(),on=['household_id'],how='inner').set_index(['household_id','asset_id']) + assert_frame_equal(result2,expected) + + result = household.join(portfolio, how='outer') + expected = concat([expected,DataFrame(dict(share = [1.00]), + index=MultiIndex.from_tuples([(4,np.nan)], + names=['household_id','asset_id']))], + axis=0).reindex(columns=expected.columns) + assert_frame_equal(result,expected) + + # invalid cases + household.index.name = 'foo' + def f(): + household.join(portfolio, how='inner') + self.assertRaises(ValueError, f) + + portfolio2 = portfolio.copy() + portfolio2.index.set_names(['household_id','foo']) + def f(): + portfolio2.join(portfolio, how='inner') + self.assertRaises(ValueError, f) + + def test_join_multi_levels2(self): + + # some more advanced merges + # GH6360 + household = DataFrame(dict(household_id = [1,2,2,3,3,3,4], + asset_id = ["nl0000301109","nl0000301109","gb00b03mlx29","gb00b03mlx29","lu0197800237","nl0000289965",np.nan], + share = [1.0,0.4,0.6,0.15,0.6,0.25,1.0]), + columns = ['household_id','asset_id','share']).set_index(['household_id','asset_id']) + + log_return = DataFrame(dict( + asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"], + t = [233, 234, 235, 180, 181], + log_return = [.09604978, -.06524096, .03532373, .03025441, .036997] + )).set_index(["asset_id","t"]) + + expected = DataFrame(dict( + household_id = [2, 2, 2, 3, 3, 3, 3, 3], + asset_id = ["gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237"], + t = [233, 234, 235, 233, 234, 235, 180, 181], + share = [0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], + log_return = [.09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997] + )).set_index(["household_id", "asset_id", "t"]).reindex(columns=['share','log_return']) + + def f(): + household.join(log_return, how='inner') + self.assertRaises(NotImplementedError, f) + + # this is the equivalency + result = 
merge(household.reset_index(),log_return.reset_index(),on=['asset_id'],how='inner').set_index(['household_id','asset_id','t']) + assert_frame_equal(result,expected) + + expected = DataFrame(dict( + household_id = [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + asset_id = ["nl0000301109", "nl0000289783", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237", "nl0000289965", None], + t = [None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None], + share = [1.0, 0.4, 0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6, 0.25, 1.0], + log_return = [None, None, .09604978, -.06524096, .03532373, .09604978, -.06524096, .03532373, .03025441, .036997, None, None] + )).set_index(["household_id", "asset_id", "t"]) + + def f(): + household.join(log_return, how='outer') + self.assertRaises(NotImplementedError, f) + +def _check_join(left, right, result, join_col, how='left', + lsuffix='_x', rsuffix='_y'): + + # some smoke tests + for c in join_col: + assert(result[c].notnull().all()) + + left_grouped = left.groupby(join_col) + right_grouped = right.groupby(join_col) + + for group_key, group in result.groupby(join_col): + l_joined = _restrict_to_columns(group, left.columns, lsuffix) + r_joined = _restrict_to_columns(group, right.columns, rsuffix) + + try: + lgroup = left_grouped.get_group(group_key) + except KeyError: + if how in ('left', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(l_joined, left.columns, join_col) + else: + _assert_same_contents(l_joined, lgroup) + + try: + rgroup = right_grouped.get_group(group_key) + except KeyError: + if how in ('right', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(r_joined, right.columns, join_col) + else: + _assert_same_contents(r_joined, rgroup) + + +def _restrict_to_columns(group, columns, suffix): + found = [c for c in group.columns + if c in columns or c.replace(suffix, '') in columns] + + # filter + group = group.ix[:, found] + + # get rid of suffixes, if any + group = group.rename(columns=lambda x: x.replace(suffix, '')) + + # put in the right order... + group = group.ix[:, columns] + + return group + + +def _assert_same_contents(join_chunk, source): + NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
+ + jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values + svalues = source.fillna(NA_SENTINEL).drop_duplicates().values + + rows = set(tuple(row) for row in jvalues) + assert(len(rows) == len(source)) + assert(all(tuple(row) in rows for row in svalues)) + + +def _assert_all_na(join_chunk, source_columns, join_col): + for c in source_columns: + if c in join_col: + continue + assert(join_chunk[c].isnull().all()) + + +def _join_by_hand(a, b, how='left'): + join_index = a.index.join(b.index, how=how) + + a_re = a.reindex(join_index) + b_re = b.reindex(join_index) + + result_columns = a.columns.append(b.columns) + + for col, s in compat.iteritems(b_re): + a_re[col] = s + return a_re.reindex(columns=result_columns) + + +class TestConcatenate(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.frame = DataFrame(tm.getSeriesData()) + self.mixed_frame = self.frame.copy() + self.mixed_frame['foo'] = 'bar' + + def test_append(self): + begin_index = self.frame.index[:5] + end_index = self.frame.index[5:] + + begin_frame = self.frame.reindex(begin_index) + end_frame = self.frame.reindex(end_index) + + appended = begin_frame.append(end_frame) + assert_almost_equal(appended['A'], self.frame['A']) + + del end_frame['A'] + partial_appended = begin_frame.append(end_frame) + self.assertIn('A', partial_appended) + + partial_appended = end_frame.append(begin_frame) + self.assertIn('A', partial_appended) + + # mixed type handling + appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) + assert_frame_equal(appended, self.mixed_frame) + + # what to test here + mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) + mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) + + # all equal except 'foo' column + assert_frame_equal( + mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), + mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) + + # append empty + empty = DataFrame({}) + + appended = self.frame.append(empty) + assert_frame_equal(self.frame, appended) + self.assertIsNot(appended, self.frame) + + appended = empty.append(self.frame) + assert_frame_equal(self.frame, appended) + self.assertIsNot(appended, self.frame) + + # overlap + self.assertRaises(ValueError, self.frame.append, self.frame, + verify_integrity=True) + + # new columns + # GH 6129 + df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) + row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') + expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {'x': 3, 'y': 4, 'z': 6}, 'c' : {'z' : 7}}) + result = df.append(row) + assert_frame_equal(result, expected) + + def test_append_length0_frame(self): + df = DataFrame(columns=['A', 'B', 'C']) + df3 = DataFrame(index=[0, 1], columns=['A', 'B']) + df5 = df.append(df3) + + expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) + assert_frame_equal(df5, expected) + + def test_append_records(self): + arr1 = np.zeros((2,), dtype=('i4,f4,a10')) + arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")] + + arr2 = np.zeros((3,), dtype=('i4,f4,a10')) + arr2[:] = [(3, 4., 'foo'), + (5, 6., "bar"), + (7., 8., 'baz')] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + assert_frame_equal(result, expected) + + def test_append_different_columns(self): + df = DataFrame({'bools': np.random.randn(10) > 0, + 'ints': np.random.randint(0, 10, 10), + 'floats': np.random.randn(10), + 'strings': ['foo', 'bar'] * 5}) + + a = df[:5].ix[:, ['bools', 'ints', 
'floats']] + b = df[5:].ix[:, ['strings', 'ints', 'floats']] + + appended = a.append(b) + self.assertTrue(isnull(appended['strings'][0:4]).all()) + self.assertTrue(isnull(appended['bools'][5:]).all()) + + def test_append_many(self): + chunks = [self.frame[:5], self.frame[5:10], + self.frame[10:15], self.frame[15:]] + + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result, self.frame) + + chunks[-1]['foo'] = 'bar' + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) + self.assertTrue((result['foo'][15:] == 'bar').all()) + self.assertTrue(result['foo'][:15].isnull().all()) + + def test_append_preserve_index_name(self): + # #980 + df1 = DataFrame(data=None, columns=['A', 'B', 'C']) + df1 = df1.set_index(['A']) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], + columns=['A', 'B', 'C']) + df2 = df2.set_index(['A']) + + result = df1.append(df2) + self.assertEqual(result.index.name, 'A') + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) + df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[['a', 'b']][:-2], + df[['c', 'd']][2:], df[['e', 'f']][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + # different join types + joined = df_list[0].join(df_list[1:], how='outer') + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how='inner') + _check_diff_index(df_list, joined, df.index[2:8]) + + self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) + df['key'] = ['foo', 'bar'] * 4 + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + + result = df1.join([df2, df3]) + assert_frame_equal(result, df) + + def test_append_missing_column_proper_upcast(self): + df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) + df2 = DataFrame({'B': np.array([True, False, True, False], + dtype=bool)}) + + appended = df1.append(df2, ignore_index=True) + self.assertEqual(appended['A'].dtype, 'f8') + self.assertEqual(appended['B'].dtype, 'O') + + def test_concat_with_group_keys(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + # axis=0 + df = DataFrame(np.random.randn(3, 4)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1]) + exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], + [0, 1, 2, 0, 1, 2, 3]]) + expected = DataFrame(np.r_[df.values, df2.values], + index=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1]) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], + index=exp_index2) + tm.assert_frame_equal(result, expected) + + # axis=1 + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df2.values], + columns=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1], axis=1) + expected = 
DataFrame(np.c_[df.values, df.values], + columns=exp_index2) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_specific_levels(self): + df = DataFrame(np.random.randn(10, 4)) + pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] + level = ['three', 'two', 'one', 'zero'] + result = concat(pieces, axis=1, keys=['one', 'two', 'three'], + levels=[level], + names=['group_key']) + + self.assert_numpy_array_equal(result.columns.levels[0], level) + self.assertEqual(result.columns.names[0], 'group_key') + + def test_concat_dataframe_keys_bug(self): + t1 = DataFrame({'value': Series([1, 2, 3], + index=Index(['a', 'b', 'c'], name='id'))}) + t2 = DataFrame({'value': Series([7, 8], + index=Index(['a', 'b'], name='id'))}) + + # it works + result = concat([t1, t2], axis=1, keys=['t1', 't2']) + self.assertEqual(list(result.columns), [('t1', 'value'), + ('t2', 'value')]) + + def test_concat_dict(self): + frames = {'foo': DataFrame(np.random.randn(4, 3)), + 'bar': DataFrame(np.random.randn(4, 3)), + 'baz': DataFrame(np.random.randn(4, 3)), + 'qux': DataFrame(np.random.randn(4, 3))} + + sorted_keys = sorted(frames) + + result = concat(frames) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) + tm.assert_frame_equal(result, expected) + + result = concat(frames, axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, + axis=1) + tm.assert_frame_equal(result, expected) + + keys = ['baz', 'foo', 'bar'] + result = concat(frames, keys=keys) + expected = concat([frames[k] for k in keys], keys=keys) + tm.assert_frame_equal(result, expected) + + def test_concat_ignore_index(self): + frame1 = DataFrame({"test1": ["a", "b", "c"], + "test2": [1, 2, 3], + "test3": [4.5, 3.2, 1.2]}) + frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) + frame1.index = Index(["x", "y", "z"]) + frame2.index = Index(["x", "y", "q"]) + + v1 = concat([frame1, frame2], axis=1, ignore_index=True) + + nan = np.nan + expected = DataFrame([[nan, nan, nan, 4.3], + ['a', 1, 4.5, 5.2], + ['b', 2, 3.2, 2.2], + ['c', 3, 1.2, nan]], + index=Index(["q", "x", "y", "z"])) + + tm.assert_frame_equal(v1, expected) + + def test_concat_multiindex_with_keys(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + result = concat([frame, frame], keys=[0, 1], names=['iteration']) + + self.assertEqual(result.index.names, ('iteration',) + index.names) + tm.assert_frame_equal(result.ix[0], frame) + tm.assert_frame_equal(result.ix[1], frame) + self.assertEqual(result.index.nlevels, 3) + + def test_concat_multiindex_with_tz(self): + # GH 6606 + df = DataFrame({'dt': [datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3)], + 'b': ['A', 'B', 'C'], + 'c': [1, 2, 3], 'd': [4, 5, 6]}) + df['dt'] = df['dt'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) + df = df.set_index(['dt', 'b']) + + exp_idx1 = pd.DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03'] * 2, + tz='US/Pacific', name='dt') + exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') + exp_idx = pd.MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, + index=exp_idx, columns=['c', 'd']) + + result = concat([df, df]) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_and_levels(self): + df = DataFrame(np.random.randn(1, 
3)) + df2 = DataFrame(np.random.randn(1, 4)) + + levels = [['foo', 'baz'], ['one', 'two']] + names = ['first', 'second'] + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels, + names=names) + expected = concat([df, df2, df, df2]) + exp_index = MultiIndex(levels=levels + [[0]], + labels=[[0, 0, 1, 1], [0, 1, 0, 1], + [0, 0, 0, 0]], + names=names + [None]) + expected.index = exp_index + + assert_frame_equal(result, expected) + + # no names + + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels) + self.assertEqual(result.index.names, (None,) * 3) + + # no levels + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + names=['first', 'second']) + self.assertEqual(result.index.names, ('first', 'second') + (None,)) + self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo']) + + def test_concat_keys_levels_no_overlap(self): + # GH #1406 + df = DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + + self.assertRaises(ValueError, concat, [df, df], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + self.assertRaises(ValueError, concat, [df, df2], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + def test_concat_rename_index(self): + a = DataFrame(np.random.rand(3, 3), + columns=list('ABC'), + index=Index(list('abc'), name='index_a')) + b = DataFrame(np.random.rand(3, 3), + columns=list('ABC'), + index=Index(list('abc'), name='index_b')) + + result = concat([a, b], keys=['key0', 'key1'], + names=['lvl0', 'lvl1']) + + exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) + names = list(exp.index.names) + names[1] = 'lvl1' + exp.index.set_names(names, inplace=True) + + tm.assert_frame_equal(result, exp) + self.assertEqual(result.index.names, exp.index.names) + + def test_crossed_dtypes_weird_corner(self): + columns = ['A', 'B', 'C', 'D'] + df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'), + 'B': np.array([1, 2, 3, 4], dtype='i8'), + 'C': np.array([1, 2, 3, 4], dtype='f8'), + 'D': np.array([1, 2, 3, 4], dtype='i8')}, + columns=columns) + + df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'), + 'B': np.array([1, 2, 3, 4], dtype='f8'), + 'C': np.array([1, 2, 3, 4], dtype='i8'), + 'D': np.array([1, 2, 3, 4], dtype='f8')}, + columns=columns) + + appended = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), + columns=columns) + tm.assert_frame_equal(appended, expected) + + df = DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + result = concat( + [df, df2], keys=['one', 'two'], names=['first', 'second']) + self.assertEqual(result.index.names, ('first', 'second')) + + def test_dups_index(self): + # GH 4771 + + # single dtypes + df = DataFrame(np.random.randint(0,10,size=40).reshape(10,4),columns=['A','A','C','C']) + + result = concat([df,df],axis=1) + assert_frame_equal(result.iloc[:,:4],df) + assert_frame_equal(result.iloc[:,4:],df) + + result = concat([df,df],axis=0) + assert_frame_equal(result.iloc[:10],df) + assert_frame_equal(result.iloc[10:],df) + + # multi dtypes + df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']), + DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])], + axis=1) + + result = concat([df,df],axis=1) + 
assert_frame_equal(result.iloc[:,:6],df) + assert_frame_equal(result.iloc[:,6:],df) + + result = concat([df,df],axis=0) + assert_frame_equal(result.iloc[:10],df) + assert_frame_equal(result.iloc[10:],df) + + # append + result = df.iloc[0:8,:].append(df.iloc[8:]) + assert_frame_equal(result, df) + + result = df.iloc[0:8,:].append(df.iloc[8:9]).append(df.iloc[9:10]) + assert_frame_equal(result, df) + + expected = concat([df,df],axis=0) + result = df.append(df) + assert_frame_equal(result, expected) + + def test_join_dups(self): + + # joining dups + df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']), + DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])], + axis=1) + + expected = concat([df,df],axis=1) + result = df.join(df,rsuffix='_2') + result.columns = expected.columns + assert_frame_equal(result, expected) + + # GH 4975, invalid join on dups + w = DataFrame(np.random.randn(4,2), columns=["x", "y"]) + x = DataFrame(np.random.randn(4,2), columns=["x", "y"]) + y = DataFrame(np.random.randn(4,2), columns=["x", "y"]) + z = DataFrame(np.random.randn(4,2), columns=["x", "y"]) + + dta = x.merge(y, left_index=True, right_index=True).merge(z, left_index=True, right_index=True, how="outer") + dta = dta.merge(w, left_index=True, right_index=True) + expected = concat([x,y,z,w],axis=1) + expected.columns=['x_x','y_x','x_y','y_y','x_x','y_x','x_y','y_y'] + assert_frame_equal(dta,expected) + + def test_handle_empty_objects(self): + df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) + + baz = df[:5] + baz['foo'] = 'bar' + empty = df[5:5] + + frames = [baz, empty, empty, df[5:]] + concatted = concat(frames, axis=0) + + expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']] + expected['foo'] = expected['foo'].astype('O') + expected['foo'][:5] = 'bar' + + tm.assert_frame_equal(concatted, expected) + + # empty as first element with time series + # GH3259 + df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) + empty = DataFrame() + result = concat([df,empty],axis=1) + assert_frame_equal(result, df) + result = concat([empty,df],axis=1) + assert_frame_equal(result, df) + + result = concat([df,empty]) + assert_frame_equal(result, df) + result = concat([empty,df]) + assert_frame_equal(result, df) + + def test_concat_mixed_objs(self): + + # concat mixed series/frames + # G2385 + + # axis 1 + index=date_range('01-Jan-2013', periods=10, freq='H') + arr = np.arange(10, dtype='int64') + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1,1), index=index) + + expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0]) + result = concat([df,df], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1]) + result = concat([s1,s2], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2]) + result = concat([s1,s2,s1], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3]) + result = concat([s1,df,s2,s2,s1], axis=1) + assert_frame_equal(result, expected) + + # with names + s1.name = 'foo' + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0]) + result = concat([s1,df,s2], axis=1) + assert_frame_equal(result, expected) + + s2.name = 'bar' + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), 
index=index, columns = ['foo', 0, 'bar']) + result = concat([s1,df,s2], axis=1) + assert_frame_equal(result, expected) + + # ignore index + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2]) + result = concat([s1,df,s2], axis=1, ignore_index=True) + assert_frame_equal(result, expected) + + # axis 0 + expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0]) + result = concat([s1,df,s2]) + assert_frame_equal(result, expected) + + expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0]) + result = concat([s1,df,s2], ignore_index=True) + assert_frame_equal(result, expected) + + # invalid concatente of mixed dims + panel = tm.makePanel() + self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1)) + + def test_panel_join(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[:2, :10, :3] + p2 = panel.ix[2:, 5:, 2:] + + # left join + result = p1.join(p2) + expected = p1.copy() + expected['ItemC'] = p2['ItemC'] + tm.assert_panel_equal(result, expected) + + # right join + result = p1.join(p2, how='right') + expected = p2.copy() + expected['ItemA'] = p1['ItemA'] + expected['ItemB'] = p1['ItemB'] + expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) + tm.assert_panel_equal(result, expected) + + # inner join + result = p1.join(p2, how='inner') + expected = panel.ix[:, 5:10, 2:3] + tm.assert_panel_equal(result, expected) + + # outer join + result = p1.join(p2, how='outer') + expected = p1.reindex(major=panel.major_axis, + minor=panel.minor_axis) + expected = expected.join(p2.reindex(major=panel.major_axis, + minor=panel.minor_axis)) + tm.assert_panel_equal(result, expected) + + def test_panel_join_overlap(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] + p2 = panel.ix[['ItemB', 'ItemC']] + + # Expected index is + # + # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 + joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') + p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') + p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') + no_overlap = panel.ix[['ItemA']] + expected = no_overlap.join(p1_suf.join(p2_suf)) + tm.assert_panel_equal(joined, expected) + + def test_panel_join_many(self): + tm.K = 10 + panel = tm.makePanel() + tm.K = 4 + + panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + + joined = panels[0].join(panels[1:]) + tm.assert_panel_equal(joined, panel) + + panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + + data_dict = {} + for p in panels: + data_dict.update(compat.iteritems(p)) + + joined = panels[0].join(panels[1:], how='inner') + expected = Panel.from_dict(data_dict, intersect=True) + tm.assert_panel_equal(joined, expected) + + joined = panels[0].join(panels[1:], how='outer') + expected = Panel.from_dict(data_dict, intersect=False) + tm.assert_panel_equal(joined, expected) + + # edge cases + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='outer', lsuffix='foo', rsuffix='bar') + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='right') + + def test_panel_concat_other_axes(self): + panel = tm.makePanel() + + p1 = panel.ix[:, :5, :] + p2 = panel.ix[:, 5:, :] + + result = concat([p1, p2], axis=1) + tm.assert_panel_equal(result, panel) + + p1 = panel.ix[:, :, :2] + p2 = panel.ix[:, :, 2:] + + result = concat([p1, p2], axis=2) + tm.assert_panel_equal(result, panel) + + # if things are a bit misbehaved + p1 = panel.ix[:2, :, :2] + p2 = panel.ix[:, :, 2:] + p1['ItemC'] = 'baz' + 
+ result = concat([p1, p2], axis=2) + + expected = panel.copy() + expected['ItemC'] = expected['ItemC'].astype('O') + expected.ix['ItemC', :, :2] = 'baz' + tm.assert_panel_equal(result, expected) + + def test_panel_concat_buglet(self): + # #2257 + def make_panel(): + index = 5 + cols = 3 + + def df(): + return DataFrame(np.random.randn(index, cols), + index=["I%s" % i for i in range(index)], + columns=["C%s" % i for i in range(cols)]) + return Panel(dict([("Item%s" % x, df()) for x in ['A', 'B', 'C']])) + + panel1 = make_panel() + panel2 = make_panel() + + panel2 = panel2.rename_axis(dict([(x, "%s_1" % x) + for x in panel2.major_axis]), + axis=1) + + panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1) + panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) + + # it works! + concat([panel1, panel3], axis=1, verify_integrity=True) + + def test_panel4d_concat(self): + p4d = tm.makePanel4D() + + p1 = p4d.ix[:, :, :5, :] + p2 = p4d.ix[:, :, 5:, :] + + result = concat([p1, p2], axis=2) + tm.assert_panel4d_equal(result, p4d) + + p1 = p4d.ix[:, :, :, :2] + p2 = p4d.ix[:, :, :, 2:] + + result = concat([p1, p2], axis=3) + tm.assert_panel4d_equal(result, p4d) + + def test_panel4d_concat_mixed_type(self): + p4d = tm.makePanel4D() + + # if things are a bit misbehaved + p1 = p4d.ix[:, :2, :, :2] + p2 = p4d.ix[:, :, :, 2:] + p1['L5'] = 'baz' + + result = concat([p1, p2], axis=3) + + p2['L5'] = np.nan + expected = concat([p1, p2], axis=3) + expected = expected.ix[result.labels] + + tm.assert_panel4d_equal(result, expected) + + def test_concat_series(self): + ts = tm.makeTimeSeries() + ts.name = 'foo' + + pieces = [ts[:5], ts[5:15], ts[15:]] + + result = concat(pieces) + tm.assert_series_equal(result, ts) + self.assertEqual(result.name, ts.name) + + result = concat(pieces, keys=[0, 1, 2]) + expected = ts.copy() + + ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) + + exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), + np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], + labels=exp_labels) + expected.index = exp_index + tm.assert_series_equal(result, expected) + + def test_concat_series_axis1(self): + ts = tm.makeTimeSeries() + + pieces = [ts[:-2], ts[2:], ts[2:-2]] + + result = concat(pieces, axis=1) + expected = DataFrame(pieces).T + assert_frame_equal(result, expected) + + result = concat(pieces, keys=['A', 'B', 'C'], axis=1) + expected = DataFrame(pieces, index=['A', 'B', 'C']).T + assert_frame_equal(result, expected) + + # preserve series names, #2489 + s = Series(randn(5), name='A') + s2 = Series(randn(5), name='B') + + result = concat([s, s2], axis=1) + expected = DataFrame({'A': s, 'B': s2}) + assert_frame_equal(result, expected) + + s2.name = None + result = concat([s, s2], axis=1) + self.assertTrue(np.array_equal(result.columns, lrange(2))) + + # must reindex, #2603 + s = Series(randn(3), index=['c', 'a', 'b'], name='A') + s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') + result = concat([s, s2], axis=1) + expected = DataFrame({'A': s, 'B': s2}) + assert_frame_equal(result, expected) + + def test_concat_single_with_key(self): + df = DataFrame(np.random.randn(10, 4)) + + result = concat([df], keys=['foo']) + expected = concat([df, df], keys=['foo', 'bar']) + tm.assert_frame_equal(result, expected[:10]) + + def test_concat_exclude_none(self): + df = DataFrame(np.random.randn(10, 4)) + + pieces = [df[:5], None, None, df[5:]] + result = concat(pieces) + tm.assert_frame_equal(result, df) + self.assertRaises(Exception, concat, 
[None, None]) + + def test_concat_datetime64_block(self): + from pandas.tseries.index import date_range + + rng = date_range('1/1/2000', periods=10) + + df = DataFrame({'time': rng}) + + result = concat([df, df]) + self.assertTrue((result.iloc[:10]['time'] == rng).all()) + self.assertTrue((result.iloc[10:]['time'] == rng).all()) + + def test_concat_timedelta64_block(self): + + # not friendly for < 1.7 + tm._skip_if_not_numpy17_friendly() + from pandas import to_timedelta + + rng = to_timedelta(np.arange(10),unit='s') + + df = DataFrame({'time': rng}) + + result = concat([df, df]) + self.assertTrue((result.iloc[:10]['time'] == rng).all()) + self.assertTrue((result.iloc[10:]['time'] == rng).all()) + + def test_concat_keys_with_none(self): + # #1649 + df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) + + result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) + expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) + tm.assert_frame_equal(result, expected) + + result = concat([None, df0, df0[:2], df0[:1], df0], + keys=['a', 'b', 'c', 'd', 'e']) + expected = concat([df0, df0[:2], df0[:1], df0], + keys=['b', 'c', 'd', 'e']) + tm.assert_frame_equal(result, expected) + + def test_concat_bug_1719(self): + ts1 = tm.makeTimeSeries() + ts2 = tm.makeTimeSeries()[::2] + + ## to join with union + ## these two are of different length! + left = concat([ts1, ts2], join='outer', axis=1) + right = concat([ts2, ts1], join='outer', axis=1) + + self.assertEqual(len(left), len(right)) + + def test_concat_bug_2972(self): + ts0 = Series(np.zeros(5)) + ts1 = Series(np.ones(5)) + ts0.name = ts1.name = 'same name' + result = concat([ts0, ts1], axis=1) + + expected = DataFrame({0: ts0, 1: ts1}) + expected.columns=['same name', 'same name'] + assert_frame_equal(result, expected) + + def test_concat_bug_3602(self): + + # GH 3602, duplicate columns + df1 = DataFrame({'firmNo' : [0,0,0,0], 'stringvar' : ['rrr', 'rrr', 'rrr', 'rrr'], 'prc' : [6,6,6,6] }) + df2 = DataFrame({'misc' : [1,2,3,4], 'prc' : [6,6,6,6], 'C' : [9,10,11,12]}) + expected = DataFrame([[0,6,'rrr',9,1,6], + [0,6,'rrr',10,2,6], + [0,6,'rrr',11,3,6], + [0,6,'rrr',12,4,6]]) + expected.columns = ['firmNo','prc','stringvar','C','misc','prc'] + + result = concat([df1,df2],axis=1) + assert_frame_equal(result,expected) + + def test_concat_series_axis1_same_names_ignore_index(self): + dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] + s1 = Series(randn(len(dates)), index=dates, name='value') + s2 = Series(randn(len(dates)), index=dates, name='value') + + result = concat([s1, s2], axis=1, ignore_index=True) + self.assertTrue(np.array_equal(result.columns, [0, 1])) + + def test_concat_invalid(self): + + # trying to concat a ndframe with a non-ndframe + df1 = mkdf(10, 2) + for obj in [1, dict(), [1, 2], (1, 2) ]: + self.assertRaises(TypeError, lambda x: concat([ df1, obj ])) + + def test_concat_invalid_first_argument(self): + df1 = mkdf(10, 2) + df2 = mkdf(10, 2) + self.assertRaises(TypeError, concat, df1, df2) + + # generator ok though + concat(DataFrame(np.random.rand(5,5)) for _ in range(3)) + + # text reader ok + # GH6583 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + reader = read_csv(StringIO(data), chunksize=1) + result = concat(reader, ignore_index=True) + expected = read_csv(StringIO(data)) + assert_frame_equal(result,expected) + +class TestOrderedMerge(tm.TestCase): + + def setUp(self): + self.left = DataFrame({'key': ['a', 'c', 
'e'], + 'lvalue': [1, 2., 3]}) + + self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], + 'rvalue': [1, 2, 3., 4]}) + + # GH #813 + + def test_basic(self): + result = ordered_merge(self.left, self.right, on='key') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1, nan, 2, nan, 3, nan], + 'rvalue': [nan, 1, 2, 3, nan, 4]}) + + assert_frame_equal(result, expected) + + def test_ffill(self): + result = ordered_merge( + self.left, self.right, on='key', fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1., 1, 2, 2, 3, 3.], + 'rvalue': [nan, 1, 2, 3, 3, 4]}) + assert_frame_equal(result, expected) + + def test_multigroup(self): + left = concat([self.left, self.left], ignore_index=True) + # right = concat([self.right, self.right], ignore_index=True) + + left['group'] = ['a'] * 3 + ['b'] * 3 + # right['group'] = ['a'] * 4 + ['b'] * 4 + + result = ordered_merge(left, self.right, on='key', left_by='group', + fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, + 'lvalue': [1., 1, 2, 2, 3, 3.] * 2, + 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) + expected['group'] = ['a'] * 6 + ['b'] * 6 + + assert_frame_equal(result, expected.ix[:, result.columns]) + + result2 = ordered_merge(self.right, left, on='key', right_by='group', + fill_method='ffill') + assert_frame_equal(result, result2.ix[:, result.columns]) + + result = ordered_merge(left, self.right, on='key', left_by='group') + self.assertTrue(result['group'].notnull().all()) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py new file mode 100644 index 00000000..a16df003 --- /dev/null +++ b/pandas/tools/tests/test_pivot.py @@ -0,0 +1,644 @@ +import datetime + +import numpy as np +from numpy.testing import assert_equal + +import pandas as pd +from pandas import DataFrame, Series, Index, MultiIndex, Grouper +from pandas.tools.merge import concat +from pandas.tools.pivot import pivot_table, crosstab +from pandas.compat import range, u, product +import pandas.util.testing as tm + + +class TestPivotTable(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_pivot_table(self): + index = ['A', 'B'] + columns = 'C' + table = pivot_table(self.data, values='D', index=index, columns=columns) + + table2 = self.data.pivot_table(values='D', index=index, columns=columns) + tm.assert_frame_equal(table, table2) + + # this works + pivot_table(self.data, values='D', index=index) + + if len(index) > 1: + self.assertEqual(table.index.names, tuple(index)) + else: + self.assertEqual(table.index.name, index[0]) + + if len(columns) > 1: + self.assertEqual(table.columns.names, columns) + else: + self.assertEqual(table.columns.name, columns[0]) + + expected = self.data.groupby(index + [columns])['D'].agg(np.mean).unstack() + tm.assert_frame_equal(table, expected) + + def test_pivot_table_warnings(self): + index = ['A', 'B'] + columns = 'C' + with tm.assert_produces_warning(FutureWarning): + table = 
pivot_table(self.data, values='D', rows=index, + cols=columns) + + with tm.assert_produces_warning(False): + table2 = pivot_table(self.data, values='D', index=index, + columns=columns) + + tm.assert_frame_equal(table, table2) + + def test_pivot_table_nocols(self): + df = DataFrame({'rows': ['a', 'b', 'c'], + 'cols': ['x', 'y', 'z'], + 'values': [1,2,3]}) + rs = df.pivot_table(columns='cols', aggfunc=np.sum) + xp = df.pivot_table(index='cols', aggfunc=np.sum).T + tm.assert_frame_equal(rs, xp) + + rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'}) + xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T + tm.assert_frame_equal(rs, xp) + + def test_pivot_table_dropna(self): + df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000}, + 'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'}, + 'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310}, + 'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'}, + 'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}}) + pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False) + pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False) + + m = MultiIndex.from_tuples([(u('A'), u('a')), + (u('A'), u('b')), + (u('A'), u('c')), + (u('A'), u('d')), + (u('B'), u('a')), + (u('B'), u('b')), + (u('B'), u('c')), + (u('B'), u('d')), + (u('C'), u('a')), + (u('C'), u('b')), + (u('C'), u('c')), + (u('C'), u('d'))]) + + assert_equal(pv_col.columns.values, m.values) + assert_equal(pv_ind.index.values, m.values) + + + def test_pass_array(self): + result = self.data.pivot_table('D', index=self.data.A, columns=self.data.C) + expected = self.data.pivot_table('D', index='A', columns='C') + tm.assert_frame_equal(result, expected) + + def test_pass_function(self): + result = self.data.pivot_table('D', index=lambda x: x // 5, + columns=self.data.C) + expected = self.data.pivot_table('D', index=self.data.index // 5, + columns='C') + tm.assert_frame_equal(result, expected) + + def test_pivot_table_multiple(self): + index = ['A', 'B'] + columns = 'C' + table = pivot_table(self.data, index=index, columns=columns) + expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() + tm.assert_frame_equal(table, expected) + + def test_pivot_dtypes(self): + + # can convert dtypes + f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']}) + self.assertEqual(f.dtypes['v'], 'int64') + + z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.sum) + result = z.get_dtype_counts() + expected = Series(dict(int64 = 2)) + tm.assert_series_equal(result, expected) + + # cannot convert dtypes + f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']}) + self.assertEqual(f.dtypes['v'], 'float64') + + z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.mean) + result = z.get_dtype_counts() + expected = Series(dict(float64 = 2)) + tm.assert_series_equal(result, expected) + + def test_pivot_multi_values(self): + result = pivot_table(self.data, values=['D', 'E'], + index='A', columns=['B', 'C'], fill_value=0) + expected = pivot_table(self.data.drop(['F'], axis=1), + index='A', columns=['B', 'C'], fill_value=0) + tm.assert_frame_equal(result, expected) + + def test_pivot_multi_functions(self): + f = lambda func: pivot_table(self.data, values=['D', 'E'], + index=['A', 'B'], columns='C', + aggfunc=func) + result = f([np.mean, np.std]) + means = f(np.mean) + stds = f(np.std) + expected = 
concat([means, stds], keys=['mean', 'std'], axis=1) + tm.assert_frame_equal(result, expected) + + # margins not supported?? + f = lambda func: pivot_table(self.data, values=['D', 'E'], + index=['A', 'B'], columns='C', + aggfunc=func, margins=True) + result = f([np.mean, np.std]) + means = f(np.mean) + stds = f(np.std) + expected = concat([means, stds], keys=['mean', 'std'], axis=1) + tm.assert_frame_equal(result, expected) + + def test_pivot_index_with_nan(self): + # GH 3588 + nan = np.nan + df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]}) + result = df.pivot('a','b','c') + expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan], + [nan,nan,nan,nan],[nan,nan,15,20]], + index = Index(['R1','R2',nan,'R4'],name='a'), + columns = Index(['C1','C2','C3','C4'],name='b')) + tm.assert_frame_equal(result, expected) + + def test_pivot_with_tz(self): + # GH 5878 + df = DataFrame({'dt1': [datetime.datetime(2013, 1, 1, 9, 0), + datetime.datetime(2013, 1, 2, 9, 0), + datetime.datetime(2013, 1, 1, 9, 0), + datetime.datetime(2013, 1, 2, 9, 0)], + 'dt2': [datetime.datetime(2014, 1, 1, 9, 0), + datetime.datetime(2014, 1, 1, 9, 0), + datetime.datetime(2014, 1, 2, 9, 0), + datetime.datetime(2014, 1, 2, 9, 0)], + 'data1': np.arange(4,dtype='int64'), + 'data2': np.arange(4,dtype='int64')}) + + df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) + df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) + + exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) + exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'] * 2, + name='dt2', tz='Asia/Tokyo') + exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) + expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], + index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], + name='dt1', tz='US/Pacific'), + columns=exp_col) + + pv = df.pivot(index='dt1', columns='dt2') + tm.assert_frame_equal(pv, expected) + + expected = DataFrame([[0, 2], [1, 3]], + index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], + name='dt1', tz='US/Pacific'), + columns=pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'], + name='dt2', tz='Asia/Tokyo')) + + pv = df.pivot(index='dt1', columns='dt2', values='data1') + tm.assert_frame_equal(pv, expected) + + def test_margins(self): + def _check_output(res, col, index=['A', 'B'], columns=['C']): + cmarg = res['All'][:-1] + exp = self.data.groupby(index)[col].mean() + tm.assert_series_equal(cmarg, exp) + + res = res.sortlevel() + rmarg = res.xs(('All', ''))[:-1] + exp = self.data.groupby(columns)[col].mean() + tm.assert_series_equal(rmarg, exp) + + gmarg = res['All']['All', ''] + exp = self.data[col].mean() + self.assertEqual(gmarg, exp) + + # column specified + table = self.data.pivot_table('D', index=['A', 'B'], columns='C', + margins=True, aggfunc=np.mean) + _check_output(table, 'D') + + # no column specified + table = self.data.pivot_table(index=['A', 'B'], columns='C', + margins=True, aggfunc=np.mean) + for valcol in table.columns.levels[0]: + _check_output(table[valcol], valcol) + + # no col + + # to help with a buglet + self.data.columns = [k * 2 for k in self.data.columns] + table = self.data.pivot_table(index=['AA', 'BB'], margins=True, + aggfunc=np.mean) + for valcol in table.columns: + gmarg = table[valcol]['All', ''] + self.assertEqual(gmarg, self.data[valcol].mean()) + + # this is OK + table = self.data.pivot_table(index=['AA', 'BB'], margins=True, + aggfunc='mean') + + # no rows + rtable = 
self.data.pivot_table(columns=['AA', 'BB'], margins=True, + aggfunc=np.mean) + tm.assert_isinstance(rtable, Series) + for item in ['DD', 'EE', 'FF']: + gmarg = table[item]['All', ''] + self.assertEqual(gmarg, self.data[item].mean()) + + def test_pivot_integer_columns(self): + # caused by upstream bug in unstack + + d = datetime.date.min + data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], + [d + datetime.timedelta(i) for i in range(20)], [1.0])) + df = DataFrame(data) + table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) + + df2 = df.rename(columns=str) + table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2']) + + tm.assert_frame_equal(table, table2, check_names=False) + + def test_pivot_no_level_overlap(self): + # GH #1181 + + data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2, + 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2, + 'c': (['foo'] * 4 + ['bar'] * 4) * 2, + 'value': np.random.randn(16)}) + + table = data.pivot_table('value', index='a', columns=['b', 'c']) + + grouped = data.groupby(['a', 'b', 'c'])['value'].mean() + expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all') + tm.assert_frame_equal(table, expected) + + def test_pivot_columns_lexsorted(self): + + n = 10000 + + dtype = np.dtype([ + ("Index", object), + ("Symbol", object), + ("Year", int), + ("Month", int), + ("Day", int), + ("Quantity", int), + ("Price", float), + ]) + + products = np.array([ + ('SP500', 'ADBE'), + ('SP500', 'NVDA'), + ('SP500', 'ORCL'), + ('NDQ100', 'AAPL'), + ('NDQ100', 'MSFT'), + ('NDQ100', 'GOOG'), + ('FTSE', 'DGE.L'), + ('FTSE', 'TSCO.L'), + ('FTSE', 'GSK.L'), + ], dtype=[('Index', object), ('Symbol', object)]) + items = np.empty(n, dtype=dtype) + iproduct = np.random.randint(0, len(products), n) + items['Index'] = products['Index'][iproduct] + items['Symbol'] = products['Symbol'][iproduct] + dr = pd.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31)) + dates = dr[np.random.randint(0, len(dr), n)] + items['Year'] = dates.year + items['Month'] = dates.month + items['Day'] = dates.day + items['Price'] = np.random.lognormal(4.0, 2.0, n) + + df = DataFrame(items) + + pivoted = df.pivot_table('Price', index=['Month', 'Day'], + columns=['Index', 'Symbol', 'Year'], + aggfunc='mean') + + self.assertTrue(pivoted.columns.is_monotonic) + + def test_pivot_complex_aggfunc(self): + f = {'D': ['std'], 'E': ['sum']} + expected = self.data.groupby(['A', 'B']).agg(f).unstack('B') + result = self.data.pivot_table(index='A', columns='B', aggfunc=f) + + tm.assert_frame_equal(result, expected) + + def test_margins_no_values_no_cols(self): + # Regression test on pivot table: no values or cols passed. 
+ result = self.data[['A', 'B']].pivot_table(index=['A', 'B'], aggfunc=len, margins=True) + result_list = result.tolist() + self.assertEqual(sum(result_list[:-1]), result_list[-1]) + + def test_margins_no_values_two_rows(self): + # Regression test on pivot table: no values passed but rows are a multi-index + result = self.data[['A', 'B', 'C']].pivot_table(index=['A', 'B'], columns='C', aggfunc=len, margins=True) + self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) + + def test_margins_no_values_one_row_one_col(self): + # Regression test on pivot table: no values passed but row and col defined + result = self.data[['A', 'B']].pivot_table(index='A', columns='B', aggfunc=len, margins=True) + self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0]) + + def test_margins_no_values_two_row_two_cols(self): + # Regression test on pivot table: no values passed but rows and cols are multi-indexed + self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'] + result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) + self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) + + def test_pivot_timegrouper(self): + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], + 'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), + datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), + datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date') + + expected = DataFrame(np.array([10, 18, 3],dtype='int64').reshape(1, 3), + index=[datetime.datetime(2013, 12, 31)], + columns='Carl Joe Mark'.split()) + expected.index.name = 'Date' + expected.columns.name = 'Buyer' + + result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer', + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result,expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result,expected.T) + + expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), + index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)], + columns='Carl Joe Mark'.split()) + expected.index.name = 'Date' + expected.columns.name = 'Buyer' + + result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer', + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + # passing the name + df = df.reset_index() + result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer', + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'), + columns='Buyer', values='Quantity', aggfunc=np.sum)) + self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer', + columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum)) + + # passing the level + df = df.set_index('Date') + result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer', + 
values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'), + columns='Buyer', values='Quantity', aggfunc=np.sum)) + self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer', + columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum)) + + # double grouper + df = DataFrame({ + 'Branch' : 'A A A A A A A B'.split(), + 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), + 'Quantity': [1,3,5,1,8,1,9,3], + 'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5), + datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0), + datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)], + 'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5), + datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0), + datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0), + datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]}) + + result = pivot_table(df, index=Grouper(freq='M', key='Date'), + columns=Grouper(freq='M', key='PayDay'), + values='Quantity', aggfunc=np.sum) + expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, + np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4), + index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), + datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)], + columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), + datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)]) + expected.index.name = 'Date' + expected.columns.name = 'PayDay' + + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index=Grouper(freq='M', key='PayDay'), + columns=Grouper(freq='M', key='Date'), + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)), + (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)), + (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)), + (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)), + (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)), + (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),] + idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay']) + expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan, + 9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2), + index=idx, columns=['A', 'B']) + expected.columns.name = 'Branch' + + result = pivot_table(df, index=[Grouper(freq='M', key='Date'), + Grouper(freq='M', key='PayDay')], columns=['Branch'], + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected) + + result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'), + Grouper(freq='M', key='PayDay')], + values='Quantity', aggfunc=np.sum) + tm.assert_frame_equal(result, expected.T) + + def test_pivot_datetime_tz(self): + dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', + '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] + dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 
15:00:00', + '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] + df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], + 'dt1': dates1, 'dt2': dates2, + 'value1': np.arange(6,dtype='int64'), 'value2': [1, 2] * 3}) + df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) + df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) + + exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1') + exp_col1 = Index(['value1', 'value1']) + exp_col2 = Index(['a', 'b'], name='label') + exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) + expected = DataFrame([[0, 3], [1, 4], [2, 5]], + index=exp_idx, columns=exp_col) + result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1']) + tm.assert_frame_equal(result, expected) + + + exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean']) + exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2) + exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4, + tz='Asia/Tokyo', name='dt2') + exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) + expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], + [1, 4, 2, 1, 1, 4, 2, 1], + [2, 5, 1, 2, 2, 5, 1, 2]], dtype='int64'), + index=exp_idx, + columns=exp_col) + + result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'], + aggfunc=[np.sum, np.mean]) + tm.assert_frame_equal(result, expected) + + +class TestCrosstab(tm.TestCase): + + def setUp(self): + df = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + self.df = df.append(df, ignore_index=True) + + def test_crosstab_single(self): + df = self.df + result = crosstab(df['A'], df['C']) + expected = df.groupby(['A', 'C']).size().unstack() + tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) + + def test_crosstab_multiple(self): + df = self.df + + result = crosstab(df['A'], [df['B'], df['C']]) + expected = df.groupby(['A', 'B', 'C']).size() + expected = expected.unstack( + 'B').unstack('C').fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + result = crosstab([df['B'], df['C']], df['A']) + expected = df.groupby(['B', 'C', 'A']).size() + expected = expected.unstack('A').fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + def test_crosstab_ndarray(self): + a = np.random.randint(0, 5, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 10, size=100) + + df = DataFrame({'a': a, 'b': b, 'c': c}) + + result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')) + expected = crosstab(df['a'], [df['b'], df['c']]) + tm.assert_frame_equal(result, expected) + + result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')) + expected = crosstab([df['b'], df['c']], df['a']) + tm.assert_frame_equal(result, expected) + + # assign arbitrary names + result = crosstab(self.df['A'].values, self.df['C'].values) + self.assertEqual(result.index.name, 'row_0') + self.assertEqual(result.columns.name, 'col_0') + + def test_crosstab_margins(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = 
np.random.randint(0, 5, size=100) + + df = DataFrame({'a': a, 'b': b, 'c': c}) + + result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), + margins=True) + + self.assertEqual(result.index.names, ('a',)) + self.assertEqual(result.columns.names, ['b', 'c']) + + all_cols = result['All', ''] + exp_cols = df.groupby(['a']).size().astype('i8') + exp_cols = exp_cols.append(Series([len(df)], index=['All'])) + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.ix['All'] + exp_rows = df.groupby(['b', 'c']).size().astype('i8') + exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')])) + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + def test_crosstab_pass_values(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + values = np.random.randn(100) + + table = crosstab([a, b], c, values, aggfunc=np.sum, + rownames=['foo', 'bar'], colnames=['baz']) + + df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values}) + + expected = df.pivot_table('values', index=['foo', 'bar'], columns='baz', + aggfunc=np.sum) + tm.assert_frame_equal(table, expected) + + def test_crosstab_dropna(self): + # GH 3820 + a = np.array(['foo', 'foo', 'foo', 'bar', 'bar', 'foo', 'foo'], dtype=object) + b = np.array(['one', 'one', 'two', 'one', 'two', 'two', 'two'], dtype=object) + c = np.array(['dull', 'dull', 'dull', 'dull', 'dull', 'shiny', 'shiny'], dtype=object) + res = crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False) + m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), + ('two', 'dull'), ('two', 'shiny')]) + assert_equal(res.columns.values, m.values) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py new file mode 100644 index 00000000..78c8201f --- /dev/null +++ b/pandas/tools/tests/test_tile.py @@ -0,0 +1,241 @@ +import os +import nose + +import numpy as np +from pandas.compat import zip + +from pandas import DataFrame, Series, unique +import pandas.util.testing as tm +from pandas.util.testing import assertRaisesRegexp +import pandas.core.common as com + +from pandas.core.algorithms import quantile +from pandas.tools.tile import cut, qcut +import pandas.tools.tile as tmod + +from numpy.testing import assert_equal, assert_almost_equal + + +class TestCut(tm.TestCase): + + def test_simple(self): + data = np.ones(5) + result = cut(data, 4, labels=False) + desired = [1, 1, 1, 1, 1] + assert_equal(result, desired) + + def test_bins(self): + data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) + result, bins = cut(data, 3, retbins=True) + assert_equal(result.labels, [0, 0, 0, 1, 2, 0]) + assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + + def test_right(self): + data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + result, bins = cut(data, 4, right=True, retbins=True) + assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 0]) + assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) + + def test_noright(self): + data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + result, bins = cut(data, 4, right=False, retbins=True) + assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 1]) + assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) + + def test_arraylike(self): + data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + result, bins = 
cut(data, 3, retbins=True) + assert_equal(result.labels, [0, 0, 0, 1, 2, 0]) + assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + + def test_bins_not_monotonic(self): + data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + self.assertRaises(ValueError, cut, data, [0.1, 1.5, 1, 10]) + + def test_wrong_num_labels(self): + data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + self.assertRaises(ValueError, cut, data, [0, 1, 10], + labels=['foo', 'bar', 'baz']) + + def test_cut_corner(self): + # h3h + self.assertRaises(ValueError, cut, [], 2) + + self.assertRaises(ValueError, cut, [1, 2, 3], 0.5) + + def test_cut_out_of_range_more(self): + # #1511 + s = Series([0, -1, 0, 1, -3]) + ind = cut(s, [0, 1], labels=False) + exp = [np.nan, np.nan, np.nan, 0, np.nan] + assert_almost_equal(ind, exp) + + def test_labels(self): + arr = np.tile(np.arange(0, 1.01, 0.1), 4) + + result, bins = cut(arr, 4, retbins=True) + ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', + '(0.75, 1]'] + self.assert_numpy_array_equal(result.levels, ex_levels) + + result, bins = cut(arr, 4, retbins=True, right=False) + ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', + '[0.75, 1.001)'] + self.assert_numpy_array_equal(result.levels, ex_levels) + + def test_cut_pass_series_name_to_factor(self): + s = Series(np.random.randn(100), name='foo') + + factor = cut(s, 4) + self.assertEqual(factor.name, 'foo') + + def test_label_precision(self): + arr = np.arange(0, 0.73, 0.01) + + result = cut(arr, 4, precision=2) + ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', + '(0.54, 0.72]'] + self.assert_numpy_array_equal(result.levels, ex_levels) + + def test_na_handling(self): + arr = np.arange(0, 0.75, 0.01) + arr[::3] = np.nan + + result = cut(arr, 4) + + result_arr = np.asarray(result) + + ex_arr = np.where(com.isnull(arr), np.nan, result_arr) + + tm.assert_almost_equal(result_arr, ex_arr) + + result = cut(arr, 4, labels=False) + ex_result = np.where(com.isnull(arr), np.nan, result) + tm.assert_almost_equal(result, ex_result) + + def test_inf_handling(self): + data = np.arange(6) + data_ser = Series(data,dtype='int64') + + result = cut(data, [-np.inf, 2, 4, np.inf]) + result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf]) + + ex_levels = ['(-inf, 2]', '(2, 4]', '(4, inf]'] + + np.testing.assert_array_equal(result.levels, ex_levels) + np.testing.assert_array_equal(result_ser.levels, ex_levels) + self.assertEqual(result[5], '(4, inf]') + self.assertEqual(result[0], '(-inf, 2]') + self.assertEqual(result_ser[5], '(4, inf]') + self.assertEqual(result_ser[0], '(-inf, 2]') + + def test_qcut(self): + arr = np.random.randn(1000) + + labels, bins = qcut(arr, 4, retbins=True) + ex_bins = quantile(arr, [0, .25, .5, .75, 1.]) + assert_almost_equal(bins, ex_bins) + + ex_levels = cut(arr, ex_bins, include_lowest=True) + self.assert_numpy_array_equal(labels, ex_levels) + + def test_qcut_bounds(self): + arr = np.random.randn(1000) + + factor = qcut(arr, 10, labels=False) + self.assertEqual(len(np.unique(factor)), 10) + + def test_qcut_specify_quantiles(self): + arr = np.random.randn(100) + + factor = qcut(arr, [0, .25, .5, .75, 1.]) + expected = qcut(arr, 4) + self.assertTrue(factor.equals(expected)) + + def test_qcut_all_bins_same(self): + assertRaisesRegexp(ValueError, "edges.*unique", qcut, [0,0,0,0,0,0,0,0,0,0], 3) + + def test_cut_out_of_bounds(self): + arr = np.random.randn(100) + + result = cut(arr, [-1, 0, 1]) + + mask = result.labels == -1 + ex_mask = (arr < -1) | (arr > 1) + self.assert_numpy_array_equal(mask, ex_mask) + + def 
test_cut_pass_labels(self): + arr = [50, 5, 10, 15, 20, 30, 70] + bins = [0, 25, 50, 100] + labels = ['Small', 'Medium', 'Large'] + + result = cut(arr, bins, labels=labels) + + exp = cut(arr, bins) + exp.levels = labels + + self.assertTrue(result.equals(exp)) + + def test_qcut_include_lowest(self): + values = np.arange(10) + + cats = qcut(values, 4) + + ex_levels = ['[0, 2.25]', '(2.25, 4.5]', '(4.5, 6.75]', '(6.75, 9]'] + self.assertTrue((cats.levels == ex_levels).all()) + + def test_qcut_nas(self): + arr = np.random.randn(100) + arr[:20] = np.nan + + result = qcut(arr, 4) + self.assertTrue(com.isnull(result[:20]).all()) + + def test_label_formatting(self): + self.assertEqual(tmod._trim_zeros('1.000'), '1') + + # it works + result = cut(np.arange(11.), 2) + + result = cut(np.arange(11.) / 1e10, 2) + + # #1979, negative numbers + + result = tmod._format_label(-117.9998, precision=3) + self.assertEqual(result, '-118') + result = tmod._format_label(117.9998, precision=3) + self.assertEqual(result, '118') + + def test_qcut_binning_issues(self): + # #1978, 1979 + path = os.path.join(curpath(), 'cut_data.csv') + + arr = np.loadtxt(path) + + result = qcut(arr, 20) + + starts = [] + ends = [] + for lev in result.levels: + s, e = lev[1:-1].split(',') + + self.assertTrue(s != e) + + starts.append(float(s)) + ends.append(float(e)) + + for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]), + zip(ends[:-1], ends[1:])): + self.assertTrue(sp < sn) + self.assertTrue(ep < en) + self.assertTrue(ep <= sn) + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_tools.py b/pandas/tools/tests/test_tools.py new file mode 100644 index 00000000..4fd70e28 --- /dev/null +++ b/pandas/tools/tests/test_tools.py @@ -0,0 +1,23 @@ +from pandas import DataFrame +from pandas.tools.describe import value_range + +import numpy as np +import pandas.util.testing as tm + + +class TestTools(tm.TestCase): + + def test_value_range(self): + df = DataFrame(np.random.randn(5, 5)) + df.ix[0, 2] = -5 + df.ix[2, 0] = 5 + + res = value_range(df) + + self.assertEqual(res['Minimum'], -5) + self.assertEqual(res['Maximum'], 5) + + df.ix[0, 1] = np.NaN + + self.assertEqual(res['Minimum'], -5) + self.assertEqual(res['Maximum'], 5) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py new file mode 100644 index 00000000..9480ea7e --- /dev/null +++ b/pandas/tools/tests/test_util.py @@ -0,0 +1,95 @@ +import os +import locale +import codecs +import nose + +import numpy as np +from numpy.testing import assert_equal + +from pandas import date_range, Index +import pandas.util.testing as tm +from pandas.tools.util import cartesian_product + + +CURRENT_LOCALE = locale.getlocale() +LOCALE_OVERRIDE = os.environ.get('LOCALE_OVERRIDE', None) + + +class TestCartesianProduct(tm.TestCase): + + def test_simple(self): + x, y = list('ABC'), [1, 22] + result = cartesian_product([x, y]) + expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']), + np.array([ 1, 22, 1, 22, 1, 22])] + assert_equal(result, expected) + + def test_datetimeindex(self): + # regression test for GitHub issue #6439 + # make sure that the ordering on datetimeindex is consistent + x = date_range('2000-01-01', periods=2) + result = [Index(y).day for y in cartesian_product([x, x])] + expected = [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])] + assert_equal(result, expected) + + +class 
TestLocaleUtils(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestLocaleUtils, cls).setUpClass() + cls.locales = tm.get_locales() + + if not cls.locales: + raise nose.SkipTest("No locales found") + + if os.name == 'nt': # we're on windows + raise nose.SkipTest("Running on Windows") + + @classmethod + def tearDownClass(cls): + super(TestLocaleUtils, cls).tearDownClass() + del cls.locales + + def test_get_locales(self): + # all systems should have at least a single locale + assert len(tm.get_locales()) > 0 + + def test_get_locales_prefix(self): + if len(self.locales) == 1: + raise nose.SkipTest("Only a single locale found, no point in " + "trying to test filtering locale prefixes") + first_locale = self.locales[0] + assert len(tm.get_locales(prefix=first_locale[:2])) > 0 + + def test_set_locale(self): + if len(self.locales) == 1: + raise nose.SkipTest("Only a single locale found, no point in " + "trying to test setting another locale") + + if LOCALE_OVERRIDE is not None: + lang, enc = LOCALE_OVERRIDE.split('.') + else: + lang, enc = 'it_CH', 'UTF-8' + + enc = codecs.lookup(enc).name + new_locale = lang, enc + + if not tm._can_set_locale(new_locale): + with tm.assertRaises(locale.Error): + with tm.set_locale(new_locale): + pass + else: + with tm.set_locale(new_locale) as normalized_locale: + new_lang, new_enc = normalized_locale.split('.') + new_enc = codecs.lookup(enc).name + normalized_locale = new_lang, new_enc + self.assertEqual(normalized_locale, new_locale) + + current_locale = locale.getlocale() + self.assertEqual(current_locale, CURRENT_LOCALE) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py new file mode 100644 index 00000000..c2512ba2 --- /dev/null +++ b/pandas/tools/tile.py @@ -0,0 +1,266 @@ +""" +Quantilization functions and related stuff +""" + +from pandas.core.api import DataFrame, Series +from pandas.core.categorical import Categorical +from pandas.core.index import _ensure_index +import pandas.core.algorithms as algos +import pandas.core.common as com +import pandas.core.nanops as nanops +from pandas.compat import zip + +import numpy as np + + +def cut(x, bins, right=True, labels=None, retbins=False, precision=3, + include_lowest=False): + """ + Return indices of half-open bins to which each value of `x` belongs. + + Parameters + ---------- + x : array-like + Input array to be binned. It has to be 1-dimensional. + bins : int or sequence of scalars + If `bins` is an int, it defines the number of equal-width bins in the + range of `x`. However, in this case, the range of `x` is extended + by .1% on each side to include the min or max values of `x`. If + `bins` is a sequence it defines the bin edges allowing for + non-uniform bin width. No extension of the range of `x` is done in + this case. + right : bool, optional + Indicates whether the bins include the rightmost edge or not. If + right == True (the default), then the bins [1,2,3,4] indicate + (1,2], (2,3], (3,4]. + labels : array or boolean, default None + Labels to use for bin edges, or False to return integer bin labels + retbins : bool, optional + Whether to return the bins or not. Can be useful if bins is given + as a scalar. + precision : int + The precision at which to store and display the bins labels + include_lowest : bool + Whether the first interval should be left-inclusive or not. 
+ + Returns + ------- + out : Categorical or array of integers if labels is False + bins : ndarray of floats + Returned only if `retbins` is True. + + Notes + ----- + The `cut` function can be useful for going from a continuous variable to + a categorical variable. For example, `cut` could convert ages to groups + of age ranges. + + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting Categorical object + + + Examples + -------- + >>> cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) + (array([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], + (6.533, 9.7], (0.191, 3.367]], dtype=object), + array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + >>> cut(np.ones(5), 4, labels=False) + array([2, 2, 2, 2, 2]) + """ + # NOTE: this binning code is changed a bit from histogram for var(x) == 0 + if not np.iterable(bins): + if np.isscalar(bins) and bins < 1: + raise ValueError("`bins` should be a positive integer.") + try: # for array-like + sz = x.size + except AttributeError: + x = np.asarray(x) + sz = x.size + if sz == 0: + raise ValueError('Cannot cut empty array') + # handle empty arrays. Can't determine range, so use 0-1. + # rng = (0, 1) + else: + rng = (nanops.nanmin(x), nanops.nanmax(x)) + mn, mx = [mi + 0.0 for mi in rng] + + if mn == mx: # adjust end points before binning + mn -= .001 * mn + mx += .001 * mx + bins = np.linspace(mn, mx, bins + 1, endpoint=True) + else: # adjust end points after binning + bins = np.linspace(mn, mx, bins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + else: + bins = np.asarray(bins) + if (np.diff(bins) < 0).any(): + raise ValueError('bins must increase monotonically.') + + return _bins_to_cuts(x, bins, right=right, labels=labels, + retbins=retbins, precision=precision, + include_lowest=include_lowest) + + +def qcut(x, q, labels=None, retbins=False, precision=3): + """ + Quantile-based discretization function. Discretize variable into + equal-sized buckets based on rank or based on sample quantiles. For example + 1000 values for 10 quantiles would produce a Categorical object indicating + quantile membership for each data point. + + Parameters + ---------- + x : ndarray or Series + q : integer or array of quantiles + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles + labels : array or boolean, default None + Labels to use for bin edges, or False to return integer bin labels + retbins : bool, optional + Whether to return the bins or not. Can be useful if bins is given + as a scalar. 
+ precision : int + The precision at which to store and display the bins labels + + Returns + ------- + cat : Categorical + + Notes + ----- + Out of bounds values will be NA in the resulting Categorical object + + Examples + -------- + """ + if com.is_integer(q): + quantiles = np.linspace(0, 1, q + 1) + else: + quantiles = q + bins = algos.quantile(x, quantiles) + return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, + precision=precision, include_lowest=True) + + +def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, + precision=3, name=None, include_lowest=False): + if name is None and isinstance(x, Series): + name = x.name + x = np.asarray(x) + + side = 'left' if right else 'right' + ids = bins.searchsorted(x, side=side) + + if len(algos.unique(bins)) < len(bins): + raise ValueError('Bin edges must be unique: %s' % repr(bins)) + + if include_lowest: + ids[x == bins[0]] = 1 + + na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0) + has_nas = na_mask.any() + + if labels is not False: + if labels is None: + increases = 0 + while True: + try: + levels = _format_levels(bins, precision, right=right, + include_lowest=include_lowest) + except ValueError: + increases += 1 + precision += 1 + if increases >= 20: + raise + else: + break + + else: + if len(labels) != len(bins) - 1: + raise ValueError('Bin labels must be one fewer than ' + 'the number of bin edges') + levels = labels + + levels = np.asarray(levels, dtype=object) + np.putmask(ids, na_mask, 0) + fac = Categorical(ids - 1, levels, name=name) + else: + fac = ids - 1 + if has_nas: + fac = fac.astype(np.float64) + np.putmask(fac, na_mask, np.nan) + + if not retbins: + return fac + + return fac, bins + + +def _format_levels(bins, prec, right=True, + include_lowest=False): + fmt = lambda v: _format_label(v, precision=prec) + if right: + levels = [] + for a, b in zip(bins, bins[1:]): + fa, fb = fmt(a), fmt(b) + + if a != b and fa == fb: + raise ValueError('precision too low') + + formatted = '(%s, %s]' % (fa, fb) + + levels.append(formatted) + + if include_lowest: + levels[0] = '[' + levels[0][1:] + else: + levels = ['[%s, %s)' % (fmt(a), fmt(b)) + for a, b in zip(bins, bins[1:])] + + return levels + + +def _format_label(x, precision=3): + fmt_str = '%%.%dg' % precision + if np.isinf(x): + return str(x) + elif com.is_float(x): + frac, whole = np.modf(x) + sgn = '-' if x < 0 else '' + whole = abs(whole) + if frac != 0.0: + val = fmt_str % frac + + # rounded up or down + if '.' not in val: + if x < 0: + return '%d' % (-whole - 1) + else: + return '%d' % (whole + 1) + + if 'e' in val: + return _trim_zeros(fmt_str % x) + else: + val = _trim_zeros(val) + if '.' 
in val: + return sgn + '.'.join(('%d' % whole, val.split('.')[1])) + else: # pragma: no cover + return sgn + '.'.join(('%d' % whole, val)) + else: + return sgn + '%0.f' % whole + else: + return str(x) + + +def _trim_zeros(x): + while len(x) > 1 and x[-1] == '0': + x = x[:-1] + if len(x) > 1 and x[-1] == '.': + x = x[:-1] + return x diff --git a/pandas/tools/util.py b/pandas/tools/util.py new file mode 100644 index 00000000..215a76b8 --- /dev/null +++ b/pandas/tools/util.py @@ -0,0 +1,49 @@ +import operator +from pandas.compat import reduce +from pandas.core.index import Index +import numpy as np +from pandas import algos +from pandas.core import common as com + + +def match(needles, haystack): + haystack = Index(haystack) + needles = Index(needles) + return haystack.get_indexer(needles) + + +def cartesian_product(X): + ''' + Numpy version of itertools.product or pandas.compat.product. + Sometimes faster (for large inputs)... + + Examples + -------- + >>> cartesian_product([list('ABC'), [1, 2]]) + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), + array([1, 2, 1, 2, 1, 2])] + + ''' + + lenX = np.fromiter((len(x) for x in X), dtype=int) + cumprodX = np.cumproduct(lenX) + + a = np.roll(cumprodX, 1) + a[0] = 1 + + b = cumprodX[-1] / cumprodX + + return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]), + np.product(a[i])) + for i, x in enumerate(X)] + + +def _compose2(f, g): + """Compose 2 callables""" + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def compose(*funcs): + """Compose 2 or more callables""" + assert len(funcs) > 1, 'At least 2 callables must be passed to compose' + return reduce(_compose2, funcs) diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py new file mode 100644 index 00000000..c2cc3723 --- /dev/null +++ b/pandas/tseries/api.py @@ -0,0 +1,12 @@ +""" + +""" + + +from pandas.tseries.index import DatetimeIndex, date_range, bdate_range +from pandas.tseries.frequencies import infer_freq +from pandas.tseries.period import Period, PeriodIndex, period_range, pnow +from pandas.tseries.resample import TimeGrouper +from pandas.tseries.timedeltas import to_timedelta +from pandas.lib import NaT +import pandas.tseries.offsets as offsets diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py new file mode 100644 index 00000000..80ac97ee --- /dev/null +++ b/pandas/tseries/converter.py @@ -0,0 +1,987 @@ +from datetime import datetime, timedelta +import datetime as pydt +import numpy as np + +from dateutil.relativedelta import relativedelta + +import matplotlib.units as units +import matplotlib.dates as dates + +from matplotlib.ticker import Formatter, AutoLocator, Locator +from matplotlib.transforms import nonsingular + +from pandas.compat import lrange +import pandas.compat as compat +import pandas.lib as lib +import pandas.core.common as com +from pandas.core.index import Index + +from pandas.core.series import Series +from pandas.tseries.index import date_range +import pandas.tseries.tools as tools +import pandas.tseries.frequencies as frequencies +from pandas.tseries.frequencies import FreqGroup +from pandas.tseries.period import Period, PeriodIndex + + +def register(): + units.registry[lib.Timestamp] = DatetimeConverter() + units.registry[Period] = PeriodConverter() + units.registry[pydt.datetime] = DatetimeConverter() + units.registry[pydt.date] = DatetimeConverter() + units.registry[pydt.time] = TimeConverter() + + 
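+# A minimal usage sketch (illustrative only, not part of the upstream module):
+# once ``register()`` has been called, matplotlib resolves Timestamp, Period
+# and stdlib datetime/date/time values through the converters defined below,
+# so date-indexed data can be passed directly to plotting calls, e.g.:
+#
+#   import matplotlib.pyplot as plt
+#   import pandas as pd
+#   from pandas.tseries import converter
+#
+#   converter.register()
+#   idx = pd.date_range('2014-01-01', periods=10)
+#   plt.plot(idx, range(10))  # x-axis units handled by DatetimeConverter
+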
+def _to_ordinalf(tm): + tot_sec = (tm.hour * 3600 + tm.minute * 60 + tm.second + + float(tm.microsecond / 1e6)) + return tot_sec + + +def time2num(d): + if isinstance(d, compat.string_types): + parsed = tools.to_datetime(d) + if not isinstance(parsed, datetime): + raise ValueError('Could not parse time %s' % d) + return _to_ordinalf(parsed.time()) + if isinstance(d, pydt.time): + return _to_ordinalf(d) + return d + + +class TimeConverter(units.ConversionInterface): + + @staticmethod + def convert(value, unit, axis): + valid_types = (str, pydt.time) + if (isinstance(value, valid_types) or com.is_integer(value) or + com.is_float(value)): + return time2num(value) + if isinstance(value, Index): + return value.map(time2num) + if isinstance(value, (list, tuple, np.ndarray)): + return [time2num(x) for x in value] + return value + + @staticmethod + def axisinfo(unit, axis): + if unit != 'time': + return None + + majloc = AutoLocator() + majfmt = TimeFormatter(majloc) + return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='time') + + @staticmethod + def default_units(x, axis): + return 'time' + + +### time formatter +class TimeFormatter(Formatter): + + def __init__(self, locs): + self.locs = locs + + def __call__(self, x, pos=0): + fmt = '%H:%M:%S' + s = int(x) + ms = int((x - s) * 1e3) + us = int((x - s) * 1e6 - ms) + m, s = divmod(s, 60) + h, m = divmod(m, 60) + _, h = divmod(h, 24) + if us != 0: + fmt += '.%6f' + elif ms != 0: + fmt += '.%3f' + + return pydt.time(h, m, s, us).strftime(fmt) + + +### Period Conversion + + +class PeriodConverter(dates.DateConverter): + + @staticmethod + def convert(values, units, axis): + if not hasattr(axis, 'freq'): + raise TypeError('Axis must have `freq` set to convert to Periods') + valid_types = (str, datetime, Period, pydt.date, pydt.time) + if (isinstance(values, valid_types) or com.is_integer(values) or + com.is_float(values)): + return get_datevalue(values, axis.freq) + if isinstance(values, PeriodIndex): + return values.asfreq(axis.freq).values + if isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + if isinstance(values, (list, tuple, np.ndarray)): + return [get_datevalue(x, axis.freq) for x in values] + return values + + +def get_datevalue(date, freq): + if isinstance(date, Period): + return date.asfreq(freq).ordinal + elif isinstance(date, (str, datetime, pydt.date, pydt.time)): + return Period(date, freq).ordinal + elif (com.is_integer(date) or com.is_float(date) or + (isinstance(date, np.ndarray) and (date.size == 1))): + return date + elif date is None: + return None + raise ValueError("Unrecognizable date '%s'" % date) + +HOURS_PER_DAY = 24. +MINUTES_PER_DAY = 60. * HOURS_PER_DAY +SECONDS_PER_DAY = 60. * MINUTES_PER_DAY +MUSECONDS_PER_DAY = 1e6 * SECONDS_PER_DAY + + +def _dt_to_float_ordinal(dt): + """ + Convert :mod:`datetime` to the Gregorian date as UTC float days, + preserving hours, minutes, seconds and microseconds. Return value + is a :func:`float`. 
+ """ + if isinstance(dt, (np.ndarray, Series)) and com.is_datetime64_ns_dtype(dt): + base = dates.epoch2num(dt.asi8 / 1.0E9) + else: + base = dates.date2num(dt) + return base + + +### Datetime Conversion +class DatetimeConverter(dates.DateConverter): + + @staticmethod + def convert(values, unit, axis): + def try_parse(values): + try: + return _dt_to_float_ordinal(tools.to_datetime(values)) + except Exception: + return values + + if isinstance(values, (datetime, pydt.date)): + return _dt_to_float_ordinal(values) + elif isinstance(values, pydt.time): + return dates.date2num(values) + elif (com.is_integer(values) or com.is_float(values)): + return values + elif isinstance(values, compat.string_types): + return try_parse(values) + elif isinstance(values, (list, tuple, np.ndarray)): + if not isinstance(values, np.ndarray): + values = com._asarray_tuplesafe(values) + + if com.is_integer_dtype(values) or com.is_float_dtype(values): + return values + + try: + values = tools.to_datetime(values) + if isinstance(values, Index): + values = values.map(_dt_to_float_ordinal) + else: + values = [_dt_to_float_ordinal(x) for x in values] + except Exception: + pass + + return values + + @staticmethod + def axisinfo(unit, axis): + """ + Return the :class:`~matplotlib.units.AxisInfo` for *unit*. + + *unit* is a tzinfo instance or None. + The *axis* argument is required but not used. + """ + tz = unit + + majloc = PandasAutoDateLocator(tz=tz) + majfmt = PandasAutoDateFormatter(majloc, tz=tz) + datemin = pydt.date(2000, 1, 1) + datemax = pydt.date(2010, 1, 1) + + return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='', + default_limits=(datemin, datemax)) + + +class PandasAutoDateFormatter(dates.AutoDateFormatter): + + def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'): + dates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt) + # matplotlib.dates._UTC has no _utcoffset called by pandas + if self._tz is dates.UTC: + self._tz._utcoffset = self._tz.utcoffset(None) + self.scaled = { + 365.0: '%Y', + 30.: '%b %Y', + 1.0: '%b %d %Y', + 1. / 24.: '%H:%M:%S', + 1. / 24. / 3600. / 1000.: '%H:%M:%S.%f' + } + + def _get_fmt(self, x): + + scale = float(self._locator._get_unit()) + + fmt = self.defaultfmt + + for k in sorted(self.scaled): + if k >= scale: + fmt = self.scaled[k] + break + + return fmt + + def __call__(self, x, pos=0): + fmt = self._get_fmt(x) + self._formatter = dates.DateFormatter(fmt, self._tz) + return self._formatter(x, pos) + + +class PandasAutoDateLocator(dates.AutoDateLocator): + + def get_locator(self, dmin, dmax): + 'Pick the best locator based on a distance.' + delta = relativedelta(dmax, dmin) + + num_days = ((delta.years * 12.0) + delta.months * 31.0) + delta.days + num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds + tot_sec = num_days * 86400. + num_sec + + if abs(tot_sec) < self.minticks: + self._freq = -1 + locator = MilliSecondLocator(self.tz) + locator.set_axis(self.axis) + + locator.set_view_interval(*self.axis.get_view_interval()) + locator.set_data_interval(*self.axis.get_data_interval()) + return locator + + return dates.AutoDateLocator.get_locator(self, dmin, dmax) + + def _get_unit(self): + return MilliSecondLocator.get_unit_generic(self._freq) + + +class MilliSecondLocator(dates.DateLocator): + + UNIT = 1. / (24 * 3600 * 1000) + + def __init__(self, tz): + dates.DateLocator.__init__(self, tz) + self._interval = 1. 
+ + def _get_unit(self): + return self.get_unit_generic(-1) + + @staticmethod + def get_unit_generic(freq): + unit = dates.RRuleLocator.get_unit_generic(freq) + if unit < 0: + return MilliSecondLocator.UNIT + return unit + + def __call__(self): + # if no data have been set, this will tank with a ValueError + try: + dmin, dmax = self.viewlim_to_dt() + except ValueError: + return [] + + if dmin > dmax: + dmax, dmin = dmin, dmax + delta = relativedelta(dmax, dmin) + + # We need to cap at the endpoints of valid datetime + try: + start = dmin - delta + except ValueError: + start = _from_ordinal(1.0) + + try: + stop = dmax + delta + except ValueError: + # The magic number! + stop = _from_ordinal(3652059.9999999) + + nmax, nmin = dates.date2num((dmax, dmin)) + + num = (nmax - nmin) * 86400 * 1000 + max_millis_ticks = 6 + for interval in [1, 10, 50, 100, 200, 500]: + if num <= interval * (max_millis_ticks - 1): + self._interval = interval + break + else: + # We went through the whole loop without breaking, default to 1 + self._interval = 1000. + + estimate = (nmax - nmin) / (self._get_unit() * self._get_interval()) + + if estimate > self.MAXTICKS * 2: + raise RuntimeError(('MillisecondLocator estimated to generate %d ' + 'ticks from %s to %s: exceeds Locator.MAXTICKS' + '* 2 (%d) ') % + (estimate, dmin, dmax, self.MAXTICKS * 2)) + + freq = '%dL' % self._get_interval() + tz = self.tz.tzname(None) + st = _from_ordinal(dates.date2num(dmin)) # strip tz + ed = _from_ordinal(dates.date2num(dmax)) + all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).asobject + + try: + if len(all_dates) > 0: + locs = self.raise_if_exceeds(dates.date2num(all_dates)) + return locs + except Exception as e: # pragma: no cover + pass + + lims = dates.date2num([dmin, dmax]) + return lims + + def _get_interval(self): + return self._interval + + def autoscale(self): + """ + Set the view limits to include the data range. + """ + dmin, dmax = self.datalim_to_dt() + if dmin > dmax: + dmax, dmin = dmin, dmax + + delta = relativedelta(dmax, dmin) + + # We need to cap at the endpoints of valid datetime + try: + start = dmin - delta + except ValueError: + start = _from_ordinal(1.0) + + try: + stop = dmax + delta + except ValueError: + # The magic number! + stop = _from_ordinal(3652059.9999999) + + dmin, dmax = self.datalim_to_dt() + + vmin = dates.date2num(dmin) + vmax = dates.date2num(dmax) + + return self.nonsingular(vmin, vmax) + + +def _from_ordinal(x, tz=None): + ix = int(x) + dt = datetime.fromordinal(ix) + remainder = float(x) - ix + hour, remainder = divmod(24 * remainder, 1) + minute, remainder = divmod(60 * remainder, 1) + second, remainder = divmod(60 * remainder, 1) + microsecond = int(1e6 * remainder) + if microsecond < 10: + microsecond = 0 # compensate for rounding errors + dt = datetime(dt.year, dt.month, dt.day, int(hour), int(minute), + int(second), microsecond) + if tz is not None: + dt = dt.astimezone(tz) + + if microsecond > 999990: # compensate for rounding errors + dt += timedelta(microseconds=1e6 - microsecond) + + return dt + +### Fixed frequency dynamic tick locators and formatters + +##### ------------------------------------------------------------------------- +#---- --- Locators --- +##### ------------------------------------------------------------------------- + + +def _get_default_annual_spacing(nyears): + """ + Returns a default spacing between consecutive ticks for annual data. 
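+
+    For example, a 15-year span maps to ``(1, 2)``: a minor tick every year
+    and a major tick every second year.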
+ """ + if nyears < 11: + (min_spacing, maj_spacing) = (1, 1) + elif nyears < 20: + (min_spacing, maj_spacing) = (1, 2) + elif nyears < 50: + (min_spacing, maj_spacing) = (1, 5) + elif nyears < 100: + (min_spacing, maj_spacing) = (5, 10) + elif nyears < 200: + (min_spacing, maj_spacing) = (5, 25) + elif nyears < 600: + (min_spacing, maj_spacing) = (10, 50) + else: + factor = nyears // 1000 + 1 + (min_spacing, maj_spacing) = (factor * 20, factor * 100) + return (min_spacing, maj_spacing) + + +def period_break(dates, period): + """ + Returns the indices where the given period changes. + + Parameters + ---------- + dates : PeriodIndex + Array of intervals to monitor. + period : string + Name of the period to monitor. + """ + current = getattr(dates, period) + previous = getattr(dates - 1, period) + return (current - previous).nonzero()[0] + + +def has_level_label(label_flags, vmin): + """ + Returns true if the ``label_flags`` indicate there is at least one label + for this level. + + if the minimum view limit is not an exact integer, then the first tick + label won't be shown, so we must adjust for that. + """ + if label_flags.size == 0 or (label_flags.size == 1 and + label_flags[0] == 0 and + vmin % 1 > 0.0): + return False + else: + return True + + +def _daily_finder(vmin, vmax, freq): + periodsperday = -1 + + if freq >= FreqGroup.FR_HR: + if freq == FreqGroup.FR_NS: + periodsperday = 24 * 60 * 60 * 1000000000 + elif freq == FreqGroup.FR_US: + periodsperday = 24 * 60 * 60 * 1000000 + elif freq == FreqGroup.FR_MS: + periodsperday = 24 * 60 * 60 * 1000 + elif freq == FreqGroup.FR_SEC: + periodsperday = 24 * 60 * 60 + elif freq == FreqGroup.FR_MIN: + periodsperday = 24 * 60 + elif freq == FreqGroup.FR_HR: + periodsperday = 24 + else: # pragma: no cover + raise ValueError("unexpected frequency: %s" % freq) + periodsperyear = 365 * periodsperday + periodspermonth = 28 * periodsperday + + elif freq == FreqGroup.FR_BUS: + periodsperyear = 261 + periodspermonth = 19 + elif freq == FreqGroup.FR_DAY: + periodsperyear = 365 + periodspermonth = 28 + elif frequencies.get_freq_group(freq) == FreqGroup.FR_WK: + periodsperyear = 52 + periodspermonth = 3 + else: # pragma: no cover + raise ValueError("unexpected frequency") + + # save this for later usage + vmin_orig = vmin + + (vmin, vmax) = (Period(ordinal=int(vmin), freq=freq), + Period(ordinal=int(vmax), freq=freq)) + span = vmax.ordinal - vmin.ordinal + 1 + dates_ = PeriodIndex(start=vmin, end=vmax, freq=freq) + # Initialize the output + info = np.zeros(span, + dtype=[('val', np.int64), ('maj', bool), + ('min', bool), ('fmt', '|S20')]) + info['val'][:] = dates_.values + info['fmt'][:] = '' + info['maj'][[0, -1]] = True + # .. and set some shortcuts + info_maj = info['maj'] + info_min = info['min'] + info_fmt = info['fmt'] + + def first_label(label_flags): + if (label_flags[0] == 0) and (label_flags.size > 1) and \ + ((vmin_orig % 1) > 0.0): + return label_flags[1] + else: + return label_flags[0] + + # Case 1. 
Less than a month + if span <= periodspermonth: + day_start = period_break(dates_, 'day') + month_start = period_break(dates_, 'month') + + def _hour_finder(label_interval, force_year_start): + _hour = dates_.hour + _prev_hour = (dates_ - 1).hour + hour_start = (_hour - _prev_hour) != 0 + info_maj[day_start] = True + info_min[hour_start & (_hour % label_interval == 0)] = True + year_start = period_break(dates_, 'year') + info_fmt[hour_start & (_hour % label_interval == 0)] = '%H:%M' + info_fmt[day_start] = '%H:%M\n%d-%b' + info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + if force_year_start and not has_level_label(year_start, vmin_orig): + info_fmt[first_label(day_start)] = '%H:%M\n%d-%b\n%Y' + + def _minute_finder(label_interval): + hour_start = period_break(dates_, 'hour') + _minute = dates_.minute + _prev_minute = (dates_ - 1).minute + minute_start = (_minute - _prev_minute) != 0 + info_maj[hour_start] = True + info_min[minute_start & (_minute % label_interval == 0)] = True + year_start = period_break(dates_, 'year') + info_fmt = info['fmt'] + info_fmt[minute_start & (_minute % label_interval == 0)] = '%H:%M' + info_fmt[day_start] = '%H:%M\n%d-%b' + info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + + def _second_finder(label_interval): + minute_start = period_break(dates_, 'minute') + _second = dates_.second + _prev_second = (dates_ - 1).second + second_start = (_second - _prev_second) != 0 + info['maj'][minute_start] = True + info['min'][second_start & (_second % label_interval == 0)] = True + year_start = period_break(dates_, 'year') + info_fmt = info['fmt'] + info_fmt[second_start & (_second % + label_interval == 0)] = '%H:%M:%S' + info_fmt[day_start] = '%H:%M:%S\n%d-%b' + info_fmt[year_start] = '%H:%M:%S\n%d-%b\n%Y' + + if span < periodsperday / 12000.0: + _second_finder(1) + elif span < periodsperday / 6000.0: + _second_finder(2) + elif span < periodsperday / 2400.0: + _second_finder(5) + elif span < periodsperday / 1200.0: + _second_finder(10) + elif span < periodsperday / 800.0: + _second_finder(15) + elif span < periodsperday / 400.0: + _second_finder(30) + elif span < periodsperday / 150.0: + _minute_finder(1) + elif span < periodsperday / 70.0: + _minute_finder(2) + elif span < periodsperday / 24.0: + _minute_finder(5) + elif span < periodsperday / 12.0: + _minute_finder(15) + elif span < periodsperday / 6.0: + _minute_finder(30) + elif span < periodsperday / 2.5: + _hour_finder(1, False) + elif span < periodsperday / 1.5: + _hour_finder(2, False) + elif span < periodsperday * 1.25: + _hour_finder(3, False) + elif span < periodsperday * 2.5: + _hour_finder(6, True) + elif span < periodsperday * 4: + _hour_finder(12, True) + else: + info_maj[month_start] = True + info_min[day_start] = True + year_start = period_break(dates_, 'year') + info_fmt = info['fmt'] + info_fmt[day_start] = '%d' + info_fmt[month_start] = '%d\n%b' + info_fmt[year_start] = '%d\n%b\n%Y' + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(day_start)] = '%d\n%b\n%Y' + else: + info_fmt[first_label(month_start)] = '%d\n%b\n%Y' + + # Case 2. 
Less than three months + elif span <= periodsperyear // 4: + month_start = period_break(dates_, 'month') + info_maj[month_start] = True + if freq < FreqGroup.FR_HR: + info['min'] = True + else: + day_start = period_break(dates_, 'day') + info['min'][day_start] = True + week_start = period_break(dates_, 'week') + year_start = period_break(dates_, 'year') + info_fmt[week_start] = '%d' + info_fmt[month_start] = '\n\n%b' + info_fmt[year_start] = '\n\n%b\n%Y' + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(week_start)] = '\n\n%b\n%Y' + else: + info_fmt[first_label(month_start)] = '\n\n%b\n%Y' + # Case 3. Less than 14 months ............... + elif span <= 1.15 * periodsperyear: + year_start = period_break(dates_, 'year') + month_start = period_break(dates_, 'month') + week_start = period_break(dates_, 'week') + info_maj[month_start] = True + info_min[week_start] = True + info_min[year_start] = False + info_min[month_start] = False + info_fmt[month_start] = '%b' + info_fmt[year_start] = '%b\n%Y' + if not has_level_label(year_start, vmin_orig): + info_fmt[first_label(month_start)] = '%b\n%Y' + # Case 4. Less than 2.5 years ............... + elif span <= 2.5 * periodsperyear: + year_start = period_break(dates_, 'year') + quarter_start = period_break(dates_, 'quarter') + month_start = period_break(dates_, 'month') + info_maj[quarter_start] = True + info_min[month_start] = True + info_fmt[quarter_start] = '%b' + info_fmt[year_start] = '%b\n%Y' + # Case 4. Less than 4 years ................. + elif span <= 4 * periodsperyear: + year_start = period_break(dates_, 'year') + month_start = period_break(dates_, 'month') + info_maj[year_start] = True + info_min[month_start] = True + info_min[year_start] = False + + month_break = dates_[month_start].month + jan_or_jul = month_start[(month_break == 1) | (month_break == 7)] + info_fmt[jan_or_jul] = '%b' + info_fmt[year_start] = '%b\n%Y' + # Case 5. Less than 11 years ................ + elif span <= 11 * periodsperyear: + year_start = period_break(dates_, 'year') + quarter_start = period_break(dates_, 'quarter') + info_maj[year_start] = True + info_min[quarter_start] = True + info_min[year_start] = False + info_fmt[year_start] = '%Y' + # Case 6. More than 12 years ................ + else: + year_start = period_break(dates_, 'year') + year_break = dates_[year_start].year + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(year_break % maj_anndef == 0)] + info_maj[major_idx] = True + minor_idx = year_start[(year_break % min_anndef == 0)] + info_min[minor_idx] = True + info_fmt[major_idx] = '%Y' + #............................................ + + return info + + +def _monthly_finder(vmin, vmax, freq): + periodsperyear = 12 + + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + #.............. + # Initialize the output + info = np.zeros(span, + dtype=[('val', int), ('maj', bool), ('min', bool), + ('fmt', '|S8')]) + info['val'] = np.arange(vmin, vmax + 1) + dates_ = info['val'] + info['fmt'] = '' + year_start = (dates_ % 12 == 0).nonzero()[0] + info_maj = info['maj'] + info_fmt = info['fmt'] + #.............. 
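+    # Span of roughly 14 months or less: every month gets a minor tick
+    # labelled with its abbreviated name, with the year appended at January.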
+ if span <= 1.15 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + + info_fmt[:] = '%b' + info_fmt[year_start] = '%b\n%Y' + + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = '%b\n%Y' + #.............. + elif span <= 2.5 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + # TODO: Check the following : is it really info['fmt'] ? + info['fmt'][quarter_start] = True + info['min'] = True + + info_fmt[quarter_start] = '%b' + info_fmt[year_start] = '%b\n%Y' + #.............. + elif span <= 4 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + + jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6) + info_fmt[jan_or_jul] = '%b' + info_fmt[year_start] = '%b\n%Y' + #.............. + elif span <= 11 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + info['min'][quarter_start] = True + + info_fmt[year_start] = '%Y' + #.................. + else: + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + years = dates_[year_start] // 12 + 1 + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info['min'][year_start[(years % min_anndef == 0)]] = True + + info_fmt[major_idx] = '%Y' + #.............. + return info + + +def _quarterly_finder(vmin, vmax, freq): + periodsperyear = 4 + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + #............................................ + info = np.zeros(span, + dtype=[('val', int), ('maj', bool), ('min', bool), + ('fmt', '|S8')]) + info['val'] = np.arange(vmin, vmax + 1) + info['fmt'] = '' + dates_ = info['val'] + info_maj = info['maj'] + info_fmt = info['fmt'] + year_start = (dates_ % 4 == 0).nonzero()[0] + #.............. + if span <= 3.5 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + + info_fmt[:] = 'Q%q' + info_fmt[year_start] = 'Q%q\n%F' + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = 'Q%q\n%F' + #.............. + elif span <= 11 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + info_fmt[year_start] = '%F' + #.............. + else: + years = dates_[year_start] // 4 + 1 + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info['min'][year_start[(years % min_anndef == 0)]] = True + info_fmt[major_idx] = '%F' + #.............. + return info + + +def _annual_finder(vmin, vmax, freq): + (vmin, vmax) = (int(vmin), int(vmax + 1)) + span = vmax - vmin + 1 + #.............. + info = np.zeros(span, + dtype=[('val', int), ('maj', bool), ('min', bool), + ('fmt', '|S8')]) + info['val'] = np.arange(vmin, vmax + 1) + info['fmt'] = '' + dates_ = info['val'] + #.............. + (min_anndef, maj_anndef) = _get_default_annual_spacing(span) + major_idx = dates_ % maj_anndef == 0 + info['maj'][major_idx] = True + info['min'][(dates_ % min_anndef == 0)] = True + info['fmt'][major_idx] = '%Y' + #.............. 
+ return info + + +def get_finder(freq): + if isinstance(freq, compat.string_types): + freq = frequencies.get_freq(freq) + fgroup = frequencies.get_freq_group(freq) + + if fgroup == FreqGroup.FR_ANN: + return _annual_finder + elif fgroup == FreqGroup.FR_QTR: + return _quarterly_finder + elif freq == FreqGroup.FR_MTH: + return _monthly_finder + elif ((freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK): + return _daily_finder + else: # pragma: no cover + errmsg = "Unsupported frequency: %s" % (freq) + raise NotImplementedError(errmsg) + + +class TimeSeries_DateLocator(Locator): + """ + Locates the ticks along an axis controlled by a :class:`Series`. + + Parameters + ---------- + freq : {var} + Valid frequency specifier. + minor_locator : {False, True}, optional + Whether the locator is for minor ticks (True) or not. + dynamic_mode : {True, False}, optional + Whether the locator should work in dynamic mode. + base : {int}, optional + quarter : {int}, optional + month : {int}, optional + day : {int}, optional + """ + + def __init__(self, freq, minor_locator=False, dynamic_mode=True, + base=1, quarter=1, month=1, day=1, plot_obj=None): + if isinstance(freq, compat.string_types): + freq = frequencies.get_freq(freq) + self.freq = freq + self.base = base + (self.quarter, self.month, self.day) = (quarter, month, day) + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _get_default_locs(self, vmin, vmax): + "Returns the default locations of ticks." + + if self.plot_obj.date_axis_info is None: + self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) + + locator = self.plot_obj.date_axis_info + + if self.isminor: + return np.compress(locator['min'], locator['val']) + return np.compress(locator['maj'], locator['val']) + + def __call__(self): + 'Return the locations of the ticks.' + # axis calls Locator.set_axis inside set_m_formatter + vi = tuple(self.axis.get_view_interval()) + if vi != self.plot_obj.view_interval: + self.plot_obj.date_axis_info = None + self.plot_obj.view_interval = vi + vmin, vmax = vi + if vmax < vmin: + vmin, vmax = vmax, vmin + if self.isdynamic: + locs = self._get_default_locs(vmin, vmax) + else: # pragma: no cover + base = self.base + (d, m) = divmod(vmin, base) + vmin = (d + 1) * base + locs = lrange(vmin, vmax + 1, base) + return locs + + def autoscale(self): + """ + Sets the view limits to the nearest multiples of base that contain the + data. + """ + # requires matplotlib >= 0.98.0 + (vmin, vmax) = self.axis.get_data_interval() + + locs = self._get_default_locs(vmin, vmax) + (vmin, vmax) = locs[[0, -1]] + if vmin == vmax: + vmin -= 1 + vmax += 1 + return nonsingular(vmin, vmax) + +#####------------------------------------------------------------------------- +#---- --- Formatter --- +#####------------------------------------------------------------------------- + + +class TimeSeries_DateFormatter(Formatter): + """ + Formats the ticks along an axis controlled by a :class:`PeriodIndex`. + + Parameters + ---------- + freq : {int, string} + Valid frequency specifier. + minor_locator : {False, True} + Whether the current formatter should apply to minor ticks (True) or + major ticks (False). + dynamic_mode : {True, False} + Whether the formatter works in dynamic mode or not. 
+ """ + + def __init__(self, freq, minor_locator=False, dynamic_mode=True, + plot_obj=None): + if isinstance(freq, compat.string_types): + freq = frequencies.get_freq(freq) + self.format = None + self.freq = freq + self.locs = [] + self.formatdict = None + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _set_default_format(self, vmin, vmax): + "Returns the default ticks spacing." + + if self.plot_obj.date_axis_info is None: + self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) + info = self.plot_obj.date_axis_info + + if self.isminor: + format = np.compress(info['min'] & np.logical_not(info['maj']), + info) + else: + format = np.compress(info['maj'], info) + self.formatdict = dict([(x, f) for (x, _, _, f) in format]) + return self.formatdict + + def set_locs(self, locs): + 'Sets the locations of the ticks' + # don't actually use the locs. This is just needed to work with + # matplotlib. Force to use vmin, vmax + self.locs = locs + + (vmin, vmax) = vi = tuple(self.axis.get_view_interval()) + if vi != self.plot_obj.view_interval: + self.plot_obj.date_axis_info = None + self.plot_obj.view_interval = vi + if vmax < vmin: + (vmin, vmax) = (vmax, vmin) + self._set_default_format(vmin, vmax) + + def __call__(self, x, pos=0): + if self.formatdict is None: + return '' + else: + fmt = self.formatdict.pop(x, '') + return Period(ordinal=int(x), freq=self.freq).strftime(fmt) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py new file mode 100644 index 00000000..fe61e5f0 --- /dev/null +++ b/pandas/tseries/frequencies.py @@ -0,0 +1,1048 @@ +from datetime import datetime +from pandas.compat import range, long, zip +from pandas import compat +import re + +import numpy as np + +from pandas.core.algorithms import unique +from pandas.tseries.offsets import DateOffset +from pandas.util.decorators import cache_readonly +import pandas.tseries.offsets as offsets +import pandas.core.common as com +import pandas.lib as lib +import pandas.tslib as tslib + +class FreqGroup(object): + FR_ANN = 1000 + FR_QTR = 2000 + FR_MTH = 3000 + FR_WK = 4000 + FR_BUS = 5000 + FR_DAY = 6000 + FR_HR = 7000 + FR_MIN = 8000 + FR_SEC = 9000 + FR_MS = 10000 + FR_US = 11000 + FR_NS = 12000 + + +class Resolution(object): + + RESO_US = tslib.US_RESO + RESO_MS = tslib.MS_RESO + RESO_SEC = tslib.S_RESO + RESO_MIN = tslib.T_RESO + RESO_HR = tslib.H_RESO + RESO_DAY = tslib.D_RESO + + _reso_str_map = { + RESO_US: 'microsecond', + RESO_MS: 'millisecond', + RESO_SEC: 'second', + RESO_MIN: 'minute', + RESO_HR: 'hour', + RESO_DAY: 'day'} + + _reso_period_map = { + 'year': 'A', + 'quarter': 'Q', + 'month': 'M', + 'day': 'D', + 'hour': 'H', + 'minute': 'T', + 'second': 'S', + 'millisecond': 'L', + 'microsecond': 'U', + 'nanosecond': 'N'} + + @classmethod + def get_str(cls, reso): + return cls._reso_str_map.get(reso, 'day') + + @classmethod + def get_freq(cls, resostr): + return cls._reso_period_map[resostr] + +def get_reso_string(reso): + return Resolution.get_str(reso) + + +def get_to_timestamp_base(base): + if base < FreqGroup.FR_BUS: + return FreqGroup.FR_DAY + if FreqGroup.FR_HR <= base <= FreqGroup.FR_SEC: + return FreqGroup.FR_SEC + return base + + +def get_freq_group(freq): + if isinstance(freq, compat.string_types): + base, mult = get_freq_code(freq) + freq = base + return (freq // 1000) * 1000 + + +def get_freq(freq): + if isinstance(freq, compat.string_types): + base, mult = get_freq_code(freq) + freq = 
base + return freq + + +def get_freq_code(freqstr): + """ + + Parameters + ---------- + + Returns + ------- + """ + if isinstance(freqstr, DateOffset): + freqstr = (get_offset_name(freqstr), freqstr.n) + + if isinstance(freqstr, tuple): + if (com.is_integer(freqstr[0]) and + com.is_integer(freqstr[1])): + # e.g., freqstr = (2000, 1) + return freqstr + else: + # e.g., freqstr = ('T', 5) + try: + code = _period_str_to_code(freqstr[0]) + stride = freqstr[1] + except: + if com.is_integer(freqstr[1]): + raise + code = _period_str_to_code(freqstr[1]) + stride = freqstr[0] + return code, stride + + if com.is_integer(freqstr): + return (freqstr, 1) + + base, stride = _base_and_stride(freqstr) + code = _period_str_to_code(base) + + return code, stride + + +def _get_freq_str(base, mult=1): + code = _reverse_period_code_map.get(base) + if mult == 1: + return code + return str(mult) + code + + +#---------------------------------------------------------------------- +# Offset names ("time rules") and related functions + + +from pandas.tseries.offsets import (Nano, Micro, Milli, Second, Minute, Hour, + Day, BDay, CDay, Week, MonthBegin, + MonthEnd, BMonthBegin, BMonthEnd, + QuarterBegin, QuarterEnd, BQuarterBegin, + BQuarterEnd, YearBegin, YearEnd, + BYearBegin, BYearEnd, _make_offset + ) +try: + cday = CDay() +except NotImplementedError: + cday = None + +#: cache of previously seen offsets +_offset_map = {} + +_offset_to_period_map = { + 'WEEKDAY': 'D', + 'EOM': 'M', + 'BM': 'M', + 'BQS': 'Q', + 'QS': 'Q', + 'BQ': 'Q', + 'BA': 'A', + 'AS': 'A', + 'BAS': 'A', + 'MS': 'M', + 'D': 'D', + 'C': 'C', + 'B': 'B', + 'T': 'T', + 'S': 'S', + 'L': 'L', + 'U': 'U', + 'N': 'N', + 'H': 'H', + 'Q': 'Q', + 'A': 'A', + 'W': 'W', + 'M': 'M' +} + +need_suffix = ['QS', 'BQ', 'BQS', 'AS', 'BA', 'BAS'] +_months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', + 'OCT', 'NOV', 'DEC'] +for __prefix in need_suffix: + for _m in _months: + _offset_to_period_map['%s-%s' % (__prefix, _m)] = \ + _offset_to_period_map[__prefix] +for __prefix in ['A', 'Q']: + for _m in _months: + _alias = '%s-%s' % (__prefix, _m) + _offset_to_period_map[_alias] = _alias + +_days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] +for _d in _days: + _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d + + +def get_period_alias(offset_str): + """ alias to closest period strings BQ->Q etc""" + return _offset_to_period_map.get(offset_str, None) + +_rule_aliases = { + # Legacy rules that will continue to map to their original values + # essentially for the rest of time + + 'WEEKDAY': 'B', + 'EOM': 'BM', + + 'W@MON': 'W-MON', + 'W@TUE': 'W-TUE', + 'W@WED': 'W-WED', + 'W@THU': 'W-THU', + 'W@FRI': 'W-FRI', + 'W@SAT': 'W-SAT', + 'W@SUN': 'W-SUN', + 'W': 'W-SUN', + + 'Q@JAN': 'BQ-JAN', + 'Q@FEB': 'BQ-FEB', + 'Q@MAR': 'BQ-MAR', + 'Q': 'Q-DEC', + + 'A': 'A-DEC', # YearEnd(month=12), + 'AS': 'AS-JAN', # YearBegin(month=1), + 'BA': 'BA-DEC', # BYearEnd(month=12), + 'BAS': 'BAS-JAN', # BYearBegin(month=1), + + 'A@JAN': 'BA-JAN', + 'A@FEB': 'BA-FEB', + 'A@MAR': 'BA-MAR', + 'A@APR': 'BA-APR', + 'A@MAY': 'BA-MAY', + 'A@JUN': 'BA-JUN', + 'A@JUL': 'BA-JUL', + 'A@AUG': 'BA-AUG', + 'A@SEP': 'BA-SEP', + 'A@OCT': 'BA-OCT', + 'A@NOV': 'BA-NOV', + 'A@DEC': 'BA-DEC', + + # lite aliases + 'Min': 'T', + 'min': 'T', + 'ms': 'L', + 'us': 'U' +} + +#TODO: Can this be killed? 
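+
+# Illustrative sketch (editorial note, not part of the upstream module): the
+# alias and code tables above are typically consulted like this; the numeric
+# codes follow the period-code table defined later in this file.
+#
+#   >>> get_freq_code('5T')      # minutely base code, stride 5
+#   (8000, 5)
+#   >>> get_period_alias('BQ')   # closest period alias for an offset string
+#   'Q'
+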
+for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): + for _iweek in range(4): + _name = 'WOM-%d%s' % (_iweek + 1, _weekday) + _rule_aliases[_name.replace('-', '@')] = _name + +# Note that _rule_aliases is not 1:1 (d[BA]==d[A@DEC]), and so traversal +# order matters when constructing an inverse. we pick one. #2331 +_legacy_reverse_map = dict((v, k) for k, v in + reversed(sorted(compat.iteritems(_rule_aliases)))) + +def to_offset(freqstr): + """ + Return DateOffset object from string representation + + Examples + -------- + >>> to_offset('5Min') + Minute(5) + """ + if freqstr is None: + return None + + if isinstance(freqstr, DateOffset): + return freqstr + + if isinstance(freqstr, tuple): + name = freqstr[0] + stride = freqstr[1] + if isinstance(stride, compat.string_types): + name, stride = stride, name + name, _ = _base_and_stride(name) + delta = get_offset(name) * stride + else: + delta = None + stride_sign = None + try: + for stride, name, _ in opattern.findall(freqstr): + offset = get_offset(name) + if stride_sign is None: + stride_sign = -1 if stride.startswith('-') else 1 + if not stride: + stride = 1 + stride = int(stride) + offset = offset * int(np.fabs(stride) * stride_sign) + if delta is None: + delta = offset + else: + delta = delta + offset + except Exception: + raise ValueError("Could not evaluate %s" % freqstr) + + if delta is None: + raise ValueError('Unable to understand %s as a frequency' % freqstr) + + return delta + + +# hack to handle WOM-1MON +opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-@][\dA-Za-z\-]+)?)') + + +def _base_and_stride(freqstr): + """ + Return base freq and stride info from string representation + + Examples + -------- + _freq_and_stride('5Min') -> 'Min', 5 + """ + groups = opattern.match(freqstr) + + if not groups: + raise ValueError("Could not evaluate %s" % freqstr) + + stride = groups.group(1) + + if len(stride): + stride = int(stride) + else: + stride = 1 + + base = groups.group(2) + + return (base, stride) + + +def get_base_alias(freqstr): + """ + Returns the base frequency alias, e.g., '5D' -> 'D' + """ + return _base_and_stride(freqstr)[0] + +_dont_uppercase = set(('MS', 'ms')) + + +def get_offset(name): + """ + Return DateOffset object associated with rule name + + Examples + -------- + get_offset('EOM') --> BMonthEnd(1) + """ + if name not in _dont_uppercase: + name = name.upper() + + if name in _rule_aliases: + name = _rule_aliases[name] + elif name.lower() in _rule_aliases: + name = _rule_aliases[name.lower()] + else: + if name in _rule_aliases: + name = _rule_aliases[name] + + if name not in _offset_map: + try: + # generate and cache offset + offset = _make_offset(name) + except (ValueError, TypeError, KeyError): + # bad prefix or suffix + raise ValueError('Bad rule name requested: %s.' % name) + _offset_map[name] = offset + return _offset_map[name] + + +getOffset = get_offset + + +def get_offset_name(offset): + """ + Return rule name associated with a DateOffset object + + Examples + -------- + get_offset_name(BMonthEnd(1)) --> 'EOM' + """ + if offset is None: + raise ValueError("Offset can't be none!") + # Hack because this is what it did before... + if isinstance(offset, BDay): + if offset.n != 1: + raise ValueError('Bad rule given: %s.' % 'BusinessDays') + else: + return offset.rule_code + try: + return offset.freqstr + except AttributeError: + # Bad offset, give useful error. + raise ValueError('Bad rule given: %s.' 
% offset) + + +def get_legacy_offset_name(offset): + """ + Return the pre pandas 0.8.0 name for the date offset + """ + name = offset.name + return _legacy_reverse_map.get(name, name) + +def get_standard_freq(freq): + """ + Return the standardized frequency string + """ + if freq is None: + return None + + if isinstance(freq, DateOffset): + return get_offset_name(freq) + + code, stride = get_freq_code(freq) + return _get_freq_str(code, stride) + +#---------------------------------------------------------------------- +# Period codes + +# period frequency constants corresponding to scikits timeseries +# originals +_period_code_map = { + # Annual freqs with various fiscal year ends. + # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 + "A-DEC": 1000, # Annual - December year end + "A-JAN": 1001, # Annual - January year end + "A-FEB": 1002, # Annual - February year end + "A-MAR": 1003, # Annual - March year end + "A-APR": 1004, # Annual - April year end + "A-MAY": 1005, # Annual - May year end + "A-JUN": 1006, # Annual - June year end + "A-JUL": 1007, # Annual - July year end + "A-AUG": 1008, # Annual - August year end + "A-SEP": 1009, # Annual - September year end + "A-OCT": 1010, # Annual - October year end + "A-NOV": 1011, # Annual - November year end + + # Quarterly frequencies with various fiscal year ends. + # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 + "Q-DEC": 2000, # Quarterly - December year end + "Q-JAN": 2001, # Quarterly - January year end + "Q-FEB": 2002, # Quarterly - February year end + "Q-MAR": 2003, # Quarterly - March year end + "Q-APR": 2004, # Quarterly - April year end + "Q-MAY": 2005, # Quarterly - May year end + "Q-JUN": 2006, # Quarterly - June year end + "Q-JUL": 2007, # Quarterly - July year end + "Q-AUG": 2008, # Quarterly - August year end + "Q-SEP": 2009, # Quarterly - September year end + "Q-OCT": 2010, # Quarterly - October year end + "Q-NOV": 2011, # Quarterly - November year end + + "M": 3000, # Monthly + + "W-SUN": 4000, # Weekly - Sunday end of week + "W-MON": 4001, # Weekly - Monday end of week + "W-TUE": 4002, # Weekly - Tuesday end of week + "W-WED": 4003, # Weekly - Wednesday end of week + "W-THU": 4004, # Weekly - Thursday end of week + "W-FRI": 4005, # Weekly - Friday end of week + "W-SAT": 4006, # Weekly - Saturday end of week + + "B": 5000, # Business days + "D": 6000, # Daily + "H": 7000, # Hourly + "T": 8000, # Minutely + "S": 9000, # Secondly + "L": 10000, # Millisecondly + "U": 11000, # Microsecondly + "N": 12000, # Nanosecondly +} + +_reverse_period_code_map = {} +for _k, _v in compat.iteritems(_period_code_map): + _reverse_period_code_map[_v] = _k + +# Additional aliases +_period_code_map.update({ + "Q": 2000, # Quarterly - December year end (default quarterly) + "A": 1000, # Annual + "W": 4000, # Weekly +}) + + +def _period_alias_dictionary(): + """ + Build freq alias dictionary to support freqs from original c_dates.c file + of the scikits.timeseries library. 
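+
+    Examples
+    --------
+    A few of the resulting entries (derived from the alias lists below):
+
+    >>> d = _period_alias_dictionary()
+    >>> d['MONTHLY'], d['WEEKDAY'], d['W@TUE']
+    ('M', 'B', 'W-TUE')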
+ """ + alias_dict = {} + + M_aliases = ["M", "MTH", "MONTH", "MONTHLY"] + B_aliases = ["B", "BUS", "BUSINESS", "BUSINESSLY", 'WEEKDAY'] + D_aliases = ["D", "DAY", "DLY", "DAILY"] + H_aliases = ["H", "HR", "HOUR", "HRLY", "HOURLY"] + T_aliases = ["T", "MIN", "MINUTE", "MINUTELY"] + S_aliases = ["S", "SEC", "SECOND", "SECONDLY"] + L_aliases = ["L", "ms", "MILLISECOND", "MILLISECONDLY"] + U_aliases = ["U", "US", "MICROSECOND", "MICROSECONDLY"] + N_aliases = ["N", "NS", "NANOSECOND", "NANOSECONDLY"] + + for k in M_aliases: + alias_dict[k] = 'M' + + for k in B_aliases: + alias_dict[k] = 'B' + + for k in D_aliases: + alias_dict[k] = 'D' + + for k in H_aliases: + alias_dict[k] = 'H' + + for k in T_aliases: + alias_dict[k] = 'Min' + + for k in S_aliases: + alias_dict[k] = 'S' + + for k in L_aliases: + alias_dict[k] = 'L' + + for k in U_aliases: + alias_dict[k] = 'U' + + for k in N_aliases: + alias_dict[k] = 'N' + + A_prefixes = ["A", "Y", "ANN", "ANNUAL", "ANNUALLY", "YR", "YEAR", + "YEARLY"] + + Q_prefixes = ["Q", "QTR", "QUARTER", "QUARTERLY", "Q-E", + "QTR-E", "QUARTER-E", "QUARTERLY-E"] + + month_names = [ + ["DEC", "DECEMBER"], + ["JAN", "JANUARY"], + ["FEB", "FEBRUARY"], + ["MAR", "MARCH"], + ["APR", "APRIL"], + ["MAY", "MAY"], + ["JUN", "JUNE"], + ["JUL", "JULY"], + ["AUG", "AUGUST"], + ["SEP", "SEPTEMBER"], + ["OCT", "OCTOBER"], + ["NOV", "NOVEMBER"]] + + seps = ["@", "-"] + + for k in A_prefixes: + alias_dict[k] = 'A' + for m_tup in month_names: + for sep in seps: + m1, m2 = m_tup + alias_dict[k + sep + m1] = 'A-' + m1 + alias_dict[k + sep + m2] = 'A-' + m1 + + for k in Q_prefixes: + alias_dict[k] = 'Q' + for m_tup in month_names: + for sep in seps: + m1, m2 = m_tup + alias_dict[k + sep + m1] = 'Q-' + m1 + alias_dict[k + sep + m2] = 'Q-' + m1 + + W_prefixes = ["W", "WK", "WEEK", "WEEKLY"] + + day_names = [ + ["SUN", "SUNDAY"], + ["MON", "MONDAY"], + ["TUE", "TUESDAY"], + ["WED", "WEDNESDAY"], + ["THU", "THURSDAY"], + ["FRI", "FRIDAY"], + ["SAT", "SATURDAY"]] + + for k in W_prefixes: + alias_dict[k] = 'W' + for d_tup in day_names: + for sep in ["@", "-"]: + d1, d2 = d_tup + alias_dict[k + sep + d1] = 'W-' + d1 + alias_dict[k + sep + d2] = 'W-' + d1 + + return alias_dict + + +def _infer_period_group(freqstr): + return _period_group(Resolution._reso_period_map[freqstr]) + + +def _period_group(freqstr): + base, mult = get_freq_code(freqstr) + return base // 1000 * 1000 + +_period_alias_dict = _period_alias_dictionary() + + +def _period_str_to_code(freqstr): + # hack + freqstr = _rule_aliases.get(freqstr, freqstr) + + if freqstr not in _dont_uppercase: + freqstr = _rule_aliases.get(freqstr.lower(), freqstr) + + try: + if freqstr not in _dont_uppercase: + freqstr = freqstr.upper() + return _period_code_map[freqstr] + except KeyError: + try: + alias = _period_alias_dict[freqstr] + except KeyError: + raise ValueError("Unknown freqstr: %s" % freqstr) + + return _period_code_map[alias] + + +def infer_freq(index, warn=True): + """ + Infer the most likely frequency given the input index. 
If the frequency is + uncertain, a warning will be printed + + Parameters + ---------- + index : DatetimeIndex + if passed a Series will use the values of the series (NOT THE INDEX) + warn : boolean, default True + + Returns + ------- + freq : string or None + None if no discernible frequency + TypeError if the index is not datetime-like + """ + import pandas as pd + + if isinstance(index, com.ABCSeries): + values = index.values + if not (com.is_datetime64_dtype(index.values) or values.dtype == object): + raise TypeError("cannot infer freq from a non-convertible dtype on a Series of {0}".format(index.dtype)) + index = values + if isinstance(index, pd.PeriodIndex): + raise TypeError("PeriodIndex given. Check the `freq` attribute " + "instead of using infer_freq.") + if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): + if isinstance(index, (pd.Int64Index, pd.Float64Index)): + raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index))) + index = index.values + + index = pd.DatetimeIndex(index) + inferer = _FrequencyInferer(index, warn=warn) + return inferer.get_freq() + +_ONE_MICRO = long(1000) +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR + +def _tz_convert_with_transitions(values, to_tz, from_tz): + """ + convert i8 values from the specificed timezone to the to_tz zone, taking + into account DST transitions + """ + + # vectorization is slow, so tests if we can do this via the faster tz_convert + f = lambda x: tslib.tz_convert_single(x, to_tz, from_tz) + + if len(values) > 2: + first_slow, last_slow = f(values[0]),f(values[-1]) + + first_fast, last_fast = tslib.tz_convert(np.array([values[0],values[-1]],dtype='i8'),to_tz,from_tz) + + # don't cross a DST, so ok + if first_fast == first_slow and last_fast == last_slow: + return tslib.tz_convert(values,to_tz,from_tz) + + return np.vectorize(f)(values) + +class _FrequencyInferer(object): + """ + Not sure if I can avoid the state machine here + """ + + def __init__(self, index, warn=True): + self.index = index + self.values = np.asarray(index).view('i8') + + if index.tz is not None: + self.values = _tz_convert_with_transitions(self.values,'UTC',index.tz) + + self.warn = warn + + if len(index) < 3: + raise ValueError('Need at least 3 dates to infer frequency') + + self.is_monotonic = self.index.is_monotonic + + @cache_readonly + def deltas(self): + return tslib.unique_deltas(self.values) + + @cache_readonly + def is_unique(self): + return len(self.deltas) == 1 + + def get_freq(self): + if not self.is_monotonic or not self.index.is_unique: + return None + + delta = self.deltas[0] + if _is_multiple(delta, _ONE_DAY): + return self._infer_daily_rule() + else: + # Possibly intraday frequency + if not self.is_unique: + return None + if _is_multiple(delta, _ONE_HOUR): + # Hours + return _maybe_add_count('H', delta / _ONE_HOUR) + elif _is_multiple(delta, _ONE_MINUTE): + # Minutes + return _maybe_add_count('T', delta / _ONE_MINUTE) + elif _is_multiple(delta, _ONE_SECOND): + # Seconds + return _maybe_add_count('S', delta / _ONE_SECOND) + elif _is_multiple(delta, _ONE_MILLI): + # Milliseconds + return _maybe_add_count('L', delta / _ONE_MILLI) + elif _is_multiple(delta, _ONE_MICRO): + # Microseconds + return _maybe_add_count('U', delta / _ONE_MICRO) + else: + # Nanoseconds + return _maybe_add_count('N', delta) + + @cache_readonly + def day_deltas(self): + return [x / _ONE_DAY for x in 
self.deltas] + + @cache_readonly + def fields(self): + return tslib.build_field_sarray(self.values) + + @cache_readonly + def rep_stamp(self): + return lib.Timestamp(self.values[0]) + + def month_position_check(self): + # TODO: cythonize this, very slow + calendar_end = True + business_end = True + calendar_start = True + business_start = True + + years = self.fields['Y'] + months = self.fields['M'] + days = self.fields['D'] + weekdays = self.index.dayofweek + + from calendar import monthrange + for y, m, d, wd in zip(years, months, days, weekdays): + wd = datetime(y, m, d).weekday() + + if calendar_start: + calendar_start &= d == 1 + if business_start: + business_start &= d == 1 or (d <= 3 and wd == 0) + + if calendar_end or business_end: + _, daysinmonth = monthrange(y, m) + cal = d == daysinmonth + if calendar_end: + calendar_end &= cal + if business_end: + business_end &= cal or (daysinmonth - d < 3 and wd == 4) + elif not calendar_start and not business_start: + break + + if calendar_end: + return 'ce' + elif business_end: + return 'be' + elif calendar_start: + return 'cs' + elif business_start: + return 'bs' + else: + return None + + @cache_readonly + def mdiffs(self): + nmonths = self.fields['Y'] * 12 + self.fields['M'] + return tslib.unique_deltas(nmonths.astype('i8')) + + @cache_readonly + def ydiffs(self): + return tslib.unique_deltas(self.fields['Y'].astype('i8')) + + def _infer_daily_rule(self): + annual_rule = self._get_annual_rule() + if annual_rule: + nyears = self.ydiffs[0] + month = _month_aliases[self.rep_stamp.month] + return _maybe_add_count('%s-%s' % (annual_rule, month), nyears) + + quarterly_rule = self._get_quarterly_rule() + if quarterly_rule: + nquarters = self.mdiffs[0] / 3 + mod_dict = {0: 12, 2: 11, 1: 10} + month = _month_aliases[mod_dict[self.rep_stamp.month % 3]] + return _maybe_add_count('%s-%s' % (quarterly_rule, month), + nquarters) + + monthly_rule = self._get_monthly_rule() + if monthly_rule: + return monthly_rule + + if self.is_unique: + days = self.deltas[0] / _ONE_DAY + if days % 7 == 0: + # Weekly + alias = _weekday_rule_aliases[self.rep_stamp.weekday()] + return _maybe_add_count('W-%s' % alias, days / 7) + else: + return _maybe_add_count('D', days) + + # Business daily. 
Maybe + if self.day_deltas == [1, 3]: + return 'B' + + wom_rule = self._get_wom_rule() + if wom_rule: + return wom_rule + + def _get_annual_rule(self): + if len(self.ydiffs) > 1: + return None + + if len(algos.unique(self.fields['M'])) > 1: + return None + + pos_check = self.month_position_check() + return {'cs': 'AS', 'bs': 'BAS', + 'ce': 'A', 'be': 'BA'}.get(pos_check) + + def _get_quarterly_rule(self): + if len(self.mdiffs) > 1: + return None + + if not self.mdiffs[0] % 3 == 0: + return None + + pos_check = self.month_position_check() + return {'cs': 'QS', 'bs': 'BQS', + 'ce': 'Q', 'be': 'BQ'}.get(pos_check) + + def _get_monthly_rule(self): + if len(self.mdiffs) > 1: + return None + pos_check = self.month_position_check() + return {'cs': 'MS', 'bs': 'BMS', + 'ce': 'M', 'be': 'BM'}.get(pos_check) + + def _get_wom_rule(self): +# wdiffs = unique(np.diff(self.index.week)) + #We also need -47, -49, -48 to catch index spanning year boundary +# if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): +# return None + + weekdays = unique(self.index.weekday) + if len(weekdays) > 1: + return None + + week_of_months = unique((self.index.day - 1) // 7) + if len(week_of_months) > 1: + return None + + # get which week + week = week_of_months[0] + 1 + wd = _weekday_rule_aliases[weekdays[0]] + + return 'WOM-%d%s' % (week, wd) + +import pandas.core.algorithms as algos + + +def _maybe_add_count(base, count): + if count > 1: + return '%d%s' % (count, base) + else: + return base + + +def is_subperiod(source, target): + """ + Returns True if downsampling is possible between source and target + frequencies + + Parameters + ---------- + source : string + Frequency converting from + target : string + Frequency converting to + + Returns + ------- + is_subperiod : boolean + """ + if isinstance(source, offsets.DateOffset): + source = source.rule_code + + if isinstance(target, offsets.DateOffset): + target = target.rule_code + + target = target.upper() + source = source.upper() + if _is_annual(target): + if _is_quarterly(source): + return _quarter_months_conform(_get_rule_month(source), + _get_rule_month(target)) + return source in ['D', 'C', 'B', 'M', 'H', 'T', 'S'] + elif _is_quarterly(target): + return source in ['D', 'C', 'B', 'M', 'H', 'T', 'S'] + elif target == 'M': + return source in ['D', 'C', 'B', 'H', 'T', 'S'] + elif _is_weekly(target): + return source in [target, 'D', 'C', 'B', 'H', 'T', 'S'] + elif target == 'B': + return source in ['B', 'H', 'T', 'S'] + elif target == 'C': + return source in ['C', 'H', 'T', 'S'] + elif target == 'D': + return source in ['D', 'H', 'T', 'S'] + elif target == 'H': + return source in ['H', 'T', 'S'] + elif target == 'T': + return source in ['T', 'S'] + elif target == 'S': + return source in ['S'] + + +def is_superperiod(source, target): + """ + Returns True if upsampling is possible between source and target + frequencies + + Parameters + ---------- + source : string + Frequency converting from + target : string + Frequency converting to + + Returns + ------- + is_superperiod : boolean + """ + if isinstance(source, offsets.DateOffset): + source = source.rule_code + + if isinstance(target, offsets.DateOffset): + target = target.rule_code + + target = target.upper() + source = source.upper() + if _is_annual(source): + if _is_annual(target): + return _get_rule_month(source) == _get_rule_month(target) + + if _is_quarterly(target): + smonth = _get_rule_month(source) + tmonth = _get_rule_month(target) + return _quarter_months_conform(smonth, tmonth) + return target in 
['D', 'C', 'B', 'M', 'H', 'T', 'S'] + elif _is_quarterly(source): + return target in ['D', 'C', 'B', 'M', 'H', 'T', 'S'] + elif source == 'M': + return target in ['D', 'C', 'B', 'H', 'T', 'S'] + elif _is_weekly(source): + return target in [source, 'D', 'C', 'B', 'H', 'T', 'S'] + elif source == 'B': + return target in ['D', 'C', 'B', 'H', 'T', 'S'] + elif source == 'C': + return target in ['D', 'C', 'B', 'H', 'T', 'S'] + elif source == 'D': + return target in ['D', 'C', 'B', 'H', 'T', 'S'] + elif source == 'H': + return target in ['H', 'T', 'S'] + elif source == 'T': + return target in ['T', 'S'] + elif source == 'S': + return target in ['S'] + + +def _get_rule_month(source, default='DEC'): + source = source.upper() + if '-' not in source: + return default + else: + return source.split('-')[1] + + +def _is_annual(rule): + rule = rule.upper() + return rule == 'A' or rule.startswith('A-') + + +def _quarter_months_conform(source, target): + snum = _month_numbers[source] + tnum = _month_numbers[target] + return snum % 3 == tnum % 3 + + +def _is_quarterly(rule): + rule = rule.upper() + return rule == 'Q' or rule.startswith('Q-') + + +def _is_weekly(rule): + rule = rule.upper() + return rule == 'W' or rule.startswith('W-') + + +DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] + +MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] + +_month_numbers = dict((k, i) for i, k in enumerate(MONTHS)) + + +_weekday_rule_aliases = dict((k, v) for k, v in enumerate(DAYS)) +_month_aliases = dict((k + 1, v) for k, v in enumerate(MONTHS)) + + +def _is_multiple(us, mult): + return us % mult == 0 diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py new file mode 100644 index 00000000..6291be34 --- /dev/null +++ b/pandas/tseries/holiday.py @@ -0,0 +1,355 @@ +from pandas import DateOffset, DatetimeIndex, Series, Timestamp +from pandas.compat import add_metaclass +from datetime import datetime, timedelta +from dateutil.relativedelta import MO, TU, WE, TH, FR, SA, SU + +def next_monday(dt): + """ + If holiday falls on Saturday, use following Monday instead; + if holiday falls on Sunday, use Monday instead + """ + if dt.weekday() == 5: + return dt + timedelta(2) + elif dt.weekday() == 6: + return dt + timedelta(1) + return dt + +def next_monday_or_tuesday(dt): + """ + For second holiday of two adjacent ones! + If holiday falls on Saturday, use following Monday instead; + if holiday falls on Sunday or Monday, use following Tuesday instead + (because Monday is already taken by adjacent holiday on the day before) + """ + dow = dt.weekday() + if dow == 5 or dow == 6: + return dt + timedelta(2) + elif dow == 0: + return dt + timedelta(1) + return dt + +def previous_friday(dt): + """ + If holiday falls on Saturday or Sunday, use previous Friday instead. + """ + if dt.weekday() == 5: + return dt - timedelta(1) + elif dt.weekday() == 6: + return dt - timedelta(2) + return dt + +def sunday_to_monday(dt): + """ + If holiday falls on Sunday, use day thereafter (Monday) instead. + """ + if dt.weekday() == 6: + return dt + timedelta(1) + return dt + +def nearest_workday(dt): + """ + If holiday falls on Saturday, use day before (Friday) instead; + if holiday falls on Sunday, use day thereafter (Monday) instead. + """ + if dt.weekday() == 5: + return dt - timedelta(1) + elif dt.weekday() == 6: + return dt + timedelta(1) + return dt + +class Holiday(object): + """ + Class that defines a holiday with start/end dates and rules + for observance. 
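+
+    Examples
+    --------
+    The calendar rules declared at the bottom of this module are built this
+    way, e.g. a rule anchored by an offset and one using an observance:
+
+    >>> from pandas import DateOffset
+    >>> from dateutil.relativedelta import MO
+    >>> USMemorialDay = Holiday('MemorialDay', month=5, day=24,
+    ...                         offset=DateOffset(weekday=MO(1)))
+    >>> July4th = Holiday('July 4th', month=7, day=4,
+    ...                   observance=nearest_workday)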
+ """ + def __init__(self, name, year=None, month=None, day=None, offset=None, + observance=None, start_date=None, end_date=None): + self.name = name + self.year = year + self.month = month + self.day = day + self.offset = offset + self.start_date = start_date + self.end_date = end_date + self.observance = observance + + def __repr__(self): + info = '' + if self.year is not None: + info += 'year=%s, ' % self.year + info += 'month=%s, day=%s, ' % (self.month, self.day) + + if self.offset is not None: + info += 'offset=%s' % self.offset + + if self.observance is not None: + info += 'observance=%s' % self.observance + + repr = 'Holiday: %s (%s)' % (self.name, info) + return repr + + def dates(self, start_date, end_date, return_name=False): + """ + Calculate holidays between start date and end date + + Parameters + ---------- + start_date : starting date, datetime-like, optional + end_date : ending date, datetime-like, optional + return_name : bool, optional, default=False + If True, return a series that has dates and holiday names. + False will only return dates. + """ + if self.year is not None: + dt = Timestamp(datetime(self.year, self.month, self.day)) + if return_name: + return Series(self.name, index=[dt]) + else: + return [dt] + + if self.start_date is not None: + start_date = self.start_date + + if self.end_date is not None: + end_date = self.end_date + + start_date = Timestamp(start_date) + end_date = Timestamp(end_date) + + year_offset = DateOffset(years=1) + base_date = Timestamp(datetime(start_date.year, self.month, self.day)) + dates = DatetimeIndex(start=base_date, end=end_date, freq=year_offset) + holiday_dates = list(self._apply_rule(dates)) + + if return_name: + return Series(self.name, index=holiday_dates) + + return holiday_dates + + def _apply_rule(self, dates): + """ + Apply the given offset/observance to an + iterable of dates. + + Parameters + ---------- + dates : array-like + Dates to apply the given offset/observance rule + + Returns + ------- + Dates with rules applied + """ + if self.observance is not None: + return map(lambda d: self.observance(d), dates) + + if not isinstance(self.offset, list): + offsets = [self.offset] + else: + offsets = self.offset + + for offset in offsets: + dates = map(lambda d: d + offset, dates) + + return dates + +holiday_calendars = {} +def register(cls): + try: + name = cls.name + except: + name = cls.__name__ + holiday_calendars[name] = cls + +def get_calendar(name): + """ + Return an instance of a calendar based on its name. + + Parameters + ---------- + name : str + Calendar name to return an instance of + """ + return holiday_calendars[name]() + +class HolidayCalendarMetaClass(type): + def __new__(cls, clsname, bases, attrs): + calendar_class = super(HolidayCalendarMetaClass, cls).__new__(cls, clsname, bases, attrs) + register(calendar_class) + return calendar_class + +@add_metaclass(HolidayCalendarMetaClass) +class AbstractHolidayCalendar(object): + """ + Abstract interface to create holidays following certain rules. + """ + __metaclass__ = HolidayCalendarMetaClass + rules = [] + start_date = Timestamp(datetime(1970, 1, 1)) + end_date = Timestamp(datetime(2030, 12, 31)) + _holiday_cache = None + + def __init__(self, name=None, rules=None): + """ + Initializes holiday object with a given set a rules. Normally + classes just have the rules defined within them. + + Parameters + ---------- + name : str + Name of the holiday calendar, defaults to class name + rules : array of Holiday objects + A set of rules used to create the holidays. 
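+
+        Examples
+        --------
+        Subclasses normally just declare rules (see USFederalHolidayCalendar
+        at the bottom of this module), but an ad-hoc calendar can also be
+        built directly; the name here is only illustrative:
+
+        >>> cal = AbstractHolidayCalendar(name='Sketch',
+        ...                               rules=[USMemorialDay])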
+ """ + super(AbstractHolidayCalendar, self).__init__() + if name is None: + name = self.__class__.__name__ + self.name = name + + if rules is not None: + self.rules = rules + + def holidays(self, start=None, end=None, return_name=False): + """ + Returns a curve with holidays between start_date and end_date + + Parameters + ---------- + start : starting date, datetime-like, optional + end : ending date, datetime-like, optional + return_names : bool, optional + If True, return a series that has dates and holiday names. + False will only return a DatetimeIndex of dates. + + Returns + ------- + DatetimeIndex of holidays + """ + if self.rules is None: + raise Exception('Holiday Calendar %s does not have any '\ + 'rules specified' % self.name) + + if start is None: + start = AbstractHolidayCalendar.start_date + + if end is None: + end = AbstractHolidayCalendar.end_date + + start = Timestamp(start) + end = Timestamp(end) + + holidays = None + # If we don't have a cache or the dates are outside the prior cache, we get them again + if self._cache is None or start < self._cache[0] or end > self._cache[1]: + for rule in self.rules: + rule_holidays = rule.dates(start, end, return_name=True) + + if holidays is None: + holidays = rule_holidays + else: + holidays = holidays.append(rule_holidays) + + self._cache = (start, end, holidays.sort_index()) + + holidays = self._cache[2] + holidays = holidays[start:end] + + if return_name: + return holidays + else: + return holidays.index + + @property + def _cache(self): + return self.__class__._holiday_cache + + @_cache.setter + def _cache(self, values): + self.__class__._holiday_cache = values + + @staticmethod + def merge_class(base, other): + """ + Merge holiday calendars together. The base calendar + will take precedence to other. The merge will be done + based on each holiday's name. + + Parameters + ---------- + base : AbstractHolidayCalendar instance/subclass or array of Holiday objects + other : AbstractHolidayCalendar instance/subclass or array of Holiday objects + """ + try: + other = other.rules + except: + pass + + if not isinstance(other, list): + other = [other] + other_holidays = dict((holiday.name, holiday) for holiday in other) + + try: + base = base.rules + except: + pass + + if not isinstance(base, list): + base = [base] + base_holidays = dict([ (holiday.name,holiday) for holiday in base ]) + + other_holidays.update(base_holidays) + return list(other_holidays.values()) + + def merge(self, other, inplace=False): + """ + Merge holiday calendars together. The caller's class + rules take precedence. The merge will be done + based on each holiday's name. + + Parameters + ---------- + other : holiday calendar + inplace : bool (default=False) + If True set rule_table to holidays, else return array of Holidays + """ + holidays = self.merge_class(self, other) + if inplace: + self.rules = holidays + else: + return holidays + +USMemorialDay = Holiday('MemorialDay', month=5, day=24, + offset=DateOffset(weekday=MO(1))) +USLaborDay = Holiday('Labor Day', month=9, day=1, + offset=DateOffset(weekday=MO(1))) +USColumbusDay = Holiday('Columbus Day', month=10, day=1, + offset=DateOffset(weekday=MO(2))) +USThanksgivingDay = Holiday('Thanksgiving', month=11, day=1, + offset=DateOffset(weekday=TH(4))) +USMartinLutherKingJr = Holiday('Dr. 
Martin Luther King Jr.', month=1, day=1, + offset=DateOffset(weekday=MO(3))) +USPresidentsDay = Holiday('President''s Day', month=2, day=1, + offset=DateOffset(weekday=MO(3))) + +class USFederalHolidayCalendar(AbstractHolidayCalendar): + """ + US Federal Government Holiday Calendar based on rules specified + by: https://www.opm.gov/policy-data-oversight/snow-dismissal-procedures/federal-holidays/ + """ + rules = [ + Holiday('New Years Day', month=1, day=1, observance=nearest_workday), + USMartinLutherKingJr, + USPresidentsDay, + USMemorialDay, + Holiday('July 4th', month=7, day=4, observance=nearest_workday), + USLaborDay, + USColumbusDay, + Holiday('Veterans Day', month=11, day=11, observance=nearest_workday), + USThanksgivingDay, + Holiday('Christmas', month=12, day=25, observance=nearest_workday) + ] + +def HolidayCalendarFactory(name, base, other, base_class=AbstractHolidayCalendar): + rules = AbstractHolidayCalendar.merge_class(base, other) + calendar_class = type(name, (base_class,), {"rules": rules, "name": name}) + return calendar_class diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py new file mode 100644 index 00000000..de758c4c --- /dev/null +++ b/pandas/tseries/index.py @@ -0,0 +1,2071 @@ +# pylint: disable=E1101 +import operator + +from datetime import time, datetime +from datetime import timedelta + +import numpy as np + +from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, + _values_from_object, _maybe_box, + ABCSeries) +from pandas.core.index import Index, Int64Index, Float64Index +import pandas.compat as compat +from pandas.compat import u +from pandas.tseries.frequencies import ( + infer_freq, to_offset, get_period_alias, + Resolution, get_reso_string, _tz_convert_with_transitions) +from pandas.core.base import DatetimeIndexOpsMixin +from pandas.tseries.offsets import DateOffset, generate_range, Tick, CDay +from pandas.tseries.tools import parse_time_string, normalize_date +from pandas.util.decorators import cache_readonly +import pandas.core.common as com +import pandas.tseries.offsets as offsets +import pandas.tseries.tools as tools + +from pandas.lib import Timestamp +import pandas.lib as lib +import pandas.tslib as tslib +import pandas.algos as _algos +import pandas.index as _index + + +def _utc(): + import pytz + return pytz.utc + +# -------- some conversion wrapper functions + + +def _field_accessor(name, field, docstring=None): + def f(self): + values = self.asi8 + if self.tz is not None: + utc = _utc() + if self.tz is not utc: + values = self._local_timestamps() + if field in ['is_month_start', 'is_month_end', + 'is_quarter_start', 'is_quarter_end', + 'is_year_start', 'is_year_end']: + month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12 + return tslib.get_start_end_field(values, field, self.freqstr, month_kw) + else: + return tslib.get_date_field(values, field) + f.__name__ = name + f.__doc__ = docstring + return property(f) + + +def _join_i8_wrapper(joinf, with_indexers=True): + @staticmethod + def wrapper(left, right): + if isinstance(left, (np.ndarray, ABCSeries)): + left = left.view('i8', type=np.ndarray) + if isinstance(right, (np.ndarray, ABCSeries)): + right = right.view('i8', type=np.ndarray) + results = joinf(left, right) + if with_indexers: + join_index, left_indexer, right_indexer = results + join_index = join_index.view('M8[ns]') + return join_index, left_indexer, right_indexer + return results + return wrapper + + +def _dt_index_cmp(opname, nat_result=False): + """ + Wrap comparison 
operations to convert datetime-like to datetime64 + """ + def wrapper(self, other): + func = getattr(super(DatetimeIndex, self), opname) + if isinstance(other, datetime) or isinstance(other, compat.string_types): + other = _to_m8(other, tz=self.tz) + result = func(other) + if com.isnull(other): + result.fill(nat_result) + else: + if isinstance(other, list): + other = DatetimeIndex(other) + elif not isinstance(other, (np.ndarray, ABCSeries)): + other = _ensure_datetime64(other) + result = func(other) + + if isinstance(other, Index): + o_mask = other.values.view('i8') == tslib.iNaT + else: + o_mask = other.view('i8') == tslib.iNaT + + if o_mask.any(): + result[o_mask] = nat_result + + mask = self.asi8 == tslib.iNaT + if mask.any(): + result[mask] = nat_result + return result.view(np.ndarray) + + return wrapper + + +def _ensure_datetime64(other): + if isinstance(other, np.datetime64): + return other + raise TypeError('%s type object %s' % (type(other), str(other))) + + +_midnight = time(0, 0) + +class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index): + """ + Immutable ndarray of datetime64 data, represented internally as int64, and + which can be boxed to Timestamp objects that are subclasses of datetime and + carry metadata such as frequency information. + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional datetime-like data to construct index with + copy : bool + Make a copy of input ndarray + freq : string or pandas offset object, optional + One of pandas date offset strings or corresponding objects + start : starting value, datetime-like, optional + If data is None, start is used as the start point in generating regular + timestamp data. + periods : int, optional, > 0 + Number of periods to generate, if generating index. 
Takes precedence + over end argument + end : end time, datetime-like, optional + If periods is none, generated index will extend to first conforming + time on or just past end argument + closed : string or None, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None) + name : object + Name to be stored in the index + """ + _join_precedence = 10 + + _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _left_indexer_unique = _join_i8_wrapper( + _algos.left_join_indexer_unique_int64, with_indexers=False) + _arrmap = None + + __eq__ = _dt_index_cmp('__eq__') + __ne__ = _dt_index_cmp('__ne__', nat_result=True) + __lt__ = _dt_index_cmp('__lt__') + __gt__ = _dt_index_cmp('__gt__') + __le__ = _dt_index_cmp('__le__') + __ge__ = _dt_index_cmp('__ge__') + + # structured array cache for datetime fields + _sarr_cache = None + + _engine_type = _index.DatetimeEngine + + tz = None + offset = None + _comparables = ['name','freqstr','tz'] + _allow_datetime_index_ops = True + + def __new__(cls, data=None, + freq=None, start=None, end=None, periods=None, + copy=False, name=None, tz=None, + verify_integrity=True, normalize=False, + closed=None, **kwds): + + dayfirst = kwds.pop('dayfirst', None) + yearfirst = kwds.pop('yearfirst', None) + infer_dst = kwds.pop('infer_dst', False) + + freq_infer = False + if not isinstance(freq, DateOffset): + + # if a passed freq is None, don't infer automatically + if freq != 'infer': + freq = to_offset(freq) + else: + freq_infer = True + freq = None + + if periods is not None: + if com.is_float(periods): + periods = int(periods) + elif not com.is_integer(periods): + raise ValueError('Periods must be a number, got %s' % + str(periods)) + + if data is None and freq is None: + raise ValueError("Must provide freq argument if no data is " + "supplied") + + if data is None: + return cls._generate(start, end, periods, name, freq, + tz=tz, normalize=normalize, closed=closed, + infer_dst=infer_dst) + + if not isinstance(data, (np.ndarray, ABCSeries)): + if np.isscalar(data): + raise ValueError('DatetimeIndex() must be called with a ' + 'collection of some kind, %s was passed' + % repr(data)) + + # other iterable of some kind + if not isinstance(data, (list, tuple)): + data = list(data) + + data = np.asarray(data, dtype='O') + + # try a few ways to make it datetime64 + if lib.is_string_array(data): + data = _str_to_dt_array(data, freq, dayfirst=dayfirst, + yearfirst=yearfirst) + else: + data = tools.to_datetime(data, errors='raise') + data.offset = freq + if isinstance(data, DatetimeIndex): + if name is not None: + data.name = name + + if tz is not None: + return data.tz_localize(tz, infer_dst=infer_dst) + + return data + + if issubclass(data.dtype.type, compat.string_types): + data = _str_to_dt_array(data, freq, dayfirst=dayfirst, + yearfirst=yearfirst) + + if issubclass(data.dtype.type, np.datetime64): + if isinstance(data, ABCSeries): + data = data.values + if isinstance(data, DatetimeIndex): + if tz is None: + tz = data.tz + + subarr = data.values + + if freq is None: + freq = data.offset + verify_integrity = False + else: + if data.dtype != _NS_DTYPE: + subarr = tslib.cast_to_nanoseconds(data) + else: + subarr = data + elif data.dtype == _INT64_DTYPE: + if isinstance(data, Int64Index): + raise TypeError('cannot convert Int64Index->DatetimeIndex') + if copy: + subarr 
= np.asarray(data, dtype=_NS_DTYPE) + else: + subarr = data.view(_NS_DTYPE) + else: + if isinstance(data, ABCSeries): + values = data.values + else: + values = data + + if lib.is_string_array(values): + subarr = _str_to_dt_array(values, freq, dayfirst=dayfirst, + yearfirst=yearfirst) + else: + try: + subarr = tools.to_datetime(data, box=False) + + # make sure that we have a index/ndarray like (and not a Series) + if isinstance(subarr, ABCSeries): + subarr = subarr.values + + except ValueError: + # tz aware + subarr = tools.to_datetime(data, box=False, utc=True) + + if not np.issubdtype(subarr.dtype, np.datetime64): + raise ValueError('Unable to convert %s to datetime dtype' + % str(data)) + + if isinstance(subarr, DatetimeIndex): + if tz is None: + tz = subarr.tz + else: + if tz is not None: + tz = tools._maybe_get_tz(tz) + + if (not isinstance(data, DatetimeIndex) or + getattr(data, 'tz', None) is None): + # Convert tz-naive to UTC + ints = subarr.view('i8') + subarr = tslib.tz_localize_to_utc(ints, tz, + infer_dst=infer_dst) + + subarr = subarr.view(_NS_DTYPE) + + subarr = subarr.view(cls) + subarr.name = name + subarr.offset = freq + subarr.tz = tz + + if verify_integrity and len(subarr) > 0: + if freq is not None and not freq_infer: + inferred = subarr.inferred_freq + if inferred != freq.freqstr: + on_freq = cls._generate(subarr[0], None, len(subarr), None, freq, tz=tz) + if not np.array_equal(subarr.asi8, on_freq.asi8): + raise ValueError('Inferred frequency {0} from passed dates does not' + 'conform to passed frequency {1}'.format(inferred, freq.freqstr)) + + if freq_infer: + inferred = subarr.inferred_freq + if inferred: + subarr.offset = to_offset(inferred) + + return subarr + + @classmethod + def _generate(cls, start, end, periods, name, offset, + tz=None, normalize=False, infer_dst=False, closed=None): + if com._count_not_none(start, end, periods) != 2: + raise ValueError('Must specify two of start, end, or periods') + + _normalized = True + + if start is not None: + start = Timestamp(start) + + if end is not None: + end = Timestamp(end) + + left_closed = False + right_closed = False + + if start is None and end is None: + if closed is not None: + raise ValueError("Closed has to be None if not both of start" + "and end are defined") + + if closed is None: + left_closed = True + right_closed = True + elif closed == "left": + left_closed = True + elif closed == "right": + right_closed = True + else: + raise ValueError("Closed has to be either 'left', 'right' or None") + + try: + inferred_tz = tools._infer_tzinfo(start, end) + except: + raise ValueError('Start and end cannot both be tz-aware with ' + 'different timezones') + + inferred_tz = tools._maybe_get_tz(inferred_tz) + + # these may need to be localized + tz = tools._maybe_get_tz(tz, start or end) + + if tz is not None and inferred_tz is not None: + if not inferred_tz == tz: + raise AssertionError("Inferred time zone not equal to passed " + "time zone") + + elif inferred_tz is not None: + tz = inferred_tz + + if start is not None: + if normalize: + start = normalize_date(start) + _normalized = True + else: + _normalized = _normalized and start.time() == _midnight + + if end is not None: + if normalize: + end = normalize_date(end) + _normalized = True + else: + _normalized = _normalized and end.time() == _midnight + + if hasattr(offset, 'delta') and offset != offsets.Day(): + if inferred_tz is None and tz is not None: + # naive dates + if start is not None and start.tz is None: + start = start.tz_localize(tz) + + if end is not 
None and end.tz is None: + end = end.tz_localize(tz) + + if start and end: + if start.tz is None and end.tz is not None: + start = start.tz_localize(end.tz) + + if end.tz is None and start.tz is not None: + end = end.tz_localize(start.tz) + + if _use_cached_range(offset, _normalized, start, end): + index = cls._cached_range(start, end, periods=periods, + offset=offset, name=name) + else: + index = _generate_regular_range(start, end, periods, offset) + + else: + + if inferred_tz is None and tz is not None: + # naive dates + if start is not None and start.tz is not None: + start = start.replace(tzinfo=None) + + if end is not None and end.tz is not None: + end = end.replace(tzinfo=None) + + if start and end: + if start.tz is None and end.tz is not None: + end = end.replace(tzinfo=None) + + if end.tz is None and start.tz is not None: + start = start.replace(tzinfo=None) + + if _use_cached_range(offset, _normalized, start, end): + index = cls._cached_range(start, end, periods=periods, + offset=offset, name=name) + else: + index = _generate_regular_range(start, end, periods, offset) + + if tz is not None and getattr(index, 'tz', None) is None: + index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz, + infer_dst=infer_dst) + index = index.view(_NS_DTYPE) + + index = index.view(cls) + index.name = name + index.offset = offset + index.tz = tz + + if not left_closed: + index = index[1:] + if not right_closed: + index = index[:-1] + + return index + + @property + def _box_func(self): + return lambda x: Timestamp(x, offset=self.offset, tz=self.tz) + + def _local_timestamps(self): + utc = _utc() + + if self.is_monotonic: + return tslib.tz_convert(self.asi8, utc, self.tz) + else: + values = self.asi8 + indexer = values.argsort() + result = tslib.tz_convert(values.take(indexer), utc, self.tz) + + n = len(indexer) + reverse = np.empty(n, dtype=np.int_) + reverse.put(indexer, np.arange(n)) + return result.take(reverse) + + @classmethod + def _simple_new(cls, values, name, freq=None, tz=None): + if values.dtype != _NS_DTYPE: + values = com._ensure_int64(values).view(_NS_DTYPE) + + result = values.view(cls) + result.name = name + result.offset = freq + result.tz = tools._maybe_get_tz(tz) + + return result + + @property + def tzinfo(self): + """ + Alias for tz attribute + """ + return self.tz + + @classmethod + def _cached_range(cls, start=None, end=None, periods=None, offset=None, + name=None): + if start is None and end is None: + # I somewhat believe this should never be raised externally and therefore + # should be a `PandasError` but whatever... 
+ raise TypeError('Must specify either start or end.') + if start is not None: + start = Timestamp(start) + if end is not None: + end = Timestamp(end) + if (start is None or end is None) and periods is None: + raise TypeError('Must either specify period or provide both start and end.') + + if offset is None: + # This can't happen with external-facing code, therefore PandasError + raise TypeError('Must provide offset.') + + drc = _daterange_cache + if offset not in _daterange_cache: + xdr = generate_range(offset=offset, start=_CACHE_START, + end=_CACHE_END) + + arr = tools.to_datetime(list(xdr), box=False) + + cachedRange = arr.view(DatetimeIndex) + cachedRange.offset = offset + cachedRange.tz = None + cachedRange.name = None + drc[offset] = cachedRange + else: + cachedRange = drc[offset] + + if start is None: + if not isinstance(end, Timestamp): + raise AssertionError('end must be an instance of Timestamp') + + end = offset.rollback(end) + + endLoc = cachedRange.get_loc(end) + 1 + startLoc = endLoc - periods + elif end is None: + if not isinstance(start, Timestamp): + raise AssertionError('start must be an instance of Timestamp') + + start = offset.rollforward(start) + + startLoc = cachedRange.get_loc(start) + endLoc = startLoc + periods + else: + if not offset.onOffset(start): + start = offset.rollforward(start) + + if not offset.onOffset(end): + end = offset.rollback(end) + + startLoc = cachedRange.get_loc(start) + endLoc = cachedRange.get_loc(end) + 1 + + indexSlice = cachedRange[startLoc:endLoc] + indexSlice.name = name + indexSlice.offset = offset + + return indexSlice + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return tslib.ints_to_pydatetime(self.asi8, self.tz) + + _na_value = tslib.NaT + """The expected NA value to use with this index.""" + + @cache_readonly + def _is_dates_only(self): + from pandas.core.format import _is_dates_only + return _is_dates_only(self.values) + + @property + def _formatter_func(self): + from pandas.core.format import _get_format_datetime64 + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: formatter(x, tz=self.tz) + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(np.ndarray.__reduce__(self)) + subclass_state = self.name, self.offset, self.tz + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if len(state) == 2: + nd_state, own_state = state + self.name = own_state[0] + self.offset = own_state[1] + self.tz = own_state[2] + np.ndarray.__setstate__(self, nd_state) + + # provide numpy < 1.7 compat + if nd_state[2] == 'M8[us]': + new_state = np.ndarray.__reduce__(self.values.astype('M8[ns]')) + np.ndarray.__setstate__(self, new_state[2]) + + else: # pragma: no cover + np.ndarray.__setstate__(self, state) + + def __add__(self, other): + if isinstance(other, Index): + return self.union(other) + elif isinstance(other, (DateOffset, timedelta)): + return self._add_delta(other) + elif isinstance(other, np.timedelta64): + return self._add_delta(other) + elif com.is_integer(other): + return self.shift(other) + else: # pragma: no cover + raise TypeError(other) + + def __sub__(self, other): + if isinstance(other, Index): + return self.diff(other) + elif isinstance(other, (DateOffset, timedelta)): + return self._add_delta(-other) + elif isinstance(other, np.timedelta64): + return self._add_delta(-other) + elif com.is_integer(other): + 
return self.shift(-other) + else: # pragma: no cover + raise TypeError(other) + + def _add_delta(self, delta): + if isinstance(delta, (Tick, timedelta)): + inc = offsets._delta_to_nanoseconds(delta) + mask = self.asi8 == tslib.iNaT + new_values = (self.asi8 + inc).view(_NS_DTYPE) + new_values[mask] = tslib.iNaT + new_values = new_values.view(_NS_DTYPE) + elif isinstance(delta, np.timedelta64): + new_values = self.to_series() + delta + else: + new_values = self.astype('O') + delta + tz = 'UTC' if self.tz is not None else None + result = DatetimeIndex(new_values, tz=tz, freq='infer') + utc = _utc() + if self.tz is not None and self.tz is not utc: + result = result.tz_convert(self.tz) + return result + + def __contains__(self, key): + try: + res = self.get_loc(key) + return np.isscalar(res) or type(res) == slice + except (KeyError, TypeError): + return False + + def _format_with_header(self, header, **kwargs): + return header + self._format_native_types(**kwargs) + + def _format_native_types(self, na_rep=u('NaT'), + date_format=None, **kwargs): + data = self.asobject + from pandas.core.format import Datetime64Formatter + return Datetime64Formatter(values=data, + nat_rep=na_rep, + date_format=date_format, + justify='all').get_result() + + def isin(self, values): + """ + Compute boolean array of whether each index value is found in the + passed set of values + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + if not isinstance(values, DatetimeIndex): + try: + values = DatetimeIndex(values) + except ValueError: + return self.asobject.isin(values) + + value_set = set(values.asi8) + return lib.ismember(self.asi8, value_set) + + def to_datetime(self, dayfirst=False): + return self.copy() + + def groupby(self, f): + objs = self.asobject + return _algos.groupby_object(objs, f) + + def summary(self, name=None): + if len(self) > 0: + index_summary = ', %s to %s' % (com.pprint_thing(self[0]), + com.pprint_thing(self[-1])) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + result = '%s: %s entries%s' % (com.pprint_thing(name), + len(self), index_summary) + if self.freq: + result += '\nFreq: %s' % self.freqstr + + return result + + def get_duplicates(self): + values = Index.get_duplicates(self) + return DatetimeIndex(values) + + def astype(self, dtype): + dtype = np.dtype(dtype) + + if dtype == np.object_: + return self.asobject + elif dtype == _INT64_DTYPE: + return self.asi8.copy() + else: # pragma: no cover + raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) + + def _get_time_micros(self): + utc = _utc() + values = self.asi8 + if self.tz is not None and self.tz is not utc: + values = self._local_timestamps() + return tslib.get_time_micros(values) + + def to_series(self, keep_tz=False): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False. + return the data keeping the timezone. + + If keep_tz is True: + + If the timezone is not set or is UTC, the resulting + Series will have a datetime64[ns] dtype. + Otherwise the Series will have an object dtype. + + If keep_tz is False: + + Series will have a datetime64[ns] dtype. 
+ + Returns + ------- + Series + """ + return super(DatetimeIndex, self).to_series(keep_tz=keep_tz) + + def _to_embed(self, keep_tz=False): + """ return an array repr of this object, potentially casting to object """ + if keep_tz and self.tz is not None and str(self.tz) != 'UTC': + return self.asobject.values + return self.values + + def to_pydatetime(self): + """ + Return DatetimeIndex as object ndarray of datetime.datetime objects + + Returns + ------- + datetimes : ndarray + """ + return tslib.ints_to_pydatetime(self.asi8, tz=self.tz) + + def to_period(self, freq=None): + """ + Cast to PeriodIndex at a particular frequency + """ + from pandas.tseries.period import PeriodIndex + + if freq is None: + freq = self.freqstr or self.inferred_freq + + if freq is None: + msg = "You must pass a freq argument as current index has none." + raise ValueError(msg) + + freq = get_period_alias(freq) + + return PeriodIndex(self.values, name=self.name, freq=freq, tz=self.tz) + + def order(self, return_indexer=False, ascending=True): + """ + Return sorted copy of Index + """ + if return_indexer: + _as = self.argsort() + if not ascending: + _as = _as[::-1] + sorted_index = self.take(_as) + return sorted_index, _as + else: + sorted_values = np.sort(self.values) + if not ascending: + sorted_values = sorted_values[::-1] + return self._simple_new(sorted_values, self.name, None, + self.tz) + + def snap(self, freq='S'): + """ + Snap time stamps to nearest occurring frequency + + """ + # Superdumb, punting on any optimizing + freq = to_offset(freq) + + snapped = np.empty(len(self), dtype=_NS_DTYPE) + + for i, v in enumerate(self): + s = v + if not freq.onOffset(s): + t0 = freq.rollback(s) + t1 = freq.rollforward(s) + if abs(s - t0) < abs(t1 - s): + s = t0 + else: + s = t1 + snapped[i] = s + + # we know it conforms; skip check + return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + + def shift(self, n, freq=None): + """ + Specialized shift which produces a DatetimeIndex + + Parameters + ---------- + n : int + Periods to shift by + freq : DateOffset or timedelta-like, optional + + Returns + ------- + shifted : DatetimeIndex + """ + if freq is not None and freq != self.offset: + if isinstance(freq, compat.string_types): + freq = to_offset(freq) + result = Index.shift(self, n, freq) + result.tz = self.tz + + return result + + if n == 0: + # immutable so OK + return self + + if self.offset is None: + raise ValueError("Cannot shift with no offset") + + start = self[0] + n * self.offset + end = self[-1] + n * self.offset + return DatetimeIndex(start=start, end=end, freq=self.offset, + name=self.name, tz=self.tz) + + def repeat(self, repeats, axis=None): + """ + Analogous to ndarray.repeat + """ + return DatetimeIndex(self.values.repeat(repeats), + name=self.name) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take + """ + maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices)) + if isinstance(maybe_slice, slice): + return self[maybe_slice] + return super(DatetimeIndex, self).take(indices, axis) + + def unique(self): + """ + Index.unique with handling for DatetimeIndex metadata + + Returns + ------- + result : DatetimeIndex + """ + result = Int64Index.unique(self) + return DatetimeIndex._simple_new(result, tz=self.tz, + name=self.name) + + def union(self, other): + """ + Specialized union for DatetimeIndex objects. 
If combine + overlapping ranges with the same DateOffset, will be much + faster than Index.union + + Parameters + ---------- + other : DatetimeIndex or array-like + + Returns + ------- + y : Index or DatetimeIndex + """ + if not isinstance(other, DatetimeIndex): + try: + other = DatetimeIndex(other) + except TypeError: + pass + + this, other = self._maybe_utc_convert(other) + + if this._can_fast_union(other): + return this._fast_union(other) + else: + result = Index.union(this, other) + if isinstance(result, DatetimeIndex): + result.tz = this.tz + if result.freq is None: + result.offset = to_offset(result.inferred_freq) + return result + + def union_many(self, others): + """ + A bit of a hack to accelerate unioning a collection of indexes + """ + this = self + + for other in others: + if not isinstance(this, DatetimeIndex): + this = Index.union(this, other) + continue + + if not isinstance(other, DatetimeIndex): + try: + other = DatetimeIndex(other) + except TypeError: + pass + + this, other = this._maybe_utc_convert(other) + + if this._can_fast_union(other): + this = this._fast_union(other) + else: + tz = this.tz + this = Index.union(this, other) + if isinstance(this, DatetimeIndex): + this.tz = tz + + if this.freq is None: + this.offset = to_offset(this.inferred_freq) + return this + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + name = self.name + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) + + for obj in to_concat: + if isinstance(obj, Index) and obj.name != name: + name = None + break + + to_concat = self._ensure_compat_concat(to_concat) + to_concat, factory = _process_concat_data(to_concat, name) + + return factory(to_concat) + + def join(self, other, how='left', level=None, return_indexers=False): + """ + See Index.join + """ + if (not isinstance(other, DatetimeIndex) and len(other) > 0 and + other.inferred_type not in ('floating', 'mixed-integer', + 'mixed-integer-float', 'mixed')): + try: + other = DatetimeIndex(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join(this, other, how=how, level=level, + return_indexers=return_indexers) + + def _maybe_utc_convert(self, other): + this = self + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + elif other.tz is not None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + + if self.tz != other.tz: + this = self.tz_convert('UTC') + other = other.tz_convert('UTC') + return this, other + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + if (isinstance(other, DatetimeIndex) + and self.offset == other.offset + and self._can_fast_union(other)): + joined = self._view_like(joined) + joined.name = name + return joined + else: + tz = getattr(other, 'tz', None) + return self._simple_new(joined, name, tz=tz) + + def _can_fast_union(self, other): + if not isinstance(other, DatetimeIndex): + return False + + offset = self.offset + + if offset is None or offset != other.offset: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two 
ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + try: + return (right_start == left_end + offset) or right_start in left + except (ValueError): + + # if we are comparing an offset that does not propogate timezones + # this will raise + return False + + def _fast_union(self, other): + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + left_start, left_end = left[0], left[-1] + right_end = right[-1] + + if not self.offset._should_cache(): + # concatenate dates + if left_end < right_end: + loc = right.searchsorted(left_end, side='right') + right_chunk = right.values[loc:] + dates = com._concat_compat((left.values, right_chunk)) + return self._view_like(dates) + else: + return left + else: + return type(self)(start=left_start, + end=max(left_end, right_end), + freq=left.offset) + + def __array_finalize__(self, obj): + if self.ndim == 0: # pragma: no cover + return self.item() + + self.offset = getattr(obj, 'offset', None) + self.tz = getattr(obj, 'tz', None) + self.name = getattr(obj, 'name', None) + self._reset_identity() + + def _wrap_union_result(self, other, result): + name = self.name if self.name == other.name else None + if self.tz != other.tz: + raise ValueError('Passed item and index have different timezone') + return self._simple_new(result, name=name, freq=None, tz=self.tz) + + def intersection(self, other): + """ + Specialized intersection for DatetimeIndex objects. May be much faster + than Index.intersection + + Parameters + ---------- + other : DatetimeIndex or array-like + + Returns + ------- + y : Index or DatetimeIndex + """ + if not isinstance(other, DatetimeIndex): + try: + other = DatetimeIndex(other) + except (TypeError, ValueError): + pass + result = Index.intersection(self, other) + if isinstance(result, DatetimeIndex): + if result.freq is None: + result.offset = to_offset(result.inferred_freq) + return result + + elif (other.offset is None or self.offset is None or + other.offset != self.offset or + not other.offset.isAnchored() or + (not self.is_monotonic or not other.is_monotonic)): + result = Index.intersection(self, other) + if isinstance(result, DatetimeIndex): + if result.freq is None: + result.offset = to_offset(result.inferred_freq) + return result + + if len(self) == 0: + return self + if len(other) == 0: + return other + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._view_like(left_chunk) + + def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): + + is_monotonic = self.is_monotonic + + if reso == 'year': + t1 = Timestamp(datetime(parsed.year, 1, 1), tz=self.tz) + t2 = Timestamp(datetime(parsed.year, 12, 31, 23, 59, 59, 999999), tz=self.tz) + elif reso == 'month': + d = tslib.monthrange(parsed.year, parsed.month)[1] + t1 = Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz) + t2 = Timestamp(datetime(parsed.year, parsed.month, d, 23, 59, 59, 999999), tz=self.tz) + elif reso == 'quarter': + qe = 
(((parsed.month - 1) + 2) % 12) + 1 # two months ahead + d = tslib.monthrange(parsed.year, qe)[1] # at end of month + t1 = Timestamp(datetime(parsed.year, parsed.month, 1), tz=self.tz) + t2 = Timestamp(datetime(parsed.year, qe, d, 23, 59, 59, 999999), tz=self.tz) + elif (reso == 'day' and (self._resolution < Resolution.RESO_DAY or not is_monotonic)): + st = datetime(parsed.year, parsed.month, parsed.day) + t1 = Timestamp(st, tz=self.tz) + t2 = st + offsets.Day() + t2 = Timestamp(Timestamp(t2, tz=self.tz).value - 1) + elif (reso == 'hour' and ( + self._resolution < Resolution.RESO_HR or not is_monotonic)): + st = datetime(parsed.year, parsed.month, parsed.day, + hour=parsed.hour) + t1 = Timestamp(st, tz=self.tz) + t2 = Timestamp(Timestamp(st + offsets.Hour(), + tz=self.tz).value - 1) + elif (reso == 'minute' and ( + self._resolution < Resolution.RESO_MIN or not is_monotonic)): + st = datetime(parsed.year, parsed.month, parsed.day, + hour=parsed.hour, minute=parsed.minute) + t1 = Timestamp(st, tz=self.tz) + t2 = Timestamp(Timestamp(st + offsets.Minute(), + tz=self.tz).value - 1) + elif (reso == 'second' and ( + self._resolution == Resolution.RESO_SEC or not is_monotonic)): + st = datetime(parsed.year, parsed.month, parsed.day, + hour=parsed.hour, minute=parsed.minute, second=parsed.second) + t1 = Timestamp(st, tz=self.tz) + t2 = Timestamp(Timestamp(st + offsets.Second(), + tz=self.tz).value - 1) + else: + raise KeyError + + stamps = self.asi8 + + if is_monotonic: + + # we are out of range + if len(stamps) and ( + (use_lhs and t1.value < stamps[0] and t2.value < stamps[0]) or ( + (use_rhs and t1.value > stamps[-1] and t2.value > stamps[-1]))): + raise KeyError + + # a monotonic (sorted) series can be sliced + left = stamps.searchsorted(t1.value, side='left') if use_lhs else None + right = stamps.searchsorted(t2.value, side='right') if use_rhs else None + + return slice(left, right) + + lhs_mask = (stamps >= t1.value) if use_lhs else True + rhs_mask = (stamps <= t2.value) if use_rhs else True + + # try to find a the dates + return (lhs_mask & rhs_mask).nonzero()[0] + + def _possibly_promote(self, other): + if other.inferred_type == 'date': + other = DatetimeIndex(other) + return self, other + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. 
Only use this if you + know what you're doing + """ + + if isinstance(key, datetime): + + # needed to localize naive datetimes + if self.tz is not None: + key = Timestamp(key, tz=self.tz) + + return self.get_value_maybe_box(series, key) + + try: + return _maybe_box(self, Index.get_value(self, series, key), series, key) + except KeyError: + try: + loc = self._get_string_slice(key) + return series[loc] + except (TypeError, ValueError, KeyError): + pass + + if isinstance(key, time): + locs = self.indexer_at_time(key) + return series.take(locs) + + try: + return self.get_value_maybe_box(series, key) + except (TypeError, ValueError, KeyError): + raise KeyError(key) + + def get_value_maybe_box(self, series, key): + # needed to localize naive datetimes + if self.tz is not None: + key = Timestamp(key, tz=self.tz) + elif not isinstance(key, Timestamp): + key = Timestamp(key) + values = self._engine.get_value(_values_from_object(series), key) + return _maybe_box(self, values, series, key) + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + if isinstance(key, datetime): + # needed to localize naive datetimes + stamp = Timestamp(key, tz=self.tz) + return self._engine.get_loc(stamp) + + try: + return Index.get_loc(self, key) + except (KeyError, ValueError): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError): + pass + + if isinstance(key, time): + return self.indexer_at_time(key) + + try: + stamp = Timestamp(key, tz=self.tz) + return self._engine.get_loc(stamp) + except (KeyError, ValueError): + raise KeyError(key) + + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + freq = getattr(self, 'freqstr', + getattr(self, 'inferred_freq', None)) + _, parsed, reso = parse_time_string(key, freq) + loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, + use_rhs=use_rhs) + return loc + + def slice_indexer(self, start=None, end=None, step=None): + """ + Index.slice_indexer, customized to handle time slicing + """ + if isinstance(start, time) and isinstance(end, time): + if step is not None and step != 1: + raise ValueError('Must have step size of 1 with time slices') + return self.indexer_between_time(start, end) + + if isinstance(start, time) or isinstance(end, time): + raise KeyError('Cannot mix time and non-time slice keys') + + if isinstance(start, float) or isinstance(end, float): + raise TypeError('Cannot index datetime64 with float keys') + + return Index.slice_indexer(self, start, end, step) + + def slice_locs(self, start=None, end=None): + """ + Index.slice_locs, customized to handle partial ISO-8601 string slicing + """ + if isinstance(start, compat.string_types) or isinstance(end, compat.string_types): + + if self.is_monotonic: + try: + if start: + start_loc = self._get_string_slice(start).start + else: + start_loc = 0 + + if end: + end_loc = self._get_string_slice(end).stop + else: + end_loc = len(self) + + return start_loc, end_loc + except KeyError: + pass + + else: + # can't use a slice indexer because we are not sorted! 
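+ # (illustrative sketch: for such an unsorted index each bound below is + # resolved through _get_string_slice into an array of integer positions, + # so slice_locs returns two position arrays rather than a single + # start/stop pair)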
+ # so create an indexer directly + try: + if start: + start_loc = self._get_string_slice(start, + use_rhs=False) + else: + start_loc = np.arange(len(self)) + + if end: + end_loc = self._get_string_slice(end, use_lhs=False) + else: + end_loc = np.arange(len(self)) + + return start_loc, end_loc + except KeyError: + pass + + if isinstance(start, time) or isinstance(end, time): + raise KeyError('Cannot use slice_locs with time slice keys') + + return Index.slice_locs(self, start, end) + + def __getitem__(self, key): + """Override numpy.ndarray's __getitem__ method to work as desired""" + arr_idx = self.view(np.ndarray) + if np.isscalar(key): + val = arr_idx[key] + return Timestamp(val, offset=self.offset, tz=self.tz) + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + if key.all(): + key = slice(0,None,None) + else: + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + + new_offset = None + if isinstance(key, slice): + if self.offset is not None and key.step is not None: + new_offset = key.step * self.offset + else: + new_offset = self.offset + + result = arr_idx[key] + if result.ndim > 1: + return result + + return self._simple_new(result, self.name, new_offset, self.tz) + + # Try to run function on index first, and then on elements of index + # Especially important for group-by functionality + def map(self, f): + try: + result = f(self) + if not isinstance(result, np.ndarray): + raise TypeError + return result + except Exception: + return _algos.arrmap_object(self.asobject, f) + + # alias to offset + @property + def freq(self): + """ return the frequency object if its set, otherwise None """ + return self.offset + + @cache_readonly + def inferred_freq(self): + try: + return infer_freq(self) + except ValueError: + return None + + @property + def freqstr(self): + """ return the frequency object as a string if its set, otherwise None """ + if self.freq is None: + return None + return self.offset.freqstr + + _year = _field_accessor('year', 'Y') + _month = _field_accessor('month', 'M', "The month as January=1, December=12") + _day = _field_accessor('day', 'D') + _hour = _field_accessor('hour', 'h') + _minute = _field_accessor('minute', 'm') + _second = _field_accessor('second', 's') + _microsecond = _field_accessor('microsecond', 'us') + _nanosecond = _field_accessor('nanosecond', 'ns') + _weekofyear = _field_accessor('weekofyear', 'woy') + _week = _weekofyear + _dayofweek = _field_accessor('dayofweek', 'dow', + "The day of the week with Monday=0, Sunday=6") + _weekday = _dayofweek + _dayofyear = _field_accessor('dayofyear', 'doy') + _quarter = _field_accessor('quarter', 'q') + _is_month_start = _field_accessor('is_month_start', 'is_month_start') + _is_month_end = _field_accessor('is_month_end', 'is_month_end') + _is_quarter_start = _field_accessor('is_quarter_start', 'is_quarter_start') + _is_quarter_end = _field_accessor('is_quarter_end', 'is_quarter_end') + _is_year_start = _field_accessor('is_year_start', 'is_year_start') + _is_year_end = _field_accessor('is_year_end', 'is_year_end') + + @property + def _time(self): + """ + Returns numpy array of datetime.time. The time part of the Timestamps. + """ + # can't call self.map() which tries to treat func as ufunc + # and causes recursion warnings on python 2.6 + return _algos.arrmap_object(self.asobject, lambda x: x.time()) + + @property + def _date(self): + """ + Returns numpy array of datetime.date. The date part of the Timestamps. 
+ """ + return _algos.arrmap_object(self.asobject, lambda x: x.date()) + + + def normalize(self): + """ + Return DatetimeIndex with times to midnight. Length is unaltered + + Returns + ------- + normalized : DatetimeIndex + """ + new_values = tslib.date_normalize(self.asi8, self.tz) + return DatetimeIndex(new_values, freq='infer', name=self.name, + tz=self.tz) + + def __iter__(self): + return iter(self.asobject) + + def searchsorted(self, key, side='left'): + if isinstance(key, np.ndarray): + key = np.array(key, dtype=_NS_DTYPE, copy=False) + else: + key = _to_m8(key, tz=self.tz) + + return self.values.searchsorted(key, side=side) + + def is_type_compatible(self, typ): + return typ == self.inferred_type or typ == 'datetime' + + def argmin(self): + # hack to workaround argmin failure + try: + return self.values.argmin() + except Exception: # pragma: no cover + return self.asi8.argmin() + + @property + def inferred_type(self): + # b/c datetime is represented as microseconds since the epoch, make + # sure we can't have ambiguous indexing + return 'datetime64' + + @property + def dtype(self): + return _NS_DTYPE + + @property + def is_all_dates(self): + return True + + @cache_readonly + def is_normalized(self): + """ + Returns True if all of the dates are at midnight ("no time") + """ + return tslib.dates_normalized(self.asi8, self.tz) + + @cache_readonly + def resolution(self): + """ + Returns day, hour, minute, second, or microsecond + """ + reso = self._resolution + return get_reso_string(reso) + + @cache_readonly + def _resolution(self): + return tslib.resolution(self.asi8, self.tz) + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + if (not hasattr(other, 'inferred_type') or + other.inferred_type != 'datetime64'): + if self.offset is not None: + return False + try: + other = DatetimeIndex(other) + except: + return False + + if self.tz is not None: + if other.tz is None: + return False + same_zone = tslib.get_timezone( + self.tz) == tslib.get_timezone(other.tz) + else: + if other.tz is not None: + return False + same_zone = True + + return same_zone and np.array_equal(self.asi8, other.asi8) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + if not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. 
+ + Returns + ------- + new_index : Index + """ + + freq = None + if isinstance(item, datetime): + zone = tslib.get_timezone(self.tz) + izone = tslib.get_timezone(getattr(item, 'tzinfo', None)) + if zone != izone: + raise ValueError('Passed item and index have different timezone') + # check freq can be preserved on edge cases + if self.freq is not None: + if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + item = _to_m8(item, tz=self.tz) + try: + new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], + self[loc:].asi8)) + if self.tz is not None: + new_dates = _tz_convert_with_transitions(new_dates,'UTC',self.tz) + return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) + + except (AttributeError, TypeError): + + # fall back to object index + if isinstance(item,compat.string_types): + return self.asobject.insert(loc, item) + raise TypeError("cannot insert DatetimeIndex with incompatible label") + + def delete(self, loc): + """ + Make new DatetimeIndex with passed location(s) deleted + + Parameters + ---------- + loc: int, slice or array of ints + Indicate which sub-arrays to remove. + + Returns + ------- + new_index : DatetimeIndex + """ + new_dates = np.delete(self.asi8, loc) + + freq = None + if lib.is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if com.is_list_like(loc): + loc = lib.maybe_indices_to_slice(com._ensure_int64(np.array(loc))) + if isinstance(loc, slice) and loc.step in (1, None): + if (loc.start in (0, None) or loc.stop in (len(self), None)): + freq = self.freq + + if self.tz is not None: + new_dates = _tz_convert_with_transitions(new_dates, 'UTC', self.tz) + return DatetimeIndex(new_dates, name=self.name, freq=freq, tz=self.tz) + + def _view_like(self, ndarray): + result = ndarray.view(type(self)) + result.offset = self.offset + result.tz = self.tz + result.name = self.name + return result + + def tz_convert(self, tz): + """ + Convert DatetimeIndex from one time zone to another (using pytz/dateutil) + + Returns + ------- + converted : DatetimeIndex + """ + tz = tools._maybe_get_tz(tz) + + if self.tz is None: + # tz naive, use tz_localize + raise TypeError('Cannot convert tz-naive timestamps, use ' + 'tz_localize to localize') + + # No conversion since timestamps are all UTC to begin with + return self._simple_new(self.values, self.name, self.offset, tz) + + def tz_localize(self, tz, infer_dst=False): + """ + Localize tz-naive DatetimeIndex to given time zone (using pytz/dateutil) + + Parameters + ---------- + tz : string or pytz.timezone or dateutil.tz.tzfile + Time zone for time. Corresponding timestamps would be converted to + time zone of the TimeSeries + infer_dst : boolean, default False + Attempt to infer fall dst-transition hours based on order + + Returns + ------- + localized : DatetimeIndex + """ + if self.tz is not None: + raise TypeError("Already tz-aware, use tz_convert to convert.") + tz = tools._maybe_get_tz(tz) + + # Convert to UTC + new_dates = tslib.tz_localize_to_utc(self.asi8, tz, infer_dst=infer_dst) + new_dates = new_dates.view(_NS_DTYPE) + + return self._simple_new(new_dates, self.name, self.offset, tz) + + def indexer_at_time(self, time, asof=False): + """ + Select values at particular time of day (e.g. 9:30AM) + + Parameters + ---------- + time : datetime.time or string + tz : string or pytz.timezone or dateutil.tz.tzfile + Time zone for time.
Corresponding timestamps would be converted to + time zone of the TimeSeries + + Returns + ------- + values_at_time : TimeSeries + """ + from dateutil.parser import parse + + if asof: + raise NotImplementedError + + if isinstance(time, compat.string_types): + time = parse(time).time() + + if time.tzinfo: + # TODO + raise NotImplementedError + + time_micros = self._get_time_micros() + micros = _time_to_micros(time) + return (micros == time_micros).nonzero()[0] + + def indexer_between_time(self, start_time, end_time, include_start=True, + include_end=True): + """ + Select values between particular times of day (e.g., 9:00-9:30AM) + + Parameters + ---------- + start_time : datetime.time or string + end_time : datetime.time or string + include_start : boolean, default True + include_end : boolean, default True + tz : string or pytz.timezone or dateutil.tz.tzfile, default None + + Returns + ------- + values_between_time : TimeSeries + """ + from dateutil.parser import parse + + if isinstance(start_time, compat.string_types): + start_time = parse(start_time).time() + + if isinstance(end_time, compat.string_types): + end_time = parse(end_time).time() + + if start_time.tzinfo or end_time.tzinfo: + raise NotImplementedError + + time_micros = self._get_time_micros() + start_micros = _time_to_micros(start_time) + end_micros = _time_to_micros(end_time) + + if include_start and include_end: + lop = rop = operator.le + elif include_start: + lop = operator.le + rop = operator.lt + elif include_end: + lop = operator.lt + rop = operator.le + else: + lop = rop = operator.lt + + if start_time <= end_time: + join_op = operator.and_ + else: + join_op = operator.or_ + + mask = join_op(lop(start_micros, time_micros), + rop(time_micros, end_micros)) + + return mask.nonzero()[0] + + def to_julian_date(self): + """ + Convert DatetimeIndex to Float64Index of Julian Dates. + 0 Julian date is noon January 1, 4713 BC. 
+ http://en.wikipedia.org/wiki/Julian_day + """ + + # http://mysite.verizon.net/aesir_research/date/jdalg2.htm + year = self.year + month = self.month + day = self.day + testarr = month < 3 + year[testarr] -= 1 + month[testarr] += 12 + return Float64Index(day + + np.fix((153*month - 457)/5) + + 365*year + + np.floor(year / 4) - + np.floor(year / 100) + + np.floor(year / 400) + + 1721118.5 + + (self.hour + + self.minute/60.0 + + self.second/3600.0 + + self.microsecond/3600.0/1e+6 + + self.nanosecond/3600.0/1e+9 + )/24.0) + + +def _generate_regular_range(start, end, periods, offset): + if isinstance(offset, Tick): + stride = offset.nanos + if periods is None: + b = Timestamp(start).value + e = Timestamp(end).value + e += stride - e % stride + # end.tz == start.tz by this point due to _generate implementation + tz = start.tz + elif start is not None: + b = Timestamp(start).value + e = b + periods * stride + tz = start.tz + elif end is not None: + e = Timestamp(end).value + stride + b = e - periods * stride + tz = end.tz + else: + raise NotImplementedError + + data = np.arange(b, e, stride, dtype=np.int64) + data = DatetimeIndex._simple_new(data, None, tz=tz) + else: + if isinstance(start, Timestamp): + start = start.to_pydatetime() + + if isinstance(end, Timestamp): + end = end.to_pydatetime() + + xdr = generate_range(start=start, end=end, + periods=periods, offset=offset) + + dates = list(xdr) + # utc = len(dates) > 0 and dates[0].tzinfo is not None + data = tools.to_datetime(dates) + + return data + + +def date_range(start=None, end=None, periods=None, freq='D', tz=None, + normalize=False, name=None, closed=None): + """ + Return a fixed frequency datetime index, with day (calendar) as the default + frequency + + Parameters + ---------- + start : string or datetime-like, default None + Left bound for generating dates + end : string or datetime-like, default None + Right bound for generating dates + periods : integer or None, default None + If None, must specify start and end + freq : string or DateOffset, default 'D' (calendar daily) + Frequency strings can have multiples, e.g. '5H' + tz : string or None + Time zone name for returning localized DatetimeIndex, for example + Asia/Hong_Kong + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + name : str, default None + Name of the resulting index + closed : string or None, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None) + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : DatetimeIndex + """ + return DatetimeIndex(start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize, name=name, + closed=closed) + + +def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, + normalize=True, name=None, closed=None): + """ + Return a fixed frequency datetime index, with business day as the default + frequency + + Parameters + ---------- + start : string or datetime-like, default None + Left bound for generating dates + end : string or datetime-like, default None + Right bound for generating dates + periods : integer or None, default None + If None, must specify start and end + freq : string or DateOffset, default 'B' (business daily) + Frequency strings can have multiples, e.g. 
'5H' + tz : string or None + Time zone name for returning localized DatetimeIndex, for example + Asia/Shanghai + normalize : bool, default True + Normalize start/end dates to midnight before generating date range + name : str, default None + Name for the resulting index + closed : string or None, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None) + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : DatetimeIndex + """ + + return DatetimeIndex(start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize, name=name, + closed=closed) + + +def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, + normalize=True, name=None, closed=None, **kwargs): + """ + **EXPERIMENTAL** Return a fixed frequency datetime index, with + CustomBusinessDay as the default frequency + + .. warning:: EXPERIMENTAL + + The CustomBusinessDay class is not officially supported and the API is + likely to change in future versions. Use this at your own risk. + + Parameters + ---------- + start : string or datetime-like, default None + Left bound for generating dates + end : string or datetime-like, default None + Right bound for generating dates + periods : integer or None, default None + If None, must specify start and end + freq : string or DateOffset, default 'C' (CustomBusinessDay) + Frequency strings can have multiples, e.g. '5H' + tz : string or None + Time zone name for returning localized DatetimeIndex, for example + Asia/Shanghai + normalize : bool, default True + Normalize start/end dates to midnight before generating date range + name : str, default None + Name for the resulting index + weekmask : str, Default 'Mon Tue Wed Thu Fri' + weekmask of valid business days, passed to ``numpy.busdaycalendar`` + holidays : list + list/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar`` + closed : string or None, default None + Make the interval closed with respect to the given frequency to + the 'left', 'right', or both sides (None) + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : DatetimeIndex + """ + + if freq=='C': + holidays = kwargs.pop('holidays', []) + weekmask = kwargs.pop('weekmask', 'Mon Tue Wed Thu Fri') + freq = CDay(holidays=holidays, weekmask=weekmask) + return DatetimeIndex(start=start, end=end, periods=periods, freq=freq, + tz=tz, normalize=normalize, name=name, + closed=closed, **kwargs) + + +def _to_m8(key, tz=None): + ''' + Timestamp-like => dt64 + ''' + if not isinstance(key, Timestamp): + # this also converts strings + key = Timestamp(key, tz=tz) + + return np.int64(tslib.pydt_to_i8(key)).view(_NS_DTYPE) + + +def _str_to_dt_array(arr, offset=None, dayfirst=None, yearfirst=None): + def parser(x): + result = parse_time_string(x, offset, dayfirst=dayfirst, + yearfirst=yearfirst) + return result[0] + + arr = np.asarray(arr, dtype=object) + data = _algos.arrmap_object(arr, parser) + return tools.to_datetime(data) + + +_CACHE_START = Timestamp(datetime(1950, 1, 1)) +_CACHE_END = Timestamp(datetime(2030, 1, 1)) + +_daterange_cache = {} + + +def _naive_in_cache_range(start, end): + if start is None or end is None: + return False + else: + if start.tzinfo is not None or end.tzinfo is not None: + return False + return _in_range(start, end, _CACHE_START, _CACHE_END) + + +def _in_range(start, end, rng_start, rng_end): + return start > rng_start and end < rng_end + +def
_use_cached_range(offset, _normalized, start, end): + return (offset._should_cache() and + not (offset._normalize_cache and not _normalized) and + _naive_in_cache_range(start, end)) + +def _time_to_micros(time): + seconds = time.hour * 60 * 60 + 60 * time.minute + time.second + return 1000000 * seconds + time.microsecond + + +def _process_concat_data(to_concat, name): + klass = Index + kwargs = {} + concat = np.concatenate + + all_dti = True + need_utc_convert = False + has_naive = False + tz = None + + for x in to_concat: + if not isinstance(x, DatetimeIndex): + all_dti = False + else: + if tz is None: + tz = x.tz + + if x.tz is None: + has_naive = True + + if x.tz != tz: + need_utc_convert = True + tz = 'UTC' + + if all_dti: + need_obj_convert = False + if has_naive and tz is not None: + need_obj_convert = True + + if need_obj_convert: + to_concat = [x.asobject.values for x in to_concat] + + else: + if need_utc_convert: + to_concat = [x.tz_convert('UTC').values for x in to_concat] + else: + to_concat = [x.values for x in to_concat] + + # well, technically not a "class" anymore...oh well + klass = DatetimeIndex._simple_new + kwargs = {'tz': tz} + concat = com._concat_compat + else: + for i, x in enumerate(to_concat): + if isinstance(x, DatetimeIndex): + to_concat[i] = x.asobject.values + elif isinstance(x, Index): + to_concat[i] = x.values + + factory_func = lambda x: klass(concat(x), name=name, **kwargs) + return to_concat, factory_func diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py new file mode 100644 index 00000000..104e088e --- /dev/null +++ b/pandas/tseries/interval.py @@ -0,0 +1,37 @@ +import numpy as np + +from pandas.core.index import Index + + +class Interval(object): + """ + Represents an interval of time defined by two timestamps + """ + + def __init__(self, start, end): + self.start = start + self.end = end + + +class PeriodInterval(object): + """ + Represents an interval of time defined by two Period objects (time ordinals) + """ + + def __init__(self, start, end): + self.start = start + self.end = end + + +class IntervalIndex(Index): + """ + + """ + def __new__(self, starts, ends): + pass + + def dtype(self): + return self.values.dtype + +if __name__ == '__main__': + pass diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py new file mode 100644 index 00000000..57181b43 --- /dev/null +++ b/pandas/tseries/offsets.py @@ -0,0 +1,2244 @@ +from datetime import date, datetime, timedelta +from pandas.compat import range +from pandas import compat +import numpy as np + +from pandas.tseries.tools import to_datetime + +# import after tools, dateutil check +from dateutil.relativedelta import relativedelta, weekday +from dateutil.easter import easter +import pandas.tslib as tslib +from pandas.tslib import Timestamp, OutOfBoundsDatetime + +from pandas import _np_version_under1p7 + +import functools + +__all__ = ['Day', 'BusinessDay', 'BDay', 'CustomBusinessDay', 'CDay', + 'CBMonthEnd','CBMonthBegin', + 'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd', + 'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd', + 'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd', + 'LastWeekOfMonth', 'FY5253Quarter', 'FY5253', + 'Week', 'WeekOfMonth', 'Easter', + 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano'] + +# convert to/from datetime/timestamp to allow invalid Timestamp ranges to pass thru +def as_timestamp(obj): + try: + if isinstance(obj, Timestamp): + return obj + return Timestamp(obj) + except (OutOfBoundsDatetime): + pass + return obj + +def 
as_datetime(obj): + f = getattr(obj,'to_pydatetime',None) + if f is not None: + obj = f() + return obj + + def apply_wraps(func): + @functools.wraps(func) + def wrapper(self, other): + if other is tslib.NaT: + return tslib.NaT + if type(other) == date: + other = datetime(other.year, other.month, other.day) + if isinstance(other, (np.datetime64, datetime)): + other = as_timestamp(other) + + tz = getattr(other, 'tzinfo', None) + result = func(self, other) + + if self.normalize: + result = tslib.normalize_date(result) + + if isinstance(other, Timestamp) and not isinstance(result, Timestamp): + result = as_timestamp(result) + + if tz is not None and result.tzinfo is None: + result = result.tz_localize(tz) + return result + return wrapper + + +def _is_normalized(dt): + if (dt.hour != 0 or dt.minute != 0 or dt.second != 0 + or dt.microsecond != 0 or getattr(dt, 'nanosecond', 0) != 0): + return False + return True + +#---------------------------------------------------------------------- +# DateOffset + + +class ApplyTypeError(TypeError): + # sentinel class for catching the apply error to return NotImplemented + pass + + +class CacheableOffset(object): + _cacheable = True + + +class DateOffset(object): + """ + Standard kind of date increment used for a date range. + + Works exactly like relativedelta in terms of the keyword args you + pass in, use of the keyword n is discouraged-- you would be better + off specifying n in the keywords you use, but regardless it is + there for you. n is needed for DateOffset subclasses. + + DateOffsets work as follows. Each offset specifies a set of dates + that conform to the DateOffset. For example, Bday defines this + set to be the set of dates that are weekdays (M-F). To test if a + date is in the set of a DateOffset dateOffset, we can use the + onOffset method: dateOffset.onOffset(date). + + If a date does not fall on a valid date, the rollback and rollforward + methods can be used to roll the date to the nearest valid date + before/after the date. + + DateOffsets can be created to move dates forward a given number of + valid dates. For example, Bday(2) can be added to a date to move + it two business days forward. If the date does not start on a + valid date, first it is moved to a valid date. Thus the pseudo code + is: + + def __add__(date): + date = rollback(date) # does nothing if date is valid + return date + <n number of periods> + + + When a date offset is created for a negative number of periods, + the date is first rolled forward. The pseudo code is: + + def __add__(date): + date = rollforward(date) # does nothing if date is valid + return date + <n number of periods> + + + Zero presents a problem. Should it roll forward or back? We + arbitrarily have it rollforward: + + date + BDay(0) == BDay.rollforward(date) + + Since 0 is a bit weird, we suggest avoiding its use.
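+ + For illustration only (a sketch of the rollback/rollforward behaviour + described above; any Saturday serves as the example date): + + BDay().rollforward(saturday) # -> the following Monday + BDay().rollback(saturday) # -> the preceding Friday + saturday + BDay(1) # -> the following Monday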
+ """ + _cacheable = False + _normalize_cache = True + + def __init__(self, n=1, normalize=False, **kwds): + self.n = int(n) + self.normalize = normalize + self.kwds = kwds + if len(kwds) > 0: + self._offset = relativedelta(**kwds) + else: + self._offset = timedelta(1) + + @apply_wraps + def apply(self, other): + other = as_datetime(other) + if len(self.kwds) > 0: + if self.n > 0: + for i in range(self.n): + other = other + self._offset + else: + for i in range(-self.n): + other = other - self._offset + return as_timestamp(other) + else: + return as_timestamp(other + timedelta(self.n)) + + def isAnchored(self): + return (self.n == 1) + + def copy(self): + return self.__class__(self.n, normalize=self.normalize, **self.kwds) + + def _should_cache(self): + return self.isAnchored() and self._cacheable + + def _params(self): + attrs = [(k, v) for k, v in compat.iteritems(vars(self)) + if (k not in ['kwds', 'name', 'normalize', + 'busdaycalendar']) and (k[0] != '_')] + attrs.extend(list(self.kwds.items())) + attrs = sorted(set(attrs)) + + params = tuple([str(self.__class__)] + attrs) + return params + + def __repr__(self): + if hasattr(self, '_named'): + return self._named + className = getattr(self, '_outputName', type(self).__name__) + exclude = set(['n', 'inc', 'normalize']) + attrs = [] + for attr in sorted(self.__dict__): + if ((attr == 'kwds' and len(self.kwds) == 0) + or attr.startswith('_')): + continue + elif attr == 'kwds': + kwds_new = {} + for key in self.kwds: + if not hasattr(self, key): + kwds_new[key] = self.kwds[key] + if len(kwds_new) > 0: + attrs.append('='.join((attr, repr(kwds_new)))) + else: + if attr not in exclude: + attrs.append('='.join((attr, repr(getattr(self, attr))))) + + if abs(self.n) != 1: + plural = 's' + else: + plural = '' + + n_str = "" + if self.n != 1: + n_str = "%s * " % self.n + + out = '<%s' % n_str + className + plural + if attrs: + out += ': ' + ', '.join(attrs) + out += '>' + return out + + @property + def name(self): + if hasattr(self, '_named'): + return self._named + else: + return self.rule_code + + def __eq__(self, other): + if other is None: + return False + + if isinstance(other, compat.string_types): + from pandas.tseries.frequencies import to_offset + + other = to_offset(other) + + if not isinstance(other, DateOffset): + return False + + return self._params() == other._params() + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(self._params()) + + def __call__(self, other): + return self.apply(other) + + def __add__(self, other): + try: + return self.apply(other) + except ApplyTypeError: + return NotImplemented + + def __radd__(self, other): + return self.__add__(other) + + def __sub__(self, other): + if isinstance(other, datetime): + raise TypeError('Cannot subtract datetime from offset.') + elif type(other) == type(self): + return self.__class__(self.n - other.n, normalize=self.normalize, **self.kwds) + else: # pragma: no cover + return NotImplemented + + def __rsub__(self, other): + return self.__class__(-self.n, normalize=self.normalize, **self.kwds) + other + + def __mul__(self, someInt): + return self.__class__(n=someInt * self.n, normalize=self.normalize, **self.kwds) + + def __rmul__(self, someInt): + return self.__mul__(someInt) + + def __neg__(self): + return self.__class__(-self.n, normalize=self.normalize, **self.kwds) + + @apply_wraps + def rollback(self, dt): + """Roll provided date backward to next offset only if not on offset""" + if not self.onOffset(dt): + dt = dt - 
self.__class__(1, normalize=self.normalize, **self.kwds) + return dt + + @apply_wraps + def rollforward(self, dt): + """Roll provided date forward to next offset only if not on offset""" + if not self.onOffset(dt): + dt = dt + self.__class__(1, normalize=self.normalize, **self.kwds) + return dt + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + # XXX, see #1395 + if type(self) == DateOffset or isinstance(self, Tick): + return True + + # Default (slow) method for determining if some date is a member of the + # date range generated by this offset. Subclasses may have this + # re-implemented in a nicer way. + a = dt + b = ((dt + self) - self) + return a == b + + # way to get around weirdness with rule_code + @property + def _prefix(self): + raise NotImplementedError('Prefix not defined') + + @property + def rule_code(self): + return self._prefix + + @property + def freqstr(self): + try: + code = self.rule_code + except NotImplementedError: + return repr(self) + + if self.n != 1: + fstr = '%d%s' % (self.n, code) + else: + fstr = code + + return fstr + + +class SingleConstructorOffset(DateOffset): + @classmethod + def _from_name(cls, suffix=None): + # default _from_name calls cls with no args + if suffix: + raise ValueError("Bad freq suffix %s" % suffix) + return cls() + + +class BusinessMixin(object): + """ mixin to business types to provide related functions """ + + # TODO: Combine this with DateOffset by defining a whitelisted set of + # attributes on each object rather than the existing behavior of iterating + # over internal ``__dict__`` + def __repr__(self): + if hasattr(self, '_named'): + return self._named + className = getattr(self, '_outputName', self.__class__.__name__) + attrs = [] + + if self.offset: + attrs = ['offset=%s' % repr(self.offset)] + + if abs(self.n) != 1: + plural = 's' + else: + plural = '' + + n_str = "" + if self.n != 1: + n_str = "%s * " % self.n + + out = '<%s' % n_str + className + plural + if attrs: + out += ': ' + ', '.join(attrs) + out += '>' + return out + +class BusinessDay(BusinessMixin, SingleConstructorOffset): + """ + DateOffset subclass representing possibly n business days + """ + _prefix = 'B' + + def __init__(self, n=1, normalize=False, **kwds): + self.n = int(n) + self.normalize = normalize + self.kwds = kwds + self.offset = kwds.get('offset', timedelta(0)) + + @property + def freqstr(self): + try: + code = self.rule_code + except NotImplementedError: + return repr(self) + + if self.n != 1: + fstr = '%d%s' % (self.n, code) + else: + fstr = code + + if self.offset: + fstr += self._offset_str() + + return fstr + + def _offset_str(self): + def get_str(td): + off_str = '' + if td.days > 0: + off_str += str(td.days) + 'D' + if td.seconds > 0: + s = td.seconds + hrs = int(s / 3600) + if hrs != 0: + off_str += str(hrs) + 'H' + s -= hrs * 3600 + mts = int(s / 60) + if mts != 0: + off_str += str(mts) + 'Min' + s -= mts * 60 + if s != 0: + off_str += str(s) + 's' + if td.microseconds > 0: + off_str += str(td.microseconds) + 'us' + return off_str + + if isinstance(self.offset, timedelta): + zero = timedelta(0, 0, 0) + if self.offset >= zero: + off_str = '+' + get_str(self.offset) + else: + off_str = '-' + get_str(-self.offset) + return off_str + else: + return '+' + repr(self.offset) + + def isAnchored(self): + return (self.n == 1) + + @apply_wraps + def apply(self, other): + if isinstance(other, datetime): + n = self.n + + if n == 0 and other.weekday() > 4: + n = 1 + + result = other + + # avoid slowness below + if 
abs(n) > 5: + k = n // 5 + result = result + timedelta(7 * k) + if n < 0 and result.weekday() > 4: + n += 1 + n -= 5 * k + if n == 0 and result.weekday() > 4: + n -= 1 + + while n != 0: + k = n // abs(n) + result = result + timedelta(k) + if result.weekday() < 5: + n -= k + + if self.offset: + result = result + self.offset + + return as_timestamp(result) + + elif isinstance(other, (timedelta, Tick)): + return BDay(self.n, offset=self.offset + other, + normalize=self.normalize) + else: + raise ApplyTypeError('Only know how to combine business day with ' + 'datetime or timedelta.') + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.weekday() < 5 + + +class CustomBusinessDay(BusinessDay): + """ + **EXPERIMENTAL** DateOffset subclass representing possibly n business days + excluding holidays + + .. warning:: EXPERIMENTAL + + This class is not officially supported and the API is likely to change + in future versions. Use this at your own risk. + + Parameters + ---------- + n : int, default 1 + offset : timedelta, default timedelta(0) + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + weekmask : str, Default 'Mon Tue Wed Thu Fri' + weekmask of valid business days, passed to ``numpy.busdaycalendar`` + holidays : list + list/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar`` + calendar : HolidayCalendar instance + instance of AbstractHolidayCalendar that provide the list of holidays + """ + + _cacheable = False + _prefix = 'C' + + def __init__(self, n=1, normalize=False, **kwds): + self.n = int(n) + self.normalize = normalize + self.kwds = kwds + self.offset = kwds.get('offset', timedelta(0)) + self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') + + if 'calendar' in kwds: + holidays = kwds['calendar'].holidays() + else: + holidays = kwds.get('holidays', []) + holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in + holidays] + self.holidays = tuple(sorted(holidays)) + self.kwds['holidays'] = self.holidays + + self._set_busdaycalendar() + + def _set_busdaycalendar(self): + if self.holidays: + kwargs = {'weekmask':self.weekmask,'holidays':self.holidays} + else: + kwargs = {'weekmask':self.weekmask} + try: + self.busdaycalendar = np.busdaycalendar(**kwargs) + except: + # Check we have the required numpy version + from distutils.version import LooseVersion + + if LooseVersion(np.__version__) < '1.7.0': + raise NotImplementedError("CustomBusinessDay requires numpy >= " + "1.7.0. 
Current version: " + + np.__version__) + else: + raise + + def __getstate__(self): + """Return a pickleable state""" + state = self.__dict__.copy() + del state['busdaycalendar'] + return state + + def __setstate__(self, state): + """Reconstruct an instance from a pickled state""" + self.__dict__ = state + self._set_busdaycalendar() + + @apply_wraps + def apply(self, other): + if self.n <= 0: + roll = 'forward' + else: + roll = 'backward' + + # Distinguish input cases to enhance performance + if isinstance(other, datetime): + date_in = other + np_dt = np.datetime64(date_in.date()) + + np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll, + busdaycal=self.busdaycalendar) + + dt_date = np_incr_dt.astype(datetime) + result = datetime.combine(dt_date, date_in.time()) + + if self.offset: + result = result + self.offset + + return as_timestamp(result) + + elif isinstance(other, (timedelta, Tick)): + return BDay(self.n, offset=self.offset + other, + normalize=self.normalize) + else: + raise ApplyTypeError('Only know how to combine trading day with ' + 'datetime, datetime64 or timedelta.') + + @staticmethod + def _to_dt64(dt, dtype='datetime64'): + # Currently + # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') + # numpy.datetime64('2013-05-01T02:00:00.000000+0200') + # Thus astype is needed to cast datetime to datetime64[D] + + if getattr(dt, 'tzinfo', None) is not None: + i8 = tslib.pydt_to_i8(dt) + dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo) + dt = Timestamp(dt) + dt = np.datetime64(dt) + if dt.dtype.name != dtype: + dt = dt.astype(dtype) + return dt + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + day64 = self._to_dt64(dt,'datetime64[D]') + return np.is_busday(day64, busdaycal=self.busdaycalendar) + + +class MonthOffset(SingleConstructorOffset): + @property + def name(self): + if self.isAnchored: + return self.rule_code + else: + return "%s-%s" % (self.rule_code, _int_to_month[self.n]) + + +class MonthEnd(MonthOffset): + """DateOffset of one month end""" + + @apply_wraps + def apply(self, other): + n = self.n + _, days_in_month = tslib.monthrange(other.year, other.month) + if other.day != days_in_month: + other = as_datetime(other) + relativedelta(months=-1, day=31) + if n <= 0: + n = n + 1 + other = as_datetime(other) + relativedelta(months=n, day=31) + return as_timestamp(other) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + days_in_month = tslib.monthrange(dt.year, dt.month)[1] + return dt.day == days_in_month + + _prefix = 'M' + + +class MonthBegin(MonthOffset): + """DateOffset of one month at beginning""" + + @apply_wraps + def apply(self, other): + n = self.n + + if other.day > 1 and n <= 0: # then roll forward if n<=0 + n += 1 + + other = as_datetime(other) + relativedelta(months=n, day=1) + return as_timestamp(other) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.day == 1 + + _prefix = 'MS' + + +class BusinessMonthEnd(MonthOffset): + """DateOffset increments between business EOM dates""" + + def isAnchored(self): + return (self.n == 1) + + @apply_wraps + def apply(self, other): + + n = self.n + + wkday, days_in_month = tslib.monthrange(other.year, other.month) + lastBDay = days_in_month - max(((wkday + days_in_month - 1) + % 7) - 4, 0) + + if n > 0 and not other.day >= lastBDay: + n = n - 1 + elif n <= 0 and other.day > lastBDay: + n = n + 1 + other = as_datetime(other) + relativedelta(months=n, day=31) + + if 
other.weekday() > 4: + other = other - BDay() + return as_timestamp(other) + + _prefix = 'BM' + + +class BusinessMonthBegin(MonthOffset): + """DateOffset of one business month at beginning""" + + @apply_wraps + def apply(self, other): + n = self.n + + wkday, _ = tslib.monthrange(other.year, other.month) + first = _get_firstbday(wkday) + + if other.day > first and n <= 0: + # as if rolled forward already + n += 1 + elif other.day < first and n > 0: + other = as_datetime(other) + timedelta(days=first - other.day) + n -= 1 + + other = as_datetime(other) + relativedelta(months=n) + wkday, _ = tslib.monthrange(other.year, other.month) + first = _get_firstbday(wkday) + result = datetime(other.year, other.month, first, other.hour, other.minute, + other.second, other.microsecond) + return as_timestamp(result) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + first_weekday, _ = tslib.monthrange(dt.year, dt.month) + if first_weekday == 5: + return dt.day == 3 + elif first_weekday == 6: + return dt.day == 2 + else: + return dt.day == 1 + + _prefix = 'BMS' + + + +class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): + """ + **EXPERIMENTAL** DateOffset of one custom business month + + .. warning:: EXPERIMENTAL + + This class is not officially supported and the API is likely to change + in future versions. Use this at your own risk. + + Parameters + ---------- + n : int, default 1 + offset : timedelta, default timedelta(0) + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + weekmask : str, Default 'Mon Tue Wed Thu Fri' + weekmask of valid business days, passed to ``numpy.busdaycalendar`` + holidays : list + list/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar`` + """ + + _cacheable = False + _prefix = 'CBM' + def __init__(self, n=1, normalize=False, **kwds): + self.n = int(n) + self.normalize = normalize + self.kwds = kwds + self.offset = kwds.get('offset', timedelta(0)) + self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') + self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds) + self.m_offset = MonthEnd(normalize=normalize) + + @apply_wraps + def apply(self,other): + n = self.n + dt_in = other + # First move to month offset + cur_mend = self.m_offset.rollforward(dt_in) + # Find this custom month offset + cur_cmend = self.cbday.rollback(cur_mend) + + # handle zero case. arbitrarily rollforward + if n == 0 and dt_in != cur_cmend: + n += 1 + + if dt_in < cur_cmend and n >= 1: + n -= 1 + elif dt_in > cur_cmend and n <= -1: + n += 1 + + new = cur_mend + n * MonthEnd() + result = self.cbday.rollback(new) + return as_timestamp(result) + +class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): + """ + **EXPERIMENTAL** DateOffset of one custom business month + + .. warning:: EXPERIMENTAL + + This class is not officially supported and the API is likely to change + in future versions. Use this at your own risk. 
+ + Parameters + ---------- + n : int, default 1 + offset : timedelta, default timedelta(0) + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + weekmask : str, Default 'Mon Tue Wed Thu Fri' + weekmask of valid business days, passed to ``numpy.busdaycalendar`` + holidays : list + list/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar`` + """ + + _cacheable = False + _prefix = 'CBMS' + def __init__(self, n=1, normalize=False, **kwds): + self.n = int(n) + self.normalize = normalize + self.kwds = kwds + self.offset = kwds.get('offset', timedelta(0)) + self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') + self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds) + self.m_offset = MonthBegin(normalize=normalize) + + @apply_wraps + def apply(self,other): + n = self.n + dt_in = other + # First move to month offset + cur_mbegin = self.m_offset.rollback(dt_in) + # Find this custom month offset + cur_cmbegin = self.cbday.rollforward(cur_mbegin) + + # handle zero case. arbitrarily rollforward + if n == 0 and dt_in != cur_cmbegin: + n += 1 + + if dt_in > cur_cmbegin and n <= -1: + n += 1 + elif dt_in < cur_cmbegin and n >= 1: + n -= 1 + + new = cur_mbegin + n * MonthBegin() + result = self.cbday.rollforward(new) + return as_timestamp(result) + +class Week(DateOffset): + """ + Weekly offset + + Parameters + ---------- + weekday : int, default None + Always generate specific day of week. 0 for Monday + """ + + def __init__(self, n=1, normalize=False, **kwds): + self.n = n + self.normalize = normalize + self.weekday = kwds.get('weekday', None) + + if self.weekday is not None: + if self.weekday < 0 or self.weekday > 6: + raise ValueError('Day must be 0<=day<=6, got %d' % + self.weekday) + + self._inc = timedelta(weeks=1) + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.weekday is not None) + + @apply_wraps + def apply(self, other): + base = other + if self.weekday is None: + return as_timestamp(as_datetime(other) + self.n * self._inc) + + if self.n > 0: + k = self.n + otherDay = other.weekday() + if otherDay != self.weekday: + other = as_datetime(other) + timedelta((self.weekday - otherDay) % 7) + k = k - 1 + other = as_datetime(other) + for i in range(k): + other = other + self._inc + else: + k = self.n + otherDay = other.weekday() + if otherDay != self.weekday: + other = as_datetime(other) + timedelta((self.weekday - otherDay) % 7) + other = as_datetime(other) + for i in range(-k): + other = other - self._inc + + other = datetime(other.year, other.month, other.day, + base.hour, base.minute, base.second, base.microsecond) + return other + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.weekday() == self.weekday + + _prefix = 'W' + + @property + def rule_code(self): + suffix = '' + if self.weekday is not None: + suffix = '-%s' % (_int_to_weekday[self.weekday]) + return self._prefix + suffix + + @classmethod + def _from_name(cls, suffix=None): + if not suffix: + weekday = None + else: + weekday = _weekday_to_int[suffix] + return cls(weekday=weekday) + +class WeekDay(object): + MON = 0 + TUE = 1 + WED = 2 + THU = 3 + FRI = 4 + SAT = 5 + SUN = 6 + +_int_to_weekday = { + WeekDay.MON: 'MON', + WeekDay.TUE: 'TUE', + WeekDay.WED: 'WED', + WeekDay.THU: 'THU', + WeekDay.FRI: 'FRI', + WeekDay.SAT: 'SAT', + WeekDay.SUN: 'SUN' +} + +_weekday_to_int = dict((v, k) for k, v in _int_to_weekday.items()) + + +class 
WeekOfMonth(DateOffset): + """ + Describes monthly dates like "the Tuesday of the 2nd week of each month" + + Parameters + ---------- + n : int + week : {0, 1, 2, 3, ...} + 0 is 1st week of month, 1 2nd week, etc. + weekday : {0, 1, ..., 6} + 0: Mondays + 1: Tuesdays + 2: Wednesdays + 3: Thursdays + 4: Fridays + 5: Saturdays + 6: Sundays + """ + + def __init__(self, n=1, normalize=False, **kwds): + self.n = n + self.normalize = normalize + self.weekday = kwds['weekday'] + self.week = kwds['week'] + + if self.n == 0: + raise ValueError('N cannot be 0') + + if self.weekday < 0 or self.weekday > 6: + raise ValueError('Day must be 0<=day<=6, got %d' % + self.weekday) + if self.week < 0 or self.week > 3: + raise ValueError('Week must be 0<=day<=3, got %d' % + self.week) + + self.kwds = kwds + + @apply_wraps + def apply(self, other): + base = other + offsetOfMonth = self.getOffsetOfMonth(other) + + if offsetOfMonth > other: + if self.n > 0: + months = self.n - 1 + else: + months = self.n + elif offsetOfMonth == other: + months = self.n + else: + if self.n > 0: + months = self.n + else: + months = self.n + 1 + + other = self.getOffsetOfMonth(as_datetime(other) + relativedelta(months=months, day=1)) + other = datetime(other.year, other.month, other.day, base.hour, + base.minute, base.second, base.microsecond) + if getattr(other, 'tzinfo', None) is not None: + other = other.tzinfo.localize(other) + return other + + def getOffsetOfMonth(self, dt): + w = Week(weekday=self.weekday) + + d = datetime(dt.year, dt.month, 1) + if getattr(dt, 'tzinfo', None) is not None: + d = dt.tzinfo.localize(d) + + d = w.rollforward(d) + + for i in range(self.week): + d = w.apply(d) + + return d + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + d = datetime(dt.year, dt.month, dt.day) + if getattr(dt, 'tzinfo', None) is not None: + d = dt.tzinfo.localize(d) + return d == self.getOffsetOfMonth(dt) + + @property + def rule_code(self): + return '%s-%d%s' % (self._prefix, self.week + 1, + _int_to_weekday.get(self.weekday, '')) + + _prefix = 'WOM' + + @classmethod + def _from_name(cls, suffix=None): + if not suffix: + raise ValueError("Prefix %r requires a suffix." % (cls._prefix)) + # TODO: handle n here... + # only one digit weeks (1 --> week 0, 2 --> week 1, etc.) 
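+        # e.g. the rule code 'WOM-2TUE' arrives here with suffix '2TUE',
+        # which maps to week=1 (second week) and weekday=1 (Tuesday)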
+ week = int(suffix[0]) - 1 + weekday = _weekday_to_int[suffix[1:]] + return cls(week=week, weekday=weekday) + +class LastWeekOfMonth(DateOffset): + """ + Describes monthly dates in last week of month like "the last Tuesday of each month" + + Parameters + ---------- + n : int + weekday : {0, 1, ..., 6} + 0: Mondays + 1: Tuesdays + 2: Wednesdays + 3: Thursdays + 4: Fridays + 5: Saturdays + 6: Sundays + """ + def __init__(self, n=1, normalize=False, **kwds): + self.n = n + self.normalize = normalize + self.weekday = kwds['weekday'] + + if self.n == 0: + raise ValueError('N cannot be 0') + + if self.weekday < 0 or self.weekday > 6: + raise ValueError('Day must be 0<=day<=6, got %d' % + self.weekday) + + self.kwds = kwds + + @apply_wraps + def apply(self, other): + offsetOfMonth = self.getOffsetOfMonth(other) + + if offsetOfMonth > other: + if self.n > 0: + months = self.n - 1 + else: + months = self.n + elif offsetOfMonth == other: + months = self.n + else: + if self.n > 0: + months = self.n + else: + months = self.n + 1 + + return self.getOffsetOfMonth(as_datetime(other) + relativedelta(months=months, day=1)) + + def getOffsetOfMonth(self, dt): + m = MonthEnd() + d = datetime(dt.year, dt.month, 1, dt.hour, dt.minute, dt.second, dt.microsecond) + if getattr(dt, 'tzinfo', None) is not None: + d = dt.tzinfo.localize(d) + + eom = m.rollforward(d) + + w = Week(weekday=self.weekday) + + return w.rollback(eom) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt == self.getOffsetOfMonth(dt) + + @property + def rule_code(self): + return '%s-%s' % (self._prefix, _int_to_weekday.get(self.weekday, '')) + + _prefix = 'LWOM' + + @classmethod + def _from_name(cls, suffix=None): + if not suffix: + raise ValueError("Prefix %r requires a suffix." % (cls._prefix)) + # TODO: handle n here... + weekday = _weekday_to_int[suffix] + return cls(weekday=weekday) + + +class QuarterOffset(DateOffset): + """Quarter representation - doesn't call super""" + + #: default month for __init__ + _default_startingMonth = None + #: default month in _from_name + _from_name_startingMonth = None + + # TODO: Consider combining QuarterOffset and YearOffset __init__ at some + # point + def __init__(self, n=1, normalize=False, **kwds): + self.n = n + self.normalize = normalize + self.startingMonth = kwds.get('startingMonth', + self._default_startingMonth) + + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.startingMonth is not None) + + @classmethod + def _from_name(cls, suffix=None): + kwargs = {} + if suffix: + kwargs['startingMonth'] = _month_to_int[suffix] + else: + if cls._from_name_startingMonth is not None: + kwargs['startingMonth'] = cls._from_name_startingMonth + return cls(**kwargs) + + @property + def rule_code(self): + return '%s-%s' % (self._prefix, _int_to_month[self.startingMonth]) + + +class BQuarterEnd(QuarterOffset): + """DateOffset increments between business Quarter dates + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... 
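+
+    For example, Timestamp('2007-05-15') + BQuarterEnd(startingMonth=3) falls
+    on 2007-06-29: the quarter ends on 6/30/2007, a Saturday, so the result is
+    rolled back to the last business day of that month.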
+ """ + _outputName = 'BusinessQuarterEnd' + _default_startingMonth = 3 + # 'BQ' + _from_name_startingMonth = 12 + _prefix = 'BQ' + + @apply_wraps + def apply(self, other): + n = self.n + base = other + other = datetime(other.year, other.month, other.day, + other.hour, other.minute, other.second, + other.microsecond) + + wkday, days_in_month = tslib.monthrange(other.year, other.month) + lastBDay = days_in_month - max(((wkday + days_in_month - 1) + % 7) - 4, 0) + + monthsToGo = 3 - ((other.month - self.startingMonth) % 3) + if monthsToGo == 3: + monthsToGo = 0 + + if n > 0 and not (other.day >= lastBDay and monthsToGo == 0): + n = n - 1 + elif n <= 0 and other.day > lastBDay and monthsToGo == 0: + n = n + 1 + + other = as_datetime(other) + relativedelta(months=monthsToGo + 3 * n, day=31) + if getattr(base, 'tzinfo', None) is not None: + other = base.tzinfo.localize(other) + if other.weekday() > 4: + other = other - BDay() + + return as_timestamp(other) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + modMonth = (dt.month - self.startingMonth) % 3 + return BMonthEnd().onOffset(dt) and modMonth == 0 + + +_int_to_month = { + 1: 'JAN', + 2: 'FEB', + 3: 'MAR', + 4: 'APR', + 5: 'MAY', + 6: 'JUN', + 7: 'JUL', + 8: 'AUG', + 9: 'SEP', + 10: 'OCT', + 11: 'NOV', + 12: 'DEC' +} + +_month_to_int = dict((v, k) for k, v in _int_to_month.items()) + + +# TODO: This is basically the same as BQuarterEnd +class BQuarterBegin(QuarterOffset): + _outputName = "BusinessQuarterBegin" + # I suspect this is wrong for *all* of them. + _default_startingMonth = 3 + _from_name_startingMonth = 1 + _prefix = 'BQS' + + @apply_wraps + def apply(self, other): + n = self.n + other = as_datetime(other) + + wkday, _ = tslib.monthrange(other.year, other.month) + + first = _get_firstbday(wkday) + + monthsSince = (other.month - self.startingMonth) % 3 + + if n <= 0 and monthsSince != 0: # make sure to roll forward so negate + monthsSince = monthsSince - 3 + + # roll forward if on same month later than first bday + if n <= 0 and (monthsSince == 0 and other.day > first): + n = n + 1 + # pretend to roll back if on same month but before firstbday + elif n > 0 and (monthsSince == 0 and other.day < first): + n = n - 1 + + # get the first bday for result + other = other + relativedelta(months=3 * n - monthsSince) + wkday, _ = tslib.monthrange(other.year, other.month) + first = _get_firstbday(wkday) + result = datetime(other.year, other.month, first, + other.hour, other.minute, other.second, + other.microsecond) + if getattr(other, 'tzinfo', None) is not None: + result = other.tzinfo.localize(result) + return as_timestamp(result) + + +class QuarterEnd(QuarterOffset): + """DateOffset increments between business Quarter dates + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... 
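+
+    For example, Timestamp('2007-05-15') + QuarterEnd(startingMonth=3) falls
+    on 2007-06-30, the calendar end of the quarter, with no business-day
+    adjustment (contrast with BQuarterEnd above).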
+ """ + _outputName = 'QuarterEnd' + _default_startingMonth = 3 + _prefix = 'Q' + + def __init__(self, n=1, normalize=False, **kwds): + self.n = n + self.normalize = normalize + self.startingMonth = kwds.get('startingMonth', 3) + + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.startingMonth is not None) + + @apply_wraps + def apply(self, other): + n = self.n + base = other + other = datetime(other.year, other.month, other.day, + other.hour, other.minute, other.second, + other.microsecond) + other = as_datetime(other) + + wkday, days_in_month = tslib.monthrange(other.year, other.month) + + monthsToGo = 3 - ((other.month - self.startingMonth) % 3) + if monthsToGo == 3: + monthsToGo = 0 + + if n > 0 and not (other.day >= days_in_month and monthsToGo == 0): + n = n - 1 + + other = other + relativedelta(months=monthsToGo + 3 * n, day=31) + if getattr(base, 'tzinfo', None) is not None: + other = base.tzinfo.localize(other) + return as_timestamp(other) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + modMonth = (dt.month - self.startingMonth) % 3 + return MonthEnd().onOffset(dt) and modMonth == 0 + + +class QuarterBegin(QuarterOffset): + _outputName = 'QuarterBegin' + _default_startingMonth = 3 + _from_name_startingMonth = 1 + _prefix = 'QS' + + def isAnchored(self): + return (self.n == 1 and self.startingMonth is not None) + + @apply_wraps + def apply(self, other): + n = self.n + other = as_datetime(other) + + wkday, days_in_month = tslib.monthrange(other.year, other.month) + + monthsSince = (other.month - self.startingMonth) % 3 + + if n <= 0 and monthsSince != 0: + # make sure you roll forward, so negate + monthsSince = monthsSince - 3 + + if n < 0 and (monthsSince == 0 and other.day > 1): + # after start, so come back an extra period as if rolled forward + n = n + 1 + + other = other + relativedelta(months=3 * n - monthsSince, day=1) + return as_timestamp(other) + + +class YearOffset(DateOffset): + """DateOffset that just needs a month""" + + def __init__(self, n=1, normalize=False, **kwds): + self.month = kwds.get('month', self._default_month) + + if self.month < 1 or self.month > 12: + raise ValueError('Month must go from 1 to 12') + + DateOffset.__init__(self, n=n, normalize=normalize, **kwds) + + @classmethod + def _from_name(cls, suffix=None): + kwargs = {} + if suffix: + kwargs['month'] = _month_to_int[suffix] + return cls(**kwargs) + + @property + def rule_code(self): + return '%s-%s' % (self._prefix, _int_to_month[self.month]) + + +class BYearEnd(YearOffset): + """DateOffset increments between business EOM dates""" + _outputName = 'BusinessYearEnd' + _default_month = 12 + _prefix = 'BA' + + @apply_wraps + def apply(self, other): + n = self.n + other = as_datetime(other) + + wkday, days_in_month = tslib.monthrange(other.year, self.month) + lastBDay = (days_in_month - + max(((wkday + days_in_month - 1) % 7) - 4, 0)) + + years = n + if n > 0: + if (other.month < self.month or + (other.month == self.month and other.day < lastBDay)): + years -= 1 + elif n <= 0: + if (other.month > self.month or + (other.month == self.month and other.day > lastBDay)): + years += 1 + + other = other + relativedelta(years=years) + + _, days_in_month = tslib.monthrange(other.year, self.month) + result = datetime(other.year, self.month, days_in_month, + other.hour, other.minute, other.second, + other.microsecond) + + if result.weekday() > 4: + result = result - BDay() + + return as_timestamp(result) + + +class BYearBegin(YearOffset): + 
"""DateOffset increments between business year begin dates""" + _outputName = 'BusinessYearBegin' + _default_month = 1 + _prefix = 'BAS' + + @apply_wraps + def apply(self, other): + n = self.n + other = as_datetime(other) + + wkday, days_in_month = tslib.monthrange(other.year, self.month) + + first = _get_firstbday(wkday) + + years = n + + if n > 0: # roll back first for positive n + if (other.month < self.month or + (other.month == self.month and other.day < first)): + years -= 1 + elif n <= 0: # roll forward + if (other.month > self.month or + (other.month == self.month and other.day > first)): + years += 1 + + # set first bday for result + other = other + relativedelta(years=years) + wkday, days_in_month = tslib.monthrange(other.year, self.month) + first = _get_firstbday(wkday) + return as_timestamp(datetime(other.year, self.month, first, other.hour, + other.minute, other.second, other.microsecond)) + + +class YearEnd(YearOffset): + """DateOffset increments between calendar year ends""" + _default_month = 12 + _prefix = 'A' + + @apply_wraps + def apply(self, other): + def _increment(date): + if date.month == self.month: + _, days_in_month = tslib.monthrange(date.year, self.month) + if date.day != days_in_month: + year = date.year + else: + year = date.year + 1 + elif date.month < self.month: + year = date.year + else: + year = date.year + 1 + _, days_in_month = tslib.monthrange(year, self.month) + return datetime(year, self.month, days_in_month, + date.hour, date.minute, date.second, + date.microsecond) + + def _decrement(date): + year = date.year if date.month > self.month else date.year - 1 + _, days_in_month = tslib.monthrange(year, self.month) + return datetime(year, self.month, days_in_month, + date.hour, date.minute, date.second, + date.microsecond) + + def _rollf(date): + if date.month != self.month or\ + date.day < tslib.monthrange(date.year, date.month)[1]: + date = _increment(date) + return date + + n = self.n + result = other + if n > 0: + while n > 0: + result = _increment(result) + n -= 1 + elif n < 0: + while n < 0: + result = _decrement(result) + n += 1 + else: + # n == 0, roll forward + result = _rollf(result) + + return as_timestamp(result) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + wkday, days_in_month = tslib.monthrange(dt.year, self.month) + return self.month == dt.month and dt.day == days_in_month + + +class YearBegin(YearOffset): + """DateOffset increments between calendar year begin dates""" + _default_month = 1 + _prefix = 'AS' + + @apply_wraps + def apply(self, other): + def _increment(date): + year = date.year + if date.month >= self.month: + year += 1 + return datetime(year, self.month, 1, date.hour, date.minute, + date.second, date.microsecond) + + def _decrement(date): + year = date.year + if date.month < self.month or (date.month == self.month and + date.day == 1): + year -= 1 + return datetime(year, self.month, 1, date.hour, date.minute, + date.second, date.microsecond) + + def _rollf(date): + if (date.month != self.month) or date.day > 1: + date = _increment(date) + return date + + n = self.n + result = other + if n > 0: + while n > 0: + result = _increment(result) + n -= 1 + elif n < 0: + while n < 0: + result = _decrement(result) + n += 1 + else: + # n == 0, roll forward + result = _rollf(result) + + return as_timestamp(result) + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.month == self.month and dt.day == 1 + + +class FY5253(DateOffset): + """ + 
Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar. + + It is used by companies that desire that their + fiscal year always end on the same day of the week. + + It is a method of managing accounting periods. + It is a common calendar structure for some industries, + such as retail, manufacturing and parking industry. + + For more information see: + http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar + + + The year may either: + - end on the last X day of the Y month. + - end on the last X day closest to the last day of the Y month. + + X is a specific day of the week. + Y is a certain month of the year + + Parameters + ---------- + n : int + weekday : {0, 1, ..., 6} + 0: Mondays + 1: Tuesdays + 2: Wednesdays + 3: Thursdays + 4: Fridays + 5: Saturdays + 6: Sundays + startingMonth : The month in which fiscal years end. {1, 2, ... 12} + variation : str + {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" + """ + + _prefix = 'RE' + _suffix_prefix_last = 'L' + _suffix_prefix_nearest = 'N' + + def __init__(self, n=1, normalize=False, **kwds): + self.n = n + self.normalize = normalize + self.startingMonth = kwds['startingMonth'] + self.weekday = kwds["weekday"] + + self.variation = kwds["variation"] + + self.kwds = kwds + + if self.n == 0: + raise ValueError('N cannot be 0') + + if self.variation not in ["nearest", "last"]: + raise ValueError('%s is not a valid variation' % self.variation) + + if self.variation == "nearest": + weekday_offset = weekday(self.weekday) + self._rd_forward = relativedelta(weekday=weekday_offset) + self._rd_backward = relativedelta(weekday=weekday_offset(-1)) + else: + self._offset_lwom = LastWeekOfMonth(n=1, weekday=self.weekday) + + def isAnchored(self): + return self.n == 1 \ + and self.startingMonth is not None \ + and self.weekday is not None + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + dt = datetime(dt.year, dt.month, dt.day) + year_end = self.get_year_end(dt) + + if self.variation == "nearest": + # We have to check the year end of "this" cal year AND the previous + return year_end == dt or \ + self.get_year_end(dt - relativedelta(months=1)) == dt + else: + return year_end == dt + + @apply_wraps + def apply(self, other): + n = self.n + prev_year = self.get_year_end( + datetime(other.year - 1, self.startingMonth, 1)) + cur_year = self.get_year_end( + datetime(other.year, self.startingMonth, 1)) + next_year = self.get_year_end( + datetime(other.year + 1, self.startingMonth, 1)) + if getattr(other, 'tzinfo', None) is not None: + prev_year = other.tzinfo.localize(prev_year) + cur_year = other.tzinfo.localize(cur_year) + next_year = other.tzinfo.localize(next_year) + + if n > 0: + if other == prev_year: + year = other.year - 1 + elif other == cur_year: + year = other.year + elif other == next_year: + year = other.year + 1 + elif other < prev_year: + year = other.year - 1 + n -= 1 + elif other < cur_year: + year = other.year + n -= 1 + elif other < next_year: + year = other.year + 1 + n -= 1 + else: + assert False + + result = self.get_year_end(datetime(year + n, self.startingMonth, 1)) + + result = datetime(result.year, result.month, result.day, + other.hour, other.minute, other.second, other.microsecond) + return result + else: + n = -n + if other == prev_year: + year = other.year - 1 + elif other == cur_year: + year = other.year + elif other == next_year: + year = other.year + 1 + elif other > next_year: + year = other.year + 1 + n -= 1 + elif other > cur_year: + year = other.year + 
n -= 1 + elif other > prev_year: + year = other.year - 1 + n -= 1 + else: + assert False + + result = self.get_year_end(datetime(year - n, self.startingMonth, 1)) + + result = datetime(result.year, result.month, result.day, + other.hour, other.minute, other.second, other.microsecond) + return result + + def get_year_end(self, dt): + if self.variation == "nearest": + return self._get_year_end_nearest(dt) + else: + return self._get_year_end_last(dt) + + def get_target_month_end(self, dt): + target_month = datetime(dt.year, self.startingMonth, 1) + if getattr(dt, 'tzinfo', None) is not None: + target_month = dt.tzinfo.localize(target_month) + next_month_first_of = target_month + relativedelta(months=+1) + return next_month_first_of + relativedelta(days=-1) + + def _get_year_end_nearest(self, dt): + target_date = self.get_target_month_end(dt) + if target_date.weekday() == self.weekday: + return target_date + else: + forward = target_date + self._rd_forward + backward = target_date + self._rd_backward + + if forward - target_date < target_date - backward: + return forward + else: + return backward + + def _get_year_end_last(self, dt): + current_year = datetime(dt.year, self.startingMonth, 1) + if getattr(dt, 'tzinfo', None) is not None: + current_year = dt.tzinfo.localize(current_year) + return current_year + self._offset_lwom + + @property + def rule_code(self): + suffix = self.get_rule_code_suffix() + return "%s-%s" % (self._get_prefix(), suffix) + + def _get_prefix(self): + return self._prefix + + def _get_suffix_prefix(self): + if self.variation == "nearest": + return self._suffix_prefix_nearest + else: + return self._suffix_prefix_last + + def get_rule_code_suffix(self): + return '%s-%s-%s' % (self._get_suffix_prefix(), \ + _int_to_month[self.startingMonth], \ + _int_to_weekday[self.weekday]) + + @classmethod + def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): + if varion_code == "N": + variation = "nearest" + elif varion_code == "L": + variation = "last" + else: + raise ValueError( + "Unable to parse varion_code: %s" % (varion_code,)) + + startingMonth = _month_to_int[startingMonth_code] + weekday = _weekday_to_int[weekday_code] + + return { + "weekday": weekday, + "startingMonth": startingMonth, + "variation": variation, + } + + @classmethod + def _from_name(cls, *args): + return cls(**cls._parse_suffix(*args)) + + +class FY5253Quarter(DateOffset): + """ + DateOffset increments between business quarter dates + for 52-53 week fiscal year (also known as a 4-4-5 calendar). + + It is used by companies that desire that their + fiscal year always end on the same day of the week. + + It is a method of managing accounting periods. + It is a common calendar structure for some industries, + such as retail, manufacturing and parking industry. + + For more information see: + http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar + + The year may either: + - end on the last X day of the Y month. + - end on the last X day closest to the last day of the Y month. + + X is a specific day of the week. + Y is a certain month of the year + + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... + + Parameters + ---------- + n : int + weekday : {0, 1, ..., 6} + 0: Mondays + 1: Tuesdays + 2: Wednesdays + 3: Thursdays + 4: Fridays + 5: Saturdays + 6: Sundays + startingMonth : The month in which fiscal years end. {1, 2, ... 
12} + qtr_with_extra_week : The quarter number that has the leap + or 14 week when needed. {1,2,3,4} + variation : str + {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" + """ + + _prefix = 'REQ' + + def __init__(self, n=1, normalize=False, **kwds): + self.n = n + self.normalize = normalize + + self.qtr_with_extra_week = kwds["qtr_with_extra_week"] + + self.kwds = kwds + + if self.n == 0: + raise ValueError('N cannot be 0') + + self._offset = FY5253( \ + startingMonth=kwds['startingMonth'], \ + weekday=kwds["weekday"], + variation=kwds["variation"]) + + def isAnchored(self): + return self.n == 1 and self._offset.isAnchored() + + @apply_wraps + def apply(self, other): + base = other + other = as_datetime(other) + + n = self.n + + if n > 0: + while n > 0: + if not self._offset.onOffset(other): + qtr_lens = self.get_weeks(other) + start = other - self._offset + else: + start = other + qtr_lens = self.get_weeks(other + self._offset) + + for weeks in qtr_lens: + start += relativedelta(weeks=weeks) + if start > other: + other = start + n -= 1 + break + + else: + n = -n + while n > 0: + if not self._offset.onOffset(other): + qtr_lens = self.get_weeks(other) + end = other + self._offset + else: + end = other + qtr_lens = self.get_weeks(other) + + for weeks in reversed(qtr_lens): + end -= relativedelta(weeks=weeks) + if end < other: + other = end + n -= 1 + break + other = datetime(other.year, other.month, other.day, + base.hour, base.minute, base.second, base.microsecond) + return other + + def get_weeks(self, dt): + ret = [13] * 4 + + year_has_extra_week = self.year_has_extra_week(dt) + + if year_has_extra_week: + ret[self.qtr_with_extra_week - 1] = 14 + + return ret + + def year_has_extra_week(self, dt): + if self._offset.onOffset(dt): + prev_year_end = dt - self._offset + next_year_end = dt + else: + next_year_end = dt + self._offset + prev_year_end = dt - self._offset + + week_in_year = (next_year_end - prev_year_end).days / 7 + + return week_in_year == 53 + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + if self._offset.onOffset(dt): + return True + + next_year_end = dt - self._offset + + qtr_lens = self.get_weeks(dt) + + current = next_year_end + for qtr_len in qtr_lens[0:4]: + current += relativedelta(weeks=qtr_len) + if dt == current: + return True + return False + + @property + def rule_code(self): + suffix = self._offset.get_rule_code_suffix() + return "%s-%s" % (self._prefix, + "%s-%d" % (suffix, self.qtr_with_extra_week)) + + @classmethod + def _from_name(cls, *args): + return cls(**dict(FY5253._parse_suffix(*args[:-1]), + qtr_with_extra_week=int(args[-1]))) + +class Easter(DateOffset): + ''' + DateOffset for the Easter holiday using + logic defined in dateutil. Right now uses + the revised method which is valid in years + 1583-4099. 
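+
+    For example, Timestamp('2014-01-01') + Easter() falls on 2014-04-20,
+    Easter Sunday of that year.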
+ ''' + def __init__(self, n=1, **kwds): + super(Easter, self).__init__(n, **kwds) + + @apply_wraps + def apply(self, other): + currentEaster = easter(other.year) + currentEaster = datetime(currentEaster.year, currentEaster.month, currentEaster.day) + if getattr(other, 'tzinfo', None) is not None: + currentEaster = other.tzinfo.localize(currentEaster) + + # NOTE: easter returns a datetime.date so we have to convert to type of other + if self.n >= 0: + if other >= currentEaster: + new = easter(other.year + self.n) + else: + new = easter(other.year + self.n - 1) + else: + if other > currentEaster: + new = easter(other.year + self.n + 1) + else: + new = easter(other.year + self.n) + + new = datetime(new.year, new.month, new.day, other.hour, + other.minute, other.second, other.microsecond) + return new + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return date(dt.year, dt.month, dt.day) == easter(dt.year) + +#---------------------------------------------------------------------- +# Ticks + +import operator + + +def _tick_comp(op): + def f(self, other): + return op(self.delta, other.delta) + + return f + + +class Tick(SingleConstructorOffset): + _inc = timedelta(microseconds=1000) + + __gt__ = _tick_comp(operator.gt) + __ge__ = _tick_comp(operator.ge) + __lt__ = _tick_comp(operator.lt) + __le__ = _tick_comp(operator.le) + __eq__ = _tick_comp(operator.eq) + __ne__ = _tick_comp(operator.ne) + + def __add__(self, other): + if isinstance(other, Tick): + if type(self) == type(other): + return type(self)(self.n + other.n) + else: + return _delta_to_tick(self.delta + other.delta) + try: + return self.apply(other) + except ApplyTypeError: + return NotImplemented + + def __eq__(self, other): + if isinstance(other, compat.string_types): + from pandas.tseries.frequencies import to_offset + + other = to_offset(other) + + if isinstance(other, Tick): + return self.delta == other.delta + else: + return DateOffset.__eq__(self, other) + + # This is identical to DateOffset.__hash__, but has to be redefined here + # for Python 3, because we've redefined __eq__. 
+ def __hash__(self): + return hash(self._params()) + + def __ne__(self, other): + if isinstance(other, compat.string_types): + from pandas.tseries.frequencies import to_offset + + other = to_offset(other) + + if isinstance(other, Tick): + return self.delta != other.delta + else: + return DateOffset.__ne__(self, other) + + @property + def delta(self): + return self.n * self._inc + + @property + def nanos(self): + return _delta_to_nanoseconds(self.delta) + + def apply(self, other): + # Timestamp can handle tz and nano sec, thus no need to use apply_wraps + if type(other) == date: + other = datetime(other.year, other.month, other.day) + elif isinstance(other, (np.datetime64, datetime)): + other = as_timestamp(other) + + if isinstance(other, datetime): + result = other + self.delta + if self.normalize: + # normalize_date returns normal datetime + result = tslib.normalize_date(result) + return as_timestamp(result) + + elif isinstance(other, timedelta): + return other + self.delta + elif isinstance(other, type(self)): + return type(self)(self.n + other.n) + else: + raise ApplyTypeError('Unhandled type: %s' % type(other).__name__) + + _prefix = 'undefined' + + def isAnchored(self): + return False + + +def _delta_to_tick(delta): + if delta.microseconds == 0: + if delta.seconds == 0: + return Day(delta.days) + else: + seconds = delta.days * 86400 + delta.seconds + if seconds % 3600 == 0: + return Hour(seconds / 3600) + elif seconds % 60 == 0: + return Minute(seconds / 60) + else: + return Second(seconds) + else: + nanos = _delta_to_nanoseconds(delta) + if nanos % 1000000 == 0: + return Milli(nanos // 1000000) + elif nanos % 1000 == 0: + return Micro(nanos // 1000) + else: # pragma: no cover + return Nano(nanos) + + +def _delta_to_nanoseconds(delta): + if isinstance(delta, np.timedelta64): + return delta.astype('timedelta64[ns]').item() + elif isinstance(delta, Tick): + delta = delta.delta + + return (delta.days * 24 * 60 * 60 * 1000000 + + delta.seconds * 1000000 + + delta.microseconds) * 1000 + + +class Day(Tick): + _inc = timedelta(1) + _prefix = 'D' + + +class Hour(Tick): + _inc = timedelta(0, 3600) + _prefix = 'H' + + +class Minute(Tick): + _inc = timedelta(0, 60) + _prefix = 'T' + + +class Second(Tick): + _inc = timedelta(0, 1) + _prefix = 'S' + + +class Milli(Tick): + _prefix = 'L' + + +class Micro(Tick): + _inc = timedelta(microseconds=1) + _prefix = 'U' + + +class Nano(Tick): + _inc = np.timedelta64(1, 'ns') if not _np_version_under1p7 else 1 + _prefix = 'N' + + +BDay = BusinessDay +BMonthEnd = BusinessMonthEnd +BMonthBegin = BusinessMonthBegin +CBMonthEnd = CustomBusinessMonthEnd +CBMonthBegin = CustomBusinessMonthBegin +CDay = CustomBusinessDay + + +def _get_firstbday(wkday): + """ + wkday is the result of monthrange(year, month) + + If it's a saturday or sunday, increment first business day to reflect this + """ + first = 1 + if wkday == 5: # on Saturday + first = 3 + elif wkday == 6: # on Sunday + first = 2 + return first + + +def generate_range(start=None, end=None, periods=None, + offset=BDay(), time_rule=None): + """ + Generates a sequence of dates corresponding to the specified time + offset. 
Similar to dateutil.rrule except uses pandas DateOffset + objects to represent time increments + + Parameters + ---------- + start : datetime (default None) + end : datetime (default None) + periods : int, optional + time_rule : (legacy) name of DateOffset object to be used, optional + Corresponds with names expected by tseries.frequencies.get_offset + + Notes + ----- + * This method is faster for generating weekdays than dateutil.rrule + * At least two of (start, end, periods) must be specified. + * If both start and end are specified, the returned dates will + satisfy start <= date <= end. + * If both time_rule and offset are specified, time_rule supersedes offset. + + Returns + ------- + dates : generator object + + """ + if time_rule is not None: + from pandas.tseries.frequencies import get_offset + + offset = get_offset(time_rule) + + start = to_datetime(start) + end = to_datetime(end) + + if start and not offset.onOffset(start): + start = offset.rollforward(start) + + if end and not offset.onOffset(end): + end = offset.rollback(end) + + if periods is None and end < start: + end = None + periods = 0 + + if end is None: + end = start + (periods - 1) * offset + + if start is None: + start = end - (periods - 1) * offset + + cur = start + + next_date = cur + while cur <= end: + yield cur + + # faster than cur + offset + next_date = offset.apply(cur) + if next_date <= cur: + raise ValueError('Offset %s did not increment date' % offset) + cur = next_date + +prefix_mapping = dict((offset._prefix, offset) for offset in [ + YearBegin, # 'AS' + YearEnd, # 'A' + BYearBegin, # 'BAS' + BYearEnd, # 'BA' + BusinessDay, # 'B' + BusinessMonthBegin, # 'BMS' + BusinessMonthEnd, # 'BM' + BQuarterEnd, # 'BQ' + BQuarterBegin, # 'BQS' + CustomBusinessDay, # 'C' + CustomBusinessMonthEnd, # 'CBM' + CustomBusinessMonthBegin, # 'CBMS' + MonthEnd, # 'M' + MonthBegin, # 'MS' + Week, # 'W' + Second, # 'S' + Minute, # 'T' + Micro, # 'U' + QuarterEnd, # 'Q' + QuarterBegin, # 'QS' + Milli, # 'L' + Hour, # 'H' + Day, # 'D' + WeekOfMonth, # 'WOM' + FY5253, + FY5253Quarter, +]) + +if not _np_version_under1p7: + # Only 1.7+ supports nanosecond resolution + prefix_mapping['N'] = Nano + + +def _make_offset(key): + """Gets offset based on key. KeyError if prefix is bad, ValueError if + suffix is bad. All handled by `get_offset` in tseries/frequencies. 
Not + public.""" + if key is None: + return None + split = key.replace('@', '-').split('-') + klass = prefix_mapping[split[0]] + # handles case where there's no suffix (and will TypeError if too many '-') + obj = klass._from_name(*split[1:]) + obj._named = key + return obj diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py new file mode 100644 index 00000000..5948fbf8 --- /dev/null +++ b/pandas/tseries/period.py @@ -0,0 +1,1337 @@ +# pylint: disable=E1101,E1103,W0232 +import operator + +from datetime import datetime, date +import numpy as np +from pandas.core.base import PandasObject + +from pandas.tseries.frequencies import (get_freq_code as _gfc, + _month_numbers, FreqGroup) +from pandas.tseries.index import DatetimeIndex, Int64Index, Index +from pandas.core.base import DatetimeIndexOpsMixin +from pandas.tseries.tools import parse_time_string +import pandas.tseries.frequencies as _freq_mod + +import pandas.core.common as com +from pandas.core.common import (isnull, _INT64_DTYPE, _maybe_box, + _values_from_object, ABCSeries) +from pandas import compat +from pandas.lib import Timestamp +import pandas.lib as lib +import pandas.tslib as tslib +import pandas.algos as _algos +from pandas.compat import zip, u + + +#--------------- +# Period logic + +def _period_field_accessor(name, alias): + def f(self): + base, mult = _gfc(self.freq) + return tslib.get_period_field(alias, self.ordinal, base) + f.__name__ = name + return property(f) + + +def _field_accessor(name, alias): + def f(self): + base, mult = _gfc(self.freq) + return tslib.get_period_field_arr(alias, self.values, base) + f.__name__ = name + return property(f) + + +class Period(PandasObject): + """ + Represents an period of time + + Parameters + ---------- + value : Period or compat.string_types, default None + The time period represented (e.g., '4Q2005') + freq : str, default None + e.g., 'B' for businessday. Must be a singular rule-code (e.g. 5T is not + allowed). + year : int, default None + month : int, default 1 + quarter : int, default None + day : int, default 1 + hour : int, default 0 + minute : int, default 0 + second : int, default 0 + """ + __slots__ = ['freq', 'ordinal'] + _comparables = ['name','freqstr'] + + def __init__(self, value=None, freq=None, ordinal=None, + year=None, month=1, quarter=None, day=1, + hour=0, minute=0, second=0): + # freq points to a tuple (base, mult); base is one of the defined + # periods such as A, Q, etc. 
Every five minutes would be, e.g., + # ('T', 5) but may be passed in as a string like '5T' + + self.freq = None + + # ordinal is the period offset from the gregorian proleptic epoch + self.ordinal = None + + if ordinal is not None and value is not None: + raise ValueError(("Only value or ordinal but not both should be " + "given but not both")) + elif ordinal is not None: + if not com.is_integer(ordinal): + raise ValueError("Ordinal must be an integer") + if freq is None: + raise ValueError('Must supply freq for ordinal value') + self.ordinal = ordinal + + elif value is None: + if freq is None: + raise ValueError("If value is None, freq cannot be None") + + self.ordinal = _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq) + + elif isinstance(value, Period): + other = value + if freq is None or _gfc(freq) == _gfc(other.freq): + self.ordinal = other.ordinal + freq = other.freq + else: + converted = other.asfreq(freq) + self.ordinal = converted.ordinal + + elif com._is_null_datelike_scalar(value) or value in tslib._nat_strings: + self.ordinal = tslib.iNaT + if freq is None: + raise ValueError("If value is NaT, freq cannot be None " + "because it cannot be inferred") + + elif isinstance(value, compat.string_types) or com.is_integer(value): + if com.is_integer(value): + value = str(value) + value = value.upper() + + dt, _, reso = parse_time_string(value, freq) + if freq is None: + try: + freq = _freq_mod.Resolution.get_freq(reso) + except KeyError: + raise ValueError("Invalid frequency or could not infer: %s" % reso) + + elif isinstance(value, datetime): + dt = value + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, date): + dt = datetime(year=value.year, month=value.month, day=value.day) + if freq is None: + raise ValueError('Must supply freq for datetime value') + else: + msg = "Value must be Period, string, integer, or datetime" + raise ValueError(msg) + + base, mult = _gfc(freq) + if mult != 1: + # TODO: Better error message - this is slightly confusing + raise ValueError('Only mult == 1 supported') + + if self.ordinal is None: + self.ordinal = tslib.period_ordinal(dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, dt.microsecond, 0, + base) + + self.freq = _freq_mod._get_freq_str(base) + + def __eq__(self, other): + if isinstance(other, Period): + if other.freq != self.freq: + raise ValueError("Cannot compare non-conforming periods") + if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: + return False + return (self.ordinal == other.ordinal + and _gfc(self.freq) == _gfc(other.freq)) + return NotImplemented + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash((self.ordinal, self.freq)) + + def __add__(self, other): + if com.is_integer(other): + if self.ordinal == tslib.iNaT: + ordinal = self.ordinal + else: + ordinal = self.ordinal + other + return Period(ordinal=ordinal, freq=self.freq) + else: # pragma: no cover + return NotImplemented + + def __sub__(self, other): + if com.is_integer(other): + if self.ordinal == tslib.iNaT: + ordinal = self.ordinal + else: + ordinal = self.ordinal - other + return Period(ordinal=ordinal, freq=self.freq) + if isinstance(other, Period): + if other.freq != self.freq: + raise ValueError("Cannot do arithmetic with " + "non-conforming periods") + if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: + return Period(ordinal=tslib.iNaT, freq=self.freq) + return self.ordinal - other.ordinal + else: # pragma: no cover + 
return NotImplemented + + def _comp_method(func, name): + def f(self, other): + if isinstance(other, Period): + if other.freq != self.freq: + raise ValueError("Cannot compare non-conforming periods") + if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: + return False + return func(self.ordinal, other.ordinal) + else: + raise TypeError(other) + + f.__name__ = name + return f + + __lt__ = _comp_method(operator.lt, '__lt__') + __le__ = _comp_method(operator.le, '__le__') + __gt__ = _comp_method(operator.gt, '__gt__') + __ge__ = _comp_method(operator.ge, '__ge__') + + def asfreq(self, freq, how='E'): + """ + Convert Period to desired frequency, either at the start or end of the + interval + + Parameters + ---------- + freq : string + how : {'E', 'S', 'end', 'start'}, default 'end' + Start or end of the timespan + + Returns + ------- + resampled : Period + """ + how = _validate_end_alias(how) + base1, mult1 = _gfc(self.freq) + base2, mult2 = _gfc(freq) + + if mult2 != 1: + raise ValueError('Only mult == 1 supported') + + end = how == 'E' + new_ordinal = tslib.period_asfreq(self.ordinal, base1, base2, end) + + return Period(ordinal=new_ordinal, freq=base2) + + @property + def start_time(self): + return self.to_timestamp(how='S') + + @property + def end_time(self): + if self.ordinal == tslib.iNaT: + ordinal = self.ordinal + else: + ordinal = (self + 1).start_time.value - 1 + return Timestamp(ordinal) + + def to_timestamp(self, freq=None, how='start', tz=None): + """ + Return the Timestamp representation of the Period at the target + frequency at the specified end (how) of the Period + + Parameters + ---------- + freq : string or DateOffset, default is 'D' if self.freq is week or + longer and 'S' otherwise + Target frequency + how: str, default 'S' (start) + 'S', 'E'. Can be aliased as case insensitive + 'Start', 'Finish', 'Begin', 'End' + + Returns + ------- + Timestamp + """ + how = _validate_end_alias(how) + + if freq is None: + base, mult = _gfc(self.freq) + freq = _freq_mod.get_to_timestamp_base(base) + + base, mult = _gfc(freq) + val = self.asfreq(freq, how) + + dt64 = tslib.period_ordinal_to_dt64(val.ordinal, base) + return Timestamp(dt64, tz=tz) + + year = _period_field_accessor('year', 0) + month = _period_field_accessor('month', 3) + day = _period_field_accessor('day', 4) + hour = _period_field_accessor('hour', 5) + minute = _period_field_accessor('minute', 6) + second = _period_field_accessor('second', 7) + weekofyear = _period_field_accessor('week', 8) + week = weekofyear + dayofweek = _period_field_accessor('dayofweek', 10) + weekday = dayofweek + dayofyear = _period_field_accessor('dayofyear', 9) + quarter = _period_field_accessor('quarter', 2) + qyear = _period_field_accessor('qyear', 1) + + @classmethod + def now(cls, freq=None): + return Period(datetime.now(), freq=freq) + + def __repr__(self): + base, mult = _gfc(self.freq) + formatted = tslib.period_format(self.ordinal, base) + freqstr = _freq_mod._reverse_period_code_map[base] + + if not compat.PY3: + encoding = com.get_option("display.encoding") + formatted = formatted.encode(encoding) + + return "Period('%s', '%s')" % (formatted, freqstr) + + def __unicode__(self): + """ + Return a string representation for a particular DataFrame + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. 
+ """ + base, mult = _gfc(self.freq) + formatted = tslib.period_format(self.ordinal, base) + value = ("%s" % formatted) + return value + + def strftime(self, fmt): + """ + Returns the string representation of the :class:`Period`, depending + on the selected :keyword:`format`. :keyword:`format` must be a string + containing one or several directives. The method recognizes the same + directives as the :func:`time.strftime` function of the standard Python + distribution, as well as the specific additional directives ``%f``, + ``%F``, ``%q``. (formatting & docs originally from scikits.timeries) + + +-----------+--------------------------------+-------+ + | Directive | Meaning | Notes | + +===========+================================+=======+ + | ``%a`` | Locale's abbreviated weekday | | + | | name. | | + +-----------+--------------------------------+-------+ + | ``%A`` | Locale's full weekday name. | | + +-----------+--------------------------------+-------+ + | ``%b`` | Locale's abbreviated month | | + | | name. | | + +-----------+--------------------------------+-------+ + | ``%B`` | Locale's full month name. | | + +-----------+--------------------------------+-------+ + | ``%c`` | Locale's appropriate date and | | + | | time representation. | | + +-----------+--------------------------------+-------+ + | ``%d`` | Day of the month as a decimal | | + | | number [01,31]. | | + +-----------+--------------------------------+-------+ + | ``%f`` | 'Fiscal' year without a | \(1) | + | | century as a decimal number | | + | | [00,99] | | + +-----------+--------------------------------+-------+ + | ``%F`` | 'Fiscal' year with a century | \(2) | + | | as a decimal number | | + +-----------+--------------------------------+-------+ + | ``%H`` | Hour (24-hour clock) as a | | + | | decimal number [00,23]. | | + +-----------+--------------------------------+-------+ + | ``%I`` | Hour (12-hour clock) as a | | + | | decimal number [01,12]. | | + +-----------+--------------------------------+-------+ + | ``%j`` | Day of the year as a decimal | | + | | number [001,366]. | | + +-----------+--------------------------------+-------+ + | ``%m`` | Month as a decimal number | | + | | [01,12]. | | + +-----------+--------------------------------+-------+ + | ``%M`` | Minute as a decimal number | | + | | [00,59]. | | + +-----------+--------------------------------+-------+ + | ``%p`` | Locale's equivalent of either | \(3) | + | | AM or PM. | | + +-----------+--------------------------------+-------+ + | ``%q`` | Quarter as a decimal number | | + | | [01,04] | | + +-----------+--------------------------------+-------+ + | ``%S`` | Second as a decimal number | \(4) | + | | [00,61]. | | + +-----------+--------------------------------+-------+ + | ``%U`` | Week number of the year | \(5) | + | | (Sunday as the first day of | | + | | the week) as a decimal number | | + | | [00,53]. All days in a new | | + | | year preceding the first | | + | | Sunday are considered to be in | | + | | week 0. | | + +-----------+--------------------------------+-------+ + | ``%w`` | Weekday as a decimal number | | + | | [0(Sunday),6]. | | + +-----------+--------------------------------+-------+ + | ``%W`` | Week number of the year | \(5) | + | | (Monday as the first day of | | + | | the week) as a decimal number | | + | | [00,53]. All days in a new | | + | | year preceding the first | | + | | Monday are considered to be in | | + | | week 0. 
| | + +-----------+--------------------------------+-------+ + | ``%x`` | Locale's appropriate date | | + | | representation. | | + +-----------+--------------------------------+-------+ + | ``%X`` | Locale's appropriate time | | + | | representation. | | + +-----------+--------------------------------+-------+ + | ``%y`` | Year without century as a | | + | | decimal number [00,99]. | | + +-----------+--------------------------------+-------+ + | ``%Y`` | Year with century as a decimal | | + | | number. | | + +-----------+--------------------------------+-------+ + | ``%Z`` | Time zone name (no characters | | + | | if no time zone exists). | | + +-----------+--------------------------------+-------+ + | ``%%`` | A literal ``'%'`` character. | | + +-----------+--------------------------------+-------+ + + .. note:: + + (1) + The ``%f`` directive is the same as ``%y`` if the frequency is + not quarterly. + Otherwise, it corresponds to the 'fiscal' year, as defined by + the :attr:`qyear` attribute. + + (2) + The ``%F`` directive is the same as ``%Y`` if the frequency is + not quarterly. + Otherwise, it corresponds to the 'fiscal' year, as defined by + the :attr:`qyear` attribute. + + (3) + The ``%p`` directive only affects the output hour field + if the ``%I`` directive is used to parse the hour. + + (4) + The range really is ``0`` to ``61``; this accounts for leap + seconds and the (very rare) double leap seconds. + + (5) + The ``%U`` and ``%W`` directives are only used in calculations + when the day of the week and the year are specified. + + .. rubric:: Examples + + >>> a = Period(freq='Q@JUL', year=2006, quarter=1) + >>> a.strftime('%F-Q%q') + '2006-Q1' + >>> # Output the last month in the quarter of this date + >>> a.strftime('%b-%Y') + 'Oct-2005' + >>> + >>> a = Period(freq='D', year=2001, month=1, day=1) + >>> a.strftime('%d-%b-%Y') + '01-Jan-2006' + >>> a.strftime('%b. %d, %Y was a %A') + 'Jan. 
01, 2001 was a Monday' + """ + base, mult = _gfc(self.freq) + return tslib.period_format(self.ordinal, base, fmt) + + +def _get_ordinals(data, freq): + f = lambda x: Period(x, freq=freq).ordinal + if isinstance(data[0], Period): + return tslib.extract_ordinals(data, freq) + else: + return lib.map_infer(data, f) + + +def dt64arr_to_periodarr(data, freq, tz): + if data.dtype != np.dtype('M8[ns]'): + raise ValueError('Wrong dtype: %s' % data.dtype) + + base, mult = _gfc(freq) + return tslib.dt64arr_to_periodarr(data.view('i8'), base, tz) + +# --- Period index sketch + +def _period_index_cmp(opname, nat_result=False): + """ + Wrap comparison operations to convert datetime-like to datetime64 + """ + def wrapper(self, other): + if isinstance(other, Period): + func = getattr(self.values, opname) + if other.freq != self.freq: + raise AssertionError("Frequencies must be equal") + + result = func(other.ordinal) + elif isinstance(other, PeriodIndex): + if other.freq != self.freq: + raise AssertionError("Frequencies must be equal") + + result = getattr(self.values, opname)(other.values) + + mask = (com.mask_missing(self.values, tslib.iNaT) | + com.mask_missing(other.values, tslib.iNaT)) + if mask.any(): + result[mask] = nat_result + + return result + else: + other = Period(other, freq=self.freq) + func = getattr(self.values, opname) + result = func(other.ordinal) + + if other.ordinal == tslib.iNaT: + result.fill(nat_result) + mask = self.values == tslib.iNaT + if mask.any(): + result[mask] = nat_result + + return result + return wrapper + + +class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): + """ + Immutable ndarray holding ordinal values indicating regular periods in + time such as particular years, quarters, months, etc. A value of 1 is the + period containing the Gregorian proleptic datetime Jan 1, 0001 00:00:00. + This ordinal representation is from the scikits.timeseries project. + + For instance, + # construct period for day 1/1/1 and get the first second + i = Period(year=1,month=1,day=1,freq='D').asfreq('S', 'S') + i.ordinal + ===> 1 + + Index keys are boxed to Period objects which carries the metadata (eg, + frequency information). + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional period-like data to construct index with + dtype : NumPy dtype (default: i8) + copy : bool + Make a copy of input ndarray + freq : string or period object, optional + One of pandas period strings or corresponding objects + start : starting value, period-like, optional + If data is None, used as the start point in generating regular + period data. + periods : int, optional, > 0 + Number of periods to generate, if generating index. 
Takes precedence + over end argument + end : end value, period-like, optional + If periods is none, generated index will extend to first conforming + period on or just past end argument + year : int, array, or Series, default None + month : int, array, or Series, default None + quarter : int, array, or Series, default None + day : int, array, or Series, default None + hour : int, array, or Series, default None + minute : int, array, or Series, default None + second : int, array, or Series, default None + tz : object, default None + Timezone for converting datetime64 data to Periods + + Examples + -------- + >>> idx = PeriodIndex(year=year_arr, quarter=q_arr) + + >>> idx2 = PeriodIndex(start='2000', end='2010', freq='A') + """ + _box_scalars = True + _allow_period_index_ops = True + + __eq__ = _period_index_cmp('__eq__') + __ne__ = _period_index_cmp('__ne__', nat_result=True) + __lt__ = _period_index_cmp('__lt__') + __gt__ = _period_index_cmp('__gt__') + __le__ = _period_index_cmp('__le__') + __ge__ = _period_index_cmp('__ge__') + + def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, + periods=None, copy=False, name=None, year=None, month=None, + quarter=None, day=None, hour=None, minute=None, second=None, + tz=None): + + freq = _freq_mod.get_standard_freq(freq) + + if periods is not None: + if com.is_float(periods): + periods = int(periods) + elif not com.is_integer(periods): + raise ValueError('Periods must be a number, got %s' % + str(periods)) + + if data is None: + if ordinal is not None: + data = np.asarray(ordinal, dtype=np.int64) + else: + fields = [year, month, quarter, day, hour, minute, second] + data, freq = cls._generate_range(start, end, periods, + freq, fields) + else: + ordinal, freq = cls._from_arraylike(data, freq, tz) + data = np.array(ordinal, dtype=np.int64, copy=False) + + subarr = data.view(cls) + subarr.name = name + subarr.freq = freq + + return subarr + + @classmethod + def _generate_range(cls, start, end, periods, freq, fields): + field_count = com._count_not_none(*fields) + if com._count_not_none(start, end) > 0: + if field_count > 0: + raise ValueError('Can either instantiate from fields ' + 'or endpoints, but not both') + subarr, freq = _get_ordinal_range(start, end, periods, freq) + elif field_count > 0: + y, mth, q, d, h, minute, s = fields + subarr, freq = _range_from_fields(year=y, month=mth, quarter=q, + day=d, hour=h, minute=minute, + second=s, freq=freq) + else: + raise ValueError('Not enough parameters to construct ' + 'Period range') + + return subarr, freq + + @classmethod + def _from_arraylike(cls, data, freq, tz): + if not isinstance(data, np.ndarray): + if np.isscalar(data) or isinstance(data, Period): + raise ValueError('PeriodIndex() must be called with a ' + 'collection of some kind, %s was passed' + % repr(data)) + + # other iterable of some kind + if not isinstance(data, (list, tuple)): + data = list(data) + + try: + data = com._ensure_int64(data) + if freq is None: + raise ValueError('freq not specified') + data = np.array([Period(x, freq=freq).ordinal for x in data], + dtype=np.int64) + except (TypeError, ValueError): + data = com._ensure_object(data) + + if freq is None and len(data) > 0: + freq = getattr(data[0], 'freq', None) + + if freq is None: + raise ValueError('freq not specified and cannot be ' + 'inferred from first element') + + data = _get_ordinals(data, freq) + else: + if isinstance(data, PeriodIndex): + if freq is None or freq == data.freq: + freq = data.freq + data = data.values + else: + base1, _ = 
_gfc(data.freq) + base2, _ = _gfc(freq) + data = tslib.period_asfreq_arr(data.values, base1, + base2, 1) + else: + if freq is None and len(data) > 0: + freq = getattr(data[0], 'freq', None) + + if freq is None: + raise ValueError('freq not specified and cannot be ' + 'inferred from first element') + + if data.dtype != np.int64: + if np.issubdtype(data.dtype, np.datetime64): + data = dt64arr_to_periodarr(data, freq, tz) + else: + try: + data = com._ensure_int64(data) + except (TypeError, ValueError): + data = com._ensure_object(data) + data = _get_ordinals(data, freq) + + return data, freq + + @classmethod + def _simple_new(cls, values, name, freq=None, **kwargs): + result = values.view(cls) + result.name = name + result.freq = freq + return result + + @property + def _na_value(self): + return self._box_func(tslib.iNaT) + + def __contains__(self, key): + if not isinstance(key, Period) or key.freq != self.freq: + if isinstance(key, compat.string_types): + try: + self.get_loc(key) + return True + except Exception: + return False + return False + return key.ordinal in self._engine + + @property + def _box_func(self): + return lambda x: Period(ordinal=x, freq=self.freq) + + def asof_locs(self, where, mask): + """ + where : array of timestamps + mask : array of booleans where data is not NA + + """ + where_idx = where + if isinstance(where_idx, DatetimeIndex): + where_idx = PeriodIndex(where_idx.values, freq=self.freq) + + locs = self.values[mask].searchsorted(where_idx.values, side='right') + + locs = np.where(locs > 0, locs - 1, 0) + result = np.arange(len(self))[mask].take(locs) + + first = mask.argmax() + result[(locs == 0) & (where_idx.values < self.values[first])] = -1 + + return result + + def _array_values(self): + return self.asobject + + def astype(self, dtype): + dtype = np.dtype(dtype) + if dtype == np.object_: + return Index(np.array(list(self), dtype), dtype) + elif dtype == _INT64_DTYPE: + return Index(self.values, dtype) + raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) + + def __iter__(self): + for val in self.values: + yield Period(ordinal=val, freq=self.freq) + + def searchsorted(self, key, side='left'): + if isinstance(key, compat.string_types): + key = Period(key, freq=self.freq).ordinal + + return self.values.searchsorted(key, side=side) + + @property + def is_all_dates(self): + return True + + @property + def is_full(self): + """ + Returns True if there are any missing periods from start to end + """ + if len(self) == 0: + return True + if not self.is_monotonic: + raise ValueError('Index is not monotonic') + values = self.values + return ((values[1:] - values[:-1]) < 2).all() + + @property + def freqstr(self): + return self.freq + + def asfreq(self, freq=None, how='E'): + how = _validate_end_alias(how) + + freq = _freq_mod.get_standard_freq(freq) + + base1, mult1 = _gfc(self.freq) + base2, mult2 = _gfc(freq) + + if mult2 != 1: + raise ValueError('Only mult == 1 supported') + + end = how == 'E' + new_data = tslib.period_asfreq_arr(self.values, base1, base2, end) + return self._simple_new(new_data, self.name, freq=freq) + + def to_datetime(self, dayfirst=False): + return self.to_timestamp() + + _year = _field_accessor('year', 0) + _month = _field_accessor('month', 3) + _day = _field_accessor('day', 4) + _hour = _field_accessor('hour', 5) + _minute = _field_accessor('minute', 6) + _second = _field_accessor('second', 7) + _weekofyear = _field_accessor('week', 8) + _week = _weekofyear + _dayofweek = _field_accessor('dayofweek', 10) + _weekday = _dayofweek + 
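+    # each _field_accessor above/below extracts the named field from the
+    # stored ordinals via tslib.get_period_field_arr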
_dayofyear = day_of_year = _field_accessor('dayofyear', 9) + _quarter = _field_accessor('quarter', 2) + _qyear = _field_accessor('qyear', 1) + + # Try to run function on index first, and then on elements of index + # Especially important for group-by functionality + def map(self, f): + try: + result = f(self) + if not isinstance(result, np.ndarray): + raise TypeError + return result + except Exception: + return _algos.arrmap_object(self.asobject, f) + + def _get_object_array(self): + freq = self.freq + boxfunc = lambda x: Period(ordinal=x, freq=freq) + boxer = np.frompyfunc(boxfunc, 1, 1) + return boxer(self.values) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self._get_object_array() + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + return np.array_equal(self.asi8, other.asi8) + + def to_timestamp(self, freq=None, how='start'): + """ + Cast to DatetimeIndex + + Parameters + ---------- + freq : string or DateOffset, default 'D' for week or longer, 'S' + otherwise + Target frequency + how : {'s', 'e', 'start', 'end'} + + Returns + ------- + DatetimeIndex + """ + how = _validate_end_alias(how) + + if freq is None: + base, mult = _gfc(self.freq) + freq = _freq_mod.get_to_timestamp_base(base) + + base, mult = _gfc(freq) + new_data = self.asfreq(freq, how) + + new_data = tslib.periodarr_to_dt64arr(new_data.values, base) + return DatetimeIndex(new_data, freq='infer', name=self.name) + + def shift(self, n): + """ + Specialized shift which produces an PeriodIndex + + Parameters + ---------- + n : int + Periods to shift by + freq : freq string + + Returns + ------- + shifted : PeriodIndex + """ + mask = self.values == tslib.iNaT + values = self.values + n + values[mask] = tslib.iNaT + return PeriodIndex(data=values, name=self.name, freq=self.freq) + + def __add__(self, other): + try: + return self.shift(other) + except TypeError: + # self.values + other raises TypeError for invalid input + return NotImplemented + + def __sub__(self, other): + try: + return self.shift(-other) + except TypeError: + return NotImplemented + + @property + def inferred_type(self): + # b/c data is represented as ints make sure we can't have ambiguous + # indexing + return 'period' + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. 
Only use this if you + know what you're doing + """ + s = _values_from_object(series) + try: + return _maybe_box(self, super(PeriodIndex, self).get_value(s, key), series, key) + except (KeyError, IndexError): + try: + asdt, parsed, reso = parse_time_string(key, self.freq) + grp = _freq_mod._infer_period_group(reso) + freqn = _freq_mod._period_group(self.freq) + + vals = self.values + + # if our data is higher resolution than requested key, slice + if grp < freqn: + iv = Period(asdt, freq=(grp, 1)) + ord1 = iv.asfreq(self.freq, how='S').ordinal + ord2 = iv.asfreq(self.freq, how='E').ordinal + + if ord2 < vals[0] or ord1 > vals[-1]: + raise KeyError(key) + + pos = np.searchsorted(self.values, [ord1, ord2]) + key = slice(pos[0], pos[1] + 1) + return series[key] + elif grp == freqn: + key = Period(asdt, freq=self.freq).ordinal + return _maybe_box(self, self._engine.get_value(s, key), series, key) + else: + raise KeyError(key) + except TypeError: + pass + + key = Period(key, self.freq).ordinal + return _maybe_box(self, self._engine.get_value(s, key), series, key) + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + try: + return self._engine.get_loc(key) + except KeyError: + try: + asdt, parsed, reso = parse_time_string(key, self.freq) + key = asdt + except TypeError: + pass + + key = Period(key, self.freq) + try: + return self._engine.get_loc(key.ordinal) + except KeyError: + raise KeyError(key) + + def slice_locs(self, start=None, end=None): + """ + Index.slice_locs, customized to handle partial ISO-8601 string slicing + """ + if isinstance(start, compat.string_types) or isinstance(end, compat.string_types): + try: + if start: + start_loc = self._get_string_slice(start).start + else: + start_loc = 0 + + if end: + end_loc = self._get_string_slice(end).stop + else: + end_loc = len(self) + + return start_loc, end_loc + except KeyError: + pass + + if isinstance(start, datetime) and isinstance(end, datetime): + ordinals = self.values + t1 = Period(start, freq=self.freq) + t2 = Period(end, freq=self.freq) + + left = ordinals.searchsorted(t1.ordinal, side='left') + right = ordinals.searchsorted(t2.ordinal, side='right') + return left, right + + return Int64Index.slice_locs(self, start, end) + + def _get_string_slice(self, key): + if not self.is_monotonic: + raise ValueError('Partial indexing only valid for ' + 'ordered time series') + + key, parsed, reso = parse_time_string(key, self.freq) + + grp = _freq_mod._infer_period_group(reso) + freqn = _freq_mod._period_group(self.freq) + + if reso == 'year': + t1 = Period(year=parsed.year, freq='A') + elif reso == 'month': + t1 = Period(year=parsed.year, month=parsed.month, freq='M') + elif reso == 'quarter': + q = (parsed.month - 1) // 3 + 1 + t1 = Period(year=parsed.year, quarter=q, freq='Q-DEC') + elif reso == 'day' and grp < freqn: + t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, + freq='D') + elif reso == 'hour' and grp < freqn: + t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, + hour=parsed.hour, freq='H') + elif reso == 'minute' and grp < freqn: + t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, + hour=parsed.hour, minute=parsed.minute, freq='T') + elif reso == 'second' and grp < freqn: + t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, + hour=parsed.hour, minute=parsed.minute, second=parsed.second, + freq='S') + else: + raise KeyError(key) + + ordinals = self.values + + t2 = t1.asfreq(self.freq, how='end') + t1 = 
t1.asfreq(self.freq, how='start') + + left = ordinals.searchsorted(t1.ordinal, side='left') + right = ordinals.searchsorted(t2.ordinal, side='right') + return slice(left, right) + + def join(self, other, how='left', level=None, return_indexers=False): + """ + See Index.join + """ + self._assert_can_do_setop(other) + + result = Int64Index.join(self, other, how=how, level=level, + return_indexers=return_indexers) + + if return_indexers: + result, lidx, ridx = result + return self._apply_meta(result), lidx, ridx + return self._apply_meta(result) + + def _assert_can_do_setop(self, other): + if not isinstance(other, PeriodIndex): + raise ValueError('can only call with other PeriodIndex-ed objects') + + if self.freq != other.freq: + raise ValueError('Only like-indexed PeriodIndexes compatible ' + 'for join (for now)') + + def _wrap_union_result(self, other, result): + name = self.name if self.name == other.name else None + result = self._apply_meta(result) + result.name = name + return result + + def _apply_meta(self, rawarr): + if not isinstance(rawarr, PeriodIndex): + rawarr = rawarr.view(PeriodIndex) + rawarr.freq = self.freq + return rawarr + + def __getitem__(self, key): + """Override numpy.ndarray's __getitem__ method to work as desired""" + arr_idx = self.view(np.ndarray) + if np.isscalar(key): + val = arr_idx[key] + return Period(ordinal=val, freq=self.freq) + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + + result = arr_idx[key] + if result.ndim > 1: + # MPL kludge + # values = np.asarray(list(values), dtype=object) + # return values.reshape(result.shape) + + return PeriodIndex(result, name=self.name, freq=self.freq) + + return PeriodIndex(result, name=self.name, freq=self.freq) + + def _format_with_header(self, header, **kwargs): + return header + self._format_native_types(**kwargs) + + def _format_native_types(self, na_rep=u('NaT'), **kwargs): + + values = np.array(list(self), dtype=object) + mask = isnull(self.values) + values[mask] = na_rep + + imask = ~mask + values[imask] = np.array([u('%s') % dt for dt in values[imask]]) + return values.tolist() + + def __array_finalize__(self, obj): + if not self.ndim: # pragma: no cover + return self.item() + + self.freq = getattr(obj, 'freq', None) + self.name = getattr(obj, 'name', None) + self._reset_identity() + + def _format_footer(self): + tagline = 'Length: %d, Freq: %s' + return tagline % (len(self), self.freqstr) + + def take(self, indices, axis=None): + """ + Analogous to ndarray.take + """ + indices = com._ensure_platform_int(indices) + taken = self.values.take(indices, axis=axis) + return self._simple_new(taken, self.name, freq=self.freq) + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + name = self.name + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) + + for obj in to_concat: + if isinstance(obj, Index) and obj.name != name: + name = None + break + + to_concat = self._ensure_compat_concat(to_concat) + + if isinstance(to_concat[0], PeriodIndex): + if len(set([x.freq for x in to_concat])) > 1: + # box + to_concat = [x.asobject for x in to_concat] + else: + cat_values = np.concatenate([x.values for x in to_concat]) + return PeriodIndex(cat_values, freq=self.freq, name=name) + + to_concat = [x.values if isinstance(x, Index) else x + for x in to_concat] + return 
Index(com._concat_compat(to_concat), name=name) + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(np.ndarray.__reduce__(self)) + subclass_state = (self.name, self.freq) + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if len(state) == 2: + nd_state, own_state = state + np.ndarray.__setstate__(self, nd_state) + self.name = own_state[0] + try: # backcompat + self.freq = own_state[1] + except: + pass + else: # pragma: no cover + np.ndarray.__setstate__(self, state) + + +def _get_ordinal_range(start, end, periods, freq): + if com._count_not_none(start, end, periods) < 2: + raise ValueError('Must specify 2 of start, end, periods') + + if start is not None: + start = Period(start, freq) + if end is not None: + end = Period(end, freq) + + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) + + if is_start_per and is_end_per and start.freq != end.freq: + raise ValueError('Start and end must have same freq') + if ((is_start_per and start.ordinal == tslib.iNaT) or + (is_end_per and end.ordinal == tslib.iNaT)): + raise ValueError('Start and end must not be NaT') + + if freq is None: + if is_start_per: + freq = start.freq + elif is_end_per: + freq = end.freq + else: # pragma: no cover + raise ValueError('Could not infer freq from start/end') + + if periods is not None: + if start is None: + data = np.arange(end.ordinal - periods + 1, + end.ordinal + 1, + dtype=np.int64) + else: + data = np.arange(start.ordinal, start.ordinal + periods, + dtype=np.int64) + else: + data = np.arange(start.ordinal, end.ordinal + 1, dtype=np.int64) + + return data, freq + + +def _range_from_fields(year=None, month=None, quarter=None, day=None, + hour=None, minute=None, second=None, freq=None): + if hour is None: + hour = 0 + if minute is None: + minute = 0 + if second is None: + second = 0 + if day is None: + day = 1 + + ordinals = [] + + if quarter is not None: + if freq is None: + freq = 'Q' + base = FreqGroup.FR_QTR + else: + base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') + if base != FreqGroup.FR_QTR: + raise AssertionError("base must equal FR_QTR") + + year, quarter = _make_field_arrays(year, quarter) + for y, q in zip(year, quarter): + y, m = _quarter_to_myear(y, q, freq) + val = tslib.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base) + ordinals.append(val) + else: + base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') + + arrays = _make_field_arrays(year, month, day, hour, minute, second) + for y, mth, d, h, mn, s in zip(*arrays): + ordinals.append(tslib.period_ordinal(y, mth, d, h, mn, s, 0, 0, base)) + + return np.array(ordinals, dtype=np.int64), freq + + +def _make_field_arrays(*fields): + length = None + for x in fields: + if isinstance(x, (list, np.ndarray, ABCSeries)): + if length is not None and len(x) != length: + raise ValueError('Mismatched Period array lengths') + elif length is None: + length = len(x) + + arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) + else np.repeat(x, length) for x in fields] + + return arrays + + +def _ordinal_from_fields(year, month, quarter, day, hour, minute, + second, freq): + base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') + + if quarter is not None: + year, month = _quarter_to_myear(year, quarter, freq) + + return tslib.period_ordinal(year, month, day, 
hour, minute, second, 0, 0, base) + + +def _quarter_to_myear(year, quarter, freq): + if quarter is not None: + if quarter <= 0 or quarter > 4: + raise ValueError('Quarter must be 1 <= q <= 4') + + mnum = _month_numbers[_freq_mod._get_rule_month(freq)] + 1 + month = (mnum + (quarter - 1) * 3) % 12 + 1 + if month > mnum: + year -= 1 + + return year, month + + +def _validate_end_alias(how): + how_dict = {'S': 'S', 'E': 'E', + 'START': 'S', 'FINISH': 'E', + 'BEGIN': 'S', 'END': 'E'} + how = how_dict.get(str(how).upper()) + if how not in set(['S', 'E']): + raise ValueError('How must be one of S or E') + return how + + +def pnow(freq=None): + return Period(datetime.now(), freq=freq) + + +def period_range(start=None, end=None, periods=None, freq='D', name=None): + """ + Return a fixed frequency datetime index, with day (calendar) as the default + frequency + + + Parameters + ---------- + start : + end : + periods : int, default None + Number of periods in the index + freq : str/DateOffset, default 'D' + Frequency alias + name : str, default None + Name for the resulting PeriodIndex + + Returns + ------- + prng : PeriodIndex + """ + return PeriodIndex(start=start, end=end, periods=periods, + freq=freq, name=name) diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py new file mode 100644 index 00000000..6031482f --- /dev/null +++ b/pandas/tseries/plotting.py @@ -0,0 +1,251 @@ +""" +Period formatters and locators adapted from scikits.timeseries by +Pierre GF Gerard-Marchant & Matt Knox +""" + +#!!! TODO: Use the fact that axis can have units to simplify the process +from matplotlib import pylab + +import numpy as np + +from pandas import isnull +from pandas.tseries.period import Period +from pandas.tseries.offsets import DateOffset +import pandas.tseries.frequencies as frequencies +from pandas.tseries.index import DatetimeIndex +import pandas.core.common as com + +from pandas.tseries.converter import (PeriodConverter, TimeSeries_DateLocator, + TimeSeries_DateFormatter) + +from pandas.tools.plotting import _get_all_lines, _get_xlim + +#---------------------------------------------------------------------- +# Plotting functions and monkey patches + + +def tsplot(series, plotf, **kwargs): + """ + Plots a Series on the given Matplotlib axes or the current axes + + Parameters + ---------- + axes : Axes + series : Series + + Notes + _____ + Supports same kwargs as Axes.plot + + """ + # Used inferred freq is possible, need a test case for inferred + if 'ax' in kwargs: + ax = kwargs.pop('ax') + else: + import matplotlib.pyplot as plt + ax = plt.gca() + + freq = _get_freq(ax, series) + # resample against axes freq if necessary + if freq is None: # pragma: no cover + raise ValueError('Cannot use dynamic axis without frequency info') + else: + # Convert DatetimeIndex to PeriodIndex + if isinstance(series.index, DatetimeIndex): + series = series.to_period(freq=freq) + freq, ax_freq, series = _maybe_resample(series, ax, freq, plotf, + kwargs) + + # Set ax with freq info + _decorate_axes(ax, freq, kwargs) + + # mask missing values + args = _maybe_mask(series) + + # how to make sure ax.clear() flows through? 
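    # Illustrative note, not part of the original pandas source: the series and
    # its kwargs are cached on the Axes just below so that _maybe_resample /
    # _upsample_others can later redraw every cached line at a new frequency.
    # A typical call that ends up in this function, assuming matplotlib is
    # available:
    #   >>> import pandas as pd
    #   >>> ts = pd.Series(range(12),
    #   ...                index=pd.period_range('2000-01', periods=12, freq='M'))
    #   >>> ts.plot()   # Series.plot routes period/datetime data through tsplot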
+ if not hasattr(ax, '_plot_data'): + ax._plot_data = [] + ax._plot_data.append((series, kwargs)) + + # styles + style = kwargs.pop('style', None) + if style is not None: + args.append(style) + + lines = plotf(ax, *args, **kwargs) + + # set date formatter, locators and rescale limits + format_dateaxis(ax, ax.freq) + left, right = _get_xlim(_get_all_lines(ax)) + ax.set_xlim(left, right) + + # x and y coord info + ax.format_coord = lambda t, y: ("t = {0} " + "y = {1:8f}".format(Period(ordinal=int(t), + freq=ax.freq), + y)) + + return lines + + +def _maybe_resample(series, ax, freq, plotf, kwargs): + ax_freq = _get_ax_freq(ax) + if ax_freq is not None and freq != ax_freq: + if frequencies.is_superperiod(freq, ax_freq): # upsample input + series = series.copy() + series.index = series.index.asfreq(ax_freq, how='s') + freq = ax_freq + elif _is_sup(freq, ax_freq): # one is weekly + how = kwargs.pop('how', 'last') + series = series.resample('D', how=how).dropna() + series = series.resample(ax_freq, how=how).dropna() + freq = ax_freq + elif frequencies.is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq): + _upsample_others(ax, freq, plotf, kwargs) + ax_freq = freq + else: # pragma: no cover + raise ValueError('Incompatible frequency conversion') + return freq, ax_freq, series + + +def _get_ax_freq(ax): + ax_freq = getattr(ax, 'freq', None) + if ax_freq is None: + if hasattr(ax, 'left_ax'): + ax_freq = getattr(ax.left_ax, 'freq', None) + elif hasattr(ax, 'right_ax'): + ax_freq = getattr(ax.right_ax, 'freq', None) + return ax_freq + + +def _is_sub(f1, f2): + return ((f1.startswith('W') and frequencies.is_subperiod('D', f2)) or + (f2.startswith('W') and frequencies.is_subperiod(f1, 'D'))) + + +def _is_sup(f1, f2): + return ((f1.startswith('W') and frequencies.is_superperiod('D', f2)) or + (f2.startswith('W') and frequencies.is_superperiod(f1, 'D'))) + + +def _upsample_others(ax, freq, plotf, kwargs): + legend = ax.get_legend() + lines, labels = _replot_ax(ax, freq, plotf, kwargs) + + other_ax = None + if hasattr(ax, 'left_ax'): + other_ax = ax.left_ax + if hasattr(ax, 'right_ax'): + other_ax = ax.right_ax + + if other_ax is not None: + rlines, rlabels = _replot_ax(other_ax, freq, plotf, kwargs) + lines.extend(rlines) + labels.extend(rlabels) + + if (legend is not None and kwargs.get('legend', True) and + len(lines) > 0): + title = legend.get_title().get_text() + if title == 'None': + title = None + ax.legend(lines, labels, loc='best', title=title) + + +def _replot_ax(ax, freq, plotf, kwargs): + data = getattr(ax, '_plot_data', None) + ax._plot_data = [] + ax.clear() + _decorate_axes(ax, freq, kwargs) + + lines = [] + labels = [] + if data is not None: + for series, kwds in data: + series = series.copy() + idx = series.index.asfreq(freq, how='S') + series.index = idx + ax._plot_data.append(series) + args = _maybe_mask(series) + lines.append(plotf(ax, *args, **kwds)[0]) + labels.append(com.pprint_thing(series.name)) + + return lines, labels + + +def _decorate_axes(ax, freq, kwargs): + ax.freq = freq + xaxis = ax.get_xaxis() + xaxis.freq = freq + if not hasattr(ax, 'legendlabels'): + ax.legendlabels = [kwargs.get('label', None)] + else: + ax.legendlabels.append(kwargs.get('label', None)) + ax.view_interval = None + ax.date_axis_info = None + + +def _maybe_mask(series): + mask = isnull(series) + if mask.any(): + masked_array = np.ma.array(series.values) + masked_array = np.ma.masked_where(mask, masked_array) + args = [series.index, masked_array] + else: + args = [series.index, series.values] + return 
args + + +def _get_freq(ax, series): + # get frequency from data + freq = getattr(series.index, 'freq', None) + if freq is None: + freq = getattr(series.index, 'inferred_freq', None) + + ax_freq = getattr(ax, 'freq', None) + + # use axes freq if no data freq + if freq is None: + freq = ax_freq + + # get the period frequency + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = frequencies.get_base_alias(freq) + + freq = frequencies.get_period_alias(freq) + + return freq + + +# Patch methods for subplot. Only format_dateaxis is currently used. +# Do we need the rest for convenience? + + +def format_dateaxis(subplot, freq): + """ + Pretty-formats the date axis (x-axis). + + Major and minor ticks are automatically set for the frequency of the + current underlying series. As the dynamic mode is activated by + default, changing the limits of the x axis will intelligently change + the positions of the ticks. + """ + majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, + minor_locator=False, + plot_obj=subplot) + minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, + minor_locator=True, + plot_obj=subplot) + subplot.xaxis.set_major_locator(majlocator) + subplot.xaxis.set_minor_locator(minlocator) + + majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + minor_locator=False, + plot_obj=subplot) + minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + minor_locator=True, + plot_obj=subplot) + subplot.xaxis.set_major_formatter(majformatter) + subplot.xaxis.set_minor_formatter(minformatter) + pylab.draw_if_interactive() diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py new file mode 100644 index 00000000..01aff164 --- /dev/null +++ b/pandas/tseries/resample.py @@ -0,0 +1,448 @@ +from datetime import timedelta + +import numpy as np + +from pandas.core.groupby import BinGrouper, Grouper +from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod +from pandas.tseries.index import DatetimeIndex, date_range +from pandas.tseries.offsets import DateOffset, Tick, _delta_to_nanoseconds +from pandas.tseries.period import PeriodIndex, period_range +import pandas.tseries.tools as tools +import pandas.core.common as com +import pandas.compat as compat + +from pandas.lib import Timestamp +import pandas.lib as lib +import pandas.tslib as tslib + + +_DEFAULT_METHOD = 'mean' + + +class TimeGrouper(Grouper): + """ + Custom groupby class for time-interval grouping + + Parameters + ---------- + freq : pandas date offset or offset alias for identifying bin edges + closed : closed end of interval; left or right + label : interval boundary to use for labeling; left or right + nperiods : optional, integer + convention : {'start', 'end', 'e', 's'} + If axis is PeriodIndex + + Notes + ----- + Use begin, end, nperiods to generate intervals that cannot be derived + directly from the associated object + """ + def __init__(self, freq='Min', closed=None, label=None, how='mean', + nperiods=None, axis=0, + fill_method=None, limit=None, loffset=None, kind=None, + convention=None, base=0, **kwargs): + freq = to_offset(freq) + + end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W']) + rule = freq.rule_code + if (rule in end_types or + ('-' in rule and rule[:rule.find('-')] in end_types)): + if closed is None: + closed = 'right' + if label is None: + label = 'right' + else: + if closed is None: + closed = 'left' + if label is None: + label = 'left' + + self.closed = closed + self.label = label + self.nperiods = nperiods + self.kind = 
kind + + self.convention = convention or 'E' + self.convention = self.convention.lower() + + self.loffset = loffset + self.how = how + self.fill_method = fill_method + self.limit = limit + self.base = base + + # always sort time groupers + kwargs['sort'] = True + + super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs) + + def resample(self, obj): + self._set_grouper(obj, sort=True) + ax = self.grouper + + if isinstance(ax, DatetimeIndex): + rs = self._resample_timestamps() + elif isinstance(ax, PeriodIndex): + offset = to_offset(self.freq) + if offset.n > 1: + if self.kind == 'period': # pragma: no cover + print('Warning: multiple of frequency -> timestamps') + # Cannot have multiple of periods, convert to timestamp + self.kind = 'timestamp' + + if self.kind is None or self.kind == 'period': + rs = self._resample_periods() + else: + obj = self.obj.to_timestamp(how=self.convention) + self._set_grouper(obj) + rs = self._resample_timestamps() + elif len(ax) == 0: + return self.obj + else: # pragma: no cover + raise TypeError('Only valid with DatetimeIndex or PeriodIndex') + + rs_axis = rs._get_axis(self.axis) + rs_axis.name = ax.name + return rs + + def _get_grouper(self, obj): + self._set_grouper(obj) + return self._get_binner_for_resample() + + def _get_binner_for_resample(self): + # create the BinGrouper + # assume that self.set_grouper(obj) has already been called + + ax = self.ax + if self.kind is None or self.kind == 'timestamp': + self.binner, bins, binlabels = self._get_time_bins(ax) + else: + self.binner, bins, binlabels = self._get_time_period_bins(ax) + + self.grouper = BinGrouper(bins, binlabels) + return self.binner, self.grouper, self.obj + + def _get_binner_for_grouping(self, obj): + # return an ordering of the transformed group labels, + # suitable for multi-grouping, e.g the labels for + # the resampled intervals + ax = self._set_grouper(obj) + self._get_binner_for_resample() + + # create the grouper + binner = self.binner + l = [] + for key, group in self.grouper.get_iterator(ax): + l.extend([key]*len(group)) + grouper = binner.__class__(l,freq=binner.freq,name=binner.name) + + # since we may have had to sort + # may need to reorder groups here + if self.indexer is not None: + indexer = self.indexer.argsort(kind='quicksort') + grouper = grouper.take(indexer) + return grouper + + def _get_time_bins(self, ax): + if not isinstance(ax, DatetimeIndex): + raise TypeError('axis must be a DatetimeIndex, but got ' + 'an instance of %r' % type(ax).__name__) + + if len(ax) == 0: + binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + first, last = ax.min(), ax.max() + first, last = _get_range_edges(first, last, self.freq, closed=self.closed, + base=self.base) + tz = ax.tz + binner = labels = DatetimeIndex(freq=self.freq, + start=first.replace(tzinfo=None), + end=last.replace(tzinfo=None), + tz=tz, + name=ax.name) + + # a little hack + trimmed = False + if (len(binner) > 2 and binner[-2] == last and + self.closed == 'right'): + + binner = binner[:-1] + trimmed = True + + ax_values = ax.asi8 + binner, bin_edges = self._adjust_bin_edges(binner, ax_values) + + # general version, knowing nothing about relative frequencies + bins = lib.generate_bins_dt64(ax_values, bin_edges, self.closed, hasnans=ax.hasnans) + + if self.closed == 'right': + labels = binner + if self.label == 'right': + labels = labels[1:] + elif not trimmed: + labels = labels[:-1] + else: + if self.label == 'right': + labels = labels[1:] + elif not trimmed: + 
labels = labels[:-1] + + if ax.hasnans: + binner = binner.insert(0, tslib.NaT) + labels = labels.insert(0, tslib.NaT) + + # if we end up with more labels than bins + # adjust the labels + # GH4076 + if len(bins) < len(labels): + labels = labels[:len(bins)] + + return binner, bins, labels + + def _adjust_bin_edges(self, binner, ax_values): + # Some hacks for > daily data, see #1471, #1458, #1483 + + bin_edges = binner.asi8 + + if self.freq != 'D' and is_superperiod(self.freq, 'D'): + day_nanos = _delta_to_nanoseconds(timedelta(1)) + if self.closed == 'right': + bin_edges = bin_edges + day_nanos - 1 + + # intraday values on last day + if bin_edges[-2] > ax_values.max(): + bin_edges = bin_edges[:-1] + binner = binner[:-1] + + return binner, bin_edges + + def _get_time_period_bins(self, ax): + if not isinstance(ax, DatetimeIndex): + raise TypeError('axis must be a DatetimeIndex, but got ' + 'an instance of %r' % type(ax).__name__) + + if not len(ax): + binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + labels = binner = PeriodIndex(start=ax[0], + end=ax[-1], + freq=self.freq, + name=ax.name) + + end_stamps = (labels + 1).asfreq(self.freq, 's').to_timestamp() + if ax.tzinfo: + end_stamps = end_stamps.tz_localize(ax.tzinfo) + bins = ax.searchsorted(end_stamps, side='left') + + return binner, bins, labels + + @property + def _agg_method(self): + return self.how if self.how else _DEFAULT_METHOD + + def _resample_timestamps(self): + # assumes set_grouper(obj) already called + axlabels = self.ax + + self._get_binner_for_resample() + grouper = self.grouper + binner = self.binner + obj = self.obj + + # Determine if we're downsampling + if axlabels.freq is not None or axlabels.inferred_freq is not None: + + if len(grouper.binlabels) < len(axlabels) or self.how is not None: + # downsample + grouped = obj.groupby(grouper, axis=self.axis) + result = grouped.aggregate(self._agg_method) + # GH2073 + if self.fill_method is not None: + result = result.fillna(method=self.fill_method, + limit=self.limit) + + else: + # upsampling shortcut + if self.axis: + raise AssertionError('axis must be 0') + + if self.closed == 'right': + res_index = binner[1:] + else: + res_index = binner[:-1] + + # if we have the same frequency as our axis, then we are equal sampling + # even if how is None + if self.fill_method is None and self.limit is None and to_offset( + axlabels.inferred_freq) == self.freq: + result = obj.copy() + result.index = res_index + else: + result = obj.reindex(res_index, method=self.fill_method, + limit=self.limit) + else: + # Irregular data, have to use groupby + grouped = obj.groupby(grouper, axis=self.axis) + result = grouped.aggregate(self._agg_method) + + if self.fill_method is not None: + result = result.fillna(method=self.fill_method, + limit=self.limit) + + loffset = self.loffset + if isinstance(loffset, compat.string_types): + loffset = to_offset(self.loffset) + + if isinstance(loffset, (DateOffset, timedelta)): + if (isinstance(result.index, DatetimeIndex) + and len(result.index) > 0): + + result.index = result.index + loffset + + return result + + def _resample_periods(self): + # assumes set_grouper(obj) already called + axlabels = self.ax + obj = self.obj + + if len(axlabels) == 0: + new_index = PeriodIndex(data=[], freq=self.freq) + return obj.reindex(new_index) + else: + start = axlabels[0].asfreq(self.freq, how=self.convention) + end = axlabels[-1].asfreq(self.freq, how='end') + + new_index = period_range(start, end, freq=self.freq) + + # 
Start vs. end of period + memb = axlabels.asfreq(self.freq, how=self.convention) + + if is_subperiod(axlabels.freq, self.freq) or self.how is not None: + # Downsampling + rng = np.arange(memb.values[0], memb.values[-1] + 1) + bins = memb.searchsorted(rng, side='right') + grouper = BinGrouper(bins, new_index) + + grouped = obj.groupby(grouper, axis=self.axis) + return grouped.aggregate(self._agg_method) + elif is_superperiod(axlabels.freq, self.freq): + # Get the fill indexer + indexer = memb.get_indexer(new_index, method=self.fill_method, + limit=self.limit) + return _take_new_index(obj, indexer, new_index, axis=self.axis) + + else: + raise ValueError('Frequency %s cannot be resampled to %s' + % (axlabels.freq, self.freq)) + + +def _take_new_index(obj, indexer, new_index, axis=0): + from pandas.core.api import Series, DataFrame + + if isinstance(obj, Series): + new_values = com.take_1d(obj.values, indexer) + return Series(new_values, index=new_index, name=obj.name) + elif isinstance(obj, DataFrame): + if axis == 1: + raise NotImplementedError + return DataFrame(obj._data.reindex_indexer( + new_axis=new_index, indexer=indexer, axis=1)) + else: + raise NotImplementedError + + +def _get_range_edges(first, last, offset, closed='left', base=0): + if isinstance(offset, compat.string_types): + offset = to_offset(offset) + + if isinstance(offset, Tick): + day_nanos = _delta_to_nanoseconds(timedelta(1)) + # #1165 + if (day_nanos % offset.nanos) == 0: + return _adjust_dates_anchored(first, last, offset, + closed=closed, base=base) + + if not isinstance(offset, Tick): # and first.time() != last.time(): + # hack! + first = tools.normalize_date(first) + last = tools.normalize_date(last) + + if closed == 'left': + first = Timestamp(offset.rollback(first)) + else: + first = Timestamp(first - offset) + + last = Timestamp(last + offset) + + return first, last + + +def _adjust_dates_anchored(first, last, offset, closed='right', base=0): + from pandas.tseries.tools import normalize_date + + start_day_nanos = Timestamp(normalize_date(first)).value + last_day_nanos = Timestamp(normalize_date(last)).value + + base_nanos = (base % offset.n) * offset.nanos // offset.n + start_day_nanos += base_nanos + last_day_nanos += base_nanos + + foffset = (first.value - start_day_nanos) % offset.nanos + loffset = (last.value - last_day_nanos) % offset.nanos + + if closed == 'right': + if foffset > 0: + # roll back + fresult = first.value - foffset + else: + fresult = first.value - offset.nanos + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + # already the end of the road + lresult = last.value + else: # closed == 'left' + if foffset > 0: + fresult = first.value - foffset + else: + # start of the road + fresult = first.value + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + lresult = last.value + offset.nanos + + return (Timestamp(fresult, tz=first.tz), + Timestamp(lresult, tz=last.tz)) + + +def asfreq(obj, freq, method=None, how=None, normalize=False): + """ + Utility frequency conversion method for Series/DataFrame + """ + if isinstance(obj.index, PeriodIndex): + if method is not None: + raise NotImplementedError + + if how is None: + how = 'E' + + new_index = obj.index.asfreq(freq, how=how) + new_obj = obj.copy() + new_obj.index = new_index + return new_obj + else: + if len(obj.index) == 0: + return obj.copy() + dti = date_range(obj.index[0], obj.index[-1], freq=freq) + rs = obj.reindex(dti, method=method) + if normalize: + 
rs.index = rs.index.normalize() + return rs diff --git a/pandas/tseries/tests/__init__.py b/pandas/tseries/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tseries/tests/data/daterange_073.pickle b/pandas/tseries/tests/data/daterange_073.pickle new file mode 100644 index 0000000000000000000000000000000000000000..0214a023e6338dce54e6daf8b3d94a7275baca66 GIT binary patch literal 650 zcmZY7OH0E*5C`z4PrH3+U)q<}*CAed_9jSAE<`BoQDl>BZ848-vOy{q^blLWo!>~) zb{G%N!ZQ3A=JKESwB<$ad@;2AKn&f;Q8OL{d_f)qVfkLDg2+-tYSx^4HV=1WHdi9x z-jg7sq#JKLnWm|jY36DyGdk61Gu|yGwpz>uky)0$zosdwB?CE~W|;P77{=XCQrnN- zDD&$<=5=ecUCmrUu#p8u3g22LwXJw8_oh3^q7*@LCj=6X`nPgnkX%h7Rn(=8|4V3gVF}+qI5udC|!^~N>3;w{`{A; z@_i>Hx1;1JWdG_z9xvsI&WfHNxZIh&3OQJ_yg!+QLdny=^fnRN!cm;avn2QACCQ(& Y?DLBq%8RAEoDS9@(>$t0rm-@Izk(m6nE(I) literal 0 HcmV?d00001 diff --git a/pandas/tseries/tests/data/frame.pickle b/pandas/tseries/tests/data/frame.pickle new file mode 100644 index 0000000000000000000000000000000000000000..b3b100fb43022faf7bd0b949238988afc7de53bf GIT binary patch literal 1182 zcmZo*N-jvuOGzx&OU^G!)k`Z%%uVHTNi0cp1G5SlH5&_2m1O3Xq!#5R<`i=|<>V)4 z`zGckrl%Hh6*2==vcwj$Y9!~C<`z^!%*!p!DalMMDoU*6iZ4n{&d)0@DJo4a;VNW9 zu{JX=CAEUfGq1$V#1qUcWcOxh4P{Jf4=Uu)@MiR8ZH1W1l~Ph!kjhoa8OoGt;mzR9 z2voqO;msV%XyfPS=k*^5z=StLNm6I11_Kl@LTM%_%?zbkpmd2}YgY7m%J$PITE56D?utru>CGx=D$H7fJyWf^==6j7BJDUVc$-VoqjNYN2dL zC|iD7T5)Pgp&TM4K*5ocnp2XJQig0taVTS+H)Cm% zUwcw&Y@sqRmcZ$Y3z%rZ>8el#9w(~cq~guh28xw5SgfewNFN;`6M*TW;fH?PbI(qD zpGB$Jd180$pY_~~+OhDdefBm(`J$sw?Iq%0oobwM+g|&L=xlec2ljnYDGD!n-q>g8 zUwoA0f6#t{RL7q$(nsuHeSXyJ?tR(*#=mDLA1WNNXXv=rfBD92`>Bn3vYT!_x3`GD za8a>$i~TLpIhKblp4tZ;l`Kf#cV$0_Toy_unf!A`-q_VZL0AG-eV zsr|Adk2*u+Yxb8^`t(}$F4?E}MdinQzi+=`r_oc77x(PTws(Kna^b%HBhDjt=Q_T$ z&pcMAp>pVf{gs02oSzq6vX}X#bi8QMeftj!9HuX9TW+tmz5In8^E>;!XC?%#<#=n~ z!1O`X*y@NqU;2%8|888h*FUt3f9K|R_OmDVNaSvOY(Fu@LfxqNfxVtm!`}<-H-Wh& PF}6@WgCns$DM=3iVMVf9 literal 0 HcmV?d00001 diff --git a/pandas/tseries/tests/data/series.pickle b/pandas/tseries/tests/data/series.pickle new file mode 100644 index 0000000000000000000000000000000000000000..307a4ac26517384a8267b23a8891e4cf919a20d8 GIT binary patch literal 646 zcmZo*O3o|IEvVE>&M!*U%Pq|*$xJLNO049HFG@|$&nqq|DorloDr8J9NX$z~EQTm6 zPA$qzE#?Zz%uNl3FbkQy8CpXbliGs{nKir_y}4Q;#&V^UR2HOi6|#gfrCE40cryYO zuxfZShcepu`T2SM2LdqR%}|om85;0gre(H&?L+&pe^2_|L|)soNw4(YaOa)<>%zvK zYfcaC6He?G)>?kezSjQ7#_DSi>^Cm|Q~hw#b9(`k2lLpcEV8#d5t>_a^o4!SJ<&Jf z{KxD|GEg0!l30>jl$e*E%H;xN1%X+GY;dQuL!6!gbge(kwH#pA)}Xr99_ZTGLQaij zkbxz@VBmr?3b{hL*sn4&Gk`&BP$72)M1%z{!UGjyg^Tb)McCjXd{7Z~xClQ~gbOYr z02SeeiwHtRc;F&JP!V3Zh%i)y4=y5-TH@E*h7!YI@8sv_6mvPb024!@sAglKSZ$%W zMkr@qeo<~>PG(hVp+rY0TYg$vacW7SBqAh0!I6@hQ!OOZ59U9nA?q0TAECTVJ&E1=s@p_O5Y99O<1%Q zB;BptG-d=J1D0mc)|$GRnB*5MVgwiVVm1}zl5>9Zji(6111*Nz)~! 
zRiE7aziNM|S|k1TkABe8-^TPSq(1=Pb|Z~nI#^obZBbmI (3,) and sys.platform == 'win32': + raise nose.SkipTest("not used on python 3/win32") + + +def eq_gen_range(kwargs, expected): + rng = generate_range(**kwargs) + assert(np.array_equal(list(rng), expected)) + + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + + +class TestGenRangeGeneration(tm.TestCase): + def test_generate(self): + rng1 = list(generate_range(START, END, offset=datetools.bday)) + rng2 = list(generate_range(START, END, time_rule='B')) + self.assert_numpy_array_equal(rng1, rng2) + + def test_generate_cday(self): + tm._skip_if_no_cday() + rng1 = list(generate_range(START, END, offset=datetools.cday)) + rng2 = list(generate_range(START, END, time_rule='C')) + self.assert_numpy_array_equal(rng1, rng2) + + def test_1(self): + eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), + [datetime(2009, 3, 25), datetime(2009, 3, 26)]) + + def test_2(self): + eq_gen_range(dict(start=datetime(2008, 1, 1), + end=datetime(2008, 1, 3)), + [datetime(2008, 1, 1), + datetime(2008, 1, 2), + datetime(2008, 1, 3)]) + + def test_3(self): + eq_gen_range(dict(start=datetime(2008, 1, 5), + end=datetime(2008, 1, 6)), + []) + + +class TestDateRange(tm.TestCase): + + def setUp(self): + self.rng = bdate_range(START, END) + + def test_constructor(self): + rng = bdate_range(START, END, freq=datetools.bday) + rng = bdate_range(START, periods=20, freq=datetools.bday) + rng = bdate_range(end=START, periods=20, freq=datetools.bday) + self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'B') + self.assertRaises(ValueError, bdate_range, '2011-1-1', '2012-1-1', 'B') + + def test_naive_aware_conflicts(self): + naive = bdate_range(START, END, freq=datetools.bday, tz=None) + aware = bdate_range(START, END, freq=datetools.bday, tz="Asia/Hong_Kong") + assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", naive.join, aware) + assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", aware.join, naive) + + def test_cached_range(self): + rng = DatetimeIndex._cached_range(START, END, + offset=datetools.bday) + rng = DatetimeIndex._cached_range(START, periods=20, + offset=datetools.bday) + rng = DatetimeIndex._cached_range(end=START, periods=20, + offset=datetools.bday) + + assertRaisesRegexp(TypeError, "offset", DatetimeIndex._cached_range, START, END) + + assertRaisesRegexp(TypeError, "specify period", DatetimeIndex._cached_range, START, + offset=datetools.bday) + + assertRaisesRegexp(TypeError, "specify period", DatetimeIndex._cached_range, end=END, + offset=datetools.bday) + + assertRaisesRegexp(TypeError, "start or end", DatetimeIndex._cached_range, periods=20, + offset=datetools.bday) + + def test_cached_range_bug(self): + rng = date_range('2010-09-01 05:00:00', periods=50, + freq=datetools.DateOffset(hours=6)) + self.assertEqual(len(rng), 50) + self.assertEqual(rng[0], datetime(2010, 9, 1, 5)) + + def test_timezone_comparaison_bug(self): + start = Timestamp('20130220 10:00', tz='US/Eastern') + try: + date_range(start, periods=2, tz='US/Eastern') + except AssertionError: + self.fail() + + def test_timezone_comparaison_assert(self): + start = Timestamp('20130220 10:00', tz='US/Eastern') + self.assertRaises(AssertionError, date_range, start, periods=2, tz='Europe/Berlin') + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + self.assertTrue(comp[11]) + self.assertFalse(comp[9]) + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assertTrue(cp.equals(self.rng)) + + def test_repr(self): + # only really care that it works 
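        # Illustrative sketch, not part of the original test suite: the ranges
        # exercised in this class come from the public bdate_range / date_range
        # constructors, e.g.
        #   >>> from pandas import bdate_range, date_range
        #   >>> bdate_range('2009-01-01', '2009-01-10')        # business days only
        #   >>> date_range('2009-01-01', periods=5, freq='D')  # calendar days
        # Note that the third positional argument of both constructors is
        # periods, which is presumably why the assertRaises(ValueError, ...)
        # checks in test_constructor above pass a frequency string positionally.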
+ repr(self.rng) + + def test_getitem(self): + smaller = self.rng[:5] + self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) + self.assertEqual(smaller.offset, self.rng.offset) + + sliced = self.rng[::5] + self.assertEqual(sliced.offset, datetools.bday * 5) + + fancy_indexed = self.rng[[4, 3, 2, 1, 0]] + self.assertEqual(len(fancy_indexed), 5) + tm.assert_isinstance(fancy_indexed, DatetimeIndex) + self.assertIsNone(fancy_indexed.freq) + + # 32-bit vs. 64-bit platforms + self.assertEqual(self.rng[4], self.rng[np.int_(4)]) + + def test_getitem_matplotlib_hackaround(self): + values = self.rng[:, None] + expected = self.rng.values[:, None] + self.assert_numpy_array_equal(values, expected) + + def test_shift(self): + shifted = self.rng.shift(5) + self.assertEqual(shifted[0], self.rng[5]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(-5) + self.assertEqual(shifted[5], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(0) + self.assertEqual(shifted[0], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + rng = date_range(START, END, freq=datetools.bmonthEnd) + shifted = rng.shift(1, freq=datetools.bday) + self.assertEqual(shifted[0], rng[0] + datetools.bday) + + def test_pickle_unpickle(self): + pickled = pickle.dumps(self.rng) + unpickled = pickle.loads(pickled) + + self.assertIsNotNone(unpickled.offset) + + def test_union(self): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assert_isinstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right) + tm.assert_isinstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assert_isinstance(the_union, DatetimeIndex) + + # order does not matter + self.assert_numpy_array_equal(right.union(left), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=datetools.bmonthEnd) + + the_union = self.rng.union(rng) + tm.assert_isinstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=datetools.bmonthEnd) + + the_join = self.rng.join(rng, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + def test_union_not_cacheable(self): + rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_union = rng1.union(rng2) + self.assertTrue(the_union.equals(rng)) + + rng1 = rng[10:] + rng2 = rng[15:35] + the_union = rng1.union(rng2) + expected = rng[10:] + self.assertTrue(the_union.equals(expected)) + + def test_intersection(self): + rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_int = rng1.intersection(rng2) + 
expected = rng[10:25] + self.assertTrue(the_int.equals(expected)) + tm.assert_isinstance(the_int, DatetimeIndex) + self.assertEqual(the_int.offset, rng.offset) + + the_int = rng1.intersection(rng2.view(DatetimeIndex)) + self.assertTrue(the_int.equals(expected)) + + # non-overlapping + the_int = rng[:10].intersection(rng[10:]) + expected = DatetimeIndex([]) + self.assertTrue(the_int.equals(expected)) + + def test_intersection_bug(self): + # GH #771 + a = bdate_range('11/30/2011', '12/31/2011') + b = bdate_range('12/10/2011', '12/20/2011') + result = a.intersection(b) + self.assertTrue(result.equals(b)) + + def test_summary(self): + self.rng.summary() + self.rng[2:2].summary() + + def test_summary_pytz(self): + tm._skip_if_no_pytz() + import pytz + bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + + def test_summary_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + bdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() + + def test_misc(self): + end = datetime(2009, 5, 13) + dr = bdate_range(end=end, periods=20) + firstDate = end - 19 * datetools.bday + + assert len(dr) == 20 + assert dr[0] == firstDate + assert dr[-1] == end + + def test_date_parse_failure(self): + badly_formed_date = '2007/100/1' + + self.assertRaises(ValueError, Timestamp, badly_formed_date) + + self.assertRaises(ValueError, bdate_range, start=badly_formed_date, + periods=10) + self.assertRaises(ValueError, bdate_range, end=badly_formed_date, + periods=10) + self.assertRaises(ValueError, bdate_range, badly_formed_date, + badly_formed_date) + + def test_equals(self): + self.assertFalse(self.rng.equals(list(self.rng))) + + def test_identical(self): + t1 = self.rng.copy() + t2 = self.rng.copy() + self.assertTrue(t1.identical(t2)) + + # name + t1 = t1.rename('foo') + self.assertTrue(t1.equals(t2)) + self.assertFalse(t1.identical(t2)) + t2 = t2.rename('foo') + self.assertTrue(t1.identical(t2)) + + # freq + t2v = Index(t2.values) + self.assertTrue(t1.equals(t2v)) + self.assertFalse(t1.identical(t2v)) + + def test_daterange_bug_456(self): + # GH #456 + rng1 = bdate_range('12/5/2011', '12/5/2011') + rng2 = bdate_range('12/2/2011', '12/5/2011') + rng2.offset = datetools.BDay() + + result = rng1.union(rng2) + tm.assert_isinstance(result, DatetimeIndex) + + def test_error_with_zero_monthends(self): + self.assertRaises(ValueError, date_range, '1/1/2000', '1/1/2001', + freq=datetools.MonthEnd(0)) + + def test_range_bug(self): + # GH #770 + offset = datetools.DateOffset(months=3) + result = date_range("2011-1-1", "2012-1-31", freq=offset) + + start = datetime(2011, 1, 1) + exp_values = [start + i * offset for i in range(5)] + self.assert_numpy_array_equal(result, DatetimeIndex(exp_values)) + + def test_range_tz_pytz(self): + # GH 2906 + tm._skip_if_no_pytz() + from pytz import timezone as tz + + start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) + end = datetime(2011, 1, 3, tzinfo=tz('US/Eastern')) + + dr = date_range(start=start, periods=3) + self.assertEqual(dr.tz, tz('US/Eastern')) + self.assertEqual(dr[0], start) + self.assertEqual(dr[2], end) + + dr = date_range(end=end, periods=3) + self.assertEqual(dr.tz, tz('US/Eastern')) + self.assertEqual(dr[0], start) + self.assertEqual(dr[2], end) + + dr = date_range(start=start, end=end) + self.assertEqual(dr.tz, tz('US/Eastern')) + self.assertEqual(dr[0], start) + self.assertEqual(dr[2], end) + + def test_range_tz_dateutil(self): + # GH 2906 + tm._skip_if_no_dateutil() + # Use maybe_get_tz to fix filename in tz under dateutil. 
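        # Illustrative note, not part of the original test suite: instead of
        # building tz-aware endpoints by hand as below, a tz-aware range can
        # also be requested directly via the tz keyword (assuming pytz is
        # installed for string zone names):
        #   >>> from pandas import date_range
        #   >>> date_range('2011-01-01', periods=3, tz='US/Eastern')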
+ from pandas.tslib import maybe_get_tz + tz = lambda x: maybe_get_tz('dateutil/' + x) + + start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) + end = datetime(2011, 1, 3, tzinfo=tz('US/Eastern')) + + dr = date_range(start=start, periods=3) + self.assert_(dr.tz == tz('US/Eastern')) + self.assert_(dr[0] == start) + self.assert_(dr[2] == end) + + dr = date_range(end=end, periods=3) + self.assert_(dr.tz == tz('US/Eastern')) + self.assert_(dr[0] == start) + self.assert_(dr[2] == end) + + dr = date_range(start=start, end=end) + self.assert_(dr.tz == tz('US/Eastern')) + self.assert_(dr[0] == start) + self.assert_(dr[2] == end) + + def test_month_range_union_tz_pytz(self): + tm._skip_if_no_pytz() + from pytz import timezone + tz = timezone('US/Eastern') + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=datetools.monthEnd) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=datetools.monthEnd) + + early_dr.union(late_dr) + + def test_month_range_union_tz_dateutil(self): + _skip_if_windows_python_3() + tm._skip_if_no_dateutil() + from dateutil.tz import gettz as timezone + tz = timezone('US/Eastern') + + early_start = datetime(2011, 1, 1) + early_end = datetime(2011, 3, 1) + + late_start = datetime(2011, 3, 1) + late_end = datetime(2011, 5, 1) + + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=datetools.monthEnd) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=datetools.monthEnd) + + early_dr.union(late_dr) + + def test_range_closed(self): + begin = datetime(2011, 1, 1) + end = datetime(2014, 1, 1) + + for freq in ["3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq) + left = date_range(begin, end, closed="left", freq=freq) + right = date_range(begin, end, closed="right", freq=freq) + + expected_left = closed[:-1] + expected_right = closed[1:] + + self.assertTrue(expected_left.equals(left)) + self.assertTrue(expected_right.equals(right)) + + +class TestCustomDateRange(tm.TestCase): + + def setUp(self): + tm._skip_if_no_cday() + self.rng = cdate_range(START, END) + + def test_constructor(self): + rng = cdate_range(START, END, freq=datetools.cday) + rng = cdate_range(START, periods=20, freq=datetools.cday) + rng = cdate_range(end=START, periods=20, freq=datetools.cday) + self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'C') + self.assertRaises(ValueError, cdate_range, '2011-1-1', '2012-1-1', 'C') + + def test_cached_range(self): + rng = DatetimeIndex._cached_range(START, END, + offset=datetools.cday) + rng = DatetimeIndex._cached_range(START, periods=20, + offset=datetools.cday) + rng = DatetimeIndex._cached_range(end=START, periods=20, + offset=datetools.cday) + + self.assertRaises(Exception, DatetimeIndex._cached_range, START, END) + + self.assertRaises(Exception, DatetimeIndex._cached_range, START, + freq=datetools.cday) + + self.assertRaises(Exception, DatetimeIndex._cached_range, end=END, + freq=datetools.cday) + + self.assertRaises(Exception, DatetimeIndex._cached_range, periods=20, + freq=datetools.cday) + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + self.assertTrue(comp[11]) + self.assertFalse(comp[9]) + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assertTrue(cp.equals(self.rng)) + + def test_repr(self): + # only really care that it works + repr(self.rng) + + def 
test_getitem(self): + smaller = self.rng[:5] + self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) + self.assertEqual(smaller.offset, self.rng.offset) + + sliced = self.rng[::5] + self.assertEqual(sliced.offset, datetools.cday * 5) + + fancy_indexed = self.rng[[4, 3, 2, 1, 0]] + self.assertEqual(len(fancy_indexed), 5) + tm.assert_isinstance(fancy_indexed, DatetimeIndex) + self.assertIsNone(fancy_indexed.freq) + + # 32-bit vs. 64-bit platforms + self.assertEqual(self.rng[4], self.rng[np.int_(4)]) + + def test_getitem_matplotlib_hackaround(self): + values = self.rng[:, None] + expected = self.rng.values[:, None] + self.assert_numpy_array_equal(values, expected) + + def test_shift(self): + shifted = self.rng.shift(5) + self.assertEqual(shifted[0], self.rng[5]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(-5) + self.assertEqual(shifted[5], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(0) + self.assertEqual(shifted[0], self.rng[0]) + self.assertEqual(shifted.offset, self.rng.offset) + + rng = date_range(START, END, freq=datetools.bmonthEnd) + shifted = rng.shift(1, freq=datetools.cday) + self.assertEqual(shifted[0], rng[0] + datetools.cday) + + def test_pickle_unpickle(self): + pickled = pickle.dumps(self.rng) + unpickled = pickle.loads(pickled) + + self.assertIsNotNone(unpickled.offset) + + def test_union(self): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assert_isinstance(the_union, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right) + tm.assert_isinstance(the_union, Index) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right) + tm.assert_isinstance(the_union, DatetimeIndex) + + # order does not matter + self.assert_numpy_array_equal(right.union(left), the_union) + + # overlapping, but different offset + rng = date_range(START, END, freq=datetools.bmonthEnd) + + the_union = self.rng.union(rng) + tm.assert_isinstance(the_union, DatetimeIndex) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + + # overlapping, but different offset + rng = date_range(START, END, freq=datetools.bmonthEnd) + + the_join = self.rng.join(rng, how='outer') + tm.assert_isinstance(the_join, DatetimeIndex) + self.assertIsNone(the_join.freq) + + def test_intersection_bug(self): + # GH #771 + a = cdate_range('11/30/2011', '12/31/2011') + b = cdate_range('12/10/2011', '12/20/2011') + result = a.intersection(b) + self.assertTrue(result.equals(b)) + + def test_summary(self): + self.rng.summary() + self.rng[2:2].summary() + + def test_summary_pytz(self): + tm._skip_if_no_pytz() + import pytz + cdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + + def test_summary_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + cdate_range('1/1/2005', '1/1/2009', 
tz=dateutil.tz.tzutc()).summary() + + def test_misc(self): + end = datetime(2009, 5, 13) + dr = cdate_range(end=end, periods=20) + firstDate = end - 19 * datetools.cday + + assert len(dr) == 20 + assert dr[0] == firstDate + assert dr[-1] == end + + def test_date_parse_failure(self): + badly_formed_date = '2007/100/1' + + self.assertRaises(ValueError, Timestamp, badly_formed_date) + + self.assertRaises(ValueError, cdate_range, start=badly_formed_date, + periods=10) + self.assertRaises(ValueError, cdate_range, end=badly_formed_date, + periods=10) + self.assertRaises(ValueError, cdate_range, badly_formed_date, + badly_formed_date) + + def test_equals(self): + self.assertFalse(self.rng.equals(list(self.rng))) + + def test_daterange_bug_456(self): + # GH #456 + rng1 = cdate_range('12/5/2011', '12/5/2011') + rng2 = cdate_range('12/2/2011', '12/5/2011') + rng2.offset = datetools.CDay() + + result = rng1.union(rng2) + tm.assert_isinstance(result, DatetimeIndex) + + def test_cdaterange(self): + rng = cdate_range('2013-05-01', periods=3) + xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) + self.assertTrue(xp.equals(rng)) + + def test_cdaterange_weekmask(self): + rng = cdate_range('2013-05-01', periods=3, + weekmask='Sun Mon Tue Wed Thu') + xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) + self.assertTrue(xp.equals(rng)) + + def test_cdaterange_holidays(self): + rng = cdate_range('2013-05-01', periods=3, + holidays=['2013-05-01']) + xp = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) + self.assertTrue(xp.equals(rng)) + + def test_cdaterange_weekmask_and_holidays(self): + rng = cdate_range('2013-05-01', periods=3, + weekmask='Sun Mon Tue Wed Thu', + holidays=['2013-05-01']) + xp = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) + self.assertTrue(xp.equals(rng)) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py new file mode 100644 index 00000000..37371b58 --- /dev/null +++ b/pandas/tseries/tests/test_frequencies.py @@ -0,0 +1,348 @@ +from datetime import datetime, time, timedelta +from pandas.compat import range +import sys +import os + +import nose + +import numpy as np + +from pandas import Index, DatetimeIndex, Timestamp, Series, date_range, period_range + +from pandas.tseries.frequencies import to_offset, infer_freq +from pandas.tseries.tools import to_datetime +import pandas.tseries.frequencies as fmod +import pandas.tseries.offsets as offsets +from pandas.tseries.period import PeriodIndex +import pandas.compat as compat + +from pandas import _np_version_under1p7 +import pandas.util.testing as tm + +def test_to_offset_multiple(): + freqstr = '2h30min' + freqstr2 = '2h 30min' + + result = to_offset(freqstr) + assert(result == to_offset(freqstr2)) + expected = offsets.Minute(150) + assert(result == expected) + + freqstr = '2h30min15s' + result = to_offset(freqstr) + expected = offsets.Second(150 * 60 + 15) + assert(result == expected) + + freqstr = '2h 60min' + result = to_offset(freqstr) + expected = offsets.Hour(3) + assert(result == expected) + + freqstr = '15l500u' + result = to_offset(freqstr) + expected = offsets.Micro(15500) + assert(result == expected) + + freqstr = '10s75L' + result = to_offset(freqstr) + expected = offsets.Milli(10075) + assert(result == expected) + + if not _np_version_under1p7: + freqstr = '2800N' + result = to_offset(freqstr) + expected = 
offsets.Nano(2800) + assert(result == expected) + + # malformed + try: + to_offset('2h20m') + except ValueError: + pass + else: + assert(False) + + +def test_to_offset_negative(): + freqstr = '-1S' + result = to_offset(freqstr) + assert(result.n == -1) + + freqstr = '-5min10s' + result = to_offset(freqstr) + assert(result.n == -310) + + +def test_to_offset_leading_zero(): + freqstr = '00H 00T 01S' + result = to_offset(freqstr) + assert(result.n == 1) + + freqstr = '-00H 03T 14S' + result = to_offset(freqstr) + assert(result.n == -194) + + +def test_anchored_shortcuts(): + result = to_offset('W') + expected = to_offset('W-SUN') + assert(result == expected) + + result = to_offset('Q') + expected = to_offset('Q-DEC') + assert(result == expected) + + +_dti = DatetimeIndex + + +class TestFrequencyInference(tm.TestCase): + + def test_raise_if_period_index(self): + index = PeriodIndex(start="1/1/1990", periods=20, freq="M") + self.assertRaises(TypeError, infer_freq, index) + + def test_raise_if_too_few(self): + index = _dti(['12/31/1998', '1/3/1999']) + self.assertRaises(ValueError, infer_freq, index) + + def test_business_daily(self): + index = _dti(['12/31/1998', '1/3/1999', '1/4/1999']) + self.assertEqual(infer_freq(index), 'B') + + def test_day(self): + self._check_tick(timedelta(1), 'D') + + def test_day_corner(self): + index = _dti(['1/1/2000', '1/2/2000', '1/3/2000']) + self.assertEqual(infer_freq(index), 'D') + + def test_non_datetimeindex(self): + dates = to_datetime(['1/1/2000', '1/2/2000', '1/3/2000']) + self.assertEqual(infer_freq(dates), 'D') + + def test_hour(self): + self._check_tick(timedelta(hours=1), 'H') + + def test_minute(self): + self._check_tick(timedelta(minutes=1), 'T') + + def test_second(self): + self._check_tick(timedelta(seconds=1), 'S') + + def test_millisecond(self): + self._check_tick(timedelta(microseconds=1000), 'L') + + def test_microsecond(self): + self._check_tick(timedelta(microseconds=1), 'U') + + def test_nanosecond(self): + tm._skip_if_not_numpy17_friendly() + self._check_tick(np.timedelta64(1, 'ns'), 'N') + + def _check_tick(self, base_delta, code): + b = Timestamp(datetime.now()) + for i in range(1, 5): + inc = base_delta * i + index = _dti([b + inc * j for j in range(3)]) + if i > 1: + exp_freq = '%d%s' % (i, code) + else: + exp_freq = code + self.assertEqual(infer_freq(index), exp_freq) + + index = _dti([b + base_delta * 7] + + [b + base_delta * j for j in range(3)]) + self.assertIsNone(infer_freq(index)) + + index = _dti([b + base_delta * j for j in range(3)] + + [b + base_delta * 7]) + self.assertIsNone(infer_freq(index)) + + def test_weekly(self): + days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] + + for day in days: + self._check_generated_range('1/1/2000', 'W-%s' % day) + + def test_week_of_month(self): + days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] + + for day in days: + for i in range(1, 5): + self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day)) + + def test_week_of_month_fake(self): + #All of these dates are on same day of week and are 4 or 5 weeks apart + index = DatetimeIndex(["2013-08-27","2013-10-01","2013-10-29","2013-11-26"]) + assert infer_freq(index) != 'WOM-4TUE' + + def test_monthly(self): + self._check_generated_range('1/1/2000', 'M') + + def test_monthly_ambiguous(self): + rng = _dti(['1/31/2000', '2/29/2000', '3/31/2000']) + self.assertEqual(rng.inferred_freq, 'M') + + def test_business_monthly(self): + self._check_generated_range('1/1/2000', 'BM') + + def test_business_start_monthly(self): + 
self._check_generated_range('1/1/2000', 'BMS') + + def test_quarterly(self): + for month in ['JAN', 'FEB', 'MAR']: + self._check_generated_range('1/1/2000', 'Q-%s' % month) + + def test_annual(self): + for month in MONTHS: + self._check_generated_range('1/1/2000', 'A-%s' % month) + + def test_business_annual(self): + for month in MONTHS: + self._check_generated_range('1/1/2000', 'BA-%s' % month) + + def test_annual_ambiguous(self): + rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) + self.assertEqual(rng.inferred_freq, 'A-JAN') + + def _check_generated_range(self, start, freq): + freq = freq.upper() + + gen = date_range(start, periods=7, freq=freq) + index = _dti(gen.values) + if not freq.startswith('Q-'): + self.assertEqual(infer_freq(index), gen.freqstr) + else: + inf_freq = infer_freq(index) + self.assertTrue((inf_freq == 'Q-DEC' and + gen.freqstr in ('Q', 'Q-DEC', 'Q-SEP', 'Q-JUN', + 'Q-MAR')) + or + (inf_freq == 'Q-NOV' and + gen.freqstr in ('Q-NOV', 'Q-AUG', 'Q-MAY', 'Q-FEB')) + or + (inf_freq == 'Q-OCT' and + gen.freqstr in ('Q-OCT', 'Q-JUL', 'Q-APR', 'Q-JAN'))) + + gen = date_range(start, periods=5, freq=freq) + index = _dti(gen.values) + if not freq.startswith('Q-'): + self.assertEqual(infer_freq(index), gen.freqstr) + else: + inf_freq = infer_freq(index) + self.assertTrue((inf_freq == 'Q-DEC' and + gen.freqstr in ('Q', 'Q-DEC', 'Q-SEP', 'Q-JUN', + 'Q-MAR')) + or + (inf_freq == 'Q-NOV' and + gen.freqstr in ('Q-NOV', 'Q-AUG', 'Q-MAY', 'Q-FEB')) + or + (inf_freq == 'Q-OCT' and + gen.freqstr in ('Q-OCT', 'Q-JUL', 'Q-APR', 'Q-JAN'))) + + def test_infer_freq(self): + rng = period_range('1959Q2', '2009Q3', freq='Q') + rng = Index(rng.to_timestamp('D', how='e').asobject) + self.assertEqual(rng.inferred_freq, 'Q-DEC') + + rng = period_range('1959Q2', '2009Q3', freq='Q-NOV') + rng = Index(rng.to_timestamp('D', how='e').asobject) + self.assertEqual(rng.inferred_freq, 'Q-NOV') + + rng = period_range('1959Q2', '2009Q3', freq='Q-OCT') + rng = Index(rng.to_timestamp('D', how='e').asobject) + self.assertEqual(rng.inferred_freq, 'Q-OCT') + + def test_infer_freq_tz(self): + + freqs = {'AS-JAN': ['2009-01-01', '2010-01-01', '2011-01-01', '2012-01-01'], + 'Q-OCT': ['2009-01-31', '2009-04-30', '2009-07-31', '2009-10-31'], + 'M': ['2010-11-30', '2010-12-31', '2011-01-31', '2011-02-28'], + 'W-SAT': ['2010-12-25', '2011-01-01', '2011-01-08', '2011-01-15'], + 'D': ['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04'], + 'H': ['2011-12-31 22:00', '2011-12-31 23:00', '2012-01-01 00:00', '2012-01-01 01:00'] + } + + # GH 7310 + for tz in [None, 'Australia/Sydney', 'Asia/Tokyo', 'Europe/Paris', + 'US/Pacific', 'US/Eastern']: + for expected, dates in compat.iteritems(freqs): + idx = DatetimeIndex(dates, tz=tz) + self.assertEqual(idx.inferred_freq, expected) + + def test_not_monotonic(self): + rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) + rng = rng[::-1] + self.assertIsNone(rng.inferred_freq) + + def test_non_datetimeindex(self): + rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) + + vals = rng.to_pydatetime() + + result = infer_freq(vals) + self.assertEqual(result, rng.inferred_freq) + + def test_invalid_index_types(self): + + # test all index types + for i in [ tm.makeIntIndex(10), + tm.makeFloatIndex(10), + tm.makePeriodIndex(10) ]: + self.assertRaises(TypeError, lambda : infer_freq(i)) + + for i in [ tm.makeStringIndex(10), + tm.makeUnicodeIndex(10) ]: + self.assertRaises(ValueError, lambda : infer_freq(i)) + + def test_string_datetimelike_compat(self): + + # GH 6463 + expected = 
infer_freq(['2004-01', '2004-02', '2004-03', '2004-04']) + result = infer_freq(Index(['2004-01', '2004-02', '2004-03', '2004-04'])) + self.assertEqual(result,expected) + + def test_series(self): + + # GH6407 + # inferring series + + # invalid type of Series + for s in [ Series(np.arange(10)), + Series(np.arange(10.))]: + self.assertRaises(TypeError, lambda : infer_freq(s)) + + # a non-convertible string + self.assertRaises(ValueError, lambda : infer_freq(Series(['foo','bar']))) + + # cannot infer on PeriodIndex + for freq in [None, 'L', 'Y']: + s = Series(period_range('2013',periods=10,freq=freq)) + self.assertRaises(TypeError, lambda : infer_freq(s)) + + # DateTimeIndex + for freq in ['M', 'L', 'S']: + s = Series(date_range('20130101',periods=10,freq=freq)) + inferred = infer_freq(s) + self.assertEqual(inferred,freq) + + s = Series(date_range('20130101','20130110')) + inferred = infer_freq(s) + self.assertEqual(inferred,'D') + +MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', + 'OCT', 'NOV', 'DEC'] + + +def test_is_superperiod_subperiod(): + assert(fmod.is_superperiod(offsets.YearEnd(), offsets.MonthEnd())) + assert(fmod.is_subperiod(offsets.MonthEnd(), offsets.YearEnd())) + + assert(fmod.is_superperiod(offsets.Hour(), offsets.Minute())) + assert(fmod.is_subperiod(offsets.Minute(), offsets.Hour())) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_holiday.py b/pandas/tseries/tests/test_holiday.py new file mode 100644 index 00000000..0d5cc11b --- /dev/null +++ b/pandas/tseries/tests/test_holiday.py @@ -0,0 +1,168 @@ + +from datetime import datetime +import pandas.util.testing as tm +from pandas.tseries.holiday import ( + USFederalHolidayCalendar, USMemorialDay, USThanksgivingDay, + nearest_workday, next_monday_or_tuesday, next_monday, + previous_friday, sunday_to_monday, Holiday, DateOffset, + MO, Timestamp, AbstractHolidayCalendar, get_calendar, + HolidayCalendarFactory) + +class TestCalendar(tm.TestCase): + + def setUp(self): + self.holiday_list = [ + datetime(2012, 1, 2), + datetime(2012, 1, 16), + datetime(2012, 2, 20), + datetime(2012, 5, 28), + datetime(2012, 7, 4), + datetime(2012, 9, 3), + datetime(2012, 10, 8), + datetime(2012, 11, 12), + datetime(2012, 11, 22), + datetime(2012, 12, 25)] + + self.start_date = datetime(2012, 1, 1) + self.end_date = datetime(2012, 12, 31) + + def test_calendar(self): + + calendar = USFederalHolidayCalendar() + holidays = calendar.holidays(self.start_date, + self.end_date) + + holidays_1 = calendar.holidays( + self.start_date.strftime('%Y-%m-%d'), + self.end_date.strftime('%Y-%m-%d')) + holidays_2 = calendar.holidays( + Timestamp(self.start_date), + Timestamp(self.end_date)) + + self.assertEqual(list(holidays.to_pydatetime()), + self.holiday_list) + self.assertEqual(list(holidays_1.to_pydatetime()), + self.holiday_list) + self.assertEqual(list(holidays_2.to_pydatetime()), + self.holiday_list) + +class TestHoliday(tm.TestCase): + + def setUp(self): + self.start_date = datetime(2011, 1, 1) + self.end_date = datetime(2020, 12, 31) + + def test_usmemorialday(self): + holidays = USMemorialDay.dates(self.start_date, + self.end_date) + holidayList = [ + datetime(2011, 5, 30), + datetime(2012, 5, 28), + datetime(2013, 5, 27), + datetime(2014, 5, 26), + datetime(2015, 5, 25), + datetime(2016, 5, 30), + datetime(2017, 5, 29), + datetime(2018, 5, 28), + datetime(2019, 5, 27), + datetime(2020, 5, 25), + ] + 
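+        # Illustrative note (not part of the original pandas test-suite): the
+        # same Holiday machinery exercised here can also express observance
+        # based rules; a hypothetical July-4th rule observed on the nearest
+        # workday could be sketched as:
+        _example_rule = Holiday('July 4th (example)', month=7, day=4,
+                                observance=nearest_workday)
+        _example_dates = _example_rule.dates(self.start_date, self.end_date)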
self.assertEqual(list(holidays), holidayList) + + def test_usthanksgivingday(self): + holidays = USThanksgivingDay.dates(self.start_date, + self.end_date) + holidayList = [ + datetime(2011, 11, 24), + datetime(2012, 11, 22), + datetime(2013, 11, 28), + datetime(2014, 11, 27), + datetime(2015, 11, 26), + datetime(2016, 11, 24), + datetime(2017, 11, 23), + datetime(2018, 11, 22), + datetime(2019, 11, 28), + datetime(2020, 11, 26), + ] + + self.assertEqual(list(holidays), holidayList) + + def test_argument_types(self): + holidays = USThanksgivingDay.dates(self.start_date, + self.end_date) + + holidays_1 = USThanksgivingDay.dates( + self.start_date.strftime('%Y-%m-%d'), + self.end_date.strftime('%Y-%m-%d')) + + holidays_2 = USThanksgivingDay.dates( + Timestamp(self.start_date), + Timestamp(self.end_date)) + + self.assertEqual(holidays, holidays_1) + self.assertEqual(holidays, holidays_2) + + def test_special_holidays(self): + base_date = [datetime(2012, 5, 28)] + holiday_1 = Holiday('One-Time', year=2012, month=5, day=28) + holiday_2 = Holiday('Range', month=5, day=28, + start_date=datetime(2012, 1, 1), + end_date=datetime(2012, 12, 31), + offset=DateOffset(weekday=MO(1))) + + self.assertEqual(base_date, + holiday_1.dates(self.start_date, self.end_date)) + self.assertEqual(base_date, + holiday_2.dates(self.start_date, self.end_date)) + + def test_get_calendar(self): + class TestCalendar(AbstractHolidayCalendar): + rules = [] + + calendar = get_calendar('TestCalendar') + self.assertEqual(TestCalendar, calendar.__class__) + + def test_factory(self): + class_1 = HolidayCalendarFactory('MemorialDay', AbstractHolidayCalendar, + USMemorialDay) + class_2 = HolidayCalendarFactory('Thansksgiving', AbstractHolidayCalendar, + USThanksgivingDay) + class_3 = HolidayCalendarFactory('Combined', class_1, class_2) + + self.assertEqual(len(class_1.rules), 1) + self.assertEqual(len(class_2.rules), 1) + self.assertEqual(len(class_3.rules), 2) + + +class TestObservanceRules(tm.TestCase): + + def setUp(self): + self.we = datetime(2014, 4, 9) + self.th = datetime(2014, 4, 10) + self.fr = datetime(2014, 4, 11) + self.sa = datetime(2014, 4, 12) + self.su = datetime(2014, 4, 13) + self.mo = datetime(2014, 4, 14) + self.tu = datetime(2014, 4, 15) + + def test_next_monday(self): + self.assertEqual(next_monday(self.sa), self.mo) + self.assertEqual(next_monday(self.su), self.mo) + + def test_next_monday_or_tuesday(self): + self.assertEqual(next_monday_or_tuesday(self.sa), self.mo) + self.assertEqual(next_monday_or_tuesday(self.su), self.tu) + self.assertEqual(next_monday_or_tuesday(self.mo), self.tu) + + def test_previous_friday(self): + self.assertEqual(previous_friday(self.sa), self.fr) + self.assertEqual(previous_friday(self.su), self.fr) + + def test_sunday_to_monday(self): + self.assertEqual(sunday_to_monday(self.su), self.mo) + + def test_nearest_workday(self): + self.assertEqual(nearest_workday(self.sa), self.fr) + self.assertEqual(nearest_workday(self.su), self.mo) + self.assertEqual(nearest_workday(self.mo), self.mo) + diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py new file mode 100644 index 00000000..9febec68 --- /dev/null +++ b/pandas/tseries/tests/test_offsets.py @@ -0,0 +1,3083 @@ +from datetime import date, datetime, timedelta +from dateutil.relativedelta import relativedelta +from pandas.compat import range +from pandas import compat +import nose +from nose.tools import assert_raises + + +import numpy as np + +from pandas.core.datetools import ( + bday, BDay, CDay, 
BQuarterEnd, BMonthEnd, + CBMonthEnd, CBMonthBegin, + BYearEnd, MonthEnd, MonthBegin, BYearBegin, + QuarterBegin, BQuarterBegin, BMonthBegin, DateOffset, Week, + YearBegin, YearEnd, Hour, Minute, Second, Day, Micro, Milli, Nano, Easter, + WeekOfMonth, format, ole2datetime, QuarterEnd, to_datetime, normalize_date, + get_offset, get_offset_name, get_standard_freq) + +from pandas.tseries.frequencies import _offset_map +from pandas.tseries.index import _to_m8, DatetimeIndex, _daterange_cache, date_range +from pandas.tseries.tools import parse_time_string, _maybe_get_tz +import pandas.tseries.offsets as offsets + +from pandas.tslib import monthrange, OutOfBoundsDatetime, NaT +from pandas.lib import Timestamp +from pandas.util.testing import assertRaisesRegexp +import pandas.util.testing as tm +from pandas.tseries.offsets import BusinessMonthEnd, CacheableOffset, \ + LastWeekOfMonth, FY5253, FY5253Quarter, WeekDay +from pandas.tseries.holiday import USFederalHolidayCalendar + +from pandas import _np_version_under1p7 + +_multiprocess_can_split_ = True + + +def test_monthrange(): + import calendar + for y in range(2000, 2013): + for m in range(1, 13): + assert monthrange(y, m) == calendar.monthrange(y, m) + + +#### +## Misc function tests +#### + + +def test_format(): + actual = format(datetime(2008, 1, 15)) + assert actual == '20080115' + + +def test_ole2datetime(): + actual = ole2datetime(60000) + assert actual == datetime(2064, 4, 8) + + assert_raises(ValueError, ole2datetime, 60) + + +def test_to_datetime1(): + actual = to_datetime(datetime(2008, 1, 15)) + assert actual == datetime(2008, 1, 15) + + actual = to_datetime('20080115') + assert actual == datetime(2008, 1, 15) + + # unparseable + s = 'Month 1, 1999' + assert to_datetime(s) == s + + +def test_normalize_date(): + actual = normalize_date(datetime(2007, 10, 1, 1, 12, 5, 10)) + assert actual == datetime(2007, 10, 1) + + +def test_to_m8(): + valb = datetime(2007, 10, 1) + valu = _to_m8(valb) + tm.assert_isinstance(valu, np.datetime64) + # assert valu == np.datetime64(datetime(2007,10,1)) + +# def test_datetime64_box(): +# valu = np.datetime64(datetime(2007,10,1)) +# valb = _dt_box(valu) +# assert type(valb) == datetime +# assert valb == datetime(2007,10,1) + +##### +### DateOffset Tests +##### + +class Base(tm.TestCase): + _offset = None + + _offset_types = [getattr(offsets, o) for o in offsets.__all__] + skip_np_u1p7 = [offsets.CustomBusinessDay, offsets.CDay, offsets.CustomBusinessMonthBegin, + offsets.CustomBusinessMonthEnd, offsets.Nano] + + @property + def offset_types(self): + if _np_version_under1p7: + return [o for o in self._offset_types if o not in self.skip_np_u1p7] + else: + return self._offset_types + + def _get_offset(self, klass, value=1, normalize=False): + # create instance from offset class + if klass is FY5253 or klass is FY5253Quarter: + klass = klass(n=value, startingMonth=1, weekday=1, + qtr_with_extra_week=1, variation='last', + normalize=normalize) + elif klass is LastWeekOfMonth: + klass = klass(n=value, weekday=5, normalize=normalize) + elif klass is WeekOfMonth: + klass = klass(n=value, week=1, weekday=5, normalize=normalize) + elif klass is Week: + klass = klass(n=value, weekday=5, normalize=normalize) + else: + try: + klass = klass(value, normalize=normalize) + except: + klass = klass(normalize=normalize) + return klass + + def test_apply_out_of_range(self): + if self._offset is None: + return + if _np_version_under1p7 and self._offset in self.skip_np_u1p7: + raise nose.SkipTest('numpy >= 1.7 required') + + # 
try to create an out-of-bounds result timestamp; if we can't create the offset + # skip + try: + offset = self._get_offset(self._offset, value=10000) + + result = Timestamp('20080101') + offset + self.assertIsInstance(result, datetime) + except (OutOfBoundsDatetime): + raise + except (ValueError, KeyError) as e: + raise nose.SkipTest("cannot create out_of_range offset: {0} {1}".format(str(self).split('.')[-1],e)) + + +class TestCommon(Base): + + def setUp(self): + + # exected value created by Base._get_offset + # are applied to 2011/01/01 09:00 (Saturday) + # used for .apply and .rollforward + self.expecteds = {'Day': Timestamp('2011-01-02 09:00:00'), + 'BusinessDay': Timestamp('2011-01-03 09:00:00'), + 'CustomBusinessDay': Timestamp('2011-01-03 09:00:00'), + 'CustomBusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), + 'CustomBusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), + 'MonthBegin': Timestamp('2011-02-01 09:00:00'), + 'BusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), + 'MonthEnd': Timestamp('2011-01-31 09:00:00'), + 'BusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), + 'YearBegin': Timestamp('2012-01-01 09:00:00'), + 'BYearBegin': Timestamp('2011-01-03 09:00:00'), + 'YearEnd': Timestamp('2011-12-31 09:00:00'), + 'BYearEnd': Timestamp('2011-12-30 09:00:00'), + 'QuarterBegin': Timestamp('2011-03-01 09:00:00'), + 'BQuarterBegin': Timestamp('2011-03-01 09:00:00'), + 'QuarterEnd': Timestamp('2011-03-31 09:00:00'), + 'BQuarterEnd': Timestamp('2011-03-31 09:00:00'), + 'WeekOfMonth': Timestamp('2011-01-08 09:00:00'), + 'LastWeekOfMonth': Timestamp('2011-01-29 09:00:00'), + 'FY5253Quarter': Timestamp('2011-01-25 09:00:00'), + 'FY5253': Timestamp('2011-01-25 09:00:00'), + 'Week': Timestamp('2011-01-08 09:00:00'), + 'Easter': Timestamp('2011-04-24 09:00:00'), + 'Hour': Timestamp('2011-01-01 10:00:00'), + 'Minute': Timestamp('2011-01-01 09:01:00'), + 'Second': Timestamp('2011-01-01 09:00:01'), + 'Milli': Timestamp('2011-01-01 09:00:00.001000'), + 'Micro': Timestamp('2011-01-01 09:00:00.000001'), + 'Nano': Timestamp(np.datetime64('2011-01-01T09:00:00.000000001Z'))} + + self.timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern'] + + def test_return_type(self): + for offset in self.offset_types: + offset = self._get_offset(offset) + + # make sure that we are returning a Timestamp + result = Timestamp('20080101') + offset + self.assertIsInstance(result, Timestamp) + + # make sure that we are returning NaT + self.assertTrue(NaT + offset is NaT) + self.assertTrue(offset + NaT is NaT) + + self.assertTrue(NaT - offset is NaT) + self.assertTrue((-offset).apply(NaT) is NaT) + + def _check_offsetfunc_works(self, offset, funcname, dt, expected, + normalize=False): + offset_s = self._get_offset(offset, normalize=normalize) + func = getattr(offset_s, funcname) + + result = func(dt) + self.assert_(isinstance(result, Timestamp)) + self.assertEqual(result, expected) + + result = func(Timestamp(dt)) + self.assert_(isinstance(result, Timestamp)) + self.assertEqual(result, expected) + + if isinstance(dt, np.datetime64): + # test tz when input is datetime or Timestamp + return + + tm._skip_if_no_pytz() + import pytz + for tz in self.timezones: + expected_localize = expected.tz_localize(tz) + + dt_tz = pytz.timezone(tz).localize(dt) + result = func(dt_tz) + self.assert_(isinstance(result, Timestamp)) + self.assertEqual(result, expected_localize) + + result = func(Timestamp(dt, tz=tz)) + self.assert_(isinstance(result, Timestamp)) + self.assertEqual(result, expected_localize) + + def 
_check_nanofunc_works(self, offset, funcname, dt, expected): + offset = self._get_offset(offset) + func = getattr(offset, funcname) + + t1 = Timestamp(dt) + self.assertEqual(func(t1), expected) + + def test_apply(self): + sdt = datetime(2011, 1, 1, 9, 0) + ndt = np.datetime64('2011-01-01 09:00Z') + + for offset in self.offset_types: + for dt in [sdt, ndt]: + expected = self.expecteds[offset.__name__] + if offset == Nano: + self._check_nanofunc_works(offset, 'apply', dt, expected) + else: + self._check_offsetfunc_works(offset, 'apply', dt, expected) + + expected = Timestamp(expected.date()) + self._check_offsetfunc_works(offset, 'apply', dt, expected, + normalize=True) + + def test_rollforward(self): + expecteds = self.expecteds.copy() + + # result will not be changed if the target is on the offset + no_changes = ['Day', 'MonthBegin', 'YearBegin', 'Week', 'Hour', 'Minute', + 'Second', 'Milli', 'Micro', 'Nano'] + for n in no_changes: + expecteds[n] = Timestamp('2011/01/01 09:00') + + # but be changed when normalize=True + norm_expected = expecteds.copy() + for k in norm_expected: + norm_expected[k] = Timestamp(norm_expected[k].date()) + + normalized = {'Day': Timestamp('2011-01-02 00:00:00'), + 'MonthBegin': Timestamp('2011-02-01 00:00:00'), + 'YearBegin': Timestamp('2012-01-01 00:00:00'), + 'Week': Timestamp('2011-01-08 00:00:00'), + 'Hour': Timestamp('2011-01-01 00:00:00'), + 'Minute': Timestamp('2011-01-01 00:00:00'), + 'Second': Timestamp('2011-01-01 00:00:00'), + 'Milli': Timestamp('2011-01-01 00:00:00'), + 'Micro': Timestamp('2011-01-01 00:00:00')} + norm_expected.update(normalized) + + sdt = datetime(2011, 1, 1, 9, 0) + ndt = np.datetime64('2011-01-01 09:00Z') + + for offset in self.offset_types: + for dt in [sdt, ndt]: + expected = expecteds[offset.__name__] + if offset == Nano: + self._check_nanofunc_works(offset, 'rollforward', dt, expected) + else: + self._check_offsetfunc_works(offset, 'rollforward', dt, expected) + expected = norm_expected[offset.__name__] + self._check_offsetfunc_works(offset, 'rollforward', dt, expected, + normalize=True) + + def test_rollback(self): + expecteds = {'BusinessDay': Timestamp('2010-12-31 09:00:00'), + 'CustomBusinessDay': Timestamp('2010-12-31 09:00:00'), + 'CustomBusinessMonthEnd': Timestamp('2010-12-31 09:00:00'), + 'CustomBusinessMonthBegin': Timestamp('2010-12-01 09:00:00'), + 'BusinessMonthBegin': Timestamp('2010-12-01 09:00:00'), + 'MonthEnd': Timestamp('2010-12-31 09:00:00'), + 'BusinessMonthEnd': Timestamp('2010-12-31 09:00:00'), + 'BYearBegin': Timestamp('2010-01-01 09:00:00'), + 'YearEnd': Timestamp('2010-12-31 09:00:00'), + 'BYearEnd': Timestamp('2010-12-31 09:00:00'), + 'QuarterBegin': Timestamp('2010-12-01 09:00:00'), + 'BQuarterBegin': Timestamp('2010-12-01 09:00:00'), + 'QuarterEnd': Timestamp('2010-12-31 09:00:00'), + 'BQuarterEnd': Timestamp('2010-12-31 09:00:00'), + 'WeekOfMonth': Timestamp('2010-12-11 09:00:00'), + 'LastWeekOfMonth': Timestamp('2010-12-25 09:00:00'), + 'FY5253Quarter': Timestamp('2010-10-26 09:00:00'), + 'FY5253': Timestamp('2010-01-26 09:00:00'), + 'Easter': Timestamp('2010-04-04 09:00:00')} + + # result will not be changed if the target is on the offset + for n in ['Day', 'MonthBegin', 'YearBegin', 'Week', 'Hour', 'Minute', + 'Second', 'Milli', 'Micro', 'Nano']: + expecteds[n] = Timestamp('2011/01/01 09:00') + + # but be changed when normalize=True + norm_expected = expecteds.copy() + for k in norm_expected: + norm_expected[k] = Timestamp(norm_expected[k].date()) + + normalized = {'Day': Timestamp('2010-12-31 
00:00:00'), + 'MonthBegin': Timestamp('2010-12-01 00:00:00'), + 'YearBegin': Timestamp('2010-01-01 00:00:00'), + 'Week': Timestamp('2010-12-25 00:00:00'), + 'Hour': Timestamp('2011-01-01 00:00:00'), + 'Minute': Timestamp('2011-01-01 00:00:00'), + 'Second': Timestamp('2011-01-01 00:00:00'), + 'Milli': Timestamp('2011-01-01 00:00:00'), + 'Micro': Timestamp('2011-01-01 00:00:00')} + norm_expected.update(normalized) + + sdt = datetime(2011, 1, 1, 9, 0) + ndt = np.datetime64('2011-01-01 09:00Z') + + for offset in self.offset_types: + for dt in [sdt, ndt]: + expected = expecteds[offset.__name__] + if offset == Nano: + self._check_nanofunc_works(offset, 'rollback', dt, expected) + else: + self._check_offsetfunc_works(offset, 'rollback', dt, expected) + + expected = norm_expected[offset.__name__] + self._check_offsetfunc_works(offset, 'rollback', + dt, expected, normalize=True) + + def test_onOffset(self): + for offset in self.offset_types: + dt = self.expecteds[offset.__name__] + offset_s = self._get_offset(offset) + self.assert_(offset_s.onOffset(dt)) + + # when normalize=True, onOffset checks time is 00:00:00 + offset_n = self._get_offset(offset, normalize=True) + self.assert_(not offset_n.onOffset(dt)) + + date = datetime(dt.year, dt.month, dt.day) + self.assert_(offset_n.onOffset(date)) + + def test_add(self): + dt = datetime(2011, 1, 1, 9, 0) + + for offset in self.offset_types: + offset_s = self._get_offset(offset) + expected = self.expecteds[offset.__name__] + + result_dt = dt + offset_s + result_ts = Timestamp(dt) + offset_s + for result in [result_dt, result_ts]: + self.assertTrue(isinstance(result, Timestamp)) + self.assertEqual(result, expected) + + tm._skip_if_no_pytz() + for tz in self.timezones: + expected_localize = expected.tz_localize(tz) + result = Timestamp(dt, tz=tz) + offset_s + self.assert_(isinstance(result, Timestamp)) + self.assertEqual(result, expected_localize) + + # normalize=True + offset_s = self._get_offset(offset, normalize=True) + expected = Timestamp(expected.date()) + + result_dt = dt + offset_s + result_ts = Timestamp(dt) + offset_s + for result in [result_dt, result_ts]: + self.assertTrue(isinstance(result, Timestamp)) + self.assertEqual(result, expected) + + for tz in self.timezones: + expected_localize = expected.tz_localize(tz) + result = Timestamp(dt, tz=tz) + offset_s + self.assert_(isinstance(result, Timestamp)) + self.assertEqual(result, expected_localize) + + +class TestDateOffset(Base): + _multiprocess_can_split_ = True + + def setUp(self): + self.d = Timestamp(datetime(2008, 1, 2)) + _offset_map.clear() + + def test_repr(self): + repr(DateOffset()) + repr(DateOffset(2)) + repr(2 * DateOffset()) + repr(2 * DateOffset(months=2)) + + def test_mul(self): + assert DateOffset(2) == 2 * DateOffset(1) + assert DateOffset(2) == DateOffset(1) * 2 + + def test_constructor(self): + + assert((self.d + DateOffset(months=2)) == datetime(2008, 3, 2)) + assert((self.d - DateOffset(months=2)) == datetime(2007, 11, 2)) + + assert((self.d + DateOffset(2)) == datetime(2008, 1, 4)) + + assert not DateOffset(2).isAnchored() + assert DateOffset(1).isAnchored() + + d = datetime(2008, 1, 31) + assert((d + DateOffset(months=1)) == datetime(2008, 2, 29)) + + def test_copy(self): + assert(DateOffset(months=2).copy() == DateOffset(months=2)) + + def test_eq(self): + offset1 = DateOffset(days=1) + offset2 = DateOffset(days=365) + + self.assertNotEqual(offset1, offset2) + + +class TestBusinessDay(Base): + _multiprocess_can_split_ = True + _offset = BDay + + def setUp(self): + 
self.d = datetime(2008, 1, 1) + + self.offset = BDay() + self.offset2 = BDay(2) + + def test_different_normalize_equals(self): + # equivalent in this special case + offset = BDay() + offset2 = BDay() + offset2.normalize = True + self.assertEqual(offset, offset2) + + def test_repr(self): + self.assertEqual(repr(self.offset), '') + assert repr(self.offset2) == '<2 * BusinessDays>' + + expected = '' + assert repr(self.offset + timedelta(1)) == expected + + def test_with_offset(self): + offset = self.offset + timedelta(hours=2) + + assert (self.d + offset) == datetime(2008, 1, 2, 2) + + def testEQ(self): + self.assertEqual(self.offset2, self.offset2) + + def test_mul(self): + pass + + def test_hash(self): + self.assertEqual(hash(self.offset2), hash(self.offset2)) + + def testCall(self): + self.assertEqual(self.offset2(self.d), datetime(2008, 1, 3)) + + def testRAdd(self): + self.assertEqual(self.d + self.offset2, self.offset2 + self.d) + + def testSub(self): + off = self.offset2 + self.assertRaises(Exception, off.__sub__, self.d) + self.assertEqual(2 * off - off, off) + + self.assertEqual(self.d - self.offset2, self.d + BDay(-2)) + + def testRSub(self): + self.assertEqual(self.d - self.offset2, (-self.offset2).apply(self.d)) + + def testMult1(self): + self.assertEqual(self.d + 10 * self.offset, self.d + BDay(10)) + + def testMult2(self): + self.assertEqual(self.d + (-5 * BDay(-10)), + self.d + BDay(50)) + + def testRollback1(self): + self.assertEqual(BDay(10).rollback(self.d), self.d) + + def testRollback2(self): + self.assertEqual( + BDay(10).rollback(datetime(2008, 1, 5)), datetime(2008, 1, 4)) + + def testRollforward1(self): + self.assertEqual(BDay(10).rollforward(self.d), self.d) + + def testRollforward2(self): + self.assertEqual( + BDay(10).rollforward(datetime(2008, 1, 5)), datetime(2008, 1, 7)) + + def test_roll_date_object(self): + offset = BDay() + + dt = date(2012, 9, 15) + + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 9, 14)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 9, 17)) + + offset = offsets.Day() + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + def test_onOffset(self): + tests = [(BDay(), datetime(2008, 1, 1), True), + (BDay(), datetime(2008, 1, 5), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_apply(self): + tests = [] + + tests.append((bday, + {datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + tests.append((2 * bday, + {datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)})) + + tests.append((-bday, + {datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)})) + + tests.append((-2 * bday, + {datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), 
+ datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)})) + + tests.append((BDay(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + BDay(10) + self.assertEqual(result, datetime(2012, 11, 6)) + + result = dt + BDay(100) - BDay(100) + self.assertEqual(result, dt) + + off = BDay() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 12, 23) + self.assertEqual(rs, xp) + + st = datetime(2011, 12, 18) + rs = st + off + xp = datetime(2011, 12, 26) + self.assertEqual(rs, xp) + + off = BDay() * 10 + rs = datetime(2014, 1, 5) + off # see #5890 + xp = datetime(2014, 1, 17) + self.assertEqual(rs, xp) + + def test_apply_corner(self): + self.assertRaises(TypeError, BDay().apply, BMonthEnd()) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BDay() + offset2 = BDay() + self.assertFalse(offset1 != offset2) + + +class TestCustomBusinessDay(Base): + _multiprocess_can_split_ = True + _offset = CDay + + def setUp(self): + self.d = datetime(2008, 1, 1) + self.nd = np.datetime64('2008-01-01 00:00:00Z') + + tm._skip_if_no_cday() + self.offset = CDay() + self.offset2 = CDay(2) + + def test_different_normalize_equals(self): + # equivalent in this special case + offset = CDay() + offset2 = CDay() + offset2.normalize = True + self.assertEqual(offset, offset2) + + def test_repr(self): + assert repr(self.offset) == '' + assert repr(self.offset2) == '<2 * CustomBusinessDays>' + + expected = '' + assert repr(self.offset + timedelta(1)) == expected + + def test_with_offset(self): + offset = self.offset + timedelta(hours=2) + + assert (self.d + offset) == datetime(2008, 1, 2, 2) + + def testEQ(self): + self.assertEqual(self.offset2, self.offset2) + + def test_mul(self): + pass + + def test_hash(self): + self.assertEqual(hash(self.offset2), hash(self.offset2)) + + def testCall(self): + self.assertEqual(self.offset2(self.d), datetime(2008, 1, 3)) + self.assertEqual(self.offset2(self.nd), datetime(2008, 1, 3)) + + def testRAdd(self): + self.assertEqual(self.d + self.offset2, self.offset2 + self.d) + + def testSub(self): + off = self.offset2 + self.assertRaises(Exception, off.__sub__, self.d) + self.assertEqual(2 * off - off, off) + + self.assertEqual(self.d - self.offset2, self.d + CDay(-2)) + + def testRSub(self): + self.assertEqual(self.d - self.offset2, (-self.offset2).apply(self.d)) + + def testMult1(self): + self.assertEqual(self.d + 10 * self.offset, self.d + CDay(10)) + + def testMult2(self): + self.assertEqual(self.d + (-5 * CDay(-10)), + self.d + CDay(50)) + + def testRollback1(self): + self.assertEqual(CDay(10).rollback(self.d), self.d) + + def testRollback2(self): + self.assertEqual( + CDay(10).rollback(datetime(2008, 1, 5)), datetime(2008, 1, 4)) + + def testRollforward1(self): + self.assertEqual(CDay(10).rollforward(self.d), self.d) + + def testRollforward2(self): + self.assertEqual( + CDay(10).rollforward(datetime(2008, 1, 5)), datetime(2008, 1, 7)) + + def test_roll_date_object(self): + offset = CDay() + + dt = date(2012, 9, 15) + + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 9, 14)) + + 
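+        # Illustrative note: rollback() snaps a date backwards to the most
+        # recent date that is on the offset, while rollforward() (next
+        # statement) snaps it forwards; 2012-09-15 falls on a weekend, so the
+        # nearest custom business days are 2012-09-14 and 2012-09-17.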
result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 9, 17)) + + offset = offsets.Day() + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + def test_onOffset(self): + tests = [(CDay(), datetime(2008, 1, 1), True), + (CDay(), datetime(2008, 1, 5), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_apply(self): + from pandas.core.datetools import cday + tests = [] + + tests.append((cday, + {datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + tests.append((2 * cday, + {datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)})) + + tests.append((-cday, + {datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)})) + + tests.append((-2 * cday, + {datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)})) + + tests.append((CDay(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + CDay(10) + self.assertEqual(result, datetime(2012, 11, 6)) + + result = dt + CDay(100) - CDay(100) + self.assertEqual(result, dt) + + off = CDay() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 12, 23) + self.assertEqual(rs, xp) + + st = datetime(2011, 12, 18) + rs = st + off + xp = datetime(2011, 12, 26) + self.assertEqual(rs, xp) + + def test_apply_corner(self): + self.assertRaises(Exception, CDay().apply, BMonthEnd()) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = CDay() + offset2 = CDay() + self.assertFalse(offset1 != offset2) + + def test_holidays(self): + # Define a TradingDay offset + holidays = ['2012-05-01', datetime(2013, 5, 1), + np.datetime64('2014-05-01')] + tday = CDay(holidays=holidays) + for year in range(2012, 2015): + dt = datetime(year, 4, 30) + xp = datetime(year, 5, 2) + rs = dt + tday + self.assertEqual(rs, xp) + + def test_weekmask(self): + weekmask_saudi = 'Sat Sun Mon Tue Wed' # Thu-Fri Weekend + weekmask_uae = '1111001' # Fri-Sat Weekend + weekmask_egypt = [1,1,1,1,0,0,1] # Fri-Sat Weekend + bday_saudi = CDay(weekmask=weekmask_saudi) + bday_uae = CDay(weekmask=weekmask_uae) + bday_egypt = CDay(weekmask=weekmask_egypt) + dt = datetime(2013, 5, 1) + xp_saudi = datetime(2013, 5, 4) + xp_uae = datetime(2013, 5, 2) + xp_egypt = datetime(2013, 5, 2) + 
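+        # Illustrative note: CDay accepts a weekmask in any of the forms
+        # understood by numpy.busdaycalendar -- a string of day abbreviations
+        # ('Sat Sun Mon Tue Wed'), a seven-character bit string ('1111001'),
+        # or a sequence of seven booleans ordered Monday through Sunday.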
self.assertEqual(xp_saudi, dt + bday_saudi) + self.assertEqual(xp_uae, dt + bday_uae) + self.assertEqual(xp_egypt, dt + bday_egypt) + xp2 = datetime(2013, 5, 5) + self.assertEqual(xp2, dt + 2 * bday_saudi) + self.assertEqual(xp2, dt + 2 * bday_uae) + self.assertEqual(xp2, dt + 2 * bday_egypt) + + def test_weekmask_and_holidays(self): + weekmask_egypt = 'Sun Mon Tue Wed Thu' # Fri-Sat Weekend + holidays = ['2012-05-01', datetime(2013, 5, 1), + np.datetime64('2014-05-01')] + bday_egypt = CDay(holidays=holidays, weekmask=weekmask_egypt) + dt = datetime(2013, 4, 30) + xp_egypt = datetime(2013, 5, 5) + self.assertEqual(xp_egypt, dt + 2 * bday_egypt) + + def test_calendar(self): + calendar = USFederalHolidayCalendar() + dt = datetime(2014, 1, 17) + assertEq(CDay(calendar=calendar), dt, datetime(2014, 1, 21)) + +class CustomBusinessMonthBase(object): + _multiprocess_can_split_ = True + + def setUp(self): + self.d = datetime(2008, 1, 1) + + tm._skip_if_no_cday() + self.offset = self._object() + self.offset2 = self._object(2) + + def testEQ(self): + self.assertEqual(self.offset2, self.offset2) + + def test_mul(self): + pass + + def test_hash(self): + self.assertEqual(hash(self.offset2), hash(self.offset2)) + + def testRAdd(self): + self.assertEqual(self.d + self.offset2, self.offset2 + self.d) + + def testSub(self): + off = self.offset2 + self.assertRaises(Exception, off.__sub__, self.d) + self.assertEqual(2 * off - off, off) + + self.assertEqual(self.d - self.offset2, + self.d + self._object(-2)) + + def testRSub(self): + self.assertEqual(self.d - self.offset2, (-self.offset2).apply(self.d)) + + def testMult1(self): + self.assertEqual(self.d + 10 * self.offset, + self.d + self._object(10)) + + def testMult2(self): + self.assertEqual(self.d + (-5 * self._object(-10)), + self.d + self._object(50)) + + def test_offsets_compare_equal(self): + offset1 = self._object() + offset2 = self._object() + self.assertFalse(offset1 != offset2) + +class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base): + _object = CBMonthEnd + + def test_different_normalize_equals(self): + # equivalent in this special case + offset = CBMonthEnd() + offset2 = CBMonthEnd() + offset2.normalize = True + self.assertEqual(offset, offset2) + + def test_repr(self): + assert repr(self.offset) == '' + assert repr(self.offset2) == '<2 * CustomBusinessMonthEnds>' + + def testCall(self): + self.assertEqual(self.offset2(self.d), datetime(2008, 2, 29)) + + def testRollback1(self): + self.assertEqual( + CDay(10).rollback(datetime(2007, 12, 31)), datetime(2007, 12, 31)) + + def testRollback2(self): + self.assertEqual(CBMonthEnd(10).rollback(self.d), + datetime(2007,12,31)) + + def testRollforward1(self): + self.assertEqual(CBMonthEnd(10).rollforward(self.d), datetime(2008,1,31)) + + def test_roll_date_object(self): + offset = CBMonthEnd() + + dt = date(2012, 9, 15) + + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 8, 31)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 9, 28)) + + offset = offsets.Day() + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + def test_onOffset(self): + tests = [(CBMonthEnd(), datetime(2008, 1, 31), True), + (CBMonthEnd(), datetime(2008, 1, 1), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + + def test_apply(self): + cbm = CBMonthEnd() + tests = [] + + tests.append((cbm, + {datetime(2008, 1, 1): 
datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29)})) + + tests.append((2 * cbm, + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 2, 7): datetime(2008, 3, 31)})) + + tests.append((-cbm, + {datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 2, 8): datetime(2008, 1, 31)})) + + tests.append((-2 * cbm, + {datetime(2008, 1, 1): datetime(2007, 11, 30), + datetime(2008, 2, 9): datetime(2007, 12, 31)})) + + tests.append((CBMonthEnd(0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + CBMonthEnd(10) + self.assertEqual(result, datetime(2013, 7, 31)) + + result = dt + CDay(100) - CDay(100) + self.assertEqual(result, dt) + + off = CBMonthEnd() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 7, 29) + self.assertEqual(rs, xp) + + st = datetime(2011, 12, 18) + rs = st + off + xp = datetime(2012, 5, 31) + self.assertEqual(rs, xp) + + def test_holidays(self): + # Define a TradingDay offset + holidays = ['2012-01-31', datetime(2012, 2, 28), + np.datetime64('2012-02-29')] + bm_offset = CBMonthEnd(holidays=holidays) + dt = datetime(2012,1,1) + self.assertEqual(dt + bm_offset,datetime(2012,1,30)) + self.assertEqual(dt + 2*bm_offset,datetime(2012,2,27)) + + def test_datetimeindex(self): + from pandas.tseries.holiday import USFederalHolidayCalendar + self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthEnd(calendar=USFederalHolidayCalendar())).tolist()[0], + datetime(2012,1,31)) + +class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): + _object = CBMonthBegin + + def test_different_normalize_equals(self): + # equivalent in this special case + offset = CBMonthBegin() + offset2 = CBMonthBegin() + offset2.normalize = True + self.assertEqual(offset, offset2) + + def test_repr(self): + assert repr(self.offset) == '' + assert repr(self.offset2) == '<2 * CustomBusinessMonthBegins>' + + def testCall(self): + self.assertEqual(self.offset2(self.d), datetime(2008, 3, 3)) + + def testRollback1(self): + self.assertEqual( + CDay(10).rollback(datetime(2007, 12, 31)), datetime(2007, 12, 31)) + + def testRollback2(self): + self.assertEqual(CBMonthBegin(10).rollback(self.d), + datetime(2008,1,1)) + + def testRollforward1(self): + self.assertEqual(CBMonthBegin(10).rollforward(self.d), datetime(2008,1,1)) + + def test_roll_date_object(self): + offset = CBMonthBegin() + + dt = date(2012, 9, 15) + + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 9, 3)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 10, 1)) + + offset = offsets.Day() + result = offset.rollback(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + result = offset.rollforward(dt) + self.assertEqual(result, datetime(2012, 9, 15)) + + def test_onOffset(self): + tests = [(CBMonthBegin(), datetime(2008, 1, 1), True), + (CBMonthBegin(), datetime(2008, 1, 31), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + + def test_apply(self): + cbm = CBMonthBegin() + tests = [] + + tests.append((cbm, + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 2, 7): datetime(2008, 3, 3)})) + + tests.append((2 * cbm, + {datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 2, 7): datetime(2008, 4, 1)})) + + tests.append((-cbm, + 
{datetime(2008, 1, 1): datetime(2007, 12, 3), + datetime(2008, 2, 8): datetime(2008, 2, 1)})) + + tests.append((-2 * cbm, + {datetime(2008, 1, 1): datetime(2007, 11, 1), + datetime(2008, 2, 9): datetime(2008, 1, 1)})) + + tests.append((CBMonthBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 7): datetime(2008, 2, 1)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_large_n(self): + dt = datetime(2012, 10, 23) + + result = dt + CBMonthBegin(10) + self.assertEqual(result, datetime(2013, 8, 1)) + + result = dt + CDay(100) - CDay(100) + self.assertEqual(result, dt) + + off = CBMonthBegin() * 6 + rs = datetime(2012, 1, 1) - off + xp = datetime(2011, 7, 1) + self.assertEqual(rs, xp) + + st = datetime(2011, 12, 18) + rs = st + off + xp = datetime(2012, 6, 1) + self.assertEqual(rs, xp) + + def test_holidays(self): + # Define a TradingDay offset + holidays = ['2012-02-01', datetime(2012, 2, 2), + np.datetime64('2012-03-01')] + bm_offset = CBMonthBegin(holidays=holidays) + dt = datetime(2012,1,1) + self.assertEqual(dt + bm_offset,datetime(2012,1,2)) + self.assertEqual(dt + 2*bm_offset,datetime(2012,2,3)) + + def test_datetimeindex(self): + self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthBegin(calendar=USFederalHolidayCalendar())).tolist()[0], + datetime(2012,1,3)) + + +def assertOnOffset(offset, date, expected): + actual = offset.onOffset(date) + assert actual == expected, ("\nExpected: %s\nActual: %s\nFor Offset: %s)" + "\nAt Date: %s" % + (expected, actual, offset, date)) + + +class TestWeek(Base): + _offset = Week + + def test_repr(self): + self.assertEqual(repr(Week(weekday=0)), "") + self.assertEqual(repr(Week(n=-1, weekday=0)), "<-1 * Week: weekday=0>") + self.assertEqual(repr(Week(n=-2, weekday=0)), "<-2 * Weeks: weekday=0>") + + def test_corner(self): + self.assertRaises(ValueError, Week, weekday=7) + assertRaisesRegexp(ValueError, "Day must be", Week, weekday=-1) + + def test_isAnchored(self): + self.assertTrue(Week(weekday=0).isAnchored()) + self.assertFalse(Week().isAnchored()) + self.assertFalse(Week(2, weekday=2).isAnchored()) + self.assertFalse(Week(2).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((Week(), # not business week + {datetime(2008, 1, 1): datetime(2008, 1, 8), + datetime(2008, 1, 4): datetime(2008, 1, 11), + datetime(2008, 1, 5): datetime(2008, 1, 12), + datetime(2008, 1, 6): datetime(2008, 1, 13), + datetime(2008, 1, 7): datetime(2008, 1, 14)})) + + tests.append((Week(weekday=0), # Mon + {datetime(2007, 12, 31): datetime(2008, 1, 7), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 14)})) + + tests.append((Week(0, weekday=0), # n=0 -> roll forward. Mon + {datetime(2007, 12, 31): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) + + tests.append((Week(-2, weekday=1), # n=0 -> roll forward. 
Mon + {datetime(2010, 4, 6): datetime(2010, 3, 23), + datetime(2010, 4, 8): datetime(2010, 3, 30), + datetime(2010, 4, 5): datetime(2010, 3, 23)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_onOffset(self): + for weekday in range(7): + offset = Week(weekday=weekday) + + for day in range(1, 8): + date = datetime(2008, 1, day) + + if day % 7 == weekday: + expected = True + else: + expected = False + assertOnOffset(offset, date, expected) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = Week() + offset2 = Week() + self.assertFalse(offset1 != offset2) + + +class TestWeekOfMonth(Base): + _offset = WeekOfMonth + + def test_constructor(self): + assertRaisesRegexp(ValueError, "^N cannot be 0", WeekOfMonth, n=0, week=1, weekday=1) + assertRaisesRegexp(ValueError, "^Week", WeekOfMonth, n=1, week=4, weekday=0) + assertRaisesRegexp(ValueError, "^Week", WeekOfMonth, n=1, week=-1, weekday=0) + assertRaisesRegexp(ValueError, "^Day", WeekOfMonth, n=1, week=0, weekday=-1) + assertRaisesRegexp(ValueError, "^Day", WeekOfMonth, n=1, week=0, weekday=7) + + def test_repr(self): + self.assertEqual(repr(WeekOfMonth(weekday=1,week=2)), "") + + def test_offset(self): + date1 = datetime(2011, 1, 4) # 1st Tuesday of Month + date2 = datetime(2011, 1, 11) # 2nd Tuesday of Month + date3 = datetime(2011, 1, 18) # 3rd Tuesday of Month + date4 = datetime(2011, 1, 25) # 4th Tuesday of Month + + # see for loop for structure + test_cases = [ + (-2, 2, 1, date1, datetime(2010, 11, 16)), + (-2, 2, 1, date2, datetime(2010, 11, 16)), + (-2, 2, 1, date3, datetime(2010, 11, 16)), + (-2, 2, 1, date4, datetime(2010, 12, 21)), + + (-1, 2, 1, date1, datetime(2010, 12, 21)), + (-1, 2, 1, date2, datetime(2010, 12, 21)), + (-1, 2, 1, date3, datetime(2010, 12, 21)), + (-1, 2, 1, date4, datetime(2011, 1, 18)), + + (1, 0, 0, date1, datetime(2011, 2, 7)), + (1, 0, 0, date2, datetime(2011, 2, 7)), + (1, 0, 0, date3, datetime(2011, 2, 7)), + (1, 0, 0, date4, datetime(2011, 2, 7)), + (1, 0, 1, date1, datetime(2011, 2, 1)), + (1, 0, 1, date2, datetime(2011, 2, 1)), + (1, 0, 1, date3, datetime(2011, 2, 1)), + (1, 0, 1, date4, datetime(2011, 2, 1)), + (1, 0, 2, date1, datetime(2011, 1, 5)), + (1, 0, 2, date2, datetime(2011, 2, 2)), + (1, 0, 2, date3, datetime(2011, 2, 2)), + (1, 0, 2, date4, datetime(2011, 2, 2)), + + (1, 2, 1, date1, datetime(2011, 1, 18)), + (1, 2, 1, date2, datetime(2011, 1, 18)), + (1, 2, 1, date3, datetime(2011, 2, 15)), + (1, 2, 1, date4, datetime(2011, 2, 15)), + + (2, 2, 1, date1, datetime(2011, 2, 15)), + (2, 2, 1, date2, datetime(2011, 2, 15)), + (2, 2, 1, date3, datetime(2011, 3, 15)), + (2, 2, 1, date4, datetime(2011, 3, 15)), + ] + + for n, week, weekday, date, expected in test_cases: + offset = WeekOfMonth(n, week=week, weekday=weekday) + assertEq(offset, date, expected) + + # try subtracting + result = datetime(2011, 2, 1) - WeekOfMonth(week=1, weekday=2) + self.assertEqual(result, datetime(2011, 1, 12)) + result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2) + self.assertEqual(result, datetime(2011, 2, 2)) + + def test_onOffset(self): + test_cases = [ + (0, 0, datetime(2011, 2, 7), True), + (0, 0, datetime(2011, 2, 6), False), + (0, 0, datetime(2011, 2, 14), False), + (1, 0, datetime(2011, 2, 14), True), + (0, 1, datetime(2011, 2, 1), True), + (0, 1, datetime(2011, 2, 8), False), + ] + + for week, weekday, date, expected in test_cases: + offset = WeekOfMonth(week=week, weekday=weekday) + 
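+            # Illustrative note: week is zero-based and weekday uses the
+            # Monday=0 convention, so WeekOfMonth(week=0, weekday=0) anchors on
+            # the first Monday of a month; onOffset() is True only for dates
+            # falling exactly on that anchor (e.g. 2011-02-07 above).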
self.assertEqual(offset.onOffset(date), expected) + +class TestLastWeekOfMonth(Base): + _offset = LastWeekOfMonth + + def test_constructor(self): + assertRaisesRegexp(ValueError, "^N cannot be 0", \ + LastWeekOfMonth, n=0, weekday=1) + + assertRaisesRegexp(ValueError, "^Day", LastWeekOfMonth, n=1, weekday=-1) + assertRaisesRegexp(ValueError, "^Day", LastWeekOfMonth, n=1, weekday=7) + + def test_offset(self): + #### Saturday + last_sat = datetime(2013,8,31) + next_sat = datetime(2013,9,28) + offset_sat = LastWeekOfMonth(n=1, weekday=5) + + one_day_before = (last_sat + timedelta(days=-1)) + self.assertEqual(one_day_before + offset_sat, last_sat) + + one_day_after = (last_sat + timedelta(days=+1)) + self.assertEqual(one_day_after + offset_sat, next_sat) + + #Test On that day + self.assertEqual(last_sat + offset_sat, next_sat) + + #### Thursday + + offset_thur = LastWeekOfMonth(n=1, weekday=3) + last_thurs = datetime(2013,1,31) + next_thurs = datetime(2013,2,28) + + one_day_before = last_thurs + timedelta(days=-1) + self.assertEqual(one_day_before + offset_thur, last_thurs) + + one_day_after = last_thurs + timedelta(days=+1) + self.assertEqual(one_day_after + offset_thur, next_thurs) + + # Test on that day + self.assertEqual(last_thurs + offset_thur, next_thurs) + + three_before = last_thurs + timedelta(days=-3) + self.assertEqual(three_before + offset_thur, last_thurs) + + two_after = last_thurs + timedelta(days=+2) + self.assertEqual(two_after + offset_thur, next_thurs) + + offset_sunday = LastWeekOfMonth(n=1, weekday=WeekDay.SUN) + self.assertEqual(datetime(2013,7,31) + offset_sunday, datetime(2013,8,25)) + + def test_onOffset(self): + test_cases = [ + (WeekDay.SUN, datetime(2013, 1, 27), True), + (WeekDay.SAT, datetime(2013, 3, 30), True), + (WeekDay.MON, datetime(2013, 2, 18), False), #Not the last Mon + (WeekDay.SUN, datetime(2013, 2, 25), False), #Not a SUN + (WeekDay.MON, datetime(2013, 2, 25), True), + (WeekDay.SAT, datetime(2013, 11, 30), True), + + (WeekDay.SAT, datetime(2006, 8, 26), True), + (WeekDay.SAT, datetime(2007, 8, 25), True), + (WeekDay.SAT, datetime(2008, 8, 30), True), + (WeekDay.SAT, datetime(2009, 8, 29), True), + (WeekDay.SAT, datetime(2010, 8, 28), True), + (WeekDay.SAT, datetime(2011, 8, 27), True), + (WeekDay.SAT, datetime(2019, 8, 31), True), + ] + + for weekday, date, expected in test_cases: + offset = LastWeekOfMonth(weekday=weekday) + self.assertEqual(offset.onOffset(date), expected, msg=date) + + +class TestBMonthBegin(Base): + _offset = BMonthBegin + + def test_offset(self): + tests = [] + + tests.append((BMonthBegin(), + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 1): datetime(2006, 10, 2), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1)})) + + tests.append((BMonthBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 10, 2): datetime(2006, 10, 2), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 15): datetime(2006, 10, 2)})) + + tests.append((BMonthBegin(2), + {datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 1, 15): datetime(2008, 3, 3), + datetime(2006, 12, 29): datetime(2007, 2, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): 
datetime(2007, 1, 1)})) + + tests.append((BMonthBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 2), + datetime(2008, 6, 1): datetime(2008, 5, 1), + datetime(2008, 3, 10): datetime(2008, 3, 3), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 30): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 1)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [(BMonthBegin(), datetime(2007, 12, 31), False), + (BMonthBegin(), datetime(2008, 1, 1), True), + (BMonthBegin(), datetime(2001, 4, 2), True), + (BMonthBegin(), datetime(2008, 3, 3), True)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthBegin() + offset2 = BMonthBegin() + self.assertFalse(offset1 != offset2) + + +class TestBMonthEnd(Base): + _offset = BMonthEnd + + def test_offset(self): + tests = [] + + tests.append((BMonthEnd(), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 29)})) + + tests.append((BMonthEnd(0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 29), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + tests.append((BMonthEnd(2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 2, 28), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 29)})) + + tests.append((BMonthEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2008, 5, 30), + datetime(2008, 12, 31): datetime(2008, 11, 28), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + BMonthEnd(normalize=True) + expected = dt.replace(hour=0) + BMonthEnd() + self.assertEqual(result, expected) + + def test_onOffset(self): + + tests = [(BMonthEnd(), datetime(2007, 12, 31), True), + (BMonthEnd(), datetime(2008, 1, 1), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthEnd() + offset2 = BMonthEnd() + self.assertFalse(offset1 != offset2) + + +class TestMonthBegin(Base): + _offset = MonthBegin + + def test_offset(self): + tests = [] + + # NOTE: I'm not entirely happy with the logic here for Begin -ss + # see thread 'offset conventions' on the ML + tests.append((MonthBegin(), + {datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 2, 1): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + 
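+        # Illustrative note: with n=0 the offset rolls forward -- a date that
+        # is already a month start is left unchanged, while any other date is
+        # advanced to the next month start (see the MonthBegin(0) cases below).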
tests.append((MonthBegin(0), + {datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 12, 3): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + tests.append((MonthBegin(2), + {datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 12, 28): datetime(2008, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1)})) + + tests.append((MonthBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 5, 31): datetime(2008, 5, 1), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 1, 2): datetime(2006, 1, 1)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + +class TestMonthEnd(Base): + _offset = MonthEnd + + def test_offset(self): + tests = [] + + tests.append((MonthEnd(), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 31)})) + + tests.append((MonthEnd(0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + tests.append((MonthEnd(2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 31)})) + + tests.append((MonthEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 11, 30), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + # def test_day_of_month(self): + # dt = datetime(2007, 1, 1) + + # offset = MonthEnd(day=20) + + # result = dt + offset + # self.assertEqual(result, datetime(2007, 1, 20)) + + # result = result + offset + # self.assertEqual(result, datetime(2007, 2, 20)) + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + MonthEnd(normalize=True) + expected = dt.replace(hour=0) + MonthEnd() + self.assertEqual(result, expected) + + def test_onOffset(self): + + tests = [(MonthEnd(), datetime(2007, 12, 31), True), + (MonthEnd(), datetime(2008, 1, 1), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + +class TestBQuarterBegin(Base): + _offset = BQuarterBegin + + def test_repr(self): + self.assertEqual(repr(BQuarterBegin()),"") + self.assertEqual(repr(BQuarterBegin(startingMonth=3)), "") + self.assertEqual(repr(BQuarterBegin(startingMonth=1)), "") + + def test_isAnchored(self): + self.assertTrue(BQuarterBegin(startingMonth=1).isAnchored()) + self.assertTrue(BQuarterBegin().isAnchored()) + self.assertFalse(BQuarterBegin(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + + 
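+        # BQuarterBegin targets the first business day of a quarter-starting
+        # month (set by startingMonth), so the cases below expect a weekend
+        # quarter start to roll to the following Monday, e.g.
+        # datetime(2007, 4, 1), a Sunday, maps to datetime(2007, 4, 2).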
tests.append((BQuarterBegin(startingMonth=1), + {datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2007, 3, 15): datetime(2007, 4, 2), + datetime(2007, 2, 28): datetime(2007, 4, 2), + datetime(2007, 1, 1): datetime(2007, 4, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 7, 2), + datetime(2008, 4, 30): datetime(2008, 7, 1), })) + + tests.append((BQuarterBegin(startingMonth=2), + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 8, 15): datetime(2008, 11, 3), + datetime(2008, 9, 15): datetime(2008, 11, 3), + datetime(2008, 11, 1): datetime(2008, 11, 3), + datetime(2008, 4, 30): datetime(2008, 5, 1), })) + + tests.append((BQuarterBegin(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2007, 12, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 15): datetime(2008, 4, 1), + datetime(2008, 2, 27): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 4, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 2): datetime(2007, 7, 2), })) + + tests.append((BQuarterBegin(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2007, 7, 3): datetime(2007, 7, 2), + datetime(2007, 4, 3): datetime(2007, 4, 2), + datetime(2007, 7, 2): datetime(2007, 4, 2), + datetime(2008, 4, 1): datetime(2008, 1, 1), })) + + tests.append((BQuarterBegin(startingMonth=1, n=2), + {datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 1, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2007, 3, 31): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 10, 1), + datetime(2008, 4, 30): datetime(2008, 10, 1), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + # corner + offset = BQuarterBegin(n=-1, startingMonth=1) + self.assertEqual(datetime(2007, 4, 3) + offset, datetime(2007, 4, 2)) + + +class TestBQuarterEnd(Base): + _offset = BQuarterEnd + + def test_repr(self): + self.assertEqual(repr(BQuarterEnd()),"") + self.assertEqual(repr(BQuarterEnd(startingMonth=3)), "") + self.assertEqual(repr(BQuarterEnd(startingMonth=1)), "") + + def test_isAnchored(self): + self.assertTrue(BQuarterEnd(startingMonth=1).isAnchored()) + self.assertTrue(BQuarterEnd().isAnchored()) + 
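+        # Only a single-period offset (n == 1, the default) reports itself as
+        # anchored to a fixed point such as the business quarter end; the n=2
+        # multiple checked next should not.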
self.assertFalse(BQuarterEnd(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((BQuarterEnd(startingMonth=1), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), })) + + tests.append((BQuarterEnd(startingMonth=2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 30), + datetime(2008, 3, 15): datetime(2008, 5, 30), + datetime(2008, 3, 31): datetime(2008, 5, 30), + datetime(2008, 4, 15): datetime(2008, 5, 30), + datetime(2008, 4, 30): datetime(2008, 5, 30), })) + + tests.append((BQuarterEnd(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), })) + + tests.append((BQuarterEnd(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), })) + + tests.append((BQuarterEnd(startingMonth=1, n=2), + {datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + # corner + offset = BQuarterEnd(n=-1, startingMonth=1) + self.assertEqual(datetime(2010, 1, 31) + offset, datetime(2010, 1, 29)) + + def test_onOffset(self): + + tests = [ + (BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + 
(BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +def makeFY5253LastOfMonthQuarter(*args, **kwds): + return FY5253Quarter(*args, variation="last", **kwds) + +def makeFY5253NearestEndMonthQuarter(*args, **kwds): + return FY5253Quarter(*args, variation="nearest", **kwds) + +def makeFY5253NearestEndMonth(*args, **kwds): + return FY5253(*args, variation="nearest", **kwds) + +def makeFY5253LastOfMonth(*args, **kwds): + return FY5253(*args, variation="last", **kwds) + +class TestFY5253LastOfMonth(Base): + + def test_onOffset(self): + + offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, weekday=WeekDay.SAT) + offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9, weekday=WeekDay.SAT) + + tests = [ + #From Wikipedia (see: http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Last_Saturday_of_the_month_at_fiscal_year_end) + (offset_lom_sat_aug, datetime(2006, 8, 26), True), + (offset_lom_sat_aug, datetime(2007, 8, 25), True), + (offset_lom_sat_aug, datetime(2008, 8, 30), True), + (offset_lom_sat_aug, datetime(2009, 8, 29), True), + (offset_lom_sat_aug, datetime(2010, 8, 28), True), + (offset_lom_sat_aug, datetime(2011, 8, 27), True), + (offset_lom_sat_aug, datetime(2012, 8, 25), True), + (offset_lom_sat_aug, datetime(2013, 8, 31), True), + (offset_lom_sat_aug, datetime(2014, 8, 30), True), + (offset_lom_sat_aug, datetime(2015, 8, 29), True), + (offset_lom_sat_aug, datetime(2016, 8, 27), True), + (offset_lom_sat_aug, datetime(2017, 8, 26), True), + (offset_lom_sat_aug, datetime(2018, 8, 25), True), + (offset_lom_sat_aug, datetime(2019, 8, 31), True), + + (offset_lom_sat_aug, datetime(2006, 8, 27), False), + (offset_lom_sat_aug, datetime(2007, 8, 28), False), + (offset_lom_sat_aug, datetime(2008, 8, 31), False), + (offset_lom_sat_aug, datetime(2009, 8, 30), False), + (offset_lom_sat_aug, datetime(2010, 8, 29), False), + (offset_lom_sat_aug, datetime(2011, 8, 28), False), + + (offset_lom_sat_aug, datetime(2006, 8, 25), False), + (offset_lom_sat_aug, datetime(2007, 8, 24), False), + (offset_lom_sat_aug, datetime(2008, 8, 29), False), + (offset_lom_sat_aug, datetime(2009, 8, 28), False), + (offset_lom_sat_aug, datetime(2010, 8, 27), False), + (offset_lom_sat_aug, datetime(2011, 8, 26), False), + (offset_lom_sat_aug, datetime(2019, 8, 30), False), + + #From GMCR (see for example: http://yahoo.brand.edgar-online.com/Default.aspx?companyid=3184&formtypeID=7) + (offset_lom_sat_sep, datetime(2010, 9, 25), True), + (offset_lom_sat_sep, datetime(2011, 9, 24), True), + (offset_lom_sat_sep, datetime(2012, 9, 29), True), + + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_apply(self): + offset_lom_aug_sat = 
makeFY5253LastOfMonth(startingMonth=8, weekday=WeekDay.SAT) + offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8, weekday=WeekDay.SAT) + + date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25), + datetime(2008, 8, 30), datetime(2009, 8, 29), + datetime(2010, 8, 28), datetime(2011, 8, 27), + datetime(2012, 8, 25), datetime(2013, 8, 31), + datetime(2014, 8, 30), datetime(2015, 8, 29), + datetime(2016, 8, 27)] + + tests = [ + (offset_lom_aug_sat, date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, date_seq_lom_aug_sat), + (offset_lom_aug_sat, [datetime(2006, 8, 25)] + date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, [datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]), + (makeFY5253LastOfMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), list(reversed(date_seq_lom_aug_sat))), + ] + for test in tests: + offset, data = test + current = data[0] + for datum in data[1:]: + current = current + offset + self.assertEqual(current, datum) + +class TestFY5253NearestEndMonth(Base): + + def test_get_target_month_end(self): + self.assertEqual(makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT).get_target_month_end(datetime(2013,1,1)), datetime(2013,8,31)) + self.assertEqual(makeFY5253NearestEndMonth(startingMonth=12, weekday=WeekDay.SAT).get_target_month_end(datetime(2013,1,1)), datetime(2013,12,31)) + self.assertEqual(makeFY5253NearestEndMonth(startingMonth=2, weekday=WeekDay.SAT).get_target_month_end(datetime(2013,1,1)), datetime(2013,2,28)) + + def test_get_year_end(self): + self.assertEqual(makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT).get_year_end(datetime(2013,1,1)), datetime(2013,8,31)) + self.assertEqual(makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SUN).get_year_end(datetime(2013,1,1)), datetime(2013,9,1)) + self.assertEqual(makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.FRI).get_year_end(datetime(2013,1,1)), datetime(2013,8,30)) + + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, + variation="nearest") + self.assertEqual(offset_n.get_year_end(datetime(2012,1,1)), datetime(2013,1,1)) + self.assertEqual(offset_n.get_year_end(datetime(2012,1,10)), datetime(2013,1,1)) + + self.assertEqual(offset_n.get_year_end(datetime(2013,1,1)), datetime(2013,12,31)) + self.assertEqual(offset_n.get_year_end(datetime(2013,1,2)), datetime(2013,12,31)) + self.assertEqual(offset_n.get_year_end(datetime(2013,1,3)), datetime(2013,12,31)) + self.assertEqual(offset_n.get_year_end(datetime(2013,1,10)), datetime(2013,12,31)) + + JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest") + self.assertEqual(JNJ.get_year_end(datetime(2006, 1, 1)), datetime(2006, 12, 31)) + + def test_onOffset(self): + offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8, weekday=WeekDay.SAT) + offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8, weekday=WeekDay.THU) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, + variation="nearest") + + tests = [ +# From Wikipedia (see: http://en.wikipedia.org/wiki/4%E2%80%934%E2%80%935_calendar#Saturday_nearest_the_end_of_month) +# 2006-09-02 2006 September 2 +# 2007-09-01 2007 September 1 +# 2008-08-30 2008 August 30 (leap year) +# 2009-08-29 2009 August 29 +# 2010-08-28 2010 August 28 +# 2011-09-03 2011 September 3 +# 2012-09-01 2012 September 1 (leap year) +# 2013-08-31 2013 August 31 +# 2014-08-30 2014 August 30 +# 2015-08-29 2015 August 29 +# 2016-09-03 2016 September 3 (leap year) +# 2017-09-02 2017 September 2 +# 2018-09-01 2018 September 1 +# 2019-08-31 2019 August 
31 + (offset_lom_aug_sat, datetime(2006, 9, 2), True), + (offset_lom_aug_sat, datetime(2007, 9, 1), True), + (offset_lom_aug_sat, datetime(2008, 8, 30), True), + (offset_lom_aug_sat, datetime(2009, 8, 29), True), + (offset_lom_aug_sat, datetime(2010, 8, 28), True), + (offset_lom_aug_sat, datetime(2011, 9, 3), True), + + (offset_lom_aug_sat, datetime(2016, 9, 3), True), + (offset_lom_aug_sat, datetime(2017, 9, 2), True), + (offset_lom_aug_sat, datetime(2018, 9, 1), True), + (offset_lom_aug_sat, datetime(2019, 8, 31), True), + + (offset_lom_aug_sat, datetime(2006, 8, 27), False), + (offset_lom_aug_sat, datetime(2007, 8, 28), False), + (offset_lom_aug_sat, datetime(2008, 8, 31), False), + (offset_lom_aug_sat, datetime(2009, 8, 30), False), + (offset_lom_aug_sat, datetime(2010, 8, 29), False), + (offset_lom_aug_sat, datetime(2011, 8, 28), False), + + (offset_lom_aug_sat, datetime(2006, 8, 25), False), + (offset_lom_aug_sat, datetime(2007, 8, 24), False), + (offset_lom_aug_sat, datetime(2008, 8, 29), False), + (offset_lom_aug_sat, datetime(2009, 8, 28), False), + (offset_lom_aug_sat, datetime(2010, 8, 27), False), + (offset_lom_aug_sat, datetime(2011, 8, 26), False), + (offset_lom_aug_sat, datetime(2019, 8, 30), False), + + #From Micron, see: http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 + (offset_lom_aug_thu, datetime(2012, 8, 30), True), + (offset_lom_aug_thu, datetime(2011, 9, 1), True), + + (offset_n, datetime(2012, 12, 31), False), + (offset_n, datetime(2013, 1, 1), True), + (offset_n, datetime(2013, 1, 2), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_apply(self): + date_seq_nem_8_sat = [datetime(2006, 9, 2), datetime(2007, 9, 1), + datetime(2008, 8, 30), datetime(2009, 8, 29), + datetime(2010, 8, 28), datetime(2011, 9, 3)] + + JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1), + datetime(2006, 12, 31), datetime(2007, 12, 30), + datetime(2008, 12, 28), datetime(2010, 1, 3), + datetime(2011, 1, 2), datetime(2012, 1, 1), + datetime(2012, 12, 30)] + + DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5, variation="nearest") + + tests = [ + (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), date_seq_nem_8_sat), + (makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), date_seq_nem_8_sat), + (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), [datetime(2006, 9, 1)] + date_seq_nem_8_sat), + (makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]), + (makeFY5253NearestEndMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), list(reversed(date_seq_nem_8_sat))), + (makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), JNJ), + (makeFY5253NearestEndMonth(n=-1, startingMonth=12, weekday=WeekDay.SUN), list(reversed(JNJ))), + (makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), [datetime(2005,1,2), datetime(2006, 1, 1)]), + (makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), [datetime(2006,1,2), datetime(2006, 12, 31)]), + (DEC_SAT, [datetime(2013,1,15), datetime(2012,12,29)]) + ] + for test in tests: + offset, data = test + current = data[0] + for datum in data[1:]: + current = current + offset + self.assertEqual(current, datum) + +class TestFY5253LastOfMonthQuarter(Base): + + def test_isAnchored(self): + self.assertTrue(makeFY5253LastOfMonthQuarter(startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4).isAnchored()) + 
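+        # Illustrative note: qtr_with_extra_week names the quarter that absorbs
+        # the 53rd week of a long 52-53 week fiscal year, which is why
+        # test_get_weeks below expects [14, 13, 13, 13] for qtr_with_extra_week=1
+        # and [13, 13, 13, 14] for qtr_with_extra_week=4.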
self.assertTrue(makeFY5253LastOfMonthQuarter(weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4).isAnchored()) + self.assertFalse(makeFY5253LastOfMonthQuarter(2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4).isAnchored()) + + def test_equality(self): + self.assertEqual(makeFY5253LastOfMonthQuarter(startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4), makeFY5253LastOfMonthQuarter(startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4)) + self.assertNotEqual(makeFY5253LastOfMonthQuarter(startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4), makeFY5253LastOfMonthQuarter(startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4)) + self.assertNotEqual(makeFY5253LastOfMonthQuarter(startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4), makeFY5253LastOfMonthQuarter(startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4)) + + def test_offset(self): + offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4) + offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4) + offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4) + + offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4) + offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4) + + GMCR = [datetime(2010, 3, 27), + datetime(2010, 6, 26), + datetime(2010, 9, 25), + datetime(2010, 12, 25), + datetime(2011, 3, 26), + datetime(2011, 6, 25), + datetime(2011, 9, 24), + datetime(2011, 12, 24), + datetime(2012, 3, 24), + datetime(2012, 6, 23), + datetime(2012, 9, 29), + datetime(2012, 12, 29), + datetime(2013, 3, 30), + datetime(2013, 6, 29)] + + + assertEq(offset, base=GMCR[0], expected=GMCR[1]) + assertEq(offset, base=GMCR[0] + relativedelta(days=-1), expected=GMCR[0]) + assertEq(offset, base=GMCR[1], expected=GMCR[2]) + + assertEq(offset2, base=GMCR[0], expected=GMCR[2]) + assertEq(offset4, base=GMCR[0], expected=GMCR[4]) + + assertEq(offset_neg1, base=GMCR[-1], expected=GMCR[-2]) + assertEq(offset_neg1, base=GMCR[-1] + relativedelta(days=+1), expected=GMCR[-1]) + assertEq(offset_neg2, base=GMCR[-1], expected=GMCR[-3]) + + date = GMCR[0] + relativedelta(days=-1) + for expected in GMCR: + assertEq(offset, date, expected) + date = date + offset + + date = GMCR[-1] + relativedelta(days=+1) + for expected in reversed(GMCR): + assertEq(offset_neg1, date, expected) + date = date + offset_neg1 + + + def test_onOffset(self): + lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4) + lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4) + + tests = [ + #From Wikipedia + (lomq_aug_sat_4, datetime(2006, 8, 26), True), + (lomq_aug_sat_4, datetime(2007, 8, 25), True), + (lomq_aug_sat_4, datetime(2008, 8, 30), True), + (lomq_aug_sat_4, datetime(2009, 8, 29), True), + (lomq_aug_sat_4, datetime(2010, 8, 28), True), + (lomq_aug_sat_4, datetime(2011, 8, 27), True), + (lomq_aug_sat_4, datetime(2019, 8, 31), True), + + (lomq_aug_sat_4, datetime(2006, 8, 27), False), + (lomq_aug_sat_4, datetime(2007, 8, 28), False), + (lomq_aug_sat_4, datetime(2008, 8, 31), False), + (lomq_aug_sat_4, datetime(2009, 8, 30), False), + (lomq_aug_sat_4, datetime(2010, 8, 29), False), + (lomq_aug_sat_4, datetime(2011, 8, 28), False), + + (lomq_aug_sat_4, datetime(2006, 8, 25), 
False), + (lomq_aug_sat_4, datetime(2007, 8, 24), False), + (lomq_aug_sat_4, datetime(2008, 8, 29), False), + (lomq_aug_sat_4, datetime(2009, 8, 28), False), + (lomq_aug_sat_4, datetime(2010, 8, 27), False), + (lomq_aug_sat_4, datetime(2011, 8, 26), False), + (lomq_aug_sat_4, datetime(2019, 8, 30), False), + + #From GMCR + (lomq_sep_sat_4, datetime(2010, 9, 25), True), + (lomq_sep_sat_4, datetime(2011, 9, 24), True), + (lomq_sep_sat_4, datetime(2012, 9, 29), True), + + (lomq_sep_sat_4, datetime(2013, 6, 29), True), + (lomq_sep_sat_4, datetime(2012, 6, 23), True), + (lomq_sep_sat_4, datetime(2012, 6, 30), False), + + (lomq_sep_sat_4, datetime(2013, 3, 30), True), + (lomq_sep_sat_4, datetime(2012, 3, 24), True), + + (lomq_sep_sat_4, datetime(2012, 12, 29), True), + (lomq_sep_sat_4, datetime(2011, 12, 24), True), + + #INTC (extra week in Q1) + #See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844 + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1), datetime(2011, 4, 2), True), + + #see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7 + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1), datetime(2012, 12, 29), True), + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1), datetime(2011, 12, 31), True), + (makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1), datetime(2010, 12, 25), True), + + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_year_has_extra_week(self): + #End of long Q1 + self.assertTrue(makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2))) + + #Start of long Q1 + self.assertTrue(makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26))) + + #End of year before year with long Q1 + self.assertFalse(makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25))) + + for year in [x for x in range(1994, 2011+1) if x not in [2011, 2005, 2000, 1994]]: + self.assertFalse(makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1).year_has_extra_week(datetime(year, 4, 2))) + + #Other long years + self.assertTrue(makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2))) + self.assertTrue(makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2))) + self.assertTrue(makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2))) + + def test_get_weeks(self): + sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1) + sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=4) + + self.assertEqual(sat_dec_1.get_weeks(datetime(2011, 4, 2)), [14, 13, 13, 13]) + self.assertEqual(sat_dec_4.get_weeks(datetime(2011, 4, 2)), [13, 13, 13, 14]) + self.assertEqual(sat_dec_1.get_weeks(datetime(2010, 12, 25)), [13, 13, 13, 13]) + +class TestFY5253NearestEndMonthQuarter(Base): + + def test_onOffset(self): + + offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter(1, 
startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4) + offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter(1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, + variation="nearest", qtr_with_extra_week=4) + + tests = [ + #From Wikipedia + (offset_nem_sat_aug_4, datetime(2006, 9, 2), True), + (offset_nem_sat_aug_4, datetime(2007, 9, 1), True), + (offset_nem_sat_aug_4, datetime(2008, 8, 30), True), + (offset_nem_sat_aug_4, datetime(2009, 8, 29), True), + (offset_nem_sat_aug_4, datetime(2010, 8, 28), True), + (offset_nem_sat_aug_4, datetime(2011, 9, 3), True), + + (offset_nem_sat_aug_4, datetime(2016, 9, 3), True), + (offset_nem_sat_aug_4, datetime(2017, 9, 2), True), + (offset_nem_sat_aug_4, datetime(2018, 9, 1), True), + (offset_nem_sat_aug_4, datetime(2019, 8, 31), True), + + (offset_nem_sat_aug_4, datetime(2006, 8, 27), False), + (offset_nem_sat_aug_4, datetime(2007, 8, 28), False), + (offset_nem_sat_aug_4, datetime(2008, 8, 31), False), + (offset_nem_sat_aug_4, datetime(2009, 8, 30), False), + (offset_nem_sat_aug_4, datetime(2010, 8, 29), False), + (offset_nem_sat_aug_4, datetime(2011, 8, 28), False), + + (offset_nem_sat_aug_4, datetime(2006, 8, 25), False), + (offset_nem_sat_aug_4, datetime(2007, 8, 24), False), + (offset_nem_sat_aug_4, datetime(2008, 8, 29), False), + (offset_nem_sat_aug_4, datetime(2009, 8, 28), False), + (offset_nem_sat_aug_4, datetime(2010, 8, 27), False), + (offset_nem_sat_aug_4, datetime(2011, 8, 26), False), + (offset_nem_sat_aug_4, datetime(2019, 8, 30), False), + + #From Micron, see: http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 + (offset_nem_thu_aug_4, datetime(2012, 8, 30), True), + (offset_nem_thu_aug_4, datetime(2011, 9, 1), True), + + #See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13 + (offset_nem_thu_aug_4, datetime(2013, 5, 30), True), + (offset_nem_thu_aug_4, datetime(2013, 2, 28), True), + (offset_nem_thu_aug_4, datetime(2012, 11, 29), True), + (offset_nem_thu_aug_4, datetime(2012, 5, 31), True), + (offset_nem_thu_aug_4, datetime(2007, 3, 1), True), + (offset_nem_thu_aug_4, datetime(1994, 3, 3), True), + + (offset_n, datetime(2012, 12, 31), False), + (offset_n, datetime(2013, 1, 1), True), + (offset_n, datetime(2013, 1, 2), False) + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_offset(self): + offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4) + + MU = [datetime(2012, 5, 31), datetime(2012, 8, 30), datetime(2012, 11, 29), datetime(2013, 2, 28), datetime(2013, 5, 30)] + + date = MU[0] + relativedelta(days=-1) + for expected in MU: + assertEq(offset, date, expected) + date = date + offset + + assertEq(offset, datetime(2012, 5, 31), datetime(2012, 8, 30)) + assertEq(offset, datetime(2012, 5, 30), datetime(2012, 5, 31)) + + offset2 = FY5253Quarter(weekday=5, startingMonth=12, + variation="last", qtr_with_extra_week=4) + + assertEq(offset2, datetime(2013,1,15), datetime(2013, 3, 30)) + +class TestQuarterBegin(Base): + + def test_repr(self): + self.assertEqual(repr(QuarterBegin()), "") + self.assertEqual(repr(QuarterBegin(startingMonth=3)), "") + self.assertEqual(repr(QuarterBegin(startingMonth=1)),"") + + def test_isAnchored(self): + self.assertTrue(QuarterBegin(startingMonth=1).isAnchored()) + self.assertTrue(QuarterBegin().isAnchored()) + self.assertFalse(QuarterBegin(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + 
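+        # Unlike BQuarterBegin above, QuarterBegin anchors to the calendar first
+        # day of the quarter-starting month, with no business-day roll, so
+        # datetime(2007, 12, 1) is expected to map to datetime(2008, 1, 1) below.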
+ tests.append((QuarterBegin(startingMonth=1), + {datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1), })) + + tests.append((QuarterBegin(startingMonth=2), + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 4, 30): datetime(2008, 5, 1), })) + + tests.append((QuarterBegin(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1), })) + + tests.append((QuarterBegin(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1), + datetime(2008, 7, 1): datetime(2008, 4, 1)})) + + tests.append((QuarterBegin(startingMonth=1, n=2), + {datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 10, 1), + datetime(2008, 4, 1): datetime(2008, 10, 1), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + # corner + offset = QuarterBegin(n=-1, startingMonth=1) + self.assertEqual(datetime(2010, 2, 1) + offset, datetime(2010, 1, 1)) + + +class TestQuarterEnd(Base): + _offset = QuarterEnd + + def test_repr(self): + self.assertEqual(repr(QuarterEnd()), "") + self.assertEqual(repr(QuarterEnd(startingMonth=3)), "") + self.assertEqual(repr(QuarterEnd(startingMonth=1)), "") + + def test_isAnchored(self): + self.assertTrue(QuarterEnd(startingMonth=1).isAnchored()) + self.assertTrue(QuarterEnd().isAnchored()) + self.assertFalse(QuarterEnd(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((QuarterEnd(startingMonth=1), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), })) + + tests.append((QuarterEnd(startingMonth=2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 
29), + datetime(2008, 2, 29): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 5, 31), + datetime(2008, 3, 31): datetime(2008, 5, 31), + datetime(2008, 4, 15): datetime(2008, 5, 31), + datetime(2008, 4, 30): datetime(2008, 5, 31), })) + + tests.append((QuarterEnd(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), })) + + tests.append((QuarterEnd(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + datetime(2008, 7, 1): datetime(2008, 4, 30)})) + + tests.append((QuarterEnd(startingMonth=1, n=2), + {datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + # corner + offset = QuarterEnd(n=-1, startingMonth=1) + self.assertEqual(datetime(2010, 2, 1) + offset, datetime(2010, 1, 31)) + + def test_onOffset(self): + + tests = [(QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (QuarterEnd( + 1, startingMonth=1), datetime(2007, 12, 31), False), + (QuarterEnd( + 1, startingMonth=1), datetime(2008, 2, 29), False), + (QuarterEnd( + 1, startingMonth=1), datetime(2007, 3, 30), False), + (QuarterEnd( + 1, startingMonth=1), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (QuarterEnd( + 1, startingMonth=1), datetime(2008, 5, 30), False), + (QuarterEnd( + 1, startingMonth=1), datetime(2008, 5, 31), False), + (QuarterEnd( + 1, startingMonth=1), datetime(2007, 6, 29), False), + (QuarterEnd( + 1, startingMonth=1), datetime(2007, 6, 30), False), + + (QuarterEnd( + 1, startingMonth=2), datetime(2008, 1, 31), False), + (QuarterEnd( + 1, startingMonth=2), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (QuarterEnd( + 1, startingMonth=2), datetime(2007, 3, 30), False), + (QuarterEnd( + 1, startingMonth=2), datetime(2007, 3, 31), False), + (QuarterEnd( + 1, startingMonth=2), datetime(2008, 4, 30), False), + (QuarterEnd( + 1, startingMonth=2), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True), + (QuarterEnd( + 1, startingMonth=2), datetime(2007, 6, 29), False), + (QuarterEnd( + 1, startingMonth=2), datetime(2007, 6, 30), False), + + (QuarterEnd( + 1, startingMonth=3), datetime(2008, 1, 31), False), + (QuarterEnd( + 1, startingMonth=3), datetime(2007, 12, 31), True), + (QuarterEnd( + 1, startingMonth=3), datetime(2008, 2, 29), False), + (QuarterEnd( + 1, startingMonth=3), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 
3, 31), True), + (QuarterEnd( + 1, startingMonth=3), datetime(2008, 4, 30), False), + (QuarterEnd( + 1, startingMonth=3), datetime(2008, 5, 30), False), + (QuarterEnd( + 1, startingMonth=3), datetime(2008, 5, 31), False), + (QuarterEnd( + 1, startingMonth=3), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + +class TestBYearBegin(Base): + _offset = BYearBegin + + def test_misspecified(self): + self.assertRaises(ValueError, BYearBegin, month=13) + self.assertRaises(ValueError, BYearEnd, month=13) + + def test_offset(self): + tests = [] + + tests.append((BYearBegin(), + {datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2011, 1, 1): datetime(2011, 1, 3), + datetime(2011, 1, 3): datetime(2012, 1, 2), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2) + } + )) + + tests.append((BYearBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2), })) + + tests.append((BYearBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 1, 2), + datetime(2009, 1, 4): datetime(2009, 1, 1), + datetime(2009, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 2), + datetime(2006, 12, 30): datetime(2006, 1, 2), + datetime(2006, 1, 1): datetime(2005, 1, 3), })) + + tests.append((BYearBegin(-2), + {datetime(2007, 1, 1): datetime(2005, 1, 3), + datetime(2007, 6, 30): datetime(2006, 1, 2), + datetime(2008, 12, 31): datetime(2007, 1, 1), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + +class TestYearBegin(Base): + _offset = YearBegin + + def test_misspecified(self): + self.assertRaises(ValueError, YearBegin, month=13) + + def test_offset(self): + tests = [] + + tests.append((YearBegin(), + {datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), })) + + tests.append((YearBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), })) + + tests.append((YearBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 1, 1), + datetime(2007, 1, 15): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 1), + datetime(2006, 12, 30): datetime(2006, 1, 1), + datetime(2007, 1, 1): datetime(2006, 1, 1), })) + + tests.append((YearBegin(-2), + {datetime(2007, 1, 1): datetime(2005, 1, 1), + datetime(2008, 6, 30): datetime(2007, 1, 1), + datetime(2008, 12, 31): datetime(2007, 1, 1), })) + + tests.append((YearBegin(month=4), + {datetime(2007, 4, 1): datetime(2008, 4, 1), + datetime(2007, 4, 15): datetime(2008, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 
31): datetime(2012, 4, 1), })) + + tests.append((YearBegin(0, month=4), + {datetime(2007, 4, 1): datetime(2007, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1), })) + + tests.append((YearBegin(-1, month=4), + {datetime(2007, 4, 1): datetime(2006, 4, 1), + datetime(2007, 3, 1): datetime(2006, 4, 1), + datetime(2007, 12, 15): datetime(2007, 4, 1), + datetime(2012, 1, 31): datetime(2011, 4, 1), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [ + (YearBegin(), datetime(2007, 1, 3), False), + (YearBegin(), datetime(2008, 1, 1), True), + (YearBegin(), datetime(2006, 12, 31), False), + (YearBegin(), datetime(2006, 1, 2), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + +class TestBYearEndLagged(Base): + + def test_bad_month_fail(self): + self.assertRaises(Exception, BYearEnd, month=13) + self.assertRaises(Exception, BYearEnd, month=0) + + def test_offset(self): + tests = [] + + tests.append((BYearEnd(month=6), + {datetime(2008, 1, 1): datetime(2008, 6, 30), + datetime(2007, 6, 30): datetime(2008, 6, 30)}, + )) + + tests.append((BYearEnd(n=-1, month=6), + {datetime(2008, 1, 1): datetime(2007, 6, 29), + datetime(2007, 6, 30): datetime(2007, 6, 29)}, + )) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + self.assertEqual(base + offset, expected) + + def test_roll(self): + offset = BYearEnd(month=6) + date = datetime(2009, 11, 30) + + self.assertEqual(offset.rollforward(date), datetime(2010, 6, 30)) + self.assertEqual(offset.rollback(date), datetime(2009, 6, 30)) + + def test_onOffset(self): + + tests = [ + (BYearEnd(month=2), datetime(2007, 2, 28), True), + (BYearEnd(month=6), datetime(2007, 6, 30), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + +class TestBYearEnd(Base): + _offset = BYearEnd + + def test_offset(self): + tests = [] + + tests.append((BYearEnd(), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2006, 12, 29), + datetime(2005, 12, 31): datetime(2006, 12, 29), })) + + tests.append((BYearEnd(0), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 29), })) + + tests.append((BYearEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29), })) + + tests.append((BYearEnd(-2), + {datetime(2007, 1, 1): datetime(2005, 12, 30), + datetime(2008, 6, 30): datetime(2006, 12, 29), + datetime(2008, 12, 31): datetime(2006, 12, 29), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [ + (BYearEnd(), datetime(2007, 12, 31), True), + (BYearEnd(), datetime(2008, 1, 1), False), + (BYearEnd(), datetime(2006, 12, 31), False), + (BYearEnd(), datetime(2006, 12, 29), True), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, 
expected) + + +class TestYearEnd(Base): + _offset = YearEnd + + def test_misspecified(self): + self.assertRaises(ValueError, YearEnd, month=13) + + def test_offset(self): + tests = [] + + tests.append((YearEnd(), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 31), })) + + tests.append((YearEnd(0), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), })) + + tests.append((YearEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 31), + datetime(2006, 12, 30): datetime(2005, 12, 31), + datetime(2007, 1, 1): datetime(2006, 12, 31), })) + + tests.append((YearEnd(-2), + {datetime(2007, 1, 1): datetime(2005, 12, 31), + datetime(2008, 6, 30): datetime(2006, 12, 31), + datetime(2008, 12, 31): datetime(2006, 12, 31), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [ + (YearEnd(), datetime(2007, 12, 31), True), + (YearEnd(), datetime(2008, 1, 1), False), + (YearEnd(), datetime(2006, 12, 31), True), + (YearEnd(), datetime(2006, 12, 29), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + +class TestYearEndDiffMonth(Base): + + def test_offset(self): + tests = [] + + tests.append((YearEnd(month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 15): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2009, 3, 31), + datetime(2008, 3, 30): datetime(2008, 3, 31), + datetime(2005, 3, 31): datetime(2006, 3, 31), + datetime(2006, 7, 30): datetime(2007, 3, 31)})) + + tests.append((YearEnd(0, month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 28): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2008, 3, 31), + datetime(2005, 3, 30): datetime(2005, 3, 31), })) + + tests.append((YearEnd(-1, month=3), + {datetime(2007, 1, 1): datetime(2006, 3, 31), + datetime(2008, 2, 28): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2007, 3, 31), + datetime(2006, 3, 29): datetime(2005, 3, 31), + datetime(2006, 3, 30): datetime(2005, 3, 31), + datetime(2007, 3, 1): datetime(2006, 3, 31), })) + + tests.append((YearEnd(-2, month=3), + {datetime(2007, 1, 1): datetime(2005, 3, 31), + datetime(2008, 6, 30): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2006, 3, 31), })) + + for offset, cases in tests: + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [ + (YearEnd(month=3), datetime(2007, 3, 31), True), + (YearEnd(month=3), datetime(2008, 1, 1), False), + (YearEnd(month=3), datetime(2006, 3, 31), True), + (YearEnd(month=3), datetime(2006, 3, 29), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + +def assertEq(offset, base, expected): + actual = offset + base + actual_swapped = base + offset + actual_apply = offset.apply(base) + try: + assert actual == expected + assert actual_swapped == expected + assert actual_apply == expected + except AssertionError: + raise AssertionError("\nExpected: 
%s\nActual: %s\nFor Offset: %s)" + "\nAt Date: %s" % + (expected, actual, offset, base)) + +def test_Easter(): + assertEq(Easter(), datetime(2010, 1, 1), datetime(2010, 4, 4)) + assertEq(Easter(), datetime(2010, 4, 5), datetime(2011, 4, 24)) + assertEq(Easter(2), datetime(2010, 1, 1), datetime(2011, 4, 24)) + + assertEq(Easter(), datetime(2010, 4, 4), datetime(2011, 4, 24)) + assertEq(Easter(2), datetime(2010, 4, 4), datetime(2012, 4, 8)) + + assertEq(-Easter(), datetime(2011, 1, 1), datetime(2010, 4, 4)) + assertEq(-Easter(), datetime(2010, 4, 5), datetime(2010, 4, 4)) + assertEq(-Easter(2), datetime(2011, 1, 1), datetime(2009, 4, 12)) + + assertEq(-Easter(), datetime(2010, 4, 4), datetime(2009, 4, 12)) + assertEq(-Easter(2), datetime(2010, 4, 4), datetime(2008, 3, 23)) + +def test_Hour(): + assertEq(Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 1)) + assertEq(Hour(-1), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + assertEq(2 * Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 2)) + assertEq(-1 * Hour(), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + + assert (Hour(3) + Hour(2)) == Hour(5) + assert (Hour(3) - Hour(2)) == Hour() + + assert(Hour(4) != Hour(1)) + + assert not Hour().isAnchored() + + +def test_Minute(): + assertEq(Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1)) + assertEq(Minute(-1), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + assertEq(2 * Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2)) + assertEq(-1 * Minute(), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + + assert (Minute(3) + Minute(2)) == Minute(5) + assert (Minute(3) - Minute(2)) == Minute() + assert(Minute(5) != Minute()) + + assert not Minute().isAnchored() + + +def test_Second(): + assertEq(Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 1)) + assertEq(Second(-1), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1)) + assertEq(2 * Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 2)) + assertEq( + -1 * Second(), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1)) + + assert (Second(3) + Second(2)) == Second(5) + assert (Second(3) - Second(2)) == Second() + + assert not Second().isAnchored() + + +def test_Millisecond(): + assertEq(Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1000)) + assertEq(Milli(-1), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1)) + assertEq(Milli(2), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000)) + assertEq(2 * Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000)) + assertEq(-1 * Milli(), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1)) + + assert (Milli(3) + Milli(2)) == Milli(5) + assert (Milli(3) - Milli(2)) == Milli() + + +def test_MillisecondTimestampArithmetic(): + assertEq(Milli(), Timestamp('2010-01-01'), Timestamp('2010-01-01 00:00:00.001')) + assertEq(Milli(-1), Timestamp('2010-01-01 00:00:00.001'), Timestamp('2010-01-01')) + + +def test_Microsecond(): + assertEq(Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1)) + assertEq(Micro(-1), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1)) + assertEq(2 * Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2)) + assertEq(-1 * Micro(), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1)) + + assert (Micro(3) + Micro(2)) == Micro(5) + assert (Micro(3) - Micro(2)) == Micro() + + +def test_NanosecondGeneric(): + tm._skip_if_not_numpy17_friendly() + + timestamp = Timestamp(datetime(2010, 1, 1)) + assert timestamp.nanosecond == 0 + + result = timestamp + Nano(10) + assert result.nanosecond 
== 10 + + reverse_result = Nano(10) + timestamp + assert reverse_result.nanosecond == 10 + + +def test_Nanosecond(): + tm._skip_if_not_numpy17_friendly() + + timestamp = Timestamp(datetime(2010, 1, 1)) + assertEq(Nano(), timestamp, timestamp + np.timedelta64(1, 'ns')) + assertEq(Nano(-1), timestamp + np.timedelta64(1, 'ns'), timestamp) + assertEq(2 * Nano(), timestamp, timestamp + np.timedelta64(2, 'ns')) + assertEq(-1 * Nano(), timestamp + np.timedelta64(1, 'ns'), timestamp) + + assert (Nano(3) + Nano(2)) == Nano(5) + assert (Nano(3) - Nano(2)) == Nano() + + +def test_tick_offset(): + assert not Day().isAnchored() + assert not Milli().isAnchored() + assert not Micro().isAnchored() + assert not Nano().isAnchored() + + +def test_compare_ticks(): + offsets = [Hour, Minute, Second, Milli, Micro] + + for kls in offsets: + three = kls(3) + four = kls(4) + + for _ in range(10): + assert(three < kls(4)) + assert(kls(3) < four) + assert(four > kls(3)) + assert(kls(4) > three) + assert(kls(3) == kls(3)) + assert(kls(3) != kls(4)) + + +class TestOffsetNames(tm.TestCase): + def test_get_offset_name(self): + assertRaisesRegexp(ValueError, 'Bad rule.*BusinessDays', get_offset_name, BDay(2)) + + assert get_offset_name(BDay()) == 'B' + assert get_offset_name(BMonthEnd()) == 'BM' + assert get_offset_name(Week(weekday=0)) == 'W-MON' + assert get_offset_name(Week(weekday=1)) == 'W-TUE' + assert get_offset_name(Week(weekday=2)) == 'W-WED' + assert get_offset_name(Week(weekday=3)) == 'W-THU' + assert get_offset_name(Week(weekday=4)) == 'W-FRI' + + self.assertEqual(get_offset_name(LastWeekOfMonth(weekday=WeekDay.SUN)), "LWOM-SUN") + self.assertEqual(get_offset_name(makeFY5253LastOfMonthQuarter(weekday=1, startingMonth=3, qtr_with_extra_week=4)),"REQ-L-MAR-TUE-4") + self.assertEqual(get_offset_name(makeFY5253NearestEndMonthQuarter(weekday=1, startingMonth=3, qtr_with_extra_week=3)), "REQ-N-MAR-TUE-3") + +def test_get_offset(): + assertRaisesRegexp(ValueError, "rule.*GIBBERISH", get_offset, 'gibberish') + assertRaisesRegexp(ValueError, "rule.*QS-JAN-B", get_offset, 'QS-JAN-B') + pairs = [ + ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), + ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), + ('W-TUE', Week(weekday=1)), ('W-WED', Week(weekday=2)), + ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4)), + ('w@Sat', Week(weekday=5)), + ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), + ("RE-L-DEC-TUE", makeFY5253LastOfMonth(weekday=1, startingMonth=12)), + ("REQ-L-MAR-TUE-4", makeFY5253LastOfMonthQuarter(weekday=1, startingMonth=3, qtr_with_extra_week=4)), + ("REQ-L-DEC-MON-3", makeFY5253LastOfMonthQuarter(weekday=0, startingMonth=12, qtr_with_extra_week=3)), + ("REQ-N-DEC-MON-3", makeFY5253NearestEndMonthQuarter(weekday=0, startingMonth=12, qtr_with_extra_week=3)), + ] + + for name, expected in pairs: + offset = get_offset(name) + assert offset == expected, ("Expected %r to yield %r (actual: %r)" % + (name, expected, offset)) + + +def test_parse_time_string(): + (date, parsed, reso) = parse_time_string('4Q1984') + (date_lower, parsed_lower, reso_lower) = parse_time_string('4q1984') + assert date == date_lower + assert parsed == parsed_lower + assert reso == reso_lower + + +def test_get_standard_freq(): + fstr = get_standard_freq('W') + assert fstr == get_standard_freq('w') + assert fstr == get_standard_freq('1w') + assert fstr == get_standard_freq(('W', 1)) + assert fstr == get_standard_freq('WeEk') + + fstr = get_standard_freq('5Q') + assert fstr == get_standard_freq('5q') + assert 
fstr == get_standard_freq('5QuarTer') + assert fstr == get_standard_freq(('q', 5)) + + +def test_quarterly_dont_normalize(): + date = datetime(2012, 3, 31, 5, 30) + + offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin) + + for klass in offsets: + result = date + klass() + assert(result.time() == date.time()) + + +class TestOffsetAliases(tm.TestCase): + + def setUp(self): + _offset_map.clear() + + def test_alias_equality(self): + for k, v in compat.iteritems(_offset_map): + if v is None: + continue + self.assertEqual(k, v.copy()) + + def test_rule_code(self): + lst = ['M', 'MS', 'BM', 'BMS', 'D', 'B', 'H', 'T', 'S', 'L', 'U'] + for k in lst: + self.assertEqual(k, get_offset(k).rule_code) + # should be cached - this is kind of an internals test... + assert k in _offset_map + self.assertEqual(k, (get_offset(k) * 3).rule_code) + + suffix_lst = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] + base = 'W' + for v in suffix_lst: + alias = '-'.join([base, v]) + self.assertEqual(alias, get_offset(alias).rule_code) + self.assertEqual(alias, (get_offset(alias) * 5).rule_code) + + suffix_lst = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', + 'SEP', 'OCT', 'NOV', 'DEC'] + base_lst = ['A', 'AS', 'BA', 'BAS', 'Q', 'QS', 'BQ', 'BQS'] + for base in base_lst: + for v in suffix_lst: + alias = '-'.join([base, v]) + self.assertEqual(alias, get_offset(alias).rule_code) + self.assertEqual(alias, (get_offset(alias) * 5).rule_code) + + +def test_apply_ticks(): + result = offsets.Hour(3).apply(offsets.Hour(4)) + exp = offsets.Hour(7) + assert(result == exp) + + +def test_delta_to_tick(): + delta = timedelta(3) + + tick = offsets._delta_to_tick(delta) + assert(tick == offsets.Day(3)) + + +def test_dateoffset_misc(): + oset = offsets.DateOffset(months=2, days=4) + # it works + result = oset.freqstr + + assert(not offsets.DateOffset(months=2) == 2) + + +def test_freq_offsets(): + off = BDay(1, offset=timedelta(0, 1800)) + assert(off.freqstr == 'B+30Min') + + off = BDay(1, offset=timedelta(0, -1800)) + assert(off.freqstr == 'B-30Min') + + +def get_all_subclasses(cls): + ret = set() + this_subclasses = cls.__subclasses__() + ret = ret | set(this_subclasses) + for this_subclass in this_subclasses: + ret | get_all_subclasses(this_subclass) + return ret + +class TestCaching(tm.TestCase): + + # as of GH 6479 (in 0.14.0), offset caching is turned off + # as of v0.12.0 only BusinessMonth/Quarter were actually caching + + def setUp(self): + _daterange_cache.clear() + _offset_map.clear() + + def run_X_index_creation(self, cls): + inst1 = cls() + if not inst1.isAnchored(): + self.assertFalse(inst1._should_cache(), cls) + return + + self.assertTrue(inst1._should_cache(), cls) + + DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=inst1, normalize=True) + self.assertTrue(cls() in _daterange_cache, cls) + + def test_should_cache_month_end(self): + self.assertFalse(MonthEnd()._should_cache()) + + def test_should_cache_bmonth_end(self): + self.assertFalse(BusinessMonthEnd()._should_cache()) + + def test_should_cache_week_month(self): + self.assertFalse(WeekOfMonth(weekday=1, week=2)._should_cache()) + + def test_all_cacheableoffsets(self): + for subclass in get_all_subclasses(CacheableOffset): + if subclass.__name__[0] == "_" \ + or subclass in TestCaching.no_simple_ctr: + continue + self.run_X_index_creation(subclass) + + def test_month_end_index_creation(self): + DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=MonthEnd(), normalize=True) + 
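+        # Building the index is what would have populated the cache; since
+        # GH 6479 turned offset caching off, MonthEnd() must not appear in
+        # _daterange_cache, which the next assertion verifies.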
self.assertFalse(MonthEnd() in _daterange_cache) + + def test_bmonth_end_index_creation(self): + DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=BusinessMonthEnd(), normalize=True) + self.assertFalse(BusinessMonthEnd() in _daterange_cache) + + def test_week_of_month_index_creation(self): + inst1 = WeekOfMonth(weekday=1, week=2) + DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=inst1, normalize=True) + inst2 = WeekOfMonth(weekday=1, week=2) + self.assertFalse(inst2 in _daterange_cache) + +class TestReprNames(tm.TestCase): + def test_str_for_named_is_name(self): + # look at all the amazing combinations! + month_prefixes = ['A', 'AS', 'BA', 'BAS', 'Q', 'BQ', 'BQS', 'QS'] + names = [prefix + '-' + month for prefix in month_prefixes + for month in ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', + 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']] + days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] + names += ['W-' + day for day in days] + names += ['WOM-' + week + day for week in ('1', '2', '3', '4') + for day in days] + #singletons + names += ['S', 'T', 'U', 'BM', 'BMS', 'BQ', 'QS'] # No 'Q' + _offset_map.clear() + for name in names: + offset = get_offset(name) + self.assertEqual(repr(offset), name) + self.assertEqual(str(offset), name) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py new file mode 100644 index 00000000..53375b4d --- /dev/null +++ b/pandas/tseries/tests/test_period.py @@ -0,0 +1,2663 @@ +"""Tests suite for Period handling. + +Parts derived from scikits.timeseries code, original authors: +- Pierre Gerard-Marchant & Matt Knox +- pierregm_at_uga_dot_edu - mattknow_ca_at_hotmail_dot_com + +""" + +from datetime import datetime, date, timedelta + +from numpy.ma.testutils import assert_equal + +from pandas import Timestamp +from pandas.tseries.frequencies import MONTHS, DAYS, _period_code_map +from pandas.tseries.period import Period, PeriodIndex, period_range +from pandas.tseries.index import DatetimeIndex, date_range, Index +from pandas.tseries.tools import to_datetime +import pandas.tseries.period as pmod + +import pandas.core.datetools as datetools +import pandas as pd +import numpy as np +from numpy.random import randn +from pandas.compat import range, lrange, lmap, zip + +from pandas import Series, TimeSeries, DataFrame, _np_version_under1p9 +from pandas import tslib +from pandas.util.testing import(assert_series_equal, assert_almost_equal, + assertRaisesRegexp) +import pandas.util.testing as tm +from pandas import compat +from numpy.testing import assert_array_equal + + +class TestPeriodProperties(tm.TestCase): + "Test properties such as year, month, weekday, etc...." 
+ # + + def test_quarterly_negative_ordinals(self): + p = Period(ordinal=-1, freq='Q-DEC') + self.assertEqual(p.year, 1969) + self.assertEqual(p.quarter, 4) + + p = Period(ordinal=-2, freq='Q-DEC') + self.assertEqual(p.year, 1969) + self.assertEqual(p.quarter, 3) + + p = Period(ordinal=-2, freq='M') + self.assertEqual(p.year, 1969) + self.assertEqual(p.month, 11) + + def test_period_cons_quarterly(self): + # bugs in scikits.timeseries + for month in MONTHS: + freq = 'Q-%s' % month + exp = Period('1989Q3', freq=freq) + self.assertIn('1989Q3', str(exp)) + stamp = exp.to_timestamp('D', how='end') + p = Period(stamp, freq=freq) + self.assertEqual(p, exp) + + def test_period_cons_annual(self): + # bugs in scikits.timeseries + for month in MONTHS: + freq = 'A-%s' % month + exp = Period('1989', freq=freq) + stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) + p = Period(stamp, freq=freq) + self.assertEqual(p, exp + 1) + + def test_period_cons_weekly(self): + for num in range(10, 17): + daystr = '2011-02-%d' % num + for day in DAYS: + freq = 'W-%s' % day + + result = Period(daystr, freq=freq) + expected = Period(daystr, freq='D').asfreq(freq) + self.assertEqual(result, expected) + + def test_period_cons_nat(self): + p = Period('NaT', freq='M') + self.assertEqual(p.ordinal, tslib.iNaT) + self.assertEqual(p.freq, 'M') + + p = Period('nat', freq='W-SUN') + self.assertEqual(p.ordinal, tslib.iNaT) + self.assertEqual(p.freq, 'W-SUN') + + p = Period(tslib.iNaT, freq='D') + self.assertEqual(p.ordinal, tslib.iNaT) + self.assertEqual(p.freq, 'D') + + self.assertRaises(ValueError, Period, 'NaT') + + def test_timestamp_tz_arg(self): + import pytz + p = Period('1/1/2005', freq='M').to_timestamp(tz='Europe/Brussels') + self.assertEqual(p.tz, + pytz.timezone('Europe/Brussels').normalize(p).tzinfo) + + def test_timestamp_tz_arg_dateutil(self): + import dateutil + from pandas.tslib import maybe_get_tz + p = Period('1/1/2005', freq='M').to_timestamp(tz=maybe_get_tz('dateutil/Europe/Brussels')) + self.assertEqual(p.tz, dateutil.tz.gettz('Europe/Brussels')) + + def test_timestamp_tz_arg_dateutil_from_string(self): + import dateutil + p = Period('1/1/2005', freq='M').to_timestamp(tz='dateutil/Europe/Brussels') + self.assertEqual(p.tz, dateutil.tz.gettz('Europe/Brussels')) + + def test_timestamp_nat_tz(self): + t = Period('NaT', freq='M').to_timestamp() + self.assertTrue(t is tslib.NaT) + + t = Period('NaT', freq='M').to_timestamp(tz='Asia/Tokyo') + self.assertTrue(t is tslib.NaT) + + def test_period_constructor(self): + i1 = Period('1/1/2005', freq='M') + i2 = Period('Jan 2005') + + self.assertEqual(i1, i2) + + i1 = Period('2005', freq='A') + i2 = Period('2005') + i3 = Period('2005', freq='a') + + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + + i4 = Period('2005', freq='M') + i5 = Period('2005', freq='m') + + self.assertRaises(ValueError, i1.__ne__, i4) + self.assertEqual(i4, i5) + + i1 = Period.now('Q') + i2 = Period(datetime.now(), freq='Q') + i3 = Period.now('q') + + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + + # Biz day construction, roll forward if non-weekday + i1 = Period('3/10/12', freq='B') + i2 = Period('3/10/12', freq='D') + self.assertEqual(i1, i2.asfreq('B')) + i2 = Period('3/11/12', freq='D') + self.assertEqual(i1, i2.asfreq('B')) + i2 = Period('3/12/12', freq='D') + self.assertEqual(i1, i2.asfreq('B')) + + i3 = Period('3/10/12', freq='b') + self.assertEqual(i1, i3) + + i1 = Period(year=2005, quarter=1, freq='Q') + i2 = Period('1/1/2005', freq='Q') + self.assertEqual(i1, i2) 
+ + i1 = Period(year=2005, quarter=3, freq='Q') + i2 = Period('9/1/2005', freq='Q') + self.assertEqual(i1, i2) + + i1 = Period(year=2005, month=3, day=1, freq='D') + i2 = Period('3/1/2005', freq='D') + self.assertEqual(i1, i2) + + i3 = Period(year=2005, month=3, day=1, freq='d') + self.assertEqual(i1, i3) + + i1 = Period(year=2012, month=3, day=10, freq='B') + i2 = Period('3/12/12', freq='B') + self.assertEqual(i1, i2) + + i1 = Period('2005Q1') + i2 = Period(year=2005, quarter=1, freq='Q') + i3 = Period('2005q1') + self.assertEqual(i1, i2) + self.assertEqual(i1, i3) + + i1 = Period('05Q1') + self.assertEqual(i1, i2) + lower = Period('05q1') + self.assertEqual(i1, lower) + + i1 = Period('1Q2005') + self.assertEqual(i1, i2) + lower = Period('1q2005') + self.assertEqual(i1, lower) + + i1 = Period('1Q05') + self.assertEqual(i1, i2) + lower = Period('1q05') + self.assertEqual(i1, lower) + + i1 = Period('4Q1984') + self.assertEqual(i1.year, 1984) + lower = Period('4q1984') + self.assertEqual(i1, lower) + + i1 = Period('1982', freq='min') + i2 = Period('1982', freq='MIN') + self.assertEqual(i1, i2) + i2 = Period('1982', freq=('Min', 1)) + self.assertEqual(i1, i2) + + expected = Period('2007-01', freq='M') + i1 = Period('200701', freq='M') + self.assertEqual(i1, expected) + + i1 = Period('200701', freq='M') + self.assertEqual(i1, expected) + + i1 = Period(200701, freq='M') + self.assertEqual(i1, expected) + + i1 = Period(ordinal=200701, freq='M') + self.assertEqual(i1.year, 18695) + + i1 = Period(datetime(2007, 1, 1), freq='M') + i2 = Period('200701', freq='M') + self.assertEqual(i1, i2) + + i1 = Period(date(2007, 1, 1), freq='M') + i2 = Period(datetime(2007, 1, 1), freq='M') + self.assertEqual(i1, i2) + + i1 = Period('2007-01-01 09:00:00.001') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + self.assertEqual(i1, expected) + + i1 = Period('2007-01-01 09:00:00.00101') + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + self.assertEqual(i1, expected) + + self.assertRaises(ValueError, Period, ordinal=200701) + + self.assertRaises(ValueError, Period, '2007-1-1', freq='X') + + def test_freq_str(self): + i1 = Period('1982', freq='Min') + self.assertNotEqual(i1.freq[0], '1') + + def test_repr(self): + p = Period('Jan-2000') + self.assertIn('2000-01', repr(p)) + + p = Period('2000-12-15') + self.assertIn('2000-12-15', repr(p)) + + def test_repr_nat(self): + p = Period('nat', freq='M') + self.assertIn(repr(tslib.NaT), repr(p)) + + def test_millisecond_repr(self): + p = Period('2000-01-01 12:15:02.123') + + self.assertEqual("Period('2000-01-01 12:15:02.123', 'L')", repr(p)) + + def test_microsecond_repr(self): + p = Period('2000-01-01 12:15:02.123567') + + self.assertEqual("Period('2000-01-01 12:15:02.123567', 'U')", repr(p)) + + def test_strftime(self): + p = Period('2000-1-1 12:34:12', freq='S') + res = p.strftime('%Y-%m-%d %H:%M:%S') + self.assertEqual(res, '2000-01-01 12:34:12') + tm.assert_isinstance(res, compat.text_type) # GH3363 + + def test_sub_delta(self): + left, right = Period('2011', freq='A'), Period('2007', freq='A') + result = left - right + self.assertEqual(result, 4) + + self.assertRaises(ValueError, left.__sub__, + Period('2007-01', freq='M')) + + def test_to_timestamp(self): + p = Period('1982', freq='A') + start_ts = p.to_timestamp(how='S') + aliases = ['s', 'StarT', 'BEGIn'] + for a in aliases: + self.assertEqual(start_ts, p.to_timestamp('D', how=a)) + + end_ts = p.to_timestamp(how='E') + aliases = ['e', 'end', 'FINIsH'] + for a in aliases: + 
self.assertEqual(end_ts, p.to_timestamp('D', how=a)) + + from_lst = ['A', 'Q', 'M', 'W', 'B', + 'D', 'H', 'Min', 'S'] + + def _ex(p): + return Timestamp((p + 1).start_time.value - 1) + + for i, fcode in enumerate(from_lst): + p = Period('1982', freq=fcode) + result = p.to_timestamp().to_period(fcode) + self.assertEqual(result, p) + + self.assertEqual(p.start_time, p.to_timestamp(how='S')) + + self.assertEqual(p.end_time, _ex(p)) + + # Frequency other than daily + + p = Period('1985', freq='A') + + result = p.to_timestamp('H', how='end') + expected = datetime(1985, 12, 31, 23) + self.assertEqual(result, expected) + + result = p.to_timestamp('T', how='end') + expected = datetime(1985, 12, 31, 23, 59) + self.assertEqual(result, expected) + + result = p.to_timestamp(how='end') + expected = datetime(1985, 12, 31) + self.assertEqual(result, expected) + + expected = datetime(1985, 1, 1) + result = p.to_timestamp('H', how='start') + self.assertEqual(result, expected) + result = p.to_timestamp('T', how='start') + self.assertEqual(result, expected) + result = p.to_timestamp('S', how='start') + self.assertEqual(result, expected) + + assertRaisesRegexp(ValueError, 'Only mult == 1', p.to_timestamp, '5t') + + p = Period('NaT', freq='W') + self.assertTrue(p.to_timestamp() is tslib.NaT) + + def test_start_time(self): + freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S'] + xp = datetime(2012, 1, 1) + for f in freq_lst: + p = Period('2012', freq=f) + self.assertEqual(p.start_time, xp) + self.assertEqual(Period('2012', freq='B').start_time, + datetime(2012, 1, 2)) + self.assertEqual(Period('2012', freq='W').start_time, + datetime(2011, 12, 26)) + + p = Period('NaT', freq='W') + self.assertTrue(p.start_time is tslib.NaT) + + def test_end_time(self): + p = Period('2012', freq='A') + + def _ex(*args): + return Timestamp(Timestamp(datetime(*args)).value - 1) + + xp = _ex(2013, 1, 1) + self.assertEqual(xp, p.end_time) + + p = Period('2012', freq='Q') + xp = _ex(2012, 4, 1) + self.assertEqual(xp, p.end_time) + + p = Period('2012', freq='M') + xp = _ex(2012, 2, 1) + self.assertEqual(xp, p.end_time) + + xp = _ex(2012, 1, 2) + p = Period('2012', freq='D') + self.assertEqual(p.end_time, xp) + + xp = _ex(2012, 1, 1, 1) + p = Period('2012', freq='H') + self.assertEqual(p.end_time, xp) + + xp = _ex(2012, 1, 3) + self.assertEqual(Period('2012', freq='B').end_time, xp) + + xp = _ex(2012, 1, 2) + self.assertEqual(Period('2012', freq='W').end_time, xp) + + p = Period('NaT', freq='W') + self.assertTrue(p.end_time is tslib.NaT) + + def test_anchor_week_end_time(self): + def _ex(*args): + return Timestamp(Timestamp(datetime(*args)).value - 1) + + p = Period('2013-1-1', 'W-SAT') + xp = _ex(2013, 1, 6) + self.assertEqual(p.end_time, xp) + + def test_properties_annually(self): + # Test properties on Periods with annually frequency. + a_date = Period(freq='A', year=2007) + assert_equal(a_date.year, 2007) + + def test_properties_quarterly(self): + # Test properties on Periods with daily frequency. + qedec_date = Period(freq="Q-DEC", year=2007, quarter=1) + qejan_date = Period(freq="Q-JAN", year=2007, quarter=1) + qejun_date = Period(freq="Q-JUN", year=2007, quarter=1) + # + for x in range(3): + for qd in (qedec_date, qejan_date, qejun_date): + assert_equal((qd + x).qyear, 2007) + assert_equal((qd + x).quarter, x + 1) + + def test_properties_monthly(self): + # Test properties on Periods with daily frequency. 
+ m_date = Period(freq='M', year=2007, month=1) + for x in range(11): + m_ival_x = m_date + x + assert_equal(m_ival_x.year, 2007) + if 1 <= x + 1 <= 3: + assert_equal(m_ival_x.quarter, 1) + elif 4 <= x + 1 <= 6: + assert_equal(m_ival_x.quarter, 2) + elif 7 <= x + 1 <= 9: + assert_equal(m_ival_x.quarter, 3) + elif 10 <= x + 1 <= 12: + assert_equal(m_ival_x.quarter, 4) + assert_equal(m_ival_x.month, x + 1) + + def test_properties_weekly(self): + # Test properties on Periods with daily frequency. + w_date = Period(freq='WK', year=2007, month=1, day=7) + # + assert_equal(w_date.year, 2007) + assert_equal(w_date.quarter, 1) + assert_equal(w_date.month, 1) + assert_equal(w_date.week, 1) + assert_equal((w_date - 1).week, 52) + + def test_properties_daily(self): + # Test properties on Periods with daily frequency. + b_date = Period(freq='B', year=2007, month=1, day=1) + # + assert_equal(b_date.year, 2007) + assert_equal(b_date.quarter, 1) + assert_equal(b_date.month, 1) + assert_equal(b_date.day, 1) + assert_equal(b_date.weekday, 0) + assert_equal(b_date.dayofyear, 1) + # + d_date = Period(freq='D', year=2007, month=1, day=1) + # + assert_equal(d_date.year, 2007) + assert_equal(d_date.quarter, 1) + assert_equal(d_date.month, 1) + assert_equal(d_date.day, 1) + assert_equal(d_date.weekday, 0) + assert_equal(d_date.dayofyear, 1) + + def test_properties_hourly(self): + # Test properties on Periods with hourly frequency. + h_date = Period(freq='H', year=2007, month=1, day=1, hour=0) + # + assert_equal(h_date.year, 2007) + assert_equal(h_date.quarter, 1) + assert_equal(h_date.month, 1) + assert_equal(h_date.day, 1) + assert_equal(h_date.weekday, 0) + assert_equal(h_date.dayofyear, 1) + assert_equal(h_date.hour, 0) + # + + def test_properties_minutely(self): + # Test properties on Periods with minutely frequency. + t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=0) + # + assert_equal(t_date.quarter, 1) + assert_equal(t_date.month, 1) + assert_equal(t_date.day, 1) + assert_equal(t_date.weekday, 0) + assert_equal(t_date.dayofyear, 1) + assert_equal(t_date.hour, 0) + assert_equal(t_date.minute, 0) + + def test_properties_secondly(self): + # Test properties on Periods with secondly frequency. 
+ s_date = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + # + assert_equal(s_date.year, 2007) + assert_equal(s_date.quarter, 1) + assert_equal(s_date.month, 1) + assert_equal(s_date.day, 1) + assert_equal(s_date.weekday, 0) + assert_equal(s_date.dayofyear, 1) + assert_equal(s_date.hour, 0) + assert_equal(s_date.minute, 0) + assert_equal(s_date.second, 0) + + def test_properties_nat(self): + p_nat = Period('NaT', freq='M') + t_nat = pd.Timestamp('NaT') + # confirm Period('NaT') work identical with Timestamp('NaT') + for f in ['year', 'month', 'day', 'hour', 'minute', 'second', + 'week', 'dayofyear', 'quarter']: + self.assertEqual(getattr(p_nat, f), -1) + self.assertEqual(getattr(t_nat, f), -1) + + for f in ['weekofyear', 'dayofweek', 'weekday', 'qyear']: + self.assertEqual(getattr(p_nat, f), -1) + + def test_pnow(self): + dt = datetime.now() + + val = pmod.pnow('D') + exp = Period(dt, freq='D') + self.assertEqual(val, exp) + + def test_constructor_corner(self): + self.assertRaises(ValueError, Period, year=2007, month=1, + freq='2M') + + self.assertRaises(ValueError, Period, datetime.now()) + self.assertRaises(ValueError, Period, datetime.now().date()) + self.assertRaises(ValueError, Period, 1.6, freq='D') + self.assertRaises(ValueError, Period, ordinal=1.6, freq='D') + self.assertRaises(ValueError, Period, ordinal=2, value=1, freq='D') + self.assertRaises(ValueError, Period) + self.assertRaises(ValueError, Period, month=1) + + p = Period('2007-01-01', freq='D') + + result = Period(p, freq='A') + exp = Period('2007', freq='A') + self.assertEqual(result, exp) + + def test_constructor_infer_freq(self): + p = Period('2007-01-01') + self.assertEqual(p.freq, 'D') + + p = Period('2007-01-01 07') + self.assertEqual(p.freq, 'H') + + p = Period('2007-01-01 07:10') + self.assertEqual(p.freq, 'T') + + p = Period('2007-01-01 07:10:15') + self.assertEqual(p.freq, 'S') + + p = Period('2007-01-01 07:10:15.123') + self.assertEqual(p.freq, 'L') + + p = Period('2007-01-01 07:10:15.123000') + self.assertEqual(p.freq, 'L') + + p = Period('2007-01-01 07:10:15.123400') + self.assertEqual(p.freq, 'U') + + def test_asfreq_MS(self): + initial = Period("2013") + + self.assertEqual(initial.asfreq(freq="M", how="S"), Period('2013-01', 'M')) + self.assertRaises(ValueError, initial.asfreq, freq="MS", how="S") + tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS", pd.Period, '2013-01', 'MS') + self.assertTrue(_period_code_map.get("MS") is None) + +def noWrap(item): + return item + + +class TestFreqConversion(tm.TestCase): + "Test frequency conversion of date objects" + + def test_asfreq_corner(self): + val = Period(freq='A', year=2007) + self.assertRaises(ValueError, val.asfreq, '5t') + + def test_conv_annual(self): + # frequency conversion tests: from Annual Frequency + + ival_A = Period(freq='A', year=2007) + + ival_AJAN = Period(freq="A-JAN", year=2007) + ival_AJUN = Period(freq="A-JUN", year=2007) + ival_ANOV = Period(freq="A-NOV", year=2007) + + ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) + ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) + ival_A_to_M_start = Period(freq='M', year=2007, month=1) + ival_A_to_M_end = Period(freq='M', year=2007, month=12) + ival_A_to_W_start = Period(freq='WK', year=2007, month=1, day=1) + ival_A_to_W_end = Period(freq='WK', year=2007, month=12, day=31) + ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) + ival_A_to_D_start = Period(freq='D', 
year=2007, month=1, day=1) + ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) + ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + + ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) + ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) + ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) + + assert_equal(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) + assert_equal(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) + assert_equal(ival_A.asfreq('M', 's'), ival_A_to_M_start) + assert_equal(ival_A.asfreq('M', 'E'), ival_A_to_M_end) + assert_equal(ival_A.asfreq('WK', 'S'), ival_A_to_W_start) + assert_equal(ival_A.asfreq('WK', 'E'), ival_A_to_W_end) + assert_equal(ival_A.asfreq('B', 'S'), ival_A_to_B_start) + assert_equal(ival_A.asfreq('B', 'E'), ival_A_to_B_end) + assert_equal(ival_A.asfreq('D', 'S'), ival_A_to_D_start) + assert_equal(ival_A.asfreq('D', 'E'), ival_A_to_D_end) + assert_equal(ival_A.asfreq('H', 'S'), ival_A_to_H_start) + assert_equal(ival_A.asfreq('H', 'E'), ival_A_to_H_end) + assert_equal(ival_A.asfreq('min', 'S'), ival_A_to_T_start) + assert_equal(ival_A.asfreq('min', 'E'), ival_A_to_T_end) + assert_equal(ival_A.asfreq('T', 'S'), ival_A_to_T_start) + assert_equal(ival_A.asfreq('T', 'E'), ival_A_to_T_end) + assert_equal(ival_A.asfreq('S', 'S'), ival_A_to_S_start) + assert_equal(ival_A.asfreq('S', 'E'), ival_A_to_S_end) + + assert_equal(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) + assert_equal(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) + + assert_equal(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) + assert_equal(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) + + assert_equal(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) + assert_equal(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) + + assert_equal(ival_A.asfreq('A'), ival_A) + + def test_conv_quarterly(self): + # frequency conversion tests: from Quarterly Frequency + + ival_Q = Period(freq='Q', year=2007, quarter=1) + ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) + + ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) + ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) + + ival_Q_to_A = Period(freq='A', year=2007) + ival_Q_to_M_start = Period(freq='M', year=2007, month=1) + ival_Q_to_M_end = Period(freq='M', year=2007, month=3) + ival_Q_to_W_start = Period(freq='WK', year=2007, month=1, day=1) + ival_Q_to_W_end = Period(freq='WK', year=2007, month=3, day=31) + ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) + ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) + ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, + hour=23) 
+ ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, + hour=23, minute=59, second=59) + + ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) + + ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30) + + assert_equal(ival_Q.asfreq('A'), ival_Q_to_A) + assert_equal(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) + + assert_equal(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) + assert_equal(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) + assert_equal(ival_Q.asfreq('WK', 'S'), ival_Q_to_W_start) + assert_equal(ival_Q.asfreq('WK', 'E'), ival_Q_to_W_end) + assert_equal(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) + assert_equal(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) + assert_equal(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) + assert_equal(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) + assert_equal(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) + assert_equal(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) + assert_equal(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) + assert_equal(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) + assert_equal(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) + assert_equal(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) + + assert_equal(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) + assert_equal(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) + assert_equal(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) + assert_equal(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) + + assert_equal(ival_Q.asfreq('Q'), ival_Q) + + def test_conv_monthly(self): + # frequency conversion tests: from Monthly Frequency + + ival_M = Period(freq='M', year=2007, month=1) + ival_M_end_of_year = Period(freq='M', year=2007, month=12) + ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) + ival_M_to_A = Period(freq='A', year=2007) + ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_M_to_W_start = Period(freq='WK', year=2007, month=1, day=1) + ival_M_to_W_end = Period(freq='WK', year=2007, month=1, day=31) + ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) + ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, + hour=23) + ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, + hour=23, minute=59, second=59) + + assert_equal(ival_M.asfreq('A'), ival_M_to_A) + assert_equal(ival_M_end_of_year.asfreq('A'), ival_M_to_A) + assert_equal(ival_M.asfreq('Q'), ival_M_to_Q) + assert_equal(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) + + assert_equal(ival_M.asfreq('WK', 'S'), ival_M_to_W_start) + assert_equal(ival_M.asfreq('WK', 'E'), ival_M_to_W_end) + assert_equal(ival_M.asfreq('B', 'S'), 
ival_M_to_B_start) + assert_equal(ival_M.asfreq('B', 'E'), ival_M_to_B_end) + assert_equal(ival_M.asfreq('D', 'S'), ival_M_to_D_start) + assert_equal(ival_M.asfreq('D', 'E'), ival_M_to_D_end) + assert_equal(ival_M.asfreq('H', 'S'), ival_M_to_H_start) + assert_equal(ival_M.asfreq('H', 'E'), ival_M_to_H_end) + assert_equal(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) + assert_equal(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) + assert_equal(ival_M.asfreq('S', 'S'), ival_M_to_S_start) + assert_equal(ival_M.asfreq('S', 'E'), ival_M_to_S_end) + + assert_equal(ival_M.asfreq('M'), ival_M) + + def test_conv_weekly(self): + # frequency conversion tests: from Weekly Frequency + + ival_W = Period(freq='WK', year=2007, month=1, day=1) + + ival_WSUN = Period(freq='WK', year=2007, month=1, day=7) + ival_WSAT = Period(freq='WK-SAT', year=2007, month=1, day=6) + ival_WFRI = Period(freq='WK-FRI', year=2007, month=1, day=5) + ival_WTHU = Period(freq='WK-THU', year=2007, month=1, day=4) + ival_WWED = Period(freq='WK-WED', year=2007, month=1, day=3) + ival_WTUE = Period(freq='WK-TUE', year=2007, month=1, day=2) + ival_WMON = Period(freq='WK-MON', year=2007, month=1, day=1) + + ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) + ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) + ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) + ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) + ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) + ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) + ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) + ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) + ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) + ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) + ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) + ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) + + ival_W_end_of_year = Period(freq='WK', year=2007, month=12, day=31) + ival_W_end_of_quarter = Period(freq='WK', year=2007, month=3, day=31) + ival_W_end_of_month = Period(freq='WK', year=2007, month=1, day=31) + ival_W_to_A = Period(freq='A', year=2007) + ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_W_to_M = Period(freq='M', year=2007, month=1) + + if Period(freq='D', year=2007, month=12, day=31).weekday == 6: + ival_W_to_A_end_of_year = Period(freq='A', year=2007) + else: + ival_W_to_A_end_of_year = Period(freq='A', year=2008) + + if Period(freq='D', year=2007, month=3, day=31).weekday == 6: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, + quarter=1) + else: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, + quarter=2) + + if Period(freq='D', year=2007, month=1, day=31).weekday == 6: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) + else: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) + + ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) + ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, + hour=23) + ival_W_to_T_start = 
Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, + hour=23, minute=59, second=59) + + assert_equal(ival_W.asfreq('A'), ival_W_to_A) + assert_equal(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + assert_equal(ival_W.asfreq('Q'), ival_W_to_Q) + assert_equal(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + assert_equal(ival_W.asfreq('M'), ival_W_to_M) + assert_equal(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + assert_equal(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + assert_equal(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + assert_equal(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + assert_equal(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + assert_equal(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + assert_equal(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + assert_equal(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + assert_equal(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + assert_equal(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + assert_equal(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + assert_equal(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + assert_equal(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) + assert_equal(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + assert_equal(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + assert_equal(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + assert_equal(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + assert_equal(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + assert_equal(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + assert_equal(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + assert_equal(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + assert_equal(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + assert_equal(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + assert_equal(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + assert_equal(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + + assert_equal(ival_W.asfreq('WK'), ival_W) + + def test_conv_business(self): + # frequency conversion tests: from Business Frequency" + + ival_B = Period(freq='B', year=2007, month=1, day=1) + ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) + ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) + ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) + ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) + + ival_B_to_A = Period(freq='A', year=2007) + ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_B_to_M = Period(freq='M', year=2007, month=1) + ival_B_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + + assert_equal(ival_B.asfreq('A'), ival_B_to_A) + 
assert_equal(ival_B_end_of_year.asfreq('A'), ival_B_to_A) + assert_equal(ival_B.asfreq('Q'), ival_B_to_Q) + assert_equal(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) + assert_equal(ival_B.asfreq('M'), ival_B_to_M) + assert_equal(ival_B_end_of_month.asfreq('M'), ival_B_to_M) + assert_equal(ival_B.asfreq('WK'), ival_B_to_W) + assert_equal(ival_B_end_of_week.asfreq('WK'), ival_B_to_W) + + assert_equal(ival_B.asfreq('D'), ival_B_to_D) + + assert_equal(ival_B.asfreq('H', 'S'), ival_B_to_H_start) + assert_equal(ival_B.asfreq('H', 'E'), ival_B_to_H_end) + assert_equal(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) + assert_equal(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) + assert_equal(ival_B.asfreq('S', 'S'), ival_B_to_S_start) + assert_equal(ival_B.asfreq('S', 'E'), ival_B_to_S_end) + + assert_equal(ival_B.asfreq('B'), ival_B) + + def test_conv_daily(self): + # frequency conversion tests: from Business Frequency" + + ival_D = Period(freq='D', year=2007, month=1, day=1) + ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) + ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) + ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) + ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) + + ival_D_friday = Period(freq='D', year=2007, month=1, day=5) + ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) + ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) + ival_D_monday = Period(freq='D', year=2007, month=1, day=8) + + ival_B_friday = Period(freq='B', year=2007, month=1, day=5) + ival_B_monday = Period(freq='B', year=2007, month=1, day=8) + + ival_D_to_A = Period(freq='A', year=2007) + + ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) + ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) + ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) + + ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) + ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) + ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) + + ival_D_to_M = Period(freq='M', year=2007, month=1) + ival_D_to_W = Period(freq='WK', year=2007, month=1, day=7) + + ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + + assert_equal(ival_D.asfreq('A'), ival_D_to_A) + + assert_equal(ival_D_end_of_quarter.asfreq('A-JAN'), + ival_Deoq_to_AJAN) + assert_equal(ival_D_end_of_quarter.asfreq('A-JUN'), + ival_Deoq_to_AJUN) + assert_equal(ival_D_end_of_quarter.asfreq('A-DEC'), + ival_Deoq_to_ADEC) + + assert_equal(ival_D_end_of_year.asfreq('A'), ival_D_to_A) + assert_equal(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) + assert_equal(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) + assert_equal(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) + assert_equal(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) + assert_equal(ival_D.asfreq('M'), ival_D_to_M) + assert_equal(ival_D_end_of_month.asfreq('M'), ival_D_to_M) + assert_equal(ival_D.asfreq('WK'), ival_D_to_W) + assert_equal(ival_D_end_of_week.asfreq('WK'), ival_D_to_W) + + assert_equal(ival_D_friday.asfreq('B'), ival_B_friday) + 
assert_equal(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) + assert_equal(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) + assert_equal(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) + assert_equal(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) + + assert_equal(ival_D.asfreq('H', 'S'), ival_D_to_H_start) + assert_equal(ival_D.asfreq('H', 'E'), ival_D_to_H_end) + assert_equal(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) + assert_equal(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) + assert_equal(ival_D.asfreq('S', 'S'), ival_D_to_S_start) + assert_equal(ival_D.asfreq('S', 'E'), ival_D_to_S_end) + + assert_equal(ival_D.asfreq('D'), ival_D) + + def test_conv_hourly(self): + # frequency conversion tests: from Hourly Frequency" + + ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, + hour=23) + ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, + hour=23) + ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, + hour=23) + ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, + hour=23) + + ival_H_to_A = Period(freq='A', year=2007) + ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_H_to_M = Period(freq='M', year=2007, month=1) + ival_H_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) + + ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=59) + ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=59, second=59) + + assert_equal(ival_H.asfreq('A'), ival_H_to_A) + assert_equal(ival_H_end_of_year.asfreq('A'), ival_H_to_A) + assert_equal(ival_H.asfreq('Q'), ival_H_to_Q) + assert_equal(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) + assert_equal(ival_H.asfreq('M'), ival_H_to_M) + assert_equal(ival_H_end_of_month.asfreq('M'), ival_H_to_M) + assert_equal(ival_H.asfreq('WK'), ival_H_to_W) + assert_equal(ival_H_end_of_week.asfreq('WK'), ival_H_to_W) + assert_equal(ival_H.asfreq('D'), ival_H_to_D) + assert_equal(ival_H_end_of_day.asfreq('D'), ival_H_to_D) + assert_equal(ival_H.asfreq('B'), ival_H_to_B) + assert_equal(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) + + assert_equal(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) + assert_equal(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) + assert_equal(ival_H.asfreq('S', 'S'), ival_H_to_S_start) + assert_equal(ival_H.asfreq('S', 'E'), ival_H_to_S_end) + + assert_equal(ival_H.asfreq('H'), ival_H) + + def test_conv_minutely(self): + # frequency conversion tests: from Minutely Frequency" + + ival_T = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + 
ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=59) + + ival_T_to_A = Period(freq='A', year=2007) + ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_T_to_M = Period(freq='M', year=2007, month=1) + ival_T_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + + ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=59) + + assert_equal(ival_T.asfreq('A'), ival_T_to_A) + assert_equal(ival_T_end_of_year.asfreq('A'), ival_T_to_A) + assert_equal(ival_T.asfreq('Q'), ival_T_to_Q) + assert_equal(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) + assert_equal(ival_T.asfreq('M'), ival_T_to_M) + assert_equal(ival_T_end_of_month.asfreq('M'), ival_T_to_M) + assert_equal(ival_T.asfreq('WK'), ival_T_to_W) + assert_equal(ival_T_end_of_week.asfreq('WK'), ival_T_to_W) + assert_equal(ival_T.asfreq('D'), ival_T_to_D) + assert_equal(ival_T_end_of_day.asfreq('D'), ival_T_to_D) + assert_equal(ival_T.asfreq('B'), ival_T_to_B) + assert_equal(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) + assert_equal(ival_T.asfreq('H'), ival_T_to_H) + assert_equal(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) + + assert_equal(ival_T.asfreq('S', 'S'), ival_T_to_S_start) + assert_equal(ival_T.asfreq('S', 'E'), ival_T_to_S_end) + + assert_equal(ival_T.asfreq('Min'), ival_T) + + def test_conv_secondly(self): + # frequency conversion tests: from Secondly Frequency" + + ival_S = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, + hour=23, minute=59, second=59) + ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=59, second=59) + ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=59) + + ival_S_to_A = Period(freq='A', year=2007) + ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_S_to_M = Period(freq='M', year=2007, month=1) + ival_S_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + + assert_equal(ival_S.asfreq('A'), ival_S_to_A) + assert_equal(ival_S_end_of_year.asfreq('A'), ival_S_to_A) + assert_equal(ival_S.asfreq('Q'), ival_S_to_Q) + assert_equal(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) + assert_equal(ival_S.asfreq('M'), ival_S_to_M) + assert_equal(ival_S_end_of_month.asfreq('M'), ival_S_to_M) + assert_equal(ival_S.asfreq('WK'), ival_S_to_W) + 
assert_equal(ival_S_end_of_week.asfreq('WK'), ival_S_to_W) + assert_equal(ival_S.asfreq('D'), ival_S_to_D) + assert_equal(ival_S_end_of_day.asfreq('D'), ival_S_to_D) + assert_equal(ival_S.asfreq('B'), ival_S_to_B) + assert_equal(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) + assert_equal(ival_S.asfreq('H'), ival_S_to_H) + assert_equal(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) + assert_equal(ival_S.asfreq('Min'), ival_S_to_T) + assert_equal(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) + + assert_equal(ival_S.asfreq('S'), ival_S) + + def test_asfreq_nat(self): + p = Period('NaT', freq='A') + result = p.asfreq('M') + self.assertEqual(result.ordinal, tslib.iNaT) + self.assertEqual(result.freq, 'M') + + +class TestPeriodIndex(tm.TestCase): + + def setUp(self): + pass + + def test_hash_error(self): + index = period_range('20010101', periods=10) + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(index).__name__): + hash(index) + + def test_make_time_series(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + series = Series(1, index=index) + tm.assert_isinstance(series, TimeSeries) + + def test_astype(self): + idx = period_range('1990', '2009', freq='A') + + result = idx.astype('i8') + self.assert_numpy_array_equal(result, idx.values) + + def test_constructor_use_start_freq(self): + # GH #1118 + p = Period('4/2/2012', freq='B') + index = PeriodIndex(start=p, periods=10) + expected = PeriodIndex(start='4/2/2012', periods=10, freq='B') + self.assertTrue(index.equals(expected)) + + def test_constructor_field_arrays(self): + # GH #1264 + + years = np.arange(1990, 2010).repeat(4)[2:-2] + quarters = np.tile(np.arange(1, 5), 20)[2:-2] + + index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') + expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') + self.assertTrue(index.equals(expected)) + + self.assertRaises( + ValueError, PeriodIndex, year=years, quarter=quarters, + freq='2Q-DEC') + + index = PeriodIndex(year=years, quarter=quarters) + self.assertTrue(index.equals(expected)) + + years = [2007, 2007, 2007] + months = [1, 2] + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='M') + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='2M') + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='M', start=Period('2007-01', freq='M')) + + years = [2007, 2007, 2007] + months = [1, 2, 3] + idx = PeriodIndex(year=years, month=months, freq='M') + exp = period_range('2007-01', periods=3, freq='M') + self.assertTrue(idx.equals(exp)) + + def test_constructor_U(self): + # U was used as undefined period + self.assertRaises(ValueError, period_range, '2007-1-1', periods=500, + freq='X') + + def test_constructor_arrays_negative_year(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(lrange(1, 5), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + self.assert_numpy_array_equal(pindex.year, years) + self.assert_numpy_array_equal(pindex.quarter, quarters) + + def test_constructor_invalid_quarters(self): + self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004), + quarter=lrange(4), freq='Q-DEC') + + def test_constructor_corner(self): + self.assertRaises(ValueError, PeriodIndex, periods=10, freq='A') + + start = Period('2007', freq='A-JUN') + end = Period('2010', freq='A-DEC') + self.assertRaises(ValueError, PeriodIndex, start=start, end=end) + self.assertRaises(ValueError, PeriodIndex, start=start) + self.assertRaises(ValueError, PeriodIndex, 
end=end) + + result = period_range('2007-01', periods=10.5, freq='M') + exp = period_range('2007-01', periods=10, freq='M') + self.assertTrue(result.equals(exp)) + + def test_constructor_fromarraylike(self): + idx = period_range('2007-01', periods=20, freq='M') + + self.assertRaises(ValueError, PeriodIndex, idx.values) + self.assertRaises(ValueError, PeriodIndex, list(idx.values)) + self.assertRaises(ValueError, PeriodIndex, + data=Period('2007', freq='A')) + + result = PeriodIndex(iter(idx)) + self.assertTrue(result.equals(idx)) + + result = PeriodIndex(idx) + self.assertTrue(result.equals(idx)) + + result = PeriodIndex(idx, freq='M') + self.assertTrue(result.equals(idx)) + + result = PeriodIndex(idx, freq='D') + exp = idx.asfreq('D', 'e') + self.assertTrue(result.equals(exp)) + + def test_constructor_datetime64arr(self): + vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) + vals = vals.view(np.dtype('M8[us]')) + + self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + + def test_constructor_simple_new(self): + idx = period_range('2007-01', name='p', periods=20, freq='M') + result = idx._simple_new(idx, 'p', freq=idx.freq) + self.assertTrue(result.equals(idx)) + + result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) + self.assertTrue(result.equals(idx)) + + def test_constructor_nat(self): + self.assertRaises( + ValueError, period_range, start='NaT', end='2011-01-01', freq='M') + self.assertRaises( + ValueError, period_range, start='2011-01-01', end='NaT', freq='M') + + def test_constructor_year_and_quarter(self): + year = pd.Series([2001, 2002, 2003]) + quarter = year - 2000 + idx = PeriodIndex(year=year, quarter=quarter) + strs = ['%dQ%d' % t for t in zip(quarter, year)] + lops = list(map(Period, strs)) + p = PeriodIndex(lops) + tm.assert_index_equal(p, idx) + + def test_is_(self): + create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', + end='12/1/2009') + index = create_index() + self.assertEqual(index.is_(index), True) + self.assertEqual(index.is_(create_index()), False) + self.assertEqual(index.is_(index.view()), True) + self.assertEqual(index.is_(index.view().view().view().view().view()), True) + self.assertEqual(index.view().is_(index), True) + ind2 = index.view() + index.name = "Apple" + self.assertEqual(ind2.is_(index), True) + self.assertEqual(index.is_(index[:]), False) + self.assertEqual(index.is_(index.asfreq('M')), False) + self.assertEqual(index.is_(index.asfreq('A')), False) + self.assertEqual(index.is_(index - 2), False) + self.assertEqual(index.is_(index - 0), False) + + def test_comp_period(self): + idx = period_range('2007-01', periods=20, freq='M') + + result = idx < idx[10] + exp = idx.values < idx.values[10] + self.assert_numpy_array_equal(result, exp) + + def test_getitem_ndim2(self): + idx = period_range('2007-01', periods=3, freq='M') + + result = idx[:, None] + # MPL kludge + tm.assert_isinstance(result, PeriodIndex) + + def test_getitem_partial(self): + rng = period_range('2007-01', periods=50, freq='M') + ts = Series(np.random.randn(len(rng)), rng) + + self.assertRaises(KeyError, ts.__getitem__, '2006') + + result = ts['2008'] + self.assertTrue((result.index.year == 2008).all()) + + result = ts['2008':'2009'] + self.assertEqual(len(result), 24) + + result = ts['2008-1':'2009-12'] + self.assertEqual(len(result), 24) + + result = ts['2008Q1':'2009Q4'] + self.assertEqual(len(result), 24) + + result = ts[:'2009'] + self.assertEqual(len(result), 36) + + result = ts['2009':] + self.assertEqual(len(result), 50 - 24) + + exp = 
result + result = ts[24:] + assert_series_equal(exp, result) + + ts = ts[10:].append(ts[10:]) + self.assertRaises(ValueError, ts.__getitem__, slice('2008', '2009')) + + def test_getitem_datetime(self): + rng = period_range(start='2012-01-01', periods=10, freq='W-MON') + ts = Series(lrange(len(rng)), index=rng) + + dt1 = datetime(2011, 10, 2) + dt4 = datetime(2012, 4, 20) + + rs = ts[dt1:dt4] + assert_series_equal(rs, ts) + + def test_sub(self): + rng = period_range('2007-01', periods=50) + + result = rng - 5 + exp = rng + (-5) + self.assertTrue(result.equals(exp)) + + def test_periods_number_check(self): + self.assertRaises( + ValueError, period_range, '2011-1-1', '2012-1-1', 'B') + + def test_tolist(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + rs = index.tolist() + [tm.assert_isinstance(x, Period) for x in rs] + + recon = PeriodIndex(rs) + self.assertTrue(index.equals(recon)) + + def test_to_timestamp(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + series = Series(1, index=index, name='foo') + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = series.to_timestamp(how='end') + self.assertTrue(result.index.equals(exp_index)) + self.assertEqual(result.name, 'foo') + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') + result = series.to_timestamp(how='start') + self.assertTrue(result.index.equals(exp_index)) + + def _get_with_delta(delta, freq='A-DEC'): + return date_range(to_datetime('1/1/2001') + delta, + to_datetime('12/31/2009') + delta, freq=freq) + + delta = timedelta(hours=23) + result = series.to_timestamp('H', 'end') + exp_index = _get_with_delta(delta) + self.assertTrue(result.index.equals(exp_index)) + + delta = timedelta(hours=23, minutes=59) + result = series.to_timestamp('T', 'end') + exp_index = _get_with_delta(delta) + self.assertTrue(result.index.equals(exp_index)) + + result = series.to_timestamp('S', 'end') + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + self.assertTrue(result.index.equals(exp_index)) + + self.assertRaises(ValueError, index.to_timestamp, '5t') + + index = PeriodIndex(freq='H', start='1/1/2001', end='1/2/2001') + series = Series(1, index=index, name='foo') + + exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59', + freq='H') + result = series.to_timestamp(how='end') + self.assertTrue(result.index.equals(exp_index)) + self.assertEqual(result.name, 'foo') + + def test_to_timestamp_quarterly_bug(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(lrange(1, 5), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + stamps = pindex.to_timestamp('D', 'end') + expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) + self.assertTrue(stamps.equals(expected)) + + def test_to_timestamp_preserve_name(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', + name='foo') + self.assertEqual(index.name, 'foo') + + conv = index.to_timestamp('D') + self.assertEqual(conv.name, 'foo') + + def test_to_timestamp_repr_is_code(self): + zs=[Timestamp('99-04-17 00:00:00',tz='UTC'), + Timestamp('2001-04-17 00:00:00',tz='UTC'), + Timestamp('2001-04-17 00:00:00',tz='America/Los_Angeles'), + Timestamp('2001-04-17 00:00:00',tz=None)] + for z in zs: + self.assertEqual( eval(repr(z)), z) + + def test_to_timestamp_period_nat(self): + # GH 7228 + index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx') + + result = index.to_timestamp('D') + expected = 
DatetimeIndex([pd.NaT, datetime(2011, 1, 1), + datetime(2011, 2, 1)], name='idx') + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, 'idx') + + result2 = result.to_period(freq='M') + self.assertTrue(result2.equals(index)) + self.assertEqual(result2.name, 'idx') + + def test_as_frame_columns(self): + rng = period_range('1/1/2000', periods=5) + df = DataFrame(randn(10, 5), columns=rng) + + ts = df[rng[0]] + assert_series_equal(ts, df.ix[:, 0]) + + # GH # 1211 + repr(df) + + ts = df['1/1/2000'] + assert_series_equal(ts, df.ix[:, 0]) + + def test_indexing(self): + + # GH 4390, iat incorrectly indexing + index = period_range('1/1/2001', periods=10) + s = Series(randn(10), index=index) + expected = s[index[0]] + result = s.iat[0] + self.assertEqual(expected, result) + + def test_frame_setitem(self): + rng = period_range('1/1/2000', periods=5) + rng.name = 'index' + df = DataFrame(randn(5, 3), index=rng) + + df['Index'] = rng + rs = Index(df['Index']) + self.assertTrue(rs.equals(rng)) + + rs = df.reset_index().set_index('index') + tm.assert_isinstance(rs.index, PeriodIndex) + self.assertTrue(rs.index.equals(rng)) + + def test_period_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = period_range('2011/01/01', periods=6, freq='M') + idx2 = period_range('2013', periods=6, freq='A') + + df = df.set_index(idx1) + self.assertTrue(df.index.equals(idx1)) + df = df.reindex(idx2) + self.assertTrue(df.index.equals(idx2)) + + def test_nested_dict_frame_constructor(self): + rng = period_range('1/1/2000', periods=5) + df = DataFrame(randn(10, 5), columns=rng) + + data = {} + for col in df.columns: + for row in df.index: + data.setdefault(col, {})[row] = df.get_value(row, col) + + result = DataFrame(data, columns=rng) + tm.assert_frame_equal(result, df) + + data = {} + for col in df.columns: + for row in df.index: + data.setdefault(row, {})[col] = df.get_value(row, col) + + result = DataFrame(data, index=rng).T + tm.assert_frame_equal(result, df) + + def test_frame_to_time_stamp(self): + K = 5 + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + df = DataFrame(randn(len(index), K), index=index) + df['mix'] = 'a' + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = df.to_timestamp('D', 'end') + self.assertTrue(result.index.equals(exp_index)) + assert_almost_equal(result.values, df.values) + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') + result = df.to_timestamp('D', 'start') + self.assertTrue(result.index.equals(exp_index)) + + def _get_with_delta(delta, freq='A-DEC'): + return date_range(to_datetime('1/1/2001') + delta, + to_datetime('12/31/2009') + delta, freq=freq) + + delta = timedelta(hours=23) + result = df.to_timestamp('H', 'end') + exp_index = _get_with_delta(delta) + self.assertTrue(result.index.equals(exp_index)) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp('T', 'end') + exp_index = _get_with_delta(delta) + self.assertTrue(result.index.equals(exp_index)) + + result = df.to_timestamp('S', 'end') + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + self.assertTrue(result.index.equals(exp_index)) + + # columns + df = df.T + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = df.to_timestamp('D', 'end', axis=1) + self.assertTrue(result.columns.equals(exp_index)) + assert_almost_equal(result.values, df.values) + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') + result = 
df.to_timestamp('D', 'start', axis=1) + self.assertTrue(result.columns.equals(exp_index)) + + delta = timedelta(hours=23) + result = df.to_timestamp('H', 'end', axis=1) + exp_index = _get_with_delta(delta) + self.assertTrue(result.columns.equals(exp_index)) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp('T', 'end', axis=1) + exp_index = _get_with_delta(delta) + self.assertTrue(result.columns.equals(exp_index)) + + result = df.to_timestamp('S', 'end', axis=1) + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + self.assertTrue(result.columns.equals(exp_index)) + + # invalid axis + assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) + assertRaisesRegexp(ValueError, 'Only mult == 1', df.to_timestamp, '5t', axis=1) + + def test_index_duplicate_periods(self): + # monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts[2007] + expected = ts[1:3] + assert_series_equal(result, expected) + result[:] = 1 + self.assertTrue((ts[1:3] == 1).all()) + + # not monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN') + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts[2007] + expected = ts[idx == 2007] + assert_series_equal(result, expected) + + def test_index_unique(self): + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') + expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN') + self.assert_numpy_array_equal(idx.unique(), expected.values) + self.assertEqual(idx.nunique(), 3) + + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', tz='US/Eastern') + expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', tz='US/Eastern') + self.assert_numpy_array_equal(idx.unique(), expected.values) + self.assertEqual(idx.nunique(), 3) + + def test_constructor(self): + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 9) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 4 * 9) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 12 * 9) + + pi = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') + assert_equal(len(pi), 365 * 9 + 2) + + pi = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') + assert_equal(len(pi), 261 * 9) + + pi = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') + assert_equal(len(pi), 365 * 24) + + pi = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') + assert_equal(len(pi), 24 * 60) + + pi = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') + assert_equal(len(pi), 24 * 60 * 60) + + start = Period('02-Apr-2005', 'B') + i1 = PeriodIndex(start=start, periods=20) + assert_equal(len(i1), 20) + assert_equal(i1.freq, start.freq) + assert_equal(i1[0], start) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), 10) + assert_equal(i1.freq, end_intv.freq) + assert_equal(i1[-1], end_intv) + + end_intv = Period('2006-12-31', '1w') + i2 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + assert_equal(i1.freq, i2.freq) + + end_intv = Period('2006-12-31', ('w', 1)) + i2 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + assert_equal(i1.freq, i2.freq) + + try: + PeriodIndex(start=start, end=end_intv) + raise AssertionError('Cannot allow mixed freq for 
start and end') + except ValueError: + pass + + end_intv = Period('2005-05-01', 'B') + i1 = PeriodIndex(start=start, end=end_intv) + + try: + PeriodIndex(start=start) + raise AssertionError( + 'Must specify periods if missing start or end') + except ValueError: + pass + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + assert_equal(len(i2), 2) + assert_equal(i2[0], end_intv) + + i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + assert_equal(len(i2), 2) + assert_equal(i2[0], end_intv) + + # Mixed freq should fail + vals = [end_intv, Period('2006-12-31', 'w')] + self.assertRaises(ValueError, PeriodIndex, vals) + vals = np.array(vals) + self.assertRaises(ValueError, PeriodIndex, vals) + + def test_shift(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') + + self.assertTrue(pi1.shift(0).equals(pi1)) + + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + def test_shift_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') + result = idx.shift(1) + expected = PeriodIndex(['2011-02', '2011-03', 'NaT', '2011-05'], freq='M', name='idx') + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + + def test_asfreq(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') + pi2 = PeriodIndex(freq='Q', start='1/1/2001', end='1/1/2001') + pi3 = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2001') + pi4 = PeriodIndex(freq='D', start='1/1/2001', end='1/1/2001') + pi5 = PeriodIndex(freq='H', start='1/1/2001', end='1/1/2001 00:00') + pi6 = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 00:00') + pi7 = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') + + self.assertEqual(pi1.asfreq('Q', 'S'), pi2) + self.assertEqual(pi1.asfreq('Q', 's'), pi2) + self.assertEqual(pi1.asfreq('M', 'start'), pi3) + self.assertEqual(pi1.asfreq('D', 'StarT'), pi4) + self.assertEqual(pi1.asfreq('H', 'beGIN'), pi5) + self.assertEqual(pi1.asfreq('Min', 'S'), pi6) + self.assertEqual(pi1.asfreq('S', 'S'), pi7) + + self.assertEqual(pi2.asfreq('A', 'S'), pi1) + self.assertEqual(pi2.asfreq('M', 'S'), pi3) + self.assertEqual(pi2.asfreq('D', 'S'), pi4) + self.assertEqual(pi2.asfreq('H', 'S'), pi5) + self.assertEqual(pi2.asfreq('Min', 'S'), pi6) + self.assertEqual(pi2.asfreq('S', 'S'), pi7) 
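+ # the same start-anchored ('S') asfreq round-trips are asserted below for pi3 through pi7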
+ + self.assertEqual(pi3.asfreq('A', 'S'), pi1) + self.assertEqual(pi3.asfreq('Q', 'S'), pi2) + self.assertEqual(pi3.asfreq('D', 'S'), pi4) + self.assertEqual(pi3.asfreq('H', 'S'), pi5) + self.assertEqual(pi3.asfreq('Min', 'S'), pi6) + self.assertEqual(pi3.asfreq('S', 'S'), pi7) + + self.assertEqual(pi4.asfreq('A', 'S'), pi1) + self.assertEqual(pi4.asfreq('Q', 'S'), pi2) + self.assertEqual(pi4.asfreq('M', 'S'), pi3) + self.assertEqual(pi4.asfreq('H', 'S'), pi5) + self.assertEqual(pi4.asfreq('Min', 'S'), pi6) + self.assertEqual(pi4.asfreq('S', 'S'), pi7) + + self.assertEqual(pi5.asfreq('A', 'S'), pi1) + self.assertEqual(pi5.asfreq('Q', 'S'), pi2) + self.assertEqual(pi5.asfreq('M', 'S'), pi3) + self.assertEqual(pi5.asfreq('D', 'S'), pi4) + self.assertEqual(pi5.asfreq('Min', 'S'), pi6) + self.assertEqual(pi5.asfreq('S', 'S'), pi7) + + self.assertEqual(pi6.asfreq('A', 'S'), pi1) + self.assertEqual(pi6.asfreq('Q', 'S'), pi2) + self.assertEqual(pi6.asfreq('M', 'S'), pi3) + self.assertEqual(pi6.asfreq('D', 'S'), pi4) + self.assertEqual(pi6.asfreq('H', 'S'), pi5) + self.assertEqual(pi6.asfreq('S', 'S'), pi7) + + self.assertEqual(pi7.asfreq('A', 'S'), pi1) + self.assertEqual(pi7.asfreq('Q', 'S'), pi2) + self.assertEqual(pi7.asfreq('M', 'S'), pi3) + self.assertEqual(pi7.asfreq('D', 'S'), pi4) + self.assertEqual(pi7.asfreq('H', 'S'), pi5) + self.assertEqual(pi7.asfreq('Min', 'S'), pi6) + + self.assertRaises(ValueError, pi7.asfreq, 'T', 'foo') + self.assertRaises(ValueError, pi1.asfreq, '5t') + + def test_asfreq_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M') + result = idx.asfreq(freq='Q') + expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q') + self.assertTrue(result.equals(expected)) + + def test_period_index_length(self): + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 9) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 4 * 9) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 12 * 9) + + start = Period('02-Apr-2005', 'B') + i1 = PeriodIndex(start=start, periods=20) + assert_equal(len(i1), 20) + assert_equal(i1.freq, start.freq) + assert_equal(i1[0], start) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), 10) + assert_equal(i1.freq, end_intv.freq) + assert_equal(i1[-1], end_intv) + + end_intv = Period('2006-12-31', '1w') + i2 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + assert_equal(i1.freq, i2.freq) + + end_intv = Period('2006-12-31', ('w', 1)) + i2 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), len(i2)) + self.assertTrue((i1 == i2).all()) + assert_equal(i1.freq, i2.freq) + + try: + PeriodIndex(start=start, end=end_intv) + raise AssertionError('Cannot allow mixed freq for start and end') + except ValueError: + pass + + end_intv = Period('2005-05-01', 'B') + i1 = PeriodIndex(start=start, end=end_intv) + + try: + PeriodIndex(start=start) + raise AssertionError( + 'Must specify periods if missing start or end') + except ValueError: + pass + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + assert_equal(len(i2), 2) + assert_equal(i2[0], end_intv) + + i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + assert_equal(len(i2), 2) + assert_equal(i2[0], end_intv) + + # Mixed freq should fail + vals = [end_intv, Period('2006-12-31', 
'w')] + self.assertRaises(ValueError, PeriodIndex, vals) + vals = np.array(vals) + self.assertRaises(ValueError, PeriodIndex, vals) + + def test_frame_index_to_string(self): + index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + frame = DataFrame(np.random.randn(3, 4), index=index) + + # it works! + frame.to_string() + + def test_asfreq_ts(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/31/2010') + ts = Series(np.random.randn(len(index)), index=index) + df = DataFrame(np.random.randn(len(index), 3), index=index) + + result = ts.asfreq('D', how='end') + df_result = df.asfreq('D', how='end') + exp_index = index.asfreq('D', how='end') + self.assertEqual(len(result), len(ts)) + self.assertTrue(result.index.equals(exp_index)) + self.assertTrue(df_result.index.equals(exp_index)) + + result = ts.asfreq('D', how='start') + self.assertEqual(len(result), len(ts)) + self.assertTrue(result.index.equals(index.asfreq('D', how='start'))) + + def test_badinput(self): + self.assertRaises(datetools.DateParseError, Period, '1/1/-2000', 'A') + # self.assertRaises(datetools.DateParseError, Period, '-2000', 'A') + # self.assertRaises(datetools.DateParseError, Period, '0', 'A') + + def test_negative_ordinals(self): + p = Period(ordinal=-1000, freq='A') + p = Period(ordinal=0, freq='A') + + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A') + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A') + assert_array_equal(idx1,idx2) + + def test_dti_to_period(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + pi1 = dti.to_period() + pi2 = dti.to_period(freq='D') + + self.assertEqual(pi1[0], Period('Jan 2005', freq='M')) + self.assertEqual(pi2[0], Period('1/31/2005', freq='D')) + + self.assertEqual(pi1[-1], Period('Nov 2005', freq='M')) + self.assertEqual(pi2[-1], Period('11/30/2005', freq='D')) + + def test_pindex_slice_index(self): + pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='M') + s = Series(np.random.rand(len(pi)), index=pi) + res = s['2010'] + exp = s[0:12] + assert_series_equal(res, exp) + res = s['2011'] + exp = s[12:24] + assert_series_equal(res, exp) + + def test_getitem_day(self): + # GH 6716 + # Confirm DatetimeIndex and PeriodIndex works identically + didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400) + pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = ['2014', '2013/02', '2013/01/02', + '2013/02/01 9H', '2013/02/01 09:00'] + for v in values: + + if _np_version_under1p9: + with tm.assertRaises(ValueError): + idx[v] + else: + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + #with tm.assertRaises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + assert_series_equal(s['2013/01'], s[0:31]) + assert_series_equal(s['2013/02'], s[31:59]) + assert_series_equal(s['2014'], s[365:]) + + invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + for v in invalid: + with tm.assertRaises(KeyError): + s[v] + + def test_range_slice_day(self): + # GH 6716 + didx = DatetimeIndex(start='2013/01/01', freq='D', periods=400) + pidx = PeriodIndex(start='2013/01/01', freq='D', periods=400) + + for idx in [didx, pidx]: + # slices against index should raise IndexError + values = ['2014', '2013/02', '2013/01/02', + '2013/02/01 9H', '2013/02/01 09:00'] + for v in values: + with tm.assertRaises(IndexError): + idx[v:] + + s = Series(np.random.rand(len(idx)), index=idx) + + 
assert_series_equal(s['2013/01/02':], s[1:]) + assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5]) + assert_series_equal(s['2013/02':], s[31:]) + assert_series_equal(s['2014':], s[365:]) + + invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + for v in invalid: + with tm.assertRaises(IndexError): + idx[v:] + + def test_getitem_seconds(self): + # GH 6716 + didx = DatetimeIndex(start='2013/01/01 09:00:00', freq='S', periods=4000) + pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', periods=4000) + + for idx in [didx, pidx]: + # getitem against index should raise ValueError + values = ['2014', '2013/02', '2013/01/02', + '2013/02/01 9H', '2013/02/01 09:00'] + for v in values: + if _np_version_under1p9: + with tm.assertRaises(ValueError): + idx[v] + else: + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + #with tm.assertRaises(IndexError): + # idx[v] + continue + + s = Series(np.random.rand(len(idx)), index=idx) + + assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) + assert_series_equal(s['2013/01/01 9H'], s[:3600]) + for d in ['2013/01/01', '2013/01', '2013']: + assert_series_equal(s[d], s) + + def test_range_slice_seconds(self): + # GH 6716 + didx = DatetimeIndex(start='2013/01/01 09:00:00', freq='S', periods=4000) + pidx = PeriodIndex(start='2013/01/01 09:00:00', freq='S', periods=4000) + + for idx in [didx, pidx]: + # slices against index should raise IndexError + values = ['2014', '2013/02', '2013/01/02', + '2013/02/01 9H', '2013/02/01 09:00'] + for v in values: + with tm.assertRaises(IndexError): + idx[v:] + + s = Series(np.random.rand(len(idx)), index=idx) + + assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'], s[300:660]) + assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'], s[3600:3960]) + assert_series_equal(s['2013/01/01 10H':], s[3600:]) + assert_series_equal(s[:'2013/01/01 09:30'], s[:1860]) + for d in ['2013/01/01', '2013/01', '2013']: + assert_series_equal(s[d:], s) + + def test_range_slice_outofbounds(self): + # GH 5407 + didx = DatetimeIndex(start='2013/10/01', freq='D', periods=10) + pidx = PeriodIndex(start='2013/10/01', freq='D', periods=10) + + for idx in [didx, pidx]: + df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) + empty = DataFrame(index=DatetimeIndex([], freq='D'), columns=['units']) + + tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty) + tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2]) + tm.assert_frame_equal(df['2013/10/01':'2013/10/02'], df.iloc[:2]) + tm.assert_frame_equal(df['2013/10/02':'2013/09/30'], empty) + tm.assert_frame_equal(df['2013/10/15':'2013/10/17'], empty) + tm.assert_frame_equal(df['2013-06':'2013-09'], empty) + tm.assert_frame_equal(df['2013-11':'2013-12'], empty) + + def test_pindex_fieldaccessor_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2012-03', '2012-04'], freq='D') + self.assert_numpy_array_equal(idx.year, np.array([2011, 2011, -1, 2012, 2012])) + self.assert_numpy_array_equal(idx.month, np.array([1, 2, -1, 3, 4])) + + def test_pindex_qaccess(self): + pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') + s = Series(np.random.rand(len(pi)), index=pi).cumsum() + # Todo: fix these accessors! 
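+ # '05Q4' parses as 2005Q4, i.e. the third element ('4Q05') of the index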
+ self.assertEqual(s['05Q4'], s[2]) + + def test_period_dt64_round_trip(self): + dti = date_range('1/1/2000', '1/7/2002', freq='B') + pi = dti.to_period() + self.assertTrue(pi.to_timestamp().equals(dti)) + + dti = date_range('1/1/2000', '1/7/2002', freq='B') + pi = dti.to_period(freq='H') + self.assertTrue(pi.to_timestamp().equals(dti)) + + def test_to_period_quarterly(self): + # make sure we can make the round trip + for month in MONTHS: + freq = 'Q-%s' % month + rng = period_range('1989Q3', '1991Q3', freq=freq) + stamps = rng.to_timestamp() + result = stamps.to_period(freq) + self.assertTrue(rng.equals(result)) + + def test_to_period_quarterlyish(self): + offsets = ['BQ', 'QS', 'BQS'] + for off in offsets: + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + self.assertEqual(prng.freq, 'Q-DEC') + + def test_to_period_annualish(self): + offsets = ['BA', 'AS', 'BAS'] + for off in offsets: + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + self.assertEqual(prng.freq, 'A-DEC') + + def test_to_period_monthish(self): + offsets = ['MS', 'EOM', 'BM'] + for off in offsets: + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + self.assertEqual(prng.freq, 'M') + + def test_no_multiples(self): + self.assertRaises(ValueError, period_range, '1989Q3', periods=10, + freq='2Q') + + self.assertRaises(ValueError, period_range, '1989', periods=10, + freq='2A') + self.assertRaises(ValueError, Period, '1989', freq='2A') + + # def test_pindex_multiples(self): + # pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='2M') + # self.assertEqual(pi[0], Period('1/1/10', '2M')) + # self.assertEqual(pi[1], Period('3/1/10', '2M')) + + # self.assertEqual(pi[0].asfreq('6M'), pi[2].asfreq('6M')) + # self.assertEqual(pi[0].asfreq('A'), pi[2].asfreq('A')) + + # self.assertEqual(pi[0].asfreq('M', how='S'), + # Period('Jan 2010', '1M')) + # self.assertEqual(pi[0].asfreq('M', how='E'), + # Period('Feb 2010', '1M')) + # self.assertEqual(pi[1].asfreq('M', how='S'), + # Period('Mar 2010', '1M')) + + # i = Period('1/1/2010 12:05:18', '5S') + # self.assertEqual(i, Period('1/1/2010 12:05:15', '5S')) + + # i = Period('1/1/2010 12:05:18', '5S') + # self.assertEqual(i.asfreq('1S', how='E'), + # Period('1/1/2010 12:05:19', '1S')) + + def test_iteration(self): + index = PeriodIndex(start='1/1/10', periods=4, freq='B') + + result = list(index) + tm.assert_isinstance(result[0], Period) + self.assertEqual(result[0].freq, index.freq) + + def test_take(self): + index = PeriodIndex(start='1/1/10', end='12/31/12', freq='D', name='idx') + expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7), + datetime(2010, 1, 9), datetime(2010, 1, 13)], + freq='D', name='idx') + + taken1 = index.take([5, 6, 8, 12]) + taken2 = index[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + self.assertTrue(taken.equals(expected)) + tm.assert_isinstance(taken, PeriodIndex) + self.assertEqual(taken.freq, index.freq) + self.assertEqual(taken.name, expected.name) + + def test_joins(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + for kind in ['inner', 'outer', 'left', 'right']: + joined = index.join(index[:-5], how=kind) + + tm.assert_isinstance(joined, PeriodIndex) + self.assertEqual(joined.freq, index.freq) + + def test_join_self(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + for kind in ['inner', 'outer', 'left', 'right']: + res = index.join(index, how=kind) + self.assertIs(index, res) + + def test_join_does_not_recur(self): 
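+ # joining a datetime row index with period columns should yield a plain object Index instead of recursing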
+ df = tm.makeCustomDataframe(3, 2, data_gen_f=lambda *args: + np.random.randint(2), c_idx_type='p', + r_idx_type='dt') + s = df.iloc[:2, 0] + + res = s.index.join(df.columns, how='outer') + expected = Index([s.index[0], s.index[1], + df.columns[0], df.columns[1]], object) + tm.assert_index_equal(res, expected) + + def test_align_series(self): + rng = period_range('1/1/2000', '1/1/2010', freq='A') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected[1::2] = np.nan + assert_series_equal(result, expected) + + result = ts + _permute(ts[::2]) + assert_series_equal(result, expected) + + # it works! + for kind in ['inner', 'outer', 'left', 'right']: + ts.align(ts[::2], join=kind) + with assertRaisesRegexp(ValueError, 'Only like-indexed'): + ts + ts.asfreq('D', how="end") + + def test_align_frame(self): + rng = period_range('1/1/2000', '1/1/2010', freq='A') + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected.values[1::2] = np.nan + tm.assert_frame_equal(result, expected) + + result = ts + _permute(ts[::2]) + tm.assert_frame_equal(result, expected) + + def test_union(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + result = index[:-5].union(index[10:]) + self.assertTrue(result.equals(index)) + + # not in order + result = _permute(index[:-5]).union(_permute(index[10:])) + self.assertTrue(result.equals(index)) + + # raise if different frequencies + index = period_range('1/1/2000', '1/20/2000', freq='D') + index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + self.assertRaises(ValueError, index.union, index2) + + self.assertRaises(ValueError, index.join, index.to_timestamp()) + + def test_intersection(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + result = index[:-5].intersection(index[10:]) + self.assertTrue(result.equals(index[10:-5])) + + # not in order + left = _permute(index[:-5]) + right = _permute(index[10:]) + result = left.intersection(right).order() + self.assertTrue(result.equals(index[10:-5])) + + # raise if different frequencies + index = period_range('1/1/2000', '1/20/2000', freq='D') + index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + self.assertRaises(ValueError, index.intersection, index2) + + def test_fields(self): + # year, month, day, hour, minute + # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter + # qyear + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2005') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2002') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2002') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='D', start='12/1/2001', end='6/1/2001') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='B', start='12/1/2001', end='6/1/2001') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='H', start='12/31/2001', end='1/1/2002 23:00') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='Min', start='12/31/2001', end='1/1/2002 00:20') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='S', start='12/31/2001 00:00:00', + end='12/31/2001 00:05:00') + self._check_all_fields(pi) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + self._check_all_fields(i1) + + def _check_all_fields(self, periodindex): + fields = ['year', 'month', 'day', 'hour', 'minute', + 'second', 'weekofyear', 'week', 'dayofweek', + 'weekday', 'dayofyear', 'quarter', 
'qyear'] + + periods = list(periodindex) + + for field in fields: + field_idx = getattr(periodindex, field) + assert_equal(len(periodindex), len(field_idx)) + for x, val in zip(periods, field_idx): + assert_equal(getattr(x, field), val) + + def test_is_full(self): + index = PeriodIndex([2005, 2007, 2009], freq='A') + self.assertFalse(index.is_full) + + index = PeriodIndex([2005, 2006, 2007], freq='A') + self.assertTrue(index.is_full) + + index = PeriodIndex([2005, 2005, 2007], freq='A') + self.assertFalse(index.is_full) + + index = PeriodIndex([2005, 2005, 2006], freq='A') + self.assertTrue(index.is_full) + + index = PeriodIndex([2006, 2005, 2005], freq='A') + self.assertRaises(ValueError, getattr, index, 'is_full') + + self.assertTrue(index[:0].is_full) + + def test_map(self): + index = PeriodIndex([2005, 2007, 2009], freq='A') + result = index.map(lambda x: x + 1) + expected = index + 1 + self.assertTrue(result.equals(expected)) + + result = index.map(lambda x: x.ordinal) + exp = [x.ordinal for x in index] + assert_array_equal(result, exp) + + def test_map_with_string_constructor(self): + raw = [2005, 2007, 2009] + index = PeriodIndex(raw, freq='A') + types = str, + + if compat.PY3: + # unicode + types += compat.text_type, + + for t in types: + expected = np.array(lmap(t, raw), dtype=object) + res = index.map(t) + + # should return an array + tm.assert_isinstance(res, np.ndarray) + + # preserve element types + self.assertTrue(all(isinstance(resi, t) for resi in res)) + + # dtype should be object + self.assertEqual(res.dtype, np.dtype('object').type) + + # lastly, values should compare equal + assert_array_equal(res, expected) + + def test_convert_array_of_periods(self): + rng = period_range('1/1/2000', periods=20, freq='D') + periods = list(rng) + + result = pd.Index(periods) + tm.assert_isinstance(result, PeriodIndex) + + def test_with_multi_index(self): + # #1705 + index = date_range('1/1/2012', periods=4, freq='12H') + index_as_arrays = [index.to_period(freq='D'), index.hour] + + s = Series([0, 1, 2, 3], index_as_arrays) + + tm.assert_isinstance(s.index.levels[0], PeriodIndex) + + tm.assert_isinstance(s.index.values[0][0], Period) + + def test_to_datetime_1703(self): + index = period_range('1/1/2012', periods=4, freq='D') + + result = index.to_datetime() + self.assertEqual(result[0], Timestamp('1/1/2012')) + + def test_get_loc_msg(self): + idx = period_range('2000-1-1', freq='A', periods=10) + bad_period = Period('2012', 'A') + self.assertRaises(KeyError, idx.get_loc, bad_period) + + try: + idx.get_loc(bad_period) + except KeyError as inst: + self.assertEqual(inst.args[0], bad_period) + + def test_append_concat(self): + # #1815 + d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC') + d2 = date_range('12/31/2000', '12/31/2009', freq='A-DEC') + + s1 = Series(np.random.randn(10), d1) + s2 = Series(np.random.randn(10), d2) + + s1 = s1.to_period() + s2 = s2.to_period() + + # drops index + result = pd.concat([s1, s2]) + tm.assert_isinstance(result.index, PeriodIndex) + self.assertEqual(result.index[0], s1.index[0]) + + def test_pickle_freq(self): + # GH2891 + import pickle + prng = period_range('1/1/2011', '1/1/2012', freq='M') + new_prng = pickle.loads(pickle.dumps(prng)) + self.assertEqual(new_prng.freq,'M') + + def test_slice_keep_name(self): + idx = period_range('20010101', periods=10, freq='D', name='bob') + self.assertEqual(idx.name, idx[1:].name) + + def test_factorize(self): + idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', + '2014-03', '2014-03'], freq='M') + + 
exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + arr, idx = idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01'], freq='M') + + exp_arr = np.array([2, 2, 1, 0, 2, 0]) + arr, idx = idx2.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') + arr, idx = idx2.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + def test_recreate_from_data(self): + for o in ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'N', 'H']: + org = PeriodIndex(start='2001/04/01', freq=o, periods=1) + idx = PeriodIndex(org.values, freq=o) + self.assertTrue(idx.equals(org)) + +def _permute(obj): + return obj.take(np.random.permutation(len(obj))) + + +class TestMethods(tm.TestCase): + "Base test class for MaskedArrays." + + def test_add(self): + dt1 = Period(freq='D', year=2008, month=1, day=1) + dt2 = Period(freq='D', year=2008, month=1, day=2) + assert_equal(dt1 + 1, dt2) + # + # GH 4731 + msg = "unsupported operand type\(s\)" + with tm.assertRaisesRegexp(TypeError, msg): + dt1 + "str" + + with tm.assertRaisesRegexp(TypeError, msg): + dt1 + dt2 + + def test_nat_ops(self): + p = Period('NaT', freq='M') + self.assertEqual((p + 1).ordinal, tslib.iNaT) + self.assertEqual((p - 1).ordinal, tslib.iNaT) + self.assertEqual((p - Period('2011-01', freq='M')).ordinal, tslib.iNaT) + self.assertEqual((Period('2011-01', freq='M') - p).ordinal, tslib.iNaT) + + def test_pi_ops_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') + result = idx + 2 + expected = PeriodIndex(['2011-03', '2011-04', 'NaT', '2011-06'], freq='M', name='idx') + self.assertTrue(result.equals(expected)) + + result2 = result - 2 + self.assertTrue(result2.equals(idx)) + + msg = "unsupported operand type\(s\)" + with tm.assertRaisesRegexp(TypeError, msg): + idx + "str" + + +class TestPeriodRepresentation(tm.TestCase): + """ + Wish to match NumPy units + """ + + def test_annual(self): + self._check_freq('A', 1970) + + def test_monthly(self): + self._check_freq('M', '1970-01') + + def test_weekly(self): + self._check_freq('W-THU', '1970-01-01') + + def test_daily(self): + self._check_freq('D', '1970-01-01') + + def test_business_daily(self): + self._check_freq('B', '1970-01-01') + + def test_hourly(self): + self._check_freq('H', '1970-01-01') + + def test_minutely(self): + self._check_freq('T', '1970-01-01') + + def test_secondly(self): + self._check_freq('S', '1970-01-01') + + def test_millisecondly(self): + self._check_freq('L', '1970-01-01') + + def test_microsecondly(self): + self._check_freq('U', '1970-01-01') + + def test_nanosecondly(self): + self._check_freq('N', '1970-01-01') + + def _check_freq(self, freq, base_date): + rng = PeriodIndex(start=base_date, periods=10, freq=freq) + exp = np.arange(10, dtype=np.int64) + self.assert_numpy_array_equal(rng.values, exp) + + def test_negone_ordinals(self): + freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] + + period = Period(ordinal=-1, freq='D') + for freq in freqs: + repr(period.asfreq(freq)) + + for freq in freqs: + period = Period(ordinal=-1, 
freq=freq) + repr(period) + self.assertEqual(period.year, 1969) + + period = Period(ordinal=-1, freq='B') + repr(period) + period = Period(ordinal=-1, freq='W') + repr(period) + + +class TestComparisons(tm.TestCase): + def setUp(self): + self.january1 = Period('2000-01', 'M') + self.january2 = Period('2000-01', 'M') + self.february = Period('2000-02', 'M') + self.march = Period('2000-03', 'M') + self.day = Period('2012-01-01', 'D') + + def test_equal(self): + self.assertEqual(self.january1, self.january2) + + def test_equal_Raises_Value(self): + with tm.assertRaises(ValueError): + self.january1 == self.day + + def test_notEqual(self): + self.assertNotEqual(self.january1, 1) + self.assertNotEqual(self.january1, self.february) + + def test_greater(self): + self.assertTrue(self.february > self.january1) + + def test_greater_Raises_Value(self): + with tm.assertRaises(ValueError): + self.january1 > self.day + + def test_greater_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 > 1 + + def test_greaterEqual(self): + self.assertTrue(self.january1 >= self.january2) + + def test_greaterEqual_Raises_Value(self): + with tm.assertRaises(ValueError): + self.january1 >= self.day + with tm.assertRaises(TypeError): + print(self.january1 >= 1) + + def test_smallerEqual(self): + self.assertTrue(self.january1 <= self.january2) + + def test_smallerEqual_Raises_Value(self): + with tm.assertRaises(ValueError): + self.january1 <= self.day + + def test_smallerEqual_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 <= 1 + + def test_smaller(self): + self.assertTrue(self.january1 < self.february) + + def test_smaller_Raises_Value(self): + with tm.assertRaises(ValueError): + self.january1 < self.day + + def test_smaller_Raises_Type(self): + with tm.assertRaises(TypeError): + self.january1 < 1 + + def test_sort(self): + periods = [self.march, self.january1, self.february] + correctPeriods = [self.january1, self.february, self.march] + self.assertEqual(sorted(periods), correctPeriods) + + def test_period_nat_comp(self): + p_nat = Period('NaT', freq='D') + p = Period('2011-01-01', freq='D') + + nat = pd.Timestamp('NaT') + t = pd.Timestamp('2011-01-01') + # confirm Period('NaT') work identical with Timestamp('NaT') + for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), + (nat, t), (t, nat), (nat, nat)]: + self.assertEqual(left < right, False) + self.assertEqual(left > right, False) + self.assertEqual(left == right, False) + self.assertEqual(left != right, True) + self.assertEqual(left <= right, False) + self.assertEqual(left >= right, False) + + def test_pi_nat_comp(self): + idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq='M') + + result = idx1 > Period('2011-02', freq='M') + self.assert_numpy_array_equal(result, np.array([False, False, False, True])) + + result = idx1 == Period('NaT', freq='M') + self.assert_numpy_array_equal(result, np.array([False, False, False, False])) + + result = idx1 != Period('NaT', freq='M') + self.assert_numpy_array_equal(result, np.array([True, True, True, True])) + + idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq='M') + result = idx1 < idx2 + self.assert_numpy_array_equal(result, np.array([True, False, False, False])) + + result = idx1 == idx1 + self.assert_numpy_array_equal(result, np.array([True, True, False, True])) + + result = idx1 != idx1 + self.assert_numpy_array_equal(result, np.array([False, False, True, False])) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', 
'-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py new file mode 100644 index 00000000..0bdba375 --- /dev/null +++ b/pandas/tseries/tests/test_plotting.py @@ -0,0 +1,1045 @@ +from datetime import datetime, timedelta, date, time + +import nose +from pandas.compat import lrange, zip + +import numpy as np +from numpy.testing.decorators import slow +from numpy.testing import assert_array_equal + +from pandas import Index, Series, DataFrame + +from pandas.tseries.index import date_range, bdate_range +from pandas.tseries.offsets import DateOffset +from pandas.tseries.period import period_range, Period, PeriodIndex +from pandas.tseries.resample import DatetimeIndex + +from pandas.util.testing import assert_series_equal, ensure_clean +import pandas.util.testing as tm + +from pandas.tests.test_graphics import _skip_if_no_scipy_gaussian_kde + + +@tm.mplskip +class TestTSPlot(tm.TestCase): + def setUp(self): + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'Y'] + idx = [period_range('12/31/1999', freq=x, periods=100) for x in freq] + self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] + self.period_df = [DataFrame(np.random.randn(len(x), 3), index=x, + columns=['A', 'B', 'C']) + for x in idx] + + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A', '1B30Min'] + idx = [date_range('12/31/1999', freq=x, periods=100) for x in freq] + self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx] + self.datetime_df = [DataFrame(np.random.randn(len(x), 3), index=x, + columns=['A', 'B', 'C']) + for x in idx] + + def tearDown(self): + tm.close() + + @slow + def test_ts_plot_with_tz(self): + # GH2877 + index = date_range('1/1/2011', periods=2, freq='H', + tz='Europe/Brussels') + ts = Series([188.5, 328.25], index=index) + _check_plot_works(ts.plot) + + @slow + def test_frame_inferred(self): + # inferred freq + import matplotlib.pyplot as plt + idx = date_range('1/1/1987', freq='MS', periods=100) + idx = DatetimeIndex(idx.values, freq=None) + + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + _check_plot_works(df.plot) + + # axes freq + idx = idx[0:40] + idx[45:99] + df2 = DataFrame(np.random.randn(len(idx), 3), index=idx) + _check_plot_works(df2.plot) + + # N > 1 + idx = date_range('2008-1-1 00:15:00', freq='15T', periods=10) + idx = DatetimeIndex(idx.values, freq=None) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + _check_plot_works(df.plot) + + def test_nonnumeric_exclude(self): + import matplotlib.pyplot as plt + + idx = date_range('1/1/1987', freq='A', periods=3) + df = DataFrame({'A': ["x", "y", "z"], 'B': [1,2,3]}, idx) + + ax = df.plot() # it works + self.assertEqual(len(ax.get_lines()), 1) #B was plotted + plt.close(plt.gcf()) + + self.assertRaises(TypeError, df['A'].plot) + + @slow + def test_tsplot(self): + from pandas.tseries.plotting import tsplot + import matplotlib.pyplot as plt + + ax = plt.gca() + ts = tm.makeTimeSeries() + + f = lambda *args, **kwds: tsplot(s, plt.Axes.plot, *args, **kwds) + + for s in self.period_ser: + _check_plot_works(f, s.index.freq, ax=ax, series=s) + + for s in self.datetime_ser: + _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s) + + ax = ts.plot(style='k') + self.assertEqual((0., 0., 0.), ax.get_lines()[0].get_color()) + + def test_both_style_and_color(self): + import matplotlib.pyplot as plt + + ts = tm.makeTimeSeries() + self.assertRaises(ValueError, ts.plot, style='b-', color='#000099') + + s = ts.reset_index(drop=True) + 
self.assertRaises(ValueError, s.plot, style='b-', color='#000099') + + @slow + def test_high_freq(self): + freaks = ['ms', 'us'] + for freq in freaks: + rng = date_range('1/1/2012', periods=100000, freq=freq) + ser = Series(np.random.randn(len(rng)), rng) + _check_plot_works(ser.plot) + + def test_get_datevalue(self): + from pandas.tseries.converter import get_datevalue + self.assertIsNone(get_datevalue(None, 'D')) + self.assertEqual(get_datevalue(1987, 'A'), 1987) + self.assertEqual(get_datevalue(Period(1987, 'A'), 'M'), + Period('1987-12', 'M').ordinal) + self.assertEqual(get_datevalue('1/1/1987', 'D'), + Period('1987-1-1', 'D').ordinal) + + @slow + def test_ts_plot_format_coord(self): + def check_format_of_first_point(ax, expected_string): + first_line = ax.get_lines()[0] + first_x = first_line.get_xdata()[0].ordinal + first_y = first_line.get_ydata()[0] + try: + self.assertEqual(expected_string, ax.format_coord(first_x, first_y)) + except (ValueError): + raise nose.SkipTest("skipping test because issue forming test comparison GH7664") + + annual = Series(1, index=date_range('2014-01-01', periods=3, freq='A-DEC')) + check_format_of_first_point(annual.plot(), 't = 2014 y = 1.000000') + + # note this is added to the annual plot already in existence, and changes its freq field + daily = Series(1, index=date_range('2014-01-01', periods=3, freq='D')) + check_format_of_first_point(daily.plot(), 't = 2014-01-01 y = 1.000000') + + @slow + def test_line_plot_period_series(self): + for s in self.period_ser: + _check_plot_works(s.plot, s.index.freq) + + @slow + def test_line_plot_datetime_series(self): + for s in self.datetime_ser: + _check_plot_works(s.plot, s.index.freq.rule_code) + + @slow + def test_line_plot_period_frame(self): + for df in self.period_df: + _check_plot_works(df.plot, df.index.freq) + + @slow + def test_line_plot_datetime_frame(self): + for df in self.datetime_df: + freq = df.index.to_period(df.index.freq.rule_code).freq + _check_plot_works(df.plot, freq) + + @slow + def test_line_plot_inferred_freq(self): + for ser in self.datetime_ser: + ser = Series(ser.values, Index(np.asarray(ser.index))) + _check_plot_works(ser.plot, ser.index.inferred_freq) + + ser = ser[[0, 3, 5, 6]] + _check_plot_works(ser.plot) + + def test_fake_inferred_business(self): + import matplotlib.pyplot as plt + fig = plt.gcf() + plt.clf() + fig.add_subplot(111) + rng = date_range('2001-1-1', '2001-1-10') + ts = Series(lrange(len(rng)), rng) + ts = ts[:3].append(ts[5:]) + ax = ts.plot() + self.assertFalse(hasattr(ax, 'freq')) + + @slow + def test_plot_offset_freq(self): + ser = tm.makeTimeSeries() + _check_plot_works(ser.plot) + + dr = date_range(ser.index[0], freq='BQS', periods=10) + ser = Series(np.random.randn(len(dr)), dr) + _check_plot_works(ser.plot) + + @slow + def test_plot_multiple_inferred_freq(self): + dr = Index([datetime(2000, 1, 1), + datetime(2000, 1, 6), + datetime(2000, 1, 11)]) + ser = Series(np.random.randn(len(dr)), dr) + _check_plot_works(ser.plot) + + @slow + def test_uhf(self): + import pandas.tseries.converter as conv + import matplotlib.pyplot as plt + fig = plt.gcf() + plt.clf() + fig.add_subplot(111) + + idx = date_range('2012-6-22 21:59:51.960928', freq='L', periods=500) + df = DataFrame(np.random.randn(len(idx), 2), idx) + + ax = df.plot() + axis = ax.get_xaxis() + + tlocs = axis.get_ticklocs() + tlabels = axis.get_ticklabels() + for loc, label in zip(tlocs, tlabels): + xp = conv._from_ordinal(loc).strftime('%H:%M:%S.%f') + rs = str(label.get_text()) + if len(rs): + 
self.assertEqual(xp, rs) + + @slow + def test_irreg_hf(self): + import matplotlib.pyplot as plt + fig = plt.gcf() + plt.clf() + fig.add_subplot(111) + + idx = date_range('2012-6-22 21:59:51', freq='S', periods=100) + df = DataFrame(np.random.randn(len(idx), 2), idx) + + irreg = df.ix[[0, 1, 3, 4]] + ax = irreg.plot() + diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() + + sec = 1. / 24 / 60 / 60 + self.assertTrue((np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all()) + + plt.clf() + fig.add_subplot(111) + df2 = df.copy() + df2.index = df.index.asobject + ax = df2.plot() + diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() + self.assertTrue((np.fabs(diffs[1:] - sec) < 1e-8).all()) + + def test_irregular_datetime64_repr_bug(self): + import matplotlib.pyplot as plt + ser = tm.makeTimeSeries() + ser = ser[[0, 1, 2, 7]] + + fig = plt.gcf() + plt.clf() + ax = fig.add_subplot(211) + ret = ser.plot() + self.assertIsNotNone(ret) + + for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index): + self.assertEqual(rs, xp) + + def test_business_freq(self): + import matplotlib.pyplot as plt + bts = tm.makePeriodSeries() + ax = bts.plot() + self.assertEqual(ax.get_lines()[0].get_xydata()[0, 0], + bts.index[0].ordinal) + idx = ax.get_lines()[0].get_xdata() + self.assertEqual(PeriodIndex(data=idx).freqstr, 'B') + + @slow + def test_business_freq_convert(self): + n = tm.N + tm.N = 300 + bts = tm.makeTimeSeries().asfreq('BM') + tm.N = n + ts = bts.to_period('M') + ax = bts.plot() + self.assertEqual(ax.get_lines()[0].get_xydata()[0, 0], + ts.index[0].ordinal) + idx = ax.get_lines()[0].get_xdata() + self.assertEqual(PeriodIndex(data=idx).freqstr, 'M') + + def test_nonzero_base(self): + # GH2571 + idx = (date_range('2012-12-20', periods=24, freq='H') + + timedelta(minutes=30)) + df = DataFrame(np.arange(24), index=idx) + ax = df.plot() + rs = ax.get_lines()[0].get_xdata() + self.assertFalse(Index(rs).is_normalized) + + def test_dataframe(self): + bts = DataFrame({'a': tm.makeTimeSeries()}) + ax = bts.plot() + idx = ax.get_lines()[0].get_xdata() + assert_array_equal(bts.index.to_period(), idx) + + @slow + def test_axis_limits(self): + import matplotlib.pyplot as plt + + def _test(ax): + xlim = ax.get_xlim() + ax.set_xlim(xlim[0] - 5, xlim[1] + 10) + ax.get_figure().canvas.draw() + result = ax.get_xlim() + self.assertEqual(result[0], xlim[0] - 5) + self.assertEqual(result[1], xlim[1] + 10) + + # string + expected = (Period('1/1/2000', ax.freq), + Period('4/1/2000', ax.freq)) + ax.set_xlim('1/1/2000', '4/1/2000') + ax.get_figure().canvas.draw() + result = ax.get_xlim() + self.assertEqual(int(result[0]), expected[0].ordinal) + self.assertEqual(int(result[1]), expected[1].ordinal) + + # datetim + expected = (Period('1/1/2000', ax.freq), + Period('4/1/2000', ax.freq)) + ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1)) + ax.get_figure().canvas.draw() + result = ax.get_xlim() + self.assertEqual(int(result[0]), expected[0].ordinal) + self.assertEqual(int(result[1]), expected[1].ordinal) + fig = ax.get_figure() + plt.close(fig) + + ser = tm.makeTimeSeries() + ax = ser.plot() + _test(ax) + + df = DataFrame({'a': ser, 'b': ser + 1}) + ax = df.plot() + _test(ax) + + df = DataFrame({'a': ser, 'b': ser + 1}) + axes = df.plot(subplots=True) + + for ax in axes: + _test(ax) + + def test_get_finder(self): + import pandas.tseries.converter as conv + + self.assertEqual(conv.get_finder('B'), conv._daily_finder) + self.assertEqual(conv.get_finder('D'), conv._daily_finder) + 
self.assertEqual(conv.get_finder('M'), conv._monthly_finder) + self.assertEqual(conv.get_finder('Q'), conv._quarterly_finder) + self.assertEqual(conv.get_finder('A'), conv._annual_finder) + self.assertEqual(conv.get_finder('W'), conv._daily_finder) + + @slow + def test_finder_daily(self): + import matplotlib.pyplot as plt + xp = Period('1999-1-1', freq='B').ordinal + day_lst = [10, 40, 252, 400, 950, 2750, 10000] + for n in day_lst: + rng = bdate_range('1999-1-1', periods=n) + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + vmin, vmax = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + plt.close(ax.get_figure()) + + @slow + def test_finder_quarterly(self): + import matplotlib.pyplot as plt + xp = Period('1988Q1').ordinal + yrs = [3.5, 11] + for n in yrs: + rng = period_range('1987Q2', periods=int(n * 4), freq='Q') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(rs, xp) + (vmin, vmax) = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + plt.close(ax.get_figure()) + + @slow + def test_finder_monthly(self): + import matplotlib.pyplot as plt + xp = Period('Jan 1988').ordinal + yrs = [1.15, 2.5, 4, 11] + for n in yrs: + rng = period_range('1987Q2', periods=int(n * 12), freq='M') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(rs, xp) + vmin, vmax = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + plt.close(ax.get_figure()) + + def test_finder_monthly_long(self): + rng = period_range('1988Q1', periods=24 * 12, freq='M') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period('1989Q1', 'M').ordinal + self.assertEqual(rs, xp) + + @slow + def test_finder_annual(self): + import matplotlib.pyplot as plt + xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] + for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): + rng = period_range('1987', periods=nyears, freq='A') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(rs, Period(xp[i], freq='A').ordinal) + plt.close(ax.get_figure()) + + @slow + def test_finder_minutely(self): + nminutes = 50 * 24 * 60 + rng = date_range('1/1/1999', freq='Min', periods=nminutes) + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period('1/1/1999', freq='Min').ordinal + self.assertEqual(rs, xp) + + def test_finder_hourly(self): + nhours = 23 + rng = date_range('1/1/1999', freq='H', periods=nhours) + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period('1/1/1999', freq='H').ordinal + self.assertEqual(rs, xp) + + @slow + def test_gaps(self): + import matplotlib.pyplot as plt + + ts = tm.makeTimeSeries() + ts[5:25] = np.nan + ax = ts.plot() + lines = ax.get_lines() + self.assertEqual(len(lines), 1) + l = lines[0] + data = l.get_xydata() + tm.assert_isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + self.assertTrue(mask[5:25, 1].all()) + 
plt.close(ax.get_figure()) + + # irregular + ts = tm.makeTimeSeries() + ts = ts[[0, 1, 2, 5, 7, 9, 12, 15, 20]] + ts[2:5] = np.nan + ax = ts.plot() + lines = ax.get_lines() + self.assertEqual(len(lines), 1) + l = lines[0] + data = l.get_xydata() + tm.assert_isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + self.assertTrue(mask[2:5, 1].all()) + plt.close(ax.get_figure()) + + # non-ts + idx = [0, 1, 2, 5, 7, 9, 12, 15, 20] + ser = Series(np.random.randn(len(idx)), idx) + ser[2:5] = np.nan + ax = ser.plot() + lines = ax.get_lines() + self.assertEqual(len(lines), 1) + l = lines[0] + data = l.get_xydata() + tm.assert_isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + self.assertTrue(mask[2:5, 1].all()) + + @slow + def test_gap_upsample(self): + low = tm.makeTimeSeries() + low[5:25] = np.nan + ax = low.plot() + + idxh = date_range(low.index[0], low.index[-1], freq='12h') + s = Series(np.random.randn(len(idxh)), idxh) + s.plot(secondary_y=True) + lines = ax.get_lines() + self.assertEqual(len(lines), 1) + self.assertEqual(len(ax.right_ax.get_lines()), 1) + l = lines[0] + data = l.get_xydata() + tm.assert_isinstance(data, np.ma.core.MaskedArray) + mask = data.mask + self.assertTrue(mask[5:25, 1].all()) + + @slow + def test_secondary_y(self): + import matplotlib.pyplot as plt + + ser = Series(np.random.randn(10)) + ser2 = Series(np.random.randn(10)) + ax = ser.plot(secondary_y=True).right_ax + fig = ax.get_figure() + axes = fig.get_axes() + l = ax.get_lines()[0] + xp = Series(l.get_ydata(), l.get_xdata()) + assert_series_equal(ser, xp) + self.assertEqual(ax.get_yaxis().get_ticks_position(), 'right') + self.assertFalse(axes[0].get_yaxis().get_visible()) + plt.close(fig) + + ax2 = ser2.plot() + self.assertEqual(ax2.get_yaxis().get_ticks_position(), 'default') + plt.close(ax2.get_figure()) + + ax = ser2.plot() + ax2 = ser.plot(secondary_y=True).right_ax + self.assertTrue(ax.get_yaxis().get_visible()) + + @slow + def test_secondary_y_ts(self): + import matplotlib.pyplot as plt + idx = date_range('1/1/2000', periods=10) + ser = Series(np.random.randn(10), idx) + ser2 = Series(np.random.randn(10), idx) + ax = ser.plot(secondary_y=True).right_ax + fig = ax.get_figure() + axes = fig.get_axes() + l = ax.get_lines()[0] + xp = Series(l.get_ydata(), l.get_xdata()).to_timestamp() + assert_series_equal(ser, xp) + self.assertEqual(ax.get_yaxis().get_ticks_position(), 'right') + self.assertFalse(axes[0].get_yaxis().get_visible()) + plt.close(fig) + + ax2 = ser2.plot() + self.assertEqual(ax2.get_yaxis().get_ticks_position(), 'default') + plt.close(ax2.get_figure()) + + ax = ser2.plot() + ax2 = ser.plot(secondary_y=True) + self.assertTrue(ax.get_yaxis().get_visible()) + + @slow + def test_secondary_kde(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + + import matplotlib.pyplot as plt + ser = Series(np.random.randn(10)) + ax = ser.plot(secondary_y=True, kind='density').right_ax + fig = ax.get_figure() + axes = fig.get_axes() + self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'right') + + @slow + def test_secondary_bar(self): + ser = Series(np.random.randn(10)) + ax = ser.plot(secondary_y=True, kind='bar') + fig = ax.get_figure() + axes = fig.get_axes() + self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'right') + + @slow + def test_secondary_frame(self): + df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) + axes = df.plot(secondary_y=['a', 'c'], subplots=True) + self.assertEqual(axes[0].get_yaxis().get_ticks_position(), 'right') + 
self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'default') + self.assertEqual(axes[2].get_yaxis().get_ticks_position(), 'right') + + @slow + def test_secondary_bar_frame(self): + df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) + axes = df.plot(kind='bar', secondary_y=['a', 'c'], subplots=True) + self.assertEqual(axes[0].get_yaxis().get_ticks_position(), 'right') + self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'default') + self.assertEqual(axes[2].get_yaxis().get_ticks_position(), 'right') + + def test_mixed_freq_regular_first(self): + import matplotlib.pyplot as plt + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + ax = s1.plot() + ax2 = s2.plot(style='g') + lines = ax2.get_lines() + idx1 = lines[0].get_xdata() + idx2 = lines[1].get_xdata() + self.assertTrue(idx1.equals(s1.index.to_period('B'))) + self.assertTrue(idx2.equals(s2.index.to_period('B'))) + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + self.assertEqual(left, pidx[0].ordinal) + self.assertEqual(right, pidx[-1].ordinal) + + @slow + def test_mixed_freq_irregular_first(self): + import matplotlib.pyplot as plt + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + s2.plot(style='g') + ax = s1.plot() + self.assertFalse(hasattr(ax, 'freq')) + lines = ax.get_lines() + x1 = lines[0].get_xdata() + assert_array_equal(x1, s2.index.asobject.values) + x2 = lines[1].get_xdata() + assert_array_equal(x2, s1.index.asobject.values) + + def test_mixed_freq_hf_first(self): + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assertEqual(PeriodIndex(data=l.get_xdata()).freq, 'D') + + @slow + def test_mixed_freq_alignment(self): + ts_ind = date_range('2012-01-01 13:00', '2012-01-02', freq='H') + ts_data = np.random.randn(12) + + ts = Series(ts_data, index=ts_ind) + ts2 = ts.asfreq('T').interpolate() + + ax = ts.plot() + ts2.plot(style='r') + + self.assertEqual(ax.lines[0].get_xdata()[0], + ax.lines[1].get_xdata()[0]) + + @slow + def test_mixed_freq_lf_first(self): + import matplotlib.pyplot as plt + + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot(legend=True) + ax = high.plot(legend=True) + for l in ax.get_lines(): + self.assertEqual(PeriodIndex(data=l.get_xdata()).freq, 'D') + leg = ax.get_legend() + self.assertEqual(len(leg.texts), 2) + plt.close(ax.get_figure()) + + idxh = date_range('1/1/1999', periods=240, freq='T') + idxl = date_range('1/1/1999', periods=4, freq='H') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assertEqual(PeriodIndex(data=l.get_xdata()).freq, 'T') + + def test_mixed_freq_irreg_period(self): + ts = tm.makeTimeSeries() + irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] + rng = period_range('1/3/2000', periods=30, freq='B') + ps = Series(np.random.randn(len(rng)), rng) + irreg.plot() + ps.plot() + + @slow + def test_to_weekly_resampling(self): + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = 
Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assertTrue(PeriodIndex(data=l.get_xdata()).freq.startswith('W')) + + @slow + def test_from_weekly_resampling(self): + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assertTrue(PeriodIndex(data=l.get_xdata()).freq.startswith('W')) + + @slow + def test_irreg_dtypes(self): + # date + idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] + df = DataFrame(np.random.randn(len(idx), 3), Index(idx, dtype=object)) + _check_plot_works(df.plot) + + # np.datetime64 + idx = date_range('1/1/2000', periods=10) + idx = idx[[0, 2, 5, 9]].asobject + df = DataFrame(np.random.randn(len(idx), 3), idx) + _check_plot_works(df.plot) + + @slow + def test_time(self): + t = datetime(1, 1, 1, 3, 30, 0) + deltas = np.random.randint(1, 20, 3).cumsum() + ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) + df = DataFrame({'a': np.random.randn(len(ts)), + 'b': np.random.randn(len(ts))}, + index=ts) + ax = df.plot() + + # verify tick labels + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + h, m = divmod(m, 60) + xp = l.get_text() + if len(xp) > 0: + rs = time(h, m, s).strftime('%H:%M:%S') + self.assertEqual(xp, rs) + + # change xlim + ax.set_xlim('1:30', '5:00') + + # check tick labels again + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + h, m = divmod(m, 60) + xp = l.get_text() + if len(xp) > 0: + rs = time(h, m, s).strftime('%H:%M:%S') + self.assertEqual(xp, rs) + + @slow + def test_time_musec(self): + t = datetime(1, 1, 1, 3, 30, 0) + deltas = np.random.randint(1, 20, 3).cumsum() + ts = np.array([(t + timedelta(microseconds=int(x))).time() + for x in deltas]) + df = DataFrame({'a': np.random.randn(len(ts)), + 'b': np.random.randn(len(ts))}, + index=ts) + ax = df.plot() + + # verify tick labels + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + us = int((t - int(t)) * 1e6) + h, m = divmod(m, 60) + xp = l.get_text() + if len(xp) > 0: + rs = time(h, m, s).strftime('%H:%M:%S.%f') + self.assertEqual(xp, rs) + + @slow + def test_secondary_upsample(self): + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot(secondary_y=True) + for l in ax.get_lines(): + self.assertEqual(l.get_xdata().freq, 'D') + for l in ax.right_ax.get_lines(): + self.assertEqual(l.get_xdata().freq, 'D') + + @slow + def test_secondary_legend(self): + import matplotlib.pyplot as plt + fig = plt.gcf() + plt.clf() + ax = fig.add_subplot(211) + + # ts + df = tm.makeTimeDataFrame() + ax = df.plot(secondary_y=['A', 'B']) + leg = ax.get_legend() + self.assertEqual(len(leg.get_lines()), 4) + self.assertEqual(leg.get_texts()[0].get_text(), 'A (right)') + self.assertEqual(leg.get_texts()[1].get_text(), 'B (right)') + self.assertEqual(leg.get_texts()[2].get_text(), 'C') + self.assertEqual(leg.get_texts()[3].get_text(), 'D') + self.assertIsNone(ax.right_ax.get_legend()) + colors = set() + for line in leg.get_lines(): + 
colors.add(line.get_color()) + + # TODO: color cycle problems + self.assertEqual(len(colors), 4) + + plt.clf() + ax = fig.add_subplot(211) + ax = df.plot(secondary_y=['A', 'C'], mark_right=False) + leg = ax.get_legend() + self.assertEqual(len(leg.get_lines()), 4) + self.assertEqual(leg.get_texts()[0].get_text(), 'A') + self.assertEqual(leg.get_texts()[1].get_text(), 'B') + self.assertEqual(leg.get_texts()[2].get_text(), 'C') + self.assertEqual(leg.get_texts()[3].get_text(), 'D') + + plt.clf() + ax = df.plot(kind='bar', secondary_y=['A']) + leg = ax.get_legend() + self.assertEqual(leg.get_texts()[0].get_text(), 'A (right)') + self.assertEqual(leg.get_texts()[1].get_text(), 'B') + + plt.clf() + ax = df.plot(kind='bar', secondary_y=['A'], mark_right=False) + leg = ax.get_legend() + self.assertEqual(leg.get_texts()[0].get_text(), 'A') + self.assertEqual(leg.get_texts()[1].get_text(), 'B') + + plt.clf() + ax = fig.add_subplot(211) + df = tm.makeTimeDataFrame() + ax = df.plot(secondary_y=['C', 'D']) + leg = ax.get_legend() + self.assertEqual(len(leg.get_lines()), 4) + self.assertIsNone(ax.right_ax.get_legend()) + colors = set() + for line in leg.get_lines(): + colors.add(line.get_color()) + + # TODO: color cycle problems + self.assertEqual(len(colors), 4) + + # non-ts + df = tm.makeDataFrame() + plt.clf() + ax = fig.add_subplot(211) + ax = df.plot(secondary_y=['A', 'B']) + leg = ax.get_legend() + self.assertEqual(len(leg.get_lines()), 4) + self.assertIsNone(ax.right_ax.get_legend()) + colors = set() + for line in leg.get_lines(): + colors.add(line.get_color()) + + # TODO: color cycle problems + self.assertEqual(len(colors), 4) + + plt.clf() + ax = fig.add_subplot(211) + ax = df.plot(secondary_y=['C', 'D']) + leg = ax.get_legend() + self.assertEqual(len(leg.get_lines()), 4) + self.assertIsNone(ax.right_ax.get_legend()) + colors = set() + for line in leg.get_lines(): + colors.add(line.get_color()) + + # TODO: color cycle problems + self.assertEqual(len(colors), 4) + + def test_format_date_axis(self): + rng = date_range('1/1/2012', periods=12, freq='M') + df = DataFrame(np.random.randn(len(rng), 3), rng) + ax = df.plot() + xaxis = ax.get_xaxis() + for l in xaxis.get_ticklabels(): + if len(l.get_text()) > 0: + self.assertEqual(l.get_rotation(), 30) + + @slow + def test_ax_plot(self): + import matplotlib.pyplot as plt + + x = DatetimeIndex(start='2012-01-02', periods=10, + freq='D') + y = lrange(len(x)) + fig = plt.figure() + ax = fig.add_subplot(111) + lines = ax.plot(x, y, label='Y') + assert_array_equal(DatetimeIndex(lines[0].get_xdata()), x) + + @slow + def test_mpl_nopandas(self): + import matplotlib.pyplot as plt + + dates = [date(2008, 12, 31), date(2009, 1, 31)] + values1 = np.arange(10.0, 11.0, 0.5) + values2 = np.arange(11.0, 12.0, 0.5) + + kw = dict(fmt='-', lw=4) + + plt.close('all') + fig = plt.figure() + ax = fig.add_subplot(111) + ax.plot_date([x.toordinal() for x in dates], values1, **kw) + ax.plot_date([x.toordinal() for x in dates], values2, **kw) + + line1, line2 = ax.get_lines() + assert_array_equal(np.array([x.toordinal() for x in dates]), + line1.get_xydata()[:, 0]) + assert_array_equal(np.array([x.toordinal() for x in dates]), + line2.get_xydata()[:, 0]) + + @slow + def test_irregular_ts_shared_ax_xlim(self): + # GH 2960 + ts = tm.makeTimeSeries()[:20] + ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] + + # plot the left section of the irregular series, then the right section + ax = ts_irregular[:5].plot() + ts_irregular[5:].plot(ax=ax) + + # check that axis 
limits are correct + left, right = ax.get_xlim() + self.assertEqual(left, ts_irregular.index.min().toordinal()) + self.assertEqual(right, ts_irregular.index.max().toordinal()) + + @slow + def test_secondary_y_non_ts_xlim(self): + # GH 3490 - non-timeseries with secondary y + index_1 = [1, 2, 3, 4] + index_2 = [5, 6, 7, 8] + s1 = Series(1, index=index_1) + s2 = Series(2, index=index_2) + + ax = s1.plot() + left_before, right_before = ax.get_xlim() + s2.plot(secondary_y=True, ax=ax) + left_after, right_after = ax.get_xlim() + + self.assertEqual(left_before, left_after) + self.assertTrue(right_before < right_after) + + @slow + def test_secondary_y_regular_ts_xlim(self): + # GH 3490 - regular-timeseries with secondary y + index_1 = date_range(start='2000-01-01', periods=4, freq='D') + index_2 = date_range(start='2000-01-05', periods=4, freq='D') + s1 = Series(1, index=index_1) + s2 = Series(2, index=index_2) + + ax = s1.plot() + left_before, right_before = ax.get_xlim() + s2.plot(secondary_y=True, ax=ax) + left_after, right_after = ax.get_xlim() + + self.assertEqual(left_before, left_after) + self.assertTrue(right_before < right_after) + + @slow + def test_secondary_y_mixed_freq_ts_xlim(self): + # GH 3490 - mixed frequency timeseries with secondary y + rng = date_range('2000-01-01', periods=10000, freq='min') + ts = Series(1, index=rng) + + ax = ts.plot() + left_before, right_before = ax.get_xlim() + ts.resample('D').plot(secondary_y=True, ax=ax) + left_after, right_after = ax.get_xlim() + + # a downsample should not have changed either limit + self.assertEqual(left_before, left_after) + self.assertEqual(right_before, right_after) + + @slow + def test_secondary_y_irregular_ts_xlim(self): + # GH 3490 - irregular-timeseries with secondary y + ts = tm.makeTimeSeries()[:20] + ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] + + ax = ts_irregular[:5].plot() + # plot higher-x values on secondary axis + ts_irregular[5:].plot(secondary_y=True, ax=ax) + # ensure secondary limits aren't overwritten by plot on primary + ts_irregular[:5].plot(ax=ax) + + left, right = ax.get_xlim() + self.assertEqual(left, ts_irregular.index.min().toordinal()) + self.assertEqual(right, ts_irregular.index.max().toordinal()) + + +def _check_plot_works(f, freq=None, series=None, *args, **kwargs): + import matplotlib.pyplot as plt + + fig = plt.gcf() + + try: + plt.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop('ax', plt.gca()) + orig_axfreq = getattr(orig_ax, 'freq', None) + + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + + ax = kwargs.pop('ax', plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, DateOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert ax.freq == dfreq + + if freq is not None and orig_axfreq is None: + assert ax.freq == freq + + ax = fig.add_subplot(212) + try: + kwargs['ax'] = ax + ret = f(*args, **kwargs) + assert ret is not None # do something more intelligent + except Exception: + pass + + with ensure_clean(return_filelike=True) as path: + plt.savefig(path) + finally: + plt.close(fig) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py new file mode 100644 index 00000000..ff8b6945 --- /dev/null +++ b/pandas/tseries/tests/test_resample.py @@ -0,0 +1,1399 @@ +# pylint: disable=E1101 + +from datetime import datetime, timedelta + +from 
pandas.compat import range, lrange, zip, product +import numpy as np + +from pandas import (Series, TimeSeries, DataFrame, Panel, Index, + isnull, notnull, Timestamp) + +from pandas.tseries.index import date_range +from pandas.tseries.offsets import Minute, BDay +from pandas.tseries.period import period_range, PeriodIndex, Period +from pandas.tseries.resample import DatetimeIndex, TimeGrouper +from pandas.tseries.frequencies import MONTHS, DAYS + +import pandas.tseries.offsets as offsets +import pandas as pd + +import nose + +from pandas.util.testing import (assert_series_equal, assert_almost_equal, + assert_frame_equal) +import pandas.util.testing as tm + +bday = BDay() + + +class TestResample(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + dti = DatetimeIndex(start=datetime(2005, 1, 1), + end=datetime(2005, 1, 10), freq='Min') + + self.series = Series(np.random.rand(len(dti)), dti) + + def test_custom_grouper(self): + + dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1), + end=datetime(2005, 1, 10)) + + s = Series(np.array([1] * len(dti)), index=dti, dtype='int64') + + b = TimeGrouper(Minute(5)) + g = s.groupby(b) + + # check all cython functions work + funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + for f in funcs: + g._cython_agg_general(f) + + b = TimeGrouper(Minute(5), closed='right', label='right') + g = s.groupby(b) + # check all cython functions work + funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + for f in funcs: + g._cython_agg_general(f) + + self.assertEqual(g.ngroups, 2593) + self.assertTrue(notnull(g.mean()).all()) + + # construct expected val + arr = [1] + [5] * 2592 + idx = dti[0:-1:5] + idx = idx.append(dti[-1:]) + expect = Series(arr, index=idx) + + # GH2763 - return in put dtype if we can + result = g.agg(np.sum) + assert_series_equal(result, expect) + + df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype='float64') + r = df.groupby(b).agg(np.sum) + + self.assertEqual(len(r.columns), 10) + self.assertEqual(len(r.index), 2593) + + def test_resample_basic(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min', + name='index') + s = Series(np.random.randn(14), index=rng) + result = s.resample('5min', how='mean', closed='right', label='right') + expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=date_range('1/1/2000', periods=4, freq='5min')) + assert_series_equal(result, expected) + self.assertEqual(result.index.name, 'index') + + result = s.resample('5min', how='mean', closed='left', label='right') + expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()], + index=date_range('1/1/2000 00:05', periods=3, + freq='5min')) + assert_series_equal(result, expected) + + s = self.series + result = s.resample('5Min', how='last') + grouper = TimeGrouper(Minute(5), closed='left', label='left') + expect = s.groupby(grouper).agg(lambda x: x[-1]) + assert_series_equal(result, expect) + + def test_resample_how(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', + freq='min', name='index') + s = Series(np.random.randn(14), index=rng) + grouplist = np.ones_like(s) + grouplist[0] = 0 + grouplist[1:6] = 1 + grouplist[6:11] = 2 + grouplist[11:] = 3 + args = ['sum', 'mean', 'std', 'sem', 'max', 'min', + 'median', 'first', 'last', 'ohlc'] + + def _ohlc(group): + if isnull(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + inds = date_range('1/1/2000', periods=4, freq='5min') + + for arg in args: + 
if arg == 'ohlc': + func = _ohlc + else: + func = arg + try: + result = s.resample('5min', how=arg, + closed='right', label='right') + + expected = s.groupby(grouplist).agg(func) + self.assertEqual(result.index.name, 'index') + if arg == 'ohlc': + expected = DataFrame(expected.values.tolist()) + expected.columns = ['open', 'high', 'low', 'close'] + expected.index = Index(inds, name='index') + assert_frame_equal(result, expected) + else: + expected.index = inds + assert_series_equal(result, expected) + except BaseException as exc: + + exc.args += ('how=%s' % arg,) + raise + + def test_resample_basic_from_daily(self): + # from daily + dti = DatetimeIndex( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), + freq='D', name='index') + + s = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = s.resample('w-sun', how='last') + + self.assertEqual(len(result), 3) + self.assertTrue((result.index.dayofweek == [6, 6, 6]).all()) + self.assertEqual(result.irow(0), s['1/2/2005']) + self.assertEqual(result.irow(1), s['1/9/2005']) + self.assertEqual(result.irow(2), s.irow(-1)) + + result = s.resample('W-MON', how='last') + self.assertEqual(len(result), 2) + self.assertTrue((result.index.dayofweek == [0, 0]).all()) + self.assertEqual(result.irow(0), s['1/3/2005']) + self.assertEqual(result.irow(1), s['1/10/2005']) + + result = s.resample('W-TUE', how='last') + self.assertEqual(len(result), 2) + self.assertTrue((result.index.dayofweek == [1, 1]).all()) + self.assertEqual(result.irow(0), s['1/4/2005']) + self.assertEqual(result.irow(1), s['1/10/2005']) + + result = s.resample('W-WED', how='last') + self.assertEqual(len(result), 2) + self.assertTrue((result.index.dayofweek == [2, 2]).all()) + self.assertEqual(result.irow(0), s['1/5/2005']) + self.assertEqual(result.irow(1), s['1/10/2005']) + + result = s.resample('W-THU', how='last') + self.assertEqual(len(result), 2) + self.assertTrue((result.index.dayofweek == [3, 3]).all()) + self.assertEqual(result.irow(0), s['1/6/2005']) + self.assertEqual(result.irow(1), s['1/10/2005']) + + result = s.resample('W-FRI', how='last') + self.assertEqual(len(result), 2) + self.assertTrue((result.index.dayofweek == [4, 4]).all()) + self.assertEqual(result.irow(0), s['1/7/2005']) + self.assertEqual(result.irow(1), s['1/10/2005']) + + # to biz day + result = s.resample('B', how='last') + self.assertEqual(len(result), 7) + self.assertTrue((result.index.dayofweek == [4, 0, 1, 2, 3, 4, 0]).all()) + self.assertEqual(result.irow(0), s['1/2/2005']) + self.assertEqual(result.irow(1), s['1/3/2005']) + self.assertEqual(result.irow(5), s['1/9/2005']) + self.assertEqual(result.index.name, 'index') + + def test_resample_upsampling_picked_but_not_correct(self): + + # Test for issue #3020 + dates = date_range('01-Jan-2014','05-Jan-2014', freq='D') + series = Series(1, index=dates) + + result = series.resample('D') + self.assertEqual(result.index[0], dates[0]) + + # GH 5955 + # incorrect deciding to upsample when the axis frequency matches the resample frequency + + import datetime + s = Series(np.arange(1.,6),index=[datetime.datetime(1975, 1, i, 12, 0) for i in range(1, 6)]) + expected = Series(np.arange(1.,6),index=date_range('19750101',periods=5,freq='D')) + + result = s.resample('D',how='count') + assert_series_equal(result,Series(1,index=expected.index)) + + result1 = s.resample('D',how='sum') + result2 = s.resample('D',how='mean') + result3 = s.resample('D') + assert_series_equal(result1,expected) + assert_series_equal(result2,expected) + 
assert_series_equal(result3,expected) + + def test_resample_frame_basic(self): + df = tm.makeTimeDataFrame() + + b = TimeGrouper('M') + g = df.groupby(b) + + # check all cython functions work + funcs = ['add', 'mean', 'prod', 'min', 'max', 'var'] + for f in funcs: + g._cython_agg_general(f) + + result = df.resample('A') + assert_series_equal(result['A'], df['A'].resample('A')) + + result = df.resample('M') + assert_series_equal(result['A'], df['A'].resample('M')) + + df.resample('M', kind='period') + df.resample('W-WED', kind='period') + + def test_resample_loffset(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') + s = Series(np.random.randn(14), index=rng) + + result = s.resample('5min', how='mean', closed='right', label='right', + loffset=timedelta(minutes=1)) + idx = date_range('1/1/2000', periods=4, freq='5min') + expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=idx + timedelta(minutes=1)) + assert_series_equal(result, expected) + + expected = s.resample( + '5min', how='mean', closed='right', label='right', + loffset='1min') + assert_series_equal(result, expected) + + expected = s.resample( + '5min', how='mean', closed='right', label='right', + loffset=Minute(1)) + assert_series_equal(result, expected) + + self.assertEqual(result.index.freq, Minute(5)) + + # from daily + dti = DatetimeIndex( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), + freq='D') + ser = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = ser.resample('w-sun', how='last') + expected = ser.resample('w-sun', how='last', loffset=-bday) + self.assertEqual(result.index[0] - bday, expected.index[0]) + + def test_resample_upsample(self): + # from daily + dti = DatetimeIndex( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), + freq='D', name='index') + + s = Series(np.random.rand(len(dti)), dti) + + # to minutely, by padding + result = s.resample('Min', fill_method='pad') + self.assertEqual(len(result), 12961) + self.assertEqual(result[0], s[0]) + self.assertEqual(result[-1], s[-1]) + + self.assertEqual(result.index.name, 'index') + + def test_upsample_with_limit(self): + rng = date_range('1/1/2000', periods=3, freq='5t') + ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample('t', fill_method='ffill', limit=2) + expected = ts.reindex(result.index, method='ffill', limit=2) + assert_series_equal(result, expected) + + def test_resample_ohlc(self): + s = self.series + + grouper = TimeGrouper(Minute(5)) + expect = s.groupby(grouper).agg(lambda x: x[-1]) + result = s.resample('5Min', how='ohlc') + + self.assertEqual(len(result), len(expect)) + self.assertEqual(len(result.columns), 4) + + xs = result.irow(-2) + self.assertEqual(xs['open'], s[-6]) + self.assertEqual(xs['high'], s[-6:-1].max()) + self.assertEqual(xs['low'], s[-6:-1].min()) + self.assertEqual(xs['close'], s[-2]) + + xs = result.irow(0) + self.assertEqual(xs['open'], s[0]) + self.assertEqual(xs['high'], s[:5].max()) + self.assertEqual(xs['low'], s[:5].min()) + self.assertEqual(xs['close'], s[4]) + + def test_resample_ohlc_dataframe(self): + df = (pd.DataFrame({'PRICE': {Timestamp('2011-01-06 10:59:05', tz=None): 24990, + Timestamp('2011-01-06 12:43:33', tz=None): 25499, + Timestamp('2011-01-06 12:54:09', tz=None): 25499}, + 'VOLUME': {Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, + Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, + Timestamp('2011-01-06 12:54:09', tz=None): 100000000}}) + ).reindex_axis(['VOLUME', 'PRICE'], axis=1) + res = 
df.resample('H', how='ohlc') + exp = pd.concat([df['VOLUME'].resample('H', how='ohlc'), + df['PRICE'].resample('H', how='ohlc')], + axis=1, + keys=['VOLUME', 'PRICE']) + assert_frame_equal(exp, res) + + df.columns = [['a', 'b'], ['c', 'd']] + res = df.resample('H', how='ohlc') + exp.columns = pd.MultiIndex.from_tuples([('a', 'c', 'open'), ('a', 'c', 'high'), + ('a', 'c', 'low'), ('a', 'c', 'close'), ('b', 'd', 'open'), + ('b', 'd', 'high'), ('b', 'd', 'low'), ('b', 'd', 'close')]) + assert_frame_equal(exp, res) + + # dupe columns fail atm + # df.columns = ['PRICE', 'PRICE'] + + def test_resample_dup_index(self): + + # GH 4812 + # dup columns with resample raising + df = DataFrame(np.random.randn(4,12),index=[2000,2000,2000,2000],columns=[ Period(year=2000,month=i+1,freq='M') for i in range(12) ]) + df.iloc[3,:] = np.nan + result = df.resample('Q',axis=1) + expected = df.groupby(lambda x: int((x.month-1)/3),axis=1).mean() + expected.columns = [ Period(year=2000,quarter=i+1,freq='Q') for i in range(4) ] + assert_frame_equal(result, expected) + + def test_resample_reresample(self): + dti = DatetimeIndex( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), + freq='D') + s = Series(np.random.rand(len(dti)), dti) + bs = s.resample('B', closed='right', label='right') + result = bs.resample('8H') + self.assertEqual(len(result), 22) + tm.assert_isinstance(result.index.freq, offsets.DateOffset) + self.assertEqual(result.index.freq, offsets.Hour(8)) + + def test_resample_timestamp_to_period(self): + ts = _simple_ts('1/1/1990', '1/1/2000') + + result = ts.resample('A-DEC', kind='period') + expected = ts.resample('A-DEC') + expected.index = period_range('1990', '2000', freq='a-dec') + assert_series_equal(result, expected) + + result = ts.resample('A-JUN', kind='period') + expected = ts.resample('A-JUN') + expected.index = period_range('1990', '2000', freq='a-jun') + assert_series_equal(result, expected) + + result = ts.resample('M', kind='period') + expected = ts.resample('M') + expected.index = period_range('1990-01', '2000-01', freq='M') + assert_series_equal(result, expected) + + result = ts.resample('M', kind='period') + expected = ts.resample('M') + expected.index = period_range('1990-01', '2000-01', freq='M') + assert_series_equal(result, expected) + + def test_ohlc_5min(self): + def _ohlc(group): + if isnull(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50', + freq='10s') + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample('5min', how='ohlc', closed='right', + label='right') + + self.assertTrue((resampled.ix['1/1/2000 00:00'] == ts[0]).all()) + + exp = _ohlc(ts[1:31]) + self.assertTrue((resampled.ix['1/1/2000 00:05'] == exp).all()) + + exp = _ohlc(ts['1/1/2000 5:55:01':]) + self.assertTrue((resampled.ix['1/1/2000 6:00:00'] == exp).all()) + + def test_downsample_non_unique(self): + rng = date_range('1/1/2000', '2/29/2000') + rng2 = rng.repeat(5).values + ts = Series(np.random.randn(len(rng2)), index=rng2) + + result = ts.resample('M', how='mean') + + expected = ts.groupby(lambda x: x.month).mean() + self.assertEqual(len(result), 2) + assert_almost_equal(result[0], expected[1]) + assert_almost_equal(result[1], expected[2]) + + def test_asfreq_non_unique(self): + # GH #1077 + rng = date_range('1/1/2000', '2/29/2000') + rng2 = rng.repeat(2).values + ts = Series(np.random.randn(len(rng2)), index=rng2) + + self.assertRaises(Exception, ts.asfreq, 'B') + + def 
test_resample_axis1(self): + rng = date_range('1/1/2000', '2/29/2000') + df = DataFrame(np.random.randn(3, len(rng)), columns=rng, + index=['a', 'b', 'c']) + + result = df.resample('M', axis=1) + expected = df.T.resample('M').T + tm.assert_frame_equal(result, expected) + + def test_resample_panel(self): + rng = date_range('1/1/2000', '6/30/2000') + n = len(rng) + + panel = Panel(np.random.randn(3, n, 5), + items=['one', 'two', 'three'], + major_axis=rng, + minor_axis=['a', 'b', 'c', 'd', 'e']) + + result = panel.resample('M', axis=1) + + def p_apply(panel, f): + result = {} + for item in panel.items: + result[item] = f(panel[item]) + return Panel(result, items=panel.items) + + expected = p_apply(panel, lambda x: x.resample('M')) + tm.assert_panel_equal(result, expected) + + panel2 = panel.swapaxes(1, 2) + result = panel2.resample('M', axis=2) + expected = p_apply(panel2, lambda x: x.resample('M', axis=1)) + tm.assert_panel_equal(result, expected) + + def test_resample_panel_numpy(self): + rng = date_range('1/1/2000', '6/30/2000') + n = len(rng) + + panel = Panel(np.random.randn(3, n, 5), + items=['one', 'two', 'three'], + major_axis=rng, + minor_axis=['a', 'b', 'c', 'd', 'e']) + + result = panel.resample('M', how=lambda x: x.mean(1), axis=1) + expected = panel.resample('M', how='mean', axis=1) + tm.assert_panel_equal(result, expected) + + panel = panel.swapaxes(1, 2) + result = panel.resample('M', how=lambda x: x.mean(2), axis=2) + expected = panel.resample('M', how='mean', axis=2) + tm.assert_panel_equal(result, expected) + + def test_resample_anchored_ticks(self): + # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should + # "anchor" the origin at midnight so we get regular intervals rather + # than starting from the first timestamp which might start in the middle + # of a desired interval + + rng = date_range('1/1/2000 04:00:00', periods=86400, freq='s') + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + freqs = ['t', '5t', '15t', '30t', '4h', '12h'] + for freq in freqs: + result = ts[2:].resample(freq, closed='left', label='left') + expected = ts.resample(freq, closed='left', label='left') + assert_series_equal(result, expected) + + def test_resample_single_group(self): + mysum = lambda x: x.sum() + + rng = date_range('2000-1-1', '2000-2-10', freq='D') + ts = Series(np.random.randn(len(rng)), index=rng) + assert_series_equal(ts.resample('M', how='sum'), + ts.resample('M', how=mysum)) + + rng = date_range('2000-1-1', '2000-1-10', freq='D') + ts = Series(np.random.randn(len(rng)), index=rng) + assert_series_equal(ts.resample('M', how='sum'), + ts.resample('M', how=mysum)) + + # GH 3849 + s = Series([30.1, 31.6], index=[Timestamp('20070915 15:30:00'), + Timestamp('20070915 15:40:00')]) + expected = Series([0.75], index=[Timestamp('20070915')]) + result = s.resample('D', how=lambda x: np.std(x)) + assert_series_equal(result, expected) + + def test_resample_base(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s') + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample('5min', base=2) + exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57', + freq='5min') + self.assertTrue(resampled.index.equals(exp_rng)) + + def test_resample_daily_anchored(self): + rng = date_range('1/1/2000 0:00:00', periods=10000, freq='T') + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + result = ts[2:].resample('D', closed='left', label='left') + 
expected = ts.resample('D', closed='left', label='left') + assert_series_equal(result, expected) + + def test_resample_to_period_monthly_buglet(self): + # GH #1259 + + rng = date_range('1/1/2000', '12/31/2000') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('M', kind='period') + exp_index = period_range('Jan-2000', 'Dec-2000', freq='M') + self.assertTrue(result.index.equals(exp_index)) + + def test_resample_empty(self): + ts = _simple_ts('1/1/2000', '2/1/2000')[:0] + + result = ts.resample('A') + self.assertEqual(len(result), 0) + self.assertEqual(result.index.freqstr, 'A-DEC') + + result = ts.resample('A', kind='period') + self.assertEqual(len(result), 0) + self.assertEqual(result.index.freqstr, 'A-DEC') + + xp = DataFrame() + rs = xp.resample('A') + assert_frame_equal(xp, rs) + + def test_weekly_resample_buglet(self): + # #1327 + rng = date_range('1/1/2000', freq='B', periods=20) + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample('W') + expected = ts.resample('W-SUN') + assert_series_equal(resampled, expected) + + def test_monthly_resample_error(self): + # #1451 + dates = date_range('4/16/2012 20:00', periods=5000, freq='h') + ts = Series(np.random.randn(len(dates)), index=dates) + # it works! + result = ts.resample('M') + + def test_resample_anchored_intraday(self): + # #1471, #1458 + + rng = date_range('1/1/2012', '4/1/2012', freq='100min') + df = DataFrame(rng.month, index=rng) + + result = df.resample('M') + expected = df.resample('M', kind='period').to_timestamp(how='end') + tm.assert_frame_equal(result, expected) + + result = df.resample('M', closed='left') + exp = df.tshift(1, freq='D').resample('M', kind='period') + exp = exp.to_timestamp(how='end') + + tm.assert_frame_equal(result, exp) + + rng = date_range('1/1/2012', '4/1/2012', freq='100min') + df = DataFrame(rng.month, index=rng) + + result = df.resample('Q') + expected = df.resample('Q', kind='period').to_timestamp(how='end') + tm.assert_frame_equal(result, expected) + + result = df.resample('Q', closed='left') + expected = df.tshift(1, freq='D').resample('Q', kind='period', + closed='left') + expected = expected.to_timestamp(how='end') + tm.assert_frame_equal(result, expected) + + ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h') + resampled = ts.resample('M') + self.assertEqual(len(resampled), 1) + + def test_resample_anchored_monthstart(self): + ts = _simple_ts('1/1/2000', '12/31/2002') + + freqs = ['MS', 'BMS', 'QS-MAR', 'AS-DEC', 'AS-JUN'] + + for freq in freqs: + result = ts.resample(freq, how='mean') + + def test_corner_cases(self): + # miscellaneous test coverage + + rng = date_range('1/1/2000', periods=12, freq='t') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('5t', closed='right', label='left') + ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t') + self.assertTrue(result.index.equals(ex_index)) + + len0pts = _simple_pts('2007-01', '2010-05', freq='M')[:0] + # it works + result = len0pts.resample('A-DEC') + self.assertEqual(len(result), 0) + + # resample to periods + ts = _simple_ts('2000-04-28', '2000-04-30 11:00', freq='h') + result = ts.resample('M', kind='period') + self.assertEqual(len(result), 1) + self.assertEqual(result.index[0], Period('2000-04', freq='M')) + + def test_anchored_lowercase_buglet(self): + dates = date_range('4/16/2012 20:00', periods=50000, freq='s') + ts = Series(np.random.randn(len(dates)), index=dates) + # it works! 
+ ts.resample('d') + + def test_upsample_apply_functions(self): + # #1596 + rng = pd.date_range('2012-06-12', periods=4, freq='h') + + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('20min', how=['mean', 'sum']) + tm.assert_isinstance(result, DataFrame) + + def test_resample_not_monotonic(self): + rng = pd.date_range('2012-06-12', periods=200, freq='h') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts = ts.take(np.random.permutation(len(ts))) + + result = ts.resample('D', how='sum') + exp = ts.sort_index().resample('D', how='sum') + assert_series_equal(result, exp) + + def test_resample_median_bug_1688(self): + + for dtype in ['int64','int32','float64','float32']: + df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0), + datetime(2012, 1, 1, 0, 5, 0)], + dtype = dtype) + + result = df.resample("T", how=lambda x: x.mean()) + exp = df.asfreq('T') + tm.assert_frame_equal(result, exp) + + result = df.resample("T", how="median") + exp = df.asfreq('T') + tm.assert_frame_equal(result, exp) + + def test_how_lambda_functions(self): + + ts = _simple_ts('1/1/2000', '4/1/2000') + + result = ts.resample('M', how=lambda x: x.mean()) + exp = ts.resample('M', how='mean') + tm.assert_series_equal(result, exp) + + self.assertRaises(Exception, ts.resample, 'M', + how=[lambda x: x.mean(), lambda x: x.std(ddof=1)]) + + result = ts.resample('M', how={'foo': lambda x: x.mean(), + 'bar': lambda x: x.std(ddof=1)}) + foo_exp = ts.resample('M', how='mean') + bar_exp = ts.resample('M', how='std') + + tm.assert_series_equal(result['foo'], foo_exp) + tm.assert_series_equal(result['bar'], bar_exp) + + def test_resample_unequal_times(self): + # #1772 + start = datetime(1999, 3, 1, 5) + # end hour is less than start + end = datetime(2012, 7, 31, 4) + bad_ind = date_range(start, end, freq="30min") + df = DataFrame({'close': 1}, index=bad_ind) + + # it works! 
+ df.resample('AS', 'sum') + + def test_resample_consistency(self): + + # GH 6418 + # resample with bfill / limit / reindex consistency + + i30 = index=pd.date_range('2002-02-02', periods=4, freq='30T') + s=pd.Series(np.arange(4.), index=i30) + s[2] = np.NaN + + # Upsample by factor 3 with reindex() and resample() methods: + i10 = pd.date_range(i30[0], i30[-1], freq='10T') + + s10 = s.reindex(index=i10, method='bfill') + s10_2 = s.reindex(index=i10, method='bfill', limit=2) + rl = s.reindex_like(s10, method='bfill', limit=2) + r10_2 = s.resample('10Min', fill_method='bfill', limit=2) + r10 = s.resample('10Min', fill_method='bfill') + + # s10_2, r10, r10_2, rl should all be equal + assert_series_equal(s10_2, r10) + assert_series_equal(s10_2, r10_2) + assert_series_equal(s10_2, rl) + + def test_resample_timegrouper(self): + # GH 7227 + dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3), + datetime(2014, 11, 5), datetime(2014, 9, 5), + datetime(2014, 10, 8), datetime(2014, 7, 15)] + + dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:] + dates3 = [pd.NaT] + dates1 + [pd.NaT] + + for dates in [dates1, dates2, dates3]: + df = DataFrame(dict(A=dates, B=np.arange(len(dates)))) + result = df.set_index('A').resample('M', how='count') + exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30', + '2014-10-31', '2014-11-30'], freq='M', name='A') + expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx) + assert_frame_equal(result, expected) + + result = df.groupby(pd.Grouper(freq='M', key='A')).count() + assert_frame_equal(result, expected) + + df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates)))) + result = df.set_index('A').resample('M', how='count') + expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]}, + index=exp_idx, columns=['B', 'C']) + assert_frame_equal(result, expected) + + result = df.groupby(pd.Grouper(freq='M', key='A')).count() + assert_frame_equal(result, expected) + + +def _simple_ts(start, end, freq='D'): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + +def _simple_pts(start, end, freq='D'): + rng = period_range(start, end, freq=freq) + return TimeSeries(np.random.randn(len(rng)), index=rng) + + +class TestResamplePeriodIndex(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_annual_upsample_D_s_f(self): + self._check_annual_upsample_cases('D', 'start', 'ffill') + + def test_annual_upsample_D_e_f(self): + self._check_annual_upsample_cases('D', 'end', 'ffill') + + def test_annual_upsample_D_s_b(self): + self._check_annual_upsample_cases('D', 'start', 'bfill') + + def test_annual_upsample_D_e_b(self): + self._check_annual_upsample_cases('D', 'end', 'bfill') + + def test_annual_upsample_B_s_f(self): + self._check_annual_upsample_cases('B', 'start', 'ffill') + + def test_annual_upsample_B_e_f(self): + self._check_annual_upsample_cases('B', 'end', 'ffill') + + def test_annual_upsample_B_s_b(self): + self._check_annual_upsample_cases('B', 'start', 'bfill') + + def test_annual_upsample_B_e_b(self): + self._check_annual_upsample_cases('B', 'end', 'bfill') + + def test_annual_upsample_M_s_f(self): + self._check_annual_upsample_cases('M', 'start', 'ffill') + + def test_annual_upsample_M_e_f(self): + self._check_annual_upsample_cases('M', 'end', 'ffill') + + def test_annual_upsample_M_s_b(self): + self._check_annual_upsample_cases('M', 'start', 'bfill') + + def test_annual_upsample_M_e_b(self): + self._check_annual_upsample_cases('M', 'end', 'bfill') + + 
def _check_annual_upsample_cases(self, targ, conv, meth, end='12/31/1991'): + for month in MONTHS: + ts = _simple_pts('1/1/1990', end, freq='A-%s' % month) + + result = ts.resample(targ, fill_method=meth, + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, meth).to_period() + assert_series_equal(result, expected) + + def test_basic_downsample(self): + ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') + result = ts.resample('a-dec') + + expected = ts.groupby(ts.index.year).mean() + expected.index = period_range('1/1/1990', '6/30/1995', + freq='a-dec') + assert_series_equal(result, expected) + + # this is ok + assert_series_equal(ts.resample('a-dec'), result) + assert_series_equal(ts.resample('a'), result) + + def test_not_subperiod(self): + # These are incompatible period rules for resampling + ts = _simple_pts('1/1/1990', '6/30/1995', freq='w-wed') + self.assertRaises(ValueError, ts.resample, 'a-dec') + self.assertRaises(ValueError, ts.resample, 'q-mar') + self.assertRaises(ValueError, ts.resample, 'M') + self.assertRaises(ValueError, ts.resample, 'w-thu') + + def test_basic_upsample(self): + ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') + result = ts.resample('a-dec') + + resampled = result.resample('D', fill_method='ffill', convention='end') + + expected = result.to_timestamp('D', how='end') + expected = expected.asfreq('D', 'ffill').to_period() + + assert_series_equal(resampled, expected) + + def test_upsample_with_limit(self): + rng = period_range('1/1/2000', periods=5, freq='A') + ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample('M', fill_method='ffill', limit=2, + convention='end') + expected = ts.asfreq('M').reindex(result.index, method='ffill', + limit=2) + assert_series_equal(result, expected) + + def test_annual_upsample(self): + ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-DEC') + df = DataFrame({'a': ts}) + rdf = df.resample('D', fill_method='ffill') + exp = df['a'].resample('D', fill_method='ffill') + assert_series_equal(rdf['a'], exp) + + rng = period_range('2000', '2003', freq='A-DEC') + ts = Series([1, 2, 3, 4], index=rng) + + result = ts.resample('M', fill_method='ffill') + ex_index = period_range('2000-01', '2003-12', freq='M') + + expected = ts.asfreq('M', how='start').reindex(ex_index, + method='ffill') + assert_series_equal(result, expected) + + def test_quarterly_upsample(self): + targets = ['D', 'B', 'M'] + + for month in MONTHS: + ts = _simple_pts('1/1/1990', '12/31/1995', freq='Q-%s' % month) + + for targ, conv in product(targets, ['start', 'end']): + result = ts.resample(targ, fill_method='ffill', + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, 'ffill').to_period() + assert_series_equal(result, expected) + + def test_monthly_upsample(self): + targets = ['D', 'B'] + + ts = _simple_pts('1/1/1990', '12/31/1995', freq='M') + + for targ, conv in product(targets, ['start', 'end']): + result = ts.resample(targ, fill_method='ffill', + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, 'ffill').to_period() + assert_series_equal(result, expected) + + def test_fill_method_and_how_upsample(self): + # GH2073 + s = Series(np.arange(9,dtype='int64'), + index=date_range('2010-01-01', periods=9, freq='Q')) + last = s.resample('M', fill_method='ffill') + both = s.resample('M', how='last', fill_method='ffill').astype('int64') + assert_series_equal(last, both) + + def test_weekly_upsample(self): + 
targets = ['D', 'B'] + + for day in DAYS: + ts = _simple_pts('1/1/1990', '12/31/1995', freq='W-%s' % day) + + for targ, conv in product(targets, ['start', 'end']): + result = ts.resample(targ, fill_method='ffill', + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, 'ffill').to_period() + assert_series_equal(result, expected) + + def test_resample_to_timestamps(self): + ts = _simple_pts('1/1/1990', '12/31/1995', freq='M') + + result = ts.resample('A-DEC', kind='timestamp') + expected = ts.to_timestamp(how='end').resample('A-DEC') + assert_series_equal(result, expected) + + def test_resample_to_quarterly(self): + for month in MONTHS: + ts = _simple_pts('1990', '1992', freq='A-%s' % month) + quar_ts = ts.resample('Q-%s' % month, fill_method='ffill') + + stamps = ts.to_timestamp('D', how='start') + qdates = period_range(ts.index[0].asfreq('D', 'start'), + ts.index[-1].asfreq('D', 'end'), + freq='Q-%s' % month) + + expected = stamps.reindex(qdates.to_timestamp('D', 's'), + method='ffill') + expected.index = qdates + + assert_series_equal(quar_ts, expected) + + # conforms, but different month + ts = _simple_pts('1990', '1992', freq='A-JUN') + + for how in ['start', 'end']: + result = ts.resample('Q-MAR', convention=how, fill_method='ffill') + expected = ts.asfreq('Q-MAR', how=how) + expected = expected.reindex(result.index, method='ffill') + + # .to_timestamp('D') + # expected = expected.resample('Q-MAR', fill_method='ffill') + + assert_series_equal(result, expected) + + def test_resample_fill_missing(self): + rng = PeriodIndex([2000, 2005, 2007, 2009], freq='A') + + s = TimeSeries(np.random.randn(4), index=rng) + + stamps = s.to_timestamp() + + filled = s.resample('A') + expected = stamps.resample('A').to_period('A') + assert_series_equal(filled, expected) + + filled = s.resample('A', fill_method='ffill') + expected = stamps.resample('A', fill_method='ffill').to_period('A') + assert_series_equal(filled, expected) + + def test_cant_fill_missing_dups(self): + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq='A') + s = TimeSeries(np.random.randn(5), index=rng) + self.assertRaises(Exception, s.resample, 'A') + + def test_resample_5minute(self): + rng = period_range('1/1/2000', '1/5/2000', freq='T') + ts = TimeSeries(np.random.randn(len(rng)), index=rng) + + result = ts.resample('5min') + expected = ts.to_timestamp().resample('5min') + assert_series_equal(result, expected) + + def test_upsample_daily_business_daily(self): + ts = _simple_pts('1/1/2000', '2/1/2000', freq='B') + + result = ts.resample('D') + expected = ts.asfreq('D').reindex(period_range('1/3/2000', '2/1/2000')) + assert_series_equal(result, expected) + + ts = _simple_pts('1/1/2000', '2/1/2000') + result = ts.resample('H', convention='s') + exp_rng = period_range('1/1/2000', '2/1/2000 23:00', freq='H') + expected = ts.asfreq('H', how='s').reindex(exp_rng) + assert_series_equal(result, expected) + + def test_resample_empty(self): + ts = _simple_pts('1/1/2000', '2/1/2000')[:0] + + result = ts.resample('A') + self.assertEqual(len(result), 0) + + def test_resample_irregular_sparse(self): + dr = date_range(start='1/1/2012', freq='5min', periods=1000) + s = Series(np.array(100), index=dr) + # subset the data. 
+ subset = s[:'2012-01-04 06:55'] + + result = subset.resample('10min', how=len) + expected = s.resample('10min', how=len).ix[result.index] + assert_series_equal(result, expected) + + def test_resample_weekly_all_na(self): + rng = date_range('1/1/2000', periods=10, freq='W-WED') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('W-THU') + + self.assertTrue(result.isnull().all()) + + result = ts.resample('W-THU', fill_method='ffill')[:-1] + expected = ts.asfreq('W-THU', method='ffill') + assert_series_equal(result, expected) + + def test_resample_tz_localized(self): + dr = date_range(start='2012-4-13', end='2012-5-1') + ts = Series(lrange(len(dr)), dr) + + ts_utc = ts.tz_localize('UTC') + ts_local = ts_utc.tz_convert('America/Los_Angeles') + + result = ts_local.resample('W') + + ts_local_naive = ts_local.copy() + ts_local_naive.index = [x.replace(tzinfo=None) + for x in ts_local_naive.index.to_pydatetime()] + + exp = ts_local_naive.resample('W').tz_localize('America/Los_Angeles') + + assert_series_equal(result, exp) + + # it works + result = ts_local.resample('D') + + # #2245 + idx = date_range('2001-09-20 15:59', '2001-09-20 16:00', freq='T', + tz='Australia/Sydney') + s = Series([1, 2], index=idx) + + result = s.resample('D', closed='right', label='right') + ex_index = date_range('2001-09-21', periods=1, freq='D', + tz='Australia/Sydney') + expected = Series([1.5], index=ex_index) + + assert_series_equal(result, expected) + + # for good measure + result = s.resample('D', kind='period') + ex_index = period_range('2001-09-20', periods=1, freq='D') + expected = Series([1.5], index=ex_index) + assert_series_equal(result, expected) + + # GH 6397 + # comparing an offset that doesn't propogate tz's + rng = date_range('1/1/2011', periods=20000, freq='H') + rng = rng.tz_localize('EST') + ts = DataFrame(index=rng) + ts['first']=np.random.randn(len(rng)) + ts['second']=np.cumsum(np.random.randn(len(rng))) + expected = DataFrame({ 'first' : ts.resample('A',how=np.sum)['first'], + 'second' : ts.resample('A',how=np.mean)['second'] },columns=['first','second']) + result = ts.resample('A', how={'first':np.sum, 'second':np.mean}).reindex(columns=['first','second']) + assert_frame_equal(result,expected) + + def test_closed_left_corner(self): + # #1465 + s = Series(np.random.randn(21), + index=date_range(start='1/1/2012 9:30', + freq='1min', periods=21)) + s[0] = np.nan + + result = s.resample('10min', how='mean', closed='left', label='right') + exp = s[1:].resample('10min', how='mean', closed='left', label='right') + assert_series_equal(result, exp) + + result = s.resample('10min', how='mean', closed='left', label='left') + exp = s[1:].resample('10min', how='mean', closed='left', label='left') + + ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3) + + self.assertTrue(result.index.equals(ex_index)) + assert_series_equal(result, exp) + + def test_quarterly_resampling(self): + rng = period_range('2000Q1', periods=10, freq='Q-DEC') + ts = Series(np.arange(10), index=rng) + + result = ts.resample('A') + exp = ts.to_timestamp().resample('A').to_period() + assert_series_equal(result, exp) + + def test_resample_weekly_bug_1726(self): + # 8/6/12 is a Monday + ind = DatetimeIndex(start="8/6/2012", end="8/26/2012", freq="D") + n = len(ind) + data = [[x] * 5 for x in range(n)] + df = DataFrame(data, columns=['open', 'high', 'low', 'close', 'vol'], + index=ind) + + # it works! 
+ df.resample('W-MON', how='first', closed='left', label='left') + + def test_resample_bms_2752(self): + # GH2753 + foo = pd.Series(index=pd.bdate_range('20000101','20000201')) + res1 = foo.resample("BMS") + res2 = foo.resample("BMS").resample("B") + self.assertEqual(res1.index[0], Timestamp('20000103')) + self.assertEqual(res1.index[0], res2.index[0]) + + # def test_monthly_convention_span(self): + # rng = period_range('2000-01', periods=3, freq='M') + # ts = Series(np.arange(3), index=rng) + + # # hacky way to get same thing + # exp_index = period_range('2000-01-01', '2000-03-31', freq='D') + # expected = ts.asfreq('D', how='end').reindex(exp_index) + # expected = expected.fillna(method='bfill') + + # result = ts.resample('D', convention='span') + + # assert_series_equal(result, expected) + + def test_default_right_closed_label(self): + end_freq = ['D', 'Q', 'M', 'D'] + end_types = ['M', 'A', 'Q', 'W'] + + for from_freq, to_freq in zip(end_freq, end_types): + idx = DatetimeIndex(start='8/15/2012', periods=100, + freq=from_freq) + df = DataFrame(np.random.randn(len(idx), 2), idx) + + resampled = df.resample(to_freq) + assert_frame_equal(resampled, df.resample(to_freq, closed='right', + label='right')) + + def test_default_left_closed_label(self): + others = ['MS', 'AS', 'QS', 'D', 'H'] + others_freq = ['D', 'Q', 'M', 'H', 'T'] + + for from_freq, to_freq in zip(others_freq, others): + idx = DatetimeIndex(start='8/15/2012', periods=100, + freq=from_freq) + df = DataFrame(np.random.randn(len(idx), 2), idx) + + resampled = df.resample(to_freq) + assert_frame_equal(resampled, df.resample(to_freq, closed='left', + label='left')) + + def test_all_values_single_bin(self): + # 2070 + index = period_range(start="2012-01-01", end="2012-12-31", freq="M") + s = Series(np.random.randn(len(index)), index=index) + + result = s.resample("A", how='mean') + tm.assert_almost_equal(result[0], s.mean()) + + def test_evenly_divisible_with_no_extra_bins(self): + # 4076 + # when the frequency is evenly divisible, sometimes extra bins + + df = DataFrame(np.random.randn(9, 3), index=date_range('2000-1-1', periods=9)) + result = df.resample('5D') + expected = pd.concat([df.iloc[0:5].mean(),df.iloc[5:].mean()],axis=1).T + expected.index = [Timestamp('2000-1-1'),Timestamp('2000-1-6')] + assert_frame_equal(result,expected) + + index = date_range(start='2001-5-4', periods=28) + df = DataFrame( + [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90, + 'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 + + [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10, + 'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28, + index=index.append(index)).sort() + + index = date_range('2001-5-4',periods=4,freq='7D') + expected = DataFrame( + [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14, + 'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4, + index=index) + result = df.resample('7D', how='count') + assert_frame_equal(result,expected) + + expected = DataFrame( + [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700, + 'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4, + index=index) + result = df.resample('7D', how='sum') + assert_frame_equal(result,expected) + +class TestTimeGrouper(tm.TestCase): + + def setUp(self): + self.ts = Series(np.random.randn(1000), + index=date_range('1/1/2000', periods=1000)) + + def test_apply(self): + grouper = TimeGrouper('A', label='right', closed='right') + + grouped = self.ts.groupby(grouper) + + f = lambda x: x.order()[-3:] + + applied = grouped.apply(f) + expected = 
self.ts.groupby(lambda x: x.year).apply(f) + + applied.index = applied.index.droplevel(0) + expected.index = expected.index.droplevel(0) + assert_series_equal(applied, expected) + + def test_count(self): + self.ts[::3] = np.nan + + grouper = TimeGrouper('A', label='right', closed='right') + result = self.ts.resample('A', how='count') + + expected = self.ts.groupby(lambda x: x.year).count() + expected.index = result.index + + assert_series_equal(result, expected) + + def test_numpy_reduction(self): + result = self.ts.resample('A', how='prod', closed='right') + + expected = self.ts.groupby(lambda x: x.year).agg(np.prod) + expected.index = result.index + + assert_series_equal(result, expected) + + def test_apply_iteration(self): + # #2300 + N = 1000 + ind = pd.date_range(start="2000-01-01", freq="D", periods=N) + df = DataFrame({'open': 1, 'close': 2}, index=ind) + tg = TimeGrouper('M') + + _, grouper, _ = tg._get_grouper(df) + + # Errors + grouped = df.groupby(grouper, group_keys=False) + f = lambda df: df['close'] / df['open'] + + # it works! + result = grouped.apply(f) + self.assertTrue(result.index.equals(df.index)) + + def test_panel_aggregation(self): + ind = pd.date_range('1/1/2000', periods=100) + data = np.random.randn(2, len(ind), 4) + wp = pd.Panel(data, items=['Item1', 'Item2'], major_axis=ind, + minor_axis=['A', 'B', 'C', 'D']) + + tg = TimeGrouper('M', axis=1) + _, grouper, _ = tg._get_grouper(wp) + bingrouped = wp.groupby(grouper) + binagg = bingrouped.mean() + + def f(x): + assert(isinstance(x, Panel)) + return x.mean(1) + result = bingrouped.agg(f) + tm.assert_panel_equal(result, binagg) + + def test_fails_on_no_datetime_index(self): + index_names = ('Int64Index', 'PeriodIndex', 'Index', 'Float64Index', + 'MultiIndex') + index_funcs = (tm.makeIntIndex, tm.makePeriodIndex, + tm.makeUnicodeIndex, tm.makeFloatIndex, + lambda m: tm.makeCustomIndex(m, 2)) + n = 2 + for name, func in zip(index_names, index_funcs): + index = func(n) + df = DataFrame({'a': np.random.randn(n)}, index=index) + with tm.assertRaisesRegexp(TypeError, + "axis must be a DatetimeIndex, " + "but got an instance of %r" % name): + df.groupby(TimeGrouper('D')) + + def test_aggregate_normal(self): + # check TimeGrouper's aggregation is identical as normal groupby + + n = 20 + data = np.random.randn(n, 4) + normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) + normal_df['key'] = [1, 2, 3, 4, 5] * 4 + + dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) + dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), + datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 + + normal_grouped = normal_df.groupby('key') + dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + + for func in ['min', 'max', 'prod', 'var', 'std', 'mean']: + expected = getattr(normal_grouped, func)() + dt_result = getattr(dt_grouped, func)() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + assert_frame_equal(expected, dt_result) + + for func in ['count', 'sum']: + expected = getattr(normal_grouped, func)() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)() + assert_frame_equal(expected, dt_result) + + # GH 7453 + for func in ['size']: + expected = getattr(normal_grouped, func)() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)() + assert_series_equal(expected, dt_result) + + """ + for func in ['first', 'last']: + expected = 
getattr(normal_grouped, func)() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)() + assert_frame_equal(expected, dt_result) + + for func in ['nth']: + expected = getattr(normal_grouped, func)(3) + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)(3) + assert_frame_equal(expected, dt_result) + """ + # if TimeGrouper is used included, 'first','last' and 'nth' doesn't work yet + + def test_aggregate_with_nat(self): + # check TimeGrouper's aggregation is identical as normal groupby + + n = 20 + data = np.random.randn(n, 4) + normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) + normal_df['key'] = [1, 2, np.nan, 4, 5] * 4 + + dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) + dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, + datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 + + normal_grouped = normal_df.groupby('key') + dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + + for func in ['min', 'max', 'prod']: + normal_result = getattr(normal_grouped, func)() + dt_result = getattr(dt_grouped, func)() + pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]], + index=[3], columns=['A', 'B', 'C', 'D']) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + assert_frame_equal(expected, dt_result) + + for func in ['count', 'sum']: + normal_result = getattr(normal_grouped, func)() + pad = DataFrame([[0, 0, 0, 0]], index=[3], columns=['A', 'B', 'C', 'D']) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)() + assert_frame_equal(expected, dt_result) + + for func in ['size']: + normal_result = getattr(normal_grouped, func)() + pad = Series([0], index=[3]) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)() + assert_series_equal(expected, dt_result) + + # if NaT is included, 'var', 'std', 'mean', 'first','last' and 'nth' doesn't work yet + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py new file mode 100644 index 00000000..9d85c599 --- /dev/null +++ b/pandas/tseries/tests/test_timedeltas.py @@ -0,0 +1,425 @@ +# pylint: disable-msg=E1101,W0612 + +from datetime import datetime, timedelta +import nose + +import numpy as np +import pandas as pd + +from pandas import (Index, Series, DataFrame, Timestamp, isnull, notnull, + bdate_range, date_range) +import pandas.core.common as com +from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long +from pandas import compat, to_timedelta, tslib +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type as ct +from pandas.util.testing import (assert_series_equal, + assert_frame_equal, + assert_almost_equal, + ensure_clean, + _skip_if_not_numpy17_friendly) +import pandas.util.testing as tm + +class TestTimedeltas(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + pass + + def test_numeric_conversions(self): + _skip_if_not_numpy17_friendly() + + self.assertEqual(ct(0), 
np.timedelta64(0,'ns')) + self.assertEqual(ct(10), np.timedelta64(10,'ns')) + self.assertEqual(ct(10,unit='ns'), np.timedelta64(10,'ns').astype('m8[ns]')) + + self.assertEqual(ct(10,unit='us'), np.timedelta64(10,'us').astype('m8[ns]')) + self.assertEqual(ct(10,unit='ms'), np.timedelta64(10,'ms').astype('m8[ns]')) + self.assertEqual(ct(10,unit='s'), np.timedelta64(10,'s').astype('m8[ns]')) + self.assertEqual(ct(10,unit='d'), np.timedelta64(10,'D').astype('m8[ns]')) + + def test_timedelta_conversions(self): + _skip_if_not_numpy17_friendly() + + self.assertEqual(ct(timedelta(seconds=1)), np.timedelta64(1,'s').astype('m8[ns]')) + self.assertEqual(ct(timedelta(microseconds=1)), np.timedelta64(1,'us').astype('m8[ns]')) + self.assertEqual(ct(timedelta(days=1)), np.timedelta64(1,'D').astype('m8[ns]')) + + def test_short_format_converters(self): + _skip_if_not_numpy17_friendly() + + def conv(v): + return v.astype('m8[ns]') + + self.assertEqual(ct('10'), np.timedelta64(10,'ns')) + self.assertEqual(ct('10ns'), np.timedelta64(10,'ns')) + self.assertEqual(ct('100'), np.timedelta64(100,'ns')) + self.assertEqual(ct('100ns'), np.timedelta64(100,'ns')) + + self.assertEqual(ct('1000'), np.timedelta64(1000,'ns')) + self.assertEqual(ct('1000ns'), np.timedelta64(1000,'ns')) + self.assertEqual(ct('1000NS'), np.timedelta64(1000,'ns')) + + self.assertEqual(ct('10us'), np.timedelta64(10000,'ns')) + self.assertEqual(ct('100us'), np.timedelta64(100000,'ns')) + self.assertEqual(ct('1000us'), np.timedelta64(1000000,'ns')) + self.assertEqual(ct('1000Us'), np.timedelta64(1000000,'ns')) + self.assertEqual(ct('1000uS'), np.timedelta64(1000000,'ns')) + + self.assertEqual(ct('1ms'), np.timedelta64(1000000,'ns')) + self.assertEqual(ct('10ms'), np.timedelta64(10000000,'ns')) + self.assertEqual(ct('100ms'), np.timedelta64(100000000,'ns')) + self.assertEqual(ct('1000ms'), np.timedelta64(1000000000,'ns')) + + self.assertEqual(ct('-1s'), -np.timedelta64(1000000000,'ns')) + self.assertEqual(ct('1s'), np.timedelta64(1000000000,'ns')) + self.assertEqual(ct('10s'), np.timedelta64(10000000000,'ns')) + self.assertEqual(ct('100s'), np.timedelta64(100000000000,'ns')) + self.assertEqual(ct('1000s'), np.timedelta64(1000000000000,'ns')) + + self.assertEqual(ct('1d'), conv(np.timedelta64(1,'D'))) + self.assertEqual(ct('-1d'), -conv(np.timedelta64(1,'D'))) + self.assertEqual(ct('1D'), conv(np.timedelta64(1,'D'))) + self.assertEqual(ct('10D'), conv(np.timedelta64(10,'D'))) + self.assertEqual(ct('100D'), conv(np.timedelta64(100,'D'))) + self.assertEqual(ct('1000D'), conv(np.timedelta64(1000,'D'))) + self.assertEqual(ct('10000D'), conv(np.timedelta64(10000,'D'))) + + # space + self.assertEqual(ct(' 10000D '), conv(np.timedelta64(10000,'D'))) + self.assertEqual(ct(' - 10000D '), -conv(np.timedelta64(10000,'D'))) + + # invalid + self.assertRaises(ValueError, ct, '1foo') + self.assertRaises(ValueError, ct, 'foo') + + def test_full_format_converters(self): + _skip_if_not_numpy17_friendly() + + def conv(v): + return v.astype('m8[ns]') + d1 = np.timedelta64(1,'D') + + self.assertEqual(ct('1days'), conv(d1)) + self.assertEqual(ct('1days,'), conv(d1)) + self.assertEqual(ct('- 1days,'), -conv(d1)) + + self.assertEqual(ct('00:00:01'), conv(np.timedelta64(1,'s'))) + self.assertEqual(ct('06:00:01'), conv(np.timedelta64(6*3600+1,'s'))) + self.assertEqual(ct('06:00:01.0'), conv(np.timedelta64(6*3600+1,'s'))) + self.assertEqual(ct('06:00:01.01'), conv(np.timedelta64(1000*(6*3600+1)+10,'ms'))) + + self.assertEqual(ct('- 1days, 00:00:01'), 
-conv(d1+np.timedelta64(1,'s'))) + self.assertEqual(ct('1days, 06:00:01'), conv(d1+np.timedelta64(6*3600+1,'s'))) + self.assertEqual(ct('1days, 06:00:01.01'), conv(d1+np.timedelta64(1000*(6*3600+1)+10,'ms'))) + + # invalid + self.assertRaises(ValueError, ct, '- 1days, 00') + + def test_nat_converters(self): + _skip_if_not_numpy17_friendly() + + self.assertEqual(to_timedelta('nat',box=False).astype('int64'), tslib.iNaT) + self.assertEqual(to_timedelta('nan',box=False).astype('int64'), tslib.iNaT) + + def test_to_timedelta(self): + _skip_if_not_numpy17_friendly() + + def conv(v): + return v.astype('m8[ns]') + d1 = np.timedelta64(1,'D') + + self.assertEqual(to_timedelta('1 days 06:05:01.00003',box=False), conv(d1+np.timedelta64(6*3600+5*60+1,'s')+np.timedelta64(30,'us'))) + self.assertEqual(to_timedelta('15.5us',box=False), conv(np.timedelta64(15500,'ns'))) + + # empty string + result = to_timedelta('',box=False) + self.assertEqual(result.astype('int64'), tslib.iNaT) + + result = to_timedelta(['', '']) + self.assertTrue(isnull(result).all()) + + # pass thru + result = to_timedelta(np.array([np.timedelta64(1,'s')])) + expected = np.array([np.timedelta64(1,'s')]) + tm.assert_almost_equal(result,expected) + + # ints + result = np.timedelta64(0,'ns') + expected = to_timedelta(0,box=False) + self.assertEqual(result, expected) + + # Series + expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) + result = to_timedelta(Series(['1d','1days 00:00:01'])) + tm.assert_series_equal(result, expected) + + # with units + result = Series([ np.timedelta64(0,'ns'), np.timedelta64(10,'s').astype('m8[ns]') ],dtype='m8[ns]') + expected = to_timedelta([0,10],unit='s') + tm.assert_series_equal(result, expected) + + # single element conversion + v = timedelta(seconds=1) + result = to_timedelta(v,box=False) + expected = np.timedelta64(timedelta(seconds=1)) + self.assertEqual(result, expected) + + v = np.timedelta64(timedelta(seconds=1)) + result = to_timedelta(v,box=False) + expected = np.timedelta64(timedelta(seconds=1)) + self.assertEqual(result, expected) + + # arrays of various dtypes + arr = np.array([1]*5,dtype='int64') + result = to_timedelta(arr,unit='s') + expected = Series([ np.timedelta64(1,'s') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='int64') + result = to_timedelta(arr,unit='m') + expected = Series([ np.timedelta64(1,'m') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='int64') + result = to_timedelta(arr,unit='h') + expected = Series([ np.timedelta64(1,'h') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='timedelta64[s]') + result = to_timedelta(arr) + expected = Series([ np.timedelta64(1,'s') ]*5) + tm.assert_series_equal(result, expected) + + arr = np.array([1]*5,dtype='timedelta64[D]') + result = to_timedelta(arr) + expected = Series([ np.timedelta64(1,'D') ]*5) + tm.assert_series_equal(result, expected) + + def testit(unit, transform): + + # array + result = to_timedelta(np.arange(5),unit=unit) + expected = Series([ np.timedelta64(i,transform(unit)) for i in np.arange(5).tolist() ]) + tm.assert_series_equal(result, expected) + + # scalar + result = to_timedelta(2,unit=unit) + expected = np.timedelta64(2,transform(unit)).astype('timedelta64[ns]') + self.assert_numpy_array_equal(result,expected) + + # validate all units + # GH 6855 + for unit in ['Y','M','W','D','y','w','d']: + testit(unit,lambda x: x.upper()) + for unit in ['days','day','Day','Days']: + testit(unit,lambda x: 'D') + for 
unit in ['h','m','s','ms','us','ns','H','S','MS','US','NS']: + testit(unit,lambda x: x.lower()) + + # offsets + + # m + testit('T',lambda x: 'm') + + # ms + testit('L',lambda x: 'ms') + + # these will error + self.assertRaises(ValueError, lambda : to_timedelta(['1h'])) + self.assertRaises(ValueError, lambda : to_timedelta(['1m'])) + self.assertRaises(ValueError, lambda : to_timedelta([1,2],unit='foo')) + self.assertRaises(ValueError, lambda : to_timedelta(1,unit='foo')) + + def test_to_timedelta_via_apply(self): + _skip_if_not_numpy17_friendly() + + # GH 5458 + expected = Series([np.timedelta64(1,'s')]) + result = Series(['00:00:01']).apply(to_timedelta) + tm.assert_series_equal(result, expected) + + result = Series([to_timedelta('00:00:01')]) + tm.assert_series_equal(result, expected) + + def test_timedelta_ops(self): + _skip_if_not_numpy17_friendly() + + # GH4984 + # make sure ops return timedeltas + s = Series([Timestamp('20130101') + timedelta(seconds=i*i) for i in range(10) ]) + td = s.diff() + + result = td.mean()[0] + # TODO This should have returned a scalar to begin with. Hack for now. + expected = to_timedelta(timedelta(seconds=9)) + tm.assert_almost_equal(result, expected) + + result = td.quantile(.1) + # This properly returned a scalar. + expected = np.timedelta64(2599999999,'ns') + tm.assert_almost_equal(result, expected) + + result = td.median()[0] + # TODO This should have returned a scalar to begin with. Hack for now. + expected = to_timedelta('00:00:08') + tm.assert_almost_equal(result, expected) + + # GH 6462 + # consistency in returned values for sum + result = td.sum()[0] + expected = to_timedelta('00:01:21') + tm.assert_almost_equal(result, expected) + + def test_timedelta_ops_scalar(self): + _skip_if_not_numpy17_friendly() + + # GH 6808 + base = pd.to_datetime('20130101 09:01:12.123456') + expected_add = pd.to_datetime('20130101 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta(10,unit='s'), + timedelta(seconds=10), + np.timedelta64(10,'s'), + np.timedelta64(10000000000,'ns'), + pd.offsets.Second(10)]: + result = base + offset + self.assertEqual(result, expected_add) + + result = base - offset + self.assertEqual(result, expected_sub) + + base = pd.to_datetime('20130102 09:01:12.123456') + expected_add = pd.to_datetime('20130103 09:01:22.123456') + expected_sub = pd.to_datetime('20130101 09:01:02.123456') + + for offset in [pd.to_timedelta('1 day, 00:00:10'), + pd.to_timedelta('1 days, 00:00:10'), + timedelta(days=1,seconds=10), + np.timedelta64(1,'D')+np.timedelta64(10,'s'), + pd.offsets.Day()+pd.offsets.Second(10)]: + result = base + offset + self.assertEqual(result, expected_add) + + result = base - offset + self.assertEqual(result, expected_sub) + + def test_to_timedelta_on_missing_values(self): + _skip_if_not_numpy17_friendly() + + # GH5438 + timedelta_NaT = np.timedelta64('NaT') + + actual = pd.to_timedelta(Series(['00:00:01', np.nan])) + expected = Series([np.timedelta64(1000000000, 'ns'), timedelta_NaT], dtype=' (3,) and sys.platform == 'win32': + raise nose.SkipTest("not used on python 3/win32") + +def _skip_if_not_windows_python_3(): + if sys.version_info < (3,) or sys.platform != 'win32': + raise nose.SkipTest("only run on python 3/win32") + + +class TestTimeSeriesDuplicates(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), + datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 3), datetime(2000, 1, 3), + 
datetime(2000, 1, 4), datetime(2000, 1, 4), + datetime(2000, 1, 4), datetime(2000, 1, 5)] + + self.dups = Series(np.random.randn(len(dates)), index=dates) + + def test_constructor(self): + tm.assert_isinstance(self.dups, TimeSeries) + tm.assert_isinstance(self.dups.index, DatetimeIndex) + + def test_is_unique_monotonic(self): + self.assertFalse(self.dups.index.is_unique) + + def test_index_unique(self): + uniques = self.dups.index.unique() + expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 5)]) + self.assertEqual(uniques.dtype, 'M8[ns]') # sanity + self.assertTrue(uniques.equals(expected)) + self.assertEqual(self.dups.index.nunique(), 4) + + # #2563 + self.assertTrue(isinstance(uniques, DatetimeIndex)) + + dups_local = self.dups.index.tz_localize('US/Eastern') + dups_local.name = 'foo' + result = dups_local.unique() + expected = DatetimeIndex(expected, tz='US/Eastern') + self.assertTrue(result.tz is not None) + self.assertEqual(result.name, 'foo') + self.assertTrue(result.equals(expected)) + + # NaT, note this is excluded + arr = [ 1370745748 + t for t in range(20) ] + [iNaT] + idx = DatetimeIndex(arr * 3) + self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) + + arr = [ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT] + idx = DatetimeIndex(arr * 3) + self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) + self.assertEqual(idx.nunique(), 20) + self.assertEqual(idx.nunique(dropna=False), 21) + + + def test_index_dupes_contains(self): + d = datetime(2011, 12, 5, 20, 30) + ix = DatetimeIndex([d, d]) + self.assertTrue(d in ix) + + def test_duplicate_dates_indexing(self): + ts = self.dups + + uniques = ts.index.unique() + for date in uniques: + result = ts[date] + + mask = ts.index == date + total = (ts.index == date).sum() + expected = ts[mask] + if total > 1: + assert_series_equal(result, expected) + else: + assert_almost_equal(result, expected[0]) + + cp = ts.copy() + cp[date] = 0 + expected = Series(np.where(mask, 0, ts), index=ts.index) + assert_series_equal(cp, expected) + + self.assertRaises(KeyError, ts.__getitem__, datetime(2000, 1, 6)) + + # new index + ts[datetime(2000,1,6)] = 0 + self.assertEqual(ts[datetime(2000,1,6)], 0) + + def test_range_slice(self): + idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', + '1/4/2000']) + + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts['1/2/2000':] + expected = ts[1:] + assert_series_equal(result, expected) + + result = ts['1/2/2000':'1/3/2000'] + expected = ts[1:4] + assert_series_equal(result, expected) + + def test_groupby_average_dup_values(self): + result = self.dups.groupby(level=0).mean() + expected = self.dups.groupby(self.dups.index).mean() + assert_series_equal(result, expected) + + def test_indexing_over_size_cutoff(self): + import datetime + # #1821 + + old_cutoff = _index._SIZE_CUTOFF + try: + _index._SIZE_CUTOFF = 1000 + + # create large list of non periodic datetime + dates = [] + sec = datetime.timedelta(seconds=1) + half_sec = datetime.timedelta(microseconds=500000) + d = datetime.datetime(2011, 12, 5, 20, 30) + n = 1100 + for i in range(n): + dates.append(d) + dates.append(d + sec) + dates.append(d + sec + half_sec) + dates.append(d + sec + sec + half_sec) + d += 3 * sec + + # duplicate some values in the list + duplicate_positions = np.random.randint(0, len(dates) - 1, 20) + for p in 
duplicate_positions: + dates[p + 1] = dates[p] + + df = DataFrame(np.random.randn(len(dates), 4), + index=dates, + columns=list('ABCD')) + + pos = n * 3 + timestamp = df.index[pos] + self.assertIn(timestamp, df.index) + + # it works! + df.ix[timestamp] + self.assertTrue(len(df.ix[[timestamp]]) > 0) + finally: + _index._SIZE_CUTOFF = old_cutoff + + def test_indexing_unordered(self): + + # GH 2437 + rng = date_range(start='2011-01-01', end='2011-01-15') + ts = Series(randn(len(rng)), index=rng) + ts2 = concat([ts[0:4],ts[-4:],ts[4:-4]]) + + for t in ts.index: + s = str(t) + expected = ts[t] + result = ts2[t] + self.assertTrue(expected == result) + + # GH 3448 (ranges) + def compare(slobj): + result = ts2[slobj].copy() + result = result.sort_index() + expected = ts[slobj] + assert_series_equal(result,expected) + + compare(slice('2011-01-01','2011-01-15')) + compare(slice('2010-12-30','2011-01-15')) + compare(slice('2011-01-01','2011-01-16')) + + # partial ranges + compare(slice('2011-01-01','2011-01-6')) + compare(slice('2011-01-06','2011-01-8')) + compare(slice('2011-01-06','2011-01-12')) + + # single values + result = ts2['2011'].sort_index() + expected = ts['2011'] + assert_series_equal(result,expected) + + # diff freq + rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') + ts = Series(np.arange(len(rng)), index=rng) + ts = ts.take(np.random.permutation(20)) + + result = ts['2005'] + for t in result.index: + self.assertTrue(t.year == 2005) + + def test_indexing(self): + + idx = date_range("2001-1-1", periods=20, freq='M') + ts = Series(np.random.rand(len(idx)),index=idx) + + # getting + + # GH 3070, make sure semantics work on Series/Frame + expected = ts['2001'] + + df = DataFrame(dict(A = ts)) + result = df['2001']['A'] + assert_series_equal(expected,result) + + # setting + ts['2001'] = 1 + expected = ts['2001'] + + df.loc['2001','A'] = 1 + + result = df['2001']['A'] + assert_series_equal(expected,result) + + # GH3546 (not including times on the last day) + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', freq='H') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected,ts) + + idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', freq='S') + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013-05'] + assert_series_equal(expected,ts) + + idx = [ Timestamp('2013-05-31 00:00'), Timestamp(datetime(2013,5,31,23,59,59,999999))] + ts = Series(lrange(len(idx)), index=idx) + expected = ts['2013'] + assert_series_equal(expected,ts) + + # GH 3925, indexing with a seconds resolution string / datetime object + df = DataFrame(randn(5,5),columns=['open','high','low','close','volume'],index=date_range('2012-01-02 18:01:00',periods=5,tz='US/Central',freq='s')) + expected = df.loc[[df.index[2]]] + result = df['2012-01-02 18:01:02'] + assert_frame_equal(result,expected) + + # this is a single date, so will raise + self.assertRaises(KeyError, df.__getitem__, df.index[2],) + + def test_recreate_from_data(self): + if _np_version_under1p7: + freqs = ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'H'] + else: + freqs = ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'H', 'N', 'C'] + + for f in freqs: + org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) + idx = DatetimeIndex(org, freq=f) + self.assertTrue(idx.equals(org)) + + org = DatetimeIndex(start='2001/02/01 09:00', freq=f, tz='US/Pacific', periods=1) + idx = DatetimeIndex(org, freq=f, tz='US/Pacific') + self.assertTrue(idx.equals(org)) + + +def 
assert_range_equal(left, right): + assert(left.equals(right)) + assert(left.freq == right.freq) + assert(left.tz == right.tz) + + +class TestTimeSeries(tm.TestCase): + _multiprocess_can_split_ = True + + def test_is_(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + self.assertTrue(dti.is_(dti)) + self.assertTrue(dti.is_(dti.view())) + self.assertFalse(dti.is_(dti.copy())) + + def test_dti_slicing(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + dti2 = dti[[1, 3, 5]] + + v1 = dti2[0] + v2 = dti2[1] + v3 = dti2[2] + + self.assertEqual(v1, Timestamp('2/28/2005')) + self.assertEqual(v2, Timestamp('4/30/2005')) + self.assertEqual(v3, Timestamp('6/30/2005')) + + # don't carry freq through irregular slicing + self.assertIsNone(dti2.freq) + + def test_pass_datetimeindex_to_index(self): + # Bugs in #1396 + + rng = date_range('1/1/2000', '3/1/2000') + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pydatetime(), dtype=object) + + self.assert_numpy_array_equal(idx.values, expected.values) + + def test_contiguous_boolean_preserve_freq(self): + rng = date_range('1/1/2000', '3/1/2000', freq='B') + + mask = np.zeros(len(rng), dtype=bool) + mask[10:20] = True + + masked = rng[mask] + expected = rng[10:20] + self.assertIsNotNone(expected.freq) + assert_range_equal(masked, expected) + + mask[22] = True + masked = rng[mask] + self.assertIsNone(masked.freq) + + def test_getitem_median_slice_bug(self): + index = date_range('20090415', '20090519', freq='2B') + s = Series(np.random.randn(13), index=index) + + indexer = [slice(6, 7, None)] + result = s[indexer] + expected = s[indexer[0]] + assert_series_equal(result, expected) + + def test_series_box_timestamp(self): + rng = date_range('20090415', '20090519', freq='B') + s = Series(rng) + + tm.assert_isinstance(s[5], Timestamp) + + rng = date_range('20090415', '20090519', freq='B') + s = Series(rng, index=rng) + tm.assert_isinstance(s[5], Timestamp) + + tm.assert_isinstance(s.iget_value(5), Timestamp) + + def test_date_range_ambiguous_arguments(self): + # #2538 + start = datetime(2011, 1, 1, 5, 3, 40) + end = datetime(2011, 1, 1, 8, 9, 40) + + self.assertRaises(ValueError, date_range, start, end, + freq='s', periods=10) + + def test_timestamp_to_datetime(self): + tm._skip_if_no_pytz() + rng = date_range('20090415', '20090519', + tz='US/Eastern') + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEqual(stamp, dtval) + self.assertEqual(stamp.tzinfo, dtval.tzinfo) + + def test_timestamp_to_datetime_dateutil(self): + tm._skip_if_no_pytz() + rng = date_range('20090415', '20090519', + tz='dateutil/US/Eastern') + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEqual(stamp, dtval) + self.assertEqual(stamp.tzinfo, dtval.tzinfo) + + def test_timestamp_to_datetime_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + rng = date_range('20090415', '20090519', + tz=pytz.timezone('US/Eastern')) + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEquals(stamp, dtval) + self.assertEquals(stamp.tzinfo, dtval.tzinfo) + + def test_timestamp_to_datetime_explicit_dateutil(self): + _skip_if_windows_python_3() + tm._skip_if_no_dateutil() + import dateutil + rng = date_range('20090415', '20090519', + tz=dateutil.tz.gettz('US/Eastern')) + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEquals(stamp, dtval) + self.assertEquals(stamp.tzinfo, dtval.tzinfo) + + def test_index_convert_to_datetime_array(self): + tm._skip_if_no_pytz() + + def _check_rng(rng): + converted = 
rng.to_pydatetime() + tm.assert_isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assert_isinstance(x, datetime) + self.assertEqual(x, stamp.to_pydatetime()) + self.assertEqual(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') + rng_utc = date_range('20090415', '20090519', tz='utc') + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assert_isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assert_isinstance(x, datetime) + self.assertEquals(x, stamp.to_pydatetime()) + self.assertEquals(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', tz=pytz.timezone('US/Eastern')) + rng_utc = date_range('20090415', '20090519', tz=pytz.utc) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_index_convert_to_datetime_array_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + + def _check_rng(rng): + converted = rng.to_pydatetime() + tm.assert_isinstance(converted, np.ndarray) + for x, stamp in zip(converted, rng): + tm.assert_isinstance(x, datetime) + self.assertEquals(x, stamp.to_pydatetime()) + self.assertEquals(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', tz='dateutil/US/Eastern') + rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_ctor_str_intraday(self): + rng = DatetimeIndex(['1-1-2000 00:00:01']) + self.assertEqual(rng[0].second, 1) + + def test_series_ctor_plus_datetimeindex(self): + rng = date_range('20090415', '20090519', freq='B') + data = dict((k, 1) for k in rng) + + result = Series(data, index=rng) + self.assertIs(result.index, rng) + + def test_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index, method='pad', limit=5) + + expected = s[:2].reindex(index).fillna(method='pad') + expected[-3:] = np.nan + assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method='backfill', limit=5) + + expected = s[-2:].reindex(index).fillna(method='backfill') + expected[:3] = np.nan + assert_series_equal(result, expected) + + def test_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = s[:2].reindex(index).fillna(method='pad') + expected[-3:] = np.nan + assert_series_equal(result, expected) + + result = s[-2:].reindex(index) + result = result.fillna(method='bfill', limit=5) + + expected = s[-2:].reindex(index).fillna(method='backfill') + expected[:3] = np.nan + assert_series_equal(result, expected) + + def test_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index, method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index, method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + 
tm.assert_frame_equal(result, expected) + + def test_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_frame_setitem_timestamp(self): + # 2155 + columns = DatetimeIndex(start='1/1/2012', end='2/1/2012', + freq=datetools.bday) + index = lrange(10) + data = DataFrame(columns=columns, index=index) + t = datetime(2012, 11, 1) + ts = Timestamp(t) + data[ts] = np.nan # works + + def test_sparse_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + ss = s[:2].reindex(index).to_sparse() + result = ss.fillna(method='pad', limit=5) + expected = ss.fillna(method='pad', limit=5) + expected = expected.to_dense() + expected[-3:] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + ss = s[-2:].reindex(index).to_sparse() + result = ss.fillna(method='backfill', limit=5) + expected = ss.fillna(method='backfill') + expected = expected.to_dense() + expected[:3] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + def test_sparse_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + s = s.to_sparse() + + result = s[:2].reindex(index, method='pad', limit=5) + expected = s[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected[-3:] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method='backfill', limit=5) + expected = s[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected[:3] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + def test_sparse_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = df.to_sparse() + + result = sdf[:2].reindex(index, method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index, method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + def test_sparse_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = df.to_sparse() + + result = sdf[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + 
tm.assert_frame_equal(result, expected) + + def test_pad_require_monotonicity(self): + rng = date_range('1/1/2000', '3/1/2000', freq='B') + + rng2 = rng[::2][::-1] + + self.assertRaises(ValueError, rng2.get_indexer, rng, + method='pad') + + def test_frame_ctor_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', + freq='10s') + dates = np.asarray(rng) + + df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) + self.assertTrue(np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))) + + def test_frame_add_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', + freq='10s') + df = DataFrame(index=np.arange(len(rng))) + + df['A'] = rng + self.assertTrue(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) + + def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({'year': date_range('1/1/1700', periods=50, + freq='A-DEC')}) + # it works! + repr(df) + + def test_frame_add_datetime64_col_other_units(self): + n = 100 + + units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] + + ns_dtype = np.dtype('M8[ns]') + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df[unit] = vals + + ex_vals = to_datetime(vals.astype('O')) + + self.assertEqual(df[unit].dtype, ns_dtype) + self.assertTrue((df[unit].values == ex_vals).all()) + + # Test insertion into existing datetime64 column + df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + tmp = df.copy() + + tmp['dates'] = vals + ex_vals = to_datetime(vals.astype('O')) + + self.assertTrue((tmp['dates'].values == ex_vals).all()) + + def test_to_datetime_unit(self): + + epoch = 1370745748 + s = Series([ epoch + t for t in range(20) ]) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ]) + assert_series_equal(result,expected) + + s = Series([ epoch + t for t in range(20) ]).astype(float) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ]) + assert_series_equal(result,expected) + + s = Series([ epoch + t for t in range(20) ] + [iNaT]) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + assert_series_equal(result,expected) + + s = Series([ epoch + t for t in range(20) ] + [iNaT]).astype(float) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + assert_series_equal(result,expected) + + s = concat([Series([ epoch + t for t in range(20) ]).astype(float),Series([np.nan])],ignore_index=True) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + assert_series_equal(result,expected) + + def test_series_ctor_datetime64(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', + freq='10s') + dates = np.asarray(rng) + + series = Series(dates) + self.assertTrue(np.issubdtype(series.dtype, np.dtype('M8[ns]'))) + + def test_index_cast_datetime64_other_units(self): + arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]') + + idx = Index(arr) + + self.assertTrue((idx.values == 
tslib.cast_to_nanoseconds(arr)).all()) + + def test_index_astype_datetime64(self): + # valid only under 1.7! + if not _np_version_under1p7: + raise nose.SkipTest("test only valid in numpy < 1.7") + + idx = Index([datetime(2012, 1, 1)], dtype=object) + casted = idx.astype(np.dtype('M8[D]')) + + casted = idx.astype(np.dtype('M8[D]')) + expected = DatetimeIndex(idx.values) + tm.assert_isinstance(casted, DatetimeIndex) + self.assertTrue(casted.equals(expected)) + + def test_reindex_series_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + series = Series(rng) + + result = series.reindex(lrange(15)) + self.assertTrue(np.issubdtype(result.dtype, np.dtype('M8[ns]'))) + + mask = result.isnull() + self.assertTrue(mask[-5:].all()) + self.assertFalse(mask[:-5].any()) + + def test_reindex_frame_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) + + result = df.reindex(lrange(15)) + self.assertTrue(np.issubdtype(result['B'].dtype, np.dtype('M8[ns]'))) + + mask = com.isnull(result)['B'] + self.assertTrue(mask[-5:].all()) + self.assertFalse(mask[:-5].any()) + + def test_series_repr_nat(self): + series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + + result = repr(series) + expected = ('0 1970-01-01 00:00:00\n' + '1 1970-01-01 00:00:00.000001\n' + '2 1970-01-01 00:00:00.000002\n' + '3 NaT\n' + 'dtype: datetime64[ns]') + self.assertEqual(result, expected) + + def test_fillna_nat(self): + series = Series([0, 1, 2, iNaT], dtype='M8[ns]') + + filled = series.fillna(method='pad') + filled2 = series.fillna(value=series.values[2]) + + expected = series.copy() + expected.values[3] = expected.values[2] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = df.fillna(method='pad') + filled2 = df.fillna(value=series.values[2]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + + series = Series([iNaT, 0, 1, 2], dtype='M8[ns]') + + filled = series.fillna(method='bfill') + filled2 = series.fillna(value=series[1]) + + expected = series.copy() + expected[0] = expected[1] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = df.fillna(method='bfill') + filled2 = df.fillna(value=series[1]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + + def test_string_na_nat_conversion(self): + # GH #999, #858 + + from pandas.compat import parse_date + + strings = np.array(['1/1/2000', '1/2/2000', np.nan, + '1/4/2000, 12:34:56'], dtype=object) + + expected = np.empty(4, dtype='M8[ns]') + for i, val in enumerate(strings): + if com.isnull(val): + expected[i] = iNaT + else: + expected[i] = parse_date(val) + + result = tslib.array_to_datetime(strings) + assert_almost_equal(result, expected) + + result2 = to_datetime(strings) + tm.assert_isinstance(result2, DatetimeIndex) + assert_almost_equal(result, result2) + + malformed = np.array(['1/100/2000', np.nan], dtype=object) + result = to_datetime(malformed) + assert_almost_equal(result, malformed) + + self.assertRaises(ValueError, to_datetime, malformed, + errors='raise') + + idx = ['a', 'b', 'c', 'd', 'e'] + series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, + '1/5/2000'], index=idx, name='foo') + dseries = Series([to_datetime('1/1/2000'), np.nan, + to_datetime('1/3/2000'), np.nan, 
+ to_datetime('1/5/2000')], index=idx, name='foo') + + result = to_datetime(series) + dresult = to_datetime(dseries) + + expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) + for i in range(5): + x = series[i] + if isnull(x): + expected[i] = iNaT + else: + expected[i] = to_datetime(x) + + assert_series_equal(result, expected) + self.assertEqual(result.name, 'foo') + + assert_series_equal(dresult, expected) + self.assertEqual(dresult.name, 'foo') + + def test_to_datetime_iso8601(self): + result = to_datetime(["2012-01-01 00:00:00"]) + exp = Timestamp("2012-01-01 00:00:00") + self.assertEqual(result[0], exp) + + result = to_datetime(['20121001']) # bad iso 8601 + exp = Timestamp('2012-10-01') + self.assertEqual(result[0], exp) + + def test_to_datetime_default(self): + rs = to_datetime('2001') + xp = datetime(2001, 1, 1) + self.assertTrue(rs, xp) + + #### dayfirst is essentially broken + #### to_datetime('01-13-2012', dayfirst=True) + #### self.assertRaises(ValueError, to_datetime('01-13-2012', dayfirst=True)) + + def test_to_datetime_on_datetime64_series(self): + # #2699 + s = Series(date_range('1/1/2000', periods=10)) + + result = to_datetime(s) + self.assertEqual(result[0], s[0]) + + def test_to_datetime_with_apply(self): + # this is only locale tested with US/None locales + _skip_if_has_locale() + + # GH 5195 + # with a format and coerce a single item to_datetime fails + td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1,2,3]) + expected = pd.to_datetime(td, format='%b %y') + result = td.apply(pd.to_datetime, format='%b %y') + assert_series_equal(result, expected) + + td = pd.Series(['May 04', 'Jun 02', ''], index=[1,2,3]) + self.assertRaises(ValueError, lambda : pd.to_datetime(td,format='%b %y')) + self.assertRaises(ValueError, lambda : td.apply(pd.to_datetime, format='%b %y')) + expected = pd.to_datetime(td, format='%b %y', coerce=True) + + result = td.apply(lambda x: pd.to_datetime(x, format='%b %y', coerce=True)) + assert_series_equal(result, expected) + + def test_nat_vector_field_access(self): + idx = DatetimeIndex(['1/1/2000', None, None, '1/4/2000']) + + fields = ['year', 'quarter', 'month', 'day', 'hour', + 'minute', 'second', 'microsecond', 'nanosecond', + 'week', 'dayofyear'] + for field in fields: + result = getattr(idx, field) + expected = [getattr(x, field) if x is not NaT else -1 + for x in idx] + self.assert_numpy_array_equal(result, expected) + + def test_nat_scalar_field_access(self): + fields = ['year', 'quarter', 'month', 'day', 'hour', + 'minute', 'second', 'microsecond', 'nanosecond', + 'week', 'dayofyear'] + for field in fields: + result = getattr(NaT, field) + self.assertEqual(result, -1) + + self.assertEqual(NaT.weekday(), -1) + + def test_to_datetime_types(self): + + # empty string + result = to_datetime('') + self.assertIs(result, NaT) + + result = to_datetime(['', '']) + self.assertTrue(isnull(result).all()) + + # ints + result = Timestamp(0) + expected = to_datetime(0) + self.assertEqual(result, expected) + + # GH 3888 (strings) + expected = to_datetime(['2012'])[0] + result = to_datetime('2012') + self.assertEqual(result, expected) + + ### array = ['2012','20120101','20120101 12:01:01'] + array = ['20120101','20120101 12:01:01'] + expected = list(to_datetime(array)) + result = lmap(Timestamp,array) + tm.assert_almost_equal(result,expected) + + ### currently fails ### + ### result = Timestamp('2012') + ### expected = to_datetime('2012') + ### self.assertEqual(result, expected) + + def test_to_datetime_unprocessable_input(self): + # GH 4928 + 
self.assert_numpy_array_equal( + to_datetime([1, '1']), + np.array([1, '1'], dtype='O') + ) + self.assertRaises(TypeError, to_datetime, [1, '1'], errors='raise') + + def test_to_datetime_other_datetime64_units(self): + # 5/25/2012 + scalar = np.int64(1337904000000000).view('M8[us]') + as_obj = scalar.astype('O') + + index = DatetimeIndex([scalar]) + self.assertEqual(index[0], scalar.astype('O')) + + value = Timestamp(scalar) + self.assertEqual(value, as_obj) + + def test_to_datetime_list_of_integers(self): + rng = date_range('1/1/2000', periods=20) + rng = DatetimeIndex(rng.values) + + ints = list(rng.asi8) + + result = DatetimeIndex(ints) + + self.assertTrue(rng.equals(result)) + + def test_to_datetime_dt64s(self): + in_bound_dts = [ + np.datetime64('2000-01-01'), + np.datetime64('2000-01-02'), + ] + + for dt in in_bound_dts: + self.assertEqual( + pd.to_datetime(dt), + Timestamp(dt) + ) + + oob_dts = [ + np.datetime64('1000-01-01'), + np.datetime64('5000-01-02'), + ] + + for dt in oob_dts: + self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise') + self.assertRaises(ValueError, tslib.Timestamp, dt) + self.assertIs(pd.to_datetime(dt, coerce=True), NaT) + + def test_to_datetime_array_of_dt64s(self): + dts = [ + np.datetime64('2000-01-01'), + np.datetime64('2000-01-02'), + ] + + # Assuming all datetimes are in bounds, to_datetime() returns + # an array that is equal to Timestamp() parsing + self.assert_numpy_array_equal( + pd.to_datetime(dts, box=False), + np.array([Timestamp(x).asm8 for x in dts]) + ) + + # A list of datetimes where the last one is out of bounds + dts_with_oob = dts + [np.datetime64('9999-01-01')] + + self.assertRaises( + ValueError, + pd.to_datetime, + dts_with_oob, + coerce=False, + errors='raise' + ) + + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=True), + np.array( + [ + Timestamp(dts_with_oob[0]).asm8, + Timestamp(dts_with_oob[1]).asm8, + iNaT, + ], + dtype='M8' + ) + ) + + # With coerce=False and errors='ignore', out of bounds datetime64s + # are converted to their .item(), which depending on the version of + # numpy is either a python datetime.datetime or datetime.date + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=False), + np.array( + [dt.item() for dt in dts_with_oob], + dtype='O' + ) + ) + + def test_index_to_datetime(self): + idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) + + result = idx.to_datetime() + expected = DatetimeIndex(datetools.to_datetime(idx.values)) + self.assertTrue(result.equals(expected)) + + today = datetime.today() + idx = Index([today], dtype=object) + result = idx.to_datetime() + expected = DatetimeIndex([today]) + self.assertTrue(result.equals(expected)) + + def test_to_datetime_freq(self): + xp = bdate_range('2000-1-1', periods=10, tz='UTC') + rs = xp.to_datetime() + self.assertEqual(xp.freq, rs.freq) + self.assertEqual(xp.tzinfo, rs.tzinfo) + + def test_range_misspecified(self): + # GH #1095 + + self.assertRaises(ValueError, date_range, '1/1/2000') + self.assertRaises(ValueError, date_range, end='1/1/2000') + self.assertRaises(ValueError, date_range, periods=10) + + self.assertRaises(ValueError, date_range, '1/1/2000', freq='H') + self.assertRaises(ValueError, date_range, end='1/1/2000', freq='H') + self.assertRaises(ValueError, date_range, periods=10, freq='H') + + def test_reasonable_keyerror(self): + # GH #1062 + index = DatetimeIndex(['1/3/2000']) + try: + index.get_loc('1/1/2000') + except KeyError as e: + self.assertIn('2000', str(e)) + + def 
test_reindex_with_datetimes(self): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + result = ts.reindex(list(ts.index[5:10])) + expected = ts[5:10] + tm.assert_series_equal(result, expected) + + result = ts[list(ts.index[5:10])] + tm.assert_series_equal(result, expected) + + def test_promote_datetime_date(self): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + ts_slice = ts[5:] + ts2 = ts_slice.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts + ts2 + result2 = ts2 + ts + expected = ts + ts[5:] + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + # test asfreq + result = ts2.asfreq('4H', method='ffill') + expected = ts[5:].asfreq('4H', method='ffill') + assert_series_equal(result, expected) + + result = rng.get_indexer(ts2.index) + expected = rng.get_indexer(ts_slice.index) + self.assert_numpy_array_equal(result, expected) + + def test_asfreq_normalize(self): + rng = date_range('1/1/2000 09:30', periods=20) + norm = date_range('1/1/2000', periods=20) + vals = np.random.randn(20) + ts = Series(vals, index=rng) + + result = ts.asfreq('D', normalize=True) + norm = date_range('1/1/2000', periods=20) + expected = Series(vals, index=norm) + + assert_series_equal(result, expected) + + vals = np.random.randn(20, 3) + ts = DataFrame(vals, index=rng) + + result = ts.asfreq('D', normalize=True) + expected = DataFrame(vals, index=norm) + + assert_frame_equal(result, expected) + + def test_date_range_gen_error(self): + rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') + self.assertEqual(len(rng), 4) + + def test_first_subset(self): + ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') + result = ts.first('10d') + self.assertEqual(len(result), 20) + + ts = _simple_ts('1/1/2000', '1/1/2010') + result = ts.first('10d') + self.assertEqual(len(result), 10) + + result = ts.first('3M') + expected = ts[:'3/31/2000'] + assert_series_equal(result, expected) + + result = ts.first('21D') + expected = ts[:21] + assert_series_equal(result, expected) + + result = ts[:0].first('3M') + assert_series_equal(result, ts[:0]) + + def test_last_subset(self): + ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') + result = ts.last('10d') + self.assertEqual(len(result), 20) + + ts = _simple_ts('1/1/2000', '1/1/2010') + result = ts.last('10d') + self.assertEqual(len(result), 10) + + result = ts.last('21D') + expected = ts['12/12/2009':] + assert_series_equal(result, expected) + + result = ts.last('21D') + expected = ts[-21:] + assert_series_equal(result, expected) + + result = ts[:0].last('3M') + assert_series_equal(result, ts[:0]) + + def test_add_offset(self): + rng = date_range('1/1/2000', '2/1/2000') + + result = rng + offsets.Hour(2) + expected = date_range('1/1/2000 02:00', '2/1/2000 02:00') + self.assertTrue(result.equals(expected)) + + def test_format_pre_1900_dates(self): + rng = date_range('1/1/1850', '1/1/1950', freq='A-DEC') + rng.format() + ts = Series(1, index=rng) + repr(ts) + + def test_repeat(self): + rng = date_range('1/1/2000', '1/1/2001') + + result = rng.repeat(5) + self.assertIsNone(result.freq) + self.assertEqual(len(result), 5 * len(rng)) + + def test_at_time(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + rs = ts.at_time(rng[1]) + self.assertTrue((rs.index.hour == rng[1].hour).all()) + self.assertTrue((rs.index.minute == rng[1].minute).all()) + self.assertTrue((rs.index.second == 
rng[1].second).all()) + + result = ts.at_time('9:30') + expected = ts.at_time(time(9, 30)) + assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts[time(9, 30)] + result_df = df.ix[time(9, 30)] + expected = ts[(rng.hour == 9) & (rng.minute == 30)] + exp_df = df[(rng.hour == 9) & (rng.minute == 30)] + + # expected.index = date_range('1/1/2000', '1/4/2000') + + assert_series_equal(result, expected) + tm.assert_frame_equal(result_df, exp_df) + + chunk = df.ix['1/4/2000':] + result = chunk.ix[time(9, 30)] + expected = result_df[-1:] + tm.assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range('1/1/2000', '1/31/2000') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.at_time(time(0, 0)) + assert_series_equal(result, ts) + + # time doesn't exist + rng = date_range('1/1/2012', freq='23Min', periods=384) + ts = Series(np.random.randn(len(rng)), rng) + rs = ts.at_time('16:00') + self.assertEqual(len(rs), 0) + + def test_at_time_frame(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + rs = ts.at_time(rng[1]) + self.assertTrue((rs.index.hour == rng[1].hour).all()) + self.assertTrue((rs.index.minute == rng[1].minute).all()) + self.assertTrue((rs.index.second == rng[1].second).all()) + + result = ts.at_time('9:30') + expected = ts.at_time(time(9, 30)) + assert_frame_equal(result, expected) + + result = ts.ix[time(9, 30)] + expected = ts.ix[(rng.hour == 9) & (rng.minute == 30)] + + assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range('1/1/2000', '1/31/2000') + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts.at_time(time(0, 0)) + assert_frame_equal(result, ts) + + # time doesn't exist + rng = date_range('1/1/2012', freq='23Min', periods=384) + ts = DataFrame(np.random.randn(len(rng), 2), rng) + rs = ts.at_time('16:00') + self.assertEqual(len(rs), 0) + + def test_between_time(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + stime = time(0, 0) + etime = time(1, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue(t >= stime) + else: + self.assertTrue(t > stime) + + if inc_end: + self.assertTrue(t <= etime) + else: + self.assertTrue(t < etime) + + result = ts.between_time('00:00', '01:00') + expected = ts.between_time(stime, etime) + assert_series_equal(result, expected) + + # across midnight + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + stime = time(22, 0) + etime = time(9, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = (12 * 11 + 1) * 4 + 1 + if not inc_start: + exp_len -= 4 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue((t >= stime) or (t <= etime)) + else: + self.assertTrue((t > stime) or (t <= etime)) + + if inc_end: + self.assertTrue((t <= etime) or (t >= stime)) + else: + self.assertTrue((t < etime) 
or (t >= stime)) + + def test_between_time_frame(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(0, 0) + etime = time(1, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue(t >= stime) + else: + self.assertTrue(t > stime) + + if inc_end: + self.assertTrue(t <= etime) + else: + self.assertTrue(t < etime) + + result = ts.between_time('00:00', '01:00') + expected = ts.between_time(stime, etime) + assert_frame_equal(result, expected) + + # across midnight + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = DataFrame(np.random.randn(len(rng), 2), index=rng) + stime = time(22, 0) + etime = time(9, 0) + + close_open = product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = (12 * 11 + 1) * 4 + 1 + if not inc_start: + exp_len -= 4 + if not inc_end: + exp_len -= 4 + + self.assertEqual(len(filtered), exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assertTrue((t >= stime) or (t <= etime)) + else: + self.assertTrue((t > stime) or (t <= etime)) + + if inc_end: + self.assertTrue((t <= etime) or (t >= stime)) + else: + self.assertTrue((t < etime) or (t >= stime)) + + def test_dti_constructor_preserve_dti_freq(self): + rng = date_range('1/1/2000', '1/2/2000', freq='5min') + + rng2 = DatetimeIndex(rng) + self.assertEqual(rng.freq, rng2.freq) + + def test_normalize(self): + rng = date_range('1/1/2000 9:30', periods=10, freq='D') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D') + self.assertTrue(result.equals(expected)) + + rng_ns = pd.DatetimeIndex(np.array([1380585623454345752, 1380585612343234312]).astype("datetime64[ns]")) + rng_ns_normalized = rng_ns.normalize() + expected = pd.DatetimeIndex(np.array([1380585600000000000, 1380585600000000000]).astype("datetime64[ns]")) + self.assertTrue(rng_ns_normalized.equals(expected)) + + self.assertTrue(result.is_normalized) + self.assertFalse(rng.is_normalized) + + def test_to_period(self): + from pandas.tseries.period import period_range + + ts = _simple_ts('1/1/2000', '1/1/2001') + + pts = ts.to_period() + exp = ts.copy() + exp.index = period_range('1/1/2000', '1/1/2001') + assert_series_equal(pts, exp) + + pts = ts.to_period('M') + exp.index = exp.index.asfreq('M') + self.assertTrue(pts.index.equals(exp.index.asfreq('M'))) + assert_series_equal(pts, exp) + + # GH 7606 without freq + idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04']) + exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03', + '2011-01-04'], freq='D') + + s = Series(np.random.randn(4), index=idx) + expected = s.copy() + expected.index = exp_idx + assert_series_equal(s.to_period(), expected) + + df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx) + expected = df.copy() + expected.index = exp_idx + assert_frame_equal(df.to_period(), expected) + + expected = df.copy() + expected.columns = exp_idx + assert_frame_equal(df.to_period(axis=1), expected) + + def create_dt64_based_index(self): + data = [Timestamp('2007-01-01 10:11:12.123456Z'), + Timestamp('2007-01-01 
10:11:13.789123Z')] + index = DatetimeIndex(data) + return index + + def test_to_period_millisecond(self): + index = self.create_dt64_based_index() + + period = index.to_period(freq='L') + self.assertEqual(2, len(period)) + self.assertEqual(period[0], Period('2007-01-01 10:11:12.123Z', 'L')) + self.assertEqual(period[1], Period('2007-01-01 10:11:13.789Z', 'L')) + + def test_to_period_microsecond(self): + index = self.create_dt64_based_index() + + period = index.to_period(freq='U') + self.assertEqual(2, len(period)) + self.assertEqual(period[0], Period('2007-01-01 10:11:12.123456Z', 'U')) + self.assertEqual(period[1], Period('2007-01-01 10:11:13.789123Z', 'U')) + + def test_to_period_tz_pytz(self): + tm._skip_if_no_pytz() + from dateutil.tz import tzlocal + from pytz import utc as UTC + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz='US/Eastern') + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + self.assertTrue(ts.to_period().equals(xp)) + + ts = date_range('1/1/2000', '4/1/2000', tz=UTC) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + self.assertTrue(ts.to_period().equals(xp)) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assertEqual(result, expected) + self.assertTrue(ts.to_period().equals(xp)) + + def test_to_period_tz_explicit_pytz(self): + tm._skip_if_no_pytz() + import pytz + from dateutil.tz import tzlocal + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz=pytz.timezone('US/Eastern')) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assert_(result == expected) + self.assert_(ts.to_period().equals(xp)) + + ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assert_(result == expected) + self.assert_(ts.to_period().equals(xp)) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assert_(result == expected) + self.assert_(ts.to_period().equals(xp)) + + def test_to_period_tz_dateutil(self): + tm._skip_if_no_dateutil() + import dateutil + from dateutil.tz import tzlocal + + xp = date_range('1/1/2000', '4/1/2000').to_period() + + ts = date_range('1/1/2000', '4/1/2000', tz='dateutil/US/Eastern') + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assert_(result == expected) + self.assert_(ts.to_period().equals(xp)) + + ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assert_(result == expected) + self.assert_(ts.to_period().equals(xp)) + + ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) + + result = ts.to_period()[0] + expected = ts[0].to_period() + + self.assert_(result == expected) + self.assert_(ts.to_period().equals(xp)) + + def test_frame_to_period(self): + K = 5 + from pandas.tseries.period import period_range + + dr = date_range('1/1/2000', '1/1/2001') + pr = period_range('1/1/2000', '1/1/2001') + df = DataFrame(randn(len(dr), K), index=dr) + df['mix'] = 'a' + + pts = df.to_period() + exp = df.copy() + exp.index = pr + assert_frame_equal(pts, exp) + + pts = df.to_period('M') + self.assertTrue(pts.index.equals(exp.index.asfreq('M'))) + + df = df.T + pts = df.to_period(axis=1) + exp = df.copy() + exp.columns = pr + 
assert_frame_equal(pts, exp) + + pts = df.to_period('M', axis=1) + self.assertTrue(pts.columns.equals(exp.columns.asfreq('M'))) + + self.assertRaises(ValueError, df.to_period, axis=2) + + def test_timestamp_fields(self): + # extra fields from DatetimeIndex like quarter and week + idx = tm.makeDateIndex(100) + + fields = ['dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end'] + for f in fields: + expected = getattr(idx, f)[-1] + result = getattr(Timestamp(idx[-1]), f) + self.assertEqual(result, expected) + + self.assertEqual(idx.freq, Timestamp(idx[-1], idx.freq).freq) + self.assertEqual(idx.freqstr, Timestamp(idx[-1], idx.freq).freqstr) + + def test_woy_boundary(self): + # make sure weeks at year boundaries are correct + d = datetime(2013,12,31) + result = Timestamp(d).week + expected = 1 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2008,12,28) + result = Timestamp(d).week + expected = 52 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2009,12,31) + result = Timestamp(d).week + expected = 53 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2010,1,1) + result = Timestamp(d).week + expected = 53 # ISO standard + self.assertEqual(result, expected) + + d = datetime(2010,1,3) + result = Timestamp(d).week + expected = 53 # ISO standard + self.assertEqual(result, expected) + + result = np.array([Timestamp(datetime(*args)).week for args in + [(2000,1,1),(2000,1,2),(2005,1,1),(2005,1,2)]]) + self.assertTrue((result == [52, 52, 53, 53]).all()) + + def test_timestamp_date_out_of_range(self): + self.assertRaises(ValueError, Timestamp, '1676-01-01') + self.assertRaises(ValueError, Timestamp, '2263-01-01') + + # 1475 + self.assertRaises(ValueError, DatetimeIndex, ['1400-01-01']) + self.assertRaises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) + + def test_timestamp_repr(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + self.assertIn(iso8601, result) + + def test_timestamp_from_ordinal(self): + + # GH 3042 + dt = datetime(2011, 4, 16, 0, 0) + ts = Timestamp.fromordinal(dt.toordinal()) + self.assertEqual(ts.to_pydatetime(), dt) + + # with a tzinfo + stamp = Timestamp('2011-4-16', tz='US/Eastern') + dt_tz = stamp.to_pydatetime() + ts = Timestamp.fromordinal(dt_tz.toordinal(),tz='US/Eastern') + self.assertEqual(ts.to_pydatetime(), dt_tz) + + def test_datetimeindex_integers_shift(self): + rng = date_range('1/1/2000', periods=20) + + result = rng + 5 + expected = rng.shift(5) + self.assertTrue(result.equals(expected)) + + result = rng - 5 + expected = rng.shift(-5) + self.assertTrue(result.equals(expected)) + + def test_astype_object(self): + # NumPy 1.6.1 weak ns support + rng = date_range('1/1/2000', periods=20) + + casted = rng.astype('O') + exp_values = list(rng) + + self.assert_numpy_array_equal(casted, exp_values) + + def test_catch_infinite_loop(self): + offset = datetools.DateOffset(minute=5) + # blow up, don't loop forever + self.assertRaises(Exception, date_range, datetime(2011, 11, 11), + datetime(2011, 11, 12), freq=offset) + + def test_append_concat(self): + rng = date_range('5/8/2012 1:45', periods=10, freq='5T') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + + result = ts.append(ts) + result_df = df.append(df) + 
ex_index = DatetimeIndex(np.tile(rng.values, 2)) + self.assertTrue(result.index.equals(ex_index)) + self.assertTrue(result_df.index.equals(ex_index)) + + appended = rng.append(rng) + self.assertTrue(appended.equals(ex_index)) + + appended = rng.append([rng, rng]) + ex_index = DatetimeIndex(np.tile(rng.values, 3)) + self.assertTrue(appended.equals(ex_index)) + + # different index names + rng1 = rng.copy() + rng2 = rng.copy() + rng1.name = 'foo' + rng2.name = 'bar' + self.assertEqual(rng1.append(rng1).name, 'foo') + self.assertIsNone(rng1.append(rng2).name) + + def test_append_concat_tz(self): + #GH 2938 + tm._skip_if_no_pytz() + + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz='US/Eastern') + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz='US/Eastern') + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz='US/Eastern') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + self.assertTrue(result.index.equals(rng3)) + self.assertTrue(result_df.index.equals(rng3)) + + appended = rng.append(rng2) + self.assertTrue(appended.equals(rng3)) + + def test_append_concat_tz_explicit_pytz(self): + # GH 2938 + tm._skip_if_no_pytz() + from pytz import timezone as timezone + + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz=timezone('US/Eastern')) + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz=timezone('US/Eastern')) + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz=timezone('US/Eastern')) + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + self.assert_(result.index.equals(rng3)) + self.assert_(result_df.index.equals(rng3)) + + appended = rng.append(rng2) + self.assert_(appended.equals(rng3)) + + def test_append_concat_tz_dateutil(self): + # GH 2938 + tm._skip_if_no_dateutil() + from dateutil.tz import gettz as timezone + + rng = date_range('5/8/2012 1:45', periods=10, freq='5T', + tz='dateutil/US/Eastern') + rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', + tz='dateutil/US/Eastern') + rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', + tz='dateutil/US/Eastern') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + self.assert_(result.index.equals(rng3)) + self.assert_(result_df.index.equals(rng3)) + + appended = rng.append(rng2) + self.assert_(appended.equals(rng3)) + + def test_set_dataframe_column_ns_dtype(self): + x = DataFrame([datetime.now(), datetime.now()]) + self.assertEqual(x[0].dtype, np.dtype('M8[ns]')) + + def test_groupby_count_dateparseerror(self): + dr = date_range(start='1/1/2012', freq='5min', periods=10) + + # BAD Example, datetimes first + s = Series(np.arange(10), index=[dr, lrange(10)]) + grouped = s.groupby(lambda x: x[1] % 2 == 0) + result = grouped.count() + + s = Series(np.arange(10), index=[lrange(10), dr]) + grouped = s.groupby(lambda x: x[0] % 2 == 0) + expected = grouped.count() + + assert_series_equal(result, expected) + + 
def test_datetimeindex_repr_short(self): + dr = date_range(start='1/1/2012', periods=1) + repr(dr) + + dr = date_range(start='1/1/2012', periods=2) + repr(dr) + + dr = date_range(start='1/1/2012', periods=3) + repr(dr) + + def test_constructor_int64_nocopy(self): + # #1624 + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr) + + arr[50:100] = -1 + self.assertTrue((index.asi8[50:100] == -1).all()) + + arr = np.arange(1000, dtype=np.int64) + index = DatetimeIndex(arr, copy=True) + + arr[50:100] = -1 + self.assertTrue((index.asi8[50:100] != -1).all()) + + def test_series_interpolate_method_values(self): + # #1646 + ts = _simple_ts('1/1/2000', '1/20/2000') + ts[::2] = np.nan + + result = ts.interpolate(method='values') + exp = ts.interpolate() + assert_series_equal(result, exp) + + def test_frame_datetime64_handling_groupby(self): + # it works! + df = DataFrame([(3, np.datetime64('2012-07-03')), + (3, np.datetime64('2012-07-04'))], + columns=['a', 'date']) + result = df.groupby('a').first() + self.assertEqual(result['date'][3], Timestamp('2012-07-03')) + + def test_series_interpolate_intraday(self): + # #1698 + index = pd.date_range('1/1/2012', periods=4, freq='12D') + ts = pd.Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(days=1)).order() + + exp = ts.reindex(new_index).interpolate(method='time') + + index = pd.date_range('1/1/2012', periods=4, freq='12H') + ts = pd.Series([0, 12, 24, 36], index) + new_index = index.append(index + pd.DateOffset(hours=1)).order() + result = ts.reindex(new_index).interpolate(method='time') + + self.assert_numpy_array_equal(result.values, exp.values) + + def test_frame_dict_constructor_datetime64_1680(self): + dr = date_range('1/1/2012', periods=10) + s = Series(dr, index=dr) + + # it works! + DataFrame({'a': 'foo', 'b': s}, index=dr) + DataFrame({'a': 'foo', 'b': s.values}, index=dr) + + def test_frame_datetime64_mixed_index_ctor_1681(self): + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + ts = Series(dr) + + # it works! + d = DataFrame({'A': 'foo', 'B': ts}, index=dr) + self.assertTrue(d['B'].isnull().all()) + + def test_frame_timeseries_to_records(self): + index = date_range('1/1/2000', periods=10) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['a', 'b', 'c']) + + result = df.to_records() + result['index'].dtype == 'M8[ns]' + + result = df.to_records(index=False) + + def test_frame_datetime64_duplicated(self): + dates = date_range('2010-07-01', end='2010-08-05') + + tst = DataFrame({'symbol': 'AAA', 'date': dates}) + result = tst.duplicated(['date', 'symbol']) + self.assertTrue((-result).all()) + + tst = DataFrame({'date': dates}) + result = tst.duplicated() + self.assertTrue((-result).all()) + + def test_timestamp_compare_with_early_datetime(self): + # e.g. 
datetime.min + stamp = Timestamp('2012-01-01') + + self.assertFalse(stamp == datetime.min) + self.assertFalse(stamp == datetime(1600, 1, 1)) + self.assertFalse(stamp == datetime(2700, 1, 1)) + self.assertNotEqual(stamp, datetime.min) + self.assertNotEqual(stamp, datetime(1600, 1, 1)) + self.assertNotEqual(stamp, datetime(2700, 1, 1)) + self.assertTrue(stamp > datetime(1600, 1, 1)) + self.assertTrue(stamp >= datetime(1600, 1, 1)) + self.assertTrue(stamp < datetime(2700, 1, 1)) + self.assertTrue(stamp <= datetime(2700, 1, 1)) + + def test_to_html_timestamp(self): + rng = date_range('2000-01-01', periods=10) + df = DataFrame(np.random.randn(10, 4), index=rng) + + result = df.to_html() + self.assertIn('2000-01-01', result) + + def test_to_csv_numpy_16_bug(self): + frame = DataFrame({'a': date_range('1/1/2000', periods=10)}) + + buf = StringIO() + frame.to_csv(buf) + + result = buf.getvalue() + self.assertIn('2000-01-01', result) + + def test_series_map_box_timestamps(self): + # #2689, #2627 + s = Series(date_range('1/1/2000', periods=10)) + + def f(x): + return (x.hour, x.day, x.month) + + # it works! + s.map(f) + s.apply(f) + DataFrame(s).applymap(f) + + def test_concat_datetime_datetime64_frame(self): + # #2624 + rows = [] + rows.append([datetime(2010, 1, 1), 1]) + rows.append([datetime(2010, 1, 2), 'hi']) + + df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + + ind = date_range(start="2000/1/1", freq="D", periods=10) + df1 = DataFrame({'date': ind, 'test':lrange(10)}) + + # it works! + pd.concat([df1, df2_obj]) + + def test_period_resample(self): + # GH3609 + s = Series(range(100),index=date_range('20130101', freq='s', periods=100), dtype='float') + s[10:30] = np.nan + expected = Series([34.5, 79.5], index=[Period('2013-01-01 00:00', 'T'), Period('2013-01-01 00:01', 'T')]) + result = s.to_period().resample('T', kind='period') + assert_series_equal(result, expected) + result2 = s.resample('T', kind='period') + assert_series_equal(result2, expected) + + def test_period_resample_with_local_timezone_pytz(self): + # GH5430 + tm._skip_if_no_pytz() + import pytz + + local_timezone = pytz.timezone('America/Los_Angeles') + + start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) + # 1 day later + end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) + + index = pd.date_range(start, end, freq='H') + + series = pd.Series(1, index=index) + series = series.tz_convert(local_timezone) + result = series.resample('D', kind='period') + # Create the expected series + expected_index = (pd.period_range(start=start, end=end, freq='D') - 1) # Index is moved back a day with the timezone conversion from UTC to Pacific + expected = pd.Series(1, index=expected_index) + assert_series_equal(result, expected) + + def test_period_resample_with_local_timezone_dateutil(self): + # GH5430 + tm._skip_if_no_dateutil() + import dateutil + + local_timezone = 'dateutil/America/Los_Angeles' + + start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=dateutil.tz.tzutc()) + # 1 day later + end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=dateutil.tz.tzutc()) + + index = pd.date_range(start, end, freq='H') + + series = pd.Series(1, index=index) + series = series.tz_convert(local_timezone) + result = series.resample('D', kind='period') + # Create the expected series + expected_index = (pd.period_range(start=start, end=end, freq='D') - 1) # Index is moved back a day with the timezone conversion from UTC to Pacific + expected = pd.Series(1, 
index=expected_index) + assert_series_equal(result, expected) + + + def test_pickle(self): + #GH4606 + from pandas.compat import cPickle + import pickle + + for pick in [pickle, cPickle]: + p = pick.loads(pick.dumps(NaT)) + self.assertTrue(p is NaT) + + idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06']) + idx_p = pick.loads(pick.dumps(idx)) + self.assertTrue(idx_p[0] == idx[0]) + self.assertTrue(idx_p[1] is NaT) + self.assertTrue(idx_p[2] == idx[2]) + + +def _simple_ts(start, end, freq='D'): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + +class TestDatetimeIndex(tm.TestCase): + _multiprocess_can_split_ = True + + def test_hash_error(self): + index = date_range('20010101', periods=10) + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(index).__name__): + hash(index) + + def test_stringified_slice_with_tz(self): + #GH2658 + import datetime + start=datetime.datetime.now() + idx=DatetimeIndex(start=start,freq="1d",periods=10) + df=DataFrame(lrange(10),index=idx) + df["2013-01-14 23:44:34.437768-05:00":] # no exception here + + def test_append_join_nondatetimeindex(self): + rng = date_range('1/1/2000', periods=10) + idx = Index(['a', 'b', 'c', 'd']) + + result = rng.append(idx) + tm.assert_isinstance(result[0], Timestamp) + + # it works + rng.join(idx, how='outer') + + def test_astype(self): + rng = date_range('1/1/2000', periods=10) + + result = rng.astype('i8') + self.assert_numpy_array_equal(result, rng.asi8) + + def test_to_period_nofreq(self): + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + self.assertRaises(ValueError, idx.to_period) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], + freq='infer') + self.assertEqual(idx.freqstr, 'D') + expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', '2000-01-03'], freq='D') + self.assertTrue(idx.to_period().equals(expected)) + + # GH 7606 + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + self.assertEqual(idx.freqstr, None) + self.assertTrue(idx.to_period().equals(expected)) + + def test_000constructor_resolution(self): + # 2252 + t1 = Timestamp((1352934390 * 1000000000) + 1000000 + 1000 + 1) + idx = DatetimeIndex([t1]) + + self.assertEqual(idx.nanosecond[0], t1.nanosecond) + + def test_constructor_coverage(self): + rng = date_range('1/1/2000', periods=10.5) + exp = date_range('1/1/2000', periods=10) + self.assertTrue(rng.equals(exp)) + + self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', + periods='foo', freq='D') + + self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', + end='1/10/2000') + + self.assertRaises(ValueError, DatetimeIndex, '1/1/2000') + + # generator expression + gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) + result = DatetimeIndex(gen) + expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) + for i in range(10)]) + self.assertTrue(result.equals(expected)) + + # NumPy string array + strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype('O')) + self.assertTrue(result.equals(expected)) + + from_ints = DatetimeIndex(expected.asi8) + self.assertTrue(from_ints.equals(expected)) + + # non-conforming + self.assertRaises(ValueError, DatetimeIndex, + ['2000-01-01', '2000-01-02', '2000-01-04'], + freq='D') + + self.assertRaises(ValueError, DatetimeIndex, + start='2011-01-01', freq='b') + self.assertRaises(ValueError, DatetimeIndex, + end='2011-01-01', freq='B') + 
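test_constructor_coverage exercises the different inputs DatetimeIndex accepts. A minimal standalone sketch (illustrative, not taken from the suite) of the equivalence between a generator of datetimes and a list of date strings:

    from datetime import datetime, timedelta
    import pandas as pd

    gen = (datetime(2000, 1, 1) + timedelta(days=i) for i in range(3))
    from_gen = pd.DatetimeIndex(gen)
    from_strings = pd.DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'])
    assert from_gen.equals(from_strings)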
self.assertRaises(ValueError, DatetimeIndex, periods=10, freq='D') + + def test_constructor_name(self): + idx = DatetimeIndex(start='2000-01-01', periods=1, freq='A', + name='TEST') + self.assertEqual(idx.name, 'TEST') + + def test_comparisons_coverage(self): + rng = date_range('1/1/2000', periods=10) + + # raise TypeError for now + self.assertRaises(TypeError, rng.__lt__, rng[3].value) + + result = rng == list(rng) + exp = rng == rng + self.assert_numpy_array_equal(result, exp) + + def test_comparisons_nat(self): + fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) + fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) + + didx1 = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, + '2014-05-01', '2014-07-01']) + didx2 = pd.DatetimeIndex(['2014-02-01', '2014-03-01', pd.NaT, pd.NaT, + '2014-06-01', '2014-07-01']) + darr = np.array([np.datetime64('2014-02-01 00:00Z'), + np.datetime64('2014-03-01 00:00Z'), + np.datetime64('nat'), np.datetime64('nat'), + np.datetime64('2014-06-01 00:00Z'), + np.datetime64('2014-07-01 00:00Z')]) + + if _np_version_under1p7: + # cannot test array because np.datetime64('nat') returns today's date + cases = [(fidx1, fidx2), (didx1, didx2)] + else: + cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] + + # Check pd.NaT is handled the same as np.nan + for idx1, idx2 in cases: + result = idx1 < idx2 + expected = np.array([True, False, False, False, True, False]) + self.assert_numpy_array_equal(result, expected) + result = idx2 > idx1 + expected = np.array([True, False, False, False, True, False]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 <= idx2 + expected = np.array([True, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + result = idx2 >= idx1 + expected = np.array([True, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 == idx2 + expected = np.array([False, False, False, False, False, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 != idx2 + expected = np.array([True, True, True, True, True, False]) + self.assert_numpy_array_equal(result, expected) + + for idx1, val in [(fidx1, np.nan), (didx1, pd.NaT)]: + result = idx1 < val + expected = np.array([False, False, False, False, False, False]) + self.assert_numpy_array_equal(result, expected) + result = idx1 > val + self.assert_numpy_array_equal(result, expected) + + result = idx1 <= val + self.assert_numpy_array_equal(result, expected) + result = idx1 >= val + self.assert_numpy_array_equal(result, expected) + + result = idx1 == val + self.assert_numpy_array_equal(result, expected) + + result = idx1 != val + expected = np.array([True, True, True, True, True, True]) + self.assert_numpy_array_equal(result, expected) + + # Check pd.NaT is handled the same as np.nan + for idx1, val in [(fidx1, 3), (didx1, datetime(2014, 3, 1))]: + result = idx1 < val + expected = np.array([True, False, False, False, False, False]) + self.assert_numpy_array_equal(result, expected) + result = idx1 > val + expected = np.array([False, False, False, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 <= val + expected = np.array([True, False, True, False, False, False]) + self.assert_numpy_array_equal(result, expected) + result = idx1 >= val + expected = np.array([False, False, True, False, True, True]) + self.assert_numpy_array_equal(result, expected) + + result = idx1 == val + expected = np.array([False, False, True, False, False, False]) + 
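The comparison tests above assert that pd.NaT behaves like np.nan: every ordering and equality comparison involving a missing value evaluates to False, and only != is True. A minimal sketch of that rule (illustrative only):

    import pandas as pd

    assert not (pd.NaT == pd.NaT)
    assert not (pd.NaT < pd.Timestamp('2014-01-01'))
    assert pd.NaT != pd.NaT

    idx = pd.DatetimeIndex(['2014-01-01', pd.NaT])
    assert list(idx == idx) == [True, False]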
self.assert_numpy_array_equal(result, expected) + + result = idx1 != val + expected = np.array([True, True, False, True, True, True]) + self.assert_numpy_array_equal(result, expected) + + def test_map(self): + rng = date_range('1/1/2000', periods=10) + + f = lambda x: x.strftime('%Y%m%d') + result = rng.map(f) + exp = [f(x) for x in rng] + self.assert_numpy_array_equal(result, exp) + + def test_add_union(self): + rng = date_range('1/1/2000', periods=5) + rng2 = date_range('1/6/2000', periods=5) + + result = rng + rng2 + expected = rng.union(rng2) + self.assertTrue(result.equals(expected)) + + def test_misc_coverage(self): + rng = date_range('1/1/2000', periods=5) + result = rng.groupby(rng.day) + tm.assert_isinstance(list(result.values())[0][0], Timestamp) + + idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) + self.assertTrue(idx.equals(list(idx))) + + non_datetime = Index(list('abc')) + self.assertFalse(idx.equals(list(non_datetime))) + + def test_union_coverage(self): + idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) + ordered = DatetimeIndex(idx.order(), freq='infer') + result = ordered.union(idx) + self.assertTrue(result.equals(ordered)) + + result = ordered[:0].union(ordered) + self.assertTrue(result.equals(ordered)) + self.assertEqual(result.freq, ordered.freq) + + def test_union_bug_1730(self): + rng_a = date_range('1/1/2012', periods=4, freq='3H') + rng_b = date_range('1/1/2012', periods=4, freq='4H') + + result = rng_a.union(rng_b) + exp = DatetimeIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) + self.assertTrue(result.equals(exp)) + + def test_union_bug_1745(self): + left = DatetimeIndex(['2012-05-11 15:19:49.695000']) + right = DatetimeIndex(['2012-05-29 13:04:21.322000', + '2012-05-11 15:27:24.873000', + '2012-05-11 15:31:05.350000']) + + result = left.union(right) + exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) + self.assertTrue(result.equals(exp)) + + def test_union_bug_4564(self): + from pandas import DateOffset + left = date_range("2013-01-01", "2013-02-01") + right = left + DateOffset(minutes=15) + + result = left.union(right) + exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) + self.assertTrue(result.equals(exp)) + + def test_intersection_bug_1708(self): + from pandas import DateOffset + index_1 = date_range('1/1/2012', periods=4, freq='12H') + index_2 = index_1 + DateOffset(hours=1) + + result = index_1 & index_2 + self.assertEqual(len(result), 0) + + # def test_add_timedelta64(self): + # rng = date_range('1/1/2000', periods=5) + # delta = rng.values[3] - rng.values[1] + + # result = rng + delta + # expected = rng + timedelta(2) + # self.assertTrue(result.equals(expected)) + + def test_get_duplicates(self): + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', + '2000-01-03', '2000-01-03', '2000-01-04']) + + result = idx.get_duplicates() + ex = DatetimeIndex(['2000-01-02', '2000-01-03']) + self.assertTrue(result.equals(ex)) + + def test_argmin_argmax(self): + idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02']) + self.assertEqual(idx.argmin(), 1) + self.assertEqual(idx.argmax(), 0) + + def test_order(self): + idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02']) + + ordered = idx.order() + self.assertTrue(ordered.is_monotonic) + + ordered = idx.order(ascending=False) + self.assertTrue(ordered[::-1].is_monotonic) + + ordered, dexer = idx.order(return_indexer=True) + self.assertTrue(ordered.is_monotonic) + self.assert_numpy_array_equal(dexer, [1, 2, 0]) + + ordered, dexer = 
idx.order(return_indexer=True, ascending=False) + self.assertTrue(ordered[::-1].is_monotonic) + self.assert_numpy_array_equal(dexer, [0, 2, 1]) + + def test_insert(self): + idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02'], name='idx') + + result = idx.insert(2, datetime(2000, 1, 5)) + exp = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-05', + '2000-01-02'], name='idx') + self.assertTrue(result.equals(exp)) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, 'inserted') + expected = Index([datetime(2000, 1, 4), 'inserted', datetime(2000, 1, 1), + datetime(2000, 1, 2)], name='idx') + self.assertNotIsInstance(result, DatetimeIndex) + tm.assert_index_equal(result, expected) + self.assertEqual(result.name, expected.name) + + idx = date_range('1/1/2000', periods=3, freq='M', name='idx') + + # preserve freq + expected_0 = DatetimeIndex(['1999-12-31', '2000-01-31', '2000-02-29', + '2000-03-31'], name='idx', freq='M') + expected_3 = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', + '2000-04-30'], name='idx', freq='M') + + # reset freq to None + expected_1_nofreq = DatetimeIndex(['2000-01-31', '2000-01-31', '2000-02-29', + '2000-03-31'], name='idx', freq=None) + expected_3_nofreq = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', + '2000-01-02'], name='idx', freq=None) + + cases = [(0, datetime(1999, 12, 31), expected_0), + (-3, datetime(1999, 12, 31), expected_0), + (3, datetime(2000, 4, 30), expected_3), + (1, datetime(2000, 1, 31), expected_1_nofreq), + (3, datetime(2000, 1, 2), expected_3_nofreq)] + + for n, d, expected in cases: + result = idx.insert(n, d) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + # reset freq to None + result = idx.insert(3, datetime(2000, 1, 2)) + expected = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', + '2000-01-02'], name='idx', freq=None) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertTrue(result.freq is None) + + # GH 7299 + tm._skip_if_no_pytz() + import pytz + + idx = date_range('1/1/2000', periods=3, freq='D', tz='Asia/Tokyo', name='idx') + with tm.assertRaises(ValueError): + result = idx.insert(3, pd.Timestamp('2000-01-04')) + with tm.assertRaises(ValueError): + result = idx.insert(3, datetime(2000, 1, 4)) + with tm.assertRaises(ValueError): + result = idx.insert(3, pd.Timestamp('2000-01-04', tz='US/Eastern')) + with tm.assertRaises(ValueError): + result = idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone('US/Eastern'))) + + for tz in ['US/Pacific', 'Asia/Singapore']: + idx = date_range('1/1/2000 09:00', periods=6, freq='H', tz=tz, name='idx') + # preserve freq + expected = date_range('1/1/2000 09:00', periods=7, freq='H', tz=tz, name='idx') + for d in [pd.Timestamp('2000-01-01 15:00', tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 15))]: + + result = idx.insert(6, d) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.tz, expected.tz) + + expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 10:00', '2000-01-01 11:00', + '2000-01-01 12:00', '2000-01-01 13:00', '2000-01-01 14:00', + '2000-01-01 10:00'], name='idx', + tz=tz, freq=None) + # reset freq to None + for d in [pd.Timestamp('2000-01-01 10:00', tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 10))]: + result = idx.insert(6, d) + 
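test_insert asserts that DatetimeIndex.insert keeps the freq only when the new point extends the regular spacing, and that a tz-aware index rejects values in a different (or missing) timezone. A minimal naive-index sketch of the freq rule (illustrative only):

    import pandas as pd

    idx = pd.date_range('2000-01-31', periods=3, freq='M', name='idx')
    extended = idx.insert(3, pd.Timestamp('2000-04-30'))
    assert extended.freq == idx.freq    # spacing still regular, freq kept
    irregular = idx.insert(1, pd.Timestamp('2000-02-15'))
    assert irregular.freq is None       # irregular spacing, freq reset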
self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertTrue(result.freq is None) + self.assertEqual(result.tz, expected.tz) + + def test_delete(self): + idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx') + + # preserve freq + expected_0 = date_range(start='2000-02-01', periods=4, freq='M', name='idx') + expected_4 = date_range(start='2000-01-01', periods=4, freq='M', name='idx') + + # reset freq to None + expected_1 = DatetimeIndex(['2000-01-31', '2000-03-31', '2000-04-30', + '2000-05-31'], freq=None, name='idx') + + cases ={0: expected_0, -5: expected_0, + -1: expected_4, 4: expected_4, + 1: expected_1} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + with tm.assertRaises((IndexError, ValueError)): + # either, depending on numpy version + result = idx.delete(5) + + for tz in [None, 'Asia/Tokyo', 'US/Pacific']: + idx = date_range(start='2000-01-01 09:00', periods=10, + freq='H', name='idx', tz=tz) + + expected = date_range(start='2000-01-01 10:00', periods=9, + freq='H', name='idx', tz=tz) + result = idx.delete(0) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freqstr, 'H') + self.assertEqual(result.tz, expected.tz) + + expected = date_range(start='2000-01-01 09:00', periods=9, + freq='H', name='idx', tz=tz) + result = idx.delete(-1) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freqstr, 'H') + self.assertEqual(result.tz, expected.tz) + + def test_delete_slice(self): + idx = date_range(start='2000-01-01', periods=10, freq='D', name='idx') + + # preserve freq + expected_0_2 = date_range(start='2000-01-04', periods=7, freq='D', name='idx') + expected_7_9 = date_range(start='2000-01-01', periods=7, freq='D', name='idx') + + # reset freq to None + expected_3_5 = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', + '2000-01-07', '2000-01-08', '2000-01-09', + '2000-01-10'], freq=None, name='idx') + + cases ={(0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + result = idx.delete(slice(n[0], n[-1] + 1)) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + + for tz in [None, 'Asia/Tokyo', 'US/Pacific']: + ts = pd.Series(1, index=pd.date_range('2000-01-01 09:00', periods=10, + freq='H', name='idx', tz=tz)) + # preserve freq + result = ts.drop(ts.index[:5]).index + expected = pd.date_range('2000-01-01 14:00', periods=5, freq='H', name='idx', tz=tz) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.tz, expected.tz) + + # reset freq to None + result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index + expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 11:00', '2000-01-01 13:00', + '2000-01-01 15:00', '2000-01-01 17:00'], + freq=None, name='idx', tz=tz) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + 
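test_delete and test_delete_slice encode the companion rule for removal: dropping points from either end of a regular index keeps the freq, while removing interior points resets it to None. A minimal sketch (illustrative only):

    import pandas as pd

    idx = pd.date_range('2000-01-01', periods=5, freq='D', name='idx')
    assert idx.delete(0).freq == idx.freq     # trim from the front
    assert idx.delete(-1).freq == idx.freq    # trim from the back
    assert idx.delete(2).freq is None         # hole in the middle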
self.assertEqual(result.tz, expected.tz) + + def test_take(self): + dates = [datetime(2010, 1, 1, 14), datetime(2010, 1, 1, 15), + datetime(2010, 1, 1, 17), datetime(2010, 1, 1, 21)] + + for tz in [None, 'US/Eastern', 'Asia/Tokyo']: + idx = DatetimeIndex(start='2010-01-01 09:00', end='2010-02-01 09:00', + freq='H', tz=tz, name='idx') + expected = DatetimeIndex(dates, freq=None, name='idx', tz=tz) + + taken1 = idx.take([5, 6, 8, 12]) + taken2 = idx[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + self.assertTrue(taken.equals(expected)) + tm.assert_isinstance(taken, DatetimeIndex) + self.assertIsNone(taken.freq) + self.assertEqual(taken.tz, expected.tz) + self.assertEqual(taken.name, expected.name) + + def test_map_bug_1677(self): + index = DatetimeIndex(['2012-04-25 09:30:00.393000']) + f = index.asof + + result = index.map(f) + expected = np.array([f(index[0])]) + self.assert_numpy_array_equal(result, expected) + + def test_groupby_function_tuple_1677(self): + df = DataFrame(np.random.rand(100), + index=date_range("1/1/2000", periods=100)) + monthly_group = df.groupby(lambda x: (x.year, x.month)) + + result = monthly_group.mean() + tm.assert_isinstance(result.index[0], tuple) + + def test_append_numpy_bug_1681(self): + # another datetime64 bug + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + a = DataFrame() + c = DataFrame({'A': 'foo', 'B': dr}, index=dr) + + result = a.append(c) + self.assertTrue((result['B'] == dr).all()) + + def test_isin(self): + index = tm.makeDateIndex(4) + result = index.isin(index) + self.assertTrue(result.all()) + + result = index.isin(list(index)) + self.assertTrue(result.all()) + + assert_almost_equal(index.isin([index[2], 5]), + [False, False, True, False]) + + def test_union(self): + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = Int64Index(np.arange(10, 30, 2)) + result = i1.union(i2) + expected = Int64Index(np.arange(0, 30, 2)) + self.assert_numpy_array_equal(result, expected) + + def test_union_with_DatetimeIndex(self): + i1 = Int64Index(np.arange(0, 20, 2)) + i2 = DatetimeIndex(start='2012-01-03 00:00:00', periods=10, freq='D') + i1.union(i2) # Works + i2.union(i1) # Fails with "AttributeError: can't set attribute" + + def test_time(self): + rng = pd.date_range('1/1/2000', freq='12min', periods=10) + result = pd.Index(rng).time + expected = [t.time() for t in rng] + self.assertTrue((result == expected).all()) + + def test_date(self): + rng = pd.date_range('1/1/2000', freq='12H', periods=10) + result = pd.Index(rng).date + expected = [t.date() for t in rng] + self.assertTrue((result == expected).all()) + + def test_does_not_convert_mixed_integer(self): + df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args, **kwargs: + randn(), r_idx_type='i', c_idx_type='dt') + cols = df.columns.join(df.index, how='outer') + joined = cols.join(df.columns) + self.assertEqual(cols.dtype, np.dtype('O')) + self.assertEqual(cols.dtype, joined.dtype) + assert_array_equal(cols.values, joined.values) + + def test_slice_keeps_name(self): + # GH4226 + st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles') + et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles') + dr = pd.date_range(st, et, freq='H', name='timebucket') + self.assertEqual(dr[1:].name, dr.name) + + def test_join_self(self): + index = date_range('1/1/2000', periods=10) + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = index.join(index, how=kind) + self.assertIs(index, joined) + + def assert_index_parameters(self, index): + assert index.freq == '40960N' + 
assert index.inferred_freq == '40960N' + + def test_ns_index(self): + tm._skip_if_not_numpy17_friendly() + + nsamples = 400 + ns = int(1e9 / 24414) + dtstart = np.datetime64('2012-09-20T00:00:00') + + dt = dtstart + np.arange(nsamples) * np.timedelta64(ns, 'ns') + freq = ns * pd.datetools.Nano() + index = pd.DatetimeIndex(dt, freq=freq, name='time') + self.assert_index_parameters(index) + + new_index = pd.DatetimeIndex(start=index[0], end=index[-1], freq=index.freq) + self.assert_index_parameters(new_index) + + def test_join_with_period_index(self): + df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args: + np.random.randint(2), c_idx_type='p', + r_idx_type='dt') + s = df.iloc[:5, 0] + joins = 'left', 'right', 'inner', 'outer' + + for join in joins: + with tm.assertRaisesRegexp(ValueError, 'can only call with other ' + 'PeriodIndex-ed objects'): + df.columns.join(s.index, how=join) + + def test_factorize(self): + idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', + '2014-02', '2014-03', '2014-03']) + + exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + arr, idx = idx1.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + # tz must be preserved + idx1 = idx1.tz_localize('Asia/Tokyo') + exp_idx = exp_idx.tz_localize('Asia/Tokyo') + + arr, idx = idx1.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01', + '2014-03', '2014-01']) + + exp_arr = np.array([2, 2, 1, 0, 2, 0]) + exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) + arr, idx = idx2.factorize(sort=True) + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01']) + arr, idx = idx2.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(exp_idx)) + + # freq must be preserved + idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo') + exp_arr = np.array([0, 1, 2, 3]) + arr, idx = idx3.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + self.assertTrue(idx.equals(idx3)) + + +class TestDatetime64(tm.TestCase): + """ + Also test support for datetime64[ns] in Series / DataFrame + """ + + def setUp(self): + dti = DatetimeIndex(start=datetime(2005, 1, 1), + end=datetime(2005, 1, 10), freq='Min') + self.series = Series(rand(len(dti)), dti) + + def test_datetimeindex_accessors(self): + dti = DatetimeIndex( + freq='D', start=datetime(1998, 1, 1), periods=365) + + self.assertEqual(dti.year[0], 1998) + self.assertEqual(dti.month[0], 1) + self.assertEqual(dti.day[0], 1) + self.assertEqual(dti.hour[0], 0) + self.assertEqual(dti.minute[0], 0) + self.assertEqual(dti.second[0], 0) + self.assertEqual(dti.microsecond[0], 0) + self.assertEqual(dti.dayofweek[0], 3) + + self.assertEqual(dti.dayofyear[0], 1) + self.assertEqual(dti.dayofyear[120], 121) + + self.assertEqual(dti.weekofyear[0], 1) + self.assertEqual(dti.weekofyear[120], 18) + + self.assertEqual(dti.quarter[0], 1) + self.assertEqual(dti.quarter[120], 2) + + self.assertEqual(dti.is_month_start[0], True) + self.assertEqual(dti.is_month_start[1], False) + self.assertEqual(dti.is_month_start[31], True) + self.assertEqual(dti.is_quarter_start[0], True) + 
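test_factorize above checks that DatetimeIndex.factorize returns integer codes plus the unique dates, and that a timezone set on the input survives into the returned uniques. A minimal sketch (illustrative only; the timezone needs pytz, as in the tests):

    import pandas as pd

    idx = pd.DatetimeIndex(['2014-01-01', '2014-01-01', '2014-02-01'])
    idx = idx.tz_localize('Asia/Tokyo')
    codes, uniques = idx.factorize()
    assert list(codes) == [0, 0, 1]
    assert str(uniques.tz) == 'Asia/Tokyo'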
self.assertEqual(dti.is_quarter_start[90], True) + self.assertEqual(dti.is_year_start[0], True) + self.assertEqual(dti.is_year_start[364], False) + self.assertEqual(dti.is_month_end[0], False) + self.assertEqual(dti.is_month_end[30], True) + self.assertEqual(dti.is_month_end[31], False) + self.assertEqual(dti.is_month_end[364], True) + self.assertEqual(dti.is_quarter_end[0], False) + self.assertEqual(dti.is_quarter_end[30], False) + self.assertEqual(dti.is_quarter_end[89], True) + self.assertEqual(dti.is_quarter_end[364], True) + self.assertEqual(dti.is_year_end[0], False) + self.assertEqual(dti.is_year_end[364], True) + + self.assertEqual(len(dti.year), 365) + self.assertEqual(len(dti.month), 365) + self.assertEqual(len(dti.day), 365) + self.assertEqual(len(dti.hour), 365) + self.assertEqual(len(dti.minute), 365) + self.assertEqual(len(dti.second), 365) + self.assertEqual(len(dti.microsecond), 365) + self.assertEqual(len(dti.dayofweek), 365) + self.assertEqual(len(dti.dayofyear), 365) + self.assertEqual(len(dti.weekofyear), 365) + self.assertEqual(len(dti.quarter), 365) + self.assertEqual(len(dti.is_month_start), 365) + self.assertEqual(len(dti.is_month_end), 365) + self.assertEqual(len(dti.is_quarter_start), 365) + self.assertEqual(len(dti.is_quarter_end), 365) + self.assertEqual(len(dti.is_year_start), 365) + self.assertEqual(len(dti.is_year_end), 365) + + dti = DatetimeIndex( + freq='BQ-FEB', start=datetime(1998, 1, 1), periods=4) + + self.assertEqual(sum(dti.is_quarter_start), 0) + self.assertEqual(sum(dti.is_quarter_end), 4) + self.assertEqual(sum(dti.is_year_start), 0) + self.assertEqual(sum(dti.is_year_end), 1) + + # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, CBD requires np >= 1.7 + if not _np_version_under1p7: + bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu') + dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) + self.assertRaises(ValueError, lambda: dti.is_month_start) + + dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + + self.assertEqual(dti.is_month_start[0], 1) + + tests = [ + (Timestamp('2013-06-01', offset='M').is_month_start, 1), + (Timestamp('2013-06-01', offset='BM').is_month_start, 0), + (Timestamp('2013-06-03', offset='M').is_month_start, 0), + (Timestamp('2013-06-03', offset='BM').is_month_start, 1), + (Timestamp('2013-02-28', offset='Q-FEB').is_month_end, 1), + (Timestamp('2013-02-28', offset='Q-FEB').is_quarter_end, 1), + (Timestamp('2013-02-28', offset='Q-FEB').is_year_end, 1), + (Timestamp('2013-03-01', offset='Q-FEB').is_month_start, 1), + (Timestamp('2013-03-01', offset='Q-FEB').is_quarter_start, 1), + (Timestamp('2013-03-01', offset='Q-FEB').is_year_start, 1), + (Timestamp('2013-03-31', offset='QS-FEB').is_month_end, 1), + (Timestamp('2013-03-31', offset='QS-FEB').is_quarter_end, 0), + (Timestamp('2013-03-31', offset='QS-FEB').is_year_end, 0), + (Timestamp('2013-02-01', offset='QS-FEB').is_month_start, 1), + (Timestamp('2013-02-01', offset='QS-FEB').is_quarter_start, 1), + (Timestamp('2013-02-01', offset='QS-FEB').is_year_start, 1), + (Timestamp('2013-06-30', offset='BQ').is_month_end, 0), + (Timestamp('2013-06-30', offset='BQ').is_quarter_end, 0), + (Timestamp('2013-06-30', offset='BQ').is_year_end, 0), + (Timestamp('2013-06-28', offset='BQ').is_month_end, 1), + (Timestamp('2013-06-28', offset='BQ').is_quarter_end, 1), + (Timestamp('2013-06-28', offset='BQ').is_year_end, 0), + (Timestamp('2013-06-30', offset='BQS-APR').is_month_end, 0), + (Timestamp('2013-06-30', 
offset='BQS-APR').is_quarter_end, 0), + (Timestamp('2013-06-30', offset='BQS-APR').is_year_end, 0), + (Timestamp('2013-06-28', offset='BQS-APR').is_month_end, 1), + (Timestamp('2013-06-28', offset='BQS-APR').is_quarter_end, 1), + (Timestamp('2013-03-29', offset='BQS-APR').is_year_end, 1), + (Timestamp('2013-11-01', offset='AS-NOV').is_year_start, 1), + (Timestamp('2013-10-31', offset='AS-NOV').is_year_end, 1)] + + for ts, value in tests: + self.assertEqual(ts, value) + + + def test_nanosecond_field(self): + dti = DatetimeIndex(np.arange(10)) + + self.assert_numpy_array_equal(dti.nanosecond, np.arange(10)) + + def test_datetimeindex_diff(self): + dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), + periods=100) + dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), + periods=98) + self.assertEqual(len(dti1.diff(dti2)), 2) + + def test_fancy_getitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + + self.assertEqual(s[48], 48) + self.assertEqual(s['1/2/2009'], 48) + self.assertEqual(s['2009-1-2'], 48) + self.assertEqual(s[datetime(2009, 1, 2)], 48) + self.assertEqual(s[lib.Timestamp(datetime(2009, 1, 2))], 48) + self.assertRaises(KeyError, s.__getitem__, '2009-1-3') + + assert_series_equal(s['3/6/2009':'2009-06-05'], + s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) + + def test_fancy_setitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005, 1, 1), + end=datetime(2010, 1, 1)) + + s = Series(np.arange(len(dti)), index=dti) + s[48] = -1 + self.assertEqual(s[48], -1) + s['1/2/2009'] = -2 + self.assertEqual(s[48], -2) + s['1/2/2009':'2009-06-05'] = -3 + self.assertTrue((s[48:54] == -3).all()) + + def test_datetimeindex_constructor(self): + arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] + self.assertRaises(Exception, DatetimeIndex, arr) + + arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] + idx1 = DatetimeIndex(arr) + + arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] + idx2 = DatetimeIndex(arr) + + arr = [lib.Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', + '2005-01-04'] + idx3 = DatetimeIndex(arr) + + arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', + '2005-01-04'], dtype='O') + idx4 = DatetimeIndex(arr) + + arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) + idx5 = DatetimeIndex(arr) + + arr = to_datetime( + ['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04']) + idx6 = DatetimeIndex(arr) + + idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) + idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, + yearfirst=True) + self.assertTrue(idx7.equals(idx8)) + + for other in [idx2, idx3, idx4, idx5, idx6]: + self.assertTrue((idx1.values == other.values).all()) + + sdate = datetime(1999, 12, 25) + edate = datetime(2000, 1, 1) + idx = DatetimeIndex(start=sdate, freq='1B', periods=20) + self.assertEqual(len(idx), 20) + self.assertEqual(idx[0], sdate + 0 * dt.bday) + self.assertEqual(idx.freq, 'B') + + idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) + self.assertEqual(len(idx), 20) + self.assertEqual(idx[-1], edate) + self.assertEqual(idx.freq, '5D') + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=dt.Week(weekday=6)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') + idx2 = 
DatetimeIndex(start=sdate, end=edate, + freq=dt.QuarterBegin(startingMonth=1)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=dt.BQuarterEnd(startingMonth=12)) + self.assertEqual(len(idx1), len(idx2)) + self.assertEqual(idx1.offset, idx2.offset) + + def test_dayfirst(self): + # GH 5917 + arr = ['10/02/2014', '11/02/2014', '12/02/2014'] + expected = DatetimeIndex([datetime(2014, 2, 10), + datetime(2014, 2, 11), + datetime(2014, 2, 12)]) + idx1 = DatetimeIndex(arr, dayfirst=True) + idx2 = DatetimeIndex(np.array(arr), dayfirst=True) + idx3 = to_datetime(arr, dayfirst=True) + idx4 = to_datetime(np.array(arr), dayfirst=True) + idx5 = DatetimeIndex(Index(arr), dayfirst=True) + idx6 = DatetimeIndex(Series(arr), dayfirst=True) + self.assertTrue(expected.equals(idx1)) + self.assertTrue(expected.equals(idx2)) + self.assertTrue(expected.equals(idx3)) + self.assertTrue(expected.equals(idx4)) + self.assertTrue(expected.equals(idx5)) + self.assertTrue(expected.equals(idx6)) + + def test_dti_snap(self): + dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', + '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') + + res = dti.snap(freq='W-MON') + exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') + exp = exp.repeat([3, 4]) + self.assertTrue((res == exp).all()) + + res = dti.snap(freq='B') + + exp = date_range('1/1/2002', '1/7/2002', freq='b') + exp = exp.repeat([1, 1, 1, 2, 2]) + self.assertTrue((res == exp).all()) + + def test_dti_reset_index_round_trip(self): + dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') + d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) + d2 = d1.reset_index() + self.assertEqual(d2.dtypes[0], np.dtype('M8[ns]')) + d3 = d2.set_index('index') + assert_frame_equal(d1, d3, check_names=False) + + # #2329 + stamp = datetime(2012, 11, 22) + df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) + df = df.set_index('Date') + + self.assertEqual(df.index[0], stamp) + self.assertEqual(df.reset_index()['Date'][0], stamp) + + def test_dti_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') + idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') + + df = df.set_index(idx1) + self.assertTrue(df.index.equals(idx1)) + df = df.reindex(idx2) + self.assertTrue(df.index.equals(idx2)) + + def test_datetimeindex_union_join_empty(self): + dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') + empty = Index([]) + + result = dti.union(empty) + tm.assert_isinstance(result, DatetimeIndex) + self.assertIs(result, result) + + result = dti.join(empty) + tm.assert_isinstance(result, DatetimeIndex) + + def test_series_set_value(self): + # #1561 + + dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] + index = DatetimeIndex(dates) + + s = Series().set_value(dates[0], 1.) 
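test_dayfirst above covers GH 5917: the dayfirst flag controls how ambiguous day/month strings are parsed, whatever container they arrive in. A one-line sketch of the rule (illustrative only):

    import pandas as pd

    # With dayfirst=True, '10/02/2014' parses as 10 February, not 2 October.
    parsed = pd.to_datetime(['10/02/2014'], dayfirst=True)
    assert parsed[0] == pd.Timestamp('2014-02-10')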
+ s2 = s.set_value(dates[1], np.nan) + + exp = Series([1., np.nan], index=index) + + assert_series_equal(s2, exp) + + # s = Series(index[:1], index[:1]) + # s2 = s.set_value(dates[1], index[1]) + # self.assertEqual(s2.values.dtype, 'M8[ns]') + + @slow + def test_slice_locs_indexerror(self): + times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) + for i in range(100000)] + s = Series(lrange(100000), times) + s.ix[datetime(1900, 1, 1):datetime(2100, 1, 1)] + + def test_slicing_datetimes(self): + + # GH 7523 + + # unique + df = DataFrame(np.arange(4.,dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) for i in [1,2,3,4]]) + result = df.ix[datetime(2001,1,1,10):] + assert_frame_equal(result,df) + result = df.ix[:datetime(2001,1,4,10)] + assert_frame_equal(result,df) + result = df.ix[datetime(2001,1,1,10):datetime(2001,1,4,10)] + assert_frame_equal(result,df) + + result = df.ix[datetime(2001,1,1,11):] + expected = df.iloc[1:] + assert_frame_equal(result,expected) + result = df.ix['20010101 11':] + assert_frame_equal(result,expected) + + # duplicates + df = pd.DataFrame(np.arange(5.,dtype='float64'), + index=[datetime(2001, 1, i, 10, 00) for i in [1,2,2,3,4]]) + + result = df.ix[datetime(2001,1,1,10):] + assert_frame_equal(result,df) + result = df.ix[:datetime(2001,1,4,10)] + assert_frame_equal(result,df) + result = df.ix[datetime(2001,1,1,10):datetime(2001,1,4,10)] + assert_frame_equal(result,df) + + result = df.ix[datetime(2001,1,1,11):] + expected = df.iloc[1:] + assert_frame_equal(result,expected) + result = df.ix['20010101 11':] + assert_frame_equal(result,expected) + +class TestSeriesDatetime64(tm.TestCase): + + def setUp(self): + self.series = Series(date_range('1/1/2000', periods=10)) + + def test_auto_conversion(self): + series = Series(list(date_range('1/1/2000', periods=10))) + self.assertEqual(series.dtype, 'M8[ns]') + + def test_constructor_cant_cast_datetime64(self): + self.assertRaises(TypeError, Series, + date_range('1/1/2000', periods=10), dtype=float) + + def test_series_comparison_scalars(self): + val = datetime(2000, 1, 4) + result = self.series > val + expected = np.array([x > val for x in self.series]) + self.assert_numpy_array_equal(result, expected) + + val = self.series[5] + result = self.series > val + expected = np.array([x > val for x in self.series]) + self.assert_numpy_array_equal(result, expected) + + def test_between(self): + left, right = self.series[[2, 7]] + + result = self.series.between(left, right) + expected = (self.series >= left) & (self.series <= right) + assert_series_equal(result, expected) + + #---------------------------------------------------------------------- + # NaT support + + def test_NaT_scalar(self): + series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + + val = series[3] + self.assertTrue(com.isnull(val)) + + series[2] = val + self.assertTrue(com.isnull(series[2])) + + def test_set_none_nan(self): + self.series[3] = None + self.assertIs(self.series[3], NaT) + + self.series[3:5] = None + self.assertIs(self.series[4], NaT) + + self.series[5] = np.nan + self.assertIs(self.series[5], NaT) + + self.series[5:7] = np.nan + self.assertIs(self.series[6], NaT) + + def test_intercept_astype_object(self): + + # this test no longer makes sense as series is by default already M8[ns] + expected = self.series.astype('object') + + df = DataFrame({'a': self.series, + 'b': np.random.randn(len(self.series))}) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) + + df = DataFrame({'a': self.series, + 'b': 
['foo'] * len(self.series)}) + + result = df.values.squeeze() + self.assertTrue((result[:, 0] == expected.values).all()) + + def test_union(self): + rng1 = date_range('1/1/1999', '1/1/2012', freq='MS') + s1 = Series(np.random.randn(len(rng1)), rng1) + + rng2 = date_range('1/1/1980', '12/1/2001', freq='MS') + s2 = Series(np.random.randn(len(rng2)), rng2) + df = DataFrame({'s1': s1, 's2': s2}) + self.assertEqual(df.index.values.dtype, np.dtype('M8[ns]')) + + def test_intersection(self): + # GH 4690 (with tz) + for tz in [None, 'Asia/Tokyo']: + rng = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') + + # if target has the same name, it is preserved + rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx') + expected2 = date_range('6/1/2000', '6/20/2000', freq='D', name='idx') + + # if target name is different, it will be reset + rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other') + expected3 = date_range('6/1/2000', '6/20/2000', freq='D', name=None) + + result2 = rng.intersection(rng2) + result3 = rng.intersection(rng3) + for (result, expected) in [(result2, expected2), (result3, expected3)]: + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertEqual(result.freq, expected.freq) + self.assertEqual(result.tz, expected.tz) + + # non-monotonic + rng = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-02', '2011-01-03'], + tz=tz, name='idx') + + rng2 = DatetimeIndex(['2011-01-04', '2011-01-02', '2011-02-02', '2011-02-03'], + tz=tz, name='idx') + expected2 = DatetimeIndex(['2011-01-04', '2011-01-02'], tz=tz, name='idx') + + rng3 = DatetimeIndex(['2011-01-04', '2011-01-02', '2011-02-02', '2011-02-03'], + tz=tz, name='other') + expected3 = DatetimeIndex(['2011-01-04', '2011-01-02'], tz=tz, name=None) + + result2 = rng.intersection(rng2) + result3 = rng.intersection(rng3) + for (result, expected) in [(result2, expected2), (result3, expected3)]: + print(result, expected) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + self.assertIsNone(result.freq) + self.assertEqual(result.tz, expected.tz) + + # empty same freq GH2129 + rng = date_range('6/1/2000', '6/15/2000', freq='T') + result = rng[0:0].intersection(rng) + self.assertEqual(len(result), 0) + + result = rng.intersection(rng[0:0]) + self.assertEqual(len(result), 0) + + def test_date_range_bms_bug(self): + # #1645 + rng = date_range('1/1/2000', periods=10, freq='BMS') + + ex_first = Timestamp('2000-01-03') + self.assertEqual(rng[0], ex_first) + + def test_string_index_series_name_converted(self): + # #1644 + df = DataFrame(np.random.randn(10, 4), + index=date_range('1/1/2000', periods=10)) + + result = df.ix['1/3/2000'] + self.assertEqual(result.name, df.index[2]) + + result = df.T['1/3/2000'] + self.assertEqual(result.name, df.index[2]) + + +class TestTimestamp(tm.TestCase): + + def test_class_ops_pytz(self): + tm._skip_if_no_pytz() + from pytz import timezone + + def compare(x, y): + self.assertEqual(int(Timestamp(x).value / 1e9), int(Timestamp(y).value / 1e9)) + + compare(Timestamp.now(), datetime.now()) + compare(Timestamp.now('UTC'), datetime.now(timezone('UTC'))) + compare(Timestamp.utcnow(), datetime.utcnow()) + compare(Timestamp.today(), datetime.today()) + + def test_class_ops_dateutil(self): + tm._skip_if_no_dateutil() + from dateutil.tz import tzutc + + def compare(x,y): + self.assertEqual(int(np.round(Timestamp(x).value/1e9)), int(np.round(Timestamp(y).value/1e9))) + + compare(Timestamp.now(),datetime.now()) + 
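test_intersection above asserts that set operations keep the index name only when both operands agree on it, and reset it to None otherwise. A minimal sketch (illustrative only):

    import pandas as pd

    a = pd.date_range('2000-06-01', '2000-06-30', name='idx')
    b = pd.date_range('2000-06-15', '2000-07-20', name='idx')
    c = pd.date_range('2000-06-15', '2000-07-20', name='other')
    assert a.intersection(b).name == 'idx'
    assert a.intersection(c).name is None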
compare(Timestamp.now('UTC'), datetime.now(tzutc())) + compare(Timestamp.utcnow(),datetime.utcnow()) + compare(Timestamp.today(),datetime.today()) + + def test_basics_nanos(self): + val = np.int64(946684800000000000).view('M8[ns]') + stamp = Timestamp(val.view('i8') + 500) + self.assertEqual(stamp.year, 2000) + self.assertEqual(stamp.month, 1) + self.assertEqual(stamp.microsecond, 0) + self.assertEqual(stamp.nanosecond, 500) + + def test_unit(self): + def check(val,unit=None,h=1,s=1,us=0): + stamp = Timestamp(val, unit=unit) + self.assertEqual(stamp.year, 2000) + self.assertEqual(stamp.month, 1) + self.assertEqual(stamp.day, 1) + self.assertEqual(stamp.hour, h) + if unit != 'D': + self.assertEqual(stamp.minute, 1) + self.assertEqual(stamp.second, s) + self.assertEqual(stamp.microsecond, us) + else: + self.assertEqual(stamp.minute, 0) + self.assertEqual(stamp.second, 0) + self.assertEqual(stamp.microsecond, 0) + self.assertEqual(stamp.nanosecond, 0) + + ts = Timestamp('20000101 01:01:01') + val = ts.value + days = (ts - Timestamp('1970-01-01')).days + + check(val) + check(val/long(1000),unit='us') + check(val/long(1000000),unit='ms') + check(val/long(1000000000),unit='s') + check(days,unit='D',h=0) + + # using truediv, so these are like floats + if compat.PY3: + check((val+500000)/long(1000000000),unit='s',us=500) + check((val+500000000)/long(1000000000),unit='s',us=500000) + check((val+500000)/long(1000000),unit='ms',us=500) + + # get chopped in py2 + else: + check((val+500000)/long(1000000000),unit='s') + check((val+500000000)/long(1000000000),unit='s') + check((val+500000)/long(1000000),unit='ms') + + # ok + check((val+500000)/long(1000),unit='us',us=500) + check((val+500000000)/long(1000000),unit='ms',us=500000) + + # floats + check(val/1000.0 + 5,unit='us',us=5) + check(val/1000.0 + 5000,unit='us',us=5000) + check(val/1000000.0 + 0.5,unit='ms',us=500) + check(val/1000000.0 + 0.005,unit='ms',us=5) + check(val/1000000000.0 + 0.5,unit='s',us=500000) + check(days + 0.5,unit='D',h=12) + + # nan + result = Timestamp(np.nan) + self.assertIs(result, NaT) + + result = Timestamp(None) + self.assertIs(result, NaT) + + result = Timestamp(iNaT) + self.assertIs(result, NaT) + + result = Timestamp(NaT) + self.assertIs(result, NaT) + + def test_comparison(self): + # 5-18-2012 00:00:00.000 + stamp = long(1337299200000000000) + + val = Timestamp(stamp) + + self.assertEqual(val, val) + self.assertFalse(val != val) + self.assertFalse(val < val) + self.assertTrue(val <= val) + self.assertFalse(val > val) + self.assertTrue(val >= val) + + other = datetime(2012, 5, 18) + self.assertEqual(val, other) + self.assertFalse(val != other) + self.assertFalse(val < other) + self.assertTrue(val <= other) + self.assertFalse(val > other) + self.assertTrue(val >= other) + + other = Timestamp(stamp + 100) + + self.assertNotEqual(val, other) + self.assertNotEqual(val, other) + self.assertTrue(val < other) + self.assertTrue(val <= other) + self.assertTrue(other > val) + self.assertTrue(other >= val) + + def test_cant_compare_tz_naive_w_aware(self): + tm._skip_if_no_pytz() + # #1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz='utc') + + self.assertRaises(Exception, a.__eq__, b) + self.assertRaises(Exception, a.__ne__, b) + self.assertRaises(Exception, a.__lt__, b) + self.assertRaises(Exception, a.__gt__, b) + self.assertRaises(Exception, b.__eq__, a) + self.assertRaises(Exception, b.__ne__, a) + self.assertRaises(Exception, b.__lt__, a) + self.assertRaises(Exception, b.__gt__, a) + + if sys.version_info < 
(3, 3): + self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) + self.assertRaises(Exception, a.to_pydatetime().__eq__, b) + else: + self.assertFalse(a == b.to_pydatetime()) + self.assertFalse(a.to_pydatetime() == b) + + def test_cant_compare_tz_naive_w_aware_explicit_pytz(self): + tm._skip_if_no_pytz() + from pytz import utc + # #1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz=utc) + + self.assertRaises(Exception, a.__eq__, b) + self.assertRaises(Exception, a.__ne__, b) + self.assertRaises(Exception, a.__lt__, b) + self.assertRaises(Exception, a.__gt__, b) + self.assertRaises(Exception, b.__eq__, a) + self.assertRaises(Exception, b.__ne__, a) + self.assertRaises(Exception, b.__lt__, a) + self.assertRaises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) + self.assertRaises(Exception, a.to_pydatetime().__eq__, b) + else: + self.assertFalse(a == b.to_pydatetime()) + self.assertFalse(a.to_pydatetime() == b) + + def test_cant_compare_tz_naive_w_aware_dateutil(self): + tm._skip_if_no_dateutil() + from dateutil.tz import tzutc + utc = tzutc() + # #1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz=utc) + + self.assertRaises(Exception, a.__eq__, b) + self.assertRaises(Exception, a.__ne__, b) + self.assertRaises(Exception, a.__lt__, b) + self.assertRaises(Exception, a.__gt__, b) + self.assertRaises(Exception, b.__eq__, a) + self.assertRaises(Exception, b.__ne__, a) + self.assertRaises(Exception, b.__lt__, a) + self.assertRaises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) + self.assertRaises(Exception, a.to_pydatetime().__eq__, b) + else: + self.assertFalse(a == b.to_pydatetime()) + self.assertFalse(a.to_pydatetime() == b) + + def test_delta_preserve_nanos(self): + val = Timestamp(long(1337299200000000123)) + result = val + timedelta(1) + self.assertEqual(result.nanosecond, val.nanosecond) + + def test_frequency_misc(self): + self.assertEqual(fmod.get_freq_group('T'), + fmod.FreqGroup.FR_MIN) + + code, stride = fmod.get_freq_code(offsets.Hour()) + self.assertEqual(code, fmod.FreqGroup.FR_HR) + + code, stride = fmod.get_freq_code((5, 'T')) + self.assertEqual(code, fmod.FreqGroup.FR_MIN) + self.assertEqual(stride, 5) + + offset = offsets.Hour() + result = fmod.to_offset(offset) + self.assertEqual(result, offset) + + result = fmod.to_offset((5, 'T')) + expected = offsets.Minute(5) + self.assertEqual(result, expected) + + self.assertRaises(ValueError, fmod.get_freq_code, (5, 'baz')) + + self.assertRaises(ValueError, fmod.to_offset, '100foo') + + self.assertRaises(ValueError, fmod.to_offset, ('', '')) + + result = fmod.get_standard_freq(offsets.Hour()) + self.assertEqual(result, 'H') + + def test_hash_equivalent(self): + d = {datetime(2011, 1, 1): 5} + stamp = Timestamp(datetime(2011, 1, 1)) + self.assertEqual(d[stamp], 5) + + def test_timestamp_compare_scalars(self): + # case where ndim == 0 + lhs = np.datetime64(datetime(2013, 12, 6)) + rhs = Timestamp('now') + nat = Timestamp('nat') + + ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq', + 'ne': 'ne'} + + for left, right in ops.items(): + left_f = getattr(operator, left) + right_f = getattr(operator, right) + + if pd._np_version_under1p7: + # you have to convert to timestamp for this to work with numpy + # scalars + expected = left_f(Timestamp(lhs), rhs) + + # otherwise a TypeError is thrown + if left not in ('eq', 'ne'): + with tm.assertRaises(TypeError): 
+ left_f(lhs, rhs) + else: + expected = left_f(lhs, rhs) + + result = right_f(rhs, lhs) + self.assertEqual(result, expected) + + expected = left_f(rhs, nat) + result = right_f(nat, rhs) + self.assertEqual(result, expected) + + def test_timestamp_compare_series(self): + # make sure we can compare Timestamps on the right AND left hand side + # GH4982 + s = Series(date_range('20010101', periods=10), name='dates') + s_nat = s.copy(deep=True) + + s[0] = pd.Timestamp('nat') + s[3] = pd.Timestamp('nat') + + ops = {'lt': 'gt', 'le': 'ge', 'eq': 'eq', 'ne': 'ne'} + + for left, right in ops.items(): + left_f = getattr(operator, left) + right_f = getattr(operator, right) + + # no nats + expected = left_f(s, Timestamp('20010109')) + result = right_f(Timestamp('20010109'), s) + tm.assert_series_equal(result, expected) + + # nats + expected = left_f(s, Timestamp('nat')) + result = right_f(Timestamp('nat'), s) + tm.assert_series_equal(result, expected) + + # compare to timestamp with series containing nats + expected = left_f(s_nat, Timestamp('20010109')) + result = right_f(Timestamp('20010109'), s_nat) + tm.assert_series_equal(result, expected) + + # compare to nat with series containing nats + expected = left_f(s_nat, Timestamp('nat')) + result = right_f(Timestamp('nat'), s_nat) + tm.assert_series_equal(result, expected) + + +class TestSlicing(tm.TestCase): + + def test_slice_year(self): + dti = DatetimeIndex(freq='B', start=datetime(2005, 1, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + result = s['2005'] + expected = s[s.index.year == 2005] + assert_series_equal(result, expected) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + result = df.ix['2005'] + expected = df[df.index.year == 2005] + assert_frame_equal(result, expected) + + rng = date_range('1/1/2000', '1/1/2010') + + result = rng.get_loc('2009') + expected = slice(3288, 3653) + self.assertEqual(result, expected) + + def test_slice_quarter(self): + dti = DatetimeIndex(freq='D', start=datetime(2000, 6, 1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + self.assertEqual(len(s['2001Q1']), 90) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEqual(len(df.ix['1Q01']), 90) + + def test_slice_month(self): + dti = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(dti)), index=dti) + self.assertEqual(len(s['2005-11']), 30) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEqual(len(df.ix['2005-11']), 30) + + assert_series_equal(s['2005-11'], s['11-2005']) + + def test_partial_slice(self): + rng = DatetimeIndex(freq='D', start=datetime(2005, 1, 1), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-05':'2006-02'] + expected = s['20050501':'20060228'] + assert_series_equal(result, expected) + + result = s['2005-05':] + expected = s['20050501':] + assert_series_equal(result, expected) + + result = s[:'2006-02'] + expected = s[:'20060228'] + assert_series_equal(result, expected) + + result = s['2005-1-1'] + self.assertEqual(result, s.irow(0)) + + self.assertRaises(Exception, s.__getitem__, '2004-12-31') + + def test_partial_slice_daily(self): + rng = DatetimeIndex(freq='H', start=datetime(2005, 1, 31), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-31'] + assert_series_equal(result, s.ix[:24]) + + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00') + + def test_partial_slice_hourly(self): + rng = DatetimeIndex(freq='T', start=datetime(2005, 1, 1, 20, 0, 
0), + periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-1'] + assert_series_equal(result, s.ix[:60 * 4]) + + result = s['2005-1-1 20'] + assert_series_equal(result, s.ix[:60]) + + self.assertEqual(s['2005-1-1 20:00'], s.ix[0]) + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:15') + + def test_partial_slice_minutely(self): + rng = DatetimeIndex(freq='S', start=datetime(2005, 1, 1, 23, 59, 0), + periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-1-1 23:59'] + assert_series_equal(result, s.ix[:60]) + + result = s['2005-1-1'] + assert_series_equal(result, s.ix[:60]) + + self.assertEqual(s[Timestamp('2005-1-1 23:59:00')], s.ix[0]) + self.assertRaises(Exception, s.__getitem__, '2004-12-31 00:00:00') + + def test_partial_slicing_with_multiindex(self): + + # GH 4758 + # partial string indexing with a multi-index buggy + df = DataFrame({'ACCOUNT':["ACCT1", "ACCT1", "ACCT1", "ACCT2"], + 'TICKER':["ABC", "MNP", "XYZ", "XYZ"], + 'val':[1,2,3,4]}, + index=date_range("2013-06-19 09:30:00", periods=4, freq='5T')) + df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True) + + expected = DataFrame([[1]],index=Index(['ABC'],name='TICKER'),columns=['val']) + result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')] + assert_frame_equal(result, expected) + + expected = df_multi.loc[(pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')] + result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] + assert_series_equal(result, expected) + + # this is a KeyError as we don't do partial string selection on multi-levels + def f(): + df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] + self.assertRaises(KeyError, f) + + # GH 4294 + # partial slice on a series mi + s = pd.DataFrame(randn(1000, 1000), index=pd.date_range('2000-1-1', periods=1000)).stack() + + s2 = s[:-1].copy() + expected = s2['2000-1-4'] + result = s2[pd.Timestamp('2000-1-4')] + assert_series_equal(result, expected) + + result = s[pd.Timestamp('2000-1-4')] + expected = s['2000-1-4'] + assert_series_equal(result, expected) + + df2 = pd.DataFrame(s) + expected = df2.ix['2000-1-4'] + result = df2.ix[pd.Timestamp('2000-1-4')] + assert_frame_equal(result, expected) + + def test_date_range_normalize(self): + snap = datetime.today() + n = 50 + + rng = date_range(snap, periods=n, normalize=False, freq='2D') + + offset = timedelta(2) + values = np.array([snap + i * offset for i in range(n)], + dtype='M8[ns]') + + self.assert_numpy_array_equal(rng, values) + + rng = date_range( + '1/1/2000 08:15', periods=n, normalize=False, freq='B') + the_time = time(8, 15) + for val in rng: + self.assertEqual(val.time(), the_time) + + def test_timedelta(self): + # this is valid too + index = date_range('1/1/2000', periods=50, freq='B') + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + self.assertTrue(tm.equalContents(index, back)) + self.assertEqual(shifted.freq, index.freq) + self.assertEqual(shifted.freq, back.freq) + + result = index - timedelta(1) + expected = index + timedelta(-1) + self.assertTrue(result.equals(expected)) + + # GH4134, buggy with timedeltas + rng = date_range('2013', '2014') + s = Series(rng) + result1 = rng - pd.offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - pd.offsets.Hour(1)) + self.assertTrue(result1.equals(result4)) + self.assertTrue(result2.equals(result3)) + + def test_shift(self): + ts = Series(np.random.randn(5), + index=date_range('1/1/2000', 
periods=5, freq='H')) + + result = ts.shift(1, freq='5T') + exp_index = ts.index.shift(1, freq='5T') + self.assertTrue(result.index.equals(exp_index)) + + # GH #1063, multiple of same base + result = ts.shift(1, freq='4H') + exp_index = ts.index + datetools.Hour(4) + self.assertTrue(result.index.equals(exp_index)) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + self.assertRaises(ValueError, idx.shift, 1) + + def test_setops_preserve_freq(self): + for tz in [None, 'Asia/Tokyo', 'US/Eastern']: + rng = date_range('1/1/2000', '1/1/2002', name='idx', tz=tz) + + result = rng[:50].union(rng[50:100]) + self.assertEqual(result.name, rng.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].union(rng[30:100]) + self.assertEqual(result.name, rng.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].union(rng[60:100]) + self.assertEqual(result.name, rng.name) + self.assertIsNone(result.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].intersection(rng[25:75]) + self.assertEqual(result.name, rng.name) + self.assertEqual(result.freqstr, 'D') + self.assertEqual(result.tz, rng.tz) + + nofreq = DatetimeIndex(list(rng[25:75]), name='other') + result = rng[:50].union(nofreq) + self.assertIsNone(result.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + result = rng[:50].intersection(nofreq) + self.assertIsNone(result.name) + self.assertEqual(result.freq, rng.freq) + self.assertEqual(result.tz, rng.tz) + + def test_min_max(self): + rng = date_range('1/1/2000', '12/31/2000') + rng2 = rng.take(np.random.permutation(len(rng))) + + the_min = rng2.min() + the_max = rng2.max() + tm.assert_isinstance(the_min, Timestamp) + tm.assert_isinstance(the_max, Timestamp) + self.assertEqual(the_min, rng[0]) + self.assertEqual(the_max, rng[-1]) + + self.assertEqual(rng.min(), rng[0]) + self.assertEqual(rng.max(), rng[-1]) + + def test_min_max_series(self): + rng = date_range('1/1/2000', periods=10, freq='4h') + lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] + df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), + 'L': lvls}) + + result = df.TS.max() + exp = Timestamp(df.TS.iget(-1)) + self.assertTrue(isinstance(result, Timestamp)) + self.assertEqual(result, exp) + + result = df.TS.min() + exp = Timestamp(df.TS.iget(0)) + self.assertTrue(isinstance(result, Timestamp)) + self.assertEqual(result, exp) + + def test_from_M8_structured(self): + dates = [(datetime(2012, 9, 9, 0, 0), + datetime(2012, 9, 8, 15, 10))] + arr = np.array(dates, + dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]) + df = DataFrame(arr) + + self.assertEqual(df['Date'][0], dates[0][0]) + self.assertEqual(df['Forecasting'][0], dates[0][1]) + + s = Series(arr['Date']) + self.assertTrue(s[0], Timestamp) + self.assertEqual(s[0], dates[0][0]) + + s = Series.from_array(arr['Date'], Index([0])) + self.assertEqual(s[0], dates[0][0]) + + def test_get_level_values_box(self): + from pandas import MultiIndex + + dates = date_range('1/1/2000', periods=4) + levels = [dates, [0, 1]] + labels = [[0, 0, 1, 1, 2, 2, 3, 3], + [0, 1, 0, 1, 0, 1, 0, 1]] + + index = MultiIndex(levels=levels, labels=labels) + + self.assertTrue(isinstance(index.get_level_values(0)[0], Timestamp)) + + def test_frame_apply_dont_convert_datetime64(self): + from pandas.tseries.offsets import BDay + df = DataFrame({'x1': [datetime(1996, 1, 1)]}) + + df = df.applymap(lambda x: x + BDay()) + df = 
df.applymap(lambda x: x + BDay()) + + self.assertTrue(df.x1.dtype == 'M8[ns]') + + def test_date_range_fy5252(self): + dr = date_range(start="2013-01-01", + periods=2, + freq=offsets.FY5253(startingMonth=1, + weekday=3, + variation="nearest")) + self.assertEqual(dr[0], Timestamp('2013-01-31')) + self.assertEqual(dr[1], Timestamp('2014-01-30')) + +class TimeConversionFormats(tm.TestCase): + def test_to_datetime_format(self): + values = ['1/1/2000', '1/2/2000', '1/3/2000'] + + results1 = [ Timestamp('20000101'), Timestamp('20000201'), + Timestamp('20000301') ] + results2 = [ Timestamp('20000101'), Timestamp('20000102'), + Timestamp('20000103') ] + for vals, expecteds in [ (values, (Index(results1), Index(results2))), + (Series(values),(Series(results1), Series(results2))), + (values[0], (results1[0], results2[0])), + (values[1], (results1[1], results2[1])), + (values[2], (results1[2], results2[2])) ]: + + for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): + result = to_datetime(vals, format=fmt) + expected = expecteds[i] + + if isinstance(expected, Series): + assert_series_equal(result, Series(expected)) + elif isinstance(expected, Timestamp): + self.assertEqual(result, expected) + else: + self.assertTrue(result.equals(expected)) + + def test_to_datetime_format_YYYYMMDD(self): + s = Series([19801222,19801222] + [19810105]*5) + expected = Series([ Timestamp(x) for x in s.apply(str) ]) + + result = to_datetime(s,format='%Y%m%d') + assert_series_equal(result, expected) + + result = to_datetime(s.apply(str),format='%Y%m%d') + assert_series_equal(result, expected) + + # with NaT + expected = Series([Timestamp("19801222"),Timestamp("19801222")] + [Timestamp("19810105")]*5) + expected[2] = np.nan + s[2] = np.nan + + result = to_datetime(s,format='%Y%m%d') + assert_series_equal(result, expected) + + # string with NaT + s = s.apply(str) + s[2] = 'nat' + result = to_datetime(s,format='%Y%m%d') + assert_series_equal(result, expected) + + + def test_to_datetime_format_microsecond(self): + val = '01-Apr-2011 00:00:01.978' + format = '%d-%b-%Y %H:%M:%S.%f' + result = to_datetime(val, format=format) + exp = dt.datetime.strptime(val, format) + self.assertEqual(result, exp) + + def test_to_datetime_format_time(self): + data = [ + ['01/10/2010 15:20', '%m/%d/%Y %H:%M', Timestamp('2010-01-10 15:20')], + ['01/10/2010 05:43', '%m/%d/%Y %I:%M', Timestamp('2010-01-10 05:43')], + ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S', Timestamp('2010-01-10 13:56:01')]#, + #['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p', Timestamp('2010-01-10 20:14')], + #['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p', Timestamp('2010-01-10 07:40')], + #['01/10/2010 09:12:56 AM', '%m/%d/%Y %I:%M:%S %p', Timestamp('2010-01-10 09:12:56')] + ] + for s, format, dt in data: + self.assertEqual(to_datetime(s, format=format), dt) + + def test_to_datetime_format_weeks(self): + data = [ + ['2009324', '%Y%W%w', Timestamp('2009-08-13')], + ['2013020', '%Y%U%w', Timestamp('2013-01-13')] + ] + for s, format, dt in data: + self.assertEqual(to_datetime(s, format=format), dt) + +class TestToDatetimeInferFormat(tm.TestCase): + def test_to_datetime_infer_datetime_format_consistent_format(self): + time_series = pd.Series( + pd.date_range('20000101', periods=50, freq='H') + ) + + test_formats = [ + '%m-%d-%Y', + '%m/%d/%Y %H:%M:%S.%f', + '%Y-%m-%dT%H:%M:%S.%f', + ] + + for test_format in test_formats: + s_as_dt_strings = time_series.apply( + lambda x: x.strftime(test_format) + ) + + with_format = pd.to_datetime(s_as_dt_strings, format=test_format) + no_infer = 
pd.to_datetime( + s_as_dt_strings, infer_datetime_format=False + ) + yes_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=True + ) + + # Whether the format is explicitly passed, it is inferred, or + # it is not inferred, the results should all be the same + self.assert_numpy_array_equal(with_format, no_infer) + self.assert_numpy_array_equal(no_infer, yes_infer) + + def test_to_datetime_infer_datetime_format_inconsistent_format(self): + test_series = pd.Series( + np.array([ + '01/01/2011 00:00:00', + '01-02-2011 00:00:00', + '2011-01-03T00:00:00', + ])) + + # When the format is inconsistent, infer_datetime_format should just + # fallback to the default parsing + self.assert_numpy_array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + ) + + test_series = pd.Series( + np.array([ + 'Jan/01/2011', + 'Feb/01/2011', + 'Mar/01/2011', + ])) + + self.assert_numpy_array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + ) + + def test_to_datetime_infer_datetime_format_series_with_nans(self): + test_series = pd.Series( + np.array([ + '01/01/2011 00:00:00', + np.nan, + '01/03/2011 00:00:00', + np.nan, + ])) + + self.assert_numpy_array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + ) + + def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): + test_series = pd.Series( + np.array([ + np.nan, + np.nan, + '01/01/2011 00:00:00', + '01/02/2011 00:00:00', + '01/03/2011 00:00:00', + ])) + + self.assert_numpy_array_equal( + pd.to_datetime(test_series, infer_datetime_format=False), + pd.to_datetime(test_series, infer_datetime_format=True) + ) + + +class TestGuessDatetimeFormat(tm.TestCase): + def test_guess_datetime_format_with_parseable_formats(self): + dt_string_to_format = ( + ('20111230', '%Y%m%d'), + ('2011-12-30', '%Y-%m-%d'), + ('30-12-2011', '%d-%m-%Y'), + ('2011-12-30 00:00:00', '%Y-%m-%d %H:%M:%S'), + ('2011-12-30T00:00:00', '%Y-%m-%dT%H:%M:%S'), + ('2011-12-30 00:00:00.000000', '%Y-%m-%d %H:%M:%S.%f'), + ) + + for dt_string, dt_format in dt_string_to_format: + self.assertEqual( + tools._guess_datetime_format(dt_string), + dt_format + ) + + def test_guess_datetime_format_with_dayfirst(self): + ambiguous_string = '01/01/2011' + self.assertEqual( + tools._guess_datetime_format(ambiguous_string, dayfirst=True), + '%d/%m/%Y' + ) + self.assertEqual( + tools._guess_datetime_format(ambiguous_string, dayfirst=False), + '%m/%d/%Y' + ) + + def test_guess_datetime_format_with_locale_specific_formats(self): + # The month names will vary depending on the locale, in which + # case these wont be parsed properly (dateutil can't parse them) + _skip_if_has_locale() + + dt_string_to_format = ( + ('30/Dec/2011', '%d/%b/%Y'), + ('30/December/2011', '%d/%B/%Y'), + ('30/Dec/2011 00:00:00', '%d/%b/%Y %H:%M:%S'), + ) + + for dt_string, dt_format in dt_string_to_format: + self.assertEqual( + tools._guess_datetime_format(dt_string), + dt_format + ) + + def test_guess_datetime_format_invalid_inputs(self): + # A datetime string must include a year, month and a day for it + # to be guessable, in addition to being a string that looks like + # a datetime + invalid_dts = [ + '2013', + '01/2013', + '12:00:00', + '1/1/1/1', + 'this_is_not_a_datetime', + '51a', + 9, + datetime(2011, 1, 1), + ] + + for invalid_dt in invalid_dts: + 
self.assertTrue(tools._guess_datetime_format(invalid_dt) is None) + + def test_guess_datetime_format_for_array(self): + expected_format = '%Y-%m-%d %H:%M:%S.%f' + dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) + + test_arrays = [ + np.array([dt_string, dt_string, dt_string], dtype='O'), + np.array([np.nan, np.nan, dt_string], dtype='O'), + np.array([dt_string, 'random_string'], dtype='O'), + ] + + for test_array in test_arrays: + self.assertEqual( + tools._guess_datetime_format_for_array(test_array), + expected_format + ) + + format_for_string_of_nans = tools._guess_datetime_format_for_array( + np.array([np.nan, np.nan, np.nan], dtype='O') + ) + self.assertTrue(format_for_string_of_nans is None) + + +class TestTimestampToJulianDate(tm.TestCase): + + def test_compare_1700(self): + r = Timestamp('1700-06-23').to_julian_date() + self.assertEqual(r, 2342145.5) + + def test_compare_2000(self): + r = Timestamp('2000-04-12').to_julian_date() + self.assertEqual(r, 2451646.5) + + def test_compare_2100(self): + r = Timestamp('2100-08-12').to_julian_date() + self.assertEqual(r, 2488292.5) + + def test_compare_hour01(self): + r = Timestamp('2000-08-12T01:00:00').to_julian_date() + self.assertEqual(r, 2451768.5416666666666666) + + def test_compare_hour13(self): + r = Timestamp('2000-08-12T13:00:00').to_julian_date() + self.assertEqual(r, 2451769.0416666666666666) + + +class TestDateTimeIndexToJulianDate(tm.TestCase): + def test_1700(self): + r1 = Float64Index([2345897.5, + 2345898.5, + 2345899.5, + 2345900.5, + 2345901.5]) + r2 = date_range(start=Timestamp('1710-10-01'), + periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_2000(self): + r1 = Float64Index([2451601.5, + 2451602.5, + 2451603.5, + 2451604.5, + 2451605.5]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_hour(self): + r1 = Float64Index([2451601.5, + 2451601.5416666666666666, + 2451601.5833333333333333, + 2451601.625, + 2451601.6666666666666666]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='H').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_minute(self): + r1 = Float64Index([2451601.5, + 2451601.5006944444444444, + 2451601.5013888888888888, + 2451601.5020833333333333, + 2451601.5027777777777777]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='T').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_second(self): + r1 = Float64Index([2451601.5, + 2451601.500011574074074, + 2451601.5000231481481481, + 2451601.5000347222222222, + 2451601.5000462962962962]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='S').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_timeseries_legacy.py b/pandas/tseries/tests/test_timeseries_legacy.py new file mode 100644 index 00000000..1f811af0 --- /dev/null +++ b/pandas/tseries/tests/test_timeseries_legacy.py @@ -0,0 +1,238 @@ +# pylint: disable-msg=E1101,W0612 +from datetime import datetime, time, timedelta +import sys +import os + +import nose + +import numpy as np +randn = np.random.randn + +from pandas import (Index, 
Series, TimeSeries, DataFrame, + isnull, date_range, Timestamp, DatetimeIndex, + Int64Index, to_datetime, bdate_range) + +import pandas.core.datetools as datetools +import pandas.tseries.offsets as offsets +import pandas as pd + +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +from pandas.compat import( + range, long, StringIO, lrange, lmap, map, zip, cPickle as pickle, product +) +from pandas import read_pickle +from numpy.random import rand +import pandas.compat as compat +from pandas.core.datetools import BDay + + +# infortunately, too much has changed to handle these legacy pickles +# class TestLegacySupport(unittest.TestCase): +class LegacySupport(object): + + _multiprocess_can_split_ = True + + @classmethod + def setUpClass(cls): + if compat.PY3: + raise nose.SkipTest("not compatible with Python >= 3") + + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'frame.pickle') + + with open(filepath, 'rb') as f: + cls.frame = pickle.load(f) + + filepath = os.path.join(pth, 'data', 'series.pickle') + with open(filepath, 'rb') as f: + cls.series = pickle.load(f) + + def test_pass_offset_warn(self): + buf = StringIO() + + sys.stderr = buf + DatetimeIndex(start='1/1/2000', periods=10, offset='H') + sys.stderr = sys.__stderr__ + + def test_unpickle_legacy_frame(self): + dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', + freq=BDay(1)) + + unpickled = self.frame + + self.assertEqual(type(unpickled.index), DatetimeIndex) + self.assertEqual(len(unpickled), 10) + self.assertTrue((unpickled.columns == Int64Index(np.arange(5))).all()) + self.assertTrue((unpickled.index == dtindex).all()) + self.assertEqual(unpickled.index.offset, BDay(1, normalize=True)) + + def test_unpickle_legacy_series(self): + from pandas.core.datetools import BDay + + unpickled = self.series + + dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', + freq=BDay(1)) + + self.assertEqual(type(unpickled.index), DatetimeIndex) + self.assertEqual(len(unpickled), 10) + self.assertTrue((unpickled.index == dtindex).all()) + self.assertEqual(unpickled.index.offset, BDay(1, normalize=True)) + + def test_unpickle_legacy_len0_daterange(self): + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'series_daterange0.pickle') + + result = pd.read_pickle(filepath) + + ex_index = DatetimeIndex([], freq='B') + + self.assertTrue(result.index.equals(ex_index)) + tm.assert_isinstance(result.index.freq, offsets.BDay) + self.assertEqual(len(result), 0) + + def test_arithmetic_interaction(self): + index = self.frame.index + obj_index = index.asobject + + dseries = Series(rand(len(index)), index=index) + oseries = Series(dseries.values, index=obj_index) + + result = dseries + oseries + expected = dseries * 2 + tm.assert_isinstance(result.index, DatetimeIndex) + assert_series_equal(result, expected) + + result = dseries + oseries[:5] + expected = dseries + dseries[:5] + tm.assert_isinstance(result.index, DatetimeIndex) + assert_series_equal(result, expected) + + def test_join_interaction(self): + index = self.frame.index + obj_index = index.asobject + + def _check_join(left, right, how='inner'): + ra, rb, rc = left.join(right, how=how, return_indexers=True) + ea, eb, ec = left.join(DatetimeIndex(right), how=how, + return_indexers=True) + + tm.assert_isinstance(ra, DatetimeIndex) + self.assertTrue(ra.equals(ea)) + + assert_almost_equal(rb, eb) + assert_almost_equal(rc, ec) + + _check_join(index[:15], 
obj_index[5:], how='inner') + _check_join(index[:15], obj_index[5:], how='outer') + _check_join(index[:15], obj_index[5:], how='right') + _check_join(index[:15], obj_index[5:], how='left') + + def test_join_nonunique(self): + idx1 = to_datetime(['2012-11-06 16:00:11.477563', + '2012-11-06 16:00:11.477563']) + idx2 = to_datetime(['2012-11-06 15:11:09.006507', + '2012-11-06 15:11:09.006507']) + rs = idx1.join(idx2, how='outer') + self.assertTrue(rs.is_monotonic) + + def test_unpickle_daterange(self): + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'daterange_073.pickle') + + rng = read_pickle(filepath) + tm.assert_isinstance(rng[0], datetime) + tm.assert_isinstance(rng.offset, offsets.BDay) + self.assertEqual(rng.values.dtype, object) + + def test_setops(self): + index = self.frame.index + obj_index = index.asobject + + result = index[:5].union(obj_index[5:]) + expected = index + tm.assert_isinstance(result, DatetimeIndex) + self.assertTrue(result.equals(expected)) + + result = index[:10].intersection(obj_index[5:]) + expected = index[5:10] + tm.assert_isinstance(result, DatetimeIndex) + self.assertTrue(result.equals(expected)) + + result = index[:10] - obj_index[5:] + expected = index[:5] + tm.assert_isinstance(result, DatetimeIndex) + self.assertTrue(result.equals(expected)) + + def test_index_conversion(self): + index = self.frame.index + obj_index = index.asobject + + conv = DatetimeIndex(obj_index) + self.assertTrue(conv.equals(index)) + + self.assertRaises(ValueError, DatetimeIndex, ['a', 'b', 'c', 'd']) + + def test_tolist(self): + rng = date_range('1/1/2000', periods=10) + + result = rng.tolist() + tm.assert_isinstance(result[0], Timestamp) + + def test_object_convert_fail(self): + idx = DatetimeIndex([NaT]) + self.assertRaises(ValueError, idx.astype, 'O') + + def test_setops_conversion_fail(self): + index = self.frame.index + + right = Index(['a', 'b', 'c', 'd']) + + result = index.union(right) + expected = Index(np.concatenate([index.asobject, right])) + self.assertTrue(result.equals(expected)) + + result = index.intersection(right) + expected = Index([]) + self.assertTrue(result.equals(expected)) + + def test_legacy_time_rules(self): + rules = [('WEEKDAY', 'B'), + ('EOM', 'BM'), + ('W@MON', 'W-MON'), ('W@TUE', 'W-TUE'), ('W@WED', 'W-WED'), + ('W@THU', 'W-THU'), ('W@FRI', 'W-FRI'), + ('Q@JAN', 'BQ-JAN'), ('Q@FEB', 'BQ-FEB'), ('Q@MAR', 'BQ-MAR'), + ('A@JAN', 'BA-JAN'), ('A@FEB', 'BA-FEB'), ('A@MAR', 'BA-MAR'), + ('A@APR', 'BA-APR'), ('A@MAY', 'BA-MAY'), ('A@JUN', 'BA-JUN'), + ('A@JUL', 'BA-JUL'), ('A@AUG', 'BA-AUG'), ('A@SEP', 'BA-SEP'), + ('A@OCT', 'BA-OCT'), ('A@NOV', 'BA-NOV'), ('A@DEC', 'BA-DEC'), + ('WOM@1FRI', 'WOM-1FRI'), ('WOM@2FRI', 'WOM-2FRI'), + ('WOM@3FRI', 'WOM-3FRI'), ('WOM@4FRI', 'WOM-4FRI')] + + start, end = '1/1/2000', '1/1/2010' + + for old_freq, new_freq in rules: + old_rng = date_range(start, end, freq=old_freq) + new_rng = date_range(start, end, freq=new_freq) + self.assertTrue(old_rng.equals(new_rng)) + + # test get_legacy_offset_name + offset = datetools.get_offset(new_freq) + old_name = datetools.get_legacy_offset_name(offset) + self.assertEqual(old_name, old_freq) + + def test_ms_vs_MS(self): + left = datetools.get_offset('ms') + right = datetools.get_offset('MS') + self.assertEqual(left, datetools.Milli()) + self.assertEqual(right, datetools.MonthBegin()) + + def test_rule_aliases(self): + rule = datetools.to_offset('10us') + self.assertEqual(rule, datetools.Micro(10)) + +if __name__ == '__main__': + 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py new file mode 100644 index 00000000..9c374716 --- /dev/null +++ b/pandas/tseries/tests/test_timezones.py @@ -0,0 +1,1140 @@ +# pylint: disable-msg=E1101,W0612 +from datetime import datetime, timedelta, tzinfo, date +import sys +import os +import nose + +import numpy as np +import pytz + +from pandas import (Index, Series, DataFrame, isnull, Timestamp) + +from pandas import DatetimeIndex, to_datetime, NaT +from pandas import tslib + +import pandas.core.datetools as datetools +import pandas.tseries.offsets as offsets +from pandas.tseries.index import bdate_range, date_range +import pandas.tseries.tools as tools +from pytz import NonExistentTimeError + +import pandas.util.testing as tm + +from pandas.util.testing import assert_frame_equal +from pandas.compat import lrange, zip + +from pandas import _np_version_under1p7 + + +try: + import pytz +except ImportError: + pass + +try: + import dateutil +except ImportError: + pass + + +class FixedOffset(tzinfo): + """Fixed offset in minutes east from UTC.""" + + def __init__(self, offset, name): + self.__offset = timedelta(minutes=offset) + self.__name = name + + def utcoffset(self, dt): + return self.__offset + + def tzname(self, dt): + return self.__name + + def dst(self, dt): + return timedelta(0) + +fixed_off = FixedOffset(-420, '-07:00') +fixed_off_no_name = FixedOffset(-330, None) + + +class TestTimeZoneSupportPytz(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + tm._skip_if_no_pytz() + + def tz(self, tz): + ''' Construct a timezone object from a string. Overridden in subclass to parameterize tests. ''' + return pytz.timezone(tz) + + def tzstr(self, tz): + ''' Construct a timezone string from a string. Overridden in subclass to parameterize tests. ''' + return tz + + def localize(self, tz, x): + return tz.localize(x) + + def cmptz(self, tz1, tz2): + ''' Compare two timezones. Overridden in subclass to parameterize tests. ''' + return tz1.zone == tz2.zone + + def test_utc_to_local_no_modify(self): + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) + + # Values are unmodified + self.assert_(np.array_equal(rng.asi8, rng_eastern.asi8)) + + self.assert_(self.cmptz(rng_eastern.tz, self.tz('US/Eastern'))) + + def test_utc_to_local_no_modify_explicit(self): + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(self.tz('US/Eastern')) + + # Values are unmodified + self.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) + + self.assertEqual(rng_eastern.tz, self.tz('US/Eastern')) + + + def test_localize_utc_conversion(self): + # Localizing to time zone should: + # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range('3/10/2012', '3/11/2012', freq='30T') + + converted = rng.tz_localize(self.tzstr('US/Eastern')) + expected_naive = rng + offsets.Hour(5) + self.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) + + # DST ambiguity, this should fail + rng = date_range('3/11/2012', '3/12/2012', freq='30T') + # Is this really how it should fail?? 
+ self.assertRaises(NonExistentTimeError, rng.tz_localize, self.tzstr('US/Eastern')) + + def test_localize_utc_conversion_explicit(self): + # Localizing to time zone should: + # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range('3/10/2012', '3/11/2012', freq='30T') + converted = rng.tz_localize(self.tz('US/Eastern')) + expected_naive = rng + offsets.Hour(5) + self.assert_(np.array_equal(converted.asi8, expected_naive.asi8)) + + # DST ambiguity, this should fail + rng = date_range('3/11/2012', '3/12/2012', freq='30T') + # Is this really how it should fail?? + self.assertRaises(NonExistentTimeError, rng.tz_localize, self.tz('US/Eastern')) + + def test_timestamp_tz_localize(self): + stamp = Timestamp('3/11/2012 04:00') + + result = stamp.tz_localize(self.tzstr('US/Eastern')) + expected = Timestamp('3/11/2012 04:00', tz=self.tzstr('US/Eastern')) + self.assertEqual(result.hour, expected.hour) + self.assertEqual(result, expected) + + def test_timestamp_tz_localize_explicit(self): + stamp = Timestamp('3/11/2012 04:00') + + result = stamp.tz_localize(self.tz('US/Eastern')) + expected = Timestamp('3/11/2012 04:00', tz=self.tz('US/Eastern')) + self.assertEqual(result.hour, expected.hour) + self.assertEqual(result, expected) + + def test_timestamp_constructed_by_date_and_tz(self): + # Fix Issue 2993, Timestamp cannot be constructed by datetime.date + # and tz correctly + + result = Timestamp(date(2012, 3, 11), tz=self.tzstr('US/Eastern')) + + expected = Timestamp('3/11/2012', tz=self.tzstr('US/Eastern')) + self.assertEqual(result.hour, expected.hour) + self.assertEqual(result, expected) + + def test_timestamp_constructed_by_date_and_tz_explicit(self): + # Fix Issue 2993, Timestamp cannot be constructed by datetime.date + # and tz correctly + + result = Timestamp(date(2012, 3, 11), tz=self.tz('US/Eastern')) + + expected = Timestamp('3/11/2012', tz=self.tz('US/Eastern')) + self.assertEquals(result.hour, expected.hour) + self.assertEquals(result, expected) + + def test_timestamp_to_datetime_tzoffset(self): + # tzoffset + from dateutil.tz import tzoffset + tzinfo = tzoffset(None, 7200) + expected = Timestamp('3/11/2012 04:00', tz=tzinfo) + result = Timestamp(expected.to_datetime()) + self.assertEqual(expected, result) + + def test_timedelta_push_over_dst_boundary(self): + # #1389 + + # 4 hours before DST transition + stamp = Timestamp('3/10/2012 22:00', tz=self.tzstr('US/Eastern')) + + result = stamp + timedelta(hours=6) + + # spring forward, + "7" hours + expected = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) + + self.assertEquals(result, expected) + + def test_timedelta_push_over_dst_boundary_explicit(self): + # #1389 + + # 4 hours before DST transition + stamp = Timestamp('3/10/2012 22:00', tz=self.tz('US/Eastern')) + + result = stamp + timedelta(hours=6) + + # spring forward, + "7" hours + expected = Timestamp('3/11/2012 05:00', tz=self.tz('US/Eastern')) + + self.assertEqual(result, expected) + + def test_tz_localize_dti(self): + from pandas.tseries.offsets import Hour + + dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', + freq='L') + dti2 = dti.tz_localize(self.tzstr('US/Eastern')) + + dti_utc = DatetimeIndex(start='1/1/2005 05:00', + end='1/1/2005 5:00:30.256', freq='L', + tz='utc') + + self.assert_numpy_array_equal(dti2.values, dti_utc.values) + + dti3 = dti2.tz_convert(self.tzstr('US/Pacific')) + self.assert_numpy_array_equal(dti3.values, dti_utc.values) + + dti = DatetimeIndex(start='11/6/2011 1:59', + end='11/6/2011 2:00', freq='L') + 
self.assertRaises(pytz.AmbiguousTimeError, dti.tz_localize, + self.tzstr('US/Eastern')) + + dti = DatetimeIndex(start='3/13/2011 1:59', end='3/13/2011 2:00', + freq='L') + self.assertRaises( + pytz.NonExistentTimeError, dti.tz_localize, self.tzstr('US/Eastern')) + + def test_tz_localize_empty_series(self): + # #2248 + + ts = Series() + + ts2 = ts.tz_localize('utc') + self.assertTrue(ts2.index.tz == pytz.utc) + + ts2 = ts.tz_localize(self.tzstr('US/Eastern')) + self.assertTrue(self.cmptz(ts2.index.tz, self.tz('US/Eastern'))) + + def test_astimezone(self): + utc = Timestamp('3/11/2012 22:00', tz='UTC') + expected = utc.tz_convert(self.tzstr('US/Eastern')) + result = utc.astimezone(self.tzstr('US/Eastern')) + self.assertEqual(expected, result) + tm.assert_isinstance(result, Timestamp) + + def test_create_with_tz(self): + stamp = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) + self.assertEqual(stamp.hour, 5) + + rng = date_range( + '3/11/2012 04:00', periods=10, freq='H', tz=self.tzstr('US/Eastern')) + + self.assertEqual(stamp, rng[1]) + + utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') + self.assertIs(utc_stamp.tzinfo, pytz.utc) + self.assertEqual(utc_stamp.hour, 5) + + stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') + self.assertEqual(utc_stamp.hour, 5) + + def test_create_with_fixed_tz(self): + off = FixedOffset(420, '+07:00') + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + self.assertEqual(off, rng.tz) + + rng2 = date_range(start, periods=len(rng), tz=off) + self.assertTrue(rng.equals(rng2)) + + rng3 = date_range( + '3/11/2012 05:00:00+07:00', '6/11/2012 05:00:00+07:00') + self.assertTrue((rng.values == rng3.values).all()) + + def test_create_with_fixedoffset_noname(self): + off = fixed_off_no_name + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + self.assertEqual(off, rng.tz) + + idx = Index([start, end]) + self.assertEqual(off, idx.tz) + + def test_date_range_localize(self): + rng = date_range( + '3/11/2012 03:00', periods=15, freq='H', tz='US/Eastern') + rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], + tz='US/Eastern') + rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') + rng3 = rng3.tz_localize('US/Eastern') + + self.assertTrue(rng.equals(rng3)) + + # DST transition time + val = rng[0] + exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') + + self.assertEqual(val.hour, 3) + self.assertEqual(exp.hour, 3) + self.assertEqual(val, exp) # same UTC value + self.assertTrue(rng[:2].equals(rng2)) + + # Right before the DST transition + rng = date_range( + '3/11/2012 00:00', periods=2, freq='H', tz='US/Eastern') + rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], + tz='US/Eastern') + self.assertTrue(rng.equals(rng2)) + exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') + self.assertEqual(exp.hour, 0) + self.assertEqual(rng[0], exp) + exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') + self.assertEqual(exp.hour, 1) + self.assertEqual(rng[1], exp) + + rng = date_range('3/11/2012 00:00', periods=10, freq='H', + tz='US/Eastern') + self.assertEqual(rng[2].hour, 3) + + def test_utc_box_timestamp_and_localize(self): + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) + + tz = self.tz('US/Eastern') + expected = rng[-1].astimezone(tz) + + stamp = rng_eastern[-1] + 
self.assertEqual(stamp, expected) + self.assertEqual(stamp.tzinfo, expected.tzinfo) + + # right tzinfo + rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) + # test not valid for dateutil timezones. + # self.assertIn('EDT', repr(rng_eastern[0].tzinfo)) + self.assert_('EDT' in repr(rng_eastern[0].tzinfo) or 'tzfile' in repr(rng_eastern[0].tzinfo)) + + def test_timestamp_tz_convert(self): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + idx = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) + + conv = idx[0].tz_convert(self.tzstr('US/Pacific')) + expected = idx.tz_convert(self.tzstr('US/Pacific'))[0] + + self.assertEqual(conv, expected) + + def test_pass_dates_localize_to_utc(self): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + + idx = DatetimeIndex(strdates) + conv = idx.tz_localize(self.tzstr('US/Eastern')) + + fromdates = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) + + self.assertEqual(conv.tz, fromdates.tz) + self.assert_numpy_array_equal(conv.values, fromdates.values) + + def test_field_access_localize(self): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + rng = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) + self.assertTrue((rng.hour == 0).all()) + + # a more unusual time zone, #1946 + dr = date_range('2011-10-02 00:00', freq='h', periods=10, + tz=self.tzstr('America/Atikokan')) + + expected = np.arange(10) + self.assert_numpy_array_equal(dr.hour, expected) + + def test_with_tz(self): + tz = self.tz('US/Central') + + # just want it to work + start = datetime(2011, 3, 12, tzinfo=pytz.utc) + dr = bdate_range(start, periods=50, freq=datetools.Hour()) + self.assertIs(dr.tz, pytz.utc) + + # DateRange with naive datetimes + dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) + dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) + + # normalized + central = dr.tz_convert(tz) + self.assertIs(central.tz, tz) + comp = self.localize(tz, central[0].to_pydatetime().replace(tzinfo=None)).tzinfo + self.assertIs(central[0].tz, comp) + + # compare vs a localized tz + comp = self.localize(tz, dr[0].to_pydatetime().replace(tzinfo=None)).tzinfo + self.assertIs(central[0].tz, comp) + + # datetimes with tzinfo set + dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), + '1/1/2009', tz=pytz.utc) + + self.assertRaises(Exception, bdate_range, + datetime(2005, 1, 1, tzinfo=pytz.utc), + '1/1/2009', tz=tz) + + def test_tz_localize(self): + dr = bdate_range('1/1/2009', '1/1/2010') + dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) + localized = dr.tz_localize(pytz.utc) + self.assert_numpy_array_equal(dr_utc, localized) + + def test_with_tz_ambiguous_times(self): + tz = self.tz('US/Eastern') + + # March 13, 2011, spring forward, skip from 2 AM to 3 AM + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, + freq=datetools.Hour()) + self.assertRaises(pytz.NonExistentTimeError, dr.tz_localize, tz) + + # after dst transition, it works + dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, + freq=datetools.Hour(), tz=tz) + + # November 6, 2011, fall back, repeat 2 AM hour + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, + freq=datetools.Hour()) + self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz) + + # UTC is OK + dr = date_range(datetime(2011, 3, 13), periods=48, + freq=datetools.Minute(30), tz=pytz.utc) + + def test_infer_dst(self): + # November 6, 2011, fall back, repeat 2 AM hour + # With no repeated hours, we cannot infer the transition + tz = self.tz('US/Eastern') + dr = 
date_range(datetime(2011, 11, 6, 0), periods=5, + freq=datetools.Hour()) + self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, + tz, infer_dst=True) + + # With repeated hours, we can infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=datetools.Hour(), tz=tz) + di = DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', + '11/06/2011 01:00', '11/06/2011 02:00', + '11/06/2011 03:00']) + localized = di.tz_localize(tz, infer_dst=True) + self.assert_numpy_array_equal(dr, localized) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, + freq=datetools.Hour()) + localized = dr.tz_localize(tz) + localized_infer = dr.tz_localize(tz, infer_dst=True) + self.assert_numpy_array_equal(localized, localized_infer) + + + # test utility methods + def test_infer_tz(self): + eastern = self.tz('US/Eastern') + utc = pytz.utc + + _start = datetime(2001, 1, 1) + _end = datetime(2009, 1, 1) + + start = self.localize(eastern, _start) + end = self.localize(eastern, _end) + assert(tools._infer_tzinfo(start, end) is self.localize(eastern, _start).tzinfo) + assert(tools._infer_tzinfo(start, None) is self.localize(eastern, _start).tzinfo) + assert(tools._infer_tzinfo(None, end) is self.localize(eastern, _end).tzinfo) + + start = utc.localize(_start) + end = utc.localize(_end) + assert(tools._infer_tzinfo(start, end) is utc) + + end = self.localize(eastern, _end) + self.assertRaises(Exception, tools._infer_tzinfo, start, end) + self.assertRaises(Exception, tools._infer_tzinfo, end, start) + + def test_tz_string(self): + result = date_range('1/1/2000', periods=10, tz=self.tzstr('US/Eastern')) + expected = date_range('1/1/2000', periods=10, + tz=self.tz('US/Eastern')) + + self.assertTrue(result.equals(expected)) + + def test_take_dont_lose_meta(self): + tm._skip_if_no_pytz() + rng = date_range('1/1/2000', periods=20, tz=self.tzstr('US/Eastern')) + + result = rng.take(lrange(5)) + self.assertEqual(result.tz, rng.tz) + self.assertEqual(result.freq, rng.freq) + + def test_index_with_timezone_repr(self): + rng = date_range('4/13/2010', '5/6/2010') + + rng_eastern = rng.tz_localize(self.tzstr('US/Eastern')) + + rng_repr = repr(rng_eastern) + self.assertIn('2010-04-13 00:00:00', rng_repr) + + def test_index_astype_asobject_tzinfos(self): + # #1345 + + # dates around a dst transition + rng = date_range('2/13/2010', '5/6/2010', tz=self.tzstr('US/Eastern')) + + objs = rng.asobject + for i, x in enumerate(objs): + exval = rng[i] + self.assertEqual(x, exval) + self.assertEqual(x.tzinfo, exval.tzinfo) + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + self.assertEqual(x, exval) + self.assertEqual(x.tzinfo, exval.tzinfo) + + def test_localized_at_time_between_time(self): + from datetime import time + + rng = date_range('4/16/2012', '5/1/2012', freq='H') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_local = ts.tz_localize(self.tzstr('US/Eastern')) + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize(self.tzstr('US/Eastern')) + tm.assert_series_equal(result, expected) + self.assertTrue(self.cmptz(result.index.tz, self.tz('US/Eastern'))) + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize(self.tzstr('US/Eastern')) + tm.assert_series_equal(result, expected) + self.assertTrue(self.cmptz(result.index.tz, self.tz('US/Eastern'))) + + def test_string_index_alias_tz_aware(self): + rng = 
date_range('1/1/2000', periods=10, tz=self.tzstr('US/Eastern')) + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts['1/3/2000'] + self.assertAlmostEqual(result, ts[2]) + + def test_fixed_offset(self): + dates = [datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off)] + result = to_datetime(dates) + self.assertEqual(result.tz, fixed_off) + + def test_fixedtz_topydatetime(self): + dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off)]) + result = to_datetime(dates).to_pydatetime() + self.assert_numpy_array_equal(dates, result) + result = to_datetime(dates)._mpl_repr() + self.assert_numpy_array_equal(dates, result) + + def test_convert_tz_aware_datetime_datetime(self): + # #1581 + + tz = self.tz('US/Eastern') + + dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)] + + dates_aware = [self.localize(tz, x) for x in dates] + result = to_datetime(dates_aware) + self.assertTrue(self.cmptz(result.tz, self.tz('US/Eastern'))) + + converted = to_datetime(dates_aware, utc=True) + ex_vals = [Timestamp(x).value for x in dates_aware] + self.assert_numpy_array_equal(converted.asi8, ex_vals) + self.assertIs(converted.tz, pytz.utc) + + def test_to_datetime_utc(self): + from dateutil.parser import parse + arr = np.array([parse('2012-06-13T01:39:00Z')], dtype=object) + + result = to_datetime(arr, utc=True) + self.assertIs(result.tz, pytz.utc) + + def test_to_datetime_tzlocal(self): + from dateutil.parser import parse + from dateutil.tz import tzlocal + dt = parse('2012-06-13T01:39:00Z') + dt = dt.replace(tzinfo=tzlocal()) + + arr = np.array([dt], dtype=object) + + result = to_datetime(arr, utc=True) + self.assertIs(result.tz, pytz.utc) + + rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) + arr = rng.to_pydatetime() + result = to_datetime(arr, utc=True) + self.assertIs(result.tz, pytz.utc) + + def test_frame_no_datetime64_dtype(self): + + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr_tz = dr.tz_localize(self.tzstr('US/Eastern')) + e = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) + self.assertEqual(e['B'].dtype, 'M8[ns]') + + # GH 2810 (with timezones) + datetimes_naive = [ ts.to_pydatetime() for ts in dr ] + datetimes_with_tz = [ ts.to_pydatetime() for ts in dr_tz ] + df = DataFrame({'dr' : dr, 'dr_tz' : dr_tz, + 'datetimes_naive': datetimes_naive, + 'datetimes_with_tz' : datetimes_with_tz }) + result = df.get_dtype_counts() + expected = Series({ 'datetime64[ns]' : 3, 'object' : 1 }) + tm.assert_series_equal(result, expected) + + def test_hongkong_tz_convert(self): + # #1673 + dr = date_range( + '2012-01-01', '2012-01-10', freq='D', tz='Hongkong') + + # it works! + dr.hour + + def test_tz_convert_unsorted(self): + dr = date_range('2012-03-09', freq='H', periods=100, tz='utc') + dr = dr.tz_convert(self.tzstr('US/Eastern')) + + result = dr[::-1].hour + exp = dr.hour[::-1] + tm.assert_almost_equal(result, exp) + + def test_shift_localized(self): + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr_tz = dr.tz_localize(self.tzstr('US/Eastern')) + + result = dr_tz.shift(1, '10T') + self.assertEqual(result.tz, dr_tz.tz) + + def test_tz_aware_asfreq(self): + dr = date_range( + '2011-12-01', '2012-07-20', freq='D', tz=self.tzstr('US/Eastern')) + + s = Series(np.random.randn(len(dr)), index=dr) + + # it works! + s.asfreq('T') + + def test_static_tzinfo(self): + # it works! 
+ index = DatetimeIndex([datetime(2012, 1, 1)], tz=self.tzstr('EST')) + index.hour + index[0] + + def test_tzaware_datetime_to_index(self): + d = [datetime(2012, 8, 19, tzinfo=self.tz('US/Eastern'))] + + index = DatetimeIndex(d) + self.assertTrue(self.cmptz(index.tz, self.tz('US/Eastern'))) + + def test_date_range_span_dst_transition(self): + # #1778 + + # Standard -> Daylight Savings Time + dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI', + tz='US/Eastern') + + self.assertTrue((dr.hour == 0).all()) + + dr = date_range('2012-11-02', periods=10, tz=self.tzstr('US/Eastern')) + self.assertTrue((dr.hour == 0).all()) + + def test_convert_datetime_list(self): + dr = date_range('2012-06-02', periods=10, tz=self.tzstr('US/Eastern')) + + dr2 = DatetimeIndex(list(dr), name='foo') + self.assertTrue(dr.equals(dr2)) + self.assertEqual(dr.tz, dr2.tz) + self.assertEqual(dr2.name, 'foo') + + def test_frame_from_records_utc(self): + rec = {'datum': 1.5, + 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} + + # it works + DataFrame.from_records([rec], index='begin_time') + + def test_frame_reset_index(self): + dr = date_range('2012-06-02', periods=10, tz=self.tzstr('US/Eastern')) + df = DataFrame(np.random.randn(len(dr)), dr) + roundtripped = df.reset_index().set_index('index') + xp = df.index.tz + rs = roundtripped.index.tz + self.assertEqual(xp, rs) + + def test_dateutil_tzoffset_support(self): + from dateutil.tz import tzoffset + values = [188.5, 328.25] + tzinfo = tzoffset(None, 7200) + index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), + datetime(2012, 5, 11, 12, tzinfo=tzinfo)] + series = Series(data=values, index=index) + + self.assertEqual(series.index.tz, tzinfo) + + # it works! #2443 + repr(series.index[0]) + + def test_getitem_pydatetime_tz(self): + index = date_range(start='2012-12-24 16:00', + end='2012-12-24 18:00', freq='H', + tz=self.tzstr('Europe/Berlin')) + ts = Series(index=index, data=index.hour) + time_pandas = Timestamp('2012-12-24 17:00', tz=self.tzstr('Europe/Berlin')) + time_datetime = self.localize(self.tz('Europe/Berlin'), datetime(2012, 12, 24, 17, 0)) + self.assertEqual(ts[time_pandas], ts[time_datetime]) + + def test_index_drop_dont_lose_tz(self): + # #2621 + ind = date_range("2012-12-01", periods=10, tz="utc") + ind = ind.drop(ind[-1]) + + self.assertTrue(ind.tz is not None) + + def test_datetimeindex_tz(self): + """ Test different DatetimeIndex constructions with timezone + Follow-up of #4229 + """ + + arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00'] + + idx1 = to_datetime(arr).tz_localize(self.tzstr('US/Eastern')) + idx2 = DatetimeIndex(start="2005-11-10 08:00:00", freq='H', periods=2, tz=self.tzstr('US/Eastern')) + idx3 = DatetimeIndex(arr, tz=self.tzstr('US/Eastern')) + idx4 = DatetimeIndex(np.array(arr), tz=self.tzstr('US/Eastern')) + + for other in [idx2, idx3, idx4]: + self.assertTrue(idx1.equals(other)) + + def test_datetimeindex_tz_nat(self): + idx = to_datetime([Timestamp("2013-1-1", tz=self.tzstr('US/Eastern')), NaT]) + + self.assertTrue(isnull(idx[1])) + self.assertTrue(idx[0].tzinfo is not None) + + +class TestTimeZoneSupportDateutil(TestTimeZoneSupportPytz): + _multiprocess_can_split_ = True + + def setUp(self): + tm._skip_if_no_dateutil() + + def tz(self, tz): + ''' + Construct a dateutil timezone. + Use tslib.maybe_get_tz so that we get the filename on the tz right + on windows. See #7337. + ''' + return tslib.maybe_get_tz('dateutil/' + tz) + + def tzstr(self, tz): + ''' Construct a timezone string from a string. 
Overridden in subclass to parameterize tests. ''' + return 'dateutil/' + tz + + def cmptz(self, tz1, tz2): + ''' Compare two timezones. Overridden in subclass to parameterize tests. ''' + return tz1 == tz2 + + def localize(self, tz, x): + return x.replace(tzinfo=tz) + + def test_utc_with_system_utc(self): + if sys.platform == 'win32': + raise nose.SkipTest('Skipped on win32 due to dateutil bug.') + + from pandas.tslib import maybe_get_tz + + # from system utc to real utc + ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + # check that the time hasn't changed. + self.assertEqual(ts, ts.tz_convert(dateutil.tz.tzutc())) + + # from system utc to real utc + ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + # check that the time hasn't changed. + self.assertEqual(ts, ts.tz_convert(dateutil.tz.tzutc())) + + +class TestTimeZoneCacheKey(tm.TestCase): + def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): + tzs = pytz.common_timezones + for tz_name in tzs: + if tz_name == 'UTC': + # skip utc as it's a special case in dateutil + continue + tz_p = tslib.maybe_get_tz(tz_name) + tz_d = tslib.maybe_get_tz('dateutil/' + tz_name) + if tz_d is None: + # skip timezones that dateutil doesn't know about. + continue + self.assertNotEqual(tslib._p_tz_cache_key(tz_p), tslib._p_tz_cache_key(tz_d)) + + +class TestTimeZones(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + tm._skip_if_no_pytz() + + def test_index_equals_with_tz(self): + left = date_range('1/1/2011', periods=100, freq='H', tz='utc') + right = date_range('1/1/2011', periods=100, freq='H', + tz='US/Eastern') + + self.assertFalse(left.equals(right)) + + def test_tz_localize_naive(self): + rng = date_range('1/1/2011', periods=100, freq='H') + + conv = rng.tz_localize('US/Pacific') + exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') + + self.assertTrue(conv.equals(exp)) + + def test_series_frame_tz_localize(self): + + rng = date_range('1/1/2011', periods=100, freq='H') + ts = Series(1, index=rng) + + result = ts.tz_localize('utc') + self.assertEqual(result.index.tz.zone, 'UTC') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_localize('utc') + expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) + self.assertEqual(result.index.tz.zone, 'UTC') + assert_frame_equal(result, expected) + + df = df.T + result = df.tz_localize('utc', axis=1) + self.assertEqual(result.columns.tz.zone, 'UTC') + assert_frame_equal(result, expected.T) + + # Can't localize if already tz-aware + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + ts = Series(1, index=rng) + tm.assertRaisesRegexp(TypeError, 'Already tz-aware', ts.tz_localize, 'US/Eastern') + + def test_series_frame_tz_convert(self): + rng = date_range('1/1/2011', periods=200, freq='D', + tz='US/Eastern') + ts = Series(1, index=rng) + + result = ts.tz_convert('Europe/Berlin') + self.assertEqual(result.index.tz.zone, 'Europe/Berlin') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_convert('Europe/Berlin') + expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin')) + self.assertEqual(result.index.tz.zone, 'Europe/Berlin') + assert_frame_equal(result, expected) + + df = df.T + result = df.tz_convert('Europe/Berlin', axis=1) + self.assertEqual(result.columns.tz.zone, 'Europe/Berlin') + assert_frame_equal(result, expected.T) + + # can't convert tz-naive + rng = date_range('1/1/2011', periods=200, freq='D') + ts = Series(1, index=rng) + tm.assertRaisesRegexp(TypeError, "Cannot convert tz-naive", 
ts.tz_convert, 'US/Eastern') + + def test_join_utc_convert(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng.tz_convert('US/Eastern') + right = rng.tz_convert('Europe/Berlin') + + for how in ['inner', 'outer', 'left', 'right']: + result = left.join(left[:-5], how=how) + tm.assert_isinstance(result, DatetimeIndex) + self.assertEqual(result.tz, left.tz) + + result = left.join(right[:-5], how=how) + tm.assert_isinstance(result, DatetimeIndex) + self.assertEqual(result.tz.zone, 'UTC') + + def test_join_aware(self): + rng = date_range('1/1/2011', periods=10, freq='H') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_utc = ts.tz_localize('utc') + + self.assertRaises(Exception, ts.__add__, ts_utc) + self.assertRaises(Exception, ts_utc.__add__, ts) + + test1 = DataFrame(np.zeros((6, 3)), + index=date_range("2012-11-15 00:00:00", periods=6, + freq="100L", tz="US/Central")) + test2 = DataFrame(np.zeros((3, 3)), + index=date_range("2012-11-15 00:00:00", periods=3, + freq="250L", tz="US/Central"), + columns=lrange(3, 6)) + + result = test1.join(test2, how='outer') + ex_index = test1.index.union(test2.index) + + self.assertTrue(result.index.equals(ex_index)) + self.assertTrue(result.index.tz.zone == 'US/Central') + + # non-overlapping + rng = date_range("2012-11-15 00:00:00", periods=6, + freq="H", tz="US/Central") + + rng2 = date_range("2012-11-15 12:00:00", periods=6, + freq="H", tz="US/Eastern") + + result = rng.union(rng2) + self.assertTrue(result.tz.zone == 'UTC') + + def test_align_aware(self): + idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') + df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) + df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) + new1, new2 = df1.align(df2) + self.assertEqual(df1.index.tz, new1.index.tz) + self.assertEqual(df2.index.tz, new2.index.tz) + + def test_append_aware(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ts1 = Series(np.random.randn(len(rng1)), index=rng1) + ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ts1.append(ts2) + self.assertEqual(ts_result.index.tz, rng1.tz) + + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='UTC') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='UTC') + ts1 = Series(np.random.randn(len(rng1)), index=rng1) + ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ts1.append(ts2) + utc = rng1.tz + self.assertEqual(utc, ts_result.index.tz) + + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Central') + ts1 = Series(np.random.randn(len(rng1)), index=rng1) + ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ts1.append(ts2) + self.assertEqual(utc, ts_result.index.tz) + + def test_append_aware_naive(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ts1 = Series(np.random.randn(len(rng1)), index=rng1) + ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ts1.append(ts2) + self.assertTrue(ts_result.index.equals( + ts1.index.asobject.append(ts2.index.asobject))) + + # mixed + + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = lrange(100) + ts1 = Series(np.random.randn(len(rng1)), index=rng1) + ts2 
= Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ts1.append(ts2) + self.assertTrue(ts_result.index.equals( + ts1.index.asobject.append(ts2.index))) + + def test_equal_join_ensure_utc(self): + rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_moscow = ts.tz_convert('Europe/Moscow') + + result = ts + ts_moscow + self.assertIs(result.index.tz, pytz.utc) + + result = ts_moscow + ts + self.assertIs(result.index.tz, pytz.utc) + + df = DataFrame({'a': ts}) + df_moscow = df.tz_convert('Europe/Moscow') + result = df + df_moscow + self.assertIs(result.index.tz, pytz.utc) + + result = df_moscow + df + self.assertIs(result.index.tz, pytz.utc) + + def test_arith_utc_convert(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + perm = np.random.permutation(100)[:90] + ts1 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('US/Eastern')) + + perm = np.random.permutation(100)[:90] + ts2 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('Europe/Berlin')) + + result = ts1 + ts2 + + uts1 = ts1.tz_convert('utc') + uts2 = ts2.tz_convert('utc') + expected = uts1 + uts2 + + self.assertEqual(result.index.tz, pytz.UTC) + tm.assert_series_equal(result, expected) + + def test_intersection(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng[10:90][::-1] + right = rng[20:80][::-1] + + self.assertEqual(left.tz, rng.tz) + result = left.intersection(right) + self.assertEqual(result.tz, left.tz) + + def test_timestamp_equality_different_timezones(self): + utc_range = date_range('1/1/2000', periods=20, tz='UTC') + + eastern_range = utc_range.tz_convert('US/Eastern') + berlin_range = utc_range.tz_convert('Europe/Berlin') + + for a, b, c in zip(utc_range, eastern_range, berlin_range): + self.assertEqual(a, b) + self.assertEqual(b, c) + self.assertEqual(a, c) + + self.assertTrue((utc_range == eastern_range).all()) + self.assertTrue((utc_range == berlin_range).all()) + self.assertTrue((berlin_range == eastern_range).all()) + + def test_datetimeindex_tz(self): + rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', + tz='US/Eastern') + rng2 = DatetimeIndex(data=rng, tz='US/Eastern') + self.assertTrue(rng.equals(rng2)) + + def test_normalize_tz(self): + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz='US/Eastern') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz='US/Eastern') + self.assertTrue(result.equals(expected)) + + self.assertTrue(result.is_normalized) + self.assertFalse(rng.is_normalized) + + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz='UTC') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz='UTC') + self.assertTrue(result.equals(expected)) + + self.assertTrue(result.is_normalized) + self.assertFalse(rng.is_normalized) + + from dateutil.tz import tzlocal + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz=tzlocal()) + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz=tzlocal()) + self.assertTrue(result.equals(expected)) + + self.assertTrue(result.is_normalized) + self.assertFalse(rng.is_normalized) + + def test_tzaware_offset(self): + dates = date_range('2012-11-01', periods=3, tz='US/Pacific') + offset = dates + offsets.Hour(5) + self.assertEqual(dates[0] + offsets.Hour(5), offset[0]) + + # GH 6818 + for tz in ['UTC', 'US/Pacific', 'Asia/Tokyo']: + dates = date_range('2010-11-01 
00:00', periods=3, tz=tz, freq='H') + expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', + '2010-11-01 07:00'], freq='H', tz=tz) + + offset = dates + offsets.Hour(5) + self.assertTrue(offset.equals(expected)) + if not _np_version_under1p7: + offset = dates + np.timedelta64(5, 'h') + self.assertTrue(offset.equals(expected)) + offset = dates + timedelta(hours=5) + self.assertTrue(offset.equals(expected)) + + def test_nat(self): + # GH 5546 + dates = [NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize('US/Pacific') + self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Pacific'))) + idx = idx.tz_convert('US/Eastern') + self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Eastern'))) + idx = idx.tz_convert('UTC') + self.assertTrue(idx.equals(DatetimeIndex(dates, tz='UTC'))) + + dates = ['2010-12-01 00:00', '2010-12-02 00:00', NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize('US/Pacific') + self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Pacific'))) + idx = idx.tz_convert('US/Eastern') + expected = ['2010-12-01 03:00', '2010-12-02 03:00', NaT] + self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + + idx = idx + offsets.Hour(5) + expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] + self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + idx = idx.tz_convert('US/Pacific') + expected = ['2010-12-01 05:00', '2010-12-02 05:00', NaT] + self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Pacific'))) + + if not _np_version_under1p7: + idx = idx + np.timedelta64(3, 'h') + expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] + self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Pacific'))) + + idx = idx.tz_convert('US/Eastern') + expected = ['2010-12-01 11:00', '2010-12-02 11:00', NaT] + self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py new file mode 100644 index 00000000..a47d6a17 --- /dev/null +++ b/pandas/tseries/tests/test_tslib.py @@ -0,0 +1,491 @@ +import nose + +import numpy as np + +from pandas import tslib +import datetime + +from pandas.core.api import Timestamp, Series +from pandas.tslib import period_asfreq, period_ordinal +from pandas.tseries.index import date_range +from pandas.tseries.frequencies import get_freq +import pandas.tseries.offsets as offsets +from pandas import _np_version_under1p7 +import pandas.util.testing as tm +from pandas.util.testing import assert_series_equal + +class TestTimestamp(tm.TestCase): + def test_repr(self): + date = '2014-03-07' + tz = 'US/Eastern' + freq = 'M' + + date_only = Timestamp(date) + self.assertIn(date, repr(date_only)) + self.assertNotIn(tz, repr(date_only)) + self.assertNotIn(freq, repr(date_only)) + self.assertEqual(date_only, eval(repr(date_only))) + + date_tz = Timestamp(date, tz=tz) + self.assertIn(date, repr(date_tz)) + self.assertIn(tz, repr(date_tz)) + self.assertNotIn(freq, repr(date_tz)) + self.assertEqual(date_tz, eval(repr(date_tz))) + + date_freq = Timestamp(date, offset=freq) + self.assertIn(date, repr(date_freq)) + self.assertNotIn(tz, repr(date_freq)) + self.assertIn(freq, repr(date_freq)) + self.assertEqual(date_freq, eval(repr(date_freq))) + + date_tz_freq = Timestamp(date, tz=tz, offset=freq) + self.assertIn(date, repr(date_tz_freq)) + self.assertIn(tz, repr(date_tz_freq)) + self.assertIn(freq, 
repr(date_tz_freq)) + self.assertEqual(date_tz_freq, eval(repr(date_tz_freq))) + + # this can cause the tz field to be populated, but it's redundant to information in the datestring + date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) + self.assertIn('2014-03-13 00:00:00-0400', repr(date_with_utc_offset)) + self.assertNotIn('tzoffset', repr(date_with_utc_offset)) + self.assertEqual(date_with_utc_offset, eval(repr(date_with_utc_offset))) + + def test_bounds_with_different_units(self): + out_of_bounds_dates = ( + '1677-09-21', + '2262-04-12', + ) + + time_units = ('D', 'h', 'm', 's', 'ms', 'us') + + for date_string in out_of_bounds_dates: + for unit in time_units: + self.assertRaises( + ValueError, + Timestamp, + np.datetime64(date_string, dtype='M8[%s]' % unit) + ) + + in_bounds_dates = ( + '1677-09-23', + '2262-04-11', + ) + + for date_string in in_bounds_dates: + for unit in time_units: + Timestamp( + np.datetime64(date_string, dtype='M8[%s]' % unit) + ) + + def test_tz(self): + t = '2014-02-01 09:00' + ts = Timestamp(t) + local = ts.tz_localize('Asia/Tokyo') + self.assertEqual(local.hour, 9) + self.assertEqual(local, Timestamp(t, tz='Asia/Tokyo')) + conv = local.tz_convert('US/Eastern') + self.assertEqual(conv, + Timestamp('2014-01-31 19:00', tz='US/Eastern')) + self.assertEqual(conv.hour, 19) + + # preserves nanosecond + ts = Timestamp(t) + offsets.Nano(5) + local = ts.tz_localize('Asia/Tokyo') + self.assertEqual(local.hour, 9) + self.assertEqual(local.nanosecond, 5) + conv = local.tz_convert('US/Eastern') + self.assertEqual(conv.nanosecond, 5) + self.assertEqual(conv.hour, 19) + + def test_barely_oob_dts(self): + one_us = np.timedelta64(1).astype('timedelta64[us]') + + # By definition we can't go out of bounds in [ns], so we + # convert the datetime64s to [us] so we can go out of bounds + min_ts_us = np.datetime64(Timestamp.min).astype('M8[us]') + max_ts_us = np.datetime64(Timestamp.max).astype('M8[us]') + + # No error for the min/max datetimes + Timestamp(min_ts_us) + Timestamp(max_ts_us) + + # One us less than the minimum is an error + self.assertRaises(ValueError, Timestamp, min_ts_us - one_us) + + # One us more than the maximum is an error + self.assertRaises(ValueError, Timestamp, max_ts_us + one_us) + + +class TestDatetimeParsingWrappers(tm.TestCase): + def test_does_not_convert_mixed_integer(self): + bad_date_strings = ( + '-50000', + '999', + '123.1234', + 'm', + 'T' + ) + + for bad_date_string in bad_date_strings: + self.assertFalse( + tslib._does_string_look_like_datetime(bad_date_string) + ) + + good_date_strings = ( + '2012-01-01', + '01/01/2012', + 'Mon Sep 16, 2013', + '01012012', + '0101', + '1-1', + ) + + for good_date_string in good_date_strings: + self.assertTrue( + tslib._does_string_look_like_datetime(good_date_string) + ) + + +class TestArrayToDatetime(tm.TestCase): + def test_parsing_valid_dates(self): + arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr), + np.array( + [ + '2013-01-01T00:00:00.000000000-0000', + '2013-01-02T00:00:00.000000000-0000' + ], + dtype='M8[ns]' + ) + ) + + arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr), + np.array( + [ + '2013-09-16T00:00:00.000000000-0000', + '2013-09-17T00:00:00.000000000-0000' + ], + dtype='M8[ns]' + ) + ) + + def test_number_looking_strings_not_into_datetime(self): + # #4601 + # These strings don't look like datetimes so they shouldn't be + # 
attempted to be converted + arr = np.array(['-352.737091', '183.575577'], dtype=object) + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) + + arr = np.array(['1', '2', '3', '4', '5'], dtype=object) + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) + + def test_coercing_dates_outside_of_datetime64_ns_bounds(self): + invalid_dates = [ + datetime.date(1000, 1, 1), + datetime.datetime(1000, 1, 1), + '1000-01-01', + 'Jan 1, 1000', + np.datetime64('1000-01-01'), + ] + + for invalid_date in invalid_dates: + self.assertRaises( + ValueError, + tslib.array_to_datetime, + np.array([invalid_date], dtype='object'), + coerce=False, + raise_=True, + ) + self.assertTrue( + np.array_equal( + tslib.array_to_datetime( + np.array([invalid_date], dtype='object'), coerce=True + ), + np.array([tslib.iNaT], dtype='M8[ns]') + ) + ) + + arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( + [ + tslib.iNaT, + '2000-01-01T00:00:00.000000000-0000' + ], + dtype='M8[ns]' + ) + ) + + def test_coerce_of_invalid_datetimes(self): + arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) + + # Without coercing, the presence of any invalid dates prevents + # any values from being converted + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) + + # With coercing, the invalid dates becomes iNaT + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( + [ + '2013-01-01T00:00:00.000000000-0000', + tslib.iNaT, + tslib.iNaT + ], + dtype='M8[ns]' + ) + ) + + def test_parsing_timezone_offsets(self): + # All of these datetime strings with offsets are equivalent + # to the same datetime after the timezone offset is added + dt_strings = [ + '01-01-2013 08:00:00+08:00', + '2013-01-01T08:00:00.000000000+0800', + '2012-12-31T16:00:00.000000000-0800', + '12-31-2012 23:00:00-01:00', + ] + + expected_output = tslib.array_to_datetime( + np.array(['01-01-2013 00:00:00'], dtype=object) + ) + + for dt_string in dt_strings: + self.assert_numpy_array_equal( + tslib.array_to_datetime( + np.array([dt_string], dtype=object) + ), + expected_output + ) + +class TestTimestampNsOperations(tm.TestCase): + def setUp(self): + tm._skip_if_not_numpy17_friendly() + self.timestamp = Timestamp(datetime.datetime.utcnow()) + + def assert_ns_timedelta(self, modified_timestamp, expected_value): + value = self.timestamp.value + modified_value = modified_timestamp.value + + self.assertEqual(modified_value - value, expected_value) + + def test_timedelta_ns_arithmetic(self): + self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'ns'), -123) + + def test_timedelta_ns_based_arithmetic(self): + self.assert_ns_timedelta(self.timestamp + np.timedelta64(1234567898, 'ns'), 1234567898) + + def test_timedelta_us_arithmetic(self): + self.assert_ns_timedelta(self.timestamp + np.timedelta64(-123, 'us'), -123000) + + def test_timedelta_ms_arithmetic(self): + time = self.timestamp + np.timedelta64(-123, 'ms') + self.assert_ns_timedelta(time, -123000000) + + def test_nanosecond_string_parsing(self): + self.timestamp = Timestamp('2013-05-01 07:15:45.123456789') + self.assertEqual(self.timestamp.value, 1367392545123456000) + + def test_nanosecond_timestamp(self): + # GH 7610 + expected = 1293840000000000005 + t = Timestamp('2011-01-01') + offsets.Nano(5) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 5) 
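        # Editorial note (not part of the imported source): the expected value
        # 1293840000000000005 is just 2011-01-01 expressed in nanoseconds since
        # the Unix epoch plus the 5 ns offset added above:
        #   14975 days * 86400 s/day * 10**9 ns/s + 5 == 1293840000000000005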
+ + t = Timestamp(t) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 5) + + t = Timestamp(np.datetime64('2011-01-01 00:00:00.000000005Z')) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000005')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 5) + + expected = 1293840000000000010 + t = t + offsets.Nano(5) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 10) + + t = Timestamp(t) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 10) + + t = Timestamp(np.datetime64('2011-01-01 00:00:00.000000010Z')) + self.assertEqual(repr(t), "Timestamp('2011-01-01 00:00:00.000000010')") + self.assertEqual(t.value, expected) + self.assertEqual(t.nanosecond, 10) + + def test_nat_arithmetic(self): + # GH 6873 + nat = tslib.NaT + t = Timestamp('2014-01-01') + dt = datetime.datetime(2014, 1, 1) + delta = datetime.timedelta(3600) + + # Timestamp / datetime + for (left, right) in [(nat, nat), (nat, t), (dt, nat)]: + # NaT + Timestamp-like should raise TypeError + with tm.assertRaises(TypeError): + left + right + with tm.assertRaises(TypeError): + right + left + + # NaT - Timestamp-like (or inverse) returns NaT + self.assertTrue((left - right) is tslib.NaT) + self.assertTrue((right - left) is tslib.NaT) + + # timedelta-like + # offsets are tested in test_offsets.py + for (left, right) in [(nat, delta)]: + # NaT + timedelta-like returns NaT + self.assertTrue((left + right) is tslib.NaT) + # timedelta-like + NaT should raise TypeError + with tm.assertRaises(TypeError): + right + left + + self.assertTrue((left - right) is tslib.NaT) + with tm.assertRaises(TypeError): + right - left + + if _np_version_under1p7: + self.assertEqual(nat + np.timedelta64(1, 'h'), tslib.NaT) + with tm.assertRaises(TypeError): + np.timedelta64(1, 'h') + nat + + self.assertEqual(nat - np.timedelta64(1, 'h'), tslib.NaT) + with tm.assertRaises(TypeError): + np.timedelta64(1, 'h') - nat + + +class TestTslib(tm.TestCase): + + def test_intraday_conversion_factors(self): + self.assertEqual(period_asfreq(1, get_freq('D'), get_freq('H'), False), 24) + self.assertEqual(period_asfreq(1, get_freq('D'), get_freq('T'), False), 1440) + self.assertEqual(period_asfreq(1, get_freq('D'), get_freq('S'), False), 86400) + self.assertEqual(period_asfreq(1, get_freq('D'), get_freq('L'), False), 86400000) + self.assertEqual(period_asfreq(1, get_freq('D'), get_freq('U'), False), 86400000000) + self.assertEqual(period_asfreq(1, get_freq('D'), get_freq('N'), False), 86400000000000) + + self.assertEqual(period_asfreq(1, get_freq('H'), get_freq('T'), False), 60) + self.assertEqual(period_asfreq(1, get_freq('H'), get_freq('S'), False), 3600) + self.assertEqual(period_asfreq(1, get_freq('H'), get_freq('L'), False), 3600000) + self.assertEqual(period_asfreq(1, get_freq('H'), get_freq('U'), False), 3600000000) + self.assertEqual(period_asfreq(1, get_freq('H'), get_freq('N'), False), 3600000000000) + + self.assertEqual(period_asfreq(1, get_freq('T'), get_freq('S'), False), 60) + self.assertEqual(period_asfreq(1, get_freq('T'), get_freq('L'), False), 60000) + self.assertEqual(period_asfreq(1, get_freq('T'), get_freq('U'), False), 60000000) + self.assertEqual(period_asfreq(1, get_freq('T'), get_freq('N'), False), 60000000000) + + 
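        # Editorial note (not part of the imported source): each expected factor
        # in this test is plain unit arithmetic (1 day = 24 h = 1440 min = 86400 s,
        # and each further step to ms, us, ns multiplies by 1000), e.g.:
        #   assert 24 * 60 == 1440 and 1440 * 60 == 86400 and 86400 * 1000 == 86400000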
self.assertEqual(period_asfreq(1, get_freq('S'), get_freq('L'), False), 1000) + self.assertEqual(period_asfreq(1, get_freq('S'), get_freq('U'), False), 1000000) + self.assertEqual(period_asfreq(1, get_freq('S'), get_freq('N'), False), 1000000000) + + self.assertEqual(period_asfreq(1, get_freq('L'), get_freq('U'), False), 1000) + self.assertEqual(period_asfreq(1, get_freq('L'), get_freq('N'), False), 1000000) + + self.assertEqual(period_asfreq(1, get_freq('U'), get_freq('N'), False), 1000) + + def test_period_ordinal_start_values(self): + # information for 1.1.1970 + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('Y'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('M'))) + self.assertEqual(1, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('W'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('D'))) + self.assertEqual(0, period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('B'))) + + def test_period_ordinal_week(self): + self.assertEqual(1, period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, get_freq('W'))) + self.assertEqual(2, period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, get_freq('W'))) + + self.assertEqual(2284, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, get_freq('W'))) + self.assertEqual(2285, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, get_freq('W'))) + + def test_period_ordinal_business_day(self): + # Thursday + self.assertEqual(11415, period_ordinal(2013, 10, 3, 0, 0, 0, 0, 0, get_freq('B'))) + # Friday + self.assertEqual(11416, period_ordinal(2013, 10, 4, 0, 0, 0, 0, 0, get_freq('B'))) + # Saturday + self.assertEqual(11417, period_ordinal(2013, 10, 5, 0, 0, 0, 0, 0, get_freq('B'))) + # Sunday + self.assertEqual(11417, period_ordinal(2013, 10, 6, 0, 0, 0, 0, 0, get_freq('B'))) + # Monday + self.assertEqual(11417, period_ordinal(2013, 10, 7, 0, 0, 0, 0, 0, get_freq('B'))) + # Tuesday + self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, get_freq('B'))) + +class TestTimestampOps(tm.TestCase): + def test_timestamp_and_datetime(self): + self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) + self.assertEqual((datetime.datetime(2013, 10, 12) - Timestamp(datetime.datetime(2013, 10, 13))).days, -1) + + def test_timestamp_and_series(self): + timestamp_series = Series(date_range('2014-03-17', periods=2, freq='D', tz='US/Eastern')) + first_timestamp = timestamp_series[0] + + if not _np_version_under1p7: + delta_series = Series([np.timedelta64(0, 'D'), np.timedelta64(1, 'D')]) + assert_series_equal(timestamp_series - first_timestamp, delta_series) + assert_series_equal(first_timestamp - timestamp_series, -delta_series) + + def test_addition_subtraction_types(self): + # Assert on the types resulting from Timestamp +/- various date/time objects + datetime_instance = datetime.datetime(2014, 3, 4) + timedelta_instance = datetime.timedelta(seconds=1) + # build a timestamp with a frequency, since then it supports addition/subtraction of integers + timestamp_instance = date_range(datetime_instance, periods=1, freq='D')[0] + + self.assertEqual(type(timestamp_instance + 1), Timestamp) + self.assertEqual(type(timestamp_instance - 1), Timestamp) + + # Timestamp + datetime not supported, though subtraction is supported and yields timedelta + self.assertEqual(type(timestamp_instance - datetime_instance), datetime.timedelta) + + self.assertEqual(type(timestamp_instance + timedelta_instance), Timestamp) + self.assertEqual(type(timestamp_instance - timedelta_instance), Timestamp) + 
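        # Editorial summary (not part of the imported source) of the result types
        # asserted so far in this test, for quick reference:
        #   Timestamp +/- int        -> Timestamp (only because it carries a freq)
        #   Timestamp  -  datetime   -> datetime.timedelta
        #   Timestamp +/- timedelta  -> Timestamp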
+ if not _np_version_under1p7: + # Timestamp +/- datetime64 not supported, so not tested (could possibly assert error raised?) + timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual(type(timestamp_instance + timedelta64_instance), Timestamp) + self.assertEqual(type(timestamp_instance - timedelta64_instance), Timestamp) + + def test_addition_subtraction_preserve_frequency(self): + timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0] + timedelta_instance = datetime.timedelta(days=1) + original_freq = timestamp_instance.freq + self.assertEqual((timestamp_instance + 1).freq, original_freq) + self.assertEqual((timestamp_instance - 1).freq, original_freq) + self.assertEqual((timestamp_instance + timedelta_instance).freq, original_freq) + self.assertEqual((timestamp_instance - timedelta_instance).freq, original_freq) + + if not _np_version_under1p7: + timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual((timestamp_instance + timedelta64_instance).freq, original_freq) + self.assertEqual((timestamp_instance - timedelta64_instance).freq, original_freq) + + def test_resolution(self): + + for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], + [tslib.D_RESO, tslib.D_RESO, tslib.D_RESO, tslib.D_RESO, + tslib.H_RESO, tslib.T_RESO,tslib.S_RESO, tslib.MS_RESO, tslib.US_RESO]): + for tz in [None, 'Asia/Tokyo', 'US/Eastern']: + idx = date_range(start='2013-04-01', periods=30, freq=freq, tz=tz) + result = tslib.resolution(idx.asi8, idx.tz) + self.assertEqual(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py new file mode 100644 index 00000000..df556cdc --- /dev/null +++ b/pandas/tseries/tests/test_util.py @@ -0,0 +1,107 @@ +from pandas.compat import range +import nose + +import numpy as np +from numpy.testing.decorators import slow + +from pandas import Series, date_range +import pandas.util.testing as tm + +from datetime import datetime, date + +from pandas.tseries.tools import normalize_date +from pandas.tseries.util import pivot_annual, isleapyear + + +class TestPivotAnnual(tm.TestCase): + """ + New pandas of scikits.timeseries pivot_annual + """ + def test_daily(self): + rng = date_range('1/1/2000', '12/31/2004', freq='D') + ts = Series(np.random.randn(len(rng)), index=rng) + + annual = pivot_annual(ts, 'D') + + doy = ts.index.dayofyear + doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 + + for i in range(1, 367): + subset = ts[doy == i] + subset.index = [x.year for x in subset.index] + + tm.assert_series_equal(annual[i].dropna(), subset) + + # check leap days + leaps = ts[(ts.index.month == 2) & (ts.index.day == 29)] + day = leaps.index.dayofyear[0] + leaps.index = leaps.index.year + tm.assert_series_equal(annual[day].dropna(), leaps) + + def test_hourly(self): + rng_hourly = date_range( + '1/1/1994', periods=(18 * 8760 + 4 * 24), freq='H') + data_hourly = np.random.randint(100, 350, rng_hourly.size) + ts_hourly = Series(data_hourly, index=rng_hourly) + + grouped = ts_hourly.groupby(ts_hourly.index.year) + hoy = grouped.apply(lambda x: x.reset_index(drop=True)) + hoy = hoy.index.droplevel(0).values + hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 + hoy += 1 + + annual = pivot_annual(ts_hourly) + + ts_hourly = ts_hourly.astype(float) + for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: + subset = ts_hourly[hoy == i] + subset.index = [x.year for x in 
subset.index] + + tm.assert_series_equal(annual[i].dropna(), subset) + + leaps = ts_hourly[(ts_hourly.index.month == 2) & + (ts_hourly.index.day == 29) & + (ts_hourly.index.hour == 0)] + hour = leaps.index.dayofyear[0] * 24 - 23 + leaps.index = leaps.index.year + tm.assert_series_equal(annual[hour].dropna(), leaps) + + def test_weekly(self): + pass + + def test_monthly(self): + rng = date_range('1/1/2000', '12/31/2004', freq='M') + ts = Series(np.random.randn(len(rng)), index=rng) + + annual = pivot_annual(ts, 'M') + + month = ts.index.month + for i in range(1, 13): + subset = ts[month == i] + subset.index = [x.year for x in subset.index] + tm.assert_series_equal(annual[i].dropna(), subset) + + def test_period_monthly(self): + pass + + def test_period_daily(self): + pass + + def test_period_weekly(self): + pass + + +def test_normalize_date(): + value = date(2012, 9, 7) + + result = normalize_date(value) + assert(result == datetime(2012, 9, 7)) + + value = datetime(2012, 9, 7, 12) + + result = normalize_date(value) + assert(result == datetime(2012, 9, 7)) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py new file mode 100644 index 00000000..0d6d74db --- /dev/null +++ b/pandas/tseries/timedeltas.py @@ -0,0 +1,251 @@ +""" +timedelta support tools +""" + +import re +from datetime import timedelta + +import numpy as np +import pandas.tslib as tslib +from pandas import compat, _np_version_under1p7 +from pandas.core.common import (ABCSeries, is_integer, is_integer_dtype, is_timedelta64_dtype, + _values_from_object, is_list_like, isnull, _ensure_object) + +repr_timedelta = tslib.repr_timedelta64 +repr_timedelta64 = tslib.repr_timedelta64 + +def to_timedelta(arg, box=True, unit='ns'): + """ + Convert argument to timedelta + + Parameters + ---------- + arg : string, timedelta, array of strings (with possible NAs) + box : boolean, default True + If True returns a Series of the results, if False returns ndarray of values + unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, which is an integer/float number + + Returns + ------- + ret : timedelta64/arrays of timedelta64 if parsing succeeded + """ + if _np_version_under1p7: + raise ValueError("to_timedelta is not support for numpy < 1.7") + + unit = _validate_timedelta_unit(unit) + + def _convert_listlike(arg, box, unit): + + if isinstance(arg, (list,tuple)): + arg = np.array(arg, dtype='O') + + if is_timedelta64_dtype(arg): + value = arg.astype('timedelta64[ns]') + elif is_integer_dtype(arg): + + # these are shortcutable + value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]') + else: + try: + value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit) + except: + value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit) for r in arg ]) + + if box: + from pandas import Series + value = Series(value,dtype='m8[ns]') + return value + + if arg is None: + return arg + elif isinstance(arg, ABCSeries): + from pandas import Series + values = _convert_listlike(arg.values, box=False, unit=unit) + return Series(values, index=arg.index, name=arg.name, dtype='m8[ns]') + elif is_list_like(arg): + return _convert_listlike(arg, box=box, unit=unit) + + # ...so it must be a scalar value. Return scalar. 
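    # Editorial note (not part of the imported source): a lone string such as
    # to_timedelta('1d') reaches this branch, since it is neither None, a
    # Series, nor list-like, and is coerced to a single numpy.timedelta64[ns].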
+    return _coerce_scalar_to_timedelta_type(arg, unit=unit)
+
+_unit_map = {
+    'Y' : 'Y',
+    'y' : 'Y',
+    'W' : 'W',
+    'w' : 'W',
+    'D' : 'D',
+    'd' : 'D',
+    'days' : 'D',
+    'Days' : 'D',
+    'day' : 'D',
+    'Day' : 'D',
+    'M' : 'M',
+    'H' : 'h',
+    'h' : 'h',
+    'm' : 'm',
+    'T' : 'm',
+    'S' : 's',
+    's' : 's',
+    'L' : 'ms',
+    'MS' : 'ms',
+    'ms' : 'ms',
+    'US' : 'us',
+    'us' : 'us',
+    'NS' : 'ns',
+    'ns' : 'ns',
+    }
+
+def _validate_timedelta_unit(arg):
+    """ provide validation / translation for timedelta short units """
+    try:
+        return _unit_map[arg]
+    except:
+        raise ValueError("invalid timedelta unit {0} provided".format(arg))
+
+_short_search = re.compile(
+    "^\s*(?P<neg>-?)\s*(?P<value>\d*\.?\d*)\s*(?P<unit>d|s|ms|us|ns)?\s*$",re.IGNORECASE)
+_full_search = re.compile(
+    "^\s*(?P<neg>-?)\s*(?P<days>\d+)?\s*(days|d|day)?,?\s*(?P